generate_feed.py
import datetime
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup

# Constants
BASE_URL = "https://www.paulgraham.com/"
ARTICLES_URL = BASE_URL + "articles.html"
FEED_FILE = "feed.xml"
FEED_FILE_WITHOUT_CONTENT = "feed_without_content.xml"
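
# Two feeds are written: feed.xml embeds each essay's full text in its item
# description, while feed_without_content.xml carries only titles and links.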


def fetch_article_content(url):
    """Fetches the content of a single article."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the main content. Most PG essays are inside <font> tags.
    content = ""
    for font_tag in soup.find_all("font"):
        content += font_tag.get_text(separator="\n", strip=True)
    return content.strip()
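
# Example usage (the essay slug here is illustrative, not used by the script):
#   text = fetch_article_content(BASE_URL + "greatwork.html")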


def fetch_articles():
    """Fetches the list of articles with title, URL, and content."""
    response = requests.get(ARTICLES_URL, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    articles = []
    articles_without_content = []

    # TODO_HACK: The first few links on articles.html are not the most recent
    # essays; Paul lists a handful of recommended essays to start with "if
    # you're not sure which to read first". Note that the slice below skips
    # num_links_to_skip + 1 leading links in total.
    num_links_to_skip = 3

    # Find all 'a' tags with hrefs, skipping those that have no actual text
    for link in soup.find_all("a", href=True)[num_links_to_skip + 1 :]:
        href = link["href"]
        title = link.get_text(strip=True)  # Extracts only the text, skips inner HTML tags
        if href.endswith(".html") and title:  # Only process links with a real title
            url = BASE_URL + href
            content = fetch_article_content(url)  # Fetch the content of each article
            articles.append({"title": title, "url": url, "content": content})
            articles_without_content.append({"title": title, "url": url})
    return articles, articles_without_content
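
# Both returned lists share the same order; the second simply omits the large
# "content" field, e.g.:
#   articles[0]                 -> {"title": ..., "url": ..., "content": ...}
#   articles_without_content[0] -> {"title": ..., "url": ...}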


def generate_rss_feed(articles):
    """Generates the RSS feed with articles and their content."""
    # datetime.utcnow() is deprecated in recent Python versions; use an
    # explicitly UTC-aware timestamp instead.
    now = datetime.datetime.now(datetime.timezone.utc).strftime("%a, %d %b %Y %H:%M:%S GMT")
    rss_feed = f"""<?xml version="1.0"?>
<rss version="2.0">
<channel>
  <title>Paul Graham: Essays</title>
  <link>{BASE_URL}</link>
  <description>Scraped feed of essays from paulgraham.com</description>
  <lastBuildDate>{now}</lastBuildDate>
"""
    for article in articles:
        # Escape titles and URLs so characters like '&' don't produce invalid XML.
        title = escape(article["title"])
        link = escape(article["url"])
        if "content" in article:
            rss_feed += f"""
  <item>
    <title>{title}</title>
    <link>{link}</link>
    <description><![CDATA[{article['content']}]]></description>
  </item>
"""
        else:
            rss_feed += f"""
  <item>
    <title>{title}</title>
    <link>{link}</link>
  </item>
"""
    rss_feed += """
</channel>
</rss>
"""
    return rss_feed
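
# A quick stdlib-only sanity check of the generated XML (raises on malformed output):
#   import xml.dom.minidom
#   xml.dom.minidom.parseString(generate_rss_feed(articles))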


def save_rss_feed(rss_feed, file):
    """Saves the RSS feed to a file."""
    with open(file, "w", encoding="utf-8") as f:
        f.write(rss_feed)


def main():
    """Main entry point of the script."""
    articles, articles_without_content = fetch_articles()

    rss_feed = generate_rss_feed(articles)
    save_rss_feed(rss_feed, FEED_FILE)

    rss_feed_without_content = generate_rss_feed(articles_without_content)
    save_rss_feed(rss_feed_without_content, FEED_FILE_WITHOUT_CONTENT)

    print(f"RSS feed generated with {len(articles)} articles")


if __name__ == "__main__":
    main()