Spaces:
Running
Running
Update rss_processor.py
Browse files- rss_processor.py +17 -11
rss_processor.py
CHANGED
@@ -69,6 +69,7 @@ vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_
|
|
69 |
|
70 |
def fetch_rss_feeds():
|
71 |
articles = []
|
|
|
72 |
for feed_url in RSS_FEEDS:
|
73 |
try:
|
74 |
logger.info(f"Fetching feed: {feed_url}")
|
@@ -77,16 +78,22 @@ def fetch_rss_feeds():
|
|
77 |
logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
|
78 |
continue
|
79 |
for entry in feed.entries[:5]:
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
"
|
87 |
-
|
88 |
-
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
except Exception as e:
|
91 |
logger.error(f"Error fetching {feed_url}: {e}")
|
92 |
return articles
|
@@ -109,7 +116,6 @@ def process_and_store_articles(articles):
|
|
109 |
documents = []
|
110 |
for article in articles:
|
111 |
try:
|
112 |
-
# Ensure no None values in metadata
|
113 |
metadata = {
|
114 |
"title": article["title"] or "No Title",
|
115 |
"link": article["link"] or "",
|
|
|
def fetch_rss_feeds():
    """Fetch entries from every feed in RSS_FEEDS and return them as article dicts.

    Keeps at most 5 entries per feed and de-duplicates across feeds by the
    (title, link) pair. Each returned dict has the keys: "title", "link",
    "description", "published", "category", "image". Feeds that fail to parse
    or raise are logged and skipped, so this function itself never raises.

    Returns:
        list[dict]: the unique articles collected from all feeds.
    """
    articles = []
    seen_articles = set()  # "title|link" keys already emitted, for cross-feed dedup
    for feed_url in RSS_FEEDS:
        try:
            logger.info(f"Fetching feed: {feed_url}")
            # NOTE(review): the next two lines fall in a gap between the visible
            # diff hunks; reconstructed from the feed.bozo_exception /
            # feed.entries usage below — confirm against the full file.
            feed = feedparser.parse(feed_url)
            if feed.bozo:
                logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
                continue
            added = 0  # articles actually kept from this feed (excludes duplicates)
            for entry in feed.entries[:5]:
                title = entry.get("title", "No Title")
                link = entry.get("link", "")
                # Unique key for deduplication across all feeds in this run.
                article_key = f"{title}|{link}"
                if article_key in seen_articles:
                    continue
                seen_articles.add(article_key)
                # `or [{}]` also guards the key-present-but-empty-list case,
                # which would raise IndexError with entry.get(key, [{}])[0].
                image = (
                    (entry.get("media_content") or [{}])[0].get("url")
                    or (entry.get("media_thumbnail") or [{}])[0].get("url")
                    or ""
                )
                articles.append({
                    "title": title,
                    "link": link,
                    "description": entry.get("summary", entry.get("description", "No Description")),
                    "published": entry.get("published", "Unknown Date"),
                    "category": categorize_feed(feed_url),
                    "image": image,  # already "" when no media url was found
                })
                added += 1
            # Log the count actually kept; len(feed.entries[:5]) would also
            # count the duplicates that were skipped above.
            logger.info(f"Processed {added} unique entries from {feed_url}")
        except Exception as e:
            logger.error(f"Error fetching {feed_url}: {e}")
    return articles
|
|
|
116 |
documents = []
|
117 |
for article in articles:
|
118 |
try:
|
|
|
119 |
metadata = {
|
120 |
"title": article["title"] or "No Title",
|
121 |
"link": article["link"] or "",
|