broadfield-dev committed
Commit 8091043 · verified · 1 Parent(s): efdc13f

Update rss_processor.py

Files changed (1)
  1. rss_processor.py +17 -11
rss_processor.py CHANGED
@@ -69,6 +69,7 @@ vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_
 
 def fetch_rss_feeds():
     articles = []
+    seen_articles = set()  # Track unique articles by title and link
     for feed_url in RSS_FEEDS:
         try:
             logger.info(f"Fetching feed: {feed_url}")
@@ -77,16 +78,22 @@ def fetch_rss_feeds():
                 logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
                 continue
             for entry in feed.entries[:5]:
-                image = entry.get("media_content", [{}])[0].get("url") or entry.get("media_thumbnail", [{}])[0].get("url") or ""
-                articles.append({
-                    "title": entry.get("title", "No Title"),
-                    "link": entry.get("link", ""),
-                    "description": entry.get("summary", entry.get("description", "No Description")),
-                    "published": entry.get("published", "Unknown Date"),
-                    "category": categorize_feed(feed_url),
-                    "image": image if image else "",
-                })
-            logger.info(f"Processed {len(feed.entries[:5])} entries from {feed_url}")
+                title = entry.get("title", "No Title")
+                link = entry.get("link", "")
+                # Create a unique key for deduplication
+                article_key = f"{title}|{link}"
+                if article_key not in seen_articles:
+                    seen_articles.add(article_key)
+                    image = entry.get("media_content", [{}])[0].get("url") or entry.get("media_thumbnail", [{}])[0].get("url") or ""
+                    articles.append({
+                        "title": title,
+                        "link": link,
+                        "description": entry.get("summary", entry.get("description", "No Description")),
+                        "published": entry.get("published", "Unknown Date"),
+                        "category": categorize_feed(feed_url),
+                        "image": image if image else "",
+                    })
+            logger.info(f"Processed {len(feed.entries[:5])} unique entries from {feed_url}")
         except Exception as e:
             logger.error(f"Error fetching {feed_url}: {e}")
     return articles
@@ -109,7 +116,6 @@ def process_and_store_articles(articles):
     documents = []
     for article in articles:
         try:
-            # Ensure no None values in metadata
             metadata = {
                 "title": article["title"] or "No Title",
                 "link": article["link"] or "",