import feedparser import time import logging import re import ssl import requests from datetime import datetime, timedelta from threading import Timer from urllib.parse import urlparse from concurrent.futures import ThreadPoolExecutor, as_completed logger = logging.getLogger("misinformation_detector") # Disable SSL certificate verification for feeds with self-signed certs ssl._create_default_https_context = ssl._create_unverified_context # List of RSS feeds to check for news # These are popular news sources with reliable and frequently updated RSS feeds RSS_FEEDS = [ # -------------------- # 🌐 General World News # -------------------- # "http://rss.cnn.com/rss/cnn_topstories.rss", # CNN Top Stories; Removed in round 2 "http://rss.cnn.com/rss/cnn_world.rss", # CNN World News; Duplicate with category_detection # "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml", # NYT Home Page "https://rss.nytimes.com/services/xml/rss/nyt/World.xml", # NYT World News; Duplicate with category_detection # "https://rss.nytimes.com/services/xml/rss/nyt/US.xml", # NYT US News "https://feeds.washingtonpost.com/rss/world", # The Washington Post World News; Removed in round 2 # "https://feeds.washingtonpost.com/rss/national", # The Washington Post National News # "https://feeds.bbci.co.uk/news/rss.xml", # BBC News - Top Stories; Removed in round 2 "https://feeds.bbci.co.uk/news/world/rss.xml", # BBC News - World # "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=w&hl=en-IN", # Google News India - World; Removed in round 2 # "https://news.google.com/rss?gl=US&ceid=US:en&topic=w&hl=en-US", # Google News US - World; Removed in round 2 # -------------------- # 🧠 Tech & Startup News (Global) # -------------------- "https://techcrunch.com/feed/", # TechCrunch - Startup and Technology News; Duplicate with category_detection "https://venturebeat.com/feed/", # VentureBeat - Tech News # "https://www.theverge.com/rss/index.xml", # The Verge - Technology News "https://www.wired.com/feed/rss", # Wired - Technology News "https://www.cnet.com/rss/news/", # CNET - Technology News # "https://sifted.eu/feed/", # Sifted - European Startups and Tech # "https://feeds.feedburner.com/fastcompany/headlines", # Fast Company - Business Innovation # "https://feeds.bbci.co.uk/news/technology/rss.xml", # BBC News - Technology "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=t&hl=en-IN", # Google News India - Technology "https://news.google.com/rss?gl=US&ceid=US:en&topic=t&hl=en-US", # Google News US - Technology # -------------------- # 💼 Startup & VC Focused # -------------------- "https://news.crunchbase.com/feed/", # Crunchbase News - Startup Funding # "https://avc.com/feed/", # AVC - Musings of a VC in NYC "https://techstartups.com/feed/", # Tech Startups - Startup News # "https://tech.eu/feed/", # Tech.eu - European Tech News # "https://www.menabytes.com/feed/", # MENAbytes - Middle East & North Africa Startups # "http://feeds.feedburner.com/venturebeat/SZYF", # VentureBeat - Deals # -------------------- # 📰 Global Business & Corporate Feeds # -------------------- "https://feeds.bloomberg.com/technology/news.rss", # Bloomberg Technology News "https://www.ft.com/technology?format=rss", # Financial Times Technology News # "https://ir.thomsonreuters.com/rss/news-releases.xml", # Thomson Reuters Press Releases # "https://feeds.bbci.co.uk/news/business/rss.xml", # BBC News - Business "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=b&hl=en-IN", # Google News India - Business # "https://news.google.com/rss?gl=US&ceid=US:en&topic=b&hl=en-US", # Google News US - Business; Removed in round 2 # -------------------- # 🇮🇳 India-specific News # -------------------- "https://inc42.com/feed/", # Inc42 - Indian Startups and Technology # "https://yourstory.com/rss", # YourStory - Indian Startup Stories # "https://economictimes.indiatimes.com/startups/rssfeeds/49979279.cms", # Economic Times - Startups "https://timesofindia.indiatimes.com/rssfeedstopstories.cms", # TOI - Top Stories "https://timesofindia.indiatimes.com/rssfeedmostrecent.cms", # TOI - Most Recent Stories "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms", # TOI - India News "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms", # TOI - World News "https://timesofindia.indiatimes.com/rssfeeds/1898055.cms", # TOI - Business News "https://timesofindia.indiatimes.com/rssfeeds/54829575.cms", # TOI - Cricket News "https://timesofindia.indiatimes.com/rssfeeds/4719148.cms", # TOI - Sports News "https://timesofindia.indiatimes.com/rssfeeds/-2128672765.cms", # TOI - Science News # "https://timesofindia.indiatimes.com/rssfeeds/66949542.cms", # TOI - Technology News # "https://timesofindia.indiatimes.com/rssfeeds/1081479906.cms", # TOI - Education News # -------------------- # 🏏 Sports News (Global + Cricket) # -------------------- "https://www.espn.com/espn/rss/news", # ESPN - Top Sports News; Duplicate with category_detection # "https://api.foxsports.com/v2/content/optimized-rss?partnerKey=MB0Wehpmuj2lUhuRhQaafhBjAJqaPU244mlTDK1i&size=30", # Fox Sports; Removed in round 2 "https://feeds.skynews.com/feeds/rss/sports.xml", # Sky News - Sports "https://sports.ndtv.com/rss/all", # NDTV Sports "https://www.espncricinfo.com/rss/content/story/feeds/0.xml", # ESPN Cricinfo - Cricket News; Duplicate with category_detection # "https://crickettimes.com/feed/", # Cricket Times - Cricket News # -------------------- # ✅ Fact-Checking Sources # -------------------- "https://www.snopes.com/feed/", # Snopes - Fact Checking; Duplicate with category_detection "https://www.politifact.com/rss/all/", # PolitiFact - Fact Checking; Duplicate with category_detection # -------------------- # 🗳️ Politics & Policy (General) # -------------------- "https://feeds.bbci.co.uk/news/politics/rss.xml", # BBC News - Politics; Duplicate with category_detection "https://feeds.bbci.co.uk/news/science_and_environment/rss.xml", # BBC - Science & Environment # -------------------- # 🗳️ Science # -------------------- "https://www.nature.com/nature.rss", # Nature science; Duplicate with category_detection "https://feeds.science.org/rss/science-advances.xml" # science.org ] def clean_html(raw_html): """Remove HTML tags from text""" if not raw_html: return "" clean_regex = re.compile('<.*?>') clean_text = re.sub(clean_regex, '', raw_html) # Remove extra whitespace clean_text = re.sub(r'\s+', ' ', clean_text).strip() return clean_text def parse_feed(feed_url, timeout=5): """ Parse a single RSS feed with proper timeout handling Uses requests with timeout first, then passes content to feedparser """ try: # Use requests with timeout to fetch the RSS content response = requests.get(feed_url, timeout=timeout) response.raise_for_status() # Then parse the content with feedparser (which doesn't support timeout) feed = feedparser.parse(response.content) # Basic validation of the feed if hasattr(feed, 'entries') and feed.entries: return feed else: logger.warning(f"Feed {feed_url} parsed but contains no entries") return None except requests.exceptions.Timeout: logger.warning(f"Timeout while fetching feed {feed_url}") return None except requests.exceptions.RequestException as e: logger.error(f"Request error fetching feed {feed_url}: {str(e)}") return None except Exception as e: logger.error(f"Error parsing feed {feed_url}: {str(e)}") return None def fetch_all_feeds(feeds_list=None, max_workers=5, timeout=5): """ Fetch multiple RSS feeds with proper timeout handling Returns a list of (domain, feed) tuples for successfully fetched feeds """ # Use default RSS_FEEDS list if none provided if feeds_list is None: feeds_list = RSS_FEEDS results = [] with ThreadPoolExecutor(max_workers=max_workers) as executor: future_to_url = {executor.submit(parse_feed, url, timeout): url for url in feeds_list} for future in as_completed(future_to_url): url = future_to_url[future] try: feed = future.result() if feed and hasattr(feed, 'entries') and feed.entries: # Extract domain for source attribution domain = urlparse(url).netloc results.append((domain, feed)) logger.info(f"Successfully fetched {domain} with {len(feed.entries)} entries") except Exception as e: logger.error(f"Error processing {url}: {str(e)}") return results def extract_date(entry): """Extract and normalize publication date from entry""" for date_field in ['published_parsed', 'updated_parsed', 'created_parsed']: if hasattr(entry, date_field) and getattr(entry, date_field): try: # Convert time tuple to datetime time_tuple = getattr(entry, date_field) return datetime(time_tuple[0], time_tuple[1], time_tuple[2], time_tuple[3], time_tuple[4], time_tuple[5]) except Exception as e: logger.debug(f"Error parsing {date_field}: {e}") continue # Try string dates for date_field in ['published', 'updated', 'pubDate']: if hasattr(entry, date_field) and getattr(entry, date_field): try: date_str = getattr(entry, date_field) # Try various formats for fmt in ['%a, %d %b %Y %H:%M:%S %z', '%a, %d %b %Y %H:%M:%S %Z', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z']: try: return datetime.strptime(date_str, fmt) except: continue except Exception as e: logger.debug(f"Error parsing date string {date_field}: {e}") continue # Default to current time if parsing fails return datetime.now() def is_recent(entry_date, max_days=3): """Check if an entry is recent (within the last few days)""" if not entry_date: return False cutoff = datetime.now() - timedelta(days=max_days) return entry_date > cutoff def get_entry_relevance(entry, query_terms, domain): """Calculate relevance score for an entry based on query match and recency""" if not hasattr(entry, 'title') or not entry.title: return 0 # Extract text content title = entry.title or "" description = clean_html(entry.description) if hasattr(entry, 'description') else "" content = "" if hasattr(entry, 'content'): for content_item in entry.content: if 'value' in content_item: content += clean_html(content_item['value']) + " " # Extract published date pub_date = extract_date(entry) # Calculate recency score (0-1) recency_score = 0 if pub_date: days_old = (datetime.now() - pub_date).days if days_old <= 1: # Today or yesterday recency_score = 1.0 elif days_old <= 2: recency_score = 0.8 elif days_old <= 3: recency_score = 0.5 else: recency_score = 0.2 # Calculate relevance score based on keyword matches text = f"{title} {description} {content}".lower() # Count how many query terms appear in the content query_terms_lower = [term.lower() for term in query_terms] matches = sum(1 for term in query_terms_lower if term in text) # Calculate match score (0-1) match_score = min(1.0, matches / max(1, len(query_terms) * 0.7)) # Boost score for exact phrase matches query_phrase = " ".join(query_terms_lower) if query_phrase in text: match_score += 0.5 # Additional boost for title matches (they're more relevant) title_matches = sum(1 for term in query_terms_lower if term in title.lower()) if title_matches > 0: match_score += 0.2 * (title_matches / len(query_terms_lower)) # Source quality factor (can be adjusted based on source reliability) source_factor = 1.0 high_quality_domains = ['bbc.co.uk', 'nytimes.com', 'reuters.com', 'washingtonpost.com', 'espncricinfo.com', 'cricbuzz.com', 'snopes.com'] if any(quality_domain in domain for quality_domain in high_quality_domains): source_factor = 1.2 # Calculate final score final_score = (match_score * 0.6) + (recency_score * 0.4) * source_factor return min(1.0, final_score) # Cap at 1.0 def retrieve_evidence_from_rss(claim, max_results=3, category_feeds=None): """ Retrieve evidence from RSS feeds for a given claim Args: claim (str): The claim to verify max_results (int): Maximum number of results to return category_feeds (list, optional): List of category-specific RSS feeds to check Returns: list: List of relevant evidence items """ start_time = time.time() logger.info(f"Retrieving evidence from RSS feeds for: {claim}") # Extract key terms from claim terms = [term.strip() for term in re.findall(r'\b\w+\b', claim) if len(term.strip()) > 2] try: # Use category-specific feeds if provided feeds_to_use = category_feeds if category_feeds else RSS_FEEDS # Log which feeds we're using if category_feeds: logger.info(f"Using {len(category_feeds)} category-specific RSS feeds") else: logger.info(f"Using {len(RSS_FEEDS)} default RSS feeds") # Limit the number of feeds to process for efficiency if len(feeds_to_use) > 10: # If we have too many feeds, select a subset # Prioritize fact-checking sources fact_check_feeds = [feed for feed in feeds_to_use if "fact" in feed.lower() or "snopes" in feed.lower() or "politifact" in feed.lower()] other_feeds = [feed for feed in feeds_to_use if feed not in fact_check_feeds] # Take all fact-checking feeds plus a random selection of others import random selected_feeds = fact_check_feeds + random.sample(other_feeds, min(10 - len(fact_check_feeds), len(other_feeds))) else: selected_feeds = feeds_to_use # Fetch all feeds in parallel with the selected feeds feeds = fetch_all_feeds(selected_feeds) if not feeds: logger.warning("No RSS feeds could be fetched") return [] all_entries = [] # Process all feed entries for domain, feed in feeds: for entry in feed.entries: # Calculate relevance score relevance = get_entry_relevance(entry, terms, domain) if relevance > 0.3: # Only consider somewhat relevant entries # Extract entry details title = entry.title if hasattr(entry, 'title') else "No title" link = entry.link if hasattr(entry, 'link') else "" # Extract and clean description/content description = "" if hasattr(entry, 'description'): description = clean_html(entry.description) elif hasattr(entry, 'summary'): description = clean_html(entry.summary) elif hasattr(entry, 'content'): for content_item in entry.content: if 'value' in content_item: description += clean_html(content_item['value']) + " " # Truncate description if too long if len(description) > 250: description = description[:247] + "..." # Get publication date pub_date = extract_date(entry) date_str = pub_date.strftime('%Y-%m-%d') if pub_date else "Unknown date" # Format as evidence text evidence_text = ( f"Title: {title}, " f"Source: {domain} (RSS), " f"Date: {date_str}, " f"URL: {link}, " f"Content: {description}" ) all_entries.append({ "text": evidence_text, "relevance": relevance, "date": pub_date or datetime.now() }) # Sort entries by relevance all_entries.sort(key=lambda x: x["relevance"], reverse=True) # Take top results top_entries = all_entries[:max_results] logger.info(f"Retrieved {len(top_entries)} relevant RSS items from {len(feeds)} feeds in {time.time() - start_time:.2f}s") # Return just the text portion return [entry["text"] for entry in top_entries] except Exception as e: logger.error(f"Error in RSS retrieval: {str(e)}") return []