import feedparser
import time
import logging
import re
import ssl
import requests
from datetime import datetime, timedelta
from threading import Timer
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

logger = logging.getLogger("misinformation_detector")

# Disable SSL certificate verification for feeds with self-signed certs
ssl._create_default_https_context = ssl._create_unverified_context

# List of RSS feeds to check for news
# These are popular news sources with reliable and frequently updated RSS feeds
RSS_FEEDS = [
# --------------------
# 🌐 General World News
# --------------------
# "http://rss.cnn.com/rss/cnn_topstories.rss",                    # CNN Top Stories; Removed in round 2
"http://rss.cnn.com/rss/cnn_world.rss",                         # CNN World News; Duplicate with category_detection
# "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",   # NYT Home Page
"https://rss.nytimes.com/services/xml/rss/nyt/World.xml",      # NYT World News; Duplicate with category_detection
# "https://rss.nytimes.com/services/xml/rss/nyt/US.xml",         # NYT US News
"https://feeds.washingtonpost.com/rss/world",                  # The Washington Post World News; Removed in round 2
# "https://feeds.washingtonpost.com/rss/national",               # The Washington Post National News
# "https://feeds.bbci.co.uk/news/rss.xml",                       # BBC News - Top Stories; Removed in round 2
"https://feeds.bbci.co.uk/news/world/rss.xml",                 # BBC News - World
# "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=w&hl=en-IN",  # Google News India - World; Removed in round 2
# "https://news.google.com/rss?gl=US&ceid=US:en&topic=w&hl=en-US",  # Google News US - World; Removed in round 2

# --------------------
# 🧠 Tech & Startup News (Global)
# --------------------
"https://techcrunch.com/feed/",                                # TechCrunch - Startup and Technology News; Duplicate with category_detection
"https://venturebeat.com/feed/",                               # VentureBeat - Tech News
# "https://www.theverge.com/rss/index.xml",                      # The Verge - Technology News
"https://www.wired.com/feed/rss",                              # Wired - Technology News
"https://www.cnet.com/rss/news/",                              # CNET - Technology News
# "https://sifted.eu/feed/",                                     # Sifted - European Startups and Tech
# "https://feeds.feedburner.com/fastcompany/headlines",          # Fast Company - Business Innovation
# "https://feeds.bbci.co.uk/news/technology/rss.xml",            # BBC News - Technology
"https://news.google.com/rss?gl=IN&ceid=IN:en&topic=t&hl=en-IN",  # Google News India - Technology
"https://news.google.com/rss?gl=US&ceid=US:en&topic=t&hl=en-US",  # Google News US - Technology

# --------------------
# 💼 Startup & VC Focused
# --------------------
"https://news.crunchbase.com/feed/",                           # Crunchbase News - Startup Funding
# "https://avc.com/feed/",                                       # AVC - Musings of a VC in NYC
"https://techstartups.com/feed/",                              # Tech Startups - Startup News
# "https://tech.eu/feed/",                                       # Tech.eu - European Tech News
# "https://www.menabytes.com/feed/",                             # MENAbytes - Middle East & North Africa Startups
# "http://feeds.feedburner.com/venturebeat/SZYF",                # VentureBeat - Deals

# --------------------
# 📰 Global Business & Corporate Feeds
# --------------------
"https://feeds.bloomberg.com/technology/news.rss",             # Bloomberg Technology News
"https://www.ft.com/technology?format=rss",                    # Financial Times Technology News
# "https://ir.thomsonreuters.com/rss/news-releases.xml",         # Thomson Reuters Press Releases
# "https://feeds.bbci.co.uk/news/business/rss.xml",              # BBC News - Business
"https://news.google.com/rss?gl=IN&ceid=IN:en&topic=b&hl=en-IN",  # Google News India - Business
# "https://news.google.com/rss?gl=US&ceid=US:en&topic=b&hl=en-US",  # Google News US - Business; Removed in round 2

# --------------------
# 🇮🇳 India-specific News
# --------------------
"https://inc42.com/feed/",                                     # Inc42 - Indian Startups and Technology
# "https://yourstory.com/rss",                                   # YourStory - Indian Startup Stories
# "https://economictimes.indiatimes.com/startups/rssfeeds/49979279.cms",  # Economic Times - Startups
"https://timesofindia.indiatimes.com/rssfeedstopstories.cms",           # TOI - Top Stories
"https://timesofindia.indiatimes.com/rssfeedmostrecent.cms",            # TOI - Most Recent Stories
"https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms",         # TOI - India News
"https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",           # TOI - World News
"https://timesofindia.indiatimes.com/rssfeeds/1898055.cms",             # TOI - Business News
"https://timesofindia.indiatimes.com/rssfeeds/54829575.cms",            # TOI - Cricket News
"https://timesofindia.indiatimes.com/rssfeeds/4719148.cms",             # TOI - Sports News
"https://timesofindia.indiatimes.com/rssfeeds/-2128672765.cms",         # TOI - Science News
# "https://timesofindia.indiatimes.com/rssfeeds/66949542.cms",            # TOI - Technology News
# "https://timesofindia.indiatimes.com/rssfeeds/1081479906.cms",          # TOI - Education News

# --------------------
# 🏏 Sports News (Global + Cricket)
# --------------------
"https://www.espn.com/espn/rss/news",                          # ESPN - Top Sports News; Duplicate with category_detection
# "https://api.foxsports.com/v2/content/optimized-rss?partnerKey=MB0Wehpmuj2lUhuRhQaafhBjAJqaPU244mlTDK1i&size=30",  # Fox Sports; Removed in round 2
"https://feeds.skynews.com/feeds/rss/sports.xml",              # Sky News - Sports
"https://sports.ndtv.com/rss/all",                                 # NDTV Sports
"https://www.espncricinfo.com/rss/content/story/feeds/0.xml",  # ESPN Cricinfo - Cricket News; Duplicate with category_detection
# "https://crickettimes.com/feed/",                              # Cricket Times - Cricket News

# --------------------
# ✅ Fact-Checking Sources
# --------------------
"https://www.snopes.com/feed/",                                # Snopes - Fact Checking; Duplicate with category_detection
"https://www.politifact.com/rss/all/",                         # PolitiFact - Fact Checking; Duplicate with category_detection

# --------------------
# 🗳️ Politics & Policy (General)
# --------------------
"https://feeds.bbci.co.uk/news/politics/rss.xml",              # BBC News - Politics; Duplicate with category_detection
"https://feeds.bbci.co.uk/news/science_and_environment/rss.xml",  # BBC - Science & Environment

# --------------------
# 🗳️ Science
# --------------------
"https://www.nature.com/nature.rss",                              # Nature science; Duplicate with category_detection
"https://feeds.science.org/rss/science-advances.xml"              # science.org
]

def clean_html(raw_html):
    """Remove HTML tags from text"""
    if not raw_html:
        return ""
    clean_regex = re.compile('<.*?>')
    clean_text = re.sub(clean_regex, '', raw_html)
    # Remove extra whitespace
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    return clean_text

def parse_feed(feed_url, timeout=5):
    """
    Parse a single RSS feed with proper timeout handling
    Uses requests with timeout first, then passes content to feedparser
    """
    try:
        # Use requests with timeout to fetch the RSS content
        response = requests.get(feed_url, timeout=timeout)
        response.raise_for_status()
        
        # Then parse the content with feedparser (which doesn't support timeout)
        feed = feedparser.parse(response.content)
        
        # Basic validation of the feed
        if hasattr(feed, 'entries') and feed.entries:
            return feed
        else:
            logger.warning(f"Feed {feed_url} parsed but contains no entries")
            return None
            
    except requests.exceptions.Timeout:
        logger.warning(f"Timeout while fetching feed {feed_url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error fetching feed {feed_url}: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Error parsing feed {feed_url}: {str(e)}")
        return None

def fetch_all_feeds(feeds_list=None, max_workers=5, timeout=5):
    """
    Fetch multiple RSS feeds with proper timeout handling
    Returns a list of (domain, feed) tuples for successfully fetched feeds
    """
    # Use default RSS_FEEDS list if none provided
    if feeds_list is None:
        feeds_list = RSS_FEEDS
    
    results = []
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(parse_feed, url, timeout): url for url in feeds_list}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                feed = future.result()
                if feed and hasattr(feed, 'entries') and feed.entries:
                    # Extract domain for source attribution
                    domain = urlparse(url).netloc
                    results.append((domain, feed))
                    logger.info(f"Successfully fetched {domain} with {len(feed.entries)} entries")
            except Exception as e:
                logger.error(f"Error processing {url}: {str(e)}")
    
    return results

def extract_date(entry):
    """Extract and normalize publication date from entry"""
    for date_field in ['published_parsed', 'updated_parsed', 'created_parsed']:
        if hasattr(entry, date_field) and getattr(entry, date_field):
            try:
                # Convert time tuple to datetime
                time_tuple = getattr(entry, date_field)
                return datetime(time_tuple[0], time_tuple[1], time_tuple[2], 
                               time_tuple[3], time_tuple[4], time_tuple[5])
            except Exception as e:
                logger.debug(f"Error parsing {date_field}: {e}")
                continue
    
    # Try string dates
    for date_field in ['published', 'updated', 'pubDate']:
        if hasattr(entry, date_field) and getattr(entry, date_field):
            try:
                date_str = getattr(entry, date_field)
                # Try various formats
                for fmt in ['%a, %d %b %Y %H:%M:%S %z', '%a, %d %b %Y %H:%M:%S %Z', 
                           '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z']:
                    try:
                        return datetime.strptime(date_str, fmt)
                    except:
                        continue
            except Exception as e:
                logger.debug(f"Error parsing date string {date_field}: {e}")
                continue
    
    # Default to current time if parsing fails
    return datetime.now()

def is_recent(entry_date, max_days=3):
    """Check if an entry is recent (within the last few days)"""
    if not entry_date:
        return False
    cutoff = datetime.now() - timedelta(days=max_days)
    return entry_date > cutoff

def get_entry_relevance(entry, query_terms, domain):
    """Calculate relevance score for an entry based on query match and recency"""
    if not hasattr(entry, 'title') or not entry.title:
        return 0
    
    # Extract text content
    title = entry.title or ""
    description = clean_html(entry.description) if hasattr(entry, 'description') else ""
    content = ""
    if hasattr(entry, 'content'):
        for content_item in entry.content:
            if 'value' in content_item:
                content += clean_html(content_item['value']) + " "
    
    # Extract published date
    pub_date = extract_date(entry)
    
    # Calculate recency score (0-1)
    recency_score = 0
    if pub_date:
        days_old = (datetime.now() - pub_date).days
        if days_old <= 1:  # Today or yesterday
            recency_score = 1.0
        elif days_old <= 2:
            recency_score = 0.8
        elif days_old <= 3:
            recency_score = 0.5
        else:
            recency_score = 0.2
    
    # Calculate relevance score based on keyword matches
    text = f"{title} {description} {content}".lower()
    
    # Count how many query terms appear in the content
    query_terms_lower = [term.lower() for term in query_terms]
    matches = sum(1 for term in query_terms_lower if term in text)
    
    # Calculate match score (0-1)
    match_score = min(1.0, matches / max(1, len(query_terms) * 0.7))
    
    # Boost score for exact phrase matches
    query_phrase = " ".join(query_terms_lower)
    if query_phrase in text:
        match_score += 0.5
    
    # Additional boost for title matches (they're more relevant)
    title_matches = sum(1 for term in query_terms_lower if term in title.lower())
    if title_matches > 0:
        match_score += 0.2 * (title_matches / len(query_terms_lower))
    
    # Source quality factor (can be adjusted based on source reliability)
    source_factor = 1.0
    high_quality_domains = ['bbc.co.uk', 'nytimes.com', 'reuters.com', 'washingtonpost.com', 
                           'espncricinfo.com', 'cricbuzz.com', 'snopes.com']
    if any(quality_domain in domain for quality_domain in high_quality_domains):
        source_factor = 1.2
    
    # Calculate final score
    final_score = (match_score * 0.6) + (recency_score * 0.4) * source_factor
    
    return min(1.0, final_score)  # Cap at 1.0

def retrieve_evidence_from_rss(claim, max_results=3, category_feeds=None):
    """
    Retrieve evidence from RSS feeds for a given claim
    
    Args:
        claim (str): The claim to verify
        max_results (int): Maximum number of results to return
        category_feeds (list, optional): List of category-specific RSS feeds to check
        
    Returns:
        list: List of relevant evidence items
    """
    start_time = time.time()
    logger.info(f"Retrieving evidence from RSS feeds for: {claim}")
    
    # Extract key terms from claim
    terms = [term.strip() for term in re.findall(r'\b\w+\b', claim) if len(term.strip()) > 2]
    
    try:
        # Use category-specific feeds if provided
        feeds_to_use = category_feeds if category_feeds else RSS_FEEDS
        
        # Log which feeds we're using
        if category_feeds:
            logger.info(f"Using {len(category_feeds)} category-specific RSS feeds")
        else:
            logger.info(f"Using {len(RSS_FEEDS)} default RSS feeds")
        
        # Limit the number of feeds to process for efficiency
        if len(feeds_to_use) > 10:
            # If we have too many feeds, select a subset
            # Prioritize fact-checking sources
            fact_check_feeds = [feed for feed in feeds_to_use if "fact" in feed.lower() or "snopes" in feed.lower() or "politifact" in feed.lower()]
            other_feeds = [feed for feed in feeds_to_use if feed not in fact_check_feeds]
            
            # Take all fact-checking feeds plus a random selection of others
            import random
            selected_feeds = fact_check_feeds + random.sample(other_feeds, min(10 - len(fact_check_feeds), len(other_feeds)))
        else:
            selected_feeds = feeds_to_use
            
        # Fetch all feeds in parallel with the selected feeds
        feeds = fetch_all_feeds(selected_feeds)
        
        if not feeds:
            logger.warning("No RSS feeds could be fetched")
            return []
        
        all_entries = []
        
        # Process all feed entries
        for domain, feed in feeds:
            for entry in feed.entries:
                # Calculate relevance score
                relevance = get_entry_relevance(entry, terms, domain)
                
                if relevance > 0.3:  # Only consider somewhat relevant entries
                    # Extract entry details
                    title = entry.title if hasattr(entry, 'title') else "No title"
                    link = entry.link if hasattr(entry, 'link') else ""
                    
                    # Extract and clean description/content
                    description = ""
                    if hasattr(entry, 'description'):
                        description = clean_html(entry.description)
                    elif hasattr(entry, 'summary'):
                        description = clean_html(entry.summary)
                    elif hasattr(entry, 'content'):
                        for content_item in entry.content:
                            if 'value' in content_item:
                                description += clean_html(content_item['value']) + " "
                    
                    # Truncate description if too long
                    if len(description) > 250:
                        description = description[:247] + "..."
                    
                    # Get publication date
                    pub_date = extract_date(entry)
                    date_str = pub_date.strftime('%Y-%m-%d') if pub_date else "Unknown date"
                    
                    # Format as evidence text
                    evidence_text = (
                        f"Title: {title}, "
                        f"Source: {domain} (RSS), "
                        f"Date: {date_str}, "
                        f"URL: {link}, "
                        f"Content: {description}"
                    )
                    
                    all_entries.append({
                        "text": evidence_text,
                        "relevance": relevance,
                        "date": pub_date or datetime.now()
                    })
        
        # Sort entries by relevance
        all_entries.sort(key=lambda x: x["relevance"], reverse=True)
        
        # Take top results
        top_entries = all_entries[:max_results]
        
        logger.info(f"Retrieved {len(top_entries)} relevant RSS items from {len(feeds)} feeds in {time.time() - start_time:.2f}s")
        
        # Return just the text portion
        return [entry["text"] for entry in top_entries]
    
    except Exception as e:
        logger.error(f"Error in RSS retrieval: {str(e)}")
        return []