ankanghosh committed · verified
Commit 5dc3509 · 1 Parent(s): 132ab9e

Upload 12 files
modules/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """
+ Modules package initialization.
+
+ This package contains the core modules for the AskVeracity fact-checking system.
+ """
+
+ from .claim_extraction import extract_claims, shorten_claim_for_evidence
+ from .evidence_retrieval import retrieve_combined_evidence
+ from .classification import classify_with_llm, aggregate_evidence
+ from .explanation import generate_explanation
+
+ __all__ = [
+     'extract_claims',
+     'shorten_claim_for_evidence',
+     'retrieve_combined_evidence',
+     'classify_with_llm',
+     'aggregate_evidence',
+     'generate_explanation'
+ ]
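Taken together, these re-exports define the package's public pipeline surface. A minimal end-to-end sketch follows; the call signatures of retrieve_combined_evidence, aggregate_evidence, and generate_explanation are assumed from their names and this file alone, and are not confirmed by the commit.

# Hypothetical driver; everything marked "assumed" is not confirmed by this file
from modules import (
    extract_claims, retrieve_combined_evidence,
    classify_with_llm, aggregate_evidence, generate_explanation,
)

claim = extract_claims("Long user-submitted text containing a factual claim ...")
evidence = retrieve_combined_evidence(claim)        # assumed: claim text -> list of evidence items
classified = classify_with_llm(claim, evidence)     # per-evidence classification (see classification.py)
verdict = aggregate_evidence(classified)            # assumed: classified items -> overall verdict
explanation = generate_explanation(claim, verdict)  # assumed argument order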
modules/category_detection.py ADDED
@@ -0,0 +1,875 @@
+ import logging
+ import re
+ from typing import Tuple, List, Dict, Optional, Any
+
+ # Set up logging
+ logger = logging.getLogger("misinformation_detector")
+
+ # Define categories and their keywords
+ CLAIM_CATEGORIES = {
+     "ai": [
+         # General AI terms
+         "AI", "artificial intelligence", "machine learning", "ML", "deep learning", "DL",
+         "neural network", "neural nets", "generative AI", "GenAI", "AGI", "artificial general intelligence",
+         "transformer", "attention mechanism", "fine-tuning", "pre-training", "training", "inference",
+
+         # AI Models and Architectures
+         "language model", "large language model", "LLM", "foundation model", "multimodal model",
+         "vision language model", "VLM", "text-to-speech", "TTS", "speech-to-text", "STT",
+         "text-to-image", "image-to-text", "diffusion model", "generative model", "discriminative model",
+         "GPT", "BERT", "T5", "PaLM", "Claude", "Llama", "Gemini", "Mistral", "Mixtral", "Stable Diffusion",
+         "Dall-E", "Midjourney", "Sora", "transformer", "MoE", "mixture of experts", "sparse model",
+         "dense model", "encoder", "decoder", "encoder-decoder", "autoencoder", "VAE",
+         "mixture of experts", "MoE", "sparse MoE", "switch transformer", "gated experts",
+         "routing network", "expert routing", "pathways", "multi-query attention", "multi-head attention",
+         "rotary position embedding", "RoPE", "grouped-query attention", "GQA", "flash attention",
+         "state space model", "SSM", "mamba", "recurrent neural network", "RNN", "LSTM", "GRU",
+         "convolutional neural network", "CNN", "residual connection", "skip connection", "normalization",
+         "layer norm", "group norm", "batch norm", "parameter efficient fine-tuning", "PEFT",
+         "LoRA", "low-rank adaptation", "QLoRA", "adapters", "prompt tuning", "prefix tuning",
+
+         # AI Learning Paradigms
+         "supervised learning", "unsupervised learning", "reinforcement learning", "RL",
+         "meta-learning", "transfer learning", "federated learning", "self-supervised learning",
+         "semi-supervised learning", "few-shot learning", "zero-shot learning", "one-shot learning",
+         "contrastive learning", "curriculum learning", "imitation learning", "active learning",
+         "reinforcement learning from human feedback", "RLHF", "direct preference optimization", "DPO",
+         "constitutional AI", "red teaming", "adversarial training", "GAN", "generative adversarial network",
+         "diffusion", "latent diffusion", "flow-based model", "variational autoencoder", "VAE",
+
+         # AI Capabilities and Applications
+         "natural language processing", "NLP", "computer vision", "CV", "speech recognition",
+         "text generation", "image generation", "video generation", "multimodal", "multi-modal",
+         "recommendation system", "recommender system", "chatbot", "conversational AI",
+         "sentiment analysis", "entity recognition", "semantic search", "vector search", "embedding",
+         "classification", "regression", "clustering", "anomaly detection", "agent", "AI agent",
+         "autonomous agent", "agentic", "RAG", "retrieval augmented generation", "tool use",
+         "function calling", "reasoning", "chain-of-thought", "CoT", "tree-of-thought", "ToT",
+         "planning", "decision making", "multi-agent", "agent swarm", "multi-agent simulation",
+
+         # AI Technical Terms
+         "token", "tokenizer", "tokenization", "embedding", "vector", "prompt", "prompt engineering",
+         "context window", "parameter", "weights", "bias", "activation function", "loss function",
+         "gradient descent", "backpropagation", "epoch", "batch", "mini-batch", "regularization",
+         "dropout", "overfitting", "underfitting", "hyperparameter", "latent space", "latent variable",
+         "feature extraction", "dimensionality reduction", "optimization", "quantization", "pruning",
+         "fine-tuning", "transfer learning", "knowledge distillation", "int4", "int8", "bfloat16",
+         "float16", "mixed precision", "GPTQ", "AWQ", "GGUF", "GGML", "KV cache", "speculative decoding",
+         "beam search", "greedy decoding", "temperature", "top-k", "top-p", "nucleus sampling",
+
+         # AI Tools and Frameworks
+         "TensorFlow", "PyTorch", "JAX", "Keras", "Hugging Face", "Transformers", "Diffusers",
+         "LangChain", "Llama Index", "OpenAI", "Anthropic", "NVIDIA", "GPU", "TPU", "IPU", "NPU", "CUDA",
+         "MLOps", "model monitoring", "model deployment", "model serving", "inference endpoint",
+         "vLLM", "TGI", "text generation inference", "triton", "onnx", "tensorRT",
+
+         # AI Ethics and Concerns
+         "AI ethics", "responsible AI", "AI safety", "AI alignment", "AI governance",
+         "bias", "fairness", "interpretability", "explainability", "XAI", "transparency",
+         "hallucination", "toxicity", "safe deployment", "AI risk", "AI capabilities",
+         "alignment tax", "red teaming", "jailbreak", "prompt injection", "data poisoning",
+
+         # AI Companies and Organizations
+         "OpenAI", "Anthropic", "Google DeepMind", "Meta AI", "Microsoft", "NVIDIA",
+         "Hugging Face", "Mistral AI", "Cohere", "AI21 Labs", "Stability AI", "Midjourney",
+         "EleutherAI", "Allen AI", "DeepMind", "Character AI", "Inflection AI", "xAI"
+     ],
+
+     "science": [
+         # General scientific terms
+         "study", "research", "scientist", "scientific", "discovered", "experiment",
+         "laboratory", "clinical", "trial", "hypothesis", "theory", "evidence-based",
+         "peer-reviewed", "journal", "publication", "finding", "breakthrough", "innovation",
+         "discovery", "analysis", "data", "measurement", "observation", "empirical",
+
+         # Biology and medicine
+         "biology", "chemistry", "physics", "genetics", "genomics", "DNA", "RNA",
+         "medicine", "gene", "protein", "molecule", "cell", "brain", "neuro",
+         "cancer", "disease", "cure", "treatment", "vaccine", "health", "medical",
+         "pharmaceutical", "drug", "therapy", "symptom", "diagnosis", "prognosis",
+         "patient", "doctor", "hospital", "clinic", "surgery", "immune", "antibody",
+         "virus", "bacteria", "pathogen", "infection", "epidemic", "pandemic",
+         "organism", "evolution", "mutation", "chromosome", "enzyme", "hormone",
+
+         # Physics and astronomy
+         "quantum", "particle", "atom", "nuclear", "electron", "neutron", "proton",
+         "atomic", "subatomic", "molecular", "energy", "matter", "mass", "force",
+         "space", "NASA", "telescope", "planet", "exoplanet", "moon", "lunar", "mars",
+         "star", "galaxy", "cosmic", "astronomical", "universe", "solar", "celestial",
+         "orbit", "gravitational", "gravity", "relativity", "quantum mechanics",
+         "string theory", "dark matter", "dark energy", "black hole", "supernova",
+         "radiation", "radioactive", "isotope", "fission", "fusion", "accelerator",
+
+         # Environmental science
+         "climate", "carbon", "environment", "ecosystem", "species", "extinct",
+         "endangered", "biodiversity", "conservation", "sustainable", "renewable",
+         "fossil fuel", "greenhouse", "global warming", "polar", "ice cap", "glacier",
+         "ozone", "atmosphere", "weather", "meteorology", "geology", "earthquake",
+         "volcanic", "ocean", "marine", "coral reef", "deforestation", "pollution",
+
+         # Math and computer science (non-AI specific)
+         "equation", "formula", "theorem", "calculus", "statistical", "probability",
+         "dataset", "parameter", "variable", "function", "matrix", "optimization",
+
+         # Organizations
+         "CERN", "NIH", "CDC", "WHO", "NOAA", "ESA", "SpaceX", "Blue Origin", "JPL",
+         "laboratory", "institute", "university", "academic", "faculty", "professor",
+
+         # Science tools
+         "Matlab", "SPSS", "SAS", "ImageJ", "LabVIEW", "ANSYS", "Cadence", "Origin",
+         "Avogadro", "ChemDraw", "Mathematica", "Wolfram Alpha", "COMSOL", "LAMMPS",
+         "VASP", "Gaussian", "GIS", "ArcGIS", "QGIS", "Maple", "R Studio"
+     ],
+
+     "technology": [
+         # General tech terms
+         "computer", "software", "hardware", "internet", "cyber", "digital", "tech",
+         "robot", "automation", "autonomous", "code", "programming", "data", "cloud",
+         "server", "network", "encryption", "blockchain", "crypto", "bitcoin", "ethereum",
+         "technology", "innovation", "breakthrough", "prototype", "development",
+         "engineering", "technical", "specification", "feature", "functionality",
+         "interface", "system", "infrastructure", "integration", "implementation",
+
+         # Devices and hardware
+         "smartphone", "device", "gadget", "laptop", "desktop", "tablet", "wearable",
+         "smartwatch", "IoT", "internet of things", "sensor", "chip", "semiconductor",
+         "processor", "CPU", "GPU", "memory", "RAM", "storage", "hard drive", "SSD",
+         "electronic", "circuit", "motherboard", "component", "peripheral", "accessory",
+         "display", "screen", "touchscreen", "camera", "lens", "microphone", "speaker",
+         "battery", "charger", "wireless", "bluetooth", "WiFi", "router", "modem",
+
+         # Software and internet
+         "app", "application", "platform", "website", "online", "web", "browser",
+         "operating system", "Windows", "macOS", "Linux", "Android", "iOS", "software",
+         "program", "code", "coding", "development", "framework", "library", "API",
+         "interface", "backend", "frontend", "full-stack", "developer", "programmer",
+         "database", "SQL", "NoSQL", "cloud computing", "SaaS", "PaaS", "IaaS",
+         "DevOps", "agile", "scrum", "sprint", "version control", "git", "repository",
+
+         # Communications and networking
+         "5G", "6G", "broadband", "fiber", "network", "wireless", "cellular", "mobile",
+         "telecommunications", "telecom", "transmission", "bandwidth", "latency",
+         "protocol", "IP address", "DNS", "server", "hosting", "data center",
+
+         # Company and product names
+         "Apple", "Google", "Microsoft", "Amazon", "Facebook", "Meta", "Tesla",
+         "IBM", "Intel", "AMD", "Nvidia", "Qualcomm", "Cisco", "Oracle", "SAP",
+         "Huawei", "Samsung", "Sony", "LG", "Dell", "HP", "Lenovo", "Xiaomi",
+         "iPhone", "iPad", "MacBook", "Surface", "Galaxy", "Pixel", "Windows",
+         "Android", "iOS", "Chrome", "Firefox", "Edge", "Safari", "Office",
+         "Azure", "AWS", "Google Cloud", "Gmail", "Outlook", "Teams", "Zoom",
+
+         # Advanced technologies
+         "VR", "AR", "XR", "virtual reality", "augmented reality", "mixed reality",
+         "metaverse", "3D printing", "additive manufacturing", "quantum computing",
+         "nanotechnology", "biotechnology", "electric vehicle", "self-driving",
+         "autonomous vehicle", "drone", "UAV", "robotics", "cybersecurity",
+
+         # Social media
+         "social media", "social network", "Facebook", "Instagram", "Twitter", "X",
+         "LinkedIn", "TikTok", "Snapchat", "YouTube", "Pinterest", "Reddit",
+         "streaming", "content creator", "influencer", "follower", "like", "share",
+         "post", "tweet", "user-generated", "viral", "trending", "engagement",
+
+         # Technology tools
+         "NumPy", "Pandas", "Matplotlib", "Seaborn", "Scikit-learn", "Jupyter",
+         "Visual Studio", "VS Code", "IntelliJ", "PyCharm", "Eclipse", "Android Studio",
+         "Xcode", "Docker", "Kubernetes", "Jenkins", "Ansible", "Terraform", "Vagrant",
+         "AWS CLI", "Azure CLI", "GCP CLI", "PowerShell", "Bash", "npm", "pip", "conda",
+         "React", "Angular", "Vue.js", "Node.js", "Django", "Flask", "Spring", "Laravel",
+         "PostgreSQL", "MySQL", "MongoDB", "Redis", "Elasticsearch", "Kafka", "RabbitMQ",
+
+         # Optimization terms
+         "optimization", "efficiency", "performance tuning", "benchmarking", "profiling",
+         "refactoring", "scaling", "bottleneck", "throughput", "latency reduction",
+         "response time", "caching", "load balancing", "distributed computing",
+         "parallel processing", "concurrency", "asynchronous", "memory management"
+     ],
+
+     "politics": [
+         # Government structure
+         "president", "prime minister", "government", "parliament", "congress",
+         "senate", "house", "representative", "minister", "secretary", "cabinet",
+         "administration", "mayor", "governor", "politician", "official", "authority",
+         "federal", "state", "local", "municipal", "county", "city", "town",
+         "constituency", "district", "precinct", "ward", "judiciary", "executive",
+         "legislative", "branch", "checks and balances", "separation of powers",
+
+         # Political activities
+         "policy", "election", "campaign", "vote", "voter", "ballot", "polling",
+         "political", "politics", "debate", "speech", "address", "press conference",
+         "approval rating", "opinion poll", "candidate", "incumbent", "challenger",
+         "primary", "caucus", "convention", "delegate", "nomination", "campaign trail",
+         "fundraising", "lobbying", "advocacy", "activism", "protest", "demonstration",
+
+         # Political ideologies
+         "democracy", "democratic", "republican", "conservative", "liberal",
+         "progressive", "left-wing", "right-wing", "centrist", "moderate",
+         "socialist", "capitalist", "communist", "libertarian", "populist",
+         "nationalist", "globalist", "isolationist", "hawk", "dove",
+         "ideology", "partisan", "bipartisan", "coalition", "majority", "minority",
+
+         # Laws and regulations
+         "bill", "law", "legislation", "regulation", "policy", "statute", "code",
+         "amendment", "reform", "repeal", "enact", "implement", "enforce",
+         "constitutional", "unconstitutional", "legal", "illegal", "legalize",
+         "criminalize", "deregulate", "regulatory", "compliance", "mandate",
+
+         # Judicial and legal
+         "court", "supreme", "justice", "judge", "ruling", "decision", "opinion",
+         "case", "lawsuit", "litigation", "plaintiff", "defendant", "prosecutor",
+         "attorney", "lawyer", "advocate", "judicial review", "precedent",
+         "constitution", "amendment", "rights", "civil rights", "human rights",
+
+         # International relations
+         "treaty", "international", "diplomatic", "diplomacy", "relations",
+         "foreign policy", "domestic policy", "UN", "NATO", "EU", "United Nations",
+         "sanctions", "embargo", "tariff", "trade war", "diplomat", "embassy",
+         "consulate", "ambassador", "delegation", "summit", "bilateral", "multilateral",
+         "alliance", "ally", "adversary", "geopolitical", "sovereignty", "regime",
+
+         # Security and defense
+         "national security", "homeland security", "defense", "military", "armed forces",
+         "army", "navy", "air force", "marines", "coast guard", "intelligence",
+         "CIA", "FBI", "NSA", "Pentagon", "war", "conflict", "peacekeeping",
+         "terrorism", "counterterrorism", "insurgency", "nuclear weapon", "missile",
+         "disarmament", "nonproliferation", "surveillance", "espionage",
+
+         # Political institutions
+         "White House", "Kremlin", "Downing Street", "Capitol Hill", "Westminster",
+         "United Nations", "European Union", "NATO", "World Bank", "IMF", "WTO",
+         "ASEAN", "African Union", "BRICS", "G7", "G20",
+
+         # Political parties and movements
+         "Democrat", "Republican", "Labour", "Conservative", "Green Party",
+         "Socialist", "Communist", "Libertarian", "Independent", "Tea Party",
+         "progressive movement", "civil rights movement", "women's rights",
+         "LGBTQ rights", "Black Lives Matter", "environmental movement"
+     ],
+
+     "business": [
+         # Companies and organization types
+         "company", "corporation", "business", "startup", "firm", "enterprise",
+         "corporate", "industry", "sector", "conglomerate", "multinational",
+         "organization", "entity", "private", "public", "incorporated", "LLC",
+         "partnership", "proprietorship", "franchise", "subsidiary", "parent company",
+         "headquarters", "office", "facility", "plant", "factory", "warehouse",
+         "retail", "wholesale", "ecommerce", "brick-and-mortar", "chain", "outlet",
+
+         # Business roles and management
+         "executive", "CEO", "CFO", "CTO", "COO", "CMO", "CIO", "CHRO", "chief",
+         "director", "board", "chairman", "chairwoman", "chairperson", "president",
+         "vice president", "senior", "junior", "manager", "management", "supervisor",
+         "founder", "entrepreneur", "owner", "shareholder", "stakeholder",
+         "employee", "staff", "workforce", "personnel", "human resources", "HR",
+         "recruit", "hire", "layoff", "downsizing", "restructuring", "reorganization",
+
+         # Financial terms
+         "profit", "revenue", "sales", "income", "earnings", "EBITDA", "turnover",
+         "loss", "deficit", "expense", "cost", "overhead", "margin", "markup",
+         "budget", "forecast", "projection", "estimate", "actual", "variance",
+         "balance sheet", "income statement", "cash flow", "P&L", "liquidity",
+         "solvency", "asset", "liability", "equity", "debt", "leverage", "capital",
+         "working capital", "cash", "funds", "money", "payment", "transaction",
+
+         # Markets and trading
+         "market", "stock", "share", "bond", "security", "commodity", "futures",
+         "option", "derivative", "forex", "foreign exchange", "currency", "crypto",
+         "trader", "trading", "buy", "sell", "long", "short", "position", "portfolio",
+         "diversification", "hedge", "risk", "return", "yield", "dividend", "interest",
+         "bull market", "bear market", "correction", "crash", "rally", "volatile",
+         "volatility", "index", "benchmark", "Dow Jones", "NASDAQ", "S&P 500", "NYSE",
+
+         # Investment and funding
+         "investor", "investment", "fund", "mutual fund", "ETF", "hedge fund",
+         "private equity", "venture", "venture capital", "VC", "angel investor",
+         "seed", "Series A", "Series B", "Series C", "funding", "financing",
+         "loan", "credit", "debt", "equity", "fundraising", "crowdfunding",
+         "IPO", "initial public offering", "going public", "listed", "delisted",
+         "merger", "acquisition", "M&A", "takeover", "buyout", "divestiture",
+         "valuation", "billion", "million", "trillion", "unicorn", "decacorn",
+
+         # Economic terms
+         "economy", "economic", "economics", "macro", "micro", "fiscal", "monetary",
+         "supply", "demand", "market forces", "competition", "competitive", "monopoly",
+         "oligopoly", "antitrust", "regulation", "deregulation", "growth", "decline",
+         "recession", "depression", "recovery", "expansion", "contraction", "cycle",
+         "inflation", "deflation", "stagflation", "hyperinflation", "CPI", "price",
+         "GDP", "gross domestic product", "GNP", "productivity", "output", "input",
+
+         # Banking and finance
+         "finance", "financial", "bank", "banking", "commercial bank", "investment bank",
+         "central bank", "Federal Reserve", "Fed", "ECB", "Bank of England", "BOJ",
+         "interest rate", "prime rate", "discount rate", "basis point", "monetary policy",
+         "quantitative easing", "tightening", "loosening", "credit", "lending",
+         "borrowing", "loan", "mortgage", "consumer credit", "credit card", "debit card",
+         "checking", "savings", "deposit", "withdrawal", "ATM", "branch", "online banking",
+
+         # Currencies and payments
+         "dollar", "euro", "pound", "yen", "yuan", "rupee", "ruble", "real", "peso",
+         "currency", "money", "fiat", "exchange rate", "remittance", "transfer",
+         "payment", "transaction", "wire", "ACH", "SWIFT", "clearing", "settlement",
+         "cryptocurrency", "bitcoin", "ethereum", "blockchain", "fintech", "paytech",
+
+         # Business operations
+         "product", "service", "solution", "offering", "launch", "rollout", "release",
+         "operation", "production", "manufacturing", "supply chain", "logistics",
+         "procurement", "inventory", "distribution", "shipping", "delivery",
+         "quality", "control", "assurance", "standard", "certification", "compliance",
+         "process", "procedure", "workflow", "efficiency", "optimization",
+
+         # Marketing and sales
+         "marketing", "advertise", "advertising", "campaign", "promotion", "publicity",
+         "PR", "public relations", "brand", "branding", "identity", "image", "reputation",
+         "sales", "selling", "deal", "transaction", "pipeline", "lead", "prospect",
+         "customer", "client", "consumer", "buyer", "purchaser", "target market",
+         "segment", "demographic", "psychographic", "B2B", "B2C", "retail", "wholesale",
+         "price", "pricing", "discount", "premium", "luxury", "value", "bargain"
+     ],
+
+     "world": [
+         # General international terms
+         "country", "nation", "state", "republic", "kingdom", "global", "international",
+         "foreign", "world", "worldwide", "domestic", "abroad", "overseas",
+         "developed", "developing", "industrialized", "emerging", "third world",
+         "global south", "global north", "east", "west", "western", "eastern",
+         "bilateral", "multilateral", "transnational", "multinational", "sovereignty",
+
+         # Regions and continents
+         "Europe", "European", "Asia", "Asian", "Africa", "African", "North America",
+         "South America", "Latin America", "Australia", "Oceania", "Antarctica",
+         "Middle East", "Central Asia", "Southeast Asia", "East Asia", "South Asia",
+         "Eastern Europe", "Western Europe", "Northern Europe", "Southern Europe",
+         "Mediterranean", "Scandinavia", "Nordic", "Baltic", "Balkans", "Caucasus",
+         "Caribbean", "Central America", "South Pacific", "Polynesia", "Micronesia",
+
+         # Major countries and regions
+         "China", "Chinese", "Russia", "Russian", "India", "Indian", "Japan", "Japanese",
+         "UK", "British", "England", "English", "Scotland", "Scottish", "Wales", "Welsh",
+         "Germany", "German", "France", "French", "Italy", "Italian", "Spain", "Spanish",
+         "Canada", "Canadian", "Brazil", "Brazilian", "Mexico", "Mexican", "Turkey", "Turkish",
+         "United States", "US", "USA", "American", "Britain", "Korea", "Korean",
+         "North Korea", "South Korea", "Saudi", "Saudi Arabia", "Saudi Arabian",
+         "Iran", "Iranian", "Iraq", "Iraqi", "Israel", "Israeli", "Palestine", "Palestinian",
+         "Egypt", "Egyptian", "Pakistan", "Pakistani", "Indonesia", "Indonesian",
+         "Australia", "Australian", "New Zealand", "Nigeria", "Nigerian", "South Africa",
+         "Argentina", "Argentinian", "Colombia", "Colombian", "Venezuela", "Venezuelan",
+         "Ukraine", "Ukrainian", "Poland", "Polish", "Switzerland", "Swiss",
+         "Netherlands", "Dutch", "Belgium", "Belgian", "Sweden", "Swedish", "Norway", "Norwegian",
+
+         # International issues and topics
+         "war", "conflict", "crisis", "tension", "dispute", "hostility", "peace",
+         "peacekeeping", "ceasefire", "truce", "armistice", "treaty", "agreement",
+         "compromise", "negotiation", "mediation", "resolution", "settlement",
+         "refugee", "migrant", "asylum seeker", "displacement", "humanitarian",
+         "border", "frontier", "territory", "territorial", "sovereignty", "jurisdiction",
+         "terror", "terrorism", "extremism", "radicalism", "insurgency", "militant",
+         "sanction", "embargo", "restriction", "isolation", "blockade",
+
+         # International trade and economy
+         "trade", "import", "export", "tariff", "duty", "quota", "subsidy",
+         "protectionism", "free trade", "fair trade", "globalization", "trade war",
+         "trade agreement", "trade deal", "trade deficit", "trade surplus",
+         "supply chain", "outsourcing", "offshoring", "reshoring", "nearshoring",
+
+         # Diplomacy and international relations
+         "embassy", "consulate", "diplomatic", "diplomacy", "diplomat", "ambassador",
+         "consul", "attaché", "envoy", "emissary", "delegation", "mission",
+         "foreign policy", "international relations", "geopolitics", "geopolitical",
+         "influence", "power", "superpower", "hegemony", "alliance", "coalition",
+         "bloc", "axis", "sphere of influence", "buffer state", "proxy",
+
+         # International organizations
+         "UN", "United Nations", "EU", "European Union", "NATO", "NAFTA", "USMCA",
+         "ASEAN", "OPEC", "Commonwealth", "Arab League", "African Union", "AU",
+         "BRICS", "G7", "G20", "IMF", "World Bank", "WTO", "WHO", "UNESCO",
+         "Security Council", "General Assembly", "International Court of Justice",
+
+         # Travel and cultural exchange
+         "visa", "passport", "immigration", "emigration", "migration", "travel",
+         "tourism", "tourist", "visitor", "foreigner", "expatriate", "expat",
+         "citizenship", "nationality", "dual citizen", "naturalization",
+         "cultural", "tradition", "heritage", "indigenous", "native", "local",
+         "language", "dialect", "translation", "interpreter", "cross-cultural"
+     ],
+
+     "sports": [
+         # General sports terms
+         "game", "match", "tournament", "championship", "league", "cup", "Olympics",
+         "olympic", "world cup", "competition", "contest", "event", "series",
+         "sport", "sporting", "athletics", "physical", "play", "compete", "competition",
+         "amateur", "professional", "pro", "season", "preseason", "regular season",
+         "postseason", "playoff", "final", "semifinal", "quarterfinal", "qualifying",
+
+         # Team sports
+         "football", "soccer", "American football", "rugby", "basketball", "baseball",
+         "cricket", "hockey", "ice hockey", "field hockey", "volleyball", "handball",
+         "water polo", "lacrosse", "ultimate frisbee", "netball", "kabaddi",
+         "team", "club", "franchise", "squad", "roster", "lineup", "formation",
+         "player", "coach", "manager", "trainer", "captain", "starter", "substitute",
+         "bench", "draft", "trade", "free agent", "contract", "transfer", "loan",
+
+         # Individual sports
+         "tennis", "golf", "boxing", "wrestling", "martial arts", "MMA", "UFC",
+         "athletics", "track and field", "swimming", "diving", "gymnastics",
+         "skiing", "snowboarding", "skating", "figure skating", "speed skating",
+         "cycling", "mountain biking", "BMX", "motorsport", "F1", "Formula 1",
+         "NASCAR", "IndyCar", "MotoGP", "rally", "marathon", "triathlon", "decathlon",
+         "archery", "shooting", "fencing", "equestrian", "rowing", "canoeing", "kayaking",
+         "surfing", "skateboarding", "climbing", "bouldering", "weightlifting",
+
+         # Scoring and results
+         "score", "point", "goal", "touchdown", "basket", "run", "wicket", "try",
+         "win", "lose", "draw", "tie", "defeat", "victory", "champion", "winner",
+         "loser", "runner-up", "finalist", "semifinalist", "eliminated", "advance",
+         "qualify", "record", "personal best", "world record", "Olympic record",
+         "streak", "undefeated", "unbeaten", "perfect season", "comeback",
+
+         # Performance and training
+         "fitness", "training", "practice", "drill", "workout", "exercise", "regime",
+         "conditioning", "strength", "endurance", "speed", "agility", "flexibility",
+         "skill", "technique", "form", "style", "strategy", "tactic", "playbook",
+         "offense", "defense", "attack", "counter", "press", "formation",
+         "injury", "rehabilitation", "recovery", "physiotherapy", "sports medicine",
+
+         # Sports infrastructure
+         "stadium", "arena", "court", "field", "pitch", "rink", "pool", "track",
+         "course", "gymnasium", "gym", "complex", "venue", "facility", "locker room",
+         "dugout", "bench", "sideline", "grandstand", "spectator", "fan", "supporter",
+
+         # Sports organizations and competitions
+         "medal", "gold", "silver", "bronze", "podium", "Olympics", "Paralympic",
+         "commonwealth games", "Asian games", "Pan American games", "world championship",
+         "grand slam", "masters", "open", "invitational", "classic", "tour", "circuit",
+         "IPL", "Indian Premier League", "MLB", "Major League Baseball",
+         "NBA", "National Basketball Association", "NFL", "National Football League",
+         "NHL", "National Hockey League", "FIFA", "UEFA", "ATP", "WTA", "ICC",
+         "Premier League", "La Liga", "Bundesliga", "Serie A", "Ligue 1", "MLS",
+         "Champions League", "Europa League", "Super Bowl", "World Series", "Stanley Cup",
+         "NCAA", "collegiate", "college", "university", "varsity", "intramural",
+
+         # Sports media and business
+         "broadcast", "coverage", "commentator", "announcer", "pundit", "analyst",
+         "highlight", "replay", "sports network", "ESPN", "Sky Sports", "Fox Sports",
+         "sponsorship", "endorsement", "advertisement", "merchandise", "jersey", "kit",
+         "ticket", "season ticket", "box seat", "premium", "concession", "vendor"
+     ],
+
+     "entertainment": [
+         # Film and cinema
+         "movie", "film", "cinema", "feature", "short film", "documentary", "animation",
+         "blockbuster", "indie", "independent film", "foreign film", "box office",
+         "screening", "premiere", "release", "theatrical", "stream", "streaming",
+         "director", "producer", "screenwriter", "script", "screenplay", "adaptation",
+         "cinematography", "cinematographer", "editing", "editor", "visual effects",
+         "special effects", "CGI", "motion capture", "sound design", "soundtrack",
+         "score", "composer", "scene", "shot", "take", "cut", "sequel", "prequel",
+         "trilogy", "franchise", "universe", "reboot", "remake", "spin-off",
+         "genre", "action", "comedy", "drama", "thriller", "horror", "sci-fi",
+         "science fiction", "fantasy", "romance", "romantic comedy", "rom-com",
+         "mystery", "crime", "western", "historical", "biographical", "biopic",
+
+         # Television
+         "TV", "television", "show", "series", "episode", "season", "pilot",
+         "finale", "midseason", "sitcom", "drama series", "miniseries", "limited series",
+         "anthology", "reality TV", "game show", "talk show", "variety show",
+         "network", "cable", "premium cable", "broadcast", "channel", "program",
+         "primetime", "daytime", "syndication", "rerun", "renewed", "cancelled",
+         "showrunner", "creator", "writer", "TV writer", "episode writer", "staff writer",
+
+         # Performing arts
+         "actor", "actress", "performer", "cast", "casting", "star", "co-star",
+         "supporting", "lead", "protagonist", "antagonist", "villain", "hero", "anti-hero",
+         "character", "role", "performance", "portrayal", "acting", "dialogue",
+         "monologue", "line", "script", "improv", "improvisation", "stand-up",
+         "comedian", "comic", "sketch", "theater", "theatre", "stage", "Broadway",
+         "West End", "play", "musical", "opera", "ballet", "dance", "choreography",
+         "production", "rehearsal", "audition", "understudy", "troupe", "ensemble",
+
+         # Music
+         "music", "song", "track", "single", "album", "EP", "LP", "record",
+         "release", "drop", "artist", "musician", "singer", "vocalist", "band",
+         "group", "duo", "trio", "soloist", "frontman", "frontwoman", "lead singer",
+         "songwriter", "composer", "producer", "DJ", "rapper", "MC", "beatmaker",
+         "guitarist", "bassist", "drummer", "pianist", "keyboardist", "violinist",
+         "instrumentalist", "orchestra", "symphony", "philharmonic", "conductor",
+         "genre", "rock", "pop", "hip-hop", "rap", "R&B", "soul", "funk", "jazz",
+         "blues", "country", "folk", "electronic", "EDM", "dance", "techno", "house",
+         "metal", "punk", "alternative", "indie", "classical", "reggae", "latin",
+         "hit", "chart", "Billboard", "Grammy", "award-winning", "platinum", "gold",
+         "concert", "tour", "gig", "show", "performance", "live", "venue", "arena",
+         "stadium", "festival", "Coachella", "Glastonbury", "Lollapalooza", "Bonnaroo",
+
+         # Celebrity culture
+         "celebrity", "star", "fame", "famous", "A-list", "B-list", "icon", "iconic",
+         "superstar", "public figure", "household name", "stardom", "limelight",
+         "popular", "popularity", "fan", "fanbase", "followers", "stan", "groupie",
+         "paparazzi", "tabloid", "gossip", "rumor", "scandal", "controversy",
+         "interview", "press conference", "red carpet", "premiere", "gala", "award show",
+
+         # Awards and recognition
+         "award", "nominee", "nomination", "winner", "recipient", "honor", "accolade",
+         "Oscar", "Academy Award", "Emmy", "Grammy", "Tony", "Golden Globe", "BAFTA",
+         "MTV Award", "People's Choice", "Critics' Choice", "SAG Award", "Billboard Award",
+         "best actor", "best actress", "best director", "best picture", "best film",
+         "best album", "best song", "hall of fame", "lifetime achievement", "legacy",
+
+         # Media and publishing
+         "book", "novel", "fiction", "non-fiction", "memoir", "biography", "autobiography",
+         "bestseller", "bestselling", "author", "writer", "novelist", "literary",
+         "literature", "publisher", "publishing", "imprint", "edition", "volume",
+         "chapter", "page", "paragraph", "prose", "narrative", "plot", "storyline",
+         "character", "protagonist", "antagonist", "setting", "theme", "genre",
+         "mystery", "thriller", "romance", "sci-fi", "fantasy", "young adult", "YA",
+         "comic", "comic book", "graphic novel", "manga", "anime", "cartoon",
+
+         # Digital entertainment
+         "streaming", "stream", "subscription", "platform", "service", "content",
+         "Netflix", "Disney+", "Amazon Prime", "Hulu", "HBO", "HBO Max", "Apple TV+",
+         "Peacock", "Paramount+", "YouTube", "YouTube Premium", "TikTok", "Instagram",
+         "influencer", "content creator", "vlogger", "blogger", "podcaster", "podcast",
+         "episode", "download", "subscriber", "follower", "like", "share", "viral",
+         "trending", "binge-watch", "marathon", "spoiler", "recap", "review", "trailer",
+         "teaser", "behind the scenes", "BTS", "exclusive", "original"
+     ]
+ }
+
+ # Add domain-specific RSS feeds for different categories
+ CATEGORY_SPECIFIC_FEEDS = {
+     "science": [
+         # "https://www.science.org/rss/news_feeds/carousel.xml",
+         "https://www.science.org/rss/news_current.xml",
+         "https://www.nature.com/nature.rss",
+         # "https://www.scientificamerican.com/rss/",
+         "http://rss.sciam.com/basic-science",
+         # "https://rss.sciam.com/ScientificAmerican-Global",
+         "http://rss.sciam.com/ScientificAmerican-Global",
+         # "https://feeds.newscientist.com/science-news",
+         "https://www.newscientist.com/feed/home/?cmpid=RSS|NSNS-Home",
+         "https://phys.org/rss-feed/"
+     ],
+     "technology": [
+         # "https://feed.wired.com/rss/category/business/feed.rss",
+         "https://www.wired.com/feed/category/business/latest/rss",
+         "https://techcrunch.com/feed/",
+         "https://www.technologyreview.com/feed/",
+         "https://arstechnica.com/feed/",
+         "https://www.theverge.com/rss/index.xml",
+         "https://news.ycombinator.com/rss"
+     ],
+     "politics": [
+         "https://feeds.washingtonpost.com/rss/politics",
+         "https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml",
+         "https://feeds.bbci.co.uk/news/politics/rss.xml",
+         "https://www.politico.com/rss/politicopicks.xml",
+         "https://www.realclearpolitics.com/index.xml"
+     ],
+     "business": [
+         "https://www.ft.com/rss/home",
+         "https://feeds.bloomberg.com/markets/news.rss",
+         # "https://www.forbes.com/business/feed/",
+         "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
+         "https://feeds.washingtonpost.com/rss/business",
+         "https://www.entrepreneur.com/latest.rss",
+         # "https://www.cnbc.com/id/10001147/device/rss/rss.htm",
+         "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10001147",
+         "https://feeds.content.dowjones.io/public/rss/WSJcomUSBusiness",
+         "https://feeds.a.dj.com/rss/RSSMarketsMain.xml"
+     ],
+     "world": [
+         "https://feeds.bbci.co.uk/news/world/rss.xml",
+         "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
+         "https://www.aljazeera.com/xml/rss/all.xml",
+         "https://feeds.washingtonpost.com/rss/world",
+         # "https://rss.cnn.com/rss/edition_world.rss"
+         "http://rss.cnn.com/rss/cnn_world.rss"
+     ],
+     "sports": [
+         "https://www.espn.com/espn/rss/news",
+         "https://www.cbssports.com/rss/headlines/",
+         # "https://feeds.skysports.com/feeds/rss/latest.xml",
+         "https://www.espncricinfo.com/rss/content/story/feeds/0.xml",
+         "https://api.foxsports.com/v1/rss",
+         "https://www.sportingnews.com/us/rss",
+         "https://www.theguardian.com/sport/rss",
+     ],
+     "entertainment": [
+         "https://www.hollywoodreporter.com/feed/",
+         "https://variety.com/feed/",
+         # "https://feeds.eonline.com/mrss/article/",
+         "https://www.eonline.com/syndication/feeds/rssfeeds/topstories.xml",
+         "https://www.rollingstone.com/feed/",
+         "https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml"
+     ],
+     "fact_checking": [
+         "https://www.snopes.com/feed/",
+         "https://www.politifact.com/rss/all/",
+         "https://www.factcheck.org/feed/",
+         "https://leadstories.com/atom.xml",
+         # "https://apnews.com/hub/fact-check/rss",
+         # "https://apnews.com/apf-fact-check"
+         "https://fullfact.org/feed/all/",
+         "https://www.truthorfiction.com/feed/"
+     ]
+ }
+
+ # Reliability boosts for sources by category
+ SOURCE_RELIABILITY_BY_CATEGORY = {
+     "science": {
+         "nature.com": 0.95,
+         "science.org": 0.95,
+         "nih.gov": 0.95,
+         "nasa.gov": 0.95,
+         "scientificamerican.com": 0.9,
+         "newscientist.com": 0.9,
+         "pnas.org": 0.95,
+         "cell.com": 0.95,
+         "sciencedirect.com": 0.9,
+         "plos.org": 0.9,
+         "arxiv.org": 0.85
+     },
+     "technology": {
+         "wired.com": 0.9,
+         "techcrunch.com": 0.85,
+         "arstechnica.com": 0.9,
+         "technologyreview.com": 0.9,
+         "theverge.com": 0.85,
+         "cnet.com": 0.85,
+         "engadget.com": 0.85
+     },
+     "fact_checking": {
+         "snopes.com": 0.95,
+         "politifact.com": 0.9,
+         "factcheck.org": 0.9,
+         "apnews.com/hub/fact-check": 0.95,
+         "reuters.com/fact-check": 0.95
+     }
+ }
+
+ def detect_claim_category(claim: str) -> Tuple[str, float]:
+     """
+     Detect the most likely category of a claim and its confidence score
+
+     Args:
+         claim (str): The claim text
+
+     Returns:
+         tuple: (category_name, confidence_score)
+     """
+     if not claim:
+         return "general", 0.3
+
+     # Lowercase for better matching
+     claim_lower = claim.lower()
+
+     # Count matches for each category
+     category_scores = {}
+
+     for category, keywords in CLAIM_CATEGORIES.items():
+         # Count whole-word keyword hits; deduplicate so repeated list entries are not double-counted
+         matches = sum(1 for keyword in {k.lower() for k in keywords}
+                       if re.search(r"\b" + re.escape(keyword) + r"\b", claim_lower))
+
+         # Calculate a simple score based on matches
+         if matches > 0:
+             # Calculate a more significant score based on number of matches
+             score = min(0.9, 0.3 + (matches * 0.1))  # Base 0.3 + 0.1 per match, max 0.9
+             category_scores[category] = score
+
+     # Find category with highest score
+     if not category_scores:
+         return "general", 0.3
+
+     top_category = max(category_scores.items(), key=lambda x: x[1])
+     category_name, confidence = top_category
+
+     # If the top score is too low, return general
+     if confidence < 0.3:
+         return "general", 0.3
+
+     return category_name, confidence
+
+ def get_topic_specific_sources(claim: str, existing_sources: Dict) -> Dict:
+     """
+     Enrich existing sources dict with topic-specific sources
+
+     Args:
+         claim (str): The claim text
+         existing_sources (dict): Current sources configuration
+
+     Returns:
+         dict: Updated sources with topic-specific priorities
+     """
+     # Detect claim category
+     category, confidence = detect_claim_category(claim)
+     logger.info(f"Claim category detected: {category} (confidence: {confidence:.2f})")
+
+     # If confidence is low, keep existing sources
+     if confidence < 0.4:
+         return existing_sources
+
+     # Get specific feeds for the category
+     category_feeds = CATEGORY_SPECIFIC_FEEDS.get(category, [])
+
+     # Only proceed if we have category-specific feeds
+     if not category_feeds:
+         return existing_sources
+
+     # Create a new sources dictionary with category-specific modifications
+     updated_sources = existing_sources.copy()
+
+     # If we have feeds for this category, add them to the list
+     # and prioritize them by putting them first in RSS feeds
+     if category in CATEGORY_SPECIFIC_FEEDS:
+         # Add up to 5 category-specific RSS feeds (if we have them)
+         category_feeds_sample = category_feeds[:5]
+
+         # Add or update source reliability data
+         if category in SOURCE_RELIABILITY_BY_CATEGORY:
+             updated_sources["source_credibility"] = updated_sources.get("source_credibility", {})
+             for domain, reliability in SOURCE_RELIABILITY_BY_CATEGORY[category].items():
+                 updated_sources["source_credibility"][domain] = reliability
+
+         # Return updated sources with prioritized feeds
+         return {
+             "category": category,
+             "confidence": confidence,
+             "rss_feeds": category_feeds_sample + (updated_sources.get("rss_feeds", []) or []),
+             "source_credibility": updated_sources.get("source_credibility", {})
+         }
+
+     return existing_sources
+
+ def get_prioritized_sources(claim: str, claim_category: Optional[str] = None) -> Dict[str, Any]:
+     """
+     Get prioritized sources for a claim based on its category
+
+     Args:
+         claim (str): The claim to check
+         claim_category (str, optional): Override detected category
+
+     Returns:
+         dict: Dictionary with source types prioritized by relevance
+     """
+     # Detect category if not provided
+     if not claim_category:
+         category, confidence = detect_claim_category(claim)
+     else:
+         category = claim_category
+         confidence = 0.8  # Assume high confidence if category is explicitly provided
+
+     # Log detected category
+     logger.info(f"Using claim category: {category} for source prioritization")
+
+     # Default priorities
+     priorities = {
+         "primary": ["wikipedia", "news", "claimreview"],
+         "secondary": ["rss", "scholarly", "wikidata"]
+     }
+
+     # Needs recent evidence check: match whole words/phrases rather than bare substrings
+     temporal_terms = ["is", "are", "remains", "continues", "still", "currently",
+                       "now", "today", "recent", "latest"]
+     negation_terms = ["not", "no longer", "isn't", "aren't", "doesn't", "don't",
+                       "can't", "cannot", "anymore"]
+
+     claim_lower = claim.lower()
+     requires_recent = any(re.search(r"\b" + re.escape(term) + r"\b", claim_lower)
+                           for term in temporal_terms + negation_terms)
+
+     # Adjust priorities based on category
+     if category == "science":
+         if requires_recent:
+             priorities = {
+                 "primary": ["scholarly", "rss", "wikipedia"],
+                 "secondary": ["news", "claimreview", "wikidata"]
+             }
+         else:
+             priorities = {
+                 "primary": ["scholarly", "wikipedia", "rss"],
+                 "secondary": ["claimreview", "news", "wikidata"]
+             }
+
+     elif category == "technology":
+         if requires_recent:
+             priorities = {
+                 "primary": ["rss", "news", "scholarly"],
+                 "secondary": ["wikipedia", "claimreview", "wikidata"]
+             }
+         else:
+             priorities = {
+                 "primary": ["news", "scholarly", "wikipedia"],
+                 "secondary": ["rss", "claimreview", "wikidata"]
+             }
+
+     elif category == "politics":
+         if requires_recent:
+             priorities = {
+                 "primary": ["rss", "news", "claimreview"],
+                 "secondary": ["wikipedia", "wikidata", "scholarly"]
+             }
+         else:
+             priorities = {
+                 "primary": ["claimreview", "news", "wikipedia"],
+                 "secondary": ["rss", "wikidata", "scholarly"]
+             }
+
+     elif category == "business" or category == "world":
+         if requires_recent:
+             priorities = {
+                 "primary": ["rss", "news", "wikipedia"],
+                 "secondary": ["claimreview", "wikidata", "scholarly"]
+             }
+         else:
+             priorities = {
+                 "primary": ["news", "wikipedia", "rss"],
+                 "secondary": ["claimreview", "wikidata", "scholarly"]
+             }
+
+     elif category == "sports":
+         if requires_recent:
+             priorities = {
+                 "primary": ["rss", "news", "wikipedia"],
+                 "secondary": ["wikidata", "claimreview", "scholarly"]
+             }
+         else:
+             priorities = {
+                 "primary": ["wikipedia", "news", "rss"],
+                 "secondary": ["wikidata", "claimreview", "scholarly"]
+             }
+
+     elif category == "entertainment":
+         if requires_recent:
+             priorities = {
+                 "primary": ["rss", "news", "claimreview"],
+                 "secondary": ["wikipedia", "wikidata", "scholarly"]
+             }
+         else:
+             priorities = {
+                 "primary": ["news", "wikipedia", "claimreview"],
+                 "secondary": ["rss", "wikidata", "scholarly"]
+             }
+
+     # Add category and confidence for reference
+     priorities["category"] = category
+     priorities["confidence"] = confidence
+     priorities["requires_recent"] = requires_recent
+
+     return priorities
+
+ def get_category_specific_rss_feeds(category: str, max_feeds: int = 5) -> List[str]:
+     """
+     Get a list of RSS feeds specific to a category
+
+     Args:
+         category (str): The claim category
+         max_feeds (int): Maximum number of feeds to return
+
+     Returns:
+         list: List of RSS feed URLs
+     """
+     # Get category-specific feeds
+     category_feeds = CATEGORY_SPECIFIC_FEEDS.get(category, [])
+
+     # Limit to max_feeds
+     return category_feeds[:max_feeds]
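To make the scoring concrete, a short usage sketch (the claim text and printed values are illustrative; the exact score depends on which keywords match):

category, confidence = detect_claim_category("Apple unveiled a new MacBook with a faster chip")
# e.g. ("technology", 0.6): base 0.3 plus 0.1 per matched keyword
# ("Apple", "MacBook", "chip"), capped at 0.9

feeds = get_category_specific_rss_feeds(category, max_feeds=3)  # first 3 technology feeds
priorities = get_prioritized_sources("Apple unveiled a new MacBook with a faster chip")
# for a non-time-sensitive technology claim, roughly:
# {"primary": ["news", "scholarly", "wikipedia"], "secondary": [...], "category": "technology", ...}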
modules/claim_extraction.py ADDED
@@ -0,0 +1,234 @@
1
+ import logging
2
+ import time
3
+ import re
4
+ from langdetect import detect
5
+ import spacy
6
+
7
+ from utils.performance import PerformanceTracker
8
+ from utils.models import get_nlp_model, get_llm_model
9
+
10
+ logger = logging.getLogger("misinformation_detector")
11
+
12
+ performance_tracker = PerformanceTracker()
13
+
14
+ def extract_claims(text):
15
+ """
16
+ Extract the main factual claim from the provided text.
17
+ For concise claims (<20 words), preserves them exactly.
18
+ For longer text, uses OpenAI to extract the claim.
19
+ """
20
+ logger.info(f"Extracting claims from: {text}")
21
+ start_time = time.time()
22
+
23
+ # First, check if the input already appears to be a concise claim
24
+ if len(text.split()) < 20:
25
+ logger.info("Input appears to be a concise claim already, preserving as-is")
26
+ performance_tracker.log_processing_time(start_time)
27
+ performance_tracker.log_claim_processed()
28
+ return text
29
+
30
+ try:
31
+ # For longer text, use OpenAI for extraction
32
+ extracted_claim = extract_with_openai(text)
33
+
34
+ # Log processing time
35
+ performance_tracker.log_processing_time(start_time)
36
+ performance_tracker.log_claim_processed()
37
+
38
+ logger.info(f"Extracted claim: {extracted_claim}")
39
+ return extracted_claim
40
+ except Exception as e:
41
+ logger.error(f"Error extracting claims: {str(e)}")
42
+ # Fallback to original text on error
43
+ return text
44
+
45
+ def extract_with_openai(text):
46
+ """
47
+ Use OpenAI model for claim extraction
48
+ """
49
+ try:
50
+ # Get LLM model
51
+ llm_model = get_llm_model()
52
+
53
+ # Create a very explicit prompt to avoid hallucination
54
+ prompt = f"""
55
+ Extract the main factual claim from the following text.
56
+ DO NOT add any information not present in the original text.
57
+ DO NOT add locations, dates, or other details.
58
+ ONLY extract what is explicitly stated.
59
+
60
+ Text: {text}
61
+
62
+ Main factual claim:
63
+ """
64
+
65
+ # Call OpenAI with temperature=0 for deterministic output
66
+ response = llm_model.invoke(prompt, temperature=0)
67
+ extracted_claim = response.content.strip()
68
+
69
+ # Further clean up any explanations or extra text
70
+ if ":" in extracted_claim:
71
+ parts = extracted_claim.split(":")
72
+ if len(parts) > 1:
73
+ extracted_claim = parts[-1].strip()
74
+
75
+ logger.info(f"OpenAI extraction: {extracted_claim}")
76
+
77
+ # Validate that we're not adding info not in the original
78
+ nlp = get_nlp_model()
79
+ extracted_claim = validate_extraction(text, extracted_claim, nlp)
80
+
81
+ return extracted_claim
82
+ except Exception as e:
83
+ logger.error(f"Error in OpenAI claim extraction: {str(e)}")
84
+ return text # Fallback to original
85
+
86
+ def validate_extraction(original_text, extracted_claim, nlp):
87
+ """
88
+ Validate that the extracted claim doesn't add information not present in the original text
89
+ """
90
+ # If extraction fails or is empty, return original
91
+ if not extracted_claim or extracted_claim.strip() == "":
92
+ logger.warning("Empty extraction result, using original text")
93
+ return original_text
94
+
95
+ # Check for added location information
96
+ location_terms = ["united states", "america", "u.s.", "usa", "china", "india", "europe",
97
+ "russia", "japan", "uk", "germany", "france", "australia"]
98
+ for term in location_terms:
99
+ if term in extracted_claim.lower() and term not in original_text.lower():
100
+ logger.warning(f"Extraction added location '{term}' not in original, using original text")
101
+ return original_text
102
+
103
+ # Check for entity preservation/addition using spaCy
104
+ try:
105
+ # Get entities from extracted text
106
+ extracted_doc = nlp(extracted_claim)
107
+ extracted_entities = [ent.text.lower() for ent in extracted_doc.ents]
108
+
109
+ # Get entities from original text
110
+ original_doc = nlp(original_text)
111
+ original_entities = [ent.text.lower() for ent in original_doc.ents]
112
+
113
+ # Check for new entities that don't exist in original
114
+ for entity in extracted_entities:
115
+ if not any(entity in orig_entity or orig_entity in entity for orig_entity in original_entities):
116
+ logger.warning(f"Extraction added new entity '{entity}', using original text")
117
+ return original_text
118
+
119
+ return extracted_claim
120
+ except Exception as e:
121
+ logger.error(f"Error in extraction validation: {str(e)}")
122
+ return original_text # On error, safer to return original
123
+
124
+ def shorten_claim_for_evidence(claim):
125
+ """
126
+ Shorten a claim to use for evidence retrieval by preserving important keywords
127
+ while maintaining claim context
128
+ """
129
+ try:
130
+ # Get NLP model
131
+ nlp = get_nlp_model()
132
+
133
+ # Use NER to extract key entities
134
+ doc = nlp(claim)
135
+
136
+ # Extract all entities for search
137
+ entities = [ent.text for ent in doc.ents]
138
+
139
+ # Extract key proper nouns, entities, and important context words
140
+ important_words = []
141
+
142
+ # Add all named entities
143
+ for ent in doc.ents:
144
+ important_words.append(ent.text)
145
+
146
+ # Add important nouns and adjectives not already added
147
+ for token in doc:
148
+ if token.pos_ in ["NOUN", "PROPN"] and token.text not in important_words:
149
+ important_words.append(token.text)
150
+
151
+ # Make sure we include key terms like "prime minister", "president", etc.
152
+ title_terms = ["president", "prime minister", "minister", "chancellor", "premier", "governor", "mayor", "senator"]
153
+ for term in title_terms:
154
+ if term in claim.lower() and not any(term in word.lower() for word in important_words):
155
+ # Find the full phrase (e.g., "Canadian Prime Minister")
156
+ matches = re.finditer(r'(?i)(?:\w+\s+)*\b' + re.escape(term) + r'\b(?:\s+\w+)*', claim)
157
+ for match in matches:
158
+ phrase = match.group(0)
159
+ if phrase not in important_words:
160
+ important_words.append(phrase)
161
+
162
+ # Add country names or important place references
163
+ country_terms = ["canada", "canadian", "us", "united states", "american", "uk", "british", "australia", "china", "russian"]
164
+ for term in country_terms:
165
+ if term in claim.lower() and not any(term in word.lower() for word in important_words):
166
+ for token in doc:
167
+ if token.text.lower() == term and token.text not in important_words:
168
+ important_words.append(token.text)
169
+
170
+ # Always include negation words as they're critical for meaning
171
+ negation_terms = ["not", "no longer", "former", "ex-", "isn't", "aren't", "doesn't", "don't"]
172
+ negation_found = False
173
+ for term in negation_terms:
174
+ if term in claim.lower():
175
+ # Find the context around the negation (3 words before and after)
176
+ matches = re.finditer(r'(?i)(?:\w+\s+){0,3}\b' + re.escape(term) + r'\b(?:\s+\w+){0,3}', claim)
177
+ for match in matches:
178
+ phrase = match.group(0)
179
+ if phrase not in important_words:
180
+ important_words.append(phrase)
181
+ negation_found = True
182
+
183
+ # Special handling for time-sensitive claims with negations
184
+ is_time_sensitive = any(term in claim.lower() for term in ["anymore", "still", "currently", "now", "today", "recent"])
185
+
186
+ # If we have both negation and time sensitivity, ensure we keep those key aspects
187
+ if negation_found and is_time_sensitive:
188
+ # Ensure we keep time-sensitive terms
189
+ time_terms = ["anymore", "still", "currently", "now", "today", "recent"]
190
+ for term in time_terms:
191
+ if term in claim.lower() and not any(term in word.lower() for word in important_words):
192
+ # Add the context around the time term
193
+ matches = re.finditer(r'(?i)(?:\w+\s+){0,2}\b' + re.escape(term) + r'\b(?:\s+\w+){0,2}', claim)
194
+ for match in matches:
195
+ phrase = match.group(0)
196
+ if phrase not in important_words:
197
+ important_words.append(phrase)
198
+
199
+ # If entities plus titles don't give us enough, include key parts of claim
200
+ if len(entities) < 2 and not any("minister" in word.lower() for word in important_words):
201
+ words = claim.split()
202
+ # Use first 8 words
203
+ return " ".join(words[:min(8, len(words))])
204
+
205
+ # Remove duplicates while preserving order
206
+ seen = set()
207
+ unique_terms = []
208
+ for word in important_words:
209
+ if word.lower() not in seen:
210
+ seen.add(word.lower())
211
+ unique_terms.append(word)
212
+
213
+ # Ensure we have a reasonable number of search terms (maintain more for complex claims)
214
+ search_terms = unique_terms[:min(6, len(unique_terms))]
215
+
216
+ # Sort search terms to try to maintain original word order from claim
217
+ def get_position(term):
218
+ return claim.lower().find(term.lower())
219
+
220
+ search_terms.sort(key=get_position)
221
+
222
+ # Join terms to create search query
223
+ shortened_claim = " ".join(search_terms)
224
+
225
+ # If the shortened claim is too short compared to original, use more of original
226
+ if len(shortened_claim.split()) < 3 and len(claim.split()) > 5:
227
+ words = claim.split()
228
+ shortened_claim = " ".join(words[:min(8, len(words))])
229
+
230
+ logger.info(f"Shortened Claim: {shortened_claim}")
231
+
232
+ return shortened_claim
233
+ except Exception as e:
234
+ logger.error(f"Error in shortening claim: {str(e)}")
235
+ # Return original claim on error
236
+ return claim
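
For orientation, here is a minimal usage sketch of shorten_claim_for_evidence. The printed query is hypothetical; the actual terms depend on what the spaCy pipeline behind get_nlp_model() recognizes.

# Illustrative only; output varies with the loaded NER model
from modules.claim_extraction import shorten_claim_for_evidence

query = shorten_claim_for_evidence("Justin Trudeau is not the Canadian Prime Minister anymore")
# Entities, the "Prime Minister" title phrase, and the negation/time context
# are all retained, so the query could come out roughly as:
# "Justin Trudeau not the Canadian Prime Minister anymore"
print(query)
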
modules/classification.py ADDED
@@ -0,0 +1,521 @@
+ import logging
+ import re
+ from utils.models import get_llm_model
+ from utils.performance import PerformanceTracker
+ 
+ logger = logging.getLogger("misinformation_detector")
+ 
+ performance_tracker = PerformanceTracker()
+ 
+ def classify_with_llm(claim, evidence):
+     """
+     Optimized classification function that handles evidence classification
+     and verdict generation in a single LLM call with robust parsing
+     """
+     logger.info(f"Classifying evidence for claim: {claim}")
+ 
+     # Get the LLM model
+     llm_model = get_llm_model()
+ 
+     # Skip if no evidence
+     if not evidence:
+         logger.warning("No evidence provided for classification")
+         return []
+ 
+     # Normalize evidence to a list
+     if not isinstance(evidence, list):
+         if evidence:
+             try:
+                 evidence = [evidence]
+             except Exception as e:
+                 logger.error(f"Could not convert evidence to list: {e}")
+                 return []
+         else:
+             return []
+ 
+     # Does the claim contain strong assertions that require specific evidence?
+     strong_assertion_markers = [
+         "solved", "cured", "discovered", "confirmed", "proven", "definitive",
+         "breakthrough", "revolutionary", "successfully", "first ever", "extends",
+         "conclusive", "unprecedented", "remarkable", "definitively"
+     ]
+ 
+     # Check if the claim contains strong assertions that would require specific supporting evidence
+     contains_strong_assertions = any(marker in claim.lower() for marker in strong_assertion_markers)
+ 
+     # Limit to top 5 evidence items to reduce token usage
+     evidence = evidence[:5]
+ 
+     try:
+         # Format evidence items
+         evidence_text = ""
+         for idx, chunk in enumerate(evidence):
+             # Truncate long evidence
+             chunk_text = str(chunk)
+             if len(chunk_text) > 300:
+                 chunk_text = chunk_text[:297] + "..."
+ 
+             evidence_text += f"EVIDENCE {idx+1}:\n{chunk_text}\n\n"
+ 
+         # Create a structured prompt with explicit formatting instructions.
+         # Adjust instructions based on claim characteristics.
+         if contains_strong_assertions:
+             prompt = f"""
+ CLAIM: {claim}
+ 
+ EVIDENCE:
+ {evidence_text}
+ 
+ TASK: Evaluate if the evidence supports, contradicts, or is irrelevant to the claim.
+ 
+ IMPORTANT CONTEXT: This claim makes strong assertions that require specific supporting evidence.
+ 
+ When evaluating such claims:
+ 1. Strong assertions require strong, direct evidence - look for specific confirmation from credible sources
+ 2. General information about the topic is not sufficient to support specific assertions
+ 3. Evidence of ongoing work or research is not sufficient to support claims of completion or success
+ 4. If the evidence doesn't directly confirm the specific assertion, classify it as "insufficient" rather than "support"
+ 
+ INSTRUCTIONS:
+ 1. For each evidence, provide your analysis in EXACTLY this format:
+ 
+ EVIDENCE 1 ANALYSIS:
+ Relevance: [relevant/irrelevant]
+ Classification: [support/contradict/insufficient/irrelevant]
+ Confidence: [number between 0-100]
+ Reason: [brief explanation focusing on whether evidence directly confirms the specific assertion]
+ 
+ 2. After analyzing all evidence pieces, provide a final verdict in this format:
+ 
+ FINAL VERDICT: [clear statement if evidence collectively supports or contradicts the claim]
+ 
+ Without specific, direct supporting evidence, default to "The evidence does not support the claim" rather than "insufficient evidence."
+ 
+ CRITICAL INSTRUCTION: FOCUS ON THE EXACT CLAIM. Evaluate ONLY the specific claim, not related topics
+ """
+         else:
+             prompt = f"""
+ CLAIM: {claim}
+ 
+ EVIDENCE:
+ {evidence_text}
+ 
+ TASK: Evaluate if the evidence supports, contradicts, or is irrelevant to the claim.
+ 
+ INSTRUCTIONS:
+ 1. For each evidence, provide your analysis in EXACTLY this format:
+ 
+ EVIDENCE 1 ANALYSIS:
+ Relevance: [relevant/irrelevant]
+ Classification: [support/contradict/insufficient/irrelevant]
+ Confidence: [number between 0-100]
+ Reason: [brief explanation]
+ 
+ 2. After analyzing all evidence pieces, provide a final verdict in this format:
+ 
+ FINAL VERDICT: [clear statement if evidence collectively supports or contradicts the claim]
+ 
+ CRITICAL INSTRUCTION: FOCUS ON THE EXACT CLAIM. Evaluate ONLY the specific claim, not related topics
+ """
+ 
+         # Get response with temperature=0 for consistency
+         result = llm_model.invoke(prompt, temperature=0)
+         result_text = result.content.strip()
+ 
+         # Extract the final verdict first since it's most important
+         final_verdict = None
+         final_match = re.search(r'FINAL VERDICT:\s*(.*?)(?=\s*$|\n\n)', result_text, re.DOTALL | re.IGNORECASE)
+         if final_match:
+             final_verdict = final_match.group(1).strip()
+             logger.info(f"Final assessment: {final_verdict}")
+ 
+         # Define a precise regex pattern matching the requested format
+         analysis_pattern = r'EVIDENCE\s+(\d+)\s+ANALYSIS:\s*\n+Relevance:\s*(relevant|irrelevant)\s*\n+Classification:\s*(support|contradict|neutral|irrelevant|insufficient)\s*\n+Confidence:\s*(\d+)\s*\n+Reason:\s*(.*?)(?=\s*EVIDENCE\s+\d+\s+ANALYSIS:|\s*FINAL VERDICT:|\s*$)'
+ 
+         # Parse each evidence analysis
+         classification_results = []
+         matched_evidence = set()
+ 
+         # Try matching with our strict pattern first
+         matches = list(re.finditer(analysis_pattern, result_text, re.IGNORECASE | re.DOTALL))
+ 
+         # If no matches, try a more flexible pattern
+         if not matches:
+             flexible_pattern = r'(?:EVIDENCE|Evidence)\s+(\d+)(?:\s+ANALYSIS)?:?\s*\n+(?:Relevance|relevance):\s*(relevant|irrelevant|unknown)\s*\n+(?:Classification|classification):\s*(support|contradict|neutral|irrelevant|insufficient|unknown)\s*\n+(?:Confidence|confidence):\s*(\d+)\s*\n+(?:Reason|reason|Brief reason):\s*(.*?)(?=\s*(?:EVIDENCE|Evidence)\s+\d+|FINAL VERDICT:|$)'
+             matches = list(re.finditer(flexible_pattern, result_text, re.IGNORECASE | re.DOTALL))
+ 
+         # Process matches
+         for match in matches:
+             try:
+                 evidence_idx = int(match.group(1)) - 1
+                 relevance = match.group(2).lower()
+                 classification = match.group(3).lower()
+                 confidence = int(match.group(4))
+                 reason = match.group(5).strip()
+ 
+                 # Normalize classification terms
+                 if classification == "neutral":
+                     classification = "insufficient"
+ 
+                 # For strong assertions, apply confidence adjustments based on classification
+                 if contains_strong_assertions:
+                     if classification == "support":
+                         # Check if the reasoning indicates direct or indirect support
+                         indirect_support_markers = ["general", "doesn't directly", "does not directly",
+                                                     "doesn't specifically", "not specific", "related to",
+                                                     "doesn't confirm"]
+                         if any(marker in reason.lower() for marker in indirect_support_markers):
+                             # Downgrade support confidence for indirect evidence
+                             confidence = max(5, confidence - 20)
+                     elif classification == "contradict":
+                         # For contradictions of strong assertions, slightly boost confidence
+                         confidence = min(95, confidence + 5)
+ 
+                 # Ensure index is valid
+                 if 0 <= evidence_idx < len(evidence):
+                     matched_evidence.add(evidence_idx)
+ 
+                     # Create result entry
+                     classification_results.append({
+                         "label": classification,
+                         "confidence": confidence / 100.0,
+                         "evidence": evidence[evidence_idx],
+                         "relevance": relevance,
+                         "reason": reason,
+                         "final_assessment": final_verdict
+                     })
+             except (ValueError, IndexError) as e:
+                 logger.error(f"Error parsing evidence analysis: {e}")
+ 
+         # Handle any unmatched evidence items
+         if matches:  # Only add defaults if we successfully matched some
+             for idx, ev in enumerate(evidence):
+                 if idx not in matched_evidence:
+                     # Check if the evidence text itself suggests a classification
+                     contains_support = bool(re.search(r'support|confirm|verify|true|correct|released', final_verdict or "", re.IGNORECASE))
+                     contains_contradicting = bool(re.search(r'not yet|hasn\'t|have not|doesn\'t|don\'t|cannot|preliminary|proposed', str(ev).lower()))
+ 
+                     # For claims with strong assertions without explicit evidence, be more cautious
+                     if contains_strong_assertions:
+                         if contains_contradicting:
+                             label = "contradict"
+                             confidence = 0.6
+                         elif contains_support:
+                             label = "insufficient"  # Default to insufficient for strong assertions without clear analysis
+                             confidence = 0.5
+                         else:
+                             label = "insufficient"
+                             confidence = 0.5
+                     else:
+                         label = "support" if contains_support else "unknown"
+                         confidence = 0.7 if contains_support else 0.5
+ 
+                     classification_results.append({
+                         "label": label,
+                         "confidence": confidence,
+                         "evidence": ev,
+                         "relevance": "relevant" if (contains_support or contains_contradicting) else "unknown",
+                         "reason": "Based on overall assessment",
+                         "final_assessment": final_verdict
+                     })
+         else:
+             # No structured parsing worked, so use the final verdict to create simple results
+             contains_support = bool(re.search(r'support|confirm|verify|true|correct|released', final_verdict or "", re.IGNORECASE))
+             contains_contradict = bool(re.search(r'contradict|against|false|incorrect|not support|does not support|insufficient evidence|does not confirm|no evidence', final_verdict or "", re.IGNORECASE))
+             contains_insufficient = bool(re.search(r'insufficient|not enough|cannot determine|no evidence|lack of evidence', final_verdict or "", re.IGNORECASE))
+ 
+             # For claims with strong assertions, be more stringent
+             if contains_strong_assertions:
+                 if contains_support and not contains_insufficient and not contains_contradict:
+                     label = "support"
+                     confidence = 0.6  # Lower confidence even for support of strong assertions
+                 elif contains_contradict:
+                     label = "contradict"
+                     confidence = 0.8  # Higher confidence for contradiction of strong assertions
+                 else:
+                     label = "insufficient"
+                     confidence = 0.7  # Good confidence for insufficient judgment
+             else:
+                 label = "support" if contains_support else "contradict" if contains_contradict else "unknown"
+                 confidence = 0.7 if (contains_support or contains_contradict) else 0.5
+ 
+             # Create basic results based on final verdict
+             for ev in evidence:
+                 classification_results.append({
+                     "label": label,
+                     "confidence": confidence,
+                     "evidence": ev,
+                     "relevance": "relevant" if (contains_support or contains_contradict) else "unknown",
+                     "reason": final_verdict or "Based on collective evidence",
+                     "final_assessment": final_verdict
+                 })
+ 
+         logger.info(f"Classified {len(classification_results)} evidence items")
+         return classification_results
+ 
+     except Exception as e:
+         logger.error(f"Error in evidence classification: {str(e)}")
+         # Provide a basic fallback that checks for keywords in evidence
+         try:
+             fallback_results = []
+             for ev in evidence:
+                 ev_text = str(ev).lower()
+                 supports = False
+                 contradicts = False
+ 
+                 # Basic keyword checking as last resort
+                 if claim.lower() in ev_text:
+                     keywords = [word for word in claim.lower().split() if len(word) > 3]
+                     matching_keywords = [k for k in keywords if k in ev_text]
+ 
+                     # If substantial keywords match, consider it support
+                     supports = len(matching_keywords) >= max(1, len(keywords) // 2)
+ 
+                 # Check for contradiction terms
+                 contradiction_terms = ["not yet", "hasn't", "haven't", "cannot", "can't",
+                                        "doesn't", "don't", "no evidence", "insufficient",
+                                        "preliminary", "proposed", "in development", "future"]
+                 contradicts = any(term in ev_text for term in contradiction_terms)
+ 
+                 # For claims with strong assertions, be more conservative in the fallback case
+                 if contains_strong_assertions:
+                     if contradicts:
+                         fallback_results.append({
+                             "label": "contradict",
+                             "confidence": 0.6,
+                             "evidence": ev,
+                             "relevance": "relevant",
+                             "reason": "Evidence suggests the claim is not yet proven (fallback method)"
+                         })
+                     elif supports:
+                         fallback_results.append({
+                             "label": "insufficient",
+                             "confidence": 0.6,
+                             "evidence": ev,
+                             "relevance": "relevant",
+                             "reason": "Evidence is related but doesn't conclusively confirm the assertion (fallback method)"
+                         })
+                     else:
+                         fallback_results.append({
+                             "label": "unknown",
+                             "confidence": 0.5,
+                             "evidence": ev,
+                             "relevance": "unknown",
+                             "reason": "Cannot determine relevance (fallback method)"
+                         })
+                 else:
+                     fallback_results.append({
+                         "label": "support" if supports else "unknown",
+                         "confidence": 0.6 if supports else 0.5,
+                         "evidence": ev,
+                         "relevance": "relevant" if supports else "unknown",
+                         "reason": "Based on keyword matching (fallback method)"
+                     })
+ 
+             return fallback_results
+         except Exception:
+             # Absolute last resort
+             return [{"label": "unknown", "confidence": 0.5, "evidence": ev} for ev in evidence]
+ 
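
To make the parsing contract concrete, here is a minimal sketch of the response shape the strict analysis_pattern above is built to match. The sample response text is invented for illustration; only the regex is taken from the module.

import re

sample_response = """EVIDENCE 1 ANALYSIS:
Relevance: relevant
Classification: support
Confidence: 85
Reason: The article directly reports the event described in the claim.

FINAL VERDICT: The evidence collectively supports the claim."""

analysis_pattern = (
    r'EVIDENCE\s+(\d+)\s+ANALYSIS:\s*\n+Relevance:\s*(relevant|irrelevant)\s*\n+'
    r'Classification:\s*(support|contradict|neutral|irrelevant|insufficient)\s*\n+'
    r'Confidence:\s*(\d+)\s*\n+Reason:\s*(.*?)'
    r'(?=\s*EVIDENCE\s+\d+\s+ANALYSIS:|\s*FINAL VERDICT:|\s*$)'
)

for m in re.finditer(analysis_pattern, sample_response, re.IGNORECASE | re.DOTALL):
    idx, relevance, label, confidence, reason = m.groups()
    print(idx, relevance, label, int(confidence) / 100.0)  # 1 relevant support 0.85
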
+ def aggregate_evidence(classification_results):
+     """
+     Aggregate evidence classifications to determine overall verdict
+     with robust fallback mechanisms for reliable results
+     """
+     logger.info(f"Aggregating evidence from {len(classification_results) if classification_results else 0} results")
+ 
+     if not classification_results:
+         logger.warning("No classification results to aggregate")
+         return "Uncertain", 0.3  # Default with low confidence
+ 
+     # Assess the claim's characteristics (without relying on explicit category detection).
+     # Does the claim contain strong assertions that require specific evidence?
+     strong_assertion_markers = [
+         "solved", "cured", "discovered", "confirmed", "proven", "definitive",
+         "breakthrough", "revolutionary", "successfully", "first ever", "extends",
+         "conclusive", "unprecedented", "remarkable", "definitively"
+     ]
+ 
+     # Check if claim text is available in final assessment
+     claim_text = None
+     claim_has_strong_assertions = False
+ 
+     # Extract claim from final assessment if available
+     for item in classification_results:
+         if "final_assessment" in item and item["final_assessment"]:
+             match = re.search(r'the claim (?:that )?"?([^"]+)"?', item["final_assessment"], re.IGNORECASE)
+             if match:
+                 claim_text = match.group(1)
+                 claim_has_strong_assertions = any(marker in claim_text.lower() for marker in strong_assertion_markers)
+                 break
+ 
+     # If we couldn't extract the claim, check evidence context for assertion indicators
+     if not claim_text:
+         # Check if evidence reasons suggest dealing with strong assertions
+         assertion_context_indicators = ["conclusive evidence", "definitive proof", "solved", "breakthrough",
+                                         "revolutionary", "directly confirms", "specific confirmation"]
+ 
+         reasons = [item.get("reason", "").lower() for item in classification_results if "reason" in item]
+         assertion_indicators_count = sum(1 for indicator in assertion_context_indicators
+                                          for reason in reasons if indicator in reason)
+ 
+         claim_has_strong_assertions = assertion_indicators_count >= 2
+ 
+     # Extract final assessment if present
+     final_assessment = None
+     for item in classification_results:
+         if "final_assessment" in item and item["final_assessment"]:
+             final_assessment = item["final_assessment"]
+             break
+ 
+     # Count evidence by classification
+     support_items = [item for item in classification_results if item.get("label") == "support"]
+     contradict_items = [item for item in classification_results if item.get("label") == "contradict"]
+     insufficient_items = [item for item in classification_results if item.get("label") in ["insufficient", "neutral"]]
+     relevant_items = [item for item in classification_results
+                       if item.get("relevance") == "relevant" or item.get("label") in ["support", "contradict"]]
+ 
+     # Calculate the proportion of supported evidence
+     total_relevant = len(relevant_items)
+ 
+     # Direct keyword detection from final assessment or evidence
+     if final_assessment:
+         # Check for support indicators in final assessment
+         supports_pattern = r'\b(support|confirm|verify|true|correct|released|proves|validates|evidence (?:that |for |of )(?:the claim|it) is true)\b'
+         contradicts_pattern = r'\b(contradict|refute|deny|false|incorrect|not released|doesn\'t support|does not support|no evidence|cannot support|is not true|evidence (?:that |for |of )(?:the claim|it) is false)\b'
+         insufficient_pattern = r'\b(uncertain|insufficient|not enough|inconclusive|cannot determine|unable to determine|lack of evidence)\b'
+ 
+         supports_match = re.search(supports_pattern, final_assessment, re.IGNORECASE)
+         contradicts_match = re.search(contradicts_pattern, final_assessment, re.IGNORECASE)
+         insufficient_match = re.search(insufficient_pattern, final_assessment, re.IGNORECASE)
+ 
+         # Direct determination based on final assessment keywords
+         if supports_match and not contradicts_match and not insufficient_match:
+             # Get max confidence from supporting evidence
+             confidence = max([item.get("confidence", 0) for item in support_items]) if support_items else 0.7
+ 
+             # Adjust confidence for claims with strong assertions
+             if claim_has_strong_assertions:
+                 confidence = min(confidence, 0.8)  # Cap confidence for strong assertions
+ 
+             return "True (Based on Evidence)", max(0.6, confidence)  # Minimum 0.6 confidence
+ 
+         if contradicts_match and not supports_match:
+             # Get max confidence from contradicting evidence
+             confidence = max([item.get("confidence", 0) for item in contradict_items]) if contradict_items else 0.7
+ 
+             # For claims with strong assertions, increase confidence in contradiction
+             if claim_has_strong_assertions:
+                 confidence = max(confidence, 0.7)  # Minimum 0.7 confidence for contradicting strong assertions
+ 
+             return "False (Based on Evidence)", max(0.6, confidence)  # Minimum 0.6 confidence
+ 
+         if insufficient_match:
+             # For claims with strong assertions without confirming evidence,
+             # change "Uncertain" to a clearer negative verdict
+             if claim_has_strong_assertions:
+                 return "False (Based on Evidence)", 0.7
+             return "Uncertain", 0.4  # Medium-low confidence
+ 
+     # If we have distinct classifications, weigh them by confidence and quantity
+     if support_items and (not contradict_items or all(item.get("confidence", 0) < 0.95 for item in contradict_items)):
+         # Check if there's high confidence support evidence (greater than 0.95)
+         high_confidence_support = [item for item in support_items if item.get("confidence", 0) > 0.95]
+ 
+         if high_confidence_support:
+             # High confidence support evidence exists, use it even if there are some contradictions
+             confidence = max([item.get("confidence", 0) for item in high_confidence_support])
+             # For claims with strong assertions, be more conservative with pure support
+             if claim_has_strong_assertions:
+                 confidence = min(confidence, 0.8)
+             return "True (Based on Evidence)", max(0.7, confidence)
+         elif not contradict_items:
+             # All supportive evidence with no contradictions (standard case)
+             confidence = max([item.get("confidence", 0) for item in support_items])
+ 
+             # For claims with strong assertions, be more conservative with pure support
+             if claim_has_strong_assertions:
+                 # For strong assertions with only support but no contradictions, be cautious
+                 confidence = min(confidence, 0.7)
+                 # If the support is from low-quality evidence, consider it uncertain
+                 support_reasons = [item.get("reason", "").lower() for item in support_items]
+                 weak_supports = sum(1 for reason in support_reasons if
+                                     "general information" in reason or
+                                     "doesn't specify" in reason or
+                                     "does not directly" in reason)
+                 if weak_supports / max(1, len(support_items)) > 0.5:
+                     return "Uncertain", 0.6
+ 
+             return "True (Based on Evidence)", max(0.6, confidence)
+ 
+     if contradict_items and not support_items:
+         # All contradicting evidence
+         confidence = max([item.get("confidence", 0) for item in contradict_items])
+ 
+         # For claims with strong assertions, increase confidence in contradiction
+         if claim_has_strong_assertions:
+             confidence = max(confidence, 0.7)
+ 
+         return "False (Based on Evidence)", max(0.6, confidence)
+ 
+     if insufficient_items and len(insufficient_items) > len(support_items) + len(contradict_items):
+         # Mostly insufficient evidence.
+         # For claims with strong assertions and mainly insufficient evidence, lean toward "False"
+         if claim_has_strong_assertions:
+             return "False (Based on Evidence)", 0.7
+         return "Uncertain", 0.5  # Medium confidence for explicitly uncertain
+ 
+     if support_items and contradict_items:
+         # Competing evidence - compare confidence and quantity
+         support_confidence = max([item.get("confidence", 0) for item in support_items])
+         contradict_confidence = max([item.get("confidence", 0) for item in contradict_items])
+ 
+         # For claims with strong assertions, require stronger support to overcome contradiction
+         if claim_has_strong_assertions:
+             # Higher threshold for strong assertions
+             if support_confidence > contradict_confidence + 0.3:
+                 return "True (Based on Evidence)", support_confidence * 0.9  # Apply a confidence penalty
+             elif contradict_confidence >= support_confidence - 0.1:  # Lower threshold for contradiction
+                 return "False (Based on Evidence)", max(contradict_confidence, 0.7)  # Minimum 0.7 confidence
+             else:
+                 # Default to uncertain for close calls on strong assertions
+                 return "Uncertain", 0.6
+         else:
+             # Standard threshold for regular claims
+             if support_confidence > contradict_confidence + 0.2:
+                 return "True (Based on Evidence)", support_confidence
+             elif contradict_confidence > support_confidence + 0.2:
+                 return "False (Based on Evidence)", contradict_confidence
+             else:
+                 # Close call - check quantity of evidence
+                 if len(support_items) > len(contradict_items) * 2:
+                     return "True (Based on Evidence)", support_confidence * 0.9  # Slight confidence penalty
+                 elif len(contradict_items) > len(support_items) * 2:
+                     return "False (Based on Evidence)", contradict_confidence * 0.9  # Slight confidence penalty
+                 else:
+                     # Truly conflicting evidence
+                     return "Uncertain", 0.5  # Medium confidence
+ 
+     # Check for evidence quality issues
+     all_unknown = all(item.get("label") == "unknown" for item in classification_results)
+     evidence_text = " ".join([str(item.get("evidence", "")) for item in classification_results])
+ 
+     # General case: for any claims with all unknown labels that contain markers of strong assertions
+     if all_unknown and claim_has_strong_assertions:
+         # Absence of clear supporting evidence for claims with strong assertions points to "False"
+         return "False (Based on Evidence)", 0.7
+ 
+     # For general claims, if all items are unknown but evidence clearly mentions the claim
+     if all_unknown:
+         # Examples of direct evidence matching as fallback
+         if re.search(r'\bllama\s*4\b', evidence_text, re.IGNORECASE) and re.search(r'\bmeta\b|\bfacebook\b', evidence_text, re.IGNORECASE) and re.search(r'\breleas', evidence_text, re.IGNORECASE):
+             return "True (Based on Evidence)", 0.7
+         elif re.search(r'\bnot\s+releas', evidence_text, re.IGNORECASE) or re.search(r'\bdenies\b|\bdenied\b', evidence_text, re.IGNORECASE):
+             return "False (Based on Evidence)", 0.7
+ 
+     # Default to uncertain if no clear pattern - but with special case for claims with strong assertions
+     if claim_has_strong_assertions:
+         # For claims with strong assertions with no clear evidence, default to false
+         return "False (Based on Evidence)", 0.7
+ 
+     return "Uncertain", 0.3
modules/evidence_retrieval.py ADDED
@@ -0,0 +1,944 @@
+ import logging
+ import time
+ import re
+ import random
+ import requests
+ import json
+ import ssl
+ from urllib.parse import urlencode
+ from bs4 import BeautifulSoup
+ from SPARQLWrapper import SPARQLWrapper, JSON
+ from datetime import datetime, timedelta
+ from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED
+ 
+ from utils.api_utils import api_error_handler, safe_json_parse
+ from utils.models import get_nlp_model
+ from modules.claim_extraction import shorten_claim_for_evidence, extract_claims
+ from modules.rss_feed import retrieve_evidence_from_rss
+ from modules.semantic_analysis import analyze_evidence_relevance, select_diverse_evidence
+ from config import SOURCE_CREDIBILITY, NEWS_API_KEY, FACTCHECK_API_KEY
+ 
+ # Import the performance tracker
+ from utils.performance import PerformanceTracker
+ performance_tracker = PerformanceTracker()
+ 
+ logger = logging.getLogger("misinformation_detector")
+ 
+ # Define early analysis function at the module level so it's available everywhere
+ def analyze_early_evidence(claim, source_name, source_evidence):
+     """Pre-analyze evidence while waiting for other sources to complete"""
+     try:
+         if not source_evidence:
+             return None
+ 
+         logger.info(f"Pre-analyzing {len(source_evidence)} evidence items from {source_name}")
+ 
+         # Do a quick relevance check using similarity scoring
+         nlp_model = get_nlp_model()
+         claim_doc = nlp_model(claim)
+ 
+         relevant_evidence = []
+         for evidence in source_evidence:
+             if not isinstance(evidence, str):
+                 continue
+ 
+             # Look for direct keyword matches first (fast check)
+             is_related = False
+             keywords = [word.lower() for word in claim.split() if len(word) > 3]
+             for keyword in keywords:
+                 if keyword in evidence.lower():
+                     is_related = True
+                     break
+ 
+             # If no keywords match, do a basic entity check
+             if not is_related:
+                 # Check if claim and evidence share any entities
+                 evidence_doc = nlp_model(evidence[:500])  # Limit for speed
+                 claim_entities = [ent.text.lower() for ent in claim_doc.ents]
+                 evidence_entities = [ent.text.lower() for ent in evidence_doc.ents]
+ 
+                 common_entities = set(claim_entities).intersection(set(evidence_entities))
+                 if common_entities:
+                     is_related = True
+ 
+             if is_related:
+                 relevant_evidence.append(evidence)
+ 
+         logger.info(f"Found {len(relevant_evidence)} relevant items out of {len(source_evidence)} from {source_name}")
+         return relevant_evidence
+     except Exception as e:
+         logger.error(f"Error in early evidence analysis: {e}")
+         return source_evidence  # On error, return original evidence
+ 
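
The entity-overlap fallback above is what catches paraphrased evidence that shares no long keywords with the claim. A standalone sketch of the same idea, assuming a spaCy pipeline such as en_core_web_sm (the project actually uses whatever get_nlp_model() loads):

# Assumes en_core_web_sm is installed; get_nlp_model() may load a different pipeline
import spacy

nlp = spacy.load("en_core_web_sm")
claim_doc = nlp("Justin Trudeau visited Washington")
evidence_doc = nlp("The Canadian PM, Justin Trudeau, arrived in Washington on Monday")

claim_ents = {ent.text.lower() for ent in claim_doc.ents}
evidence_ents = {ent.text.lower() for ent in evidence_doc.ents}
print(claim_ents & evidence_ents)  # shared entities mark the evidence as related
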
+ # New function to get a recent date range for filtering news
+ def get_recent_date_range():
+     """Return date range for recent news filtering - last 3 days"""
+     today = datetime.now()
+     three_days_ago = today - timedelta(days=3)
+     return three_days_ago.strftime('%Y-%m-%d'), today.strftime('%Y-%m-%d')
+ 
+ @api_error_handler("wikipedia")
+ def retrieve_evidence_from_wikipedia(claim):
+     """Retrieve evidence from Wikipedia for a given claim"""
+     logger.info(f"Retrieving evidence from Wikipedia for: {claim}")
+ 
+     # Ensure shortened_claim is a string
+     try:
+         shortened_claim = shorten_claim_for_evidence(claim)
+     except Exception as e:
+         logger.error(f"Error in claim shortening: {e}")
+         shortened_claim = claim  # Fallback to original claim
+ 
+     # Ensure query_parts is a list of strings
+     query_parts = str(shortened_claim).split()
+     evidence = []
+     source_count = {"wikipedia": 0}
+ 
+     for i in range(len(query_parts), 0, -1):  # Start with full query, shorten iteratively
+         try:
+             # Safely join and encode query
+             current_query = "+".join(query_parts[:i])
+             search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={current_query}&format=json"
+             logger.info(f"Wikipedia search URL: {search_url}")
+ 
+             headers = {
+                 "User-Agent": "MisinformationDetectionResearchBot/1.0 (Research Project)"
+             }
+ 
+             # Make the search request with reduced timeout
+             response = requests.get(search_url, headers=headers, timeout=7)
+             response.raise_for_status()
+ 
+             # Safely parse JSON
+             search_data = safe_json_parse(response, "wikipedia")
+ 
+             # Safely extract search results
+             search_results = search_data.get("query", {}).get("search", [])
+ 
+             # Ensure search_results is a list
+             if not isinstance(search_results, list):
+                 logger.warning(f"Unexpected search results type: {type(search_results)}")
+                 search_results = []
+ 
+             # Use ThreadPoolExecutor to fetch page content in parallel
+             with ThreadPoolExecutor(max_workers=3) as executor:
+                 # Submit up to 3 page requests in parallel
+                 futures = []
+                 for idx, result in enumerate(search_results[:3]):
+                     # Ensure result is a dictionary
+                     if not isinstance(result, dict):
+                         logger.warning(f"Skipping non-dictionary result: {type(result)}")
+                         continue
+ 
+                     # Safely extract title
+                     page_title = result.get("title", "")
+                     if not page_title:
+                         continue
+ 
+                     page_url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
+ 
+                     # Submit the page request task to executor
+                     futures.append(executor.submit(
+                         fetch_wikipedia_page_content,
+                         page_url,
+                         page_title,
+                         headers
+                     ))
+ 
+                 # Process completed futures as they finish
+                 for future in as_completed(futures):
+                     try:
+                         page_result = future.result()
+                         if page_result:
+                             evidence.append(page_result)
+                             source_count["wikipedia"] += 1
+                     except Exception as e:
+                         logger.error(f"Error processing Wikipedia page: {e}")
+ 
+             # Stop if we found any evidence
+             if evidence:
+                 break
+ 
+         except Exception as e:
+             logger.error(f"Error retrieving from Wikipedia: {str(e)}")
+             continue
+ 
+     # Ensure success is a boolean
+     success = bool(evidence)
+ 
+     # Safely log evidence retrieval
+     try:
+         performance_tracker.log_evidence_retrieval(success, source_count)
+     except Exception as e:
+         logger.error(f"Error logging evidence retrieval: {e}")
+ 
+     if not evidence:
+         logger.warning("No evidence found from Wikipedia.")
+ 
+     return evidence
+ 
+ def fetch_wikipedia_page_content(page_url, page_title, headers):
+     """Helper function to fetch and parse Wikipedia page content"""
+     try:
+         # Get page content with reduced timeout
+         page_response = requests.get(page_url, headers=headers, timeout=5)
+         page_response.raise_for_status()
+ 
+         # Extract relevant sections using BeautifulSoup
+         soup = BeautifulSoup(page_response.text, 'html.parser')
+         paragraphs = soup.find_all('p', limit=3)  # Limit to first 3 paragraphs
+         content = " ".join([para.get_text(strip=True) for para in paragraphs])
+ 
+         # Truncate content to reduce token usage earlier in the pipeline
+         if len(content) > 300:
+             content = content[:297] + "..."
+ 
+         if content.strip():  # Ensure content is not empty
+             return f"Title: {page_title}, URL: {page_url}, Content: {content}"
+         return None
+     except Exception as e:
+         logger.error(f"Error fetching Wikipedia page {page_url}: {e}")
+         return None
+ 
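
The Wikipedia loop above degrades the query one token at a time until a search succeeds; a tiny illustration with invented terms:

# Invented terms; shows the query sequence the loop would try
query_parts = ["Trudeau", "prime", "minister"]
for i in range(len(query_parts), 0, -1):
    print("+".join(query_parts[:i]))
# Trudeau+prime+minister
# Trudeau+prime
# Trudeau
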
+ # WikiData retrieval, updated to work around SSL certificate issues
+ @api_error_handler("wikidata")
+ def retrieve_evidence_from_wikidata(claim):
+     """Retrieve evidence from WikiData for a given claim"""
+     logger.info(f"Retrieving evidence from WikiData for: {claim}")
+ 
+     # Prepare entities for SPARQL query
+     shortened_claim = shorten_claim_for_evidence(claim)
+     query_terms = shortened_claim.split()
+ 
+     # Initialize SPARQLWrapper for WikiData
+     sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
+ 
+     # Use a more conservative user agent to avoid blocks
+     sparql.addCustomHttpHeader("User-Agent", "MisinformationDetectionResearchBot/1.0")
+ 
+     # Fix SSL issues by disabling SSL verification for this specific request
+     # (a verified alternative is sketched after this function)
+     try:
+         import urllib.request
+ 
+         # Create a context that doesn't verify certificates
+         ssl_context = ssl._create_unverified_context()
+ 
+         # Monkey patch the opener for SPARQLWrapper
+         opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=ssl_context))
+         urllib.request.install_opener(opener)
+     except Exception as e:
+         logger.error(f"Error setting up SSL context: {str(e)}")
+ 
+     # Construct basic SPARQL query for relevant entities
+     query = """
+     SELECT ?item ?itemLabel ?description ?article WHERE {
+       SERVICE wikibase:mwapi {
+         bd:serviceParam wikibase:api "EntitySearch" .
+         bd:serviceParam wikibase:endpoint "www.wikidata.org" .
+         bd:serviceParam mwapi:search "%s" .
+         bd:serviceParam mwapi:language "en" .
+         ?item wikibase:apiOutputItem mwapi:item .
+       }
+       ?item schema:description ?description .
+       FILTER(LANG(?description) = "en")
+       OPTIONAL {
+         ?article schema:about ?item .
+         ?article schema:isPartOf <https://en.wikipedia.org/> .
+       }
+       SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
+     }
+     LIMIT 5
+     """ % " ".join(query_terms)
+ 
+     sparql.setQuery(query)
+     sparql.setReturnFormat(JSON)
+ 
+     try:
+         results = sparql.query().convert()
+ 
+         wikidata_evidence = []
+ 
+         for result in results["results"]["bindings"]:
+             entity_label = result.get("itemLabel", {}).get("value", "Unknown")
+             description = result.get("description", {}).get("value", "No description")
+             article_url = result.get("article", {}).get("value", "")
+ 
+             # Truncate description to reduce token usage
+             if len(description) > 200:
+                 description = description[:197] + "..."
+ 
+             evidence_text = f"Entity: {entity_label}, Description: {description}"
+             if article_url:
+                 evidence_text += f", URL: {article_url}"
+ 
+             wikidata_evidence.append(evidence_text)
+ 
+         logger.info(f"Retrieved {len(wikidata_evidence)} WikiData entities")
+         return wikidata_evidence
+ 
+     except Exception as e:
+         logger.error(f"Error retrieving from WikiData: {str(e)}")
+         return []
+ 
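
Disabling certificate verification unblocks the query service but weakens transport security. Where the underlying failure is a missing CA bundle rather than a broken certificate chain, a safer sketch is to point the SSL context at certifi's bundle instead (this assumes the certifi package is available; it ships as a dependency of requests):

# Alternative sketch: keep verification on, but use certifi's CA bundle
import ssl
import urllib.request
import certifi

ssl_context = ssl.create_default_context(cafile=certifi.where())
opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=ssl_context))
urllib.request.install_opener(opener)
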
+ @api_error_handler("openalex")
+ def retrieve_evidence_from_openalex(claim):
+     """Retrieve evidence from OpenAlex for a given claim (replacement for Semantic Scholar)"""
+     logger.info(f"Retrieving evidence from OpenAlex for: {claim}")
+ 
+     try:
+         shortened_claim = shorten_claim_for_evidence(claim)
+         query = shortened_claim.replace(" ", "+")
+ 
+         # OpenAlex API endpoint
+         api_url = f"https://api.openalex.org/works?search={query}&filter=is_paratext:false&per_page=3"
+ 
+         headers = {
+             "Accept": "application/json",
+             "User-Agent": "MisinformationDetectionResearchBot/1.0 ([email protected])",
+         }
+ 
+         scholarly_evidence = []
+ 
+         try:
+             # Request with reduced timeout
+             response = requests.get(api_url, headers=headers, timeout=8)
+ 
+             # Check response status
+             if response.status_code == 200:
+                 # Successfully retrieved data
+                 data = safe_json_parse(response, "openalex")
+                 papers = data.get("results", [])
+ 
+                 for paper in papers:
+                     title = paper.get("title", "Unknown Title")
+                     abstract = paper.get("abstract_inverted_index", None)
+ 
+                     # OpenAlex stores abstracts in an inverted index format, so we need to reconstruct the text
+                     abstract_text = "No abstract available"
+                     if abstract:
+                         try:
+                             # Simple approach to reconstruct from inverted index.
+                             # For a production app, implement a proper reconstruction
+                             # algorithm (see the sketch after this function).
+                             words = list(abstract.keys())
+                             abstract_text = " ".join(words[:30]) + "..."
+                         except Exception as e:
+                             logger.error(f"Error reconstructing abstract: {e}")
+ 
+                     url = paper.get("doi", "")
+                     if url and not url.startswith("http"):
+                         url = f"https://doi.org/{url}"
+ 
+                     year = ""
+                     publication_date = paper.get("publication_date", "")
+                     if publication_date:
+                         year = publication_date.split("-")[0]
+ 
+                     # Truncate abstract to reasonable length
+                     if len(abstract_text) > 250:
+                         abstract_text = abstract_text[:247] + "..."
+ 
+                     evidence_text = f"Title: {title}, Year: {year}, Abstract: {abstract_text}, URL: {url}"
+                     scholarly_evidence.append(evidence_text)
+ 
+             else:
+                 logger.error(f"OpenAlex API error: {response.status_code}")
+ 
+         except requests.exceptions.Timeout:
+             logger.warning("OpenAlex request timed out")
+         except requests.exceptions.ConnectionError:
+             logger.warning("OpenAlex connection error")
+         except Exception as e:
+             logger.error(f"Unexpected error in OpenAlex request: {str(e)}")
+ 
+         logger.info(f"Retrieved {len(scholarly_evidence)} scholarly papers from OpenAlex")
+         return scholarly_evidence
+ 
+     except Exception as e:
+         logger.error(f"Fatal error in OpenAlex retrieval: {str(e)}")
+         return []
+ 
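
OpenAlex's abstract_inverted_index maps each word to the list of token positions where it occurs, so a faithful reconstruction sorts words by position instead of concatenating dictionary keys. A minimal sketch of the "proper reconstruction algorithm" the comment above defers to:

# Position-aware reconstruction of an OpenAlex abstract_inverted_index
def reconstruct_abstract(inverted_index):
    """Rebuild abstract text from {word: [positions]} by sorting on position."""
    positioned_words = []
    for word, positions in inverted_index.items():
        for pos in positions:
            positioned_words.append((pos, word))
    return " ".join(word for _, word in sorted(positioned_words))

print(reconstruct_abstract({"Deep": [0], "works": [2], "learning": [1]}))
# Deep learning works
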
+ @api_error_handler("factcheck")
+ def retrieve_evidence_from_claimreview(claim):
+     """Retrieve evidence from Google's ClaimReview for a given claim"""
+     logger.info(f"Retrieving evidence from ClaimReview for: {claim}")
+     factcheck_api_key = FACTCHECK_API_KEY
+ 
+     # Safely shorten claim
+     try:
+         shortened_claim = shorten_claim_for_evidence(claim)
+     except Exception as e:
+         logger.error(f"Error shortening claim: {e}")
+         shortened_claim = claim
+ 
+     query_parts = str(shortened_claim).split()
+     factcheck_results = []
+     source_count = {"factcheck": 0}
+ 
+     for i in range(len(query_parts), 0, -1):  # Iteratively try shorter queries
+         try:
+             current_query = " ".join(query_parts[:i])
+             encoded_query = urlencode({"query": current_query})
+             factcheck_url = f"https://factchecktools.googleapis.com/v1alpha1/claims:search?{encoded_query}&key={factcheck_api_key}"
+             logger.info(f"Factcheck URL: {factcheck_url}")
+ 
+             # Make request with reduced timeout
+             response = requests.get(factcheck_url, timeout=7)
+             response.raise_for_status()
+             data = safe_json_parse(response, "factcheck")
+ 
+             # Safely extract claims
+             claims = data.get("claims", [])
+             if not isinstance(claims, list):
+                 logger.warning(f"Unexpected claims type: {type(claims)}")
+                 claims = []
+ 
+             if claims:  # If results found
+                 logger.info(f"Results found for query '{current_query}'.")
+                 for item in claims:
+                     try:
+                         # Ensure item is a dictionary
+                         if not isinstance(item, dict):
+                             logger.warning(f"Skipping non-dictionary item: {type(item)}")
+                             continue
+ 
+                         claim_text = str(item.get("text", ""))
+                         # Truncate claim text
+                         if len(claim_text) > 200:
+                             claim_text = claim_text[:197] + "..."
+ 
+                         reviews = item.get("claimReview", [])
+ 
+                         # Ensure reviews is a list
+                         if not isinstance(reviews, list):
+                             logger.warning(f"Unexpected reviews type: {type(reviews)}")
+                             reviews = []
+ 
+                         for review in reviews:
+                             # Ensure review is a dictionary
+                             if not isinstance(review, dict):
+                                 logger.warning(f"Skipping non-dictionary review: {type(review)}")
+                                 continue
+ 
+                             publisher = str(review.get("publisher", {}).get("name", "Unknown Source"))
+                             rating = str(review.get("textualRating", "Unknown"))
+                             review_url = str(review.get("url", ""))
+ 
+                             if claim_text:
+                                 factcheck_results.append(
+                                     f"Claim: {claim_text}, Rating: {rating}, " +
+                                     f"Source: {publisher}, URL: {review_url}"
+                                 )
+                                 source_count["factcheck"] += 1
+ 
+                     except Exception as e:
+                         logger.error(f"Error processing FactCheck result: {e}")
+ 
+                 break  # Break once we have results
+             else:
+                 logger.info(f"No results for query '{current_query}', trying shorter version.")
+ 
+         except Exception as e:
+             logger.error(f"Error in FactCheck retrieval: {e}")
+ 
+     # Safely log evidence retrieval
+     try:
+         success = bool(factcheck_results)
+         performance_tracker.log_evidence_retrieval(success, source_count)
+     except Exception as e:
+         logger.error(f"Error logging evidence retrieval: {e}")
+ 
+     if not factcheck_results:
+         logger.warning("No factcheck evidence found after trying all query variants.")
+ 
+     return factcheck_results
+ 
+ @api_error_handler("newsapi")
+ def retrieve_news_articles(claim):
+     """Retrieve evidence from NewsAPI for a given claim with improved single-request approach"""
+     logger.info(f"Retrieving evidence from News API for: {claim}")
+ 
+     # Get API key
+     news_api_key = NEWS_API_KEY
+     if not news_api_key:
+         logger.error("No NewsAPI key available")
+         return []
+ 
+     news_results = []
+     source_count = {"news": 0}
+ 
+     # Get date range for recent news
+     from_date, to_date = get_recent_date_range()
+     logger.info(f"Filtering for news from {from_date} to {to_date}")
+ 
+     try:
+         # Extract a simplified claim for better matching
+         shortened_claim = shorten_claim_for_evidence(claim)
+ 
+         # Use a single endpoint with proper parameters
+         encoded_query = urlencode({"q": shortened_claim})
+ 
+         # Use the 'everything' endpoint as it's more comprehensive
+         news_api_url = f"https://newsapi.org/v2/everything?{encoded_query}&apiKey={news_api_key}&language=en&pageSize=5&sortBy=publishedAt&from={from_date}&to={to_date}"
+ 
+         log_url = news_api_url.replace(news_api_key, "API_KEY_REDACTED")
+         logger.info(f"Requesting: {log_url}")
+ 
+         # Make a single request with proper headers and reduced timeout
+         headers = {
+             "User-Agent": "MisinformationDetectionResearchBot/1.0",
+             "X-Api-Key": news_api_key,
+             "Accept": "application/json"
+         }
+ 
+         response = requests.get(
+             news_api_url,
+             headers=headers,
+             timeout=8
+         )
+ 
+         logger.info(f"Response status: {response.status_code}")
+ 
+         if response.status_code == 200:
+             data = safe_json_parse(response, "newsapi")
+ 
+             if data.get("status") == "ok":
+                 articles = data.get("articles", [])
+                 logger.info(f"Found {len(articles)} articles")
+ 
+                 for article in articles:
+                     try:
+                         # Robust article parsing
+                         title = str(article.get("title", ""))
+                         description = str(article.get("description", ""))
+                         content = str(article.get("content", ""))
+                         source_name = str(article.get("source", {}).get("name", "Unknown"))
+                         url = str(article.get("url", ""))
+                         published_at = str(article.get("publishedAt", ""))
+ 
+                         # Parse date to prioritize recent content
+                         article_date = None
+                         try:
+                             if published_at:
+                                 article_date = datetime.strptime(published_at.split('T')[0], '%Y-%m-%d')
+                         except Exception as date_error:
+                             logger.warning(f"Could not parse date: {published_at}")
+ 
+                         # Calculate recency score (higher = more recent)
+                         recency_score = 1.0  # Default
+                         if article_date:
+                             days_old = (datetime.now() - article_date).days
+                             if days_old == 0:  # Today
+                                 recency_score = 3.0
+                             elif days_old == 1:  # Yesterday
+                                 recency_score = 2.0
+ 
+                         # Use description if content is empty or too short
+                         if not content or len(content) < 50:
+                             content = description
+ 
+                         # Truncate content to reduce token usage
+                         if len(content) > 250:
+                             content = content[:247] + "..."
+ 
+                         # Ensure meaningful content
+                         if title and (content or description):
+                             news_item = {
+                                 "text": (
+                                     f"Title: {title}, " +
+                                     f"Source: {source_name}, " +
+                                     f"Date: {published_at}, " +
+                                     f"URL: {url}, " +
+                                     f"Content: {content}"
+                                 ),
+                                 "recency_score": recency_score,
+                                 "date": article_date
+                             }
+                             news_results.append(news_item)
+                             source_count["news"] += 1
+                             logger.info(f"Added article: {title}")
+ 
+                     except Exception as article_error:
+                         logger.error(f"Error processing article: {article_error}")
+ 
+         # Sort results by recency
+         if news_results:
+             news_results.sort(key=lambda x: x.get('recency_score', 0), reverse=True)
+ 
+     except Exception as query_error:
+         logger.error(f"Error processing query: {query_error}")
+ 
+     # Convert to plain text list for compatibility with existing code
+     news_texts = [item["text"] for item in news_results]
+ 
+     # Log evidence retrieval
+     try:
+         success = bool(news_texts)
+         performance_tracker.log_evidence_retrieval(success, source_count)
+     except Exception as log_error:
+         logger.error(f"Error logging evidence retrieval: {log_error}")
+ 
+     # Log results
+     if news_texts:
+         logger.info(f"Retrieved {len(news_texts)} news articles")
+     else:
+         logger.warning("No news articles found")
+ 
+     return news_texts
+ 
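
The recency weighting above is deliberately coarse: 3.0 for today, 2.0 for yesterday, 1.0 otherwise. A tiny sketch with invented articles showing the resulting order before the items are flattened to text:

# Invented items; demonstrates the recency sort used above
articles = [
    {"text": "older piece", "recency_score": 1.0},
    {"text": "published today", "recency_score": 3.0},
    {"text": "published yesterday", "recency_score": 2.0},
]
articles.sort(key=lambda x: x.get("recency_score", 0), reverse=True)
print([a["text"] for a in articles])
# ['published today', 'published yesterday', 'older piece']
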
590
+ def retrieve_combined_evidence(claim):
591
+ """
592
+ Retrieve evidence from multiple sources in parallel and analyze relevance using semantic similarity
593
+ with category-aware source prioritization and optimized parallel processing
594
+ """
595
+ logger.info(f"Starting evidence retrieval for: {claim}")
596
+ start_time = time.time()
597
+
598
+ # Use the category detector to prioritize sources
599
+ from modules.category_detection import get_prioritized_sources, get_category_specific_rss_feeds
600
+
601
+ # Get source priorities based on claim category
602
+ priorities = get_prioritized_sources(claim)
603
+ claim_category = priorities.get("category", "general")
604
+ requires_recent_evidence = priorities.get("requires_recent", False)
605
+
606
+ logger.info(f"Detected claim category: {claim_category} (recent: {requires_recent_evidence})")
607
+
608
+ # Initialize results dictionary
609
+ results = {
610
+ "wikipedia": [],
611
+ "wikidata": [],
612
+ "claimreview": [],
613
+ "news": [],
614
+ "scholarly": [],
615
+ "rss": []
616
+ }
617
+
618
+ # Track source counts and relevant evidence
619
+ source_counts = {}
620
+ relevant_evidence = {}
621
+ total_evidence_count = 0
622
+ relevant_evidence_count = 0
623
+
624
+ # Define primary and secondary sources outside the try block
625
+ # so they're available in the except block
626
+ primary_sources = []
627
+ for source_name in priorities.get("primary", []):
628
+ if source_name == "wikipedia":
629
+ primary_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
630
+ elif source_name == "wikidata":
631
+ primary_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
632
+ elif source_name == "claimreview":
633
+ primary_sources.append(("claimreview", retrieve_evidence_from_claimreview, claim))
634
+ elif source_name == "news":
635
+ primary_sources.append(("news", retrieve_news_articles, claim))
636
+ elif source_name == "scholarly":
637
+ primary_sources.append(("scholarly", retrieve_evidence_from_openalex, claim))
638
+ elif source_name == "rss":
639
+ # Get category-specific RSS max count
640
+ max_results = 8 if requires_recent_evidence else 5
641
+
642
+ # If the claim is science or technology related and we need to optimize
643
+ # use category-specific RSS feeds
644
+ if claim_category in ["science", "technology", "politics"]:
645
+ # Get specialized RSS module to temporarily use category-specific feeds
646
+ category_feeds = get_category_specific_rss_feeds(claim_category)
647
+ if category_feeds:
648
+ primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results, category_feeds))
649
+ else:
650
+ primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
651
+ else:
652
+ primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
653
+
654
+ # Prepare secondary sources
655
+ secondary_sources = []
656
+ for source_name in priorities.get("secondary", []):
657
+ if source_name == "wikipedia":
658
+ secondary_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
659
+ elif source_name == "wikidata":
660
+ secondary_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
661
+ elif source_name == "claimreview":
662
+ secondary_sources.append(("claimreview", retrieve_evidence_from_claimreview, claim))
663
+ elif source_name == "news":
664
+ secondary_sources.append(("news", retrieve_news_articles, claim))
665
+ elif source_name == "scholarly":
666
+ secondary_sources.append(("scholarly", retrieve_evidence_from_openalex, claim))
667
+ elif source_name == "rss":
668
+ max_results = 5 if requires_recent_evidence else 3
669
+ # Use category-specific feeds if available
670
+ if claim_category in ["science", "technology", "politics"]:
671
+ category_feeds = get_category_specific_rss_feeds(claim_category)
672
+ if category_feeds:
673
+ secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results, category_feeds))
674
+ else:
675
+ secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
676
+ else:
677
+ secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
678
+
679
+ # Optimize parallel processing for evidence retrieval with early results processing
680
+ try:
681
+ # Define function to safely retrieve evidence
682
+ def safe_retrieve(source_name, retrieval_func, *args):
683
+ try:
684
+ source_result = retrieval_func(*args) or []
685
+ return source_name, source_result
686
+ except Exception as e:
687
+ logger.error(f"Error retrieving from {source_name}: {str(e)}")
688
+ return source_name, []
689
+
690
+ # Define function to analyze evidence relevance
691
+ def analyze_evidence_quick(evidence_items, claim_text):
692
+ if not evidence_items or not claim_text:
693
+ return []
694
+
695
+ # Extract important keywords from claim
696
+ keywords = [word.lower() for word in claim_text.split() if len(word) > 3]
697
+
698
+ # Check for direct relevance
699
+ relevant_items = []
700
+ for evidence in evidence_items:
701
+ if not isinstance(evidence, str):
702
+ continue
703
+
704
+ evidence_lower = evidence.lower()
705
+
706
+ # Check if evidence contains any important keywords from claim
707
+ if any(keyword in evidence_lower for keyword in keywords):
708
+ relevant_items.append(evidence)
709
+ continue
710
+
711
+ # Check for claim subject in evidence (e.g. "earth" in "earth is flat")
712
+ claim_parts = claim_text.split()
713
+ if len(claim_parts) > 0 and claim_parts[0].lower() in evidence_lower:
714
+ relevant_items.append(evidence)
715
+ continue
716
+
717
+ return relevant_items
718
+
719
+ # Use ThreadPoolExecutor with a reasonable number of workers
720
+ # Start with primary sources first - use all available sources in parallel
721
+ with ThreadPoolExecutor(max_workers=min(4, len(primary_sources))) as executor:
722
+ # Submit all primary source tasks
723
+ futures_to_source = {
724
+ executor.submit(safe_retrieve, source_name, func, *args): source_name
725
+ for source_name, func, *args in primary_sources
726
+ }
727
+
728
+ # Track completed sources
729
+ completed_sources = set()
730
+
731
+ # Process results as they complete using as_completed for early processing
732
+ for future in as_completed(futures_to_source):
733
+ try:
734
+ source_name, source_results = future.result()
735
+ results[source_name] = source_results
736
+ source_counts[source_name] = len(source_results)
737
+ completed_sources.add(source_name)
738
+ logger.info(f"Retrieved {len(source_results)} results from {source_name}")
739
+
740
+ # Quick relevance analysis
741
+ if source_results:
742
+ relevant_items = analyze_evidence_quick(source_results, claim)
743
+ relevant_evidence[source_name] = relevant_items
744
+ total_evidence_count += len(source_results)
745
+ relevant_evidence_count += len(relevant_items)
746
+ logger.info(f"Found {len(relevant_items)} relevant items out of {len(source_results)} from {source_name}")
747
+
748
+ # Start background pre-analysis while waiting for other sources
749
+ try:
750
+ executor.submit(
751
+ analyze_early_evidence,
752
+ claim,
753
+ source_name,
754
+ source_results
755
+ )
756
+ except Exception as e:
757
+ logger.error(f"Error in early evidence analysis: {e}")
758
+
759
+ except Exception as e:
760
+ logger.error(f"Error processing future result: {str(e)}")
761
+
762
+ # Check if we have sufficient RELEVANT evidence from primary sources
763
+ # If not enough relevant evidence, query secondary sources
764
+ # in parallel even if we have a lot of total evidence
765
+ if relevant_evidence_count < 2:
766
+ logger.info(f"Only found {relevant_evidence_count} relevant evidence items, querying secondary sources")
767
+
768
+ # Add Wikipedia and Wikidata if they weren't in primary sources and haven't been queried yet
769
+ must_check_sources = []
770
+ if "wikipedia" not in completed_sources:
771
+ must_check_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
772
+
773
+ if "wikidata" not in completed_sources:
774
+ must_check_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
775
+
776
+ # Combine with other secondary sources
777
+ remaining_sources = must_check_sources + [
778
+ (source_name, func, *args) for source_name, func, *args in secondary_sources
779
+ if source_name not in completed_sources
780
+ ]
781
+
782
+ with ThreadPoolExecutor(max_workers=min(3, max(1, len(remaining_sources)))) as executor:
783
+ # Submit all secondary source tasks
784
+ futures_to_source = {
785
+ executor.submit(safe_retrieve, source_name, func, *args): source_name
786
+ for source_name, func, *args in remaining_sources
787
+ }
788
+
789
+ # Process results as they complete
790
+ for future in as_completed(futures_to_source):
791
+ try:
792
+ source_name, source_results = future.result()
793
+ results[source_name] = source_results
794
+ source_counts[source_name] = len(source_results)
795
+ logger.info(f"Retrieved {len(source_results)} results from {source_name}")
796
+
797
+ # Quick relevance analysis for these as well
798
+ if source_results:
799
+ relevant_items = analyze_evidence_quick(source_results, claim)
800
+ relevant_evidence[source_name] = relevant_items
801
+ total_evidence_count += len(source_results)
802
+ relevant_evidence_count += len(relevant_items)
803
+ logger.info(f"Found {len(relevant_items)} relevant items out of {len(source_results)} from {source_name}")
804
+ except Exception as e:
805
+ logger.error(f"Error processing future result: {str(e)}")
806
+
807
+ except Exception as e:
808
+ logger.error(f"Error in parallel evidence retrieval: {str(e)}")
809
+ # Fall back to sequential retrieval as a last resort
810
+ try:
811
+ logger.warning("Falling back to sequential retrieval due to parallel execution failure")
812
+ # Sequential retrieval as a fallback, reusing the primary_sources list built above
813
+ for source_name, func, *args in primary_sources:
814
+ try:
815
+ results[source_name] = func(*args) or []
816
+ source_counts[source_name] = len(results[source_name])
817
+ except Exception as source_error:
818
+ logger.error(f"Error in sequential {source_name} retrieval: {str(source_error)}")
819
+
820
+ # For sequential retrieval, always check Wikipedia and Wikidata as fallbacks; test the results dict, since completed_sources may never have been set if the parallel stage failed early
821
+ if "wikipedia" not in results:
822
+ try:
823
+ results["wikipedia"] = retrieve_evidence_from_wikipedia(claim) or []
824
+ source_counts["wikipedia"] = len(results["wikipedia"])
825
+ except Exception as e:
826
+ logger.error(f"Error in fallback Wikipedia retrieval: {e}")
827
+
828
+ if "wikidata" not in results:
829
+ try:
830
+ results["wikidata"] = retrieve_evidence_from_wikidata(claim) or []
831
+ source_counts["wikidata"] = len(results["wikidata"])
832
+ except Exception as e:
833
+ logger.error(f"Error in fallback Wikidata retrieval: {e}")
834
+
835
+ except Exception as fallback_error:
836
+ logger.error(f"Error in fallback sequential retrieval: {str(fallback_error)}")
837
+
838
+ # Gather all evidence
839
+ all_evidence = []
840
+ for source, items in results.items():
841
+ if isinstance(items, list):
842
+ for item in items:
843
+ if item and isinstance(item, str):
844
+ all_evidence.append(item)
845
+
846
+ # Skip processing if no evidence
847
+ if not all_evidence:
848
+ logger.warning("No evidence collected")
849
+
850
+ # Fallback: try direct search for the claim subject
851
+ try:
852
+ logger.info("No evidence found, trying fallback subject search")
853
+
854
+ # Extract the main subject using NLP
855
+ nlp = get_nlp_model()
856
+ doc = nlp(claim)
857
+
858
+ # Find main subject entities or nouns
859
+ subjects = []
860
+ for ent in doc.ents:
861
+ if ent.label_ in ["PERSON", "ORG", "GPE"]:
862
+ subjects.append(ent.text)
863
+
864
+ # If no entities found, use first noun phrase
865
+ if not subjects:
866
+ for chunk in doc.noun_chunks:
867
+ subjects.append(chunk.text)
868
+ break
869
+
870
+ if subjects:
871
+ # Try a direct search with just the subject
872
+ logger.info(f"Trying fallback search with subject: {subjects[0]}")
873
+
874
+ # Make sure we try Wikipedia for the subject regardless of priorities
875
+ try:
876
+ wiki_evidence = retrieve_evidence_from_wikipedia(subjects[0]) or []
877
+ all_evidence.extend(wiki_evidence)
878
+ logger.info(f"Retrieved {len(wiki_evidence)} results from fallback Wikipedia search")
879
+ except Exception as e:
880
+ logger.error(f"Error in fallback Wikipedia search: {e}")
881
+
882
+ # If still no evidence, try other sources
883
+ if not all_evidence:
884
+ # Do fallback searches in parallel
885
+ with ThreadPoolExecutor(max_workers=2) as executor:
886
+ fallback_futures = {
887
+ "news": executor.submit(retrieve_news_articles, subjects[0]),
888
+ "wikidata": executor.submit(retrieve_evidence_from_wikidata, subjects[0])
889
+ }
890
+
891
+ # Process results as they complete
892
+ for source, future in fallback_futures.items():
893
+ try:
894
+ fallback_results = future.result() or []
895
+ if fallback_results:
896
+ all_evidence.extend(fallback_results[:2]) # Add up to 2 results from each
897
+ logger.info(f"Retrieved {len(fallback_results)} results from fallback {source} search")
898
+ except Exception as e:
899
+ logger.error(f"Error in fallback {source} search: {str(e)}")
900
+
901
+ except Exception as subj_error:
902
+ logger.error(f"Error in fallback subject search: {str(subj_error)}")
903
+
904
+ # If still no evidence, return empty list
905
+ if not all_evidence:
906
+ return []
907
+
908
+ # Use semantic analysis to score and select the most relevant evidence
909
+ try:
910
+ # For science and technology claims, boost the weight of scholarly sources
911
+ if claim_category in ["science", "technology"]:
912
+ from config import SOURCE_CREDIBILITY
913
+ # Create a temporary copy with boosted reliability for relevant sources
914
+ enhanced_credibility = dict(SOURCE_CREDIBILITY)
915
+
916
+ # Add enhanced weights for scientific sources
917
+ from modules.category_detection import SOURCE_RELIABILITY_BY_CATEGORY
918
+ for domain, reliability in SOURCE_RELIABILITY_BY_CATEGORY.get(claim_category, {}).items():
919
+ enhanced_credibility[domain] = reliability
920
+
921
+ # Use the enhanced credibility for evidence analysis
922
+ analyzed_evidence = analyze_evidence_relevance(claim, all_evidence, enhanced_credibility)
923
+ else:
924
+ # Analyze evidence relevance using semantic similarity with default weights
925
+ from config import SOURCE_CREDIBILITY
926
+ analyzed_evidence = analyze_evidence_relevance(claim, all_evidence, SOURCE_CREDIBILITY)
927
+
928
+ # Log evidence scoring
929
+ logger.info(f"Analyzed {len(analyzed_evidence)} evidence items")
930
+
931
+ # Select diverse, relevant evidence items
932
+ final_evidence = select_diverse_evidence(analyzed_evidence, max_items=5)
933
+
934
+ # Log source distribution and selected count
935
+ logger.info(f"Evidence source distribution: {source_counts}")
936
+ logger.info(f"Selected evidence count: {len(final_evidence)}")
937
+
938
+ # Return maximum 5 evidence items (to control API costs)
939
+ return final_evidence[:5]
940
+
941
+ except Exception as e:
942
+ logger.error(f"Error in evidence analysis: {str(e)}")
943
+ # Fallback to simple selection (top 5 items)
944
+ return all_evidence[:5]
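The routine above closes the combined evidence retrieval path, returning at most five formatted evidence strings per claim. A minimal usage sketch (hypothetical driver script; the function name and claim-string signature are assumed from this package's exports):

    # sketch only, not part of the upload
    from modules.evidence_retrieval import retrieve_combined_evidence

    claim = "The Eiffel Tower is located in Paris"
    for i, item in enumerate(retrieve_combined_evidence(claim), start=1):
        print(f"[{i}] {item[:120]}")  # each item is a formatted evidence string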
modules/explanation.py ADDED
@@ -0,0 +1,181 @@
1
+ import logging
2
+ import re
3
+ import ast
4
+ from utils.models import get_llm_model
5
+
6
+ logger = logging.getLogger("misinformation_detector")
7
+
8
+ def extract_most_relevant_evidence(evidence_results):
9
+ """
10
+ Intelligently extract the most relevant piece of evidence
11
+
12
+ Args:
13
+ evidence_results (list): List of evidence items
14
+
15
+ Returns:
16
+ str: Most relevant evidence piece
17
+ """
18
+ if not evidence_results:
19
+ return None
20
+
21
+ # If evidence is a dictionary with 'evidence' key
22
+ if isinstance(evidence_results[0], dict):
23
+ # Sort by confidence if available
24
+ sorted_evidence = sorted(
25
+ evidence_results,
26
+ key=lambda x: x.get('confidence', 0),
27
+ reverse=True
28
+ )
29
+
30
+ # Return the evidence from the highest confidence item
31
+ for item in sorted_evidence:
32
+ evidence = item.get('evidence')
33
+ if evidence:
34
+ return evidence
35
+
36
+ # If plain list of evidence
37
+ return next((ev for ev in evidence_results if ev and isinstance(ev, str)), None)
38
+
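+ # Illustrative example: given [{"evidence": "A", "confidence": 0.9},
+ # {"evidence": "B", "confidence": 0.4}] this returns "A"; for a plain
+ # list like ["X", "", "Y"] it returns the first non-empty string, "X"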
39
+ def generate_explanation(claim, evidence_results, truth_label, confidence=None):
40
+ """
41
+ Generate an explanation for the claim's classification
42
+
43
+ Args:
44
+ claim (str): The original claim
45
+ evidence_results (list/str): Evidence supporting the classification
46
+ truth_label (str): Classification of the claim
47
+ confidence (float): Confidence level (0-1)
48
+
49
+ Returns:
50
+ str: Explanation of the claim's classification
51
+ """
52
+ logger.info(f"Generating explanation for claim with verdict: {truth_label}")
53
+
54
+ try:
55
+ # Normalize evidence_results to a list
56
+ if not isinstance(evidence_results, list):
57
+ try:
58
+ evidence_results = ast.literal_eval(str(evidence_results)) if evidence_results else []
59
+ except (ValueError, SyntaxError):
60
+ evidence_results = [evidence_results] if evidence_results else []
61
+
62
+ # Get the LLM model
63
+ explanation_model = get_llm_model()
64
+
65
+ # Extract most relevant evidence
66
+ most_relevant_evidence = extract_most_relevant_evidence(evidence_results)
67
+
68
+ # Prepare evidence text for prompt
69
+ evidence_text = "\n".join([
70
+ f"Evidence {i+1}: {str(ev)[:200] + '...' if len(str(ev)) > 200 else str(ev)}"
71
+ for i, ev in enumerate(evidence_results[:3])
72
+ ])
73
+
74
+ # Convert confidence to percentage and description
75
+ confidence_desc = ""
76
+ if confidence is not None:
77
+ confidence_pct = int(confidence * 100)
78
+ if confidence < 0.3:
79
+ confidence_desc = f"very low confidence ({confidence_pct}%)"
80
+ elif confidence < 0.5:
81
+ confidence_desc = f"low confidence ({confidence_pct}%)"
82
+ elif confidence < 0.7:
83
+ confidence_desc = f"moderate confidence ({confidence_pct}%)"
84
+ elif confidence < 0.9:
85
+ confidence_desc = f"high confidence ({confidence_pct}%)"
86
+ else:
87
+ confidence_desc = f"very high confidence ({confidence_pct}%)"
88
+ else:
89
+ # Determine confidence context from label if not explicitly provided
90
+ confidence_desc = (
91
+ "high confidence" if "High Confidence" in truth_label else
92
+ "moderate confidence" if "Likely" in truth_label else
93
+ "low confidence"
94
+ )
95
+
96
+ # Create prompt with specific instructions based on the type of claim
97
+ has_negation = bool(re.search(r"\b(not|no longer|isn't|doesn't|won't|cannot)\b", claim.lower()))
98
+
99
+ # For claims with "True" verdict
100
+ if "True" in truth_label:
101
+ prompt = f"""
102
+ Claim: "{claim}"
103
+
104
+ Verdict: {truth_label} (with {confidence_desc})
105
+
106
+ Available Evidence:
107
+ {evidence_text}
108
+
109
+ Task: Generate a clear explanation that:
110
+ 1. Clearly states that the claim IS TRUE based on the evidence
111
+ 2. {"Pays special attention to the logical relationship, since the claim contains negation" if has_negation else "Explains why the evidence supports the claim"}
112
+ 3. Uses confidence level of {confidence_desc}
113
+ 4. Highlights the most relevant supporting evidence
114
+ 5. Is factual and precise
115
+ """
116
+
117
+ # For claims with "False" verdict
118
+ elif "False" in truth_label:
119
+ prompt = f"""
120
+ Claim: "{claim}"
121
+
122
+ Verdict: {truth_label} (with {confidence_desc})
123
+
124
+ Available Evidence:
125
+ {evidence_text}
126
+
127
+ Task: Generate a clear explanation that:
128
+ 1. Clearly states that the claim IS FALSE based on the evidence
129
+ 2. {"Pays special attention to the logical relationship, since the claim contains negation" if has_negation else "Explains why the evidence contradicts the claim"}
130
+ 3. Uses confidence level of {confidence_desc}
131
+ 4. Highlights the contradicting evidence
132
+ 5. Is factual and precise
133
+
134
+ IMPORTANT: If the claim contains negation (words like 'not', 'no longer', etc.), be extra careful with the logical relationship between the evidence and the claim.
135
+ """
136
+
137
+ # For uncertain claims
138
+ else:
139
+ prompt = f"""
140
+ Claim: "{claim}"
141
+
142
+ Verdict: {truth_label} (with {confidence_desc})
143
+
144
+ Available Evidence:
145
+ {evidence_text}
146
+
147
+ Task: Generate a clear explanation that:
148
+ 1. Clearly states that there is insufficient evidence to determine if the claim is true or false
149
+ 2. Explains what information is missing or why the available evidence is insufficient
150
+ 3. Uses confidence level of {confidence_desc}
151
+ 4. Makes NO speculation about whether the claim might be true or false
152
+ 5. Mentions that the user should seek information from other reliable sources
153
+ """
154
+
155
+ # Generate explanation with multiple attempts
156
+ max_attempts = 3
157
+ for attempt in range(max_attempts):
158
+ try:
159
+ # Invoke the model
160
+ response = explanation_model.invoke(prompt)
161
+ explanation = response.content.strip()
162
+
163
+ # Validate explanation length
164
+ if explanation and len(explanation.split()) >= 5:
165
+ return explanation
166
+
167
+ except Exception as attempt_error:
168
+ logger.error(f"Explanation generation attempt {attempt+1} failed: {str(attempt_error)}")
169
+
170
+ # Ultimate fallback explanation
171
+ if "Uncertain" in truth_label:
172
+ return f"The claim '{claim}' cannot be verified due to insufficient evidence. The available information does not provide clear support for or against this claim. Consider consulting reliable sources for verification."
173
+ elif "True" in truth_label:
174
+ return f"The claim '{claim}' is supported by the evidence with {confidence_desc}. {most_relevant_evidence or 'The evidence indicates this claim is accurate.'}"
175
+ else:
176
+ return f"The claim '{claim}' is contradicted by the evidence with {confidence_desc}. {most_relevant_evidence or 'The evidence indicates this claim is not accurate.'}"
177
+
178
+ except Exception as e:
179
+ logger.error(f"Comprehensive error in explanation generation: {str(e)}")
180
+ # Final fallback
181
+ return f"The claim is classified as {truth_label} based on the available evidence."
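A quick way to exercise this module, assuming a chat model is configured behind get_llm_model (sketch only; if the model call fails, the hard-coded fallback strings above are returned instead):

    from modules.explanation import generate_explanation

    text = generate_explanation(
        claim="The earth is flat",
        evidence_results=["Satellite imagery consistently shows a spherical Earth"],
        truth_label="False (High Confidence)",
        confidence=0.95,
    )
    print(text)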
modules/rss_feed.py ADDED
@@ -0,0 +1,391 @@
1
+ import feedparser
2
+ import time
3
+ import logging
4
+ import re
5
+ import ssl
6
+ import requests
7
+ from datetime import datetime, timedelta
9
+ from urllib.parse import urlparse
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+
12
+ logger = logging.getLogger("misinformation_detector")
13
+
14
+ # Disable SSL certificate verification globally so feeds with self-signed or misconfigured certificates can still be fetched (trades security for feed availability)
15
+ ssl._create_default_https_context = ssl._create_unverified_context
16
+
17
+ # List of RSS feeds to check for news
18
+ # These are popular news sources with reliable and frequently updated RSS feeds
19
+ RSS_FEEDS = [
20
+ # --------------------
21
+ # 🌐 General World News
22
+ # --------------------
23
+ # "http://rss.cnn.com/rss/cnn_topstories.rss", # CNN Top Stories; Removed in round 2
24
+ "http://rss.cnn.com/rss/cnn_world.rss", # CNN World News; Duplicate with category_detection
25
+ # "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml", # NYT Home Page
26
+ "https://rss.nytimes.com/services/xml/rss/nyt/World.xml", # NYT World News; Duplicate with category_detection
27
+ # "https://rss.nytimes.com/services/xml/rss/nyt/US.xml", # NYT US News
28
+ "https://feeds.washingtonpost.com/rss/world", # The Washington Post World News; Removed in round 2
29
+ # "https://feeds.washingtonpost.com/rss/national", # The Washington Post National News
30
+ # "https://feeds.bbci.co.uk/news/rss.xml", # BBC News - Top Stories; Removed in round 2
31
+ "https://feeds.bbci.co.uk/news/world/rss.xml", # BBC News - World
32
+ # "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=w&hl=en-IN", # Google News India - World; Removed in round 2
33
+ # "https://news.google.com/rss?gl=US&ceid=US:en&topic=w&hl=en-US", # Google News US - World; Removed in round 2
34
+
35
+ # --------------------
36
+ # 🧠 Tech & Startup News (Global)
37
+ # --------------------
38
+ "https://techcrunch.com/feed/", # TechCrunch - Startup and Technology News; Duplicate with category_detection
39
+ "https://venturebeat.com/feed/", # VentureBeat - Tech News
40
+ # "https://www.theverge.com/rss/index.xml", # The Verge - Technology News
41
+ "https://www.wired.com/feed/rss", # Wired - Technology News
42
+ "https://www.cnet.com/rss/news/", # CNET - Technology News
43
+ # "https://sifted.eu/feed/", # Sifted - European Startups and Tech
44
+ # "https://feeds.feedburner.com/fastcompany/headlines", # Fast Company - Business Innovation
45
+ # "https://feeds.bbci.co.uk/news/technology/rss.xml", # BBC News - Technology
46
+ "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=t&hl=en-IN", # Google News India - Technology
47
+ "https://news.google.com/rss?gl=US&ceid=US:en&topic=t&hl=en-US", # Google News US - Technology
48
+
49
+ # --------------------
50
+ # 💼 Startup & VC Focused
51
+ # --------------------
52
+ "https://news.crunchbase.com/feed/", # Crunchbase News - Startup Funding
53
+ # "https://avc.com/feed/", # AVC - Musings of a VC in NYC
54
+ "https://techstartups.com/feed/", # Tech Startups - Startup News
55
+ # "https://tech.eu/feed/", # Tech.eu - European Tech News
56
+ # "https://www.menabytes.com/feed/", # MENAbytes - Middle East & North Africa Startups
57
+ # "http://feeds.feedburner.com/venturebeat/SZYF", # VentureBeat - Deals
58
+
59
+ # --------------------
60
+ # 📰 Global Business & Corporate Feeds
61
+ # --------------------
62
+ "https://feeds.bloomberg.com/technology/news.rss", # Bloomberg Technology News
63
+ "https://www.ft.com/technology?format=rss", # Financial Times Technology News
64
+ # "https://ir.thomsonreuters.com/rss/news-releases.xml", # Thomson Reuters Press Releases
65
+ # "https://feeds.bbci.co.uk/news/business/rss.xml", # BBC News - Business
66
+ "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=b&hl=en-IN", # Google News India - Business
67
+ # "https://news.google.com/rss?gl=US&ceid=US:en&topic=b&hl=en-US", # Google News US - Business; Removed in round 2
68
+
69
+ # --------------------
70
+ # 🇮🇳 India-specific News
71
+ # --------------------
72
+ "https://inc42.com/feed/", # Inc42 - Indian Startups and Technology
73
+ # "https://yourstory.com/rss", # YourStory - Indian Startup Stories
74
+ # "https://economictimes.indiatimes.com/startups/rssfeeds/49979279.cms", # Economic Times - Startups
75
+ "https://timesofindia.indiatimes.com/rssfeedstopstories.cms", # TOI - Top Stories
76
+ "https://timesofindia.indiatimes.com/rssfeedmostrecent.cms", # TOI - Most Recent Stories
77
+ "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms", # TOI - India News
78
+ "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms", # TOI - World News
79
+ "https://timesofindia.indiatimes.com/rssfeeds/1898055.cms", # TOI - Business News
80
+ "https://timesofindia.indiatimes.com/rssfeeds/54829575.cms", # TOI - Cricket News
81
+ "https://timesofindia.indiatimes.com/rssfeeds/4719148.cms", # TOI - Sports News
82
+ "https://timesofindia.indiatimes.com/rssfeeds/-2128672765.cms", # TOI - Science News
83
+ # "https://timesofindia.indiatimes.com/rssfeeds/66949542.cms", # TOI - Technology News
84
+ # "https://timesofindia.indiatimes.com/rssfeeds/1081479906.cms", # TOI - Education News
85
+
86
+ # --------------------
87
+ # 🏏 Sports News (Global + Cricket)
88
+ # --------------------
89
+ "https://www.espn.com/espn/rss/news", # ESPN - Top Sports News; Duplicate with category_detection
90
+ # "https://api.foxsports.com/v2/content/optimized-rss?partnerKey=MB0Wehpmuj2lUhuRhQaafhBjAJqaPU244mlTDK1i&size=30", # Fox Sports; Removed in round 2
91
+ "https://feeds.skynews.com/feeds/rss/sports.xml", # Sky News - Sports
92
+ "https://sports.ndtv.com/rss/all", # NDTV Sports
93
+ "https://www.espncricinfo.com/rss/content/story/feeds/0.xml", # ESPN Cricinfo - Cricket News; Duplicate with category_detection
94
+ # "https://crickettimes.com/feed/", # Cricket Times - Cricket News
95
+
96
+ # --------------------
97
+ # ✅ Fact-Checking Sources
98
+ # --------------------
99
+ "https://www.snopes.com/feed/", # Snopes - Fact Checking; Duplicate with category_detection
100
+ "https://www.politifact.com/rss/all/", # PolitiFact - Fact Checking; Duplicate with category_detection
101
+
102
+ # --------------------
103
+ # 🗳️ Politics & Policy (General)
104
+ # --------------------
105
+ "https://feeds.bbci.co.uk/news/politics/rss.xml", # BBC News - Politics; Duplicate with category_detection
106
+ "https://feeds.bbci.co.uk/news/science_and_environment/rss.xml", # BBC - Science & Environment
107
+
108
+ # --------------------
109
+ # 🗳️ Science
110
+ # --------------------
111
+ "https://www.nature.com/nature.rss", # Nature science; Duplicate with category_detection
112
+ "https://feeds.science.org/rss/science-advances.xml" # science.org
113
+ ]
114
+
115
+ def clean_html(raw_html):
116
+ """Remove HTML tags from text"""
117
+ if not raw_html:
118
+ return ""
119
+ clean_regex = re.compile('<.*?>')
120
+ clean_text = re.sub(clean_regex, '', raw_html)
121
+ # Remove extra whitespace
122
+ clean_text = re.sub(r'\s+', ' ', clean_text).strip()
123
+ return clean_text
124
+
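+ # Example: clean_html("<p>Breaking: <b>markets</b> rally</p>") returns
+ # "Breaking: markets rally"; note that only <...> tags are stripped, so
+ # HTML entities such as &amp; are left in place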
125
+ def parse_feed(feed_url, timeout=5):
126
+ """
127
+ Parse a single RSS feed with proper timeout handling
128
+ Uses requests with timeout first, then passes content to feedparser
129
+ """
130
+ try:
131
+ # Use requests with timeout to fetch the RSS content
132
+ response = requests.get(feed_url, timeout=timeout)
133
+ response.raise_for_status()
134
+
135
+ # Then parse the content with feedparser (which doesn't support timeout)
136
+ feed = feedparser.parse(response.content)
137
+
138
+ # Basic validation of the feed
139
+ if hasattr(feed, 'entries') and feed.entries:
140
+ return feed
141
+ else:
142
+ logger.warning(f"Feed {feed_url} parsed but contains no entries")
143
+ return None
144
+
145
+ except requests.exceptions.Timeout:
146
+ logger.warning(f"Timeout while fetching feed {feed_url}")
147
+ return None
148
+ except requests.exceptions.RequestException as e:
149
+ logger.error(f"Request error fetching feed {feed_url}: {str(e)}")
150
+ return None
151
+ except Exception as e:
152
+ logger.error(f"Error parsing feed {feed_url}: {str(e)}")
153
+ return None
154
+
155
+ def fetch_all_feeds(feeds_list=None, max_workers=5, timeout=5):
156
+ """
157
+ Fetch multiple RSS feeds with proper timeout handling
158
+ Returns a list of (domain, feed) tuples for successfully fetched feeds
159
+ """
160
+ # Use default RSS_FEEDS list if none provided
161
+ if feeds_list is None:
162
+ feeds_list = RSS_FEEDS
163
+
164
+ results = []
165
+
166
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
167
+ future_to_url = {executor.submit(parse_feed, url, timeout): url for url in feeds_list}
168
+ for future in as_completed(future_to_url):
169
+ url = future_to_url[future]
170
+ try:
171
+ feed = future.result()
172
+ if feed and hasattr(feed, 'entries') and feed.entries:
173
+ # Extract domain for source attribution
174
+ domain = urlparse(url).netloc
175
+ results.append((domain, feed))
176
+ logger.info(f"Successfully fetched {domain} with {len(feed.entries)} entries")
177
+ except Exception as e:
178
+ logger.error(f"Error processing {url}: {str(e)}")
179
+
180
+ return results
181
+
182
+ def extract_date(entry):
183
+ """Extract and normalize publication date from entry"""
184
+ for date_field in ['published_parsed', 'updated_parsed', 'created_parsed']:
185
+ if hasattr(entry, date_field) and getattr(entry, date_field):
186
+ try:
187
+ # Convert time tuple to datetime
188
+ time_tuple = getattr(entry, date_field)
189
+ return datetime(time_tuple[0], time_tuple[1], time_tuple[2],
190
+ time_tuple[3], time_tuple[4], time_tuple[5])
191
+ except Exception as e:
192
+ logger.debug(f"Error parsing {date_field}: {e}")
193
+ continue
194
+
195
+ # Try string dates
196
+ for date_field in ['published', 'updated', 'pubDate']:
197
+ if hasattr(entry, date_field) and getattr(entry, date_field):
198
+ try:
199
+ date_str = getattr(entry, date_field)
200
+ # Try various formats
201
+ for fmt in ['%a, %d %b %Y %H:%M:%S %z', '%a, %d %b %Y %H:%M:%S %Z',
202
+ '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z']:
203
+ try:
204
+ return datetime.strptime(date_str, fmt)
205
+ except:
206
+ continue
207
+ except Exception as e:
208
+ logger.debug(f"Error parsing date string {date_field}: {e}")
209
+ continue
210
+
211
+ # Default to current time if parsing fails
212
+ return datetime.now()
213
+
214
+ def is_recent(entry_date, max_days=3):
215
+ """Check if an entry is recent (within the last few days)"""
216
+ if not entry_date:
217
+ return False
218
+ cutoff = datetime.now() - timedelta(days=max_days)
219
+ return entry_date > cutoff
220
+
221
+ def get_entry_relevance(entry, query_terms, domain):
222
+ """Calculate relevance score for an entry based on query match and recency"""
223
+ if not hasattr(entry, 'title') or not entry.title:
224
+ return 0
225
+
226
+ # Extract text content
227
+ title = entry.title or ""
228
+ description = clean_html(entry.description) if hasattr(entry, 'description') else ""
229
+ content = ""
230
+ if hasattr(entry, 'content'):
231
+ for content_item in entry.content:
232
+ if 'value' in content_item:
233
+ content += clean_html(content_item['value']) + " "
234
+
235
+ # Extract published date
236
+ pub_date = extract_date(entry)
237
+
238
+ # Calculate recency score (0-1)
239
+ recency_score = 0
240
+ if pub_date:
241
+ days_old = (datetime.now() - pub_date).days
242
+ if days_old <= 1: # Today or yesterday
243
+ recency_score = 1.0
244
+ elif days_old <= 2:
245
+ recency_score = 0.8
246
+ elif days_old <= 3:
247
+ recency_score = 0.5
248
+ else:
249
+ recency_score = 0.2
250
+
251
+ # Calculate relevance score based on keyword matches
252
+ text = f"{title} {description} {content}".lower()
253
+
254
+ # Count how many query terms appear in the content
255
+ query_terms_lower = [term.lower() for term in query_terms]
256
+ matches = sum(1 for term in query_terms_lower if term in text)
257
+
258
+ # Calculate match score (0-1)
259
+ match_score = min(1.0, matches / max(1, len(query_terms) * 0.7))
260
+
261
+ # Boost score for exact phrase matches
262
+ query_phrase = " ".join(query_terms_lower)
263
+ if query_phrase in text:
264
+ match_score += 0.5
265
+
266
+ # Additional boost for title matches (they're more relevant)
267
+ title_matches = sum(1 for term in query_terms_lower if term in title.lower())
268
+ if title_matches > 0:
269
+ match_score += 0.2 * (title_matches / len(query_terms_lower))
270
+
271
+ # Source quality factor (can be adjusted based on source reliability)
272
+ source_factor = 1.0
273
+ high_quality_domains = ['bbc.co.uk', 'nytimes.com', 'reuters.com', 'washingtonpost.com',
274
+ 'espncricinfo.com', 'cricbuzz.com', 'snopes.com']
275
+ if any(quality_domain in domain for quality_domain in high_quality_domains):
276
+ source_factor = 1.2
277
+
278
+ # Calculate final score; the source quality factor scales the combined score
279
+ final_score = ((match_score * 0.6) + (recency_score * 0.4)) * source_factor
280
+
281
+ return min(1.0, final_score) # Cap at 1.0
282
+
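+ # Worked example: an entry from bbc.co.uk (source_factor 1.2) published
+ # yesterday (recency 1.0) that matches most query terms (match_score 0.8)
+ # scores ((0.8 * 0.6) + (1.0 * 0.4)) * 1.2 = 1.056, capped to 1.0 by min()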
283
+ def retrieve_evidence_from_rss(claim, max_results=3, category_feeds=None):
284
+ """
285
+ Retrieve evidence from RSS feeds for a given claim
286
+
287
+ Args:
288
+ claim (str): The claim to verify
289
+ max_results (int): Maximum number of results to return
290
+ category_feeds (list, optional): List of category-specific RSS feeds to check
291
+
292
+ Returns:
293
+ list: List of relevant evidence items
294
+ """
295
+ start_time = time.time()
296
+ logger.info(f"Retrieving evidence from RSS feeds for: {claim}")
297
+
298
+ # Extract key terms from claim
299
+ terms = [term.strip() for term in re.findall(r'\b\w+\b', claim) if len(term.strip()) > 2]
300
+
301
+ try:
302
+ # Use category-specific feeds if provided
303
+ feeds_to_use = category_feeds if category_feeds else RSS_FEEDS
304
+
305
+ # Log which feeds we're using
306
+ if category_feeds:
307
+ logger.info(f"Using {len(category_feeds)} category-specific RSS feeds")
308
+ else:
309
+ logger.info(f"Using {len(RSS_FEEDS)} default RSS feeds")
310
+
311
+ # Limit the number of feeds to process for efficiency
312
+ if len(feeds_to_use) > 10:
313
+ # If we have too many feeds, select a subset
314
+ # Prioritize fact-checking sources
315
+ fact_check_feeds = [feed for feed in feeds_to_use if "fact" in feed.lower() or "snopes" in feed.lower() or "politifact" in feed.lower()]
316
+ other_feeds = [feed for feed in feeds_to_use if feed not in fact_check_feeds]
317
+
318
+ # Take all fact-checking feeds plus a random selection of others
319
+ import random
320
+ selected_feeds = fact_check_feeds + random.sample(other_feeds, min(10 - len(fact_check_feeds), len(other_feeds)))
321
+ else:
322
+ selected_feeds = feeds_to_use
323
+
324
+ # Fetch all feeds in parallel with the selected feeds
325
+ feeds = fetch_all_feeds(selected_feeds)
326
+
327
+ if not feeds:
328
+ logger.warning("No RSS feeds could be fetched")
329
+ return []
330
+
331
+ all_entries = []
332
+
333
+ # Process all feed entries
334
+ for domain, feed in feeds:
335
+ for entry in feed.entries:
336
+ # Calculate relevance score
337
+ relevance = get_entry_relevance(entry, terms, domain)
338
+
339
+ if relevance > 0.3: # Only consider somewhat relevant entries
340
+ # Extract entry details
341
+ title = entry.title if hasattr(entry, 'title') else "No title"
342
+ link = entry.link if hasattr(entry, 'link') else ""
343
+
344
+ # Extract and clean description/content
345
+ description = ""
346
+ if hasattr(entry, 'description'):
347
+ description = clean_html(entry.description)
348
+ elif hasattr(entry, 'summary'):
349
+ description = clean_html(entry.summary)
350
+ elif hasattr(entry, 'content'):
351
+ for content_item in entry.content:
352
+ if 'value' in content_item:
353
+ description += clean_html(content_item['value']) + " "
354
+
355
+ # Truncate description if too long
356
+ if len(description) > 250:
357
+ description = description[:247] + "..."
358
+
359
+ # Get publication date
360
+ pub_date = extract_date(entry)
361
+ date_str = pub_date.strftime('%Y-%m-%d') if pub_date else "Unknown date"
362
+
363
+ # Format as evidence text
364
+ evidence_text = (
365
+ f"Title: {title}, "
366
+ f"Source: {domain} (RSS), "
367
+ f"Date: {date_str}, "
368
+ f"URL: {link}, "
369
+ f"Content: {description}"
370
+ )
371
+
372
+ all_entries.append({
373
+ "text": evidence_text,
374
+ "relevance": relevance,
375
+ "date": pub_date or datetime.now()
376
+ })
377
+
378
+ # Sort entries by relevance
379
+ all_entries.sort(key=lambda x: x["relevance"], reverse=True)
380
+
381
+ # Take top results
382
+ top_entries = all_entries[:max_results]
383
+
384
+ logger.info(f"Retrieved {len(top_entries)} relevant RSS items from {len(feeds)} feeds in {time.time() - start_time:.2f}s")
385
+
386
+ # Return just the text portion
387
+ return [entry["text"] for entry in top_entries]
388
+
389
+ except Exception as e:
390
+ logger.error(f"Error in RSS retrieval: {str(e)}")
391
+ return []
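A usage sketch for this module (live network access is assumed, so results vary with the feeds of the day):

    from modules.rss_feed import retrieve_evidence_from_rss

    items = retrieve_evidence_from_rss("India won the Cricket World Cup", max_results=3)
    for item in items:
        # each item reads "Title: ..., Source: <domain> (RSS), Date: ..., URL: ..., Content: ..."
        print(item)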
modules/semantic_analysis.py ADDED
@@ -0,0 +1,503 @@
1
+ import logging
2
+ import numpy as np
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ from datetime import datetime, timedelta
5
+ import re
6
+
7
+ # Import the centralized NLP model handler
8
+ from utils.models import get_nlp_model
9
+
10
+ logger = logging.getLogger("misinformation_detector")
11
+
12
+ def extract_entities(text):
13
+ """Extract named entities from text"""
14
+ if not text:
15
+ return []
16
+
17
+ try:
18
+ # Use centralized NLP model
19
+ nlp_model = get_nlp_model()
20
+ doc = nlp_model(text)
21
+ entities = [
22
+ {
23
+ "text": ent.text,
24
+ "label": ent.label_,
25
+ "start": ent.start_char,
26
+ "end": ent.end_char
27
+ }
28
+ for ent in doc.ents
29
+ ]
30
+ return entities
31
+ except Exception as e:
32
+ logger.error(f"Error extracting entities: {str(e)}")
33
+ return []
34
+
35
+ def get_vector_representation(text):
36
+ """Get vector representation of text using spaCy"""
37
+ if not text:
38
+ return None
39
+
40
+ try:
41
+ # Use centralized NLP model
42
+ nlp_model = get_nlp_model()
43
+ doc = nlp_model(text)
44
+
45
+ # Return document vector if available
46
+ if doc.has_vector:
47
+ return doc.vector
48
+
49
+ # Fallback: average of token vectors
50
+ vectors = [token.vector for token in doc if token.has_vector]
51
+ if vectors:
52
+ return np.mean(vectors, axis=0)
53
+
54
+ return None
55
+ except Exception as e:
56
+ logger.error(f"Error getting vector representation: {str(e)}")
57
+ return None
58
+
59
+ def calculate_similarity(text1, text2):
60
+ """Calculate semantic similarity between two texts"""
61
+ if not text1 or not text2:
62
+ return 0.0
63
+
64
+ try:
65
+ vec1 = get_vector_representation(text1)
66
+ vec2 = get_vector_representation(text2)
67
+
68
+ if vec1 is None or vec2 is None:
69
+ return 0.0
70
+
71
+ # Reshape vectors for cosine_similarity
72
+ vec1 = vec1.reshape(1, -1)
73
+ vec2 = vec2.reshape(1, -1)
74
+
75
+ # Calculate cosine similarity
76
+ similarity = cosine_similarity(vec1, vec2)[0][0]
77
+ return float(similarity)
78
+ except Exception as e:
79
+ logger.error(f"Error calculating similarity: {str(e)}")
80
+ return 0.0
81
+
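+ # Note: score quality depends on the loaded spaCy pipeline; small models
+ # such as en_core_web_sm ship without static word vectors, so similarities
+ # from them are weak or zero, while *_md / *_lg models give meaningful values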
82
+ def extract_date_from_evidence(evidence_text):
83
+ """Extract date from evidence text"""
84
+ if not evidence_text:
85
+ return None
86
+
87
+ try:
88
+ # Look for date patterns in text
89
+ date_patterns = [
90
+ r'Date: (\d{4}-\d{2}-\d{2})', # ISO format
91
+ r'published.*?(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})', # published on MM/DD/YYYY
92
+ r'(\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})', # DD Month YYYY
93
+ r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}' # Month DD, YYYY
94
+ ]
95
+
96
+ for pattern in date_patterns:
97
+ match = re.search(pattern, evidence_text)
98
+ if match:
99
+ date_str = match.group(1)
100
+ # Parse date string based on format
101
+ try:
102
+ if re.match(r'^\d{4}-\d{2}-\d{2}$', date_str):
103
+ return datetime.strptime(date_str, '%Y-%m-%d')
104
+ elif '/' in date_str or '-' in date_str:
105
+ formats = ['%m/%d/%Y', '%d/%m/%Y', '%m-%d-%Y', '%d-%m-%Y']
106
+ for fmt in formats:
107
+ try:
108
+ return datetime.strptime(date_str, fmt)
109
+ except ValueError:
110
+ continue
111
+ else:
112
+ # Try different month formats
113
+ formats = ['%d %B %Y', '%B %d, %Y', '%B %d %Y']
114
+ for fmt in formats:
115
+ try:
116
+ return datetime.strptime(date_str, fmt)
117
+ except ValueError:
118
+ continue
119
+ except Exception:
120
+ pass
121
+
122
+ return None
123
+ except Exception as e:
124
+ logger.error(f"Error extracting date from evidence: {str(e)}")
125
+ return None
126
+
127
+ def is_temporally_relevant(evidence_text, claim_text, max_days_old=30):
128
+ """Check if evidence is temporally relevant to the claim"""
129
+ # Check if claim seems to require recent evidence
130
+ temporal_terms = ["today", "now", "current", "currently", "recent", "recently", "latest", "just", "this week", "this month", "this year"]
131
+ requires_recent = any(term in claim_text.lower() for term in temporal_terms)
132
+
133
+ # If claim doesn't specify temporality, consider evidence relevant
134
+ if not requires_recent:
135
+ return True
136
+
137
+ # Extract date from evidence
138
+ date = extract_date_from_evidence(evidence_text)
139
+ if not date:
140
+ return True # If we can't determine date, assume it's relevant
141
+
142
+ # Check if evidence is recent enough
143
+ cutoff = datetime.now() - timedelta(days=max_days_old)
144
+ return date >= cutoff
145
+
146
+ def has_authority_signal(evidence_text):
147
+ """Check if evidence contains authority signals"""
148
+ authority_signals = {
149
+ "scientific_consensus": ["consensus", "scientists agree", "research shows", "studies confirm", "experts agree"],
150
+ "fact_check": ["fact check", "rated false", "rated true", "debunked", "confirmed", "verification"],
151
+ "high_authority": ["nasa", "world health organization", "who", "cdc", "national academy",
152
+ "oxford", "harvard", "stanford", "mit", "cambridge", "yale",
153
+ "princeton", "government", "official", "authorities", "minister",
154
+ "ministry", "department", "administration", "university", "professor"]
155
+ }
156
+
157
+ evidence_lower = evidence_text.lower()
158
+
159
+ authority_type = None
160
+ authority_score = 1.0
161
+
162
+ for signal_type, phrases in authority_signals.items():
163
+ if any(phrase in evidence_lower for phrase in phrases):
164
+ if signal_type == "scientific_consensus":
165
+ authority_score = 1.8
166
+ authority_type = "scientific_consensus"
167
+ elif signal_type == "fact_check":
168
+ authority_score = 1.5
169
+ authority_type = "fact_check"
170
+ elif signal_type == "high_authority":
171
+ authority_score = 1.3
172
+ authority_type = "high_authority"
173
+ break
174
+
175
+ return authority_score, authority_type
176
+
177
+ def analyze_evidence_relevance(claim, evidence_list, source_credibility=None):
178
+ """
179
+ Analyze evidence relevance to claim using semantic similarity with improved handling
180
+ for claims requiring strong evidence
181
+
182
+ Args:
183
+ claim (str): The claim being verified
184
+ evidence_list (list): List of evidence items
185
+ source_credibility (dict): Dictionary mapping source domains to credibility scores
186
+
187
+ Returns:
188
+ list: Sorted list of evidence items with relevance scores
189
+ """
190
+ if not evidence_list:
191
+ return []
192
+
193
+ # Ensure evidence_list is a list of strings
194
+ if not isinstance(evidence_list, list):
195
+ evidence_list = [str(evidence_list)]
196
+
197
+ # Filter out None or empty items
198
+ evidence_list = [item for item in evidence_list if item]
199
+
200
+ # Check if claim contains strong assertions that would require specific evidence
201
+ strong_assertion_markers = [
202
+ "solved", "cured", "discovered", "breakthrough", "revolutionary",
203
+ "first ever", "confirmed", "definitive", "conclusive", "proven",
204
+ "groundbreaking", "unprecedented", "remarkable", "extends lifespan",
205
+ "extends life", "definitively", "successfully"
206
+ ]
207
+
208
+ # Determine if claim contains strong assertions
209
+ claim_has_strong_assertions = any(marker in claim.lower() for marker in strong_assertion_markers)
210
+
211
+ # Log detection result
212
+ if claim_has_strong_assertions:
213
+ logger.info(f"Evidence analysis: Detected claim with strong assertions requiring specific evidence")
214
+
215
+ # Extract named entities from claim
216
+ claim_entities = extract_entities(claim)
217
+ claim_entity_texts = [entity["text"].lower() for entity in claim_entities]
218
+
219
+ # Process each evidence item
220
+ analyzed_evidence = []
221
+
222
+ # Track domains found in evidence to identify source diversity
223
+ found_domains = set()
224
+
225
+ for evidence in evidence_list:
226
+ if not isinstance(evidence, str):
227
+ continue
228
+
229
+ # Calculate semantic similarity
230
+ similarity = calculate_similarity(claim, evidence)
231
+
232
+ # Check for entity overlap
233
+ evidence_entities = extract_entities(evidence)
234
+ evidence_entity_texts = [entity["text"].lower() for entity in evidence_entities]
235
+
236
+ # Calculate entity overlap
237
+ common_entities = set(claim_entity_texts).intersection(set(evidence_entity_texts))
238
+ entity_overlap = len(common_entities) / max(1, len(claim_entity_texts))
239
+
240
+ # Check temporal relevance
241
+ temporal_relevance = 1.0
242
+ if is_temporally_relevant(evidence, claim):
243
+ temporal_relevance = 1.2
244
+ else:
245
+ # Penalty for temporally irrelevant evidence
246
+ temporal_relevance = 0.7
247
+
248
+ # Check for authority signals
249
+ authority_score, authority_type = has_authority_signal(evidence)
250
+
251
+ # Extract source from evidence if available
252
+ source_boost = 1.0
253
+ domain = None
254
+
255
+ if source_credibility:
256
+ # Try to extract domain from URL in evidence
257
+ domain_match = re.search(r'URL: https?://(?:www\.)?([^/]+)', evidence)
258
+ if domain_match:
259
+ domain = domain_match.group(1)
260
+ # Check if domain or its parent domain is in credibility list
261
+ for cred_domain, cred_score in source_credibility.items():
262
+ if cred_domain in domain:
263
+ try:
264
+ source_boost = float(cred_score)
265
+ break
266
+ except (ValueError, TypeError):
267
+ pass
268
+
269
+ # Track this domain for source diversity
270
+ if domain:
271
+ found_domains.add(domain)
272
+
273
+ # For claims with strong assertions: check if evidence specifically addresses assertions
274
+ claim_specificity_match = 1.0
275
+ evidence_specificity_match = 1.0
276
+
277
+ if claim_has_strong_assertions:
278
+ # Check if evidence provides specific confirmation or contradiction
279
+ direct_contradiction_terms = [
280
+ "not yet", "has not", "have not", "cannot", "can't", "doesn't", "don't",
281
+ "unlikely", "challenging", "remains a challenge", "in the future",
282
+ "experimental", "in development", "proposed", "theoretical",
283
+ "preliminary", "hypothesized", "potential", "promising but"
284
+ ]
285
+
286
+ # Check for contradictions to strong assertions
287
+ if any(term in evidence.lower() for term in direct_contradiction_terms):
288
+ # This evidence likely contradicts the strong assertion
289
+ evidence_specificity_match = 2.0 # Boost relevance of contradicting evidence
290
+ logger.debug(f"Found contradiction to strong assertion in evidence")
291
+
292
+ # For claims with strong assertions, check if evidence specifically confirms
293
+ direct_confirmation_terms = [
294
+ "successfully demonstrated", "breakthrough", "solved", "cured",
295
+ "confirmed", "definitive evidence", "conclusive results", "proven",
296
+ "revolutionary results", "milestone achievement", "groundbreaking results"
297
+ ]
298
+
299
+ # If evidence confirms the strong assertion, adjust relevance
300
+ if any(term in evidence.lower() for term in direct_confirmation_terms):
301
+ # Apply higher scoring for evidence that specifically confirms
302
+ evidence_specificity_match = 1.8
303
+ logger.debug(f"Found confirmation of strong assertion in evidence")
304
+
305
+ # For claims with strong assertions, check for high-quality sources
306
+ high_quality_source_markers = [
307
+ "journal", "doi.org", "research", "university", "institute",
308
+ "laboratory", "professor", "study", "publication", "published in"
309
+ ]
310
+
311
+ is_high_quality = any(term in evidence.lower() for term in high_quality_source_markers)
312
+ quality_boost = 1.4 if is_high_quality else 1.0
313
+
314
+ # Apply the quality boost
315
+ source_boost *= quality_boost
316
+
317
+ # Calculate final relevance score with improvements for all claim types
318
+ if claim_has_strong_assertions:
319
+ relevance_score = (
320
+ (similarity * 0.35) + # Semantic similarity
321
+ (entity_overlap * 0.25) + # Entity overlap
322
+ (0.25) # Base value to ensure all evidence has some relevance
323
+ ) * temporal_relevance * authority_score * source_boost * claim_specificity_match * evidence_specificity_match
324
+ else:
325
+ # Original formula for regular claims
326
+ relevance_score = (
327
+ (similarity * 0.4) + # Semantic similarity
328
+ (entity_overlap * 0.3) + # Entity overlap
329
+ (0.3) # Base value to ensure all evidence has some relevance
330
+ ) * temporal_relevance * authority_score * source_boost
331
+
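+ # Worked example for a regular claim: similarity 0.5 and entity_overlap 0.5
+ # give (0.5 * 0.4) + (0.5 * 0.3) + 0.3 = 0.65 before the multipliers; a
+ # temporally relevant (1.2), fact-check flavoured (1.5) item from a credible
+ # source (1.1) would then land at roughly 1.29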
332
+ # Add metadata and relevance score
333
+ analyzed_evidence.append({
334
+ "text": evidence,
335
+ "relevance_score": relevance_score,
336
+ "similarity": similarity,
337
+ "entity_overlap": entity_overlap,
338
+ "temporal_relevance": temporal_relevance,
339
+ "authority_score": authority_score,
340
+ "authority_type": authority_type,
341
+ "source_boost": source_boost,
342
+ "domain": domain
343
+ })
344
+
345
+ # Sort by relevance score (descending)
346
+ analyzed_evidence.sort(key=lambda x: x["relevance_score"], reverse=True)
347
+
348
+ # Ensure we have diverse sources in top results for all claims
349
+ if len(found_domains) > 1:
350
+ # Try to promote evidence from reliable sources if we haven't selected any yet
351
+ reliable_sources_seen = False
352
+
353
+ # Check if top 3 results contain any reliable sources
354
+ for item in analyzed_evidence[:3]:
355
+ domain = item.get("domain", "")
356
+ if domain and source_credibility and any(cred_domain in domain for cred_domain in source_credibility):
357
+ reliable_sources_seen = True
358
+ break
359
+
360
+ # If no reliable sources in top results, promote one if available
361
+ if not reliable_sources_seen:
362
+ for i, item in enumerate(analyzed_evidence[3:]):
363
+ domain = item.get("domain", "")
364
+ if domain and source_credibility and any(cred_domain in domain for cred_domain in source_credibility):
365
+ # Swap this item into the top 3
366
+ analyzed_evidence.insert(2, analyzed_evidence.pop(i+3))
367
+ break
368
+
369
+ return analyzed_evidence
370
+
371
+ def select_diverse_evidence(analyzed_evidence, max_items=5):
372
+ """
373
+ Select diverse evidence items based on relevance, source diversity and claim characteristics
374
+
375
+ Args:
376
+ analyzed_evidence (list): List of evidence items with relevance scores
377
+ max_items (int): Maximum number of evidence items to return
378
+
379
+ Returns:
380
+ list: Selected diverse evidence items
381
+ """
382
+ if not analyzed_evidence:
383
+ return []
384
+
385
+ # Check if top evidence suggests claim has strong assertions
386
+ strong_assertion_markers = [
387
+ "solved", "cured", "discovered", "breakthrough", "revolutionary",
388
+ "first ever", "confirmed", "definitive", "conclusive", "proven",
389
+ "groundbreaking", "unprecedented", "extends lifespan", "definitively"
390
+ ]
391
+
392
+ # Determine if this is a claim with strong assertions by checking evidence text
393
+ has_strong_assertions = False
394
+
395
+ for item in analyzed_evidence[:3]: # Check just the top items for efficiency
396
+ if "text" in item:
397
+ item_text = item["text"].lower()
398
+ if any(f"claim {marker}" in item_text or f"claim has {marker}" in item_text
399
+ for marker in strong_assertion_markers):
400
+ has_strong_assertions = True
401
+ break
402
+
403
+ # Also check for contradiction markers in evidence which can indicate a strong assertion
404
+ contradiction_markers = [
405
+ "not yet solved", "hasn't been proven", "no evidence that",
406
+ "remains unsolved", "has not been confirmed", "remains theoretical"
407
+ ]
408
+
409
+ if not has_strong_assertions:
410
+ for item in analyzed_evidence[:3]:
411
+ if "text" in item:
412
+ item_text = item["text"].lower()
413
+ if any(marker in item_text for marker in contradiction_markers):
414
+ has_strong_assertions = True
415
+ break
416
+
417
+ # Ensure we don't select more than available
418
+ max_items = min(max_items, len(analyzed_evidence))
419
+
420
+ # Initialize selected items with the most relevant item
421
+ selected = [analyzed_evidence[0]]
422
+ remaining = analyzed_evidence[1:]
423
+
424
+ # Track sources to ensure diversity
425
+ selected_sources = set()
426
+ for item in selected:
427
+ # Try to extract source from evidence
428
+ source_match = re.search(r'Source: ([^,]+)', item["text"])
429
+ if source_match:
430
+ selected_sources.add(source_match.group(1))
431
+
432
+ # For all claims, track if we have high-quality sources yet
433
+ has_quality_source = False
434
+ quality_source_markers = ["journal", "doi.org", "research", "university",
435
+ "institute", "laboratory", "professor", "study"]
436
+
437
+ # Check if our top item is already from a quality source
438
+ if any(marker in selected[0]["text"].lower() for marker in quality_source_markers):
439
+ has_quality_source = True
440
+
441
+ # Select remaining items balancing relevance and diversity
442
+ while len(selected) < max_items and remaining:
443
+ best_item = None
444
+ best_score = -1
445
+
446
+ for i, item in enumerate(remaining):
447
+ # Base score is the item's relevance
448
+ score = item["relevance_score"]
449
+
450
+ # Extract source if available
451
+ source = None
452
+ source_match = re.search(r'Source: ([^,]+)', item["text"])
453
+ if source_match:
454
+ source = source_match.group(1)
455
+
456
+ # Apply diversity bonus if source is new
457
+ if source and source not in selected_sources:
458
+ score *= 1.2 # Diversity bonus
459
+
460
+ # For claims with strong assertions, apply bonus for contradicting evidence
461
+ if has_strong_assertions:
462
+ # Check for contradiction markers in the text
463
+ if any(marker in item["text"].lower() for marker in contradiction_markers):
464
+ score *= 1.3 # Bonus for evidence that may contradict strong assertions
465
+
466
+ # For any claim, apply bonus for high-quality sources if we don't have one yet
467
+ if not has_quality_source:
468
+ is_item_quality = any(marker in item["text"].lower() for marker in quality_source_markers)
469
+ if is_item_quality:
470
+ score *= 1.5 # Significant bonus for quality sources
471
+
472
+ if score > best_score:
473
+ best_score = score
474
+ best_item = (i, item)
475
+
476
+ if best_item:
477
+ idx, item = best_item
478
+ selected.append(item)
479
+ remaining.pop(idx)
480
+
481
+ # Add source to selected sources
482
+ source_match = re.search(r'Source: ([^,]+)', item["text"])
483
+ if source_match:
484
+ selected_sources.add(source_match.group(1))
485
+
486
+ # Check if we found a quality source
487
+ if not has_quality_source:
488
+ if any(marker in item["text"].lower() for marker in quality_source_markers):
489
+ has_quality_source = True
490
+ else:
491
+ break
492
+
493
+ # For any claim with strong assertions, ensure we have at least one quality source if available
494
+ if has_strong_assertions and not has_quality_source and remaining:
495
+ for i, item in enumerate(remaining):
496
+ if any(marker in item["text"].lower() for marker in quality_source_markers):
497
+ # Replace the least relevant selected item with this quality one
498
+ selected.sort(key=lambda x: x["relevance_score"])
499
+ selected[0] = item
500
+ break
501
+
502
+ # Return only the text portion
503
+ return [item["text"] for item in selected]
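An end-to-end sketch for this module (the credibility mapping below is illustrative; in the application it comes from config.SOURCE_CREDIBILITY):

    from modules.semantic_analysis import analyze_evidence_relevance, select_diverse_evidence

    claim = "NASA confirmed liquid water on Mars"
    evidence = ["Title: Mars findings, Source: nasa.gov, URL: https://www.nasa.gov/mars, Content: ..."]
    scored = analyze_evidence_relevance(claim, evidence, {"nasa.gov": 0.95})
    top_items = select_diverse_evidence(scored, max_items=5)  # plain evidence strings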
utils/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ """
2
+ Utils package initialization.
3
+
4
+ This package provides utility functions for the AskVeracity fact-checking system.
5
+ """
6
+
7
+ from .api_utils import api_error_handler, safe_json_parse, RateLimiter
8
+ from .performance import PerformanceTracker
9
+ from .models import initialize_models, get_nlp_model, get_llm_model
10
+
11
+
12
+ __all__ = [
13
+ 'api_error_handler',
14
+ 'safe_json_parse',
15
+ 'RateLimiter',
16
+ 'PerformanceTracker',
17
+ 'initialize_models',
18
+ 'get_nlp_model',
19
+ 'get_llm_model'
20
+ ]
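These re-exports let call sites import from the package root (sketch; the "news_api" rate-limit key is an assumption, not a value confirmed by this commit):

    from utils import api_error_handler, get_llm_model

    @api_error_handler("news_api")
    def fetch_news(query):
        ...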
utils/api_utils.py ADDED
@@ -0,0 +1,229 @@
+ """
+ API utilities for the Fake News Detector application.
+ 
+ This module provides utilities for handling API calls, rate limiting,
+ error handling, and exponential backoff for retrying failed requests.
+ """
+ 
+ import time
+ import functools
+ import random
+ import logging
+ import requests
+ from datetime import datetime, timedelta
+ from collections import deque
+ 
+ from config import RATE_LIMITS, ERROR_BACKOFF
+ 
+ logger = logging.getLogger("misinformation_detector")
+ 
+ class RateLimiter:
+     """
+     Rate limiter for API calls with support for different APIs.
+ 
+     This class implements a sliding-window algorithm for rate limiting,
+     with support for different rate limits for different APIs.
+     It also provides exponential backoff for error handling.
+     """
+ 
+     def __init__(self):
+         """Initialize the rate limiter with configuration from settings."""
+         # Store rate limits for different APIs
+         self.limits = {}
+ 
+         # Initialize limits from config
+         for api_name, limit_info in RATE_LIMITS.items():
+             self.limits[api_name] = {
+                 "requests": limit_info["requests"],
+                 "period": limit_info["period"],
+                 "timestamps": deque()
+             }
+ 
+         # Error backoff settings
+         self.max_retries = ERROR_BACKOFF["max_retries"]
+         self.initial_backoff = ERROR_BACKOFF["initial_backoff"]
+         self.backoff_factor = ERROR_BACKOFF["backoff_factor"]
+ 
+     def check_and_update(self, api_name):
+         """
+         Check if a request is allowed and update timestamps.
+ 
+         Args:
+             api_name (str): Name of the API to check
+ 
+         Returns:
+             tuple: (allowed, wait_time)
+                 - allowed (bool): Whether the request is allowed
+                 - wait_time (float): Time to wait if not allowed
+         """
+         if api_name not in self.limits:
+             return True, 0  # Unknown API, allow by default
+ 
+         now = datetime.now()
+         limit_info = self.limits[api_name]
+ 
+         # Remove timestamps older than the period
+         cutoff = now - timedelta(seconds=limit_info["period"])
+         while limit_info["timestamps"] and limit_info["timestamps"][0] < cutoff:
+             limit_info["timestamps"].popleft()
+ 
+         # Check if we're at the rate limit
+         if len(limit_info["timestamps"]) >= limit_info["requests"]:
+             # Calculate wait time until the oldest timestamp expires
+             wait_time = (limit_info["timestamps"][0] + timedelta(seconds=limit_info["period"]) - now).total_seconds()
+             return False, max(0, wait_time)
+ 
+         # Add current timestamp and allow request
+         limit_info["timestamps"].append(now)
+         return True, 0
+ 
+     def wait_if_needed(self, api_name):
+         """
+         Wait if the rate limit has been reached.
+ 
+         Args:
+             api_name (str): Name of the API to check
+ 
+         Returns:
+             bool: True if we waited, False otherwise
+         """
+         allowed, wait_time = self.check_and_update(api_name)
+         if not allowed:
+             logger.info(f"Rate limit reached for {api_name}. Waiting {wait_time:.2f} seconds...")
+             time.sleep(wait_time + 0.1)  # Add a small buffer
+             return True
+         return False
+ 
+     def get_backoff_time(self, attempt):
+         """
+         Calculate exponential backoff time with jitter.
+ 
+         Args:
+             attempt (int): Current attempt number (0-based)
+ 
+         Returns:
+             float: Backoff time in seconds
+         """
+         backoff = self.initial_backoff * (self.backoff_factor ** attempt)
+         # Add jitter to prevent the thundering herd problem
+         jitter = random.uniform(0, 0.1 * backoff)
+         return backoff + jitter
+ 
+ 
+ # Create rate limiter instance
+ rate_limiter = RateLimiter()
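A behavioral sketch of the limiter, assuming a hypothetical RATE_LIMITS entry allowing two requests per ten seconds for an API named "example_api":

    # Assumes RATE_LIMITS = {"example_api": {"requests": 2, "period": 10}} in config
    limiter = RateLimiter()

    limiter.check_and_update("example_api")   # (True, 0) - first request allowed
    limiter.check_and_update("example_api")   # (True, 0) - second request allowed
    allowed, wait = limiter.check_and_update("example_api")
    # allowed is False; wait is the number of seconds until the oldest
    # timestamp falls out of the 10-second window

    limiter.wait_if_needed("example_api")     # sleeps wait + 0.1s, returns True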
+ 
+ # API error handler decorator
+ def api_error_handler(api_name):
+     """
+     Decorator for API calls with error handling and rate limiting.
+ 
+     This decorator handles rate limiting, retries with exponential
+     backoff, and error handling for API calls.
+ 
+     Args:
+         api_name (str): Name of the API being called
+ 
+     Returns:
+         callable: Decorated function
+     """
+     def decorator(func):
+         @functools.wraps(func)
+         def wrapper(*args, **kwargs):
+             try:
+                 # Apply rate limiting - make sure rate_limiter exists and has the method
+                 if hasattr(rate_limiter, 'wait_if_needed'):
+                     rate_limiter.wait_if_needed(api_name)
+ 
+                 # Track retries
+                 for attempt in range(rate_limiter.max_retries):
+                     try:
+                         return func(*args, **kwargs)
+                     except requests.exceptions.HTTPError as e:
+                         status_code = e.response.status_code if e.response is not None else 0
+ 
+                         # Handle specific HTTP errors
+                         if status_code == 429:  # Too Many Requests
+                             logger.warning(f"{api_name} rate limit exceeded (429). Attempt {attempt+1}/{rate_limiter.max_retries}")
+                             # Honor the Retry-After header if present, else use exponential backoff
+                             retry_after = e.response.headers.get('Retry-After')
+                             if retry_after and retry_after.isdigit():
+                                 wait_time = int(retry_after)
+                             else:
+                                 wait_time = rate_limiter.get_backoff_time(attempt)
+                             logger.info(f"Waiting {wait_time} seconds before retry...")
+                             time.sleep(wait_time)
+                         elif status_code >= 500:  # Server errors
+                             logger.warning(f"{api_name} server error ({status_code}). Attempt {attempt+1}/{rate_limiter.max_retries}")
+                             time.sleep(rate_limiter.get_backoff_time(attempt))
+                         elif status_code == 403:  # Forbidden - likely an API key issue
+                             logger.error(f"{api_name} access forbidden (403). Check API key.")
+                             return None  # Don't retry on auth errors
+                         elif status_code == 404:  # Not Found
+                             logger.warning(f"{api_name} resource not found (404).")
+                             return None  # Don't retry when the resource doesn't exist
+                         else:
+                             logger.error(f"{api_name} HTTP error: {e}")
+                             if attempt < rate_limiter.max_retries - 1:
+                                 wait_time = rate_limiter.get_backoff_time(attempt)
+                                 logger.info(f"Waiting {wait_time} seconds before retry...")
+                                 time.sleep(wait_time)
+                             else:
+                                 return None
+ 
+                     except requests.exceptions.ConnectionError as e:
+                         logger.error(f"{api_name} connection error: {e}")
+                         if attempt < rate_limiter.max_retries - 1:
+                             wait_time = rate_limiter.get_backoff_time(attempt)
+                             logger.info(f"Waiting {wait_time} seconds before retry...")
+                             time.sleep(wait_time)
+                         else:
+                             return None
+ 
+                     except requests.exceptions.Timeout as e:
+                         logger.error(f"{api_name} timeout error: {e}")
+                         if attempt < rate_limiter.max_retries - 1:
+                             wait_time = rate_limiter.get_backoff_time(attempt)
+                             logger.info(f"Waiting {wait_time} seconds before retry...")
+                             time.sleep(wait_time)
+                         else:
+                             return None
+ 
+                     except Exception as e:
+                         logger.error(f"{api_name} unexpected error: {str(e)}")
+                         if attempt < rate_limiter.max_retries - 1:
+                             wait_time = rate_limiter.get_backoff_time(attempt)
+                             logger.info(f"Waiting {wait_time} seconds before retry...")
+                             time.sleep(wait_time)
+                         else:
+                             return None
+ 
+                 # If we've exhausted all retries
+                 logger.error(f"{api_name} call failed after {rate_limiter.max_retries} attempts")
+                 return None
+ 
+             except Exception as e:
+                 # Catch any unexpected errors in the decorator itself
+                 logger.error(f"{api_name} decorator error: {str(e)}")
+                 return None
+ 
+         return wrapper
+     return decorator
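As a usage sketch (the function, endpoint, and API name here are hypothetical; "example_api" would also need an entry in RATE_LIMITS for rate limiting to apply):

    @api_error_handler("example_api")
    def fetch_articles(query):
        # Any HTTPError raised here is routed through the retry logic above
        response = requests.get("https://api.example.com/search",
                                params={"q": query}, timeout=10)
        response.raise_for_status()
        return response.json()

    articles = fetch_articles("climate policy")  # parsed JSON, or None once retries are exhausted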
+ 
+ def safe_json_parse(response, api_name):
+     """
+     Safely parse a JSON response with error handling.
+ 
+     Args:
+         response (requests.Response): Response object to parse
+         api_name (str): Name of the API for logging
+ 
+     Returns:
+         dict: Parsed JSON, or an empty dict on error
+     """
+     try:
+         return response.json()
+     except ValueError as e:
+         logger.error(f"Error parsing {api_name} JSON response: {e}")
+         logger.debug(f"Response content: {response.text[:500]}...")
+         return {}
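A quick failure-path sketch; constructing the Response by hand via its private `_content` attribute is a test-only shortcut, not something the application does:

    resp = requests.models.Response()
    resp._content = b"<html>Service Unavailable</html>"  # not valid JSON

    data = safe_json_parse(resp, "example_api")  # logs the parse error, returns {}
    articles = data.get("articles", [])          # callers can chain .get() safely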
utils/models.py ADDED
@@ -0,0 +1,157 @@
+ """
+ Model management utility for the Fake News Detector application.
+ 
+ This module provides functions for initializing, caching, and
+ retrieving the language models used throughout the application.
+ It ensures models are loaded efficiently and reused appropriately.
+ """
+ 
+ import os
+ import logging
+ import functools
+ from langchain_openai import ChatOpenAI
+ import spacy
+ 
+ logger = logging.getLogger("misinformation_detector")
+ 
+ # Global variables for models
+ nlp = None
+ model = None
+ models_initialized = False
+ 
+ # Caching decorator
+ def cached_model(func):
+     """
+     Decorator to cache model loading for improved performance.
+ 
+     This decorator ensures that models are only loaded once and
+     then reused for subsequent calls, improving performance by
+     avoiding redundant model loading.
+ 
+     Args:
+         func (callable): Function that loads a model
+ 
+     Returns:
+         callable: Wrapped function that returns a cached model
+     """
+     cache = {}
+ 
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs):
+         # Use the function name as the cache key
+         key = func.__name__
+         if key not in cache:
+             logger.info(f"Model not in cache, calling {key}...")
+             cache[key] = func(*args, **kwargs)
+         return cache[key]
+ 
+     return wrapper
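A sketch of the caching behavior with a hypothetical loader: the first call runs the function, later calls return the cached object:

    @cached_model
    def load_dummy_model():
        logger.info("Loading expensive dummy model...")
        return object()  # stand-in for a real model

    a = load_dummy_model()  # logs, loads, and caches under 'load_dummy_model'
    b = load_dummy_model()  # served from the cache
    assert a is b

Note that the cache key is the function name alone, so a decorated loader that took arguments would return the same cached object regardless of the arguments passed.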
+ 
+ def initialize_models():
+     """
+     Initialize all required models.
+ 
+     This function loads and initializes all the language models
+     needed by the application, including spaCy for NLP tasks and
+     OpenAI for LLM-based processing.
+ 
+     Returns:
+         str: Initialization status message
+ 
+     Raises:
+         ValueError: If the OpenAI API key is not set
+     """
+     global nlp, model, models_initialized
+ 
+     # Skip initialization if already done
+     if models_initialized:
+         logger.info("Models already initialized, skipping initialization")
+         return "Models already initialized"
+ 
+     # Check OpenAI API key
+     if "OPENAI_API_KEY" not in os.environ or not os.environ["OPENAI_API_KEY"].strip():
+         logger.error("OPENAI_API_KEY environment variable not set or empty")
+         raise ValueError("OpenAI API key is required. Please set it in the Hugging Face Space secrets.")
+ 
+     try:
+         # Load NLP model
+         try:
+             logger.info("Loading spaCy NLP model...")
+             nlp = spacy.load("en_core_web_sm")
+             logger.info("Loaded spaCy NLP model")
+         except OSError as e:
+             # This handles the case where the model wasn't installed correctly
+             logger.warning(f"Could not load spaCy model: {str(e)}")
+             logger.info("Attempting to download spaCy model...")
+             try:
+                 import subprocess
+                 import sys
+                 # This downloads the model if it's missing
+                 subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
+                 # Try loading again
+                 nlp = spacy.load("en_core_web_sm")
+                 logger.info("Successfully downloaded and loaded spaCy model")
+             except Exception as download_err:
+                 logger.error(f"Failed to download spaCy model: {str(download_err)}")
+                 # Continue with other initialization; a missing NLP model is handled elsewhere
+ 
+         # Set up OpenAI model
+         logger.info("Initializing ChatOpenAI model...")
+         model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+         logger.info("Initialized ChatOpenAI model")
+ 
+         # Mark initialization as complete
+         models_initialized = True
+         return "Models initialized successfully"
+ 
+     except Exception as e:
+         logger.error(f"Error initializing models: {str(e)}")
+         raise
+ 
+ @cached_model
+ def get_nlp_model():
+     """
+     Get the spaCy NLP model, initializing it if needed.
+ 
+     This function returns a cached spaCy model for NLP tasks.
+     If the model hasn't been loaded yet, it will be loaded.
+ 
+     Returns:
+         spacy.Language: Loaded spaCy model
+     """
+     global nlp
+     if nlp is None:
+         try:
+             # Try to load just the spaCy model if not loaded yet
+             logger.info("Loading spaCy NLP model...")
+             nlp = spacy.load("en_core_web_sm")
+             logger.info("Loaded spaCy NLP model")
+         except Exception as e:
+             logger.error(f"Error loading spaCy model: {str(e)}")
+             # Fall back to full initialization
+             initialize_models()
+     return nlp
+ 
+ @cached_model
+ def get_llm_model():
+     """
+     Get the ChatOpenAI model, initializing it if needed.
+ 
+     This function returns a cached OpenAI LLM model.
+     If the model hasn't been loaded yet, it will be loaded.
+ 
+     Returns:
+         ChatOpenAI: Loaded LLM model
+     """
+     global model
+     if model is None:
+         try:
+             # Try to load just the LLM model if not loaded yet
+             logger.info("Initializing ChatOpenAI model...")
+             model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+             logger.info("Initialized ChatOpenAI model")
+         except Exception as e:
+             logger.error(f"Error initializing ChatOpenAI model: {str(e)}")
+             # Fall back to full initialization
+             initialize_models()
+     return model
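Typical call pattern elsewhere in the application; thanks to @cached_model and the lazy checks above, repeated calls are cheap. This sketch assumes OPENAI_API_KEY is set in the environment:

    nlp = get_nlp_model()
    doc = nlp("NASA confirmed the mission launch date.")
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    llm = get_llm_model()
    reply = llm.invoke("Briefly assess: 'NASA confirmed the mission launch date.'")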
utils/performance.py ADDED
@@ -0,0 +1,135 @@
+ """
+ Performance tracking utility for the Fake News Detector application.
+ 
+ This module provides functionality to track and analyze the
+ performance of the application, including processing times,
+ success rates, and resource utilization.
+ """
+ 
+ import time
+ import logging
+ 
+ logger = logging.getLogger("misinformation_detector")
+ 
+ class PerformanceTracker:
+     """
+     Tracks and logs performance metrics for the fact-checking system.
+ 
+     This class maintains counters and statistics for various performance
+     metrics, such as processing times, evidence retrieval success rates,
+     and confidence scores.
+     """
+ 
+     def __init__(self):
+         """Initialize the performance tracker with empty metrics."""
+         self.metrics = {
+             "claims_processed": 0,
+             "evidence_retrieval_success_rate": [],
+             "processing_times": [],
+             "confidence_scores": [],
+             "source_types_used": {},
+             "temporal_relevance": []
+         }
+ 
+     def log_claim_processed(self):
+         """
+         Increment the counter for processed claims.
+         This should be called whenever a claim is processed successfully.
+         """
+         self.metrics["claims_processed"] += 1
+ 
+     def log_evidence_retrieval(self, success, sources_count):
+         """
+         Log the success or failure of evidence retrieval.
+ 
+         Args:
+             success (bool): Whether evidence retrieval was successful
+             sources_count (dict): Count of evidence items by source type
+         """
+         # Record success as 0 or 1
+         success_value = 1 if success else 0
+         self.metrics["evidence_retrieval_success_rate"].append(success_value)
+ 
+         # Safely process source types
+         if isinstance(sources_count, dict):
+             for source_type, count in sources_count.items():
+                 # Ensure source_type is a string and count is an integer
+                 source_type = str(source_type)
+                 try:
+                     count = int(count)
+                 except (ValueError, TypeError):
+                     count = 1
+ 
+                 # Update source types used
+                 self.metrics["source_types_used"][source_type] = \
+                     self.metrics["source_types_used"].get(source_type, 0) + count
+ 
+     def log_processing_time(self, start_time):
+         """
+         Log the processing time for an operation.
+ 
+         Args:
+             start_time (float): Start time obtained from time.time()
+         """
+         end_time = time.time()
+         processing_time = end_time - start_time
+         self.metrics["processing_times"].append(processing_time)
+ 
+     def log_confidence_score(self, score):
+         """
+         Log a confidence score.
+ 
+         Args:
+             score (float): Confidence score between 0 and 1
+         """
+         # Ensure the score is a float between 0 and 1
+         try:
+             score = float(score)
+             if 0 <= score <= 1:
+                 self.metrics["confidence_scores"].append(score)
+         except (ValueError, TypeError):
+             logger.warning(f"Invalid confidence score: {score}")
+ 
+     def log_temporal_relevance(self, relevance_score):
+         """
+         Log a temporal relevance score.
+ 
+         Args:
+             relevance_score (float): Temporal relevance score between 0 and 1
+         """
+         # Ensure the relevance score is a float between 0 and 1
+         try:
+             relevance_score = float(relevance_score)
+             if 0 <= relevance_score <= 1:
+                 self.metrics["temporal_relevance"].append(relevance_score)
+         except (ValueError, TypeError):
+             logger.warning(f"Invalid temporal relevance score: {relevance_score}")
+ 
+     def get_summary(self):
+         """
+         Get a summary of all performance metrics.
+ 
+         Returns:
+             dict: Summary of performance metrics
+         """
+         # Safely calculate averages with error handling
+         def safe_avg(metric_list):
+             try:
+                 return sum(metric_list) / max(len(metric_list), 1)
+             except (TypeError, ValueError):
+                 return 0.0
+ 
+         return {
+             "claims_processed": self.metrics["claims_processed"],
+             "avg_evidence_retrieval_success_rate": safe_avg(self.metrics["evidence_retrieval_success_rate"]),
+             "avg_processing_time": safe_avg(self.metrics["processing_times"]),
+             "avg_confidence_score": safe_avg(self.metrics["confidence_scores"]),
+             "source_types_used": dict(self.metrics["source_types_used"]),
+             "avg_temporal_relevance": safe_avg(self.metrics["temporal_relevance"])
+         }
+ 
+     def reset(self):
+         """Reset all performance metrics."""
+         self.__init__()
+         logger.info("Performance metrics have been reset")
+         return "Performance metrics reset successfully"