ankanghosh committed (verified)
Commit 87591ba · Parent(s): 5b9788e

Delete modules

modules/__init__.py DELETED
@@ -1,19 +0,0 @@
- """
- Modules package initialization.
-
- This package contains the core modules for the AskVeracity fact-checking system.
- """
-
- from .claim_extraction import extract_claims, shorten_claim_for_evidence
- from .evidence_retrieval import retrieve_combined_evidence
- from .classification import classify_with_llm, aggregate_evidence
- from .explanation import generate_explanation
-
- __all__ = [
-     'extract_claims',
-     'shorten_claim_for_evidence',
-     'retrieve_combined_evidence',
-     'classify_with_llm',
-     'aggregate_evidence',
-     'generate_explanation'
- ]
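
Note: the four imports above define the public pipeline of the deleted package. A minimal usage sketch follows, assuming the package were still importable as `modules`; the signatures of retrieve_combined_evidence, aggregate_evidence, and generate_explanation are inferred from their names and are assumptions, not shown in this commit.

# Hypothetical usage sketch of the deleted package's public API.
from modules import (
    extract_claims,
    retrieve_combined_evidence,  # assumed: claim string -> list of evidence items
    classify_with_llm,           # signature shown in classification.py below
    aggregate_evidence,          # assumed: classification results -> overall verdict
    generate_explanation,        # assumed: produces a user-facing rationale
)

claim = extract_claims("Long user-submitted text containing a factual claim...")
evidence = retrieve_combined_evidence(claim)
results = classify_with_llm(claim, evidence)
verdict = aggregate_evidence(results)
explanation = generate_explanation(claim, verdict)
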
modules/category_detection.py DELETED
@@ -1,880 +0,0 @@
- import logging
- import re
- from typing import Tuple, List, Dict, Optional
- import os
- import time
-
- # Set up logging
- logger = logging.getLogger("misinformation_detector")
-
- # Define categories and their keywords
- CLAIM_CATEGORIES = {
-     "ai": [
-         # General AI terms
-         "AI", "artificial intelligence", "machine learning", "ML", "deep learning", "DL",
-         "neural network", "neural nets", "generative AI", "GenAI", "AGI", "artificial general intelligence",
-         "transformer", "attention mechanism", "fine-tuning", "pre-training", "training", "inference",
-
-         # AI Models and Architectures
-         "language model", "large language model", "LLM", "foundation model", "multimodal model",
-         "vision language model", "VLM", "text-to-speech", "TTS", "speech-to-text", "STT",
-         "text-to-image", "image-to-text", "diffusion model", "generative model", "discriminative model",
-         "GPT", "BERT", "T5", "PaLM", "Claude", "Llama", "Gemini", "Mistral", "Mixtral", "Stable Diffusion",
-         "Dall-E", "Midjourney", "Sora", "transformer", "MoE", "mixture of experts", "sparse model",
-         "dense model", "encoder", "decoder", "encoder-decoder", "autoencoder", "VAE",
-         "mixture of experts", "MoE", "sparse MoE", "switch transformer", "gated experts",
-         "routing network", "expert routing", "pathways", "multi-query attention", "multi-head attention",
-         "rotary position embedding", "RoPE", "grouped-query attention", "GQA", "flash attention",
-         "state space model", "SSM", "mamba", "recurrent neural network", "RNN", "LSTM", "GRU",
-         "convolutional neural network", "CNN", "residual connection", "skip connection", "normalization",
-         "layer norm", "group norm", "batch norm", "parameter efficient fine-tuning", "PEFT",
-         "LoRA", "low-rank adaptation", "QLoRA", "adapters", "prompt tuning", "prefix tuning",
-
-         # AI Learning Paradigms
-         "supervised learning", "unsupervised learning", "reinforcement learning", "RL",
-         "meta-learning", "transfer learning", "federated learning", "self-supervised learning",
-         "semi-supervised learning", "few-shot learning", "zero-shot learning", "one-shot learning",
-         "contrastive learning", "curriculum learning", "imitation learning", "active learning",
-         "reinforcement learning from human feedback", "RLHF", "direct preference optimization", "DPO",
-         "constitutional AI", "red teaming", "adversarial training", "GAN", "generative adversarial network",
-         "diffusion", "latent diffusion", "flow-based model", "variational autoencoder", "VAE",
-
-         # AI Capabilities and Applications
-         "natural language processing", "NLP", "computer vision", "CV", "speech recognition",
-         "text generation", "image generation", "video generation", "multimodal", "multi-modal",
-         "recommendation system", "recommender system", "chatbot", "conversational AI",
-         "sentiment analysis", "entity recognition", "semantic search", "vector search", "embedding",
-         "classification", "regression", "clustering", "anomaly detection", "agent", "AI agent",
-         "autonomous agent", "agentic", "RAG", "retrieval augmented generation", "tool use",
-         "function calling", "reasoning", "chain-of-thought", "CoT", "tree-of-thought", "ToT",
-         "planning", "decision making", "multi-agent", "agent swarm", "multi-agent simulation",
-
-         # AI Technical Terms
-         "token", "tokenizer", "tokenization", "embedding", "vector", "prompt", "prompt engineering",
-         "context window", "parameter", "weights", "bias", "activation function", "loss function",
-         "gradient descent", "backpropagation", "epoch", "batch", "mini-batch", "regularization",
-         "dropout", "overfitting", "underfitting", "hyperparameter", "latent space", "latent variable",
-         "feature extraction", "dimensionality reduction", "optimization", "quantization", "pruning",
-         "fine-tuning", "transfer learning", "knowledge distillation", "int4", "int8", "bfloat16",
-         "float16", "mixed precision", "GPTQ", "AWQ", "GGUF", "GGML", "KV cache", "speculative decoding",
-         "beam search", "greedy decoding", "temperature", "top-k", "top-p", "nucleus sampling",
-
-         # AI Tools and Frameworks
-         "TensorFlow", "PyTorch", "JAX", "Keras", "Hugging Face", "Transformers", "Diffusers",
-         "LangChain", "Llama Index", "OpenAI", "Anthropic", "NVIDIA", "GPU", "TPU", "IPU", "NPU", "CUDA",
-         "MLOps", "model monitoring", "model deployment", "model serving", "inference endpoint",
-         "vLLM", "TGI", "text generation inference", "triton", "onnx", "tensorRT",
-
-         # AI Ethics and Concerns
-         "AI ethics", "responsible AI", "AI safety", "AI alignment", "AI governance",
-         "bias", "fairness", "interpretability", "explainability", "XAI", "transparency",
-         "hallucination", "toxicity", "safe deployment", "AI risk", "AI capabilities",
-         "alignment tax", "red teaming", "jailbreak", "prompt injection", "data poisoning",
-
-         # AI Companies and Organizations
-         "OpenAI", "Anthropic", "Google DeepMind", "Meta AI", "Microsoft", "NVIDIA",
-         "Hugging Face", "Mistral AI", "Cohere", "AI21 Labs", "Stability AI", "Midjourney",
-         "EleutherAI", "Allen AI", "DeepMind", "Character AI", "Inflection AI", "xAI"
-     ],
-
-     "science": [
-         # General scientific terms
-         "study", "research", "scientist", "scientific", "discovered", "experiment",
-         "laboratory", "clinical", "trial", "hypothesis", "theory", "evidence-based",
-         "peer-reviewed", "journal", "publication", "finding", "breakthrough", "innovation",
-         "discovery", "analysis", "data", "measurement", "observation", "empirical",
-
-         # Biology and medicine
-         "biology", "chemistry", "physics", "genetics", "genomics", "DNA", "RNA",
-         "medicine", "gene", "protein", "molecule", "cell", "brain", "neuro",
-         "cancer", "disease", "cure", "treatment", "vaccine", "health", "medical",
-         "pharmaceutical", "drug", "therapy", "symptom", "diagnosis", "prognosis",
-         "patient", "doctor", "hospital", "clinic", "surgery", "immune", "antibody",
-         "virus", "bacteria", "pathogen", "infection", "epidemic", "pandemic",
-         "organism", "evolution", "mutation", "chromosome", "enzyme", "hormone",
-
-         # Physics and astronomy
-         "quantum", "particle", "atom", "nuclear", "electron", "neutron", "proton",
-         "atomic", "subatomic", "molecular", "energy", "matter", "mass", "force",
-         "space", "NASA", "telescope", "planet", "exoplanet", "moon", "lunar", "mars",
-         "star", "galaxy", "cosmic", "astronomical", "universe", "solar", "celestial",
-         "orbit", "gravitational", "gravity", "relativity", "quantum mechanics",
-         "string theory", "dark matter", "dark energy", "black hole", "supernova",
-         "radiation", "radioactive", "isotope", "fission", "fusion", "accelerator",
-
-         # Environmental science
-         "climate", "carbon", "environment", "ecosystem", "species", "extinct",
-         "endangered", "biodiversity", "conservation", "sustainable", "renewable",
-         "fossil fuel", "greenhouse", "global warming", "polar", "ice cap", "glacier",
-         "ozone", "atmosphere", "weather", "meteorology", "geology", "earthquake",
-         "volcanic", "ocean", "marine", "coral reef", "deforestation", "pollution",
-
-         # Math and computer science (non-AI specific)
-         "equation", "formula", "theorem", "calculus", "statistical", "probability",
-         "dataset", "parameter", "variable", "function", "matrix", "optimization",
-
-         # Organizations
-         "CERN", "NIH", "CDC", "WHO", "NOAA", "ESA", "SpaceX", "Blue Origin", "JPL",
-         "laboratory", "institute", "university", "academic", "faculty", "professor",
-
-         # Science tools
-         "Matlab", "SPSS", "SAS", "ImageJ", "LabVIEW", "ANSYS", "Cadence", "Origin",
-         "Avogadro", "ChemDraw", "Mathematica", "Wolfram Alpha", "COMSOL", "LAMMPS",
-         "VASP", "Gaussian", "GIS", "ArcGIS", "QGIS", "Maple", "R Studio"
-     ],
-
-     "technology": [
-         # General tech terms
-         "computer", "software", "hardware", "internet", "cyber", "digital", "tech",
-         "robot", "automation", "autonomous", "code", "programming", "data", "cloud",
-         "server", "network", "encryption", "blockchain", "crypto", "bitcoin", "ethereum",
-         "technology", "innovation", "breakthrough", "prototype", "development",
-         "engineering", "technical", "specification", "feature", "functionality",
-         "interface", "system", "infrastructure", "integration", "implementation",
-
-         # Devices and hardware
-         "smartphone", "device", "gadget", "laptop", "desktop", "tablet", "wearable",
-         "smartwatch", "IoT", "internet of things", "sensor", "chip", "semiconductor",
-         "processor", "CPU", "GPU", "memory", "RAM", "storage", "hard drive", "SSD",
-         "electronic", "circuit", "motherboard", "component", "peripheral", "accessory",
-         "display", "screen", "touchscreen", "camera", "lens", "microphone", "speaker",
-         "battery", "charger", "wireless", "bluetooth", "WiFi", "router", "modem",
-
-         # Software and internet
-         "app", "application", "platform", "website", "online", "web", "browser",
-         "operating system", "Windows", "macOS", "Linux", "Android", "iOS", "software",
-         "program", "code", "coding", "development", "framework", "library", "API",
-         "interface", "backend", "frontend", "full-stack", "developer", "programmer",
-         "database", "SQL", "NoSQL", "cloud computing", "SaaS", "PaaS", "IaaS",
-         "DevOps", "agile", "scrum", "sprint", "version control", "git", "repository",
-
-         # Communications and networking
-         "5G", "6G", "broadband", "fiber", "network", "wireless", "cellular", "mobile",
-         "telecommunications", "telecom", "transmission", "bandwidth", "latency",
-         "protocol", "IP address", "DNS", "server", "hosting", "data center",
-
-         # Company and product names
-         "Apple", "Google", "Microsoft", "Amazon", "Facebook", "Meta", "Tesla",
-         "IBM", "Intel", "AMD", "Nvidia", "Qualcomm", "Cisco", "Oracle", "SAP",
-         "Huawei", "Samsung", "Sony", "LG", "Dell", "HP", "Lenovo", "Xiaomi",
-         "iPhone", "iPad", "MacBook", "Surface", "Galaxy", "Pixel", "Windows",
-         "Android", "iOS", "Chrome", "Firefox", "Edge", "Safari", "Office",
-         "Azure", "AWS", "Google Cloud", "Gmail", "Outlook", "Teams", "Zoom",
-
-         # Advanced technologies
-         "VR", "AR", "XR", "virtual reality", "augmented reality", "mixed reality",
-         "metaverse", "3D printing", "additive manufacturing", "quantum computing",
-         "nanotechnology", "biotechnology", "electric vehicle", "self-driving",
-         "autonomous vehicle", "drone", "UAV", "robotics", "cybersecurity",
-
-         # Social media
-         "social media", "social network", "Facebook", "Instagram", "Twitter", "X",
-         "LinkedIn", "TikTok", "Snapchat", "YouTube", "Pinterest", "Reddit",
-         "streaming", "content creator", "influencer", "follower", "like", "share",
-         "post", "tweet", "user-generated", "viral", "trending", "engagement",
-
-         # Technology tools
-         "NumPy", "Pandas", "Matplotlib", "Seaborn", "Scikit-learn", "Jupyter",
-         "Visual Studio", "VS Code", "IntelliJ", "PyCharm", "Eclipse", "Android Studio",
-         "Xcode", "Docker", "Kubernetes", "Jenkins", "Ansible", "Terraform", "Vagrant",
-         "AWS CLI", "Azure CLI", "GCP CLI", "PowerShell", "Bash", "npm", "pip", "conda",
-         "React", "Angular", "Vue.js", "Node.js", "Django", "Flask", "Spring", "Laravel",
-         "PostgreSQL", "MySQL", "MongoDB", "Redis", "Elasticsearch", "Kafka", "RabbitMQ",
-
-         # Optimization terms
-         "optimization", "efficiency", "performance tuning", "benchmarking", "profiling",
-         "refactoring", "scaling", "bottleneck", "throughput", "latency reduction",
-         "response time", "caching", "load balancing", "distributed computing",
-         "parallel processing", "concurrency", "asynchronous", "memory management"
-     ],
-
-     "politics": [
-         # Government structure
-         "president", "prime minister", "government", "parliament", "congress",
-         "senate", "house", "representative", "minister", "secretary", "cabinet",
-         "administration", "mayor", "governor", "politician", "official", "authority",
-         "federal", "state", "local", "municipal", "county", "city", "town",
-         "constituency", "district", "precinct", "ward", "judiciary", "executive",
-         "legislative", "branch", "checks and balances", "separation of powers",
-
-         # Political activities
-         "policy", "election", "campaign", "vote", "voter", "ballot", "polling",
-         "political", "politics", "debate", "speech", "address", "press conference",
-         "approval rating", "opinion poll", "candidate", "incumbent", "challenger",
-         "primary", "caucus", "convention", "delegate", "nomination", "campaign trail",
-         "fundraising", "lobbying", "advocacy", "activism", "protest", "demonstration",
-
-         # Political ideologies
-         "democracy", "democratic", "republican", "conservative", "liberal",
-         "progressive", "left-wing", "right-wing", "centrist", "moderate",
-         "socialist", "capitalist", "communist", "libertarian", "populist",
-         "nationalist", "globalist", "isolationist", "hawk", "dove",
-         "ideology", "partisan", "bipartisan", "coalition", "majority", "minority",
-
-         # Laws and regulations
-         "bill", "law", "legislation", "regulation", "policy", "statute", "code",
-         "amendment", "reform", "repeal", "enact", "implement", "enforce",
-         "constitutional", "unconstitutional", "legal", "illegal", "legalize",
-         "criminalize", "deregulate", "regulatory", "compliance", "mandate",
-
-         # Judicial and legal
-         "court", "supreme", "justice", "judge", "ruling", "decision", "opinion",
-         "case", "lawsuit", "litigation", "plaintiff", "defendant", "prosecutor",
-         "attorney", "lawyer", "advocate", "judicial review", "precedent",
-         "constitution", "amendment", "rights", "civil rights", "human rights",
-
-         # International relations
-         "treaty", "international", "diplomatic", "diplomacy", "relations",
-         "foreign policy", "domestic policy", "UN", "NATO", "EU", "United Nations",
-         "sanctions", "embargo", "tariff", "trade war", "diplomat", "embassy",
-         "consulate", "ambassador", "delegation", "summit", "bilateral", "multilateral",
-         "alliance", "ally", "adversary", "geopolitical", "sovereignty", "regime",
-
-         # Security and defense
-         "national security", "homeland security", "defense", "military", "armed forces",
-         "army", "navy", "air force", "marines", "coast guard", "intelligence",
-         "CIA", "FBI", "NSA", "Pentagon", "war", "conflict", "peacekeeping",
-         "terrorism", "counterterrorism", "insurgency", "nuclear weapon", "missile",
-         "disarmament", "nonproliferation", "surveillance", "espionage",
-
-         # Political institutions
-         "White House", "Kremlin", "Downing Street", "Capitol Hill", "Westminster",
-         "United Nations", "European Union", "NATO", "World Bank", "IMF", "WTO",
-         "ASEAN", "African Union", "BRICS", "G7", "G20",
-
-         # Political parties and movements
-         "Democrat", "Republican", "Labour", "Conservative", "Green Party",
-         "Socialist", "Communist", "Libertarian", "Independent", "Tea Party",
-         "progressive movement", "civil rights movement", "womens rights",
-         "LGBTQ rights", "Black Lives Matter", "environmental movement"
-     ],
-
-     "business": [
-         # Companies and organization types
-         "company", "corporation", "business", "startup", "firm", "enterprise",
-         "corporate", "industry", "sector", "conglomerate", "multinational",
-         "organization", "entity", "private", "public", "incorporated", "LLC",
-         "partnership", "proprietorship", "franchise", "subsidiary", "parent company",
-         "headquarters", "office", "facility", "plant", "factory", "warehouse",
-         "retail", "wholesale", "ecommerce", "brick-and-mortar", "chain", "outlet",
-
-         # Business roles and management
-         "executive", "CEO", "CFO", "CTO", "COO", "CMO", "CIO", "CHRO", "chief",
-         "director", "board", "chairman", "chairwoman", "chairperson", "president",
-         "vice president", "senior", "junior", "manager", "management", "supervisor",
-         "founder", "entrepreneur", "owner", "shareholder", "stakeholder",
-         "employee", "staff", "workforce", "personnel", "human resources", "HR",
-         "recruit", "hire", "layoff", "downsizing", "restructuring", "reorganization",
-
-         # Financial terms
-         "profit", "revenue", "sales", "income", "earnings", "EBITDA", "turnover",
-         "loss", "deficit", "expense", "cost", "overhead", "margin", "markup",
-         "budget", "forecast", "projection", "estimate", "actual", "variance",
-         "balance sheet", "income statement", "cash flow", "P&L", "liquidity",
-         "solvency", "asset", "liability", "equity", "debt", "leverage", "capital",
-         "working capital", "cash", "funds", "money", "payment", "transaction",
-
-         # Markets and trading
-         "market", "stock", "share", "bond", "security", "commodity", "futures",
-         "option", "derivative", "forex", "foreign exchange", "currency", "crypto",
-         "trader", "trading", "buy", "sell", "long", "short", "position", "portfolio",
-         "diversification", "hedge", "risk", "return", "yield", "dividend", "interest",
-         "bull market", "bear market", "correction", "crash", "rally", "volatile",
-         "volatility", "index", "benchmark", "Dow Jones", "NASDAQ", "S&P 500", "NYSE",
-
-         # Investment and funding
-         "investor", "investment", "fund", "mutual fund", "ETF", "hedge fund",
-         "private equity", "venture", "venture capital", "VC", "angel investor",
-         "seed", "Series A", "Series B", "Series C", "funding", "financing",
-         "loan", "credit", "debt", "equity", "fundraising", "crowdfunding",
-         "IPO", "initial public offering", "going public", "listed", "delisted",
-         "merger", "acquisition", "M&A", "takeover", "buyout", "divestiture",
-         "valuation", "billion", "million", "trillion", "unicorn", "decacorn",
-
-         # Economic terms
-         "economy", "economic", "economics", "macro", "micro", "fiscal", "monetary",
-         "supply", "demand", "market forces", "competition", "competitive", "monopoly",
-         "oligopoly", "antitrust", "regulation", "deregulation", "growth", "decline",
-         "recession", "depression", "recovery", "expansion", "contraction", "cycle",
-         "inflation", "deflation", "stagflation", "hyperinflation", "CPI", "price",
-         "GDP", "gross domestic product", "GNP", "productivity", "output", "input",
-
-         # Banking and finance
-         "finance", "financial", "bank", "banking", "commercial bank", "investment bank",
-         "central bank", "Federal Reserve", "Fed", "ECB", "Bank of England", "BOJ",
-         "interest rate", "prime rate", "discount rate", "basis point", "monetary policy",
-         "quantitative easing", "tightening", "loosening", "credit", "lending",
-         "borrowing", "loan", "mortgage", "consumer credit", "credit card", "debit card",
-         "checking", "savings", "deposit", "withdrawal", "ATM", "branch", "online banking",
-
-         # Currencies and payments
-         "dollar", "euro", "pound", "yen", "yuan", "rupee", "ruble", "real", "peso",
-         "currency", "money", "fiat", "exchange rate", "remittance", "transfer",
-         "payment", "transaction", "wire", "ACH", "SWIFT", "clearing", "settlement",
-         "cryptocurrency", "bitcoin", "ethereum", "blockchain", "fintech", "paytech",
-
-         # Business operations
-         "product", "service", "solution", "offering", "launch", "rollout", "release",
-         "operation", "production", "manufacturing", "supply chain", "logistics",
-         "procurement", "inventory", "distribution", "shipping", "delivery",
-         "quality", "control", "assurance", "standard", "certification", "compliance",
-         "process", "procedure", "workflow", "efficiency", "optimization",
-
-         # Marketing and sales
-         "marketing", "advertise", "advertising", "campaign", "promotion", "publicity",
-         "PR", "public relations", "brand", "branding", "identity", "image", "reputation",
-         "sales", "selling", "deal", "transaction", "pipeline", "lead", "prospect",
-         "customer", "client", "consumer", "buyer", "purchaser", "target market",
-         "segment", "demographic", "psychographic", "B2B", "B2C", "retail", "wholesale",
-         "price", "pricing", "discount", "premium", "luxury", "value", "bargain"
-     ],
-
-     "world": [
-         # General international terms
-         "country", "nation", "state", "republic", "kingdom", "global", "international",
-         "foreign", "world", "worldwide", "domestic", "abroad", "overseas",
-         "developed", "developing", "industrialized", "emerging", "third world",
-         "global south", "global north", "east", "west", "western", "eastern",
-         "bilateral", "multilateral", "transnational", "multinational", "sovereignty",
-
-         # Regions and continents
-         "Europe", "European", "Asia", "Asian", "Africa", "African", "North America",
-         "South America", "Latin America", "Australia", "Oceania", "Antarctica",
-         "Middle East", "Central Asia", "Southeast Asia", "East Asia", "South Asia",
-         "Eastern Europe", "Western Europe", "Northern Europe", "Southern Europe",
-         "Mediterranean", "Scandinavia", "Nordic", "Baltic", "Balkans", "Caucasus",
-         "Caribbean", "Central America", "South Pacific", "Polynesia", "Micronesia",
-
-         # Major countries and regions
-         "China", "Chinese", "Russia", "Russian", "India", "Indian", "Japan", "Japanese",
-         "UK", "British", "England", "English", "Scotland", "Scottish", "Wales", "Welsh",
-         "Germany", "German", "France", "French", "Italy", "Italian", "Spain", "Spanish",
-         "Canada", "Canadian", "Brazil", "Brazilian", "Mexico", "Mexican", "Turkey", "Turkish",
-         "United States", "US", "USA", "American", "Britain", "Korea", "Korean",
-         "North Korea", "South Korea", "Saudi", "Saudi Arabia", "Saudi Arabian",
-         "Iran", "Iranian", "Iraq", "Iraqi", "Israel", "Israeli", "Palestine", "Palestinian",
-         "Egypt", "Egyptian", "Pakistan", "Pakistani", "Indonesia", "Indonesian",
-         "Australia", "Australian", "New Zealand", "Nigeria", "Nigerian", "South Africa",
-         "Argentina", "Argentinian", "Colombia", "Colombian", "Venezuela", "Venezuelan",
-         "Ukraine", "Ukrainian", "Poland", "Polish", "Switzerland", "Swiss",
-         "Netherlands", "Dutch", "Belgium", "Belgian", "Sweden", "Swedish", "Norway", "Norwegian",
-
-         # International issues and topics
-         "war", "conflict", "crisis", "tension", "dispute", "hostility", "peace",
-         "peacekeeping", "ceasefire", "truce", "armistice", "treaty", "agreement",
-         "compromise", "negotiation", "mediation", "resolution", "settlement",
-         "refugee", "migrant", "asylum seeker", "displacement", "humanitarian",
-         "border", "frontier", "territory", "territorial", "sovereignty", "jurisdiction",
-         "terror", "terrorism", "extremism", "radicalism", "insurgency", "militant",
-         "sanction", "embargo", "restriction", "isolation", "blockade",
-
-         # International trade and economy
-         "trade", "import", "export", "tariff", "duty", "quota", "subsidy",
-         "protectionism", "free trade", "fair trade", "globalization", "trade war",
-         "trade agreement", "trade deal", "trade deficit", "trade surplus",
-         "supply chain", "outsourcing", "offshoring", "reshoring", "nearshoring",
-
-         # Diplomacy and international relations
-         "embassy", "consulate", "diplomatic", "diplomacy", "diplomat", "ambassador",
-         "consul", "attaché", "envoy", "emissary", "delegation", "mission",
-         "foreign policy", "international relations", "geopolitics", "geopolitical",
-         "influence", "power", "superpower", "hegemony", "alliance", "coalition",
-         "bloc", "axis", "sphere of influence", "buffer state", "proxy",
-
-         # International organizations
-         "UN", "United Nations", "EU", "European Union", "NATO", "NAFTA", "USMCA",
-         "ASEAN", "OPEC", "Commonwealth", "Arab League", "African Union", "AU",
-         "BRICS", "G7", "G20", "IMF", "World Bank", "WTO", "WHO", "UNESCO",
-         "Security Council", "General Assembly", "International Court of Justice",
-
-         # Travel and cultural exchange
-         "visa", "passport", "immigration", "emigration", "migration", "travel",
-         "tourism", "tourist", "visitor", "foreigner", "expatriate", "expat",
-         "citizenship", "nationality", "dual citizen", "naturalization",
-         "cultural", "tradition", "heritage", "indigenous", "native", "local",
-         "language", "dialect", "translation", "interpreter", "cross-cultural"
-     ],
-
-     "sports": [
-         # General sports terms
-         "game", "match", "tournament", "championship", "league", "cup", "Olympics",
-         "olympic", "world cup", "competition", "contest", "event", "series",
-         "sport", "sporting", "athletics", "physical", "play", "compete", "competition",
-         "amateur", "professional", "pro", "season", "preseason", "regular season",
-         "postseason", "playoff", "final", "semifinal", "quarterfinal", "qualifying",
-
-         # Team sports
-         "football", "soccer", "American football", "rugby", "basketball", "baseball",
-         "cricket", "hockey", "ice hockey", "field hockey", "volleyball", "handball",
-         "water polo", "lacrosse", "ultimate frisbee", "netball", "kabaddi",
-         "team", "club", "franchise", "squad", "roster", "lineup", "formation",
-         "player", "coach", "manager", "trainer", "captain", "starter", "substitute",
-         "bench", "draft", "trade", "free agent", "contract", "transfer", "loan",
-
-         # Individual sports
-         "tennis", "golf", "boxing", "wrestling", "martial arts", "MMA", "UFC",
-         "athletics", "track and field", "swimming", "diving", "gymnastics",
-         "skiing", "snowboarding", "skating", "figure skating", "speed skating",
-         "cycling", "mountain biking", "BMX", "motorsport", "F1", "Formula 1",
-         "NASCAR", "IndyCar", "MotoGP", "rally", "marathon", "triathlon", "decathlon",
-         "archery", "shooting", "fencing", "equestrian", "rowing", "canoeing", "kayaking",
-         "surfing", "skateboarding", "climbing", "bouldering", "weightlifting",
-
-         # Scoring and results
-         "score", "point", "goal", "touchdown", "basket", "run", "wicket", "try",
-         "win", "lose", "draw", "tie", "defeat", "victory", "champion", "winner",
-         "loser", "runner-up", "finalist", "semifinalist", "eliminated", "advance",
-         "qualify", "record", "personal best", "world record", "Olympic record",
-         "streak", "undefeated", "unbeaten", "perfect season", "comeback",
-
-         # Performance and training
-         "fitness", "training", "practice", "drill", "workout", "exercise", "regime",
-         "conditioning", "strength", "endurance", "speed", "agility", "flexibility",
-         "skill", "technique", "form", "style", "strategy", "tactic", "playbook",
-         "offense", "defense", "attack", "counter", "press", "formation",
-         "injury", "rehabilitation", "recovery", "physiotherapy", "sports medicine",
-
-         # Sports infrastructure
-         "stadium", "arena", "court", "field", "pitch", "rink", "pool", "track",
-         "course", "gymnasium", "gym", "complex", "venue", "facility", "locker room",
-         "dugout", "bench", "sideline", "grandstand", "spectator", "fan", "supporter",
-
-         # Sports organizations and competitions
-         "medal", "gold", "silver", "bronze", "podium", "Olympics", "Paralympic",
-         "commonwealth games", "Asian games", "Pan American games", "world championship",
-         "grand slam", "masters", "open", "invitational", "classic", "tour", "circuit",
-         "IPL", "Indian Premier League", "MLB", "Major League Baseball",
-         "NBA", "National Basketball Association", "NFL", "National Football League",
-         "NHL", "National Hockey League", "FIFA", "UEFA", "ATP", "WTA", "ICC",
-         "Premier League", "La Liga", "Bundesliga", "Serie A", "Ligue 1", "MLS",
-         "Champions League", "Europa League", "Super Bowl", "World Series", "Stanley Cup",
-         "NCAA", "collegiate", "college", "university", "varsity", "intramural",
-
-         # Sports media and business
-         "broadcast", "coverage", "commentator", "announcer", "pundit", "analyst",
-         "highlight", "replay", "sports network", "ESPN", "Sky Sports", "Fox Sports",
-         "sponsorship", "endorsement", "advertisement", "merchandise", "jersey", "kit",
-         "ticket", "season ticket", "box seat", "premium", "concession", "vendor",
-     ],
-
-     "entertainment": [
-         # Film and cinema
-         "movie", "film", "cinema", "feature", "short film", "documentary", "animation",
-         "blockbuster", "indie", "independent film", "foreign film", "box office",
-         "screening", "premiere", "release", "theatrical", "stream", "streaming",
-         "director", "producer", "screenwriter", "script", "screenplay", "adaptation",
-         "cinematography", "cinematographer", "editing", "editor", "visual effects",
-         "special effects", "CGI", "motion capture", "sound design", "soundtrack",
-         "score", "composer", "scene", "shot", "take", "cut", "sequel", "prequel",
-         "trilogy", "franchise", "universe", "reboot", "remake", "spin-off",
-         "genre", "action", "comedy", "drama", "thriller", "horror", "sci-fi",
-         "science fiction", "fantasy", "romance", "romantic comedy", "rom-com",
-         "mystery", "crime", "western", "historical", "biographical", "biopic",
-
-         # Television
-         "TV", "television", "show", "series", "episode", "season", "pilot",
-         "finale", "midseason", "sitcom", "drama series", "miniseries", "limited series",
-         "anthology", "reality TV", "game show", "talk show", "variety show",
-         "network", "cable", "premium cable", "broadcast", "channel", "program",
-         "primetime", "daytime", "syndication", "rerun", "renewed", "cancelled",
-         "showrunner", "creator", "writer", "TV writer", "episode writer", "staff writer",
-
-         # Performing arts
-         "actor", "actress", "performer", "cast", "casting", "star", "co-star",
-         "supporting", "lead", "protagonist", "antagonist", "villain", "hero", "anti-hero",
-         "character", "role", "performance", "portrayal", "acting", "dialogue",
-         "monologue", "line", "script", "improv", "improvisation", "stand-up",
-         "comedian", "comic", "sketch", "theater", "theatre", "stage", "Broadway",
-         "West End", "play", "musical", "opera", "ballet", "dance", "choreography",
-         "production", "rehearsal", "audition", "understudy", "troupe", "ensemble",
-
-         # Music
-         "music", "song", "track", "single", "album", "EP", "LP", "record",
-         "release", "drop", "artist", "musician", "singer", "vocalist", "band",
-         "group", "duo", "trio", "soloist", "frontman", "frontwoman", "lead singer",
-         "songwriter", "composer", "producer", "DJ", "rapper", "MC", "beatmaker",
-         "guitarist", "bassist", "drummer", "pianist", "keyboardist", "violinist",
-         "instrumentalist", "orchestra", "symphony", "philharmonic", "conductor",
-         "genre", "rock", "pop", "hip-hop", "rap", "R&B", "soul", "funk", "jazz",
-         "blues", "country", "folk", "electronic", "EDM", "dance", "techno", "house",
-         "metal", "punk", "alternative", "indie", "classical", "reggae", "latin",
-         "hit", "chart", "Billboard", "Grammy", "award-winning", "platinum", "gold",
-         "concert", "tour", "gig", "show", "performance", "live", "venue", "arena",
-         "stadium", "festival", "Coachella", "Glastonbury", "Lollapalooza", "Bonnaroo",
-
-         # Celebrity culture
-         "celebrity", "star", "fame", "famous", "A-list", "B-list", "icon", "iconic",
-         "superstar", "public figure", "household name", "stardom", "limelight",
-         "popular", "popularity", "fan", "fanbase", "followers", "stan", "groupie",
-         "paparazzi", "tabloid", "gossip", "rumor", "scandal", "controversy",
-         "interview", "press conference", "red carpet", "premiere", "gala", "award show",
-
-         # Awards and recognition
-         "award", "nominee", "nomination", "winner", "recipient", "honor", "accolade",
-         "Oscar", "Academy Award", "Emmy", "Grammy", "Tony", "Golden Globe", "BAFTA",
-         "MTV Award", "People's Choice", "Critics' Choice", "SAG Award", "Billboard Award",
-         "best actor", "best actress", "best director", "best picture", "best film",
-         "best album", "best song", "hall of fame", "lifetime achievement", "legacy",
-
-         # Media and publishing
-         "book", "novel", "fiction", "non-fiction", "memoir", "biography", "autobiography",
-         "bestseller", "bestselling", "author", "writer", "novelist", "literary",
-         "literature", "publisher", "publishing", "imprint", "edition", "volume",
-         "chapter", "page", "paragraph", "prose", "narrative", "plot", "storyline",
-         "character", "protagonist", "antagonist", "setting", "theme", "genre",
-         "mystery", "thriller", "romance", "sci-fi", "fantasy", "young adult", "YA",
-         "comic", "comic book", "graphic novel", "manga", "anime", "cartoon",
-
-         # Digital entertainment
-         "streaming", "stream", "subscription", "platform", "service", "content",
-         "Netflix", "Disney+", "Amazon Prime", "Hulu", "HBO", "HBO Max", "Apple TV+",
-         "Peacock", "Paramount+", "YouTube", "YouTube Premium", "TikTok", "Instagram",
-         "influencer", "content creator", "vlogger", "blogger", "podcaster", "podcast",
-         "episode", "download", "subscriber", "follower", "like", "share", "viral",
-         "trending", "binge-watch", "marathon", "spoiler", "recap", "review", "trailer",
-         "teaser", "behind the scenes", "BTS", "exclusive", "original"
-     ]
- }
-
- # Add domain-specific RSS feeds for different categories
- CATEGORY_SPECIFIC_FEEDS = {
-     "science": [
-         # "https://www.science.org/rss/news_feeds/carousel.xml",
-         "https://www.science.org/rss/news_current.xml",
-         "https://www.nature.com/nature.rss",
-         # "https://www.scientificamerican.com/rss/",
-         "http://rss.sciam.com/basic-science",
-         # "https://rss.sciam.com/ScientificAmerican-Global",
-         "http://rss.sciam.com/ScientificAmerican-Global",
-         # "https://feeds.newscientist.com/science-news",
-         "https://www.newscientist.com/feed/home/?cmpid=RSS|NSNS-Home",
-         "https://phys.org/rss-feed/"
-     ],
-     "technology": [
-         # "https://feed.wired.com/rss/category/business/feed.rss",
-         "https://www.wired.com/feed/category/business/latest/rss",
-         "https://techcrunch.com/feed/",
-         "https://www.technologyreview.com/feed/",
-         "https://arstechnica.com/feed/",
-         "https://www.theverge.com/rss/index.xml",
-         "https://news.ycombinator.com/rss"
-     ],
-     "politics": [
-         "https://feeds.washingtonpost.com/rss/politics",
-         "https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml",
-         "https://feeds.bbci.co.uk/news/politics/rss.xml",
-         "https://www.politico.com/rss/politicopicks.xml",
-         "https://www.realclearpolitics.com/index.xml"
-     ],
-     "business": [
-         "https://www.ft.com/rss/home",
-         "https://feeds.bloomberg.com/markets/news.rss",
-         # "https://www.forbes.com/business/feed/",
-         "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
-         "https://feeds.washingtonpost.com/rss/business",
-         "https://www.entrepreneur.com/latest.rss",
-         # "https://www.cnbc.com/id/10001147/device/rss/rss.htm",
-         "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10001147",
-         "https://feeds.content.dowjones.io/public/rss/WSJcomUSBusiness",
-         "https://feeds.a.dj.com/rss/RSSMarketsMain.xml"
-     ],
-     "world": [
-         "https://feeds.bbci.co.uk/news/world/rss.xml",
-         "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
-         "https://www.aljazeera.com/xml/rss/all.xml",
-         "https://feeds.washingtonpost.com/rss/world",
-         # "https://rss.cnn.com/rss/edition_world.rss"
-         "http://rss.cnn.com/rss/cnn_world.rss"
-     ],
-     "sports": [
-         "https://www.espn.com/espn/rss/news",
-         "https://www.cbssports.com/rss/headlines/",
-         # "https://feeds.skysports.com/feeds/rss/latest.xml",
-         "https://www.espncricinfo.com/rss/content/story/feeds/0.xml",
-         "https://api.foxsports.com/v1/rss",
-         "https://www.sportingnews.com/us/rss",
-         "https://www.theguardian.com/sport/rss",
-     ],
-     "entertainment": [
-         "https://www.hollywoodreporter.com/feed/",
-         "https://variety.com/feed/",
-         # "https://feeds.eonline.com/mrss/article/",
-         "https://www.eonline.com/syndication/feeds/rssfeeds/topstories.xml",
-         "https://www.rollingstone.com/feed/",
-         "https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml"
-     ],
-     "fact_checking": [
-         "https://www.snopes.com/feed/",
-         "https://www.politifact.com/rss/all/",
-         "https://www.factcheck.org/feed/",
-         "https://leadstories.com/atom.xml",
-         # "https://apnews.com/hub/fact-check/rss",
-         # "https://apnews.com/apf-fact-check"
-         "https://fullfact.org/feed/all/",
-         "https://www.truthorfiction.com/feed/"
-     ]
- }
-
- # Reliability boosts for sources by category
- SOURCE_RELIABILITY_BY_CATEGORY = {
-     "science": {
-         "nature.com": 0.95,
-         "science.org": 0.95,
-         "nih.gov": 0.95,
-         "nasa.gov": 0.95,
-         "scientificamerican.com": 0.9,
-         "newscientist.com": 0.9,
-         "pnas.org": 0.95,
-         "cell.com": 0.95,
-         "sciencedirect.com": 0.9,
-         "plos.org": 0.9,
-         "arxiv.org": 0.85
-     },
-     "technology": {
-         "wired.com": 0.9,
-         "techcrunch.com": 0.85,
-         "arstechnica.com": 0.9,
-         "technologyreview.com": 0.9,
-         "theverge.com": 0.85,
-         "cnet.com": 0.85,
-         "engadget.com": 0.85
-     },
-     "fact_checking": {
-         "snopes.com": 0.95,
-         "politifact.com": 0.9,
-         "factcheck.org": 0.9,
-         "apnews.com/hub/fact-check": 0.95,
-         "reuters.com/fact-check": 0.95
-     }
- }
-
- def detect_claim_category(claim: str) -> Tuple[str, float]:
-     """
-     Detect the most likely category of a claim and its confidence score
-
-     Args:
-         claim (str): The claim text
-
-     Returns:
-         tuple: (category_name, confidence_score)
-     """
-     if not claim:
-         return "general", 0.3
-
-     # Lowercase for better matching
-     claim_lower = claim.lower()
-
-     # Count matches for each category
-     category_scores = {}
-
-     for category, keywords in CLAIM_CATEGORIES.items():
-         # Count how many keywords from this category appear in the claim
-         matches = sum(1 for keyword in keywords if keyword.lower() in claim_lower)
-
-         # Calculate a simple score based on matches
-         if matches > 0:
-             # Calculate a more significant score based on number of matches
-             score = min(0.9, 0.3 + (matches * 0.1))  # Base 0.3 + 0.1 per match, max 0.9
-             category_scores[category] = score
-
-     # Find category with highest score
-     if not category_scores:
-         return "general", 0.3
-
-     top_category = max(category_scores.items(), key=lambda x: x[1])
-     category_name, confidence = top_category
-
-     # If the top score is too low, return general
-     if confidence < 0.3:
-         return "general", 0.3
-
-     return category_name, confidence
-
- def get_topic_specific_sources(claim: str, existing_sources: Dict) -> Dict:
-     """
-     Enrich existing sources dict with topic-specific sources
-
-     Args:
-         claim (str): The claim text
-         existing_sources (dict): Current sources configuration
-
-     Returns:
-         dict: Updated sources with topic-specific priorities
-     """
-     # Detect claim category
-     category, confidence = detect_claim_category(claim)
-     logger.info(f"Claim category detected: {category} (confidence: {confidence:.2f})")
-
-     # If confidence is low, keep existing sources
-     if confidence < 0.4:
-         return existing_sources
-
-     # Get specific feeds for the category
-     category_feeds = CATEGORY_SPECIFIC_FEEDS.get(category, [])
-
-     # Only proceed if we have category-specific feeds
-     if not category_feeds:
-         return existing_sources
-
-     # Create a new sources dictionary with category-specific modifications
-     updated_sources = existing_sources.copy()
-
-     # If the category has dedicated feeds, add them to the list
-     # and prioritize them by putting them first in RSS feeds
-     if category in CATEGORY_SPECIFIC_FEEDS:
-         # Add up to 5 category-specific RSS feeds (if we have them)
-         category_feeds_sample = category_feeds[:min(5, len(category_feeds))]
-
-         # Add or update source reliability data
-         if category in SOURCE_RELIABILITY_BY_CATEGORY:
-             for domain, reliability in SOURCE_RELIABILITY_BY_CATEGORY[category].items():
-                 updated_sources["source_credibility"] = updated_sources.get("source_credibility", {})
-                 updated_sources["source_credibility"][domain] = reliability
-
-         # Return updated sources with prioritized feeds
-         return {
-             "category": category,
-             "confidence": confidence,
-             "rss_feeds": category_feeds_sample + (updated_sources.get("rss_feeds", []) or []),
-             "source_credibility": updated_sources.get("source_credibility", {})
-         }
-
-     return existing_sources
-
- def get_prioritized_sources(claim: str, claim_category: Optional[str] = None) -> Dict[str, List[str]]:
-     """
-     Get prioritized sources for a claim based on its category
-
-     Args:
-         claim (str): The claim to check
-         claim_category (str, optional): Override detected category
-
-     Returns:
-         dict: Dictionary with source types prioritized by relevance
-     """
-     # Detect category if not provided
-     if not claim_category:
-         category, confidence = detect_claim_category(claim)
-     else:
-         category = claim_category
-         confidence = 0.8  # Assume high confidence if category is explicitly provided
-
-     # Log detected category
-     logger.info(f"Using claim category: {category} for source prioritization")
-
-     # Default priorities
-     priorities = {
-         "primary": ["wikipedia", "news", "claimreview"],
-         "secondary": ["rss", "scholarly", "wikidata"]
-     }
-
-     # Needs recent evidence check (existing logic)
-     temporal_terms = ["is", "are", "remains", "continues", "still", "currently",
-                       "now", "today", "recent", "latest"]
-     negation_terms = ["not", "no longer", "isn't", "aren't", "doesn't", "don't",
-                       "can't", "cannot", "anymore"]
-
-     requires_recent = any(term in claim.lower() for term in temporal_terms) or \
-                       any(term in claim.lower() for term in negation_terms)
-
-     # Adjust priorities based on category
-     if category == "science":
-         if requires_recent:
-             priorities = {
-                 "primary": ["scholarly", "rss", "wikipedia"],
-                 "secondary": ["news", "claimreview", "wikidata"]
-             }
-         else:
-             priorities = {
-                 "primary": ["scholarly", "wikipedia", "rss"],
-                 "secondary": ["claimreview", "news", "wikidata"]
-             }
-
-     elif category == "technology":
-         if requires_recent:
-             priorities = {
-                 "primary": ["rss", "news", "scholarly"],
-                 "secondary": ["wikipedia", "claimreview", "wikidata"]
-             }
-         else:
-             priorities = {
-                 "primary": ["news", "scholarly", "wikipedia"],
-                 "secondary": ["rss", "claimreview", "wikidata"]
-             }
-
-     elif category == "politics":
-         if requires_recent:
-             priorities = {
-                 "primary": ["rss", "news", "claimreview"],
-                 "secondary": ["wikipedia", "wikidata", "scholarly"]
-             }
-         else:
-             priorities = {
-                 "primary": ["claimreview", "news", "wikipedia"],
-                 "secondary": ["rss", "wikidata", "scholarly"]
-             }
-
-     elif category == "business" or category == "world":
-         if requires_recent:
-             priorities = {
-                 "primary": ["rss", "news", "wikipedia"],
-                 "secondary": ["claimreview", "wikidata", "scholarly"]
-             }
-         else:
-             priorities = {
-                 "primary": ["news", "wikipedia", "rss"],
-                 "secondary": ["claimreview", "wikidata", "scholarly"]
-             }
-
-     elif category == "sports":
-         if requires_recent:
-             priorities = {
-                 "primary": ["rss", "news", "wikipedia"],
-                 "secondary": ["wikidata", "claimreview", "scholarly"]
-             }
-         else:
-             priorities = {
-                 "primary": ["wikipedia", "news", "rss"],
-                 "secondary": ["wikidata", "claimreview", "scholarly"]
-             }
-
-     elif category == "entertainment":
-         if requires_recent:
-             priorities = {
-                 "primary": ["rss", "news", "claimreview"],
-                 "secondary": ["wikipedia", "wikidata", "scholarly"]
-             }
-         else:
-             priorities = {
-                 "primary": ["news", "wikipedia", "claimreview"],
-                 "secondary": ["rss", "wikidata", "scholarly"]
-             }
-
-     # Add category and confidence for reference
-     priorities["category"] = category
-     priorities["confidence"] = confidence
-     priorities["requires_recent"] = requires_recent
-
-     return priorities
-
- def get_category_specific_rss_feeds(category: str, max_feeds: int = 5) -> List[str]:
-     """
-     Get a list of RSS feeds specific to a category
-
-     Args:
-         category (str): The claim category
-         max_feeds (int): Maximum number of feeds to return
-
-     Returns:
-         list: List of RSS feed URLs
-     """
-     # Get category-specific feeds
-     category_feeds = CATEGORY_SPECIFIC_FEEDS.get(category, [])
-
-     # Limit to max_feeds
-     return category_feeds[:min(max_feeds, len(category_feeds))]
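
For reference, a minimal sketch of how these helpers compose, based only on the functions above (the module path `modules.category_detection` is assumed; the example claim and values are illustrative, with the confidence computed from the scoring rule shown above):

# Sketch: category detection driving source prioritization (illustrative).
from modules.category_detection import (
    detect_claim_category,
    get_prioritized_sources,
    get_category_specific_rss_feeds,
)

claim = "NASA discovered a new exoplanet this week."
category, confidence = detect_claim_category(claim)   # e.g. ("science", 0.7)
priorities = get_prioritized_sources(claim, claim_category=category)
feeds = get_category_specific_rss_feeds(category, max_feeds=3)
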
modules/claim_extraction.py DELETED
@@ -1,236 +0,0 @@
1
- import logging
2
- import time
3
- import re
4
- from langdetect import detect
5
- import spacy
6
-
7
- from utils.performance import PerformanceTracker
8
- from utils.models import get_nlp_model, get_llm_model
9
-
10
- logger = logging.getLogger("misinformation_detector")
11
-
12
- performance_tracker = PerformanceTracker()
13
-
14
- def extract_claims(text):
15
- """
16
- Extract the main factual claim from the provided text.
17
- For concise claims (<20 words), preserves them exactly.
18
- For longer text, uses OpenAI to extract the claim.
19
- """
20
- logger.info(f"Extracting claims from: {text}")
21
- start_time = time.time()
22
-
23
- # First, check if the input already appears to be a concise claim
24
- if len(text.split()) < 20:
25
- logger.info("Input appears to be a concise claim already, preserving as-is")
26
- performance_tracker.log_processing_time(start_time)
27
- performance_tracker.log_claim_processed()
28
- return text
29
-
30
- try:
31
- # For longer text, use OpenAI for extraction
32
- extracted_claim = extract_with_openai(text)
33
-
34
- # Log processing time
35
- performance_tracker.log_processing_time(start_time)
36
- performance_tracker.log_claim_processed()
37
-
38
- logger.info(f"Extracted claim: {extracted_claim}")
39
- return extracted_claim
40
- except Exception as e:
41
- logger.error(f"Error extracting claims: {str(e)}")
42
- # Fallback to original text on error
43
- return text
44
-
45
- def extract_with_openai(text):
46
- """
47
- Use OpenAI model for claim extraction
48
- """
49
- try:
50
- # Get LLM model
51
- llm_model = get_llm_model()
52
-
53
- # Create a very explicit prompt to avoid hallucination
54
- prompt = f"""
55
- Extract the main factual claim from the following text.
56
- DO NOT add any information not present in the original text.
57
- DO NOT add locations, dates, or other details.
58
- ONLY extract what is explicitly stated.
59
-
60
- Text: {text}
61
-
62
- Main factual claim:
63
- """
64
-
65
- # Call OpenAI with temperature=0 for deterministic output
66
- response = llm_model.invoke(prompt, temperature=0)
67
- extracted_claim = response.content.strip()
68
-
69
- # Further clean up any explanations or extra text
70
- if ":" in extracted_claim:
71
- parts = extracted_claim.split(":")
72
- if len(parts) > 1:
73
- extracted_claim = parts[-1].strip()
74
-
75
- logger.info(f"OpenAI extraction: {extracted_claim}")
76
-
77
- # Validate that we're not adding info not in the original
78
- nlp = get_nlp_model()
79
- extracted_claim = validate_extraction(text, extracted_claim, nlp)
80
-
81
- return extracted_claim
82
- except Exception as e:
83
- logger.error(f"Error in OpenAI claim extraction: {str(e)}")
84
- return text # Fallback to original
85
-
86
- def validate_extraction(original_text, extracted_claim, nlp):
87
- """
88
- Validate that the extracted claim doesn't add information not present in the original text
89
- """
90
- # If extraction fails or is empty, return original
91
- if not extracted_claim or extracted_claim.strip() == "":
92
- logger.warning("Empty extraction result, using original text")
93
- return original_text
94
-
95
- # Check for added location information
96
- location_terms = ["united states", "america", "u.s.", "usa", "china", "india", "europe",
97
- "russia", "japan", "uk", "germany", "france", "australia"]
98
- for term in location_terms:
99
- if term in extracted_claim.lower() and term not in original_text.lower():
100
- logger.warning(f"Extraction added location '{term}' not in original, using original text")
101
- return original_text
102
-
103
- # Check for entity preservation/addition using spaCy
104
- try:
105
- # Get entities from extracted text
106
- extracted_doc = nlp(extracted_claim)
107
- extracted_entities = [ent.text.lower() for ent in extracted_doc.ents]
108
-
109
- # Get entities from original text
110
- original_doc = nlp(original_text)
111
- original_entities = [ent.text.lower() for ent in original_doc.ents]
112
-
113
- # Check for new entities that don't exist in original
114
- for entity in extracted_entities:
115
- if not any(entity in orig_entity or orig_entity in entity for orig_entity in original_entities):
116
- logger.warning(f"Extraction added new entity '{entity}', using original text")
117
- return original_text
118
-
119
- return extracted_claim
120
- except Exception as e:
121
- logger.error(f"Error in extraction validation: {str(e)}")
122
- return original_text # On error, safer to return original
123
-
124
- def shorten_claim_for_evidence(claim):
125
- """
126
- Shorten a claim to use for evidence retrieval by preserving important keywords
127
-     while maintaining claim context
-     """
-     try:
-         # Get NLP model
-         nlp = get_nlp_model()
-
-         # Use NER to extract key entities
-         doc = nlp(claim)
-
-         # Extract all entities for search
-         entities = [ent.text for ent in doc.ents]
-
-         # Extract key proper nouns, entities, and important context words
-         important_words = []
-
-         # Add all named entities
-         for ent in doc.ents:
-             important_words.append(ent.text)
-
-         # Add important nouns and adjectives not already added
-         for token in doc:
-             if token.pos_ in ["NOUN", "PROPN"] and token.text not in important_words:
-                 important_words.append(token.text)
-
-         # Make sure we include key terms like "prime minister", "president", etc.
-         title_terms = ["president", "prime minister", "minister", "chancellor", "premier", "governor", "mayor", "senator"]
-         for term in title_terms:
-             if term in claim.lower() and not any(term in word.lower() for word in important_words):
-                 # Find the full phrase (e.g., "Canadian Prime Minister")
-                 matches = re.finditer(r'(?i)(?:\w+\s+)*\b' + re.escape(term) + r'\b(?:\s+\w+)*', claim)
-                 for match in matches:
-                     phrase = match.group(0)
-                     if phrase not in important_words:
-                         important_words.append(phrase)
-
-         # Add country names or important place references
-         country_terms = ["canada", "canadian", "us", "united states", "american", "uk", "british", "australia", "china", "russian"]
-         for term in country_terms:
-             if term in claim.lower() and not any(term in word.lower() for word in important_words):
-                 for token in doc:
-                     if token.text.lower() == term and token.text not in important_words:
-                         important_words.append(token.text)
-
-         # Always include negation words as they're critical for meaning
-         negation_terms = ["not", "no longer", "former", "ex-", "isn't", "aren't", "doesn't", "don't"]
-         negation_found = False
-         for term in negation_terms:
-             if term in claim.lower():
-                 # Find the context around the negation (3 words before and after)
-                 matches = re.finditer(r'(?i)(?:\w+\s+){0,3}\b' + re.escape(term) + r'\b(?:\s+\w+){0,3}', claim)
-                 for match in matches:
-                     phrase = match.group(0)
-                     if phrase not in important_words:
-                         important_words.append(phrase)
-                         negation_found = True
-
-         # Special handling for time-sensitive claims with negations
-         is_time_sensitive = any(term in claim.lower() for term in ["anymore", "still", "currently", "now", "today", "recent"])
-
-         # If we have both negation and time sensitivity, ensure we keep those key aspects
-         if negation_found and is_time_sensitive:
-             # Ensure we keep time-sensitive terms
-             time_terms = ["anymore", "still", "currently", "now", "today", "recent"]
-             for term in time_terms:
-                 if term in claim.lower() and not any(term in word.lower() for word in important_words):
-                     # Add the context around the time term
-                     matches = re.finditer(r'(?i)(?:\w+\s+){0,2}\b' + re.escape(term) + r'\b(?:\s+\w+){0,2}', claim)
-                     for match in matches:
-                         phrase = match.group(0)
-                         if phrase not in important_words:
-                             important_words.append(phrase)
-
-         # If entities plus titles don't give us enough, include key parts of claim
-         if len(entities) < 2 and not any("minister" in word.lower() for word in important_words):
-             words = claim.split()
-             # Use first 8 words
-             return " ".join(words[:min(8, len(words))])
-
-         # Remove duplicates while preserving order
-         seen = set()
-         unique_terms = []
-         for word in important_words:
-             if word.lower() not in seen:
-                 seen.add(word.lower())
-                 unique_terms.append(word)
-
-         # Ensure we have a reasonable number of search terms (maintain more for complex claims)
-         search_terms = unique_terms[:min(6, len(unique_terms))]
-
-         # Sort search terms to try to maintain original word order from claim
-         def get_position(term):
-             return claim.lower().find(term.lower())
-
-         search_terms.sort(key=get_position)
-
-         # Join terms to create search query
-         shortened_claim = " ".join(search_terms)
-
-         # If the shortened claim is too short compared to original, use more of original
-         if len(shortened_claim.split()) < 3 and len(claim.split()) > 5:
-             words = claim.split()
-             shortened_claim = " ".join(words[:min(8, len(words))])
-
-         logger.info(f"Shortened Claim: {shortened_claim}")
-
-         return shortened_claim
-     except Exception as e:
-         logger.error(f"Error in shortening claim: {str(e)}")
-         # Return original claim on error
-         return claim
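For reference, a minimal sketch of how this helper was typically invoked (the claim text is hypothetical, and it assumes the spaCy model behind get_nlp_model() is installed):

    from modules.claim_extraction import shorten_claim_for_evidence

    claim = "The Canadian Prime Minister is not attending the G7 summit anymore"
    query = shorten_claim_for_evidence(claim)
    # Expect a compact keyword query that keeps entities, titles, and negation,
    # e.g. roughly "Canadian Prime Minister not attending G7 summit anymore"
    print(query)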
modules/classification.py DELETED
@@ -1,521 +0,0 @@
- import logging
- import re
- from utils.models import get_llm_model
- from utils.performance import PerformanceTracker
-
- logger = logging.getLogger("misinformation_detector")
-
- performance_tracker = PerformanceTracker()
-
- def classify_with_llm(claim, evidence):
-     """
-     Optimized classification function that handles evidence classification
-     and verdict generation in a single LLM call with robust parsing
-     """
-     logger.info(f"Classifying evidence for claim: {claim}")
-
-     # Get the LLM model
-     llm_model = get_llm_model()
-
-     # Skip if no evidence
-     if not evidence:
-         logger.warning("No evidence provided for classification")
-         return []
-
-     # Normalize evidence to a list
-     if not isinstance(evidence, list):
-         if evidence:
-             try:
-                 evidence = [evidence]
-             except Exception as e:
-                 logger.error(f"Could not convert evidence to list: {e}")
-                 return []
-         else:
-             return []
-
-     # Does the claim contain strong assertions that require specific evidence?
-     strong_assertion_markers = [
-         "solved", "cured", "discovered", "confirmed", "proven", "definitive",
-         "breakthrough", "revolutionary", "successfully", "first ever", "extends",
-         "conclusive", "unprecedented", "remarkable", "definitively"
-     ]
-
-     # Check if the claim contains strong assertions that would require specific supporting evidence
-     contains_strong_assertions = any(marker in claim.lower() for marker in strong_assertion_markers)
-
-     # Limit to top 5 evidence items to reduce token usage
-     evidence = evidence[:5]
-
-     try:
-         # Format evidence items
-         evidence_text = ""
-         for idx, chunk in enumerate(evidence):
-             # Truncate long evidence
-             chunk_text = str(chunk)
-             if len(chunk_text) > 300:
-                 chunk_text = chunk_text[:297] + "..."
-
-             evidence_text += f"EVIDENCE {idx+1}:\n{chunk_text}\n\n"
-
-         # Create a structured prompt with explicit formatting instructions
-         # Adjust instructions based on claim characteristics
-         if contains_strong_assertions:
-             prompt = f"""
- CLAIM: {claim}
-
- EVIDENCE:
- {evidence_text}
-
- TASK: Evaluate if the evidence supports, contradicts, or is irrelevant to the claim.
-
- IMPORTANT CONTEXT: This claim makes strong assertions that require specific supporting evidence.
-
- When evaluating such claims:
- 1. Strong assertions require strong, direct evidence - look for specific confirmation from credible sources
- 2. General information about the topic is not sufficient to support specific assertions
- 3. Evidence of ongoing work or research is not sufficient to support claims of completion or success
- 4. If the evidence doesn't directly confirm the specific assertion, classify it as "insufficient" rather than "support"
-
- INSTRUCTIONS:
- 1. For each evidence, provide your analysis in EXACTLY this format:
-
- EVIDENCE 1 ANALYSIS:
- Relevance: [relevant/irrelevant]
- Classification: [support/contradict/insufficient/irrelevant]
- Confidence: [number between 0-100]
- Reason: [brief explanation focusing on whether evidence directly confirms the specific assertion]
-
- 2. After analyzing all evidence pieces, provide a final verdict in this format:
-
- FINAL VERDICT: [clear statement if evidence collectively supports or contradicts the claim]
-
- Without specific, direct supporting evidence, default to "The evidence does not support the claim" rather than "insufficient evidence."
-
- CRITICAL INSTRUCTION: FOCUS ON THE EXACT CLAIM. Evaluate ONLY the specific claim, not related topics
- """
-         else:
-             prompt = f"""
- CLAIM: {claim}
-
- EVIDENCE:
- {evidence_text}
-
- TASK: Evaluate if the evidence supports, contradicts, or is irrelevant to the claim.
-
- INSTRUCTIONS:
- 1. For each evidence, provide your analysis in EXACTLY this format:
-
- EVIDENCE 1 ANALYSIS:
- Relevance: [relevant/irrelevant]
- Classification: [support/contradict/insufficient/irrelevant]
- Confidence: [number between 0-100]
- Reason: [brief explanation]
-
- 2. After analyzing all evidence pieces, provide a final verdict in this format:
-
- FINAL VERDICT: [clear statement if evidence collectively supports or contradicts the claim]
-
- CRITICAL INSTRUCTION: FOCUS ON THE EXACT CLAIM. Evaluate ONLY the specific claim, not related topics
- """
-
-         # Get response with temperature=0 for consistency
-         result = llm_model.invoke(prompt, temperature=0)
-         result_text = result.content.strip()
-
-         # Extract final verdict first since it's most important
-         final_verdict = None
-         final_match = re.search(r'FINAL VERDICT:\s*(.*?)(?=\s*$|\n\n)', result_text, re.DOTALL | re.IGNORECASE)
-         if final_match:
-             final_verdict = final_match.group(1).strip()
-             logger.info(f"Final assessment: {final_verdict}")
-
-         # Define a precise regex pattern matching the requested format
-         analysis_pattern = r'EVIDENCE\s+(\d+)\s+ANALYSIS:\s*\n+Relevance:\s*(relevant|irrelevant)\s*\n+Classification:\s*(support|contradict|neutral|irrelevant|insufficient)\s*\n+Confidence:\s*(\d+)\s*\n+Reason:\s*(.*?)(?=\s*EVIDENCE\s+\d+\s+ANALYSIS:|\s*FINAL VERDICT:|\s*$)'
-
-         # Parse each evidence analysis
-         classification_results = []
-         matched_evidence = set()
-
-         # Try matching with our strict pattern first
-         matches = list(re.finditer(analysis_pattern, result_text, re.IGNORECASE | re.DOTALL))
-
-         # If no matches, try a more flexible pattern
-         if not matches:
-             flexible_pattern = r'(?:EVIDENCE|Evidence)\s+(\d+)(?:\s+ANALYSIS)?:?\s*\n+(?:Relevance|relevance):\s*(relevant|irrelevant|unknown)\s*\n+(?:Classification|classification):\s*(support|contradict|neutral|irrelevant|insufficient|unknown)\s*\n+(?:Confidence|confidence):\s*(\d+)\s*\n+(?:Reason|reason|Brief reason):\s*(.*?)(?=\s*(?:EVIDENCE|Evidence)\s+\d+|FINAL VERDICT:|$)'
-             matches = list(re.finditer(flexible_pattern, result_text, re.IGNORECASE | re.DOTALL))
-
-         # Process matches
-         for match in matches:
-             try:
-                 evidence_idx = int(match.group(1)) - 1
-                 relevance = match.group(2).lower()
-                 classification = match.group(3).lower()
-                 confidence = int(match.group(4))
-                 reason = match.group(5).strip()
-
-                 # Normalize classification terms
-                 if classification == "neutral":
-                     classification = "insufficient"
-
-                 # For strong assertions, apply confidence adjustments based on classification
-                 if contains_strong_assertions:
-                     if classification == "support":
-                         # Check if the reasoning indicates direct or indirect support
-                         indirect_support_markers = ["general", "doesn't directly", "does not directly",
-                                                     "doesn't specifically", "not specific", "related to",
-                                                     "doesn't confirm"]
-                         if any(marker in reason.lower() for marker in indirect_support_markers):
-                             # Downgrade support confidence for indirect evidence
-                             confidence = max(5, confidence - 20)
-                     elif classification == "contradict":
-                         # For contradictions of strong assertions, slightly boost confidence
-                         confidence = min(95, confidence + 5)
-
-                 # Ensure index is valid
-                 if 0 <= evidence_idx < len(evidence):
-                     matched_evidence.add(evidence_idx)
-
-                     # Create result entry
-                     classification_results.append({
-                         "label": classification,
-                         "confidence": confidence / 100.0,
-                         "evidence": evidence[evidence_idx],
-                         "relevance": relevance,
-                         "reason": reason,
-                         "final_assessment": final_verdict
-                     })
-             except (ValueError, IndexError) as e:
-                 logger.error(f"Error parsing evidence analysis: {e}")
-
-         # Handle any unmatched evidence items
-         if matches:  # Only add defaults if we successfully matched some
-             for idx, ev in enumerate(evidence):
-                 if idx not in matched_evidence:
-                     # Check if the evidence text itself suggests a classification
-                     contains_support = bool(re.search(r'support|confirm|verify|true|correct|released', final_verdict or "", re.IGNORECASE))
-                     contains_contradicting = bool(re.search(r'not yet|hasn\'t|have not|doesn\'t|don\'t|cannot|preliminary|proposed', str(ev).lower()))
-
-                     # For claims with strong assertions without explicit evidence, be more cautious
-                     if contains_strong_assertions:
-                         if contains_contradicting:
-                             label = "contradict"
-                             confidence = 0.6
-                         elif contains_support:
-                             label = "insufficient"  # Default to insufficient for strong assertions without clear analysis
-                             confidence = 0.5
-                         else:
-                             label = "insufficient"
-                             confidence = 0.5
-                     else:
-                         label = "support" if contains_support else "unknown"
-                         confidence = 0.7 if contains_support else 0.5
-
-                     classification_results.append({
-                         "label": label,
-                         "confidence": confidence,
-                         "evidence": ev,
-                         "relevance": "relevant" if (contains_support or contains_contradicting) else "unknown",
-                         "reason": "Based on overall assessment",
-                         "final_assessment": final_verdict
-                     })
-         else:
-             # No structured parsing worked, use final verdict to create simple results
-             contains_support = bool(re.search(r'support|confirm|verify|true|correct|released', final_verdict or "", re.IGNORECASE))
-             contains_contradict = bool(re.search(r'contradict|against|false|incorrect|not support|does not support|insufficient evidence|does not confirm|no evidence', final_verdict or "", re.IGNORECASE))
-             contains_insufficient = bool(re.search(r'insufficient|not enough|cannot determine|no evidence|lack of evidence', final_verdict or "", re.IGNORECASE))
-
-             # For claims with strong assertions, be more stringent
-             if contains_strong_assertions:
-                 if contains_support and not contains_insufficient and not contains_contradict:
-                     label = "support"
-                     confidence = 0.6  # Lower confidence even for support of strong assertions
-                 elif contains_contradict:
-                     label = "contradict"
-                     confidence = 0.8  # Higher confidence for contradiction of strong assertions
-                 else:
-                     label = "insufficient"
-                     confidence = 0.7  # Good confidence for insufficient judgment
-             else:
-                 label = "support" if contains_support else "contradict" if contains_contradict else "unknown"
-                 confidence = 0.7 if (contains_support or contains_contradict) else 0.5
-
-             # Create basic results based on final verdict
-             for ev in evidence:
-                 classification_results.append({
-                     "label": label,
-                     "confidence": confidence,
-                     "evidence": ev,
-                     "relevance": "relevant" if (contains_support or contains_contradict) else "unknown",
-                     "reason": final_verdict or "Based on collective evidence",
-                     "final_assessment": final_verdict
-                 })
-
-         logger.info(f"Classified {len(classification_results)} evidence items")
-         return classification_results
-
-     except Exception as e:
-         logger.error(f"Error in evidence classification: {str(e)}")
-         # Provide a basic fallback that checks for keywords in evidence
-         try:
-             fallback_results = []
-             for ev in evidence:
-                 ev_text = str(ev).lower()
-                 supports = False
-                 contradicts = False
-
-                 # Basic keyword checking as last resort
-                 if claim.lower() in ev_text:
-                     keywords = [word for word in claim.lower().split() if len(word) > 3]
-                     matching_keywords = [k for k in keywords if k in ev_text]
-
-                     # If substantial keywords match, consider it support
-                     supports = len(matching_keywords) >= max(1, len(keywords) // 2)
-
-                 # Check for contradiction terms
-                 contradiction_terms = ["not yet", "hasn't", "haven't", "cannot", "can't",
-                                        "doesn't", "don't", "no evidence", "insufficient",
-                                        "preliminary", "proposed", "in development", "future"]
-                 contradicts = any(term in ev_text for term in contradiction_terms)
-
-                 # For claims with strong assertions, be more conservative in the fallback case
-                 if contains_strong_assertions:
-                     if contradicts:
-                         fallback_results.append({
-                             "label": "contradict",
-                             "confidence": 0.6,
-                             "evidence": ev,
-                             "relevance": "relevant",
-                             "reason": "Evidence suggests the claim is not yet proven (fallback method)"
-                         })
-                     elif supports:
-                         fallback_results.append({
-                             "label": "insufficient",
-                             "confidence": 0.6,
-                             "evidence": ev,
-                             "relevance": "relevant",
-                             "reason": "Evidence is related but doesn't conclusively confirm the assertion (fallback method)"
-                         })
-                     else:
-                         fallback_results.append({
-                             "label": "unknown",
-                             "confidence": 0.5,
-                             "evidence": ev,
-                             "relevance": "unknown",
-                             "reason": "Cannot determine relevance (fallback method)"
-                         })
-                 else:
-                     fallback_results.append({
-                         "label": "support" if supports else "unknown",
-                         "confidence": 0.6 if supports else 0.5,
-                         "evidence": ev,
-                         "relevance": "relevant" if supports else "unknown",
-                         "reason": "Based on keyword matching (fallback method)"
-                     })
-
-             return fallback_results
-         except:
-             # Absolute last resort
-             return [{"label": "unknown", "confidence": 0.5, "evidence": ev} for ev in evidence]
-
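A minimal usage sketch for the classifier above (the evidence strings are hypothetical, and it assumes an LLM is configured behind get_llm_model() exposing a LangChain-style invoke()):

    from modules.classification import classify_with_llm

    claim = "Meta released Llama 4"
    evidence = [
        "Title: Meta announces new model, Source: Example News, Content: ...",
        "Entity: LLaMA, Description: family of large language models by Meta AI",
    ]
    for item in classify_with_llm(claim, evidence):
        print(item["label"], item["confidence"], item.get("reason", ""))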
- def aggregate_evidence(classification_results):
-     """
-     Aggregate evidence classifications to determine overall verdict
-     with robust fallback mechanisms for reliable results
-     """
-     logger.info(f"Aggregating evidence from {len(classification_results) if classification_results else 0} results")
-
-     if not classification_results:
-         logger.warning("No classification results to aggregate")
-         return "Uncertain", 0.3  # Default with low confidence
-
-     # Assess the claim's characteristics (without relying on explicit category detection)
-     # Does the claim contain strong assertions that require specific evidence?
-     strong_assertion_markers = [
-         "solved", "cured", "discovered", "confirmed", "proven", "definitive",
-         "breakthrough", "revolutionary", "successfully", "first ever", "extends",
-         "conclusive", "unprecedented", "remarkable", "definitively"
-     ]
-
-     # Check if claim text is available in final assessment
-     claim_text = None
-     claim_has_strong_assertions = False
-
-     # Extract claim from final assessment if available
-     for item in classification_results:
-         if "final_assessment" in item and item["final_assessment"]:
-             match = re.search(r'the claim (?:that )?"?([^"]+)"?', item["final_assessment"], re.IGNORECASE)
-             if match:
-                 claim_text = match.group(1)
-                 claim_has_strong_assertions = any(marker in claim_text.lower() for marker in strong_assertion_markers)
-                 break
-
-     # If we couldn't extract the claim, check evidence context for assertion indicators
-     if not claim_text:
-         # Check if evidence reasons suggest dealing with strong assertions
-         assertion_context_indicators = ["conclusive evidence", "definitive proof", "solved", "breakthrough",
-                                         "revolutionary", "directly confirms", "specific confirmation"]
-
-         reasons = [item.get("reason", "").lower() for item in classification_results if "reason" in item]
-         assertion_indicators_count = sum(1 for indicator in assertion_context_indicators
-                                          for reason in reasons if indicator in reason)
-
-         claim_has_strong_assertions = assertion_indicators_count >= 2
-
-     # Extract final assessment if present
-     final_assessment = None
-     for item in classification_results:
-         if "final_assessment" in item and item["final_assessment"]:
-             final_assessment = item["final_assessment"]
-             break
-
-     # Count evidence by classification
-     support_items = [item for item in classification_results if item.get("label") == "support"]
-     contradict_items = [item for item in classification_results if item.get("label") == "contradict"]
-     insufficient_items = [item for item in classification_results if item.get("label") in ["insufficient", "neutral"]]
-     relevant_items = [item for item in classification_results
-                       if item.get("relevance") == "relevant" or item.get("label") in ["support", "contradict"]]
-
-     # Calculate the proportion of supported evidence
-     total_relevant = len(relevant_items)
-
-     # Direct keyword detection from final assessment or evidence
-     if final_assessment:
-         # Check for support indicators in final assessment
-         supports_pattern = r'\b(support|confirm|verify|true|correct|released|proves|validates|evidence (?:that |for |of )(?:the claim|it) is true)\b'
-         contradicts_pattern = r'\b(contradict|refute|deny|false|incorrect|not released|doesn\'t support|does not support|no evidence|cannot support|is not true|evidence (?:that |for |of )(?:the claim|it) is false)\b'
-         insufficient_pattern = r'\b(uncertain|insufficient|not enough|inconclusive|cannot determine|unable to determine|lack of evidence)\b'
-
-         supports_match = re.search(supports_pattern, final_assessment, re.IGNORECASE)
-         contradicts_match = re.search(contradicts_pattern, final_assessment, re.IGNORECASE)
-         insufficient_match = re.search(insufficient_pattern, final_assessment, re.IGNORECASE)
-
-         # Direct determination based on final assessment keywords
-         if supports_match and not contradicts_match and not insufficient_match:
-             # Get max confidence from supporting evidence
-             confidence = max([item.get("confidence", 0) for item in support_items]) if support_items else 0.7
-
-             # Adjust confidence for claims with strong assertions
-             if claim_has_strong_assertions:
-                 confidence = min(confidence, 0.8)  # Cap confidence for strong assertions
-
-             return "True (Based on Evidence)", max(0.6, confidence)  # Minimum 0.6 confidence
-
-         if contradicts_match and not supports_match:
-             # Get max confidence from contradicting evidence
-             confidence = max([item.get("confidence", 0) for item in contradict_items]) if contradict_items else 0.7
-
-             # For claims with strong assertions, increase confidence in contradiction
-             if claim_has_strong_assertions:
-                 confidence = max(confidence, 0.7)  # Minimum 0.7 confidence for contradicting strong assertions
-
-             return "False (Based on Evidence)", max(0.6, confidence)  # Minimum 0.6 confidence
-
-         if insufficient_match:
-             # For claims with strong assertions without confirming evidence,
-             # change "Uncertain" to a clearer negative verdict
-             if claim_has_strong_assertions:
-                 return "False (Based on Evidence)", 0.7
-             return "Uncertain", 0.4  # Medium-low confidence
-
-     # If we have distinct classifications, weigh them by confidence and quantity
-     if support_items and (not contradict_items or all(item.get("confidence", 0) < 0.95 for item in contradict_items)):
-         # Check if there's high confidence support evidence (greater than 0.95)
-         high_confidence_support = [item for item in support_items if item.get("confidence", 0) > 0.95]
-
-         if high_confidence_support:
-             # High confidence support evidence exists, use it even if there are some contradictions
-             confidence = max([item.get("confidence", 0) for item in high_confidence_support])
-             # For claims with strong assertions, be more conservative with pure support
-             if claim_has_strong_assertions:
-                 confidence = min(confidence, 0.8)
-             return "True (Based on Evidence)", max(0.7, confidence)
-         elif not contradict_items:
-             # All supportive evidence with no contradictions (standard case)
-             confidence = max([item.get("confidence", 0) for item in support_items])
-
-             # For claims with strong assertions, be more conservative with pure support
-             if claim_has_strong_assertions:
-                 # For strong assertions with only support but no contradictions, be cautious
-                 confidence = min(confidence, 0.7)
-                 # If the support is from low-quality evidence, consider it uncertain
-                 support_reasons = [item.get("reason", "").lower() for item in support_items]
-                 weak_supports = sum(1 for reason in support_reasons if
-                                     "general information" in reason or
-                                     "doesn't specify" in reason or
-                                     "does not directly" in reason)
-                 if weak_supports / max(1, len(support_items)) > 0.5:
-                     return "Uncertain", 0.6
-
-             return "True (Based on Evidence)", max(0.6, confidence)
-
-     if contradict_items and not support_items:
-         # All contradicting evidence
-         confidence = max([item.get("confidence", 0) for item in contradict_items])
-
-         # For claims with strong assertions, increase confidence in contradiction
-         if claim_has_strong_assertions:
-             confidence = max(confidence, 0.7)
-
-         return "False (Based on Evidence)", max(0.6, confidence)
-
-     if insufficient_items and len(insufficient_items) > len(support_items) + len(contradict_items):
-         # Mostly insufficient evidence
-         # For claims with strong assertions and mainly insufficient evidence, lean toward "False"
-         if claim_has_strong_assertions:
-             return "False (Based on Evidence)", 0.7
-         return "Uncertain", 0.5  # Medium confidence for explicitly uncertain
-
-     if support_items and contradict_items:
-         # Competing evidence - compare confidence and quantity
-         support_confidence = max([item.get("confidence", 0) for item in support_items])
-         contradict_confidence = max([item.get("confidence", 0) for item in contradict_items])
-
-         # For claims with strong assertions, require stronger support to overcome contradiction
-         if claim_has_strong_assertions:
-             # Higher threshold for strong assertions
-             if support_confidence > contradict_confidence + 0.3:
-                 return "True (Based on Evidence)", support_confidence * 0.9  # Apply a confidence penalty
-             elif contradict_confidence >= support_confidence - 0.1:  # Lower threshold for contradiction
-                 return "False (Based on Evidence)", max(contradict_confidence, 0.7)  # Minimum 0.7 confidence
-             else:
-                 # Default to uncertain for close calls on strong assertions
-                 return "Uncertain", 0.6
-         else:
-             # Standard threshold for regular claims
-             if support_confidence > contradict_confidence + 0.2:
-                 return "True (Based on Evidence)", support_confidence
-             elif contradict_confidence > support_confidence + 0.2:
-                 return "False (Based on Evidence)", contradict_confidence
-             else:
-                 # Close call - check quantity of evidence
-                 if len(support_items) > len(contradict_items) * 2:
-                     return "True (Based on Evidence)", support_confidence * 0.9  # Slight confidence penalty
-                 elif len(contradict_items) > len(support_items) * 2:
-                     return "False (Based on Evidence)", contradict_confidence * 0.9  # Slight confidence penalty
-                 else:
-                     # Truly conflicting evidence
-                     return "Uncertain", 0.5  # Medium confidence
-
-     # Check for evidence quality issues
-     all_unknown = all(item.get("label") == "unknown" for item in classification_results)
-     evidence_text = " ".join([str(item.get("evidence", "")) for item in classification_results])
-
-     # General case: For any claims with all unknown labels that contain markers of strong assertions
-     if all_unknown and claim_has_strong_assertions:
-         # Absence of clear supporting evidence for claims with strong assertions points to "False"
-         return "False (Based on Evidence)", 0.7
-
-     # For general claims, if all items are unknown but evidence clearly mentions the claim
-     if all_unknown:
-         # Examples of direct evidence matching as fallback
-         if re.search(r'\bllama\s*4\b', evidence_text, re.IGNORECASE) and re.search(r'\bmeta\b|\bfacebook\b', evidence_text, re.IGNORECASE) and re.search(r'\breleas', evidence_text, re.IGNORECASE):
-             return "True (Based on Evidence)", 0.7
-         elif re.search(r'\bnot\s+releas', evidence_text, re.IGNORECASE) or re.search(r'\bdenies\b|\bdenied\b', evidence_text, re.IGNORECASE):
-             return "False (Based on Evidence)", 0.7
-
-     # Default to uncertain if no clear pattern - but with special case for claims with strong assertions
-     if claim_has_strong_assertions:
-         # For claims with strong assertions with no clear evidence, default to false
-         return "False (Based on Evidence)", 0.7
-
-     return "Uncertain", 0.3
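The two functions in this file were used together; a sketch of the intended flow, continuing the hypothetical inputs above:

    results = classify_with_llm(claim, evidence)
    verdict, confidence = aggregate_evidence(results)
    print(f"{verdict} (confidence {confidence:.2f})")
    # e.g. True (Based on Evidence) (confidence 0.75)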
modules/evidence_retrieval.py DELETED
@@ -1,944 +0,0 @@
- import logging
- import time
- import re
- import random
- import requests
- import json
- import ssl
- from urllib.parse import urlencode
- from bs4 import BeautifulSoup
- from SPARQLWrapper import SPARQLWrapper, JSON
- from datetime import datetime, timedelta
- from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED
-
- from utils.api_utils import api_error_handler, safe_json_parse
- from utils.models import get_nlp_model
- from modules.claim_extraction import shorten_claim_for_evidence, extract_claims
- from modules.rss_feed import retrieve_evidence_from_rss
- from modules.semantic_analysis import analyze_evidence_relevance, select_diverse_evidence
- from config import SOURCE_CREDIBILITY, NEWS_API_KEY, FACTCHECK_API_KEY
-
- # Import the performance tracker
- from utils.performance import PerformanceTracker
- performance_tracker = PerformanceTracker()
-
- logger = logging.getLogger("misinformation_detector")
-
- # Define early analysis function at the module level so it's available everywhere
- def analyze_early_evidence(claim, source_name, source_evidence):
-     """Pre-analyze evidence while waiting for other sources to complete"""
-     try:
-         if not source_evidence:
-             return None
-
-         logger.info(f"Pre-analyzing {len(source_evidence)} evidence items from {source_name}")
-
-         # Do a quick relevance check using similarity scoring
-         nlp_model = get_nlp_model()
-         claim_doc = nlp_model(claim)
-
-         relevant_evidence = []
-         for evidence in source_evidence:
-             if not isinstance(evidence, str):
-                 continue
-
-             # Look for direct keyword matches first (fast check)
-             is_related = False
-             keywords = [word.lower() for word in claim.split() if len(word) > 3]
-             for keyword in keywords:
-                 if keyword in evidence.lower():
-                     is_related = True
-                     break
-
-             # If no keywords match, do a basic entity check
-             if not is_related:
-                 # Check if claim and evidence share any entities
-                 evidence_doc = nlp_model(evidence[:500])  # Limit for speed
-                 claim_entities = [ent.text.lower() for ent in claim_doc.ents]
-                 evidence_entities = [ent.text.lower() for ent in evidence_doc.ents]
-
-                 common_entities = set(claim_entities).intersection(set(evidence_entities))
-                 if common_entities:
-                     is_related = True
-
-             if is_related:
-                 relevant_evidence.append(evidence)
-
-         logger.info(f"Found {len(relevant_evidence)} relevant items out of {len(source_evidence)} from {source_name}")
-         return relevant_evidence
-     except Exception as e:
-         logger.error(f"Error in early evidence analysis: {e}")
-         return source_evidence  # On error, return original evidence
-
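The pre-analysis above is a two-stage relevance heuristic: cheap keyword overlap first, then a spaCy entity-overlap check only when keywords miss. A condensed sketch of the same idea (illustrative only; names are not from the deleted module):

    def quick_is_related(claim_doc, claim, evidence, nlp):
        # Stage 1: any claim keyword (>3 chars) appears in the evidence text
        if any(w.lower() in evidence.lower() for w in claim.split() if len(w) > 3):
            return True
        # Stage 2: claim and evidence share at least one named entity
        evidence_ents = {e.text.lower() for e in nlp(evidence[:500]).ents}
        claim_ents = {e.text.lower() for e in claim_doc.ents}
        return bool(claim_ents & evidence_ents)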
- # New function to get recent date for filtering news
- def get_recent_date_range():
-     """Return date range for recent news filtering - last 3 days"""
-     today = datetime.now()
-     three_days_ago = today - timedelta(days=3)
-     return three_days_ago.strftime('%Y-%m-%d'), today.strftime('%Y-%m-%d')
-
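The returned bounds are ISO-formatted strings that feed the NewsAPI from/to parameters further down, for example:

    from_date, to_date = get_recent_date_range()
    # e.g. ('2025-04-07', '2025-04-10') if called on 2025-04-10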
- @api_error_handler("wikipedia")
- def retrieve_evidence_from_wikipedia(claim):
-     """Retrieve evidence from Wikipedia for a given claim"""
-     logger.info(f"Retrieving evidence from Wikipedia for: {claim}")
-
-     # Ensure shortened_claim is a string
-     try:
-         shortened_claim = shorten_claim_for_evidence(claim)
-     except Exception as e:
-         logger.error(f"Error in claim shortening: {e}")
-         shortened_claim = claim  # Fallback to original claim
-
-     # Ensure query_parts is a list of strings
-     query_parts = str(shortened_claim).split()
-     evidence = []
-     source_count = {"wikipedia": 0}
-
-     for i in range(len(query_parts), 0, -1):  # Start with full query, shorten iteratively
-         try:
-             # Safely join and encode query
-             current_query = "+".join(query_parts[:i])
-             search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={current_query}&format=json"
-             logger.info(f"Wikipedia search URL: {search_url}")
-
-             headers = {
-                 "User-Agent": "MisinformationDetectionResearchBot/1.0 (Research Project)"
-             }
-
-             # Make the search request with reduced timeout
-             response = requests.get(search_url, headers=headers, timeout=7)
-             response.raise_for_status()
-
-             # Safely parse JSON
-             search_data = safe_json_parse(response, "wikipedia")
-
-             # Safely extract search results
-             search_results = search_data.get("query", {}).get("search", [])
-
-             # Ensure search_results is a list
-             if not isinstance(search_results, list):
-                 logger.warning(f"Unexpected search results type: {type(search_results)}")
-                 search_results = []
-
-             # Use ThreadPoolExecutor to fetch page content in parallel
-             with ThreadPoolExecutor(max_workers=3) as executor:
-                 # Submit up to 3 page requests in parallel
-                 futures = []
-                 for idx, result in enumerate(search_results[:3]):
-                     # Ensure result is a dictionary
-                     if not isinstance(result, dict):
-                         logger.warning(f"Skipping non-dictionary result: {type(result)}")
-                         continue
-
-                     # Safely extract title
-                     page_title = result.get("title", "")
-                     if not page_title:
-                         continue
-
-                     page_url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
-
-                     # Submit the page request task to executor
-                     futures.append(executor.submit(
-                         fetch_wikipedia_page_content,
-                         page_url,
-                         page_title,
-                         headers
-                     ))
-
-                 # Process completed futures as they finish
-                 for future in as_completed(futures):
-                     try:
-                         page_result = future.result()
-                         if page_result:
-                             evidence.append(page_result)
-                             source_count["wikipedia"] += 1
-                     except Exception as e:
-                         logger.error(f"Error processing Wikipedia page: {e}")
-
-             # Stop if we found any evidence
-             if evidence:
-                 break
-
-         except Exception as e:
-             logger.error(f"Error retrieving from Wikipedia: {str(e)}")
-             continue
-
-     # Ensure success is a boolean
-     success = bool(evidence)
-
-     # Safely log evidence retrieval
-     try:
-         performance_tracker.log_evidence_retrieval(success, source_count)
-     except Exception as e:
-         logger.error(f"Error logging evidence retrieval: {e}")
-
-     if not evidence:
-         logger.warning("No evidence found from Wikipedia.")
-
-     return evidence
-
- def fetch_wikipedia_page_content(page_url, page_title, headers):
-     """Helper function to fetch and parse Wikipedia page content"""
-     try:
-         # Get page content with reduced timeout
-         page_response = requests.get(page_url, headers=headers, timeout=5)
-         page_response.raise_for_status()
-
-         # Extract relevant sections using BeautifulSoup
-         soup = BeautifulSoup(page_response.text, 'html.parser')
-         paragraphs = soup.find_all('p', limit=3)  # Limit to first 3 paragraphs
-         content = " ".join([para.get_text(strip=True) for para in paragraphs])
-
-         # Truncate content to reduce token usage earlier in the pipeline
-         if len(content) > 300:
-             content = content[:297] + "..."
-
-         if content.strip():  # Ensure content is not empty
-             return f"Title: {page_title}, URL: {page_url}, Content: {content}"
-         return None
-     except Exception as e:
-         logger.error(f"Error fetching Wikipedia page {page_url}: {e}")
-         return None
-
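A minimal sketch of exercising the Wikipedia retriever on its own (hypothetical claim; requires network access):

    evidence = retrieve_evidence_from_wikipedia("The Eiffel Tower is in Paris")
    for item in evidence:
        # "Title: ..., URL: ..., Content: ..." strings, truncated to ~300 chars
        print(item)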
- # Update the WikiData function to fix SSL issues
- @api_error_handler("wikidata")
- def retrieve_evidence_from_wikidata(claim):
-     """Retrieve evidence from WikiData for a given claim"""
-     logger.info(f"Retrieving evidence from WikiData for: {claim}")
-
-     # Prepare entities for SPARQL query
-     shortened_claim = shorten_claim_for_evidence(claim)
-     query_terms = shortened_claim.split()
-
-     # Initialize SPARQLWrapper for WikiData
-     sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
-
-     # Use a more conservative user agent to avoid blocks
-     sparql.addCustomHttpHeader("User-Agent", "MisinformationDetectionResearchBot/1.0")
-
-     # Fix SSL issues by disabling SSL verification for this specific request
-     try:
-         # Create a context where we don't verify SSL certs
-         import ssl
-         import urllib.request
-
-         # Create a context that doesn't verify certificates
-         ssl_context = ssl._create_unverified_context()
-
-         # Monkey patch the opener for SPARQLWrapper
-         opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=ssl_context))
-         urllib.request.install_opener(opener)
-     except Exception as e:
-         logger.error(f"Error setting up SSL context: {str(e)}")
-
-     # Construct basic SPARQL query for relevant entities
-     query = """
-     SELECT ?item ?itemLabel ?description ?article WHERE {
-       SERVICE wikibase:mwapi {
-         bd:serviceParam wikibase:api "EntitySearch" .
-         bd:serviceParam wikibase:endpoint "www.wikidata.org" .
-         bd:serviceParam mwapi:search "%s" .
-         bd:serviceParam mwapi:language "en" .
-         ?item wikibase:apiOutputItem mwapi:item .
-       }
-       ?item schema:description ?description .
-       FILTER(LANG(?description) = "en")
-       OPTIONAL {
-         ?article schema:about ?item .
-         ?article schema:isPartOf <https://en.wikipedia.org/> .
-       }
-       SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
-     }
-     LIMIT 5
-     """ % " ".join(query_terms)
-
-     sparql.setQuery(query)
-     sparql.setReturnFormat(JSON)
-
-     try:
-         results = sparql.query().convert()
-
-         wikidata_evidence = []
-
-         for result in results["results"]["bindings"]:
-             entity_label = result.get("itemLabel", {}).get("value", "Unknown")
-             description = result.get("description", {}).get("value", "No description")
-             article_url = result.get("article", {}).get("value", "")
-
-             # Truncate description to reduce token usage
-             if len(description) > 200:
-                 description = description[:197] + "..."
-
-             evidence_text = f"Entity: {entity_label}, Description: {description}"
-             if article_url:
-                 evidence_text += f", URL: {article_url}"
-
-             wikidata_evidence.append(evidence_text)
-
-         logger.info(f"Retrieved {len(wikidata_evidence)} WikiData entities")
-         return wikidata_evidence
-
-     except Exception as e:
-         logger.error(f"Error retrieving from WikiData: {str(e)}")
-         return []
-
- @api_error_handler("openalex")
- def retrieve_evidence_from_openalex(claim):
-     """Retrieve evidence from OpenAlex for a given claim (replacement for Semantic Scholar)"""
-     logger.info(f"Retrieving evidence from OpenAlex for: {claim}")
-
-     try:
-         shortened_claim = shorten_claim_for_evidence(claim)
-         query = shortened_claim.replace(" ", "+")
-
-         # OpenAlex API endpoint
-         api_url = f"https://api.openalex.org/works?search={query}&filter=is_paratext:false&per_page=3"
-
-         headers = {
-             "Accept": "application/json",
-             "User-Agent": "MisinformationDetectionResearchBot/1.0 ([email protected])",
-         }
-
-         scholarly_evidence = []
-
-         try:
-             # Request with reduced timeout
-             response = requests.get(api_url, headers=headers, timeout=8)
-
-             # Check response status
-             if response.status_code == 200:
-                 # Successfully retrieved data
-                 data = safe_json_parse(response, "openalex")
-                 papers = data.get("results", [])
-
-                 for paper in papers:
-                     title = paper.get("title", "Unknown Title")
-                     abstract = paper.get("abstract_inverted_index", None)
-
-                     # OpenAlex stores abstracts in an inverted index format, so we need to reconstruct it
-                     abstract_text = "No abstract available"
-                     if abstract:
-                         try:
-                             # Simple approach to reconstruct from inverted index
-                             # For a production app, implement a proper reconstruction algorithm
-                             words = list(abstract.keys())
-                             abstract_text = " ".join(words[:30]) + "..."
-                         except Exception as e:
-                             logger.error(f"Error reconstructing abstract: {e}")
-
-                     url = paper.get("doi", "")
-                     if url and not url.startswith("http"):
-                         url = f"https://doi.org/{url}"
-
-                     year = ""
-                     publication_date = paper.get("publication_date", "")
-                     if publication_date:
-                         year = publication_date.split("-")[0]
-
-                     # Truncate abstract to reasonable length
-                     if len(abstract_text) > 250:
-                         abstract_text = abstract_text[:247] + "..."
-
-                     evidence_text = f"Title: {title}, Year: {year}, Abstract: {abstract_text}, URL: {url}"
-                     scholarly_evidence.append(evidence_text)
-
-             else:
-                 logger.error(f"OpenAlex API error: {response.status_code}")
-
-         except requests.exceptions.Timeout:
-             logger.warning("OpenAlex request timed out")
-         except requests.exceptions.ConnectionError:
-             logger.warning("OpenAlex connection error")
-         except Exception as e:
-             logger.error(f"Unexpected error in OpenAlex request: {str(e)}")
-
-         logger.info(f"Retrieved {len(scholarly_evidence)} scholarly papers from OpenAlex")
-         return scholarly_evidence
-
-     except Exception as e:
-         logger.error(f"Fatal error in OpenAlex retrieval: {str(e)}")
-         return []
-
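The abstract handling above only joins the first 30 index keys, which scrambles word order; the inline comment asks for a proper reconstruction. Since abstract_inverted_index maps each word to the list of positions where it occurs in the abstract, a faithful rebuild is a short sort (a sketch, not part of the deleted code):

    def reconstruct_abstract(inverted_index):
        # Place every word at each of its recorded positions, then sort by position
        positions = [
            (pos, word)
            for word, pos_list in inverted_index.items()
            for pos in pos_list
        ]
        return " ".join(word for _, word in sorted(positions))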
- @api_error_handler("factcheck")
- def retrieve_evidence_from_claimreview(claim):
-     """Retrieve evidence from Google's ClaimReview for a given claim"""
-     logger.info(f"Retrieving evidence from ClaimReview for: {claim}")
-     factcheck_api_key = FACTCHECK_API_KEY
-
-     # Safely shorten claim
-     try:
-         shortened_claim = shorten_claim_for_evidence(claim)
-     except Exception as e:
-         logger.error(f"Error shortening claim: {e}")
-         shortened_claim = claim
-
-     query_parts = str(shortened_claim).split()
-     factcheck_results = []
-     source_count = {"factcheck": 0}
-
-     for i in range(len(query_parts), 0, -1):  # Iteratively try shorter queries
-         try:
-             current_query = " ".join(query_parts[:i])
-             encoded_query = urlencode({"query": current_query})
-             factcheck_url = f"https://factchecktools.googleapis.com/v1alpha1/claims:search?{encoded_query}&key={factcheck_api_key}"
-             logger.info(f"Factcheck URL: {factcheck_url}")
-
-             # Make request with reduced timeout
-             response = requests.get(factcheck_url, timeout=7)
-             response.raise_for_status()
-             data = safe_json_parse(response, "factcheck")
-
-             # Safely extract claims
-             claims = data.get("claims", [])
-             if not isinstance(claims, list):
-                 logger.warning(f"Unexpected claims type: {type(claims)}")
-                 claims = []
-
-             if claims:  # If results found
-                 logger.info(f"Results found for query '{current_query}'.")
-                 for item in claims:
-                     try:
-                         # Ensure item is a dictionary
-                         if not isinstance(item, dict):
-                             logger.warning(f"Skipping non-dictionary item: {type(item)}")
-                             continue
-
-                         claim_text = str(item.get("text", ""))
-                         # Truncate claim text
-                         if len(claim_text) > 200:
-                             claim_text = claim_text[:197] + "..."
-
-                         reviews = item.get("claimReview", [])
-
-                         # Ensure reviews is a list
-                         if not isinstance(reviews, list):
-                             logger.warning(f"Unexpected reviews type: {type(reviews)}")
-                             reviews = []
-
-                         for review in reviews:
-                             # Ensure review is a dictionary
-                             if not isinstance(review, dict):
-                                 logger.warning(f"Skipping non-dictionary review: {type(review)}")
-                                 continue
-
-                             publisher = str(review.get("publisher", {}).get("name", "Unknown Source"))
-                             rating = str(review.get("textualRating", "Unknown"))
-                             review_url = str(review.get("url", ""))
-
-                             if claim_text:
-                                 factcheck_results.append(
-                                     f"Claim: {claim_text}, Rating: {rating}, " +
-                                     f"Source: {publisher}, URL: {review_url}"
-                                 )
-                                 source_count["factcheck"] += 1
-
-                     except Exception as e:
-                         logger.error(f"Error processing FactCheck result: {e}")
-
-                 break  # Break once we have results
-             else:
-                 logger.info(f"No results for query '{current_query}', trying shorter version.")
-
-         except Exception as e:
-             logger.error(f"Error in FactCheck retrieval: {e}")
-
-     # Safely log evidence retrieval
-     try:
-         success = bool(factcheck_results)
-         performance_tracker.log_evidence_retrieval(success, source_count)
-     except Exception as e:
-         logger.error(f"Error logging evidence retrieval: {e}")
-
-     if not factcheck_results:
-         logger.warning("No factcheck evidence found after trying all query variants.")
-
-     return factcheck_results
-
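The parser above reads exactly these fields from the Fact Check Tools response; an illustrative item with invented values, written as the Python dict the code would see after JSON parsing:

    sample_response = {
        "claims": [
            {
                "text": "Example claim being checked...",
                "claimReview": [
                    {
                        "publisher": {"name": "Example Fact Checker"},
                        "textualRating": "False",
                        "url": "https://example.org/fact-checks/123",
                    }
                ],
            }
        ]
    }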
457
- @api_error_handler("newsapi")
458
- def retrieve_news_articles(claim):
459
- """Retrieve evidence from NewsAPI for a given claim with improved single request approach"""
460
- logger.info(f"Retrieving evidence from News API for: {claim}")
461
-
462
- # Get API key
463
- news_api_key = NEWS_API_KEY
464
- if not news_api_key:
465
- logger.error("No NewsAPI key available")
466
- return []
467
-
468
- news_results = []
469
- source_count = {"news": 0}
470
-
471
- # Get date range for recent news
472
- from_date, to_date = get_recent_date_range()
473
- logger.info(f"Filtering for news from {from_date} to {to_date}")
474
-
475
- try:
476
- # Extract a simplified claim for better matching
477
- shortened_claim = shorten_claim_for_evidence(claim)
478
-
479
- # Use a single endpoint with proper parameters
480
- encoded_query = urlencode({"q": shortened_claim})
481
-
482
- # Use the 'everything' endpoint as it's more comprehensive
483
- news_api_url = f"https://newsapi.org/v2/everything?{encoded_query}&apiKey={news_api_key}&language=en&pageSize=5&sortBy=publishedAt&from={from_date}&to={to_date}"
484
-
485
- log_url = news_api_url.replace(news_api_key, "API_KEY_REDACTED")
486
- logger.info(f"Requesting: {log_url}")
487
-
488
- # Make a single request with proper headers and reduced timeout
489
- headers = {
490
- "User-Agent": "MisinformationDetectionResearchBot/1.0",
491
- "X-Api-Key": news_api_key,
492
- "Accept": "application/json"
493
- }
494
-
495
- response = requests.get(
496
- news_api_url,
497
- headers=headers,
498
- timeout=8
499
- )
500
-
501
- logger.info(f"Response status: {response.status_code}")
502
-
503
- if response.status_code == 200:
504
- data = safe_json_parse(response, "newsapi")
505
-
506
- if data.get("status") == "ok":
507
- articles = data.get("articles", [])
508
- logger.info(f"Found {len(articles)} articles")
509
-
510
- for article in articles:
511
- try:
512
- # Robust article parsing
513
- title = str(article.get("title", ""))
514
- description = str(article.get("description", ""))
515
- content = str(article.get("content", ""))
516
- source_name = str(article.get("source", {}).get("name", "Unknown"))
517
- url = str(article.get("url", ""))
518
- published_at = str(article.get("publishedAt", ""))
519
-
520
- # Parse date to prioritize recent content
521
- article_date = None
522
- try:
523
- if published_at:
524
- article_date = datetime.strptime(published_at.split('T')[0], '%Y-%m-%d')
525
- except Exception as date_error:
526
- logger.warning(f"Could not parse date: {published_at}")
527
-
528
- # Calculate recency score (higher = more recent)
529
- recency_score = 1.0 # Default
530
- if article_date:
531
- days_old = (datetime.now() - article_date).days
532
- if days_old == 0: # Today
533
- recency_score = 3.0
534
- elif days_old == 1: # Yesterday
535
- recency_score = 2.0
536
-
537
- # Use description if content is empty or too short
538
- if not content or len(content) < 50:
539
- content = description
540
-
541
- # Truncate content to reduce token usage
542
- if len(content) > 250:
543
- content = content[:247] + "..."
544
-
545
- # Ensure meaningful content
546
- if title and (content or description):
547
- news_item = {
548
- "text": (
549
- f"Title: {title}, " +
550
- f"Source: {source_name}, " +
551
- f"Date: {published_at}, " +
552
- f"URL: {url}, " +
553
- f"Content: {content}"
554
- ),
555
- "recency_score": recency_score,
556
- "date": article_date
557
- }
558
- news_results.append(news_item)
559
- source_count["news"] += 1
560
- logger.info(f"Added article: {title}")
561
-
562
- except Exception as article_error:
563
- logger.error(f"Error processing article: {article_error}")
564
-
565
- # Sort results by recency
566
- if news_results:
567
- news_results.sort(key=lambda x: x.get('recency_score', 0), reverse=True)
568
-
569
- except Exception as query_error:
570
- logger.error(f"Error processing query: {query_error}")
571
-
572
- # Convert to plain text list for compatibility with existing code
573
- news_texts = [item["text"] for item in news_results]
574
-
575
- # Log evidence retrieval
576
- try:
577
- success = bool(news_texts)
578
- performance_tracker.log_evidence_retrieval(success, source_count)
579
- except Exception as log_error:
580
- logger.error(f"Error logging evidence retrieval: {log_error}")
581
-
582
- # Log results
583
- if news_texts:
584
- logger.info(f"Retrieved {len(news_texts)} news articles")
585
- else:
586
- logger.warning("No news articles found")
587
-
588
- return news_texts
589
-
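The recency weighting above is a simple step function, and results are sorted by it before being flattened to strings; a brief sketch of the observable behavior (claim text hypothetical, requires a NewsAPI key):

    # recency_score: published today -> 3.0, yesterday -> 2.0, older (within window) -> 1.0
    articles = retrieve_news_articles("Example recent-news claim")
    # returns plain "Title: ..., Source: ..., Date: ..." strings, most recent first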
590
- def retrieve_combined_evidence(claim):
591
- """
592
- Retrieve evidence from multiple sources in parallel and analyze relevance using semantic similarity
593
- with category-aware source prioritization and optimized parallel processing
594
- """
595
- logger.info(f"Starting evidence retrieval for: {claim}")
596
- start_time = time.time()
597
-
598
- # Use the category detector to prioritize sources
599
- from modules.category_detection import get_prioritized_sources, get_category_specific_rss_feeds
600
-
601
- # Get source priorities based on claim category
602
- priorities = get_prioritized_sources(claim)
603
- claim_category = priorities.get("category", "general")
604
- requires_recent_evidence = priorities.get("requires_recent", False)
605
-
606
- logger.info(f"Detected claim category: {claim_category} (recent: {requires_recent_evidence})")
607
-
608
- # Initialize results dictionary
609
- results = {
610
- "wikipedia": [],
611
- "wikidata": [],
612
- "claimreview": [],
613
- "news": [],
614
- "scholarly": [],
615
- "rss": []
616
- }
617
-
618
- # Track source counts and relevant evidence
619
- source_counts = {}
620
- relevant_evidence = {}
621
- total_evidence_count = 0
622
- relevant_evidence_count = 0
623
-
624
- # Define primary and secondary sources outside the try block
625
- # so they're available in the except block
626
- primary_sources = []
627
- for source_name in priorities.get("primary", []):
628
- if source_name == "wikipedia":
629
- primary_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
630
- elif source_name == "wikidata":
631
- primary_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
632
- elif source_name == "claimreview":
633
- primary_sources.append(("claimreview", retrieve_evidence_from_claimreview, claim))
634
- elif source_name == "news":
635
- primary_sources.append(("news", retrieve_news_articles, claim))
636
- elif source_name == "scholarly":
637
- primary_sources.append(("scholarly", retrieve_evidence_from_openalex, claim))
638
- elif source_name == "rss":
639
- # Get category-specific RSS max count
640
- max_results = 8 if requires_recent_evidence else 5
641
-
642
- # If the claim is science or technology related and we need to optimize
643
- # use category-specific RSS feeds
644
- if claim_category in ["science", "technology", "politics"]:
645
- # Get specialized RSS module to temporarily use category-specific feeds
646
- category_feeds = get_category_specific_rss_feeds(claim_category)
647
- if category_feeds:
648
- primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results, category_feeds))
649
- else:
650
- primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
651
- else:
652
- primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
653
-
654
- # Prepare secondary sources
655
- secondary_sources = []
656
- for source_name in priorities.get("secondary", []):
657
- if source_name == "wikipedia":
658
- secondary_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
659
- elif source_name == "wikidata":
660
- secondary_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
661
- elif source_name == "claimreview":
662
- secondary_sources.append(("claimreview", retrieve_evidence_from_claimreview, claim))
663
- elif source_name == "news":
664
- secondary_sources.append(("news", retrieve_news_articles, claim))
665
- elif source_name == "scholarly":
666
- secondary_sources.append(("scholarly", retrieve_evidence_from_openalex, claim))
667
- elif source_name == "rss":
668
- max_results = 5 if requires_recent_evidence else 3
669
- # Use category-specific feeds if available
670
- if claim_category in ["science", "technology", "politics"]:
671
- category_feeds = get_category_specific_rss_feeds(claim_category)
672
- if category_feeds:
673
- secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results, category_feeds))
674
- else:
675
- secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
676
- else:
677
- secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
678
-
679
- # Optimize parallel processing for evidence retrieval with early results processing
680
- try:
681
- # Define function to safely retrieve evidence
682
- def safe_retrieve(source_name, retrieval_func, *args):
683
- try:
684
- source_result = retrieval_func(*args) or []
685
- return source_name, source_result
686
- except Exception as e:
687
- logger.error(f"Error retrieving from {source_name}: {str(e)}")
688
- return source_name, []
689
-
690
- # Define function to analyze evidence relevance
691
- def analyze_evidence_quick(evidence_items, claim_text):
692
- if not evidence_items or not claim_text:
693
- return []
694
-
695
- # Extract important keywords from claim
696
- keywords = [word.lower() for word in claim_text.split() if len(word) > 3]
697
-
698
- # Check for direct relevance
699
- relevant_items = []
700
- for evidence in evidence_items:
701
- if not isinstance(evidence, str):
702
- continue
703
-
704
- evidence_lower = evidence.lower()
705
-
706
- # Check if evidence contains any important keywords from claim
707
- if any(keyword in evidence_lower for keyword in keywords):
708
- relevant_items.append(evidence)
709
- continue
710
-
711
- # Check for claim subject in evidence (e.g. "earth" in "earth is flat")
712
- claim_parts = claim_text.split()
713
- if len(claim_parts) > 0 and claim_parts[0].lower() in evidence_lower:
714
- relevant_items.append(evidence)
715
- continue
716
-
717
- return relevant_items
718
-
719
- # Use ThreadPoolExecutor with a reasonable number of workers
720
- # Start with primary sources first - use all available sources in parallel
721
- with ThreadPoolExecutor(max_workers=min(4, len(primary_sources))) as executor:
722
- # Submit all primary source tasks
723
- futures_to_source = {
724
- executor.submit(safe_retrieve, source_name, func, *args): source_name
725
- for source_name, func, *args in primary_sources
726
- }
727
-
728
- # Track completed sources
729
- completed_sources = set()
730
-
731
- # Process results as they complete using as_completed for early processing
732
- for future in as_completed(futures_to_source):
733
- try:
734
- source_name, source_results = future.result()
735
- results[source_name] = source_results
736
- source_counts[source_name] = len(source_results)
737
- completed_sources.add(source_name)
738
- logger.info(f"Retrieved {len(source_results)} results from {source_name}")
739
-
740
- # Quick relevance analysis
741
- if source_results:
742
- relevant_items = analyze_evidence_quick(source_results, claim)
743
- relevant_evidence[source_name] = relevant_items
744
- total_evidence_count += len(source_results)
745
- relevant_evidence_count += len(relevant_items)
746
- logger.info(f"Found {len(relevant_items)} relevant items out of {len(source_results)} from {source_name}")
747
-
748
- # Start background pre-analysis while waiting for other sources
749
- try:
750
- executor.submit(
751
- analyze_early_evidence,
752
- claim,
753
- source_name,
754
- source_results
755
- )
756
- except Exception as e:
757
- logger.error(f"Error in early evidence analysis: {e}")
758
-
759
- except Exception as e:
760
- logger.error(f"Error processing future result: {str(e)}")
761
-
762
- # Check if we have sufficient RELEVANT evidence from primary sources
763
- # If not enough relevant evidence, query secondary sources
764
- # in parallel even if we have a lot of total evidence
765
- if relevant_evidence_count < 2:
766
- logger.info(f"Only found {relevant_evidence_count} relevant evidence items, querying secondary sources")
767
-
768
- # Add Wikipedia and Wikidata if they weren't in primary sources and haven't been queried yet
769
- must_check_sources = []
770
- if "wikipedia" not in completed_sources:
771
- must_check_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
772
-
773
- if "wikidata" not in completed_sources:
774
- must_check_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
775
-
776
- # Combine with other secondary sources
777
- remaining_sources = must_check_sources + [
778
- (source_name, func, *args) for source_name, func, *args in secondary_sources
779
- if source_name not in completed_sources
780
- ]
781
-
782
- with ThreadPoolExecutor(max_workers=max(1, min(3, len(remaining_sources)))) as executor:
783
- # Submit all secondary source tasks
784
- futures_to_source = {
785
- executor.submit(safe_retrieve, source_name, func, *args): source_name
786
- for source_name, func, *args in remaining_sources
787
- }
788
-
789
- # Process results as they complete
790
- for future in as_completed(futures_to_source):
791
- try:
792
- source_name, source_results = future.result()
793
- results[source_name] = source_results
794
- source_counts[source_name] = len(source_results)
795
- logger.info(f"Retrieved {len(source_results)} results from {source_name}")
796
-
797
- # Quick relevance analysis for these as well
798
- if source_results:
799
- relevant_items = analyze_evidence_quick(source_results, claim)
800
- relevant_evidence[source_name] = relevant_items
801
- total_evidence_count += len(source_results)
802
- relevant_evidence_count += len(relevant_items)
803
- logger.info(f"Found {len(relevant_items)} relevant items out of {len(source_results)} from {source_name}")
804
- except Exception as e:
805
- logger.error(f"Error processing future result: {str(e)}")
806
-
807
- except Exception as e:
808
- logger.error(f"Error in parallel evidence retrieval: {str(e)}")
809
- # Fall back to sequential retrieval as a last resort
810
- try:
811
- logger.warning("Falling back to sequential retrieval due to parallel execution failure")
812
- # Sequential retrieval over the primary sources as a last-resort fallback
813
- for source_name, func, *args in primary_sources:
814
- try:
815
- results[source_name] = func(*args) or []
816
- source_counts[source_name] = len(results[source_name])
817
- except Exception as source_error:
818
- logger.error(f"Error in sequential {source_name} retrieval: {str(source_error)}")
819
-
820
- # For sequential retrieval, always check Wikipedia and Wikidata as fallbacks
821
- if not results.get("wikipedia"):  # completed_sources may be undefined if the parallel block failed early
822
- try:
823
- results["wikipedia"] = retrieve_evidence_from_wikipedia(claim) or []
824
- source_counts["wikipedia"] = len(results["wikipedia"])
825
- except Exception as e:
826
- logger.error(f"Error in fallback Wikipedia retrieval: {e}")
827
-
828
- if not results.get("wikidata"):
829
- try:
830
- results["wikidata"] = retrieve_evidence_from_wikidata(claim) or []
831
- source_counts["wikidata"] = len(results["wikidata"])
832
- except Exception as e:
833
- logger.error(f"Error in fallback Wikidata retrieval: {e}")
834
-
835
- except Exception as fallback_error:
836
- logger.error(f"Error in fallback sequential retrieval: {str(fallback_error)}")
837
-
838
- # Gather all evidence
839
- all_evidence = []
840
- for source, items in results.items():
841
- if isinstance(items, list):
842
- for item in items:
843
- if item and isinstance(item, str):
844
- all_evidence.append(item)
845
-
846
- # Skip processing if no evidence
847
- if not all_evidence:
848
- logger.warning("No evidence collected")
849
-
850
- # Fallback: try direct search for the claim subject
851
- try:
852
- logger.info("No evidence found, trying fallback subject search")
853
-
854
- # Extract the main subject using NLP
855
- nlp = get_nlp_model()
856
- doc = nlp(claim)
857
-
858
- # Find main subject entities or nouns
859
- subjects = []
860
- for ent in doc.ents:
861
- if ent.label_ in ["PERSON", "ORG", "GPE"]:
862
- subjects.append(ent.text)
863
-
864
- # If no entities found, use first noun phrase
865
- if not subjects:
866
- for chunk in doc.noun_chunks:
867
- subjects.append(chunk.text)
868
- break
869
-
870
- if subjects:
871
- # Try a direct search with just the subject
872
- logger.info(f"Trying fallback search with subject: {subjects[0]}")
873
-
874
- # Make sure we try Wikipedia for the subject regardless of priorities
875
- try:
876
- wiki_evidence = retrieve_evidence_from_wikipedia(subjects[0]) or []
877
- all_evidence.extend(wiki_evidence)
878
- logger.info(f"Retrieved {len(wiki_evidence)} results from fallback Wikipedia search")
879
- except Exception as e:
880
- logger.error(f"Error in fallback Wikipedia search: {e}")
881
-
882
- # If still no evidence, try other sources
883
- if not all_evidence:
884
- # Do fallback searches in parallel
885
- with ThreadPoolExecutor(max_workers=2) as executor:
886
- fallback_futures = {
887
- "news": executor.submit(retrieve_news_articles, subjects[0]),
888
- "wikidata": executor.submit(retrieve_evidence_from_wikidata, subjects[0])
889
- }
890
-
891
- # Process results as they complete
892
- for source, future in fallback_futures.items():
893
- try:
894
- fallback_results = future.result() or []
895
- if fallback_results:
896
- all_evidence.extend(fallback_results[:2]) # Add up to 2 results from each
897
- logger.info(f"Retrieved {len(fallback_results)} results from fallback {source} search")
898
- except Exception as e:
899
- logger.error(f"Error in fallback {source} search: {str(e)}")
900
-
901
- except Exception as subj_error:
902
- logger.error(f"Error in fallback subject search: {str(subj_error)}")
903
-
904
- # If still no evidence, return empty list
905
- if not all_evidence:
906
- return []
907
-
908
- # Use semantic analysis to score and select the most relevant evidence
909
- try:
910
- # For science and technology claims, boost the weight of scholarly sources
911
- if claim_category in ["science", "technology"]:
912
- from config import SOURCE_CREDIBILITY
913
- # Create a temporary copy with boosted reliability for relevant sources
914
- enhanced_credibility = dict(SOURCE_CREDIBILITY)
915
-
916
- # Add enhanced weights for scientific sources
917
- from modules.category_detection import SOURCE_RELIABILITY_BY_CATEGORY
918
- for domain, reliability in SOURCE_RELIABILITY_BY_CATEGORY.get(claim_category, {}).items():
919
- enhanced_credibility[domain] = reliability
920
-
921
- # Use the enhanced credibility for evidence analysis
922
- analyzed_evidence = analyze_evidence_relevance(claim, all_evidence, enhanced_credibility)
923
- else:
924
- # Analyze evidence relevance using semantic similarity with default weights
925
- from config import SOURCE_CREDIBILITY
926
- analyzed_evidence = analyze_evidence_relevance(claim, all_evidence, SOURCE_CREDIBILITY)
927
-
928
- # Log evidence scoring
929
- logger.info(f"Analyzed {len(analyzed_evidence)} evidence items")
930
-
931
- # Select diverse, relevant evidence items
932
- final_evidence = select_diverse_evidence(analyzed_evidence, max_items=5)
933
-
934
- # Log source distribution and selected count
935
- logger.info(f"Evidence source distribution: {source_counts}")
936
- logger.info(f"Selected evidence count: {len(final_evidence)}")
937
-
938
- # Return maximum 5 evidence items (to control API costs)
939
- return final_evidence[:5]
940
-
941
- except Exception as e:
942
- logger.error(f"Error in evidence analysis: {str(e)}")
943
- # Fallback to simple selection (top 5 items)
944
- return all_evidence[:5]
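
The retrieval flow above is essentially the standard ThreadPoolExecutor + as_completed pattern: submit every source, handle whichever finishes first, and never let one failing source break the rest. A minimal, self-contained sketch of that pattern (the two fetchers are hypothetical stand-ins, not the module's real retrievers):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def fetch_a(query):
        # Hypothetical retriever; the real module queries Wikipedia, Wikidata, news APIs, etc.
        return [f"A: evidence about {query}"]

    def fetch_b(query):
        return [f"B: evidence about {query}"]

    def retrieve_parallel(query, sources):
        """Run retrievers concurrently; collect results as each completes."""
        results = {}
        with ThreadPoolExecutor(max_workers=max(1, min(4, len(sources)))) as executor:
            futures = {executor.submit(func, query): name for name, func in sources}
            for future in as_completed(futures):
                name = futures[future]
                try:
                    results[name] = future.result() or []
                except Exception:
                    results[name] = []  # one failed source must not break the others
        return results

    print(retrieve_parallel("earth orbits the sun", [("a", fetch_a), ("b", fetch_b)]))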
 
modules/explanation.py DELETED
@@ -1,181 +0,0 @@
1
- import logging
2
- import re
3
- import ast
4
- from utils.models import get_llm_model
5
-
6
- logger = logging.getLogger("misinformation_detector")
7
-
8
- def extract_most_relevant_evidence(evidence_results):
9
- """
10
- Intelligently extract the most relevant piece of evidence
11
-
12
- Args:
13
- evidence_results (list): List of evidence items
14
-
15
- Returns:
16
- str: Most relevant evidence piece
17
- """
18
- if not evidence_results:
19
- return None
20
-
21
- # If evidence is a dictionary with 'evidence' key
22
- if isinstance(evidence_results[0], dict):
23
- # Sort by confidence if available
24
- sorted_evidence = sorted(
25
- evidence_results,
26
- key=lambda x: x.get('confidence', 0),
27
- reverse=True
28
- )
29
-
30
- # Return the evidence from the highest confidence item
31
- for item in sorted_evidence:
32
- evidence = item.get('evidence')
33
- if evidence:
34
- return evidence
35
-
36
- # If plain list of evidence
37
- return next((ev for ev in evidence_results if ev and isinstance(ev, str)), None)
38
-
39
- def generate_explanation(claim, evidence_results, truth_label, confidence=None):
40
- """
41
- Generate an explanation for the claim's classification
42
-
43
- Args:
44
- claim (str): The original claim
45
- evidence_results (list/str): Evidence supporting the classification
46
- truth_label (str): Classification of the claim
47
- confidence (float): Confidence level (0-1)
48
-
49
- Returns:
50
- str: Explanation of the claim's classification
51
- """
52
- logger.info(f"Generating explanation for claim with verdict: {truth_label}")
53
-
54
- try:
55
- # Normalize evidence_results to a list
56
- if not isinstance(evidence_results, list):
57
- try:
58
- evidence_results = ast.literal_eval(str(evidence_results)) if evidence_results else []
59
- except (ValueError, SyntaxError, TypeError):
60
- evidence_results = [evidence_results] if evidence_results else []
61
-
62
- # Get the LLM model
63
- explanation_model = get_llm_model()
64
-
65
- # Extract most relevant evidence
66
- most_relevant_evidence = extract_most_relevant_evidence(evidence_results)
67
-
68
- # Prepare evidence text for prompt
69
- evidence_text = "\n".join([
70
- f"Evidence {i+1}: {str(ev)[:200] + '...' if len(str(ev)) > 200 else str(ev)}"
71
- for i, ev in enumerate(evidence_results[:3])
72
- ])
73
-
74
- # Convert confidence to percentage and description
75
- confidence_desc = ""
76
- if confidence is not None:
77
- confidence_pct = int(confidence * 100)
78
- if confidence < 0.3:
79
- confidence_desc = f"very low confidence ({confidence_pct}%)"
80
- elif confidence < 0.5:
81
- confidence_desc = f"low confidence ({confidence_pct}%)"
82
- elif confidence < 0.7:
83
- confidence_desc = f"moderate confidence ({confidence_pct}%)"
84
- elif confidence < 0.9:
85
- confidence_desc = f"high confidence ({confidence_pct}%)"
86
- else:
87
- confidence_desc = f"very high confidence ({confidence_pct}%)"
88
- else:
89
- # Determine confidence context from label if not explicitly provided
90
- confidence_desc = (
91
- "high confidence" if "High Confidence" in truth_label else
92
- "moderate confidence" if "Likely" in truth_label else
93
- "low confidence"
94
- )
95
-
96
- # Create prompt with specific instructions based on the type of claim
97
- has_negation = any(neg in claim.lower() for neg in ["not", "no longer", "isn't", "doesn't", "won't", "cannot"])
98
-
99
- # For claims with "True" verdict
100
- if "True" in truth_label:
101
- prompt = f"""
102
- Claim: "{claim}"
103
-
104
- Verdict: {truth_label} (with {confidence_desc})
105
-
106
- Available Evidence:
107
- {evidence_text}
108
-
109
- Task: Generate a clear explanation that:
110
- 1. Clearly states that the claim IS TRUE based on the evidence
111
- 2. {"Pay special attention to the logical relationship since the claim contains negation" if has_negation else "Explains why the evidence supports the claim"}
112
- 3. Uses confidence level of {confidence_desc}
113
- 4. Highlights the most relevant supporting evidence
114
- 5. Is factual and precise
115
- """
116
-
117
- # For claims with "False" verdict
118
- elif "False" in truth_label:
119
- prompt = f"""
120
- Claim: "{claim}"
121
-
122
- Verdict: {truth_label} (with {confidence_desc})
123
-
124
- Available Evidence:
125
- {evidence_text}
126
-
127
- Task: Generate a clear explanation that:
128
- 1. Clearly states that the claim IS FALSE based on the evidence
129
- 2. {"Pay special attention to the logical relationship since the claim contains negation" if has_negation else "Explains why the evidence contradicts the claim"}
130
- 3. Uses confidence level of {confidence_desc}
131
- 4. Highlights the contradicting evidence
132
- 5. Is factual and precise
133
-
134
- IMPORTANT: If the claim contains negation (words like 'not', 'no longer', etc.), be extra careful with the logical relationship between the evidence and the claim.
135
- """
136
-
137
- # For uncertain claims
138
- else:
139
- prompt = f"""
140
- Claim: "{claim}"
141
-
142
- Verdict: {truth_label} (with {confidence_desc})
143
-
144
- Available Evidence:
145
- {evidence_text}
146
-
147
- Task: Generate a clear explanation that:
148
- 1. Clearly states that there is insufficient evidence to determine if the claim is true or false
149
- 2. Explains what information is missing or why the available evidence is insufficient
150
- 3. Uses confidence level of {confidence_desc}
151
- 4. Makes NO speculation about whether the claim might be true or false
152
- 5. Mentions that the user should seek information from other reliable sources
153
- """
154
-
155
- # Generate explanation with multiple attempts
156
- max_attempts = 3
157
- for attempt in range(max_attempts):
158
- try:
159
- # Invoke the model
160
- response = explanation_model.invoke(prompt)
161
- explanation = response.content.strip()
162
-
163
- # Validate explanation length
164
- if explanation and len(explanation.split()) >= 5:
165
- return explanation
166
-
167
- except Exception as attempt_error:
168
- logger.error(f"Explanation generation attempt {attempt+1} failed: {str(attempt_error)}")
169
-
170
- # Ultimate fallback explanation
171
- if "Uncertain" in truth_label:
172
- return f"The claim '{claim}' cannot be verified due to insufficient evidence. The available information does not provide clear support for or against this claim. Consider consulting reliable sources for verification."
173
- elif "True" in truth_label:
174
- return f"The claim '{claim}' is supported by the evidence with {confidence_desc}. {most_relevant_evidence or 'The evidence indicates this claim is accurate.'}"
175
- else:
176
- return f"The claim '{claim}' is contradicted by the evidence with {confidence_desc}. {most_relevant_evidence or 'The evidence indicates this claim is not accurate.'}"
177
-
178
- except Exception as e:
179
- logger.error(f"Comprehensive error in explanation generation: {str(e)}")
180
- # Final fallback
181
- return f"The claim is classified as {truth_label} based on the available evidence."
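
The confidence wording in generate_explanation maps a 0-1 score onto five fixed buckets. The same mapping, pulled out as a standalone helper for illustration (thresholds copied from the function above):

    def describe_confidence(confidence):
        """Map a 0-1 confidence score to the descriptive buckets used above."""
        pct = int(confidence * 100)
        if confidence < 0.3:
            return f"very low confidence ({pct}%)"
        elif confidence < 0.5:
            return f"low confidence ({pct}%)"
        elif confidence < 0.7:
            return f"moderate confidence ({pct}%)"
        elif confidence < 0.9:
            return f"high confidence ({pct}%)"
        return f"very high confidence ({pct}%)"

    assert describe_confidence(0.82) == "high confidence (82%)"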
 
modules/rss_feed.py DELETED
@@ -1,391 +0,0 @@
1
- import feedparser
2
- import time
3
- import logging
4
- import re
5
- import ssl
6
- import requests
7
- from datetime import datetime, timedelta
8
- from threading import Timer
9
- from urllib.parse import urlparse
10
- from concurrent.futures import ThreadPoolExecutor, as_completed
11
-
12
- logger = logging.getLogger("misinformation_detector")
13
-
14
- # Disable SSL certificate verification for feeds with self-signed certs
15
- ssl._create_default_https_context = ssl._create_unverified_context
16
-
17
- # List of RSS feeds to check for news
18
- # These are popular news sources with reliable and frequently updated RSS feeds
19
- RSS_FEEDS = [
20
- # --------------------
21
- # 🌐 General World News
22
- # --------------------
23
- # "http://rss.cnn.com/rss/cnn_topstories.rss", # CNN Top Stories; Removed in round 2
24
- "http://rss.cnn.com/rss/cnn_world.rss", # CNN World News; Duplicate with category_detection
25
- # "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml", # NYT Home Page
26
- "https://rss.nytimes.com/services/xml/rss/nyt/World.xml", # NYT World News; Duplicate with category_detection
27
- # "https://rss.nytimes.com/services/xml/rss/nyt/US.xml", # NYT US News
28
- "https://feeds.washingtonpost.com/rss/world", # The Washington Post World News; Removed in round 2
29
- # "https://feeds.washingtonpost.com/rss/national", # The Washington Post National News
30
- # "https://feeds.bbci.co.uk/news/rss.xml", # BBC News - Top Stories; Removed in round 2
31
- "https://feeds.bbci.co.uk/news/world/rss.xml", # BBC News - World
32
- # "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=w&hl=en-IN", # Google News India - World; Removed in round 2
33
- # "https://news.google.com/rss?gl=US&ceid=US:en&topic=w&hl=en-US", # Google News US - World; Removed in round 2
34
-
35
- # --------------------
36
- # 🧠 Tech & Startup News (Global)
37
- # --------------------
38
- "https://techcrunch.com/feed/", # TechCrunch - Startup and Technology News; Duplicate with category_detection
39
- "https://venturebeat.com/feed/", # VentureBeat - Tech News
40
- # "https://www.theverge.com/rss/index.xml", # The Verge - Technology News
41
- "https://www.wired.com/feed/rss", # Wired - Technology News
42
- "https://www.cnet.com/rss/news/", # CNET - Technology News
43
- # "https://sifted.eu/feed/", # Sifted - European Startups and Tech
44
- # "https://feeds.feedburner.com/fastcompany/headlines", # Fast Company - Business Innovation
45
- # "https://feeds.bbci.co.uk/news/technology/rss.xml", # BBC News - Technology
46
- "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=t&hl=en-IN", # Google News India - Technology
47
- "https://news.google.com/rss?gl=US&ceid=US:en&topic=t&hl=en-US", # Google News US - Technology
48
-
49
- # --------------------
50
- # 💼 Startup & VC Focused
51
- # --------------------
52
- "https://news.crunchbase.com/feed/", # Crunchbase News - Startup Funding
53
- # "https://avc.com/feed/", # AVC - Musings of a VC in NYC
54
- "https://techstartups.com/feed/", # Tech Startups - Startup News
55
- # "https://tech.eu/feed/", # Tech.eu - European Tech News
56
- # "https://www.menabytes.com/feed/", # MENAbytes - Middle East & North Africa Startups
57
- # "http://feeds.feedburner.com/venturebeat/SZYF", # VentureBeat - Deals
58
-
59
- # --------------------
60
- # 📰 Global Business & Corporate Feeds
61
- # --------------------
62
- "https://feeds.bloomberg.com/technology/news.rss", # Bloomberg Technology News
63
- "https://www.ft.com/technology?format=rss", # Financial Times Technology News
64
- # "https://ir.thomsonreuters.com/rss/news-releases.xml", # Thomson Reuters Press Releases
65
- # "https://feeds.bbci.co.uk/news/business/rss.xml", # BBC News - Business
66
- "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=b&hl=en-IN", # Google News India - Business
67
- # "https://news.google.com/rss?gl=US&ceid=US:en&topic=b&hl=en-US", # Google News US - Business; Removed in round 2
68
-
69
- # --------------------
70
- # 🇮🇳 India-specific News
71
- # --------------------
72
- "https://inc42.com/feed/", # Inc42 - Indian Startups and Technology
73
- # "https://yourstory.com/rss", # YourStory - Indian Startup Stories
74
- # "https://economictimes.indiatimes.com/startups/rssfeeds/49979279.cms", # Economic Times - Startups
75
- "https://timesofindia.indiatimes.com/rssfeedstopstories.cms", # TOI - Top Stories
76
- "https://timesofindia.indiatimes.com/rssfeedmostrecent.cms", # TOI - Most Recent Stories
77
- "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms", # TOI - India News
78
- "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms", # TOI - World News
79
- "https://timesofindia.indiatimes.com/rssfeeds/1898055.cms", # TOI - Business News
80
- "https://timesofindia.indiatimes.com/rssfeeds/54829575.cms", # TOI - Cricket News
81
- "https://timesofindia.indiatimes.com/rssfeeds/4719148.cms", # TOI - Sports News
82
- "https://timesofindia.indiatimes.com/rssfeeds/-2128672765.cms", # TOI - Science News
83
- # "https://timesofindia.indiatimes.com/rssfeeds/66949542.cms", # TOI - Technology News
84
- # "https://timesofindia.indiatimes.com/rssfeeds/1081479906.cms", # TOI - Education News
85
-
86
- # --------------------
87
- # 🏏 Sports News (Global + Cricket)
88
- # --------------------
89
- "https://www.espn.com/espn/rss/news", # ESPN - Top Sports News; Duplicate with category_detection
90
- # "https://api.foxsports.com/v2/content/optimized-rss?partnerKey=MB0Wehpmuj2lUhuRhQaafhBjAJqaPU244mlTDK1i&size=30", # Fox Sports; Removed in round 2
91
- "https://feeds.skynews.com/feeds/rss/sports.xml", # Sky News - Sports
92
- "https://sports.ndtv.com/rss/all", # NDTV Sports
93
- "https://www.espncricinfo.com/rss/content/story/feeds/0.xml", # ESPN Cricinfo - Cricket News; Duplicate with category_detection
94
- # "https://crickettimes.com/feed/", # Cricket Times - Cricket News
95
-
96
- # --------------------
97
- # ✅ Fact-Checking Sources
98
- # --------------------
99
- "https://www.snopes.com/feed/", # Snopes - Fact Checking; Duplicate with category_detection
100
- "https://www.politifact.com/rss/all/", # PolitiFact - Fact Checking; Duplicate with category_detection
101
-
102
- # --------------------
103
- # 🗳️ Politics & Policy (General)
104
- # --------------------
105
- "https://feeds.bbci.co.uk/news/politics/rss.xml", # BBC News - Politics; Duplicate with category_detection
106
- "https://feeds.bbci.co.uk/news/science_and_environment/rss.xml", # BBC - Science & Environment
107
-
108
- # --------------------
109
- # 🗳️ Science
110
- # --------------------
111
- "https://www.nature.com/nature.rss", # Nature science; Duplicate with category_detection
112
- "https://feeds.science.org/rss/science-advances.xml" # science.org
113
- ]
114
-
115
- def clean_html(raw_html):
116
- """Remove HTML tags from text"""
117
- if not raw_html:
118
- return ""
119
- clean_regex = re.compile('<.*?>')
120
- clean_text = re.sub(clean_regex, '', raw_html)
121
- # Remove extra whitespace
122
- clean_text = re.sub(r'\s+', ' ', clean_text).strip()
123
- return clean_text
124
-
125
- def parse_feed(feed_url, timeout=5):
126
- """
127
- Parse a single RSS feed with proper timeout handling
128
- Uses requests with timeout first, then passes content to feedparser
129
- """
130
- try:
131
- # Use requests with timeout to fetch the RSS content
132
- response = requests.get(feed_url, timeout=timeout)
133
- response.raise_for_status()
134
-
135
- # Then parse the content with feedparser (which doesn't support timeout)
136
- feed = feedparser.parse(response.content)
137
-
138
- # Basic validation of the feed
139
- if hasattr(feed, 'entries') and feed.entries:
140
- return feed
141
- else:
142
- logger.warning(f"Feed {feed_url} parsed but contains no entries")
143
- return None
144
-
145
- except requests.exceptions.Timeout:
146
- logger.warning(f"Timeout while fetching feed {feed_url}")
147
- return None
148
- except requests.exceptions.RequestException as e:
149
- logger.error(f"Request error fetching feed {feed_url}: {str(e)}")
150
- return None
151
- except Exception as e:
152
- logger.error(f"Error parsing feed {feed_url}: {str(e)}")
153
- return None
154
-
155
- def fetch_all_feeds(feeds_list=None, max_workers=5, timeout=5):
156
- """
157
- Fetch multiple RSS feeds with proper timeout handling
158
- Returns a list of (domain, feed) tuples for successfully fetched feeds
159
- """
160
- # Use default RSS_FEEDS list if none provided
161
- if feeds_list is None:
162
- feeds_list = RSS_FEEDS
163
-
164
- results = []
165
-
166
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
167
- future_to_url = {executor.submit(parse_feed, url, timeout): url for url in feeds_list}
168
- for future in as_completed(future_to_url):
169
- url = future_to_url[future]
170
- try:
171
- feed = future.result()
172
- if feed and hasattr(feed, 'entries') and feed.entries:
173
- # Extract domain for source attribution
174
- domain = urlparse(url).netloc
175
- results.append((domain, feed))
176
- logger.info(f"Successfully fetched {domain} with {len(feed.entries)} entries")
177
- except Exception as e:
178
- logger.error(f"Error processing {url}: {str(e)}")
179
-
180
- return results
181
-
182
- def extract_date(entry):
183
- """Extract and normalize publication date from entry"""
184
- for date_field in ['published_parsed', 'updated_parsed', 'created_parsed']:
185
- if hasattr(entry, date_field) and getattr(entry, date_field):
186
- try:
187
- # Convert time tuple to datetime
188
- time_tuple = getattr(entry, date_field)
189
- return datetime(time_tuple[0], time_tuple[1], time_tuple[2],
190
- time_tuple[3], time_tuple[4], time_tuple[5])
191
- except Exception as e:
192
- logger.debug(f"Error parsing {date_field}: {e}")
193
- continue
194
-
195
- # Try string dates
196
- for date_field in ['published', 'updated', 'pubDate']:
197
- if hasattr(entry, date_field) and getattr(entry, date_field):
198
- try:
199
- date_str = getattr(entry, date_field)
200
- # Try various formats
201
- for fmt in ['%a, %d %b %Y %H:%M:%S %z', '%a, %d %b %Y %H:%M:%S %Z',
202
- '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z']:
203
- try:
204
- return datetime.strptime(date_str, fmt)
205
- except ValueError:
206
- continue
207
- except Exception as e:
208
- logger.debug(f"Error parsing date string {date_field}: {e}")
209
- continue
210
-
211
- # Default to current time if parsing fails
212
- return datetime.now()
213
-
214
- def is_recent(entry_date, max_days=3):
215
- """Check if an entry is recent (within the last few days)"""
216
- if not entry_date:
217
- return False
218
- cutoff = datetime.now() - timedelta(days=max_days)
219
- return entry_date > cutoff
220
-
221
- def get_entry_relevance(entry, query_terms, domain):
222
- """Calculate relevance score for an entry based on query match and recency"""
223
- if not hasattr(entry, 'title') or not entry.title:
224
- return 0
225
-
226
- # Extract text content
227
- title = entry.title or ""
228
- description = clean_html(entry.description) if hasattr(entry, 'description') else ""
229
- content = ""
230
- if hasattr(entry, 'content'):
231
- for content_item in entry.content:
232
- if 'value' in content_item:
233
- content += clean_html(content_item['value']) + " "
234
-
235
- # Extract published date
236
- pub_date = extract_date(entry)
237
-
238
- # Calculate recency score (0-1)
239
- recency_score = 0
240
- if pub_date:
241
- days_old = (datetime.now() - pub_date).days
242
- if days_old <= 1: # Today or yesterday
243
- recency_score = 1.0
244
- elif days_old <= 2:
245
- recency_score = 0.8
246
- elif days_old <= 3:
247
- recency_score = 0.5
248
- else:
249
- recency_score = 0.2
250
-
251
- # Calculate relevance score based on keyword matches
252
- text = f"{title} {description} {content}".lower()
253
-
254
- # Count how many query terms appear in the content
255
- query_terms_lower = [term.lower() for term in query_terms]
256
- matches = sum(1 for term in query_terms_lower if term in text)
257
-
258
- # Calculate match score (0-1)
259
- match_score = min(1.0, matches / max(1, len(query_terms) * 0.7))
260
-
261
- # Boost score for exact phrase matches
262
- query_phrase = " ".join(query_terms_lower)
263
- if query_phrase in text:
264
- match_score += 0.5
265
-
266
- # Additional boost for title matches (they're more relevant)
267
- title_matches = sum(1 for term in query_terms_lower if term in title.lower())
268
- if title_matches > 0:
269
- match_score += 0.2 * (title_matches / len(query_terms_lower))
270
-
271
- # Source quality factor (can be adjusted based on source reliability)
272
- source_factor = 1.0
273
- high_quality_domains = ['bbc.co.uk', 'nytimes.com', 'reuters.com', 'washingtonpost.com',
274
- 'espncricinfo.com', 'cricbuzz.com', 'snopes.com']
275
- if any(quality_domain in domain for quality_domain in high_quality_domains):
276
- source_factor = 1.2
277
-
278
- # Calculate final score; the source quality factor scales the combined score
279
- final_score = ((match_score * 0.6) + (recency_score * 0.4)) * source_factor
280
-
281
- return min(1.0, final_score) # Cap at 1.0
282
-
283
- def retrieve_evidence_from_rss(claim, max_results=3, category_feeds=None):
284
- """
285
- Retrieve evidence from RSS feeds for a given claim
286
-
287
- Args:
288
- claim (str): The claim to verify
289
- max_results (int): Maximum number of results to return
290
- category_feeds (list, optional): List of category-specific RSS feeds to check
291
-
292
- Returns:
293
- list: List of relevant evidence items
294
- """
295
- start_time = time.time()
296
- logger.info(f"Retrieving evidence from RSS feeds for: {claim}")
297
-
298
- # Extract key terms from claim
299
- terms = [term.strip() for term in re.findall(r'\b\w+\b', claim) if len(term.strip()) > 2]
300
-
301
- try:
302
- # Use category-specific feeds if provided
303
- feeds_to_use = category_feeds if category_feeds else RSS_FEEDS
304
-
305
- # Log which feeds we're using
306
- if category_feeds:
307
- logger.info(f"Using {len(category_feeds)} category-specific RSS feeds")
308
- else:
309
- logger.info(f"Using {len(RSS_FEEDS)} default RSS feeds")
310
-
311
- # Limit the number of feeds to process for efficiency
312
- if len(feeds_to_use) > 10:
313
- # If we have too many feeds, select a subset
314
- # Prioritize fact-checking sources
315
- fact_check_feeds = [feed for feed in feeds_to_use if "fact" in feed.lower() or "snopes" in feed.lower() or "politifact" in feed.lower()]
316
- other_feeds = [feed for feed in feeds_to_use if feed not in fact_check_feeds]
317
-
318
- # Take all fact-checking feeds plus a random selection of others
319
- import random
320
- selected_feeds = fact_check_feeds + random.sample(other_feeds, max(0, min(10 - len(fact_check_feeds), len(other_feeds))))
321
- else:
322
- selected_feeds = feeds_to_use
323
-
324
- # Fetch all feeds in parallel with the selected feeds
325
- feeds = fetch_all_feeds(selected_feeds)
326
-
327
- if not feeds:
328
- logger.warning("No RSS feeds could be fetched")
329
- return []
330
-
331
- all_entries = []
332
-
333
- # Process all feed entries
334
- for domain, feed in feeds:
335
- for entry in feed.entries:
336
- # Calculate relevance score
337
- relevance = get_entry_relevance(entry, terms, domain)
338
-
339
- if relevance > 0.3: # Only consider somewhat relevant entries
340
- # Extract entry details
341
- title = entry.title if hasattr(entry, 'title') else "No title"
342
- link = entry.link if hasattr(entry, 'link') else ""
343
-
344
- # Extract and clean description/content
345
- description = ""
346
- if hasattr(entry, 'description'):
347
- description = clean_html(entry.description)
348
- elif hasattr(entry, 'summary'):
349
- description = clean_html(entry.summary)
350
- elif hasattr(entry, 'content'):
351
- for content_item in entry.content:
352
- if 'value' in content_item:
353
- description += clean_html(content_item['value']) + " "
354
-
355
- # Truncate description if too long
356
- if len(description) > 250:
357
- description = description[:247] + "..."
358
-
359
- # Get publication date
360
- pub_date = extract_date(entry)
361
- date_str = pub_date.strftime('%Y-%m-%d') if pub_date else "Unknown date"
362
-
363
- # Format as evidence text
364
- evidence_text = (
365
- f"Title: {title}, "
366
- f"Source: {domain} (RSS), "
367
- f"Date: {date_str}, "
368
- f"URL: {link}, "
369
- f"Content: {description}"
370
- )
371
-
372
- all_entries.append({
373
- "text": evidence_text,
374
- "relevance": relevance,
375
- "date": pub_date or datetime.now()
376
- })
377
-
378
- # Sort entries by relevance
379
- all_entries.sort(key=lambda x: x["relevance"], reverse=True)
380
-
381
- # Take top results
382
- top_entries = all_entries[:max_results]
383
-
384
- logger.info(f"Retrieved {len(top_entries)} relevant RSS items from {len(feeds)} feeds in {time.time() - start_time:.2f}s")
385
-
386
- # Return just the text portion
387
- return [entry["text"] for entry in top_entries]
388
-
389
- except Exception as e:
390
- logger.error(f"Error in RSS retrieval: {str(e)}")
391
- return []
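
The requests-then-feedparser split in parse_feed exists because feedparser.parse() itself has no timeout parameter; fetching the bytes with requests first gives a hard timeout. A minimal standalone sketch of the same pattern, using one feed URL from the list above:

    import feedparser
    import requests

    def fetch_feed(url, timeout=5):
        """Fetch feed bytes with a hard timeout, then hand them to feedparser."""
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return feedparser.parse(response.content)

    feed = fetch_feed("https://feeds.bbci.co.uk/news/world/rss.xml")
    for entry in feed.entries[:3]:
        print(entry.get("title", "No title"))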
 
modules/semantic_analysis.py DELETED
@@ -1,503 +0,0 @@
1
- import logging
2
- import numpy as np
3
- from sklearn.metrics.pairwise import cosine_similarity
4
- from datetime import datetime, timedelta
5
- import re
6
-
7
- # Import the centralized NLP model handler
8
- from utils.models import get_nlp_model
9
-
10
- logger = logging.getLogger("misinformation_detector")
11
-
12
- def extract_entities(text):
13
- """Extract named entities from text"""
14
- if not text:
15
- return []
16
-
17
- try:
18
- # Use centralized NLP model
19
- nlp_model = get_nlp_model()
20
- doc = nlp_model(text)
21
- entities = [
22
- {
23
- "text": ent.text,
24
- "label": ent.label_,
25
- "start": ent.start_char,
26
- "end": ent.end_char
27
- }
28
- for ent in doc.ents
29
- ]
30
- return entities
31
- except Exception as e:
32
- logger.error(f"Error extracting entities: {str(e)}")
33
- return []
34
-
35
- def get_vector_representation(text):
36
- """Get vector representation of text using spaCy"""
37
- if not text:
38
- return None
39
-
40
- try:
41
- # Use centralized NLP model
42
- nlp_model = get_nlp_model()
43
- doc = nlp_model(text)
44
-
45
- # Return document vector if available
46
- if doc.has_vector:
47
- return doc.vector
48
-
49
- # Fallback: average of token vectors
50
- vectors = [token.vector for token in doc if token.has_vector]
51
- if vectors:
52
- return np.mean(vectors, axis=0)
53
-
54
- return None
55
- except Exception as e:
56
- logger.error(f"Error getting vector representation: {str(e)}")
57
- return None
58
-
59
- def calculate_similarity(text1, text2):
60
- """Calculate semantic similarity between two texts"""
61
- if not text1 or not text2:
62
- return 0.0
63
-
64
- try:
65
- vec1 = get_vector_representation(text1)
66
- vec2 = get_vector_representation(text2)
67
-
68
- if vec1 is None or vec2 is None:
69
- return 0.0
70
-
71
- # Reshape vectors for cosine_similarity
72
- vec1 = vec1.reshape(1, -1)
73
- vec2 = vec2.reshape(1, -1)
74
-
75
- # Calculate cosine similarity
76
- similarity = cosine_similarity(vec1, vec2)[0][0]
77
- return float(similarity)
78
- except Exception as e:
79
- logger.error(f"Error calculating similarity: {str(e)}")
80
- return 0.0
81
-
82
- def extract_date_from_evidence(evidence_text):
83
- """Extract date from evidence text"""
84
- if not evidence_text:
85
- return None
86
-
87
- try:
88
- # Look for date patterns in text
89
- date_patterns = [
90
- r'Date: (\d{4}-\d{2}-\d{2})', # ISO format
91
- r'published.*?(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})', # published on MM/DD/YYYY
92
- r'(\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})', # DD Month YYYY
93
- r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}' # Month DD, YYYY
94
- ]
95
-
96
- for pattern in date_patterns:
97
- match = re.search(pattern, evidence_text)
98
- if match:
99
- date_str = match.group(1)
100
- # Parse date string based on format
101
- try:
102
- if '-' in date_str:
103
- # Try ISO format first; on failure fall through to the other formats below
- try:
- return datetime.strptime(date_str, '%Y-%m-%d')
- except ValueError:
- pass
104
- if '/' in date_str or '-' in date_str:
105
- formats = ['%m/%d/%Y', '%d/%m/%Y', '%m-%d-%Y', '%d-%m-%Y']
106
- for fmt in formats:
107
- try:
108
- return datetime.strptime(date_str, fmt)
109
- except ValueError:
110
- continue
111
- else:
112
- # Try different month formats
113
- formats = ['%d %B %Y', '%B %d, %Y', '%B %d %Y']
114
- for fmt in formats:
115
- try:
116
- return datetime.strptime(date_str, fmt)
117
- except ValueError:
118
- continue
119
- except Exception:
120
- pass
121
-
122
- return None
123
- except Exception as e:
124
- logger.error(f"Error extracting date from evidence: {str(e)}")
125
- return None
126
-
127
- def is_temporally_relevant(evidence_text, claim_text, max_days_old=30):
128
- """Check if evidence is temporally relevant to the claim"""
129
- # Check if claim seems to require recent evidence
130
- temporal_terms = ["today", "now", "current", "currently", "recent", "recently", "latest", "just", "this week", "this month", "this year"]
131
- requires_recent = any(term in claim_text.lower() for term in temporal_terms)
132
-
133
- # If claim doesn't specify temporality, consider evidence relevant
134
- if not requires_recent:
135
- return True
136
-
137
- # Extract date from evidence
138
- date = extract_date_from_evidence(evidence_text)
139
- if not date:
140
- return True # If we can't determine date, assume it's relevant
141
-
142
- # Check if evidence is recent enough
143
- cutoff = datetime.now() - timedelta(days=max_days_old)
144
- return date >= cutoff
145
-
146
- def has_authority_signal(evidence_text):
147
- """Check if evidence contains authority signals"""
148
- authority_signals = {
149
- "scientific_consensus": ["consensus", "scientists agree", "research shows", "studies confirm", "experts agree"],
150
- "fact_check": ["fact check", "rated false", "rated true", "debunked", "confirmed", "verification"],
151
- "high_authority": ["nasa", "world health organization", "who", "cdc", "national academy",
152
- "oxford", "harvard", "stanford", "mit", "cambridge", "yale",
153
- "princeton", "government", "official", "authorities", "minister",
154
- "ministry", "department", "administration", "university", "professor"]
155
- }
156
-
157
- evidence_lower = evidence_text.lower()
158
-
159
- authority_type = None
160
- authority_score = 1.0
161
-
162
- for signal_type, phrases in authority_signals.items():
163
- if any(phrase in evidence_lower for phrase in phrases):
164
- if signal_type == "scientific_consensus":
165
- authority_score = 1.8
166
- authority_type = "scientific_consensus"
167
- elif signal_type == "fact_check":
168
- authority_score = 1.5
169
- authority_type = "fact_check"
170
- elif signal_type == "high_authority":
171
- authority_score = 1.3
172
- authority_type = "high_authority"
173
- break
174
-
175
- return authority_score, authority_type
176
-
177
- def analyze_evidence_relevance(claim, evidence_list, source_credibility=None):
178
- """
179
- Analyze evidence relevance to claim using semantic similarity with improved handling
180
- for claims requiring strong evidence
181
-
182
- Args:
183
- claim (str): The claim being verified
184
- evidence_list (list): List of evidence items
185
- source_credibility (dict): Dictionary mapping source domains to credibility scores
186
-
187
- Returns:
188
- list: Sorted list of evidence items with relevance scores
189
- """
190
- if not evidence_list:
191
- return []
192
-
193
- # Ensure evidence_list is a list of strings
194
- if not isinstance(evidence_list, list):
195
- evidence_list = [str(evidence_list)]
196
-
197
- # Filter out None or empty items
198
- evidence_list = [item for item in evidence_list if item]
199
-
200
- # Check if claim contains strong assertions that would require specific evidence
201
- strong_assertion_markers = [
202
- "solved", "cured", "discovered", "breakthrough", "revolutionary",
203
- "first ever", "confirmed", "definitive", "conclusive", "proven",
204
- "groundbreaking", "unprecedented", "remarkable", "extends lifespan",
205
- "extends life", "definitively", "successfully"
206
- ]
207
-
208
- # Determine if claim contains strong assertions
209
- claim_has_strong_assertions = any(marker in claim.lower() for marker in strong_assertion_markers)
210
-
211
- # Log detection result
212
- if claim_has_strong_assertions:
213
- logger.info(f"Evidence analysis: Detected claim with strong assertions requiring specific evidence")
214
-
215
- # Extract named entities from claim
216
- claim_entities = extract_entities(claim)
217
- claim_entity_texts = [entity["text"].lower() for entity in claim_entities]
218
-
219
- # Process each evidence item
220
- analyzed_evidence = []
221
-
222
- # Track domains found in evidence to identify source diversity
223
- found_domains = set()
224
-
225
- for evidence in evidence_list:
226
- if not isinstance(evidence, str):
227
- continue
228
-
229
- # Calculate semantic similarity
230
- similarity = calculate_similarity(claim, evidence)
231
-
232
- # Check for entity overlap
233
- evidence_entities = extract_entities(evidence)
234
- evidence_entity_texts = [entity["text"].lower() for entity in evidence_entities]
235
-
236
- # Calculate entity overlap
237
- common_entities = set(claim_entity_texts).intersection(set(evidence_entity_texts))
238
- entity_overlap = len(common_entities) / max(1, len(claim_entity_texts))
239
-
240
- # Check temporal relevance
241
- temporal_relevance = 1.0
242
- if is_temporally_relevant(evidence, claim):
243
- temporal_relevance = 1.2
244
- else:
245
- # Penalty for temporally irrelevant evidence
246
- temporal_relevance = 0.7
247
-
248
- # Check for authority signals
249
- authority_score, authority_type = has_authority_signal(evidence)
250
-
251
- # Extract source from evidence if available
252
- source_boost = 1.0
253
- domain = None
254
-
255
- if source_credibility:
256
- # Try to extract domain from URL in evidence
257
- domain_match = re.search(r'URL: https?://(?:www\.)?([^/]+)', evidence)
258
- if domain_match:
259
- domain = domain_match.group(1)
260
- # Check if domain or its parent domain is in credibility list
261
- for cred_domain, cred_score in source_credibility.items():
262
- if cred_domain in domain:
263
- try:
264
- source_boost = float(cred_score)
265
- break
266
- except (ValueError, TypeError):
267
- pass
268
-
269
- # Track this domain for source diversity
270
- if domain:
271
- found_domains.add(domain)
272
-
273
- # For claims with strong assertions: check if evidence specifically addresses assertions
274
- claim_specificity_match = 1.0
275
- evidence_specificity_match = 1.0
276
-
277
- if claim_has_strong_assertions:
278
- # Check if evidence provides specific confirmation or contradiction
279
- direct_contradiction_terms = [
280
- "not yet", "has not", "have not", "cannot", "can't", "doesn't", "don't",
281
- "unlikely", "challenging", "remains a challenge", "in the future",
282
- "experimental", "in development", "proposed", "theoretical",
283
- "preliminary", "hypothesized", "potential", "promising but"
284
- ]
285
-
286
- # Check for contradictions to strong assertions
287
- if any(term in evidence.lower() for term in direct_contradiction_terms):
288
- # This evidence likely contradicts the strong assertion
289
- evidence_specificity_match = 2.0 # Boost relevance of contradicting evidence
290
- logger.debug(f"Found contradiction to strong assertion in evidence")
291
-
292
- # For claims with strong assertions, check if evidence specifically confirms
293
- direct_confirmation_terms = [
294
- "successfully demonstrated", "breakthrough", "solved", "cured",
295
- "confirmed", "definitive evidence", "conclusive results", "proven",
296
- "revolutionary results", "milestone achievement", "groundbreaking results"
297
- ]
298
-
299
- # If evidence confirms the strong assertion, adjust relevance
300
- if any(term in evidence.lower() for term in direct_confirmation_terms):
301
- # Apply higher scoring for evidence that specifically confirms
302
- evidence_specificity_match = 1.8
303
- logger.debug(f"Found confirmation of strong assertion in evidence")
304
-
305
- # For claims with strong assertions, check for high-quality sources
306
- high_quality_source_markers = [
307
- "journal", "doi.org", "research", "university", "institute",
308
- "laboratory", "professor", "study", "publication", "published in"
309
- ]
310
-
311
- is_high_quality = any(term in evidence.lower() for term in high_quality_source_markers)
312
- quality_boost = 1.4 if is_high_quality else 1.0
313
-
314
- # Apply the quality boost
315
- source_boost *= quality_boost
316
-
317
- # Calculate final relevance score with improvements for all claim types
318
- if claim_has_strong_assertions:
319
- relevance_score = (
320
- (similarity * 0.35) + # Semantic similarity
321
- (entity_overlap * 0.25) + # Entity overlap
322
- (0.25) # Base value to ensure all evidence has some relevance
323
- ) * temporal_relevance * authority_score * source_boost * claim_specificity_match * evidence_specificity_match
324
- else:
325
- # Original formula for regular claims
326
- relevance_score = (
327
- (similarity * 0.4) + # Semantic similarity
328
- (entity_overlap * 0.3) + # Entity overlap
329
- (0.3) # Base value to ensure all evidence has some relevance
330
- ) * temporal_relevance * authority_score * source_boost
331
-
332
- # Add metadata and relevance score
333
- analyzed_evidence.append({
334
- "text": evidence,
335
- "relevance_score": relevance_score,
336
- "similarity": similarity,
337
- "entity_overlap": entity_overlap,
338
- "temporal_relevance": temporal_relevance,
339
- "authority_score": authority_score,
340
- "authority_type": authority_type,
341
- "source_boost": source_boost,
342
- "domain": domain
343
- })
344
-
345
- # Sort by relevance score (descending)
346
- analyzed_evidence.sort(key=lambda x: x["relevance_score"], reverse=True)
347
-
348
- # Ensure we have diverse sources in top results for all claims
349
- if len(found_domains) > 1:
350
- # Try to promote evidence from reliable sources if we haven't selected any yet
351
- reliable_sources_seen = False
352
-
353
- # Check if top 3 results contain any reliable sources
354
- for item in analyzed_evidence[:3]:
355
- domain = item.get("domain", "")
356
- if domain and source_credibility and any(cred_domain in domain for cred_domain in source_credibility):
357
- reliable_sources_seen = True
358
- break
359
-
360
- # If no reliable sources in top results, promote one if available
361
- if not reliable_sources_seen:
362
- for i, item in enumerate(analyzed_evidence[3:]):
363
- domain = item.get("domain", "")
364
- if domain and source_credibility and any(cred_domain in domain for cred_domain in source_credibility):
365
- # Swap this item into the top 3
366
- analyzed_evidence.insert(2, analyzed_evidence.pop(i+3))
367
- break
368
-
369
- return analyzed_evidence
370
-
371
- def select_diverse_evidence(analyzed_evidence, max_items=5):
372
- """
373
- Select diverse evidence items based on relevance, source diversity and claim characteristics
374
-
375
- Args:
376
- analyzed_evidence (list): List of evidence items with relevance scores
377
- max_items (int): Maximum number of evidence items to return
378
-
379
- Returns:
380
- list: Selected diverse evidence items
381
- """
382
- if not analyzed_evidence:
383
- return []
384
-
385
- # Check if top evidence suggests claim has strong assertions
386
- strong_assertion_markers = [
387
- "solved", "cured", "discovered", "breakthrough", "revolutionary",
388
- "first ever", "confirmed", "definitive", "conclusive", "proven",
389
- "groundbreaking", "unprecedented", "extends lifespan", "definitively"
390
- ]
391
-
392
- # Determine if this is a claim with strong assertions by checking evidence text
393
- has_strong_assertions = False
394
-
395
- for item in analyzed_evidence[:3]: # Check just the top items for efficiency
396
- if "text" in item:
397
- item_text = item["text"].lower()
398
- if any(f"claim {marker}" in item_text or f"claim has {marker}" in item_text
399
- for marker in strong_assertion_markers):
400
- has_strong_assertions = True
401
- break
402
-
403
- # Also check for contradiction markers in evidence which can indicate a strong assertion
404
- contradiction_markers = [
405
- "not yet solved", "hasn't been proven", "no evidence that",
406
- "remains unsolved", "has not been confirmed", "remains theoretical"
407
- ]
408
-
409
- if not has_strong_assertions:
410
- for item in analyzed_evidence[:3]:
411
- if "text" in item:
412
- item_text = item["text"].lower()
413
- if any(marker in item_text for marker in contradiction_markers):
414
- has_strong_assertions = True
415
- break
416
-
417
- # Ensure we don't select more than available
418
- max_items = min(max_items, len(analyzed_evidence))
419
-
420
- # Initialize selected items with the most relevant item
421
- selected = [analyzed_evidence[0]]
422
- remaining = analyzed_evidence[1:]
423
-
424
- # Track sources to ensure diversity
425
- selected_sources = set()
426
- for item in selected:
427
- # Try to extract source from evidence
428
- source_match = re.search(r'Source: ([^,]+)', item["text"])
429
- if source_match:
430
- selected_sources.add(source_match.group(1))
431
-
432
- # For all claims, track if we have high-quality sources yet
433
- has_quality_source = False
434
- quality_source_markers = ["journal", "doi.org", "research", "university",
435
- "institute", "laboratory", "professor", "study"]
436
-
437
- # Check if our top item is already from a quality source
438
- if any(marker in selected[0]["text"].lower() for marker in quality_source_markers):
439
- has_quality_source = True
440
-
441
- # Select remaining items balancing relevance and diversity
442
- while len(selected) < max_items and remaining:
443
- best_item = None
444
- best_score = -1
445
-
446
- for i, item in enumerate(remaining):
447
- # Base score is the item's relevance
448
- score = item["relevance_score"]
449
-
450
- # Extract source if available
451
- source = None
452
- source_match = re.search(r'Source: ([^,]+)', item["text"])
453
- if source_match:
454
- source = source_match.group(1)
455
-
456
- # Apply diversity bonus if source is new
457
- if source and source not in selected_sources:
458
- score *= 1.2 # Diversity bonus
459
-
460
- # For claims with strong assertions, apply bonus for contradicting evidence
461
- if has_strong_assertions:
462
- # Check for contradiction markers in the text
463
- if any(marker in item["text"].lower() for marker in contradiction_markers):
464
- score *= 1.3 # Bonus for evidence that may contradict strong assertions
465
-
466
- # For any claim, apply bonus for high-quality sources if we don't have one yet
467
- if not has_quality_source:
468
- is_item_quality = any(marker in item["text"].lower() for marker in quality_source_markers)
469
- if is_item_quality:
470
- score *= 1.5 # Significant bonus for quality sources
471
-
472
- if score > best_score:
473
- best_score = score
474
- best_item = (i, item)
475
-
476
- if best_item:
477
- idx, item = best_item
478
- selected.append(item)
479
- remaining.pop(idx)
480
-
481
- # Add source to selected sources
482
- source_match = re.search(r'Source: ([^,]+)', item["text"])
483
- if source_match:
484
- selected_sources.add(source_match.group(1))
485
-
486
- # Check if we found a quality source
487
- if not has_quality_source:
488
- if any(marker in item["text"].lower() for marker in quality_source_markers):
489
- has_quality_source = True
490
- else:
491
- break
492
-
493
- # For any claim with strong assertions, ensure we have at least one quality source if available
494
- if has_strong_assertions and not has_quality_source and remaining:
495
- for i, item in enumerate(remaining):
496
- if any(marker in item["text"].lower() for marker in quality_source_markers):
497
- # Replace the least relevant selected item with this quality one
498
- selected.sort(key=lambda x: x["relevance_score"])
499
- selected[0] = item
500
- break
501
-
502
- # Return only the text portion
503
- return [item["text"] for item in selected]
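
The similarity machinery in this module reduces to spaCy document vectors plus scikit-learn's cosine_similarity. A minimal sketch, assuming a spaCy model with word vectors is installed (en_core_web_md here; the small en_core_web_sm model ships without static vectors, so its similarities are unreliable):

    import spacy
    from sklearn.metrics.pairwise import cosine_similarity

    nlp = spacy.load("en_core_web_md")  # assumes this vectored model is installed

    def similarity(text1, text2):
        """Cosine similarity between spaCy document vectors."""
        v1 = nlp(text1).vector.reshape(1, -1)
        v2 = nlp(text2).vector.reshape(1, -1)
        if not v1.any() or not v2.any():
            return 0.0  # guard against all-zero vectors (out-of-vocabulary text)
        return float(cosine_similarity(v1, v2)[0][0])

    print(similarity("The earth orbits the sun", "Our planet revolves around the sun"))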