ankanghosh committed · verified
Commit 5dc3509 · 1 Parent(s): 132ab9e

Upload 12 files
modules/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """
+ Modules package initialization.
+
+ This package contains the core modules for the AskVeracity fact-checking system.
+ """
+
+ from .claim_extraction import extract_claims, shorten_claim_for_evidence
+ from .evidence_retrieval import retrieve_combined_evidence
+ from .classification import classify_with_llm, aggregate_evidence
+ from .explanation import generate_explanation
+
+ __all__ = [
+     'extract_claims',
+     'shorten_claim_for_evidence',
+     'retrieve_combined_evidence',
+     'classify_with_llm',
+     'aggregate_evidence',
+     'generate_explanation'
+ ]
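Taken together, these re-exports define the package's public pipeline surface. A minimal end-to-end sketch follows; the call signatures of retrieve_combined_evidence, aggregate_evidence, and generate_explanation are assumed from their names and this file alone, and are not confirmed by the commit.

# Hypothetical driver; everything marked "assumed" is not confirmed by this file
from modules import (
    extract_claims, retrieve_combined_evidence,
    classify_with_llm, aggregate_evidence, generate_explanation,
)

claim = extract_claims("Long user-submitted text containing a factual claim ...")
evidence = retrieve_combined_evidence(claim)        # assumed: claim text -> list of evidence items
classified = classify_with_llm(claim, evidence)     # per-evidence classification (see classification.py)
verdict = aggregate_evidence(classified)            # assumed: classified items -> overall verdict
explanation = generate_explanation(claim, verdict)  # assumed argument order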
modules/category_detection.py ADDED
@@ -0,0 +1,875 @@
+ import logging
+ import re
+ from typing import Tuple, List, Dict, Optional, Any
+
+ # Set up logging
+ logger = logging.getLogger("misinformation_detector")
+
+ # Define categories and their keywords
+ CLAIM_CATEGORIES = {
+     "ai": [
+         # General AI terms
+         "AI", "artificial intelligence", "machine learning", "ML", "deep learning", "DL",
+         "neural network", "neural nets", "generative AI", "GenAI", "AGI", "artificial general intelligence",
+         "transformer", "attention mechanism", "fine-tuning", "pre-training", "training", "inference",
+
+         # AI Models and Architectures
+         "language model", "large language model", "LLM", "foundation model", "multimodal model",
+         "vision language model", "VLM", "text-to-speech", "TTS", "speech-to-text", "STT",
+         "text-to-image", "image-to-text", "diffusion model", "generative model", "discriminative model",
+         "GPT", "BERT", "T5", "PaLM", "Claude", "Llama", "Gemini", "Mistral", "Mixtral", "Stable Diffusion",
+         "Dall-E", "Midjourney", "Sora", "transformer", "MoE", "mixture of experts", "sparse model",
+         "dense model", "encoder", "decoder", "encoder-decoder", "autoencoder", "VAE",
+         "mixture of experts", "MoE", "sparse MoE", "switch transformer", "gated experts",
+         "routing network", "expert routing", "pathways", "multi-query attention", "multi-head attention",
+         "rotary position embedding", "RoPE", "grouped-query attention", "GQA", "flash attention",
+         "state space model", "SSM", "mamba", "recurrent neural network", "RNN", "LSTM", "GRU",
+         "convolutional neural network", "CNN", "residual connection", "skip connection", "normalization",
+         "layer norm", "group norm", "batch norm", "parameter efficient fine-tuning", "PEFT",
+         "LoRA", "low-rank adaptation", "QLoRA", "adapters", "prompt tuning", "prefix tuning",
+
+         # AI Learning Paradigms
+         "supervised learning", "unsupervised learning", "reinforcement learning", "RL",
+         "meta-learning", "transfer learning", "federated learning", "self-supervised learning",
+         "semi-supervised learning", "few-shot learning", "zero-shot learning", "one-shot learning",
+         "contrastive learning", "curriculum learning", "imitation learning", "active learning",
+         "reinforcement learning from human feedback", "RLHF", "direct preference optimization", "DPO",
+         "constitutional AI", "red teaming", "adversarial training", "GAN", "generative adversarial network",
+         "diffusion", "latent diffusion", "flow-based model", "variational autoencoder", "VAE",
+
+         # AI Capabilities and Applications
+         "natural language processing", "NLP", "computer vision", "CV", "speech recognition",
+         "text generation", "image generation", "video generation", "multimodal", "multi-modal",
+         "recommendation system", "recommender system", "chatbot", "conversational AI",
+         "sentiment analysis", "entity recognition", "semantic search", "vector search", "embedding",
+         "classification", "regression", "clustering", "anomaly detection", "agent", "AI agent",
+         "autonomous agent", "agentic", "RAG", "retrieval augmented generation", "tool use",
+         "function calling", "reasoning", "chain-of-thought", "CoT", "tree-of-thought", "ToT",
+         "planning", "decision making", "multi-agent", "agent swarm", "multi-agent simulation",
+
+         # AI Technical Terms
+         "token", "tokenizer", "tokenization", "embedding", "vector", "prompt", "prompt engineering",
+         "context window", "parameter", "weights", "bias", "activation function", "loss function",
+         "gradient descent", "backpropagation", "epoch", "batch", "mini-batch", "regularization",
+         "dropout", "overfitting", "underfitting", "hyperparameter", "latent space", "latent variable",
+         "feature extraction", "dimensionality reduction", "optimization", "quantization", "pruning",
+         "fine-tuning", "transfer learning", "knowledge distillation", "int4", "int8", "bfloat16",
+         "float16", "mixed precision", "GPTQ", "AWQ", "GGUF", "GGML", "KV cache", "speculative decoding",
+         "beam search", "greedy decoding", "temperature", "top-k", "top-p", "nucleus sampling",
+
+         # AI Tools and Frameworks
+         "TensorFlow", "PyTorch", "JAX", "Keras", "Hugging Face", "Transformers", "Diffusers",
+         "LangChain", "Llama Index", "OpenAI", "Anthropic", "NVIDIA", "GPU", "TPU", "IPU", "NPU", "CUDA",
+         "MLOps", "model monitoring", "model deployment", "model serving", "inference endpoint",
+         "vLLM", "TGI", "text generation inference", "triton", "onnx", "tensorRT",
+
+         # AI Ethics and Concerns
+         "AI ethics", "responsible AI", "AI safety", "AI alignment", "AI governance",
+         "bias", "fairness", "interpretability", "explainability", "XAI", "transparency",
+         "hallucination", "toxicity", "safe deployment", "AI risk", "AI capabilities",
+         "alignment tax", "red teaming", "jailbreak", "prompt injection", "data poisoning",
+
+         # AI Companies and Organizations
+         "OpenAI", "Anthropic", "Google DeepMind", "Meta AI", "Microsoft", "NVIDIA",
+         "Hugging Face", "Mistral AI", "Cohere", "AI21 Labs", "Stability AI", "Midjourney",
+         "EleutherAI", "Allen AI", "DeepMind", "Character AI", "Inflection AI", "xAI"
+     ],
+
+     "science": [
+         # General scientific terms
+         "study", "research", "scientist", "scientific", "discovered", "experiment",
+         "laboratory", "clinical", "trial", "hypothesis", "theory", "evidence-based",
+         "peer-reviewed", "journal", "publication", "finding", "breakthrough", "innovation",
+         "discovery", "analysis", "data", "measurement", "observation", "empirical",
+
+         # Biology and medicine
+         "biology", "chemistry", "physics", "genetics", "genomics", "DNA", "RNA",
+         "medicine", "gene", "protein", "molecule", "cell", "brain", "neuro",
+         "cancer", "disease", "cure", "treatment", "vaccine", "health", "medical",
+         "pharmaceutical", "drug", "therapy", "symptom", "diagnosis", "prognosis",
+         "patient", "doctor", "hospital", "clinic", "surgery", "immune", "antibody",
+         "virus", "bacteria", "pathogen", "infection", "epidemic", "pandemic",
+         "organism", "evolution", "mutation", "chromosome", "enzyme", "hormone",
+
+         # Physics and astronomy
+         "quantum", "particle", "atom", "nuclear", "electron", "neutron", "proton",
+         "atomic", "subatomic", "molecular", "energy", "matter", "mass", "force",
+         "space", "NASA", "telescope", "planet", "exoplanet", "moon", "lunar", "mars",
+         "star", "galaxy", "cosmic", "astronomical", "universe", "solar", "celestial",
+         "orbit", "gravitational", "gravity", "relativity", "quantum mechanics",
+         "string theory", "dark matter", "dark energy", "black hole", "supernova",
+         "radiation", "radioactive", "isotope", "fission", "fusion", "accelerator",
+
+         # Environmental science
+         "climate", "carbon", "environment", "ecosystem", "species", "extinct",
+         "endangered", "biodiversity", "conservation", "sustainable", "renewable",
+         "fossil fuel", "greenhouse", "global warming", "polar", "ice cap", "glacier",
+         "ozone", "atmosphere", "weather", "meteorology", "geology", "earthquake",
+         "volcanic", "ocean", "marine", "coral reef", "deforestation", "pollution",
+
+         # Math and computer science (non-AI specific)
+         "equation", "formula", "theorem", "calculus", "statistical", "probability",
+         "dataset", "parameter", "variable", "function", "matrix", "optimization",
+
+         # Organizations
+         "CERN", "NIH", "CDC", "WHO", "NOAA", "ESA", "SpaceX", "Blue Origin", "JPL",
+         "laboratory", "institute", "university", "academic", "faculty", "professor",
+
+         # Science tools
+         "Matlab", "SPSS", "SAS", "ImageJ", "LabVIEW", "ANSYS", "Cadence", "Origin",
+         "Avogadro", "ChemDraw", "Mathematica", "Wolfram Alpha", "COMSOL", "LAMMPS",
+         "VASP", "Gaussian", "GIS", "ArcGIS", "QGIS", "Maple", "R Studio"
+     ],
+
+     "technology": [
+         # General tech terms
+         "computer", "software", "hardware", "internet", "cyber", "digital", "tech",
+         "robot", "automation", "autonomous", "code", "programming", "data", "cloud",
+         "server", "network", "encryption", "blockchain", "crypto", "bitcoin", "ethereum",
+         "technology", "innovation", "breakthrough", "prototype", "development",
+         "engineering", "technical", "specification", "feature", "functionality",
+         "interface", "system", "infrastructure", "integration", "implementation",
+
+         # Devices and hardware
+         "smartphone", "device", "gadget", "laptop", "desktop", "tablet", "wearable",
+         "smartwatch", "IoT", "internet of things", "sensor", "chip", "semiconductor",
+         "processor", "CPU", "GPU", "memory", "RAM", "storage", "hard drive", "SSD",
+         "electronic", "circuit", "motherboard", "component", "peripheral", "accessory",
+         "display", "screen", "touchscreen", "camera", "lens", "microphone", "speaker",
+         "battery", "charger", "wireless", "bluetooth", "WiFi", "router", "modem",
+
+         # Software and internet
+         "app", "application", "platform", "website", "online", "web", "browser",
+         "operating system", "Windows", "macOS", "Linux", "Android", "iOS", "software",
+         "program", "code", "coding", "development", "framework", "library", "API",
+         "interface", "backend", "frontend", "full-stack", "developer", "programmer",
+         "database", "SQL", "NoSQL", "cloud computing", "SaaS", "PaaS", "IaaS",
+         "DevOps", "agile", "scrum", "sprint", "version control", "git", "repository",
+
+         # Communications and networking
+         "5G", "6G", "broadband", "fiber", "network", "wireless", "cellular", "mobile",
+         "telecommunications", "telecom", "transmission", "bandwidth", "latency",
+         "protocol", "IP address", "DNS", "server", "hosting", "data center",
+
+         # Company and product names
+         "Apple", "Google", "Microsoft", "Amazon", "Facebook", "Meta", "Tesla",
+         "IBM", "Intel", "AMD", "Nvidia", "Qualcomm", "Cisco", "Oracle", "SAP",
+         "Huawei", "Samsung", "Sony", "LG", "Dell", "HP", "Lenovo", "Xiaomi",
+         "iPhone", "iPad", "MacBook", "Surface", "Galaxy", "Pixel", "Windows",
+         "Android", "iOS", "Chrome", "Firefox", "Edge", "Safari", "Office",
+         "Azure", "AWS", "Google Cloud", "Gmail", "Outlook", "Teams", "Zoom",
+
+         # Advanced technologies
+         "VR", "AR", "XR", "virtual reality", "augmented reality", "mixed reality",
+         "metaverse", "3D printing", "additive manufacturing", "quantum computing",
+         "nanotechnology", "biotechnology", "electric vehicle", "self-driving",
+         "autonomous vehicle", "drone", "UAV", "robotics", "cybersecurity",
+
+         # Social media
+         "social media", "social network", "Facebook", "Instagram", "Twitter", "X",
+         "LinkedIn", "TikTok", "Snapchat", "YouTube", "Pinterest", "Reddit",
+         "streaming", "content creator", "influencer", "follower", "like", "share",
+         "post", "tweet", "user-generated", "viral", "trending", "engagement",
+
+         # Technology tools
+         "NumPy", "Pandas", "Matplotlib", "Seaborn", "Scikit-learn", "Jupyter",
+         "Visual Studio", "VS Code", "IntelliJ", "PyCharm", "Eclipse", "Android Studio",
+         "Xcode", "Docker", "Kubernetes", "Jenkins", "Ansible", "Terraform", "Vagrant",
+         "AWS CLI", "Azure CLI", "GCP CLI", "PowerShell", "Bash", "npm", "pip", "conda",
+         "React", "Angular", "Vue.js", "Node.js", "Django", "Flask", "Spring", "Laravel",
+         "PostgreSQL", "MySQL", "MongoDB", "Redis", "Elasticsearch", "Kafka", "RabbitMQ",
+
+         # Optimization terms
+         "optimization", "efficiency", "performance tuning", "benchmarking", "profiling",
+         "refactoring", "scaling", "bottleneck", "throughput", "latency reduction",
+         "response time", "caching", "load balancing", "distributed computing",
+         "parallel processing", "concurrency", "asynchronous", "memory management"
+     ],
+
+     "politics": [
+         # Government structure
+         "president", "prime minister", "government", "parliament", "congress",
+         "senate", "house", "representative", "minister", "secretary", "cabinet",
+         "administration", "mayor", "governor", "politician", "official", "authority",
+         "federal", "state", "local", "municipal", "county", "city", "town",
+         "constituency", "district", "precinct", "ward", "judiciary", "executive",
+         "legislative", "branch", "checks and balances", "separation of powers",
+
+         # Political activities
+         "policy", "election", "campaign", "vote", "voter", "ballot", "polling",
+         "political", "politics", "debate", "speech", "address", "press conference",
+         "approval rating", "opinion poll", "candidate", "incumbent", "challenger",
+         "primary", "caucus", "convention", "delegate", "nomination", "campaign trail",
+         "fundraising", "lobbying", "advocacy", "activism", "protest", "demonstration",
+
+         # Political ideologies
+         "democracy", "democratic", "republican", "conservative", "liberal",
+         "progressive", "left-wing", "right-wing", "centrist", "moderate",
+         "socialist", "capitalist", "communist", "libertarian", "populist",
+         "nationalist", "globalist", "isolationist", "hawk", "dove",
+         "ideology", "partisan", "bipartisan", "coalition", "majority", "minority",
+
+         # Laws and regulations
+         "bill", "law", "legislation", "regulation", "policy", "statute", "code",
+         "amendment", "reform", "repeal", "enact", "implement", "enforce",
+         "constitutional", "unconstitutional", "legal", "illegal", "legalize",
+         "criminalize", "deregulate", "regulatory", "compliance", "mandate",
+
+         # Judicial and legal
+         "court", "supreme", "justice", "judge", "ruling", "decision", "opinion",
+         "case", "lawsuit", "litigation", "plaintiff", "defendant", "prosecutor",
+         "attorney", "lawyer", "advocate", "judicial review", "precedent",
+         "constitution", "amendment", "rights", "civil rights", "human rights",
+
+         # International relations
+         "treaty", "international", "diplomatic", "diplomacy", "relations",
+         "foreign policy", "domestic policy", "UN", "NATO", "EU", "United Nations",
+         "sanctions", "embargo", "tariff", "trade war", "diplomat", "embassy",
+         "consulate", "ambassador", "delegation", "summit", "bilateral", "multilateral",
+         "alliance", "ally", "adversary", "geopolitical", "sovereignty", "regime",
+
+         # Security and defense
+         "national security", "homeland security", "defense", "military", "armed forces",
+         "army", "navy", "air force", "marines", "coast guard", "intelligence",
+         "CIA", "FBI", "NSA", "Pentagon", "war", "conflict", "peacekeeping",
+         "terrorism", "counterterrorism", "insurgency", "nuclear weapon", "missile",
+         "disarmament", "nonproliferation", "surveillance", "espionage",
+
+         # Political institutions
+         "White House", "Kremlin", "Downing Street", "Capitol Hill", "Westminster",
+         "United Nations", "European Union", "NATO", "World Bank", "IMF", "WTO",
+         "ASEAN", "African Union", "BRICS", "G7", "G20",
+
+         # Political parties and movements
+         "Democrat", "Republican", "Labour", "Conservative", "Green Party",
+         "Socialist", "Communist", "Libertarian", "Independent", "Tea Party",
+         "progressive movement", "civil rights movement", "women's rights",
+         "LGBTQ rights", "Black Lives Matter", "environmental movement"
+     ],
+
+     "business": [
+         # Companies and organization types
+         "company", "corporation", "business", "startup", "firm", "enterprise",
+         "corporate", "industry", "sector", "conglomerate", "multinational",
+         "organization", "entity", "private", "public", "incorporated", "LLC",
+         "partnership", "proprietorship", "franchise", "subsidiary", "parent company",
+         "headquarters", "office", "facility", "plant", "factory", "warehouse",
+         "retail", "wholesale", "ecommerce", "brick-and-mortar", "chain", "outlet",
+
+         # Business roles and management
+         "executive", "CEO", "CFO", "CTO", "COO", "CMO", "CIO", "CHRO", "chief",
+         "director", "board", "chairman", "chairwoman", "chairperson", "president",
+         "vice president", "senior", "junior", "manager", "management", "supervisor",
+         "founder", "entrepreneur", "owner", "shareholder", "stakeholder",
+         "employee", "staff", "workforce", "personnel", "human resources", "HR",
+         "recruit", "hire", "layoff", "downsizing", "restructuring", "reorganization",
+
+         # Financial terms
+         "profit", "revenue", "sales", "income", "earnings", "EBITDA", "turnover",
+         "loss", "deficit", "expense", "cost", "overhead", "margin", "markup",
+         "budget", "forecast", "projection", "estimate", "actual", "variance",
+         "balance sheet", "income statement", "cash flow", "P&L", "liquidity",
+         "solvency", "asset", "liability", "equity", "debt", "leverage", "capital",
+         "working capital", "cash", "funds", "money", "payment", "transaction",
+
+         # Markets and trading
+         "market", "stock", "share", "bond", "security", "commodity", "futures",
+         "option", "derivative", "forex", "foreign exchange", "currency", "crypto",
+         "trader", "trading", "buy", "sell", "long", "short", "position", "portfolio",
+         "diversification", "hedge", "risk", "return", "yield", "dividend", "interest",
+         "bull market", "bear market", "correction", "crash", "rally", "volatile",
+         "volatility", "index", "benchmark", "Dow Jones", "NASDAQ", "S&P 500", "NYSE",
+
+         # Investment and funding
+         "investor", "investment", "fund", "mutual fund", "ETF", "hedge fund",
+         "private equity", "venture", "venture capital", "VC", "angel investor",
+         "seed", "Series A", "Series B", "Series C", "funding", "financing",
+         "loan", "credit", "debt", "equity", "fundraising", "crowdfunding",
+         "IPO", "initial public offering", "going public", "listed", "delisted",
+         "merger", "acquisition", "M&A", "takeover", "buyout", "divestiture",
+         "valuation", "billion", "million", "trillion", "unicorn", "decacorn",
+
+         # Economic terms
+         "economy", "economic", "economics", "macro", "micro", "fiscal", "monetary",
+         "supply", "demand", "market forces", "competition", "competitive", "monopoly",
+         "oligopoly", "antitrust", "regulation", "deregulation", "growth", "decline",
+         "recession", "depression", "recovery", "expansion", "contraction", "cycle",
+         "inflation", "deflation", "stagflation", "hyperinflation", "CPI", "price",
+         "GDP", "gross domestic product", "GNP", "productivity", "output", "input",
+
+         # Banking and finance
+         "finance", "financial", "bank", "banking", "commercial bank", "investment bank",
+         "central bank", "Federal Reserve", "Fed", "ECB", "Bank of England", "BOJ",
+         "interest rate", "prime rate", "discount rate", "basis point", "monetary policy",
+         "quantitative easing", "tightening", "loosening", "credit", "lending",
+         "borrowing", "loan", "mortgage", "consumer credit", "credit card", "debit card",
+         "checking", "savings", "deposit", "withdrawal", "ATM", "branch", "online banking",
+
+         # Currencies and payments
+         "dollar", "euro", "pound", "yen", "yuan", "rupee", "ruble", "real", "peso",
+         "currency", "money", "fiat", "exchange rate", "remittance", "transfer",
+         "payment", "transaction", "wire", "ACH", "SWIFT", "clearing", "settlement",
+         "cryptocurrency", "bitcoin", "ethereum", "blockchain", "fintech", "paytech",
+
+         # Business operations
+         "product", "service", "solution", "offering", "launch", "rollout", "release",
+         "operation", "production", "manufacturing", "supply chain", "logistics",
+         "procurement", "inventory", "distribution", "shipping", "delivery",
+         "quality", "control", "assurance", "standard", "certification", "compliance",
+         "process", "procedure", "workflow", "efficiency", "optimization",
+
+         # Marketing and sales
+         "marketing", "advertise", "advertising", "campaign", "promotion", "publicity",
+         "PR", "public relations", "brand", "branding", "identity", "image", "reputation",
+         "sales", "selling", "deal", "transaction", "pipeline", "lead", "prospect",
+         "customer", "client", "consumer", "buyer", "purchaser", "target market",
+         "segment", "demographic", "psychographic", "B2B", "B2C", "retail", "wholesale",
+         "price", "pricing", "discount", "premium", "luxury", "value", "bargain"
+     ],
+
+     "world": [
+         # General international terms
+         "country", "nation", "state", "republic", "kingdom", "global", "international",
+         "foreign", "world", "worldwide", "domestic", "abroad", "overseas",
+         "developed", "developing", "industrialized", "emerging", "third world",
+         "global south", "global north", "east", "west", "western", "eastern",
+         "bilateral", "multilateral", "transnational", "multinational", "sovereignty",
+
+         # Regions and continents
+         "Europe", "European", "Asia", "Asian", "Africa", "African", "North America",
+         "South America", "Latin America", "Australia", "Oceania", "Antarctica",
+         "Middle East", "Central Asia", "Southeast Asia", "East Asia", "South Asia",
+         "Eastern Europe", "Western Europe", "Northern Europe", "Southern Europe",
+         "Mediterranean", "Scandinavia", "Nordic", "Baltic", "Balkans", "Caucasus",
+         "Caribbean", "Central America", "South Pacific", "Polynesia", "Micronesia",
+
+         # Major countries and regions
+         "China", "Chinese", "Russia", "Russian", "India", "Indian", "Japan", "Japanese",
+         "UK", "British", "England", "English", "Scotland", "Scottish", "Wales", "Welsh",
+         "Germany", "German", "France", "French", "Italy", "Italian", "Spain", "Spanish",
+         "Canada", "Canadian", "Brazil", "Brazilian", "Mexico", "Mexican", "Turkey", "Turkish",
+         "United States", "US", "USA", "American", "Britain", "Korea", "Korean",
+         "North Korea", "South Korea", "Saudi", "Saudi Arabia", "Saudi Arabian",
+         "Iran", "Iranian", "Iraq", "Iraqi", "Israel", "Israeli", "Palestine", "Palestinian",
+         "Egypt", "Egyptian", "Pakistan", "Pakistani", "Indonesia", "Indonesian",
+         "Australia", "Australian", "New Zealand", "Nigeria", "Nigerian", "South Africa",
+         "Argentina", "Argentinian", "Colombia", "Colombian", "Venezuela", "Venezuelan",
+         "Ukraine", "Ukrainian", "Poland", "Polish", "Switzerland", "Swiss",
+         "Netherlands", "Dutch", "Belgium", "Belgian", "Sweden", "Swedish", "Norway", "Norwegian",
+
+         # International issues and topics
+         "war", "conflict", "crisis", "tension", "dispute", "hostility", "peace",
+         "peacekeeping", "ceasefire", "truce", "armistice", "treaty", "agreement",
+         "compromise", "negotiation", "mediation", "resolution", "settlement",
+         "refugee", "migrant", "asylum seeker", "displacement", "humanitarian",
+         "border", "frontier", "territory", "territorial", "sovereignty", "jurisdiction",
+         "terror", "terrorism", "extremism", "radicalism", "insurgency", "militant",
+         "sanction", "embargo", "restriction", "isolation", "blockade",
+
+         # International trade and economy
+         "trade", "import", "export", "tariff", "duty", "quota", "subsidy",
+         "protectionism", "free trade", "fair trade", "globalization", "trade war",
+         "trade agreement", "trade deal", "trade deficit", "trade surplus",
+         "supply chain", "outsourcing", "offshoring", "reshoring", "nearshoring",
+
+         # Diplomacy and international relations
+         "embassy", "consulate", "diplomatic", "diplomacy", "diplomat", "ambassador",
+         "consul", "attaché", "envoy", "emissary", "delegation", "mission",
+         "foreign policy", "international relations", "geopolitics", "geopolitical",
+         "influence", "power", "superpower", "hegemony", "alliance", "coalition",
+         "bloc", "axis", "sphere of influence", "buffer state", "proxy",
+
+         # International organizations
+         "UN", "United Nations", "EU", "European Union", "NATO", "NAFTA", "USMCA",
+         "ASEAN", "OPEC", "Commonwealth", "Arab League", "African Union", "AU",
+         "BRICS", "G7", "G20", "IMF", "World Bank", "WTO", "WHO", "UNESCO",
+         "Security Council", "General Assembly", "International Court of Justice",
+
+         # Travel and cultural exchange
+         "visa", "passport", "immigration", "emigration", "migration", "travel",
+         "tourism", "tourist", "visitor", "foreigner", "expatriate", "expat",
+         "citizenship", "nationality", "dual citizen", "naturalization",
+         "cultural", "tradition", "heritage", "indigenous", "native", "local",
+         "language", "dialect", "translation", "interpreter", "cross-cultural"
+     ],
+
+     "sports": [
+         # General sports terms
+         "game", "match", "tournament", "championship", "league", "cup", "Olympics",
+         "olympic", "world cup", "competition", "contest", "event", "series",
+         "sport", "sporting", "athletics", "physical", "play", "compete", "competition",
+         "amateur", "professional", "pro", "season", "preseason", "regular season",
+         "postseason", "playoff", "final", "semifinal", "quarterfinal", "qualifying",
+
+         # Team sports
+         "football", "soccer", "American football", "rugby", "basketball", "baseball",
+         "cricket", "hockey", "ice hockey", "field hockey", "volleyball", "handball",
+         "water polo", "lacrosse", "ultimate frisbee", "netball", "kabaddi",
+         "team", "club", "franchise", "squad", "roster", "lineup", "formation",
+         "player", "coach", "manager", "trainer", "captain", "starter", "substitute",
+         "bench", "draft", "trade", "free agent", "contract", "transfer", "loan",
+
+         # Individual sports
+         "tennis", "golf", "boxing", "wrestling", "martial arts", "MMA", "UFC",
+         "athletics", "track and field", "swimming", "diving", "gymnastics",
+         "skiing", "snowboarding", "skating", "figure skating", "speed skating",
+         "cycling", "mountain biking", "BMX", "motorsport", "F1", "Formula 1",
+         "NASCAR", "IndyCar", "MotoGP", "rally", "marathon", "triathlon", "decathlon",
+         "archery", "shooting", "fencing", "equestrian", "rowing", "canoeing", "kayaking",
+         "surfing", "skateboarding", "climbing", "bouldering", "weightlifting",
+
+         # Scoring and results
+         "score", "point", "goal", "touchdown", "basket", "run", "wicket", "try",
+         "win", "lose", "draw", "tie", "defeat", "victory", "champion", "winner",
+         "loser", "runner-up", "finalist", "semifinalist", "eliminated", "advance",
+         "qualify", "record", "personal best", "world record", "Olympic record",
+         "streak", "undefeated", "unbeaten", "perfect season", "comeback",
+
+         # Performance and training
+         "fitness", "training", "practice", "drill", "workout", "exercise", "regime",
+         "conditioning", "strength", "endurance", "speed", "agility", "flexibility",
+         "skill", "technique", "form", "style", "strategy", "tactic", "playbook",
+         "offense", "defense", "attack", "counter", "press", "formation",
+         "injury", "rehabilitation", "recovery", "physiotherapy", "sports medicine",
+
+         # Sports infrastructure
+         "stadium", "arena", "court", "field", "pitch", "rink", "pool", "track",
+         "course", "gymnasium", "gym", "complex", "venue", "facility", "locker room",
+         "dugout", "bench", "sideline", "grandstand", "spectator", "fan", "supporter",
+
+         # Sports organizations and competitions
+         "medal", "gold", "silver", "bronze", "podium", "Olympics", "Paralympic",
+         "commonwealth games", "Asian games", "Pan American games", "world championship",
+         "grand slam", "masters", "open", "invitational", "classic", "tour", "circuit",
+         "IPL", "Indian Premier League", "MLB", "Major League Baseball",
+         "NBA", "National Basketball Association", "NFL", "National Football League",
+         "NHL", "National Hockey League", "FIFA", "UEFA", "ATP", "WTA", "ICC",
+         "Premier League", "La Liga", "Bundesliga", "Serie A", "Ligue 1", "MLS",
+         "Champions League", "Europa League", "Super Bowl", "World Series", "Stanley Cup",
+         "NCAA", "collegiate", "college", "university", "varsity", "intramural",
+
+         # Sports media and business
+         "broadcast", "coverage", "commentator", "announcer", "pundit", "analyst",
+         "highlight", "replay", "sports network", "ESPN", "Sky Sports", "Fox Sports",
+         "sponsorship", "endorsement", "advertisement", "merchandise", "jersey", "kit",
+         "ticket", "season ticket", "box seat", "premium", "concession", "vendor"
+     ],
+
+     "entertainment": [
+         # Film and cinema
+         "movie", "film", "cinema", "feature", "short film", "documentary", "animation",
+         "blockbuster", "indie", "independent film", "foreign film", "box office",
+         "screening", "premiere", "release", "theatrical", "stream", "streaming",
+         "director", "producer", "screenwriter", "script", "screenplay", "adaptation",
+         "cinematography", "cinematographer", "editing", "editor", "visual effects",
+         "special effects", "CGI", "motion capture", "sound design", "soundtrack",
+         "score", "composer", "scene", "shot", "take", "cut", "sequel", "prequel",
+         "trilogy", "franchise", "universe", "reboot", "remake", "spin-off",
+         "genre", "action", "comedy", "drama", "thriller", "horror", "sci-fi",
+         "science fiction", "fantasy", "romance", "romantic comedy", "rom-com",
+         "mystery", "crime", "western", "historical", "biographical", "biopic",
+
+         # Television
+         "TV", "television", "show", "series", "episode", "season", "pilot",
+         "finale", "midseason", "sitcom", "drama series", "miniseries", "limited series",
+         "anthology", "reality TV", "game show", "talk show", "variety show",
+         "network", "cable", "premium cable", "broadcast", "channel", "program",
+         "primetime", "daytime", "syndication", "rerun", "renewed", "cancelled",
+         "showrunner", "creator", "writer", "TV writer", "episode writer", "staff writer",
+
+         # Performing arts
+         "actor", "actress", "performer", "cast", "casting", "star", "co-star",
+         "supporting", "lead", "protagonist", "antagonist", "villain", "hero", "anti-hero",
+         "character", "role", "performance", "portrayal", "acting", "dialogue",
+         "monologue", "line", "script", "improv", "improvisation", "stand-up",
+         "comedian", "comic", "sketch", "theater", "theatre", "stage", "Broadway",
+         "West End", "play", "musical", "opera", "ballet", "dance", "choreography",
+         "production", "rehearsal", "audition", "understudy", "troupe", "ensemble",
+
+         # Music
+         "music", "song", "track", "single", "album", "EP", "LP", "record",
+         "release", "drop", "artist", "musician", "singer", "vocalist", "band",
+         "group", "duo", "trio", "soloist", "frontman", "frontwoman", "lead singer",
+         "songwriter", "composer", "producer", "DJ", "rapper", "MC", "beatmaker",
+         "guitarist", "bassist", "drummer", "pianist", "keyboardist", "violinist",
+         "instrumentalist", "orchestra", "symphony", "philharmonic", "conductor",
+         "genre", "rock", "pop", "hip-hop", "rap", "R&B", "soul", "funk", "jazz",
+         "blues", "country", "folk", "electronic", "EDM", "dance", "techno", "house",
+         "metal", "punk", "alternative", "indie", "classical", "reggae", "latin",
+         "hit", "chart", "Billboard", "Grammy", "award-winning", "platinum", "gold",
+         "concert", "tour", "gig", "show", "performance", "live", "venue", "arena",
+         "stadium", "festival", "Coachella", "Glastonbury", "Lollapalooza", "Bonnaroo",
+
+         # Celebrity culture
+         "celebrity", "star", "fame", "famous", "A-list", "B-list", "icon", "iconic",
+         "superstar", "public figure", "household name", "stardom", "limelight",
+         "popular", "popularity", "fan", "fanbase", "followers", "stan", "groupie",
+         "paparazzi", "tabloid", "gossip", "rumor", "scandal", "controversy",
+         "interview", "press conference", "red carpet", "premiere", "gala", "award show",
+
+         # Awards and recognition
+         "award", "nominee", "nomination", "winner", "recipient", "honor", "accolade",
+         "Oscar", "Academy Award", "Emmy", "Grammy", "Tony", "Golden Globe", "BAFTA",
+         "MTV Award", "People's Choice", "Critics' Choice", "SAG Award", "Billboard Award",
+         "best actor", "best actress", "best director", "best picture", "best film",
+         "best album", "best song", "hall of fame", "lifetime achievement", "legacy",
+
+         # Media and publishing
+         "book", "novel", "fiction", "non-fiction", "memoir", "biography", "autobiography",
+         "bestseller", "bestselling", "author", "writer", "novelist", "literary",
+         "literature", "publisher", "publishing", "imprint", "edition", "volume",
+         "chapter", "page", "paragraph", "prose", "narrative", "plot", "storyline",
+         "character", "protagonist", "antagonist", "setting", "theme", "genre",
+         "mystery", "thriller", "romance", "sci-fi", "fantasy", "young adult", "YA",
+         "comic", "comic book", "graphic novel", "manga", "anime", "cartoon",
+
+         # Digital entertainment
+         "streaming", "stream", "subscription", "platform", "service", "content",
+         "Netflix", "Disney+", "Amazon Prime", "Hulu", "HBO", "HBO Max", "Apple TV+",
+         "Peacock", "Paramount+", "YouTube", "YouTube Premium", "TikTok", "Instagram",
+         "influencer", "content creator", "vlogger", "blogger", "podcaster", "podcast",
+         "episode", "download", "subscriber", "follower", "like", "share", "viral",
+         "trending", "binge-watch", "marathon", "spoiler", "recap", "review", "trailer",
+         "teaser", "behind the scenes", "BTS", "exclusive", "original"
+     ]
+ }
+
+ # Add domain-specific RSS feeds for different categories
+ CATEGORY_SPECIFIC_FEEDS = {
+     "science": [
+         # "https://www.science.org/rss/news_feeds/carousel.xml",
+         "https://www.science.org/rss/news_current.xml",
+         "https://www.nature.com/nature.rss",
+         # "https://www.scientificamerican.com/rss/",
+         "http://rss.sciam.com/basic-science",
+         # "https://rss.sciam.com/ScientificAmerican-Global",
+         "http://rss.sciam.com/ScientificAmerican-Global",
+         # "https://feeds.newscientist.com/science-news",
+         "https://www.newscientist.com/feed/home/?cmpid=RSS|NSNS-Home",
+         "https://phys.org/rss-feed/"
+     ],
+     "technology": [
+         # "https://feed.wired.com/rss/category/business/feed.rss",
+         "https://www.wired.com/feed/category/business/latest/rss",
+         "https://techcrunch.com/feed/",
+         "https://www.technologyreview.com/feed/",
+         "https://arstechnica.com/feed/",
+         "https://www.theverge.com/rss/index.xml",
+         "https://news.ycombinator.com/rss"
+     ],
+     "politics": [
+         "https://feeds.washingtonpost.com/rss/politics",
+         "https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml",
+         "https://feeds.bbci.co.uk/news/politics/rss.xml",
+         "https://www.politico.com/rss/politicopicks.xml",
+         "https://www.realclearpolitics.com/index.xml"
+     ],
+     "business": [
+         "https://www.ft.com/rss/home",
+         "https://feeds.bloomberg.com/markets/news.rss",
+         # "https://www.forbes.com/business/feed/",
+         "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
+         "https://feeds.washingtonpost.com/rss/business",
+         "https://www.entrepreneur.com/latest.rss",
+         # "https://www.cnbc.com/id/10001147/device/rss/rss.htm",
+         "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10001147",
+         "https://feeds.content.dowjones.io/public/rss/WSJcomUSBusiness",
+         "https://feeds.a.dj.com/rss/RSSMarketsMain.xml"
+     ],
+     "world": [
+         "https://feeds.bbci.co.uk/news/world/rss.xml",
+         "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
+         "https://www.aljazeera.com/xml/rss/all.xml",
+         "https://feeds.washingtonpost.com/rss/world",
+         # "https://rss.cnn.com/rss/edition_world.rss"
+         "http://rss.cnn.com/rss/cnn_world.rss"
+     ],
+     "sports": [
+         "https://www.espn.com/espn/rss/news",
+         "https://www.cbssports.com/rss/headlines/",
+         # "https://feeds.skysports.com/feeds/rss/latest.xml",
+         "https://www.espncricinfo.com/rss/content/story/feeds/0.xml",
+         "https://api.foxsports.com/v1/rss",
+         "https://www.sportingnews.com/us/rss",
+         "https://www.theguardian.com/sport/rss",
+     ],
+     "entertainment": [
+         "https://www.hollywoodreporter.com/feed/",
+         "https://variety.com/feed/",
+         # "https://feeds.eonline.com/mrss/article/",
+         "https://www.eonline.com/syndication/feeds/rssfeeds/topstories.xml",
+         "https://www.rollingstone.com/feed/",
+         "https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml"
+     ],
+     "fact_checking": [
+         "https://www.snopes.com/feed/",
+         "https://www.politifact.com/rss/all/",
+         "https://www.factcheck.org/feed/",
+         "https://leadstories.com/atom.xml",
+         # "https://apnews.com/hub/fact-check/rss",
+         # "https://apnews.com/apf-fact-check"
+         "https://fullfact.org/feed/all/",
+         "https://www.truthorfiction.com/feed/"
+     ]
+ }
+
+ # Reliability boosts for sources by category
+ SOURCE_RELIABILITY_BY_CATEGORY = {
+     "science": {
+         "nature.com": 0.95,
+         "science.org": 0.95,
+         "nih.gov": 0.95,
+         "nasa.gov": 0.95,
+         "scientificamerican.com": 0.9,
+         "newscientist.com": 0.9,
+         "pnas.org": 0.95,
+         "cell.com": 0.95,
+         "sciencedirect.com": 0.9,
+         "plos.org": 0.9,
+         "arxiv.org": 0.85
+     },
+     "technology": {
+         "wired.com": 0.9,
+         "techcrunch.com": 0.85,
+         "arstechnica.com": 0.9,
+         "technologyreview.com": 0.9,
+         "theverge.com": 0.85,
+         "cnet.com": 0.85,
+         "engadget.com": 0.85
+     },
+     "fact_checking": {
+         "snopes.com": 0.95,
+         "politifact.com": 0.9,
+         "factcheck.org": 0.9,
+         "apnews.com/hub/fact-check": 0.95,
+         "reuters.com/fact-check": 0.95
+     }
+ }
+
+ def detect_claim_category(claim: str) -> Tuple[str, float]:
+     """
+     Detect the most likely category of a claim and its confidence score
+
+     Args:
+         claim (str): The claim text
+
+     Returns:
+         tuple: (category_name, confidence_score)
+     """
+     if not claim:
+         return "general", 0.3
+
+     # Lowercase for better matching
+     claim_lower = claim.lower()
+
+     # Count matches for each category
+     category_scores = {}
+
+     for category, keywords in CLAIM_CATEGORIES.items():
+         # Count whole-word keyword hits; deduplicate so repeated list entries are not double-counted
+         matches = sum(1 for keyword in {k.lower() for k in keywords}
+                       if re.search(r"\b" + re.escape(keyword) + r"\b", claim_lower))
+
+         # Calculate a simple score based on matches
+         if matches > 0:
+             # Calculate a more significant score based on number of matches
+             score = min(0.9, 0.3 + (matches * 0.1))  # Base 0.3 + 0.1 per match, max 0.9
+             category_scores[category] = score
+
+     # Find category with highest score
+     if not category_scores:
+         return "general", 0.3
+
+     top_category = max(category_scores.items(), key=lambda x: x[1])
+     category_name, confidence = top_category
+
+     # If the top score is too low, return general
+     if confidence < 0.3:
+         return "general", 0.3
+
+     return category_name, confidence
+
+ def get_topic_specific_sources(claim: str, existing_sources: Dict) -> Dict:
+     """
+     Enrich existing sources dict with topic-specific sources
+
+     Args:
+         claim (str): The claim text
+         existing_sources (dict): Current sources configuration
+
+     Returns:
+         dict: Updated sources with topic-specific priorities
+     """
+     # Detect claim category
+     category, confidence = detect_claim_category(claim)
+     logger.info(f"Claim category detected: {category} (confidence: {confidence:.2f})")
+
+     # If confidence is low, keep existing sources
+     if confidence < 0.4:
+         return existing_sources
+
+     # Get specific feeds for the category
+     category_feeds = CATEGORY_SPECIFIC_FEEDS.get(category, [])
+
+     # Only proceed if we have category-specific feeds
+     if not category_feeds:
+         return existing_sources
+
+     # Create a new sources dictionary with category-specific modifications
+     updated_sources = existing_sources.copy()
+
+     # If we have feeds for this category, add them to the list
+     # and prioritize them by putting them first in RSS feeds
+     if category in CATEGORY_SPECIFIC_FEEDS:
+         # Add up to 5 category-specific RSS feeds (if we have them)
+         category_feeds_sample = category_feeds[:5]
+
+         # Add or update source reliability data
+         if category in SOURCE_RELIABILITY_BY_CATEGORY:
+             updated_sources["source_credibility"] = updated_sources.get("source_credibility", {})
+             for domain, reliability in SOURCE_RELIABILITY_BY_CATEGORY[category].items():
+                 updated_sources["source_credibility"][domain] = reliability
+
+         # Return updated sources with prioritized feeds
+         return {
+             "category": category,
+             "confidence": confidence,
+             "rss_feeds": category_feeds_sample + (updated_sources.get("rss_feeds", []) or []),
+             "source_credibility": updated_sources.get("source_credibility", {})
+         }
+
+     return existing_sources
+
+ def get_prioritized_sources(claim: str, claim_category: Optional[str] = None) -> Dict[str, Any]:
+     """
+     Get prioritized sources for a claim based on its category
+
+     Args:
+         claim (str): The claim to check
+         claim_category (str, optional): Override detected category
+
+     Returns:
+         dict: Dictionary with source types prioritized by relevance
+     """
+     # Detect category if not provided
+     if not claim_category:
+         category, confidence = detect_claim_category(claim)
+     else:
+         category = claim_category
+         confidence = 0.8  # Assume high confidence if category is explicitly provided
+
+     # Log detected category
+     logger.info(f"Using claim category: {category} for source prioritization")
+
+     # Default priorities
+     priorities = {
+         "primary": ["wikipedia", "news", "claimreview"],
+         "secondary": ["rss", "scholarly", "wikidata"]
+     }
+
+     # Needs recent evidence check: match whole words/phrases rather than bare substrings
+     temporal_terms = ["is", "are", "remains", "continues", "still", "currently",
+                       "now", "today", "recent", "latest"]
+     negation_terms = ["not", "no longer", "isn't", "aren't", "doesn't", "don't",
+                       "can't", "cannot", "anymore"]
+
+     claim_lower = claim.lower()
+     requires_recent = any(re.search(r"\b" + re.escape(term) + r"\b", claim_lower)
+                           for term in temporal_terms + negation_terms)
+
+     # Adjust priorities based on category
+     if category == "science":
+         if requires_recent:
+             priorities = {
+                 "primary": ["scholarly", "rss", "wikipedia"],
+                 "secondary": ["news", "claimreview", "wikidata"]
+             }
+         else:
+             priorities = {
+                 "primary": ["scholarly", "wikipedia", "rss"],
+                 "secondary": ["claimreview", "news", "wikidata"]
+             }
+
+     elif category == "technology":
+         if requires_recent:
+             priorities = {
+                 "primary": ["rss", "news", "scholarly"],
+                 "secondary": ["wikipedia", "claimreview", "wikidata"]
+             }
+         else:
+             priorities = {
+                 "primary": ["news", "scholarly", "wikipedia"],
+                 "secondary": ["rss", "claimreview", "wikidata"]
+             }
+
+     elif category == "politics":
+         if requires_recent:
+             priorities = {
+                 "primary": ["rss", "news", "claimreview"],
+                 "secondary": ["wikipedia", "wikidata", "scholarly"]
+             }
+         else:
+             priorities = {
+                 "primary": ["claimreview", "news", "wikipedia"],
+                 "secondary": ["rss", "wikidata", "scholarly"]
+             }
+
+     elif category == "business" or category == "world":
+         if requires_recent:
+             priorities = {
+                 "primary": ["rss", "news", "wikipedia"],
+                 "secondary": ["claimreview", "wikidata", "scholarly"]
+             }
+         else:
+             priorities = {
+                 "primary": ["news", "wikipedia", "rss"],
+                 "secondary": ["claimreview", "wikidata", "scholarly"]
+             }
+
+     elif category == "sports":
+         if requires_recent:
+             priorities = {
+                 "primary": ["rss", "news", "wikipedia"],
+                 "secondary": ["wikidata", "claimreview", "scholarly"]
+             }
+         else:
+             priorities = {
+                 "primary": ["wikipedia", "news", "rss"],
+                 "secondary": ["wikidata", "claimreview", "scholarly"]
+             }
+
+     elif category == "entertainment":
+         if requires_recent:
+             priorities = {
+                 "primary": ["rss", "news", "claimreview"],
+                 "secondary": ["wikipedia", "wikidata", "scholarly"]
+             }
+         else:
+             priorities = {
+                 "primary": ["news", "wikipedia", "claimreview"],
+                 "secondary": ["rss", "wikidata", "scholarly"]
+             }
+
+     # Add category and confidence for reference
+     priorities["category"] = category
+     priorities["confidence"] = confidence
+     priorities["requires_recent"] = requires_recent
+
+     return priorities
+
+ def get_category_specific_rss_feeds(category: str, max_feeds: int = 5) -> List[str]:
+     """
+     Get a list of RSS feeds specific to a category
+
+     Args:
+         category (str): The claim category
+         max_feeds (int): Maximum number of feeds to return
+
+     Returns:
+         list: List of RSS feed URLs
+     """
+     # Get category-specific feeds
+     category_feeds = CATEGORY_SPECIFIC_FEEDS.get(category, [])
+
+     # Limit to max_feeds
+     return category_feeds[:max_feeds]
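To make the scoring concrete, a short usage sketch (the claim text and printed values are illustrative; the exact score depends on which keywords match):

category, confidence = detect_claim_category("Apple unveiled a new MacBook with a faster chip")
# e.g. ("technology", 0.6): base 0.3 plus 0.1 per matched keyword
# ("Apple", "MacBook", "chip"), capped at 0.9

feeds = get_category_specific_rss_feeds(category, max_feeds=3)  # first 3 technology feeds
priorities = get_prioritized_sources("Apple unveiled a new MacBook with a faster chip")
# for a non-time-sensitive technology claim, roughly:
# {"primary": ["news", "scholarly", "wikipedia"], "secondary": [...], "category": "technology", ...}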
modules/claim_extraction.py ADDED
@@ -0,0 +1,234 @@
1
+ import logging
2
+ import time
3
+ import re
4
+ from langdetect import detect
5
+ import spacy
6
+
7
+ from utils.performance import PerformanceTracker
8
+ from utils.models import get_nlp_model, get_llm_model
9
+
10
+ logger = logging.getLogger("misinformation_detector")
11
+
12
+ performance_tracker = PerformanceTracker()
13
+
14
+ def extract_claims(text):
15
+ """
16
+ Extract the main factual claim from the provided text.
17
+ For concise claims (<20 words), preserves them exactly.
18
+ For longer text, uses OpenAI to extract the claim.
19
+ """
20
+ logger.info(f"Extracting claims from: {text}")
21
+ start_time = time.time()
22
+
23
+ # First, check if the input already appears to be a concise claim
24
+ if len(text.split()) < 20:
25
+ logger.info("Input appears to be a concise claim already, preserving as-is")
26
+ performance_tracker.log_processing_time(start_time)
27
+ performance_tracker.log_claim_processed()
28
+ return text
29
+
30
+ try:
31
+ # For longer text, use OpenAI for extraction
32
+ extracted_claim = extract_with_openai(text)
33
+
34
+ # Log processing time
35
+ performance_tracker.log_processing_time(start_time)
36
+ performance_tracker.log_claim_processed()
37
+
38
+ logger.info(f"Extracted claim: {extracted_claim}")
39
+ return extracted_claim
40
+ except Exception as e:
41
+ logger.error(f"Error extracting claims: {str(e)}")
42
+ # Fallback to original text on error
43
+ return text
44
+
45
+ def extract_with_openai(text):
46
+ """
47
+ Use OpenAI model for claim extraction
48
+ """
49
+ try:
50
+ # Get LLM model
51
+ llm_model = get_llm_model()
52
+
53
+ # Create a very explicit prompt to avoid hallucination
54
+ prompt = f"""
55
+ Extract the main factual claim from the following text.
56
+ DO NOT add any information not present in the original text.
57
+ DO NOT add locations, dates, or other details.
58
+ ONLY extract what is explicitly stated.
59
+
60
+ Text: {text}
61
+
62
+ Main factual claim:
63
+ """
64
+
65
+ # Call OpenAI with temperature=0 for deterministic output
66
+ response = llm_model.invoke(prompt, temperature=0)
67
+ extracted_claim = response.content.strip()
68
+
69
+ # Further clean up any explanations or extra text
70
+ if ":" in extracted_claim:
71
+ parts = extracted_claim.split(":")
72
+ if len(parts) > 1:
73
+ extracted_claim = parts[-1].strip()
74
+
75
+ logger.info(f"OpenAI extraction: {extracted_claim}")
76
+
77
+ # Validate that we're not adding info not in the original
78
+ nlp = get_nlp_model()
79
+ extracted_claim = validate_extraction(text, extracted_claim, nlp)
80
+
81
+ return extracted_claim
82
+ except Exception as e:
83
+ logger.error(f"Error in OpenAI claim extraction: {str(e)}")
84
+ return text # Fallback to original
85
+
86
+ def validate_extraction(original_text, extracted_claim, nlp):
87
+ """
88
+ Validate that the extracted claim doesn't add information not present in the original text
89
+ """
90
+ # If extraction fails or is empty, return original
91
+ if not extracted_claim or extracted_claim.strip() == "":
92
+ logger.warning("Empty extraction result, using original text")
93
+ return original_text
94
+
95
+ # Check for added location information
96
+ location_terms = ["united states", "america", "u.s.", "usa", "china", "india", "europe",
97
+ "russia", "japan", "uk", "germany", "france", "australia"]
98
+ for term in location_terms:
99
+ if term in extracted_claim.lower() and term not in original_text.lower():
100
+ logger.warning(f"Extraction added location '{term}' not in original, using original text")
101
+ return original_text
102
+
103
+ # Check for entity preservation/addition using spaCy
104
+ try:
105
+ # Get entities from extracted text
106
+ extracted_doc = nlp(extracted_claim)
107
+ extracted_entities = [ent.text.lower() for ent in extracted_doc.ents]
108
+
109
+ # Get entities from original text
110
+ original_doc = nlp(original_text)
111
+ original_entities = [ent.text.lower() for ent in original_doc.ents]
112
+
113
+ # Check for new entities that don't exist in original
114
+ for entity in extracted_entities:
115
+ if not any(entity in orig_entity or orig_entity in entity for orig_entity in original_entities):
116
+ logger.warning(f"Extraction added new entity '{entity}', using original text")
117
+ return original_text
118
+
119
+ return extracted_claim
120
+ except Exception as e:
121
+ logger.error(f"Error in extraction validation: {str(e)}")
122
+ return original_text # On error, safer to return original
123
+
124
+ def shorten_claim_for_evidence(claim):
125
+ """
126
+ Shorten a claim to use for evidence retrieval by preserving important keywords
127
+ while maintaining claim context
128
+ """
129
+ try:
130
+ # Get NLP model
131
+ nlp = get_nlp_model()
132
+
133
+ # Use NER to extract key entities
134
+ doc = nlp(claim)
135
+
136
+ # Extract all entities for search
137
+ entities = [ent.text for ent in doc.ents]
138
+
139
+ # Extract key proper nouns, entities, and important context words
140
+ important_words = []
141
+
142
+ # Add all named entities
143
+ for ent in doc.ents:
144
+ important_words.append(ent.text)
145
+
146
+ # Add important nouns and adjectives not already added
147
+ for token in doc:
148
+ if token.pos_ in ["NOUN", "PROPN"] and token.text not in important_words:
149
+ important_words.append(token.text)
150
+
151
+ # Make sure we include key terms like "prime minister", "president", etc.
152
+ title_terms = ["president", "prime minister", "minister", "chancellor", "premier", "governor", "mayor", "senator"]
153
+ for term in title_terms:
154
+ if term in claim.lower() and not any(term in word.lower() for word in important_words):
155
+ # Find the full phrase (e.g., "Canadian Prime Minister")
156
+ matches = re.finditer(r'(?i)(?:\w+\s+)*\b' + re.escape(term) + r'\b(?:\s+\w+)*', claim)
157
+ for match in matches:
158
+ phrase = match.group(0)
159
+ if phrase not in important_words:
160
+ important_words.append(phrase)
161
+
162
+ # Add country names or important place references
163
+ country_terms = ["canada", "canadian", "us", "united states", "american", "uk", "british", "australia", "china", "russian"]
164
+ for term in country_terms:
165
+ if term in claim.lower() and not any(term in word.lower() for word in important_words):
166
+ for token in doc:
167
+ if token.text.lower() == term and token.text not in important_words:
168
+ important_words.append(token.text)
169
+
170
+ # Always include negation words as they're critical for meaning
171
+ negation_terms = ["not", "no longer", "former", "ex-", "isn't", "aren't", "doesn't", "don't"]
172
+ negation_found = False
173
+ for term in negation_terms:
174
+ if term in claim.lower():
175
+ # Find the context around the negation (3 words before and after)
176
+ matches = re.finditer(r'(?i)(?:\w+\s+){0,3}\b' + re.escape(term) + r'\b(?:\s+\w+){0,3}', claim)
177
+ for match in matches:
178
+ phrase = match.group(0)
179
+ if phrase not in important_words:
180
+ important_words.append(phrase)
181
+ negation_found = True
182
+
183
+ # Special handling for time-sensitive claims with negations
184
+ is_time_sensitive = any(term in claim.lower() for term in ["anymore", "still", "currently", "now", "today", "recent"])
185
+
186
+ # If we have both negation and time sensitivity, ensure we keep those key aspects
187
+ if negation_found and is_time_sensitive:
188
+ # Ensure we keep time-sensitive terms
189
+ time_terms = ["anymore", "still", "currently", "now", "today", "recent"]
190
+ for term in time_terms:
191
+ if term in claim.lower() and not any(term in word.lower() for word in important_words):
192
+ # Add the context around the time term
193
+ matches = re.finditer(r'(?i)(?:\w+\s+){0,2}\b' + re.escape(term) + r'\b(?:\s+\w+){0,2}', claim)
194
+ for match in matches:
195
+ phrase = match.group(0)
196
+ if phrase not in important_words:
197
+ important_words.append(phrase)
198
+
199
+ # If entities plus titles don't give us enough, include key parts of claim
200
+ if len(entities) < 2 and not any("minister" in word.lower() for word in important_words):
201
+ words = claim.split()
202
+ # Use first 8 words
203
+ return " ".join(words[:min(8, len(words))])
204
+
205
+ # Remove duplicates while preserving order
206
+ seen = set()
207
+ unique_terms = []
208
+ for word in important_words:
209
+ if word.lower() not in seen:
210
+ seen.add(word.lower())
211
+ unique_terms.append(word)
212
+
213
+ # Ensure we have a reasonable number of search terms (maintain more for complex claims)
214
+ search_terms = unique_terms[:min(6, len(unique_terms))]
215
+
216
+ # Sort search terms to try to maintain original word order from claim
217
+ def get_position(term):
218
+ return claim.lower().find(term.lower())
219
+
220
+ search_terms.sort(key=get_position)
221
+
222
+ # Join terms to create search query
223
+ shortened_claim = " ".join(search_terms)
224
+
225
+ # If the shortened claim is too short compared to original, use more of original
226
+ if len(shortened_claim.split()) < 3 and len(claim.split()) > 5:
227
+ words = claim.split()
228
+ shortened_claim = " ".join(words[:min(8, len(words))])
229
+
230
+ logger.info(f"Shortened Claim: {shortened_claim}")
231
+
232
+ return shortened_claim
233
+ except Exception as e:
234
+ logger.error(f"Error in shortening claim: {str(e)}")
235
+ # Return original claim on error
236
+ return claim
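
For orientation, here is a minimal usage sketch of shorten_claim_for_evidence. The printed query is hypothetical; the actual terms depend on what the spaCy pipeline behind get_nlp_model() recognizes.

# Illustrative only; output varies with the loaded NER model
from modules.claim_extraction import shorten_claim_for_evidence

query = shorten_claim_for_evidence("Justin Trudeau is not the Canadian Prime Minister anymore")
# Entities, the "Prime Minister" title phrase, and the negation/time context
# are all retained, so the query could come out roughly as:
# "Justin Trudeau not the Canadian Prime Minister anymore"
print(query)
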
modules/classification.py ADDED
@@ -0,0 +1,521 @@
+ import logging
+ import re
+ from utils.models import get_llm_model
+ from utils.performance import PerformanceTracker
+ 
+ logger = logging.getLogger("misinformation_detector")
+ 
+ performance_tracker = PerformanceTracker()
+ 
+ def classify_with_llm(claim, evidence):
+     """
+     Optimized classification function that handles evidence classification
+     and verdict generation in a single LLM call with robust parsing
+     """
+     logger.info(f"Classifying evidence for claim: {claim}")
+ 
+     # Get the LLM model
+     llm_model = get_llm_model()
+ 
+     # Skip if no evidence
+     if not evidence:
+         logger.warning("No evidence provided for classification")
+         return []
+ 
+     # Normalize evidence to a list
+     if not isinstance(evidence, list):
+         if evidence:
+             try:
+                 evidence = [evidence]
+             except Exception as e:
+                 logger.error(f"Could not convert evidence to list: {e}")
+                 return []
+         else:
+             return []
+ 
+     # Does the claim contain strong assertions that require specific evidence?
+     strong_assertion_markers = [
+         "solved", "cured", "discovered", "confirmed", "proven", "definitive",
+         "breakthrough", "revolutionary", "successfully", "first ever", "extends",
+         "conclusive", "unprecedented", "remarkable", "definitively"
+     ]
+ 
+     # Check if the claim contains strong assertions that would require specific supporting evidence
+     contains_strong_assertions = any(marker in claim.lower() for marker in strong_assertion_markers)
+ 
+     # Limit to top 5 evidence items to reduce token usage
+     evidence = evidence[:5]
+ 
+     try:
+         # Format evidence items
+         evidence_text = ""
+         for idx, chunk in enumerate(evidence):
+             # Truncate long evidence
+             chunk_text = str(chunk)
+             if len(chunk_text) > 300:
+                 chunk_text = chunk_text[:297] + "..."
+ 
+             evidence_text += f"EVIDENCE {idx+1}:\n{chunk_text}\n\n"
+ 
+         # Create a structured prompt with explicit formatting instructions.
+         # Adjust instructions based on claim characteristics.
+         if contains_strong_assertions:
+             prompt = f"""
+ CLAIM: {claim}
+ 
+ EVIDENCE:
+ {evidence_text}
+ 
+ TASK: Evaluate if the evidence supports, contradicts, or is irrelevant to the claim.
+ 
+ IMPORTANT CONTEXT: This claim makes strong assertions that require specific supporting evidence.
+ 
+ When evaluating such claims:
+ 1. Strong assertions require strong, direct evidence - look for specific confirmation from credible sources
+ 2. General information about the topic is not sufficient to support specific assertions
+ 3. Evidence of ongoing work or research is not sufficient to support claims of completion or success
+ 4. If the evidence doesn't directly confirm the specific assertion, classify it as "insufficient" rather than "support"
+ 
+ INSTRUCTIONS:
+ 1. For each evidence, provide your analysis in EXACTLY this format:
+ 
+ EVIDENCE 1 ANALYSIS:
+ Relevance: [relevant/irrelevant]
+ Classification: [support/contradict/insufficient/irrelevant]
+ Confidence: [number between 0-100]
+ Reason: [brief explanation focusing on whether evidence directly confirms the specific assertion]
+ 
+ 2. After analyzing all evidence pieces, provide a final verdict in this format:
+ 
+ FINAL VERDICT: [clear statement if evidence collectively supports or contradicts the claim]
+ 
+ Without specific, direct supporting evidence, default to "The evidence does not support the claim" rather than "insufficient evidence."
+ 
+ CRITICAL INSTRUCTION: FOCUS ON THE EXACT CLAIM. Evaluate ONLY the specific claim, not related topics
+ """
+         else:
+             prompt = f"""
+ CLAIM: {claim}
+ 
+ EVIDENCE:
+ {evidence_text}
+ 
+ TASK: Evaluate if the evidence supports, contradicts, or is irrelevant to the claim.
+ 
+ INSTRUCTIONS:
+ 1. For each evidence, provide your analysis in EXACTLY this format:
+ 
+ EVIDENCE 1 ANALYSIS:
+ Relevance: [relevant/irrelevant]
+ Classification: [support/contradict/insufficient/irrelevant]
+ Confidence: [number between 0-100]
+ Reason: [brief explanation]
+ 
+ 2. After analyzing all evidence pieces, provide a final verdict in this format:
+ 
+ FINAL VERDICT: [clear statement if evidence collectively supports or contradicts the claim]
+ 
+ CRITICAL INSTRUCTION: FOCUS ON THE EXACT CLAIM. Evaluate ONLY the specific claim, not related topics
+ """
+ 
+         # Get response with temperature=0 for consistency
+         result = llm_model.invoke(prompt, temperature=0)
+         result_text = result.content.strip()
+ 
+         # Extract the final verdict first since it's most important
+         final_verdict = None
+         final_match = re.search(r'FINAL VERDICT:\s*(.*?)(?=\s*$|\n\n)', result_text, re.DOTALL | re.IGNORECASE)
+         if final_match:
+             final_verdict = final_match.group(1).strip()
+             logger.info(f"Final assessment: {final_verdict}")
+ 
+         # Define a precise regex pattern matching the requested format
+         analysis_pattern = r'EVIDENCE\s+(\d+)\s+ANALYSIS:\s*\n+Relevance:\s*(relevant|irrelevant)\s*\n+Classification:\s*(support|contradict|neutral|irrelevant|insufficient)\s*\n+Confidence:\s*(\d+)\s*\n+Reason:\s*(.*?)(?=\s*EVIDENCE\s+\d+\s+ANALYSIS:|\s*FINAL VERDICT:|\s*$)'
+ 
+         # Parse each evidence analysis
+         classification_results = []
+         matched_evidence = set()
+ 
+         # Try matching with our strict pattern first
+         matches = list(re.finditer(analysis_pattern, result_text, re.IGNORECASE | re.DOTALL))
+ 
+         # If no matches, try a more flexible pattern
+         if not matches:
+             flexible_pattern = r'(?:EVIDENCE|Evidence)\s+(\d+)(?:\s+ANALYSIS)?:?\s*\n+(?:Relevance|relevance):\s*(relevant|irrelevant|unknown)\s*\n+(?:Classification|classification):\s*(support|contradict|neutral|irrelevant|insufficient|unknown)\s*\n+(?:Confidence|confidence):\s*(\d+)\s*\n+(?:Reason|reason|Brief reason):\s*(.*?)(?=\s*(?:EVIDENCE|Evidence)\s+\d+|FINAL VERDICT:|$)'
+             matches = list(re.finditer(flexible_pattern, result_text, re.IGNORECASE | re.DOTALL))
+ 
+         # Process matches
+         for match in matches:
+             try:
+                 evidence_idx = int(match.group(1)) - 1
+                 relevance = match.group(2).lower()
+                 classification = match.group(3).lower()
+                 confidence = int(match.group(4))
+                 reason = match.group(5).strip()
+ 
+                 # Normalize classification terms
+                 if classification == "neutral":
+                     classification = "insufficient"
+ 
+                 # For strong assertions, apply confidence adjustments based on classification
+                 if contains_strong_assertions:
+                     if classification == "support":
+                         # Check if the reasoning indicates direct or indirect support
+                         indirect_support_markers = ["general", "doesn't directly", "does not directly",
+                                                     "doesn't specifically", "not specific", "related to",
+                                                     "doesn't confirm"]
+                         if any(marker in reason.lower() for marker in indirect_support_markers):
+                             # Downgrade support confidence for indirect evidence
+                             confidence = max(5, confidence - 20)
+                     elif classification == "contradict":
+                         # For contradictions of strong assertions, slightly boost confidence
+                         confidence = min(95, confidence + 5)
+ 
+                 # Ensure index is valid
+                 if 0 <= evidence_idx < len(evidence):
+                     matched_evidence.add(evidence_idx)
+ 
+                     # Create result entry
+                     classification_results.append({
+                         "label": classification,
+                         "confidence": confidence / 100.0,
+                         "evidence": evidence[evidence_idx],
+                         "relevance": relevance,
+                         "reason": reason,
+                         "final_assessment": final_verdict
+                     })
+             except (ValueError, IndexError) as e:
+                 logger.error(f"Error parsing evidence analysis: {e}")
+ 
+         # Handle any unmatched evidence items
+         if matches:  # Only add defaults if we successfully matched some
+             for idx, ev in enumerate(evidence):
+                 if idx not in matched_evidence:
+                     # Check if the evidence text itself suggests a classification
+                     contains_support = bool(re.search(r'support|confirm|verify|true|correct|released', final_verdict or "", re.IGNORECASE))
+                     contains_contradicting = bool(re.search(r'not yet|hasn\'t|have not|doesn\'t|don\'t|cannot|preliminary|proposed', str(ev).lower()))
+ 
+                     # For claims with strong assertions without explicit evidence, be more cautious
+                     if contains_strong_assertions:
+                         if contains_contradicting:
+                             label = "contradict"
+                             confidence = 0.6
+                         elif contains_support:
+                             label = "insufficient"  # Default to insufficient for strong assertions without clear analysis
+                             confidence = 0.5
+                         else:
+                             label = "insufficient"
+                             confidence = 0.5
+                     else:
+                         label = "support" if contains_support else "unknown"
+                         confidence = 0.7 if contains_support else 0.5
+ 
+                     classification_results.append({
+                         "label": label,
+                         "confidence": confidence,
+                         "evidence": ev,
+                         "relevance": "relevant" if (contains_support or contains_contradicting) else "unknown",
+                         "reason": "Based on overall assessment",
+                         "final_assessment": final_verdict
+                     })
+         else:
+             # No structured parsing worked, so use the final verdict to create simple results
+             contains_support = bool(re.search(r'support|confirm|verify|true|correct|released', final_verdict or "", re.IGNORECASE))
+             contains_contradict = bool(re.search(r'contradict|against|false|incorrect|not support|does not support|insufficient evidence|does not confirm|no evidence', final_verdict or "", re.IGNORECASE))
+             contains_insufficient = bool(re.search(r'insufficient|not enough|cannot determine|no evidence|lack of evidence', final_verdict or "", re.IGNORECASE))
+ 
+             # For claims with strong assertions, be more stringent
+             if contains_strong_assertions:
+                 if contains_support and not contains_insufficient and not contains_contradict:
+                     label = "support"
+                     confidence = 0.6  # Lower confidence even for support of strong assertions
+                 elif contains_contradict:
+                     label = "contradict"
+                     confidence = 0.8  # Higher confidence for contradiction of strong assertions
+                 else:
+                     label = "insufficient"
+                     confidence = 0.7  # Good confidence for insufficient judgment
+             else:
+                 label = "support" if contains_support else "contradict" if contains_contradict else "unknown"
+                 confidence = 0.7 if (contains_support or contains_contradict) else 0.5
+ 
+             # Create basic results based on final verdict
+             for ev in evidence:
+                 classification_results.append({
+                     "label": label,
+                     "confidence": confidence,
+                     "evidence": ev,
+                     "relevance": "relevant" if (contains_support or contains_contradict) else "unknown",
+                     "reason": final_verdict or "Based on collective evidence",
+                     "final_assessment": final_verdict
+                 })
+ 
+         logger.info(f"Classified {len(classification_results)} evidence items")
+         return classification_results
+ 
+     except Exception as e:
+         logger.error(f"Error in evidence classification: {str(e)}")
+         # Provide a basic fallback that checks for keywords in evidence
+         try:
+             fallback_results = []
+             for ev in evidence:
+                 ev_text = str(ev).lower()
+                 supports = False
+                 contradicts = False
+ 
+                 # Basic keyword checking as last resort
+                 if claim.lower() in ev_text:
+                     keywords = [word for word in claim.lower().split() if len(word) > 3]
+                     matching_keywords = [k for k in keywords if k in ev_text]
+ 
+                     # If substantial keywords match, consider it support
+                     supports = len(matching_keywords) >= max(1, len(keywords) // 2)
+ 
+                 # Check for contradiction terms
+                 contradiction_terms = ["not yet", "hasn't", "haven't", "cannot", "can't",
+                                        "doesn't", "don't", "no evidence", "insufficient",
+                                        "preliminary", "proposed", "in development", "future"]
+                 contradicts = any(term in ev_text for term in contradiction_terms)
+ 
+                 # For claims with strong assertions, be more conservative in the fallback case
+                 if contains_strong_assertions:
+                     if contradicts:
+                         fallback_results.append({
+                             "label": "contradict",
+                             "confidence": 0.6,
+                             "evidence": ev,
+                             "relevance": "relevant",
+                             "reason": "Evidence suggests the claim is not yet proven (fallback method)"
+                         })
+                     elif supports:
+                         fallback_results.append({
+                             "label": "insufficient",
+                             "confidence": 0.6,
+                             "evidence": ev,
+                             "relevance": "relevant",
+                             "reason": "Evidence is related but doesn't conclusively confirm the assertion (fallback method)"
+                         })
+                     else:
+                         fallback_results.append({
+                             "label": "unknown",
+                             "confidence": 0.5,
+                             "evidence": ev,
+                             "relevance": "unknown",
+                             "reason": "Cannot determine relevance (fallback method)"
+                         })
+                 else:
+                     fallback_results.append({
+                         "label": "support" if supports else "unknown",
+                         "confidence": 0.6 if supports else 0.5,
+                         "evidence": ev,
+                         "relevance": "relevant" if supports else "unknown",
+                         "reason": "Based on keyword matching (fallback method)"
+                     })
+ 
+             return fallback_results
+         except Exception:
+             # Absolute last resort
+             return [{"label": "unknown", "confidence": 0.5, "evidence": ev} for ev in evidence]
+ 
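
To make the parsing contract concrete, here is a minimal sketch of the response shape the strict analysis_pattern above is built to match. The sample response text is invented for illustration; only the regex is taken from the module.

import re

sample_response = """EVIDENCE 1 ANALYSIS:
Relevance: relevant
Classification: support
Confidence: 85
Reason: The article directly reports the event described in the claim.

FINAL VERDICT: The evidence collectively supports the claim."""

analysis_pattern = (
    r'EVIDENCE\s+(\d+)\s+ANALYSIS:\s*\n+Relevance:\s*(relevant|irrelevant)\s*\n+'
    r'Classification:\s*(support|contradict|neutral|irrelevant|insufficient)\s*\n+'
    r'Confidence:\s*(\d+)\s*\n+Reason:\s*(.*?)'
    r'(?=\s*EVIDENCE\s+\d+\s+ANALYSIS:|\s*FINAL VERDICT:|\s*$)'
)

for m in re.finditer(analysis_pattern, sample_response, re.IGNORECASE | re.DOTALL):
    idx, relevance, label, confidence, reason = m.groups()
    print(idx, relevance, label, int(confidence) / 100.0)  # 1 relevant support 0.85
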
+ def aggregate_evidence(classification_results):
+     """
+     Aggregate evidence classifications to determine overall verdict
+     with robust fallback mechanisms for reliable results
+     """
+     logger.info(f"Aggregating evidence from {len(classification_results) if classification_results else 0} results")
+ 
+     if not classification_results:
+         logger.warning("No classification results to aggregate")
+         return "Uncertain", 0.3  # Default with low confidence
+ 
+     # Assess the claim's characteristics (without relying on explicit category detection).
+     # Does the claim contain strong assertions that require specific evidence?
+     strong_assertion_markers = [
+         "solved", "cured", "discovered", "confirmed", "proven", "definitive",
+         "breakthrough", "revolutionary", "successfully", "first ever", "extends",
+         "conclusive", "unprecedented", "remarkable", "definitively"
+     ]
+ 
+     # Check if claim text is available in final assessment
+     claim_text = None
+     claim_has_strong_assertions = False
+ 
+     # Extract claim from final assessment if available
+     for item in classification_results:
+         if "final_assessment" in item and item["final_assessment"]:
+             match = re.search(r'the claim (?:that )?"?([^"]+)"?', item["final_assessment"], re.IGNORECASE)
+             if match:
+                 claim_text = match.group(1)
+                 claim_has_strong_assertions = any(marker in claim_text.lower() for marker in strong_assertion_markers)
+                 break
+ 
+     # If we couldn't extract the claim, check evidence context for assertion indicators
+     if not claim_text:
+         # Check if evidence reasons suggest dealing with strong assertions
+         assertion_context_indicators = ["conclusive evidence", "definitive proof", "solved", "breakthrough",
+                                         "revolutionary", "directly confirms", "specific confirmation"]
+ 
+         reasons = [item.get("reason", "").lower() for item in classification_results if "reason" in item]
+         assertion_indicators_count = sum(1 for indicator in assertion_context_indicators
+                                          for reason in reasons if indicator in reason)
+ 
+         claim_has_strong_assertions = assertion_indicators_count >= 2
+ 
+     # Extract final assessment if present
+     final_assessment = None
+     for item in classification_results:
+         if "final_assessment" in item and item["final_assessment"]:
+             final_assessment = item["final_assessment"]
+             break
+ 
+     # Count evidence by classification
+     support_items = [item for item in classification_results if item.get("label") == "support"]
+     contradict_items = [item for item in classification_results if item.get("label") == "contradict"]
+     insufficient_items = [item for item in classification_results if item.get("label") in ["insufficient", "neutral"]]
+     relevant_items = [item for item in classification_results
+                       if item.get("relevance") == "relevant" or item.get("label") in ["support", "contradict"]]
+ 
+     # Calculate the proportion of supported evidence
+     total_relevant = len(relevant_items)
+ 
+     # Direct keyword detection from final assessment or evidence
+     if final_assessment:
+         # Check for support indicators in final assessment
+         supports_pattern = r'\b(support|confirm|verify|true|correct|released|proves|validates|evidence (?:that |for |of )(?:the claim|it) is true)\b'
+         contradicts_pattern = r'\b(contradict|refute|deny|false|incorrect|not released|doesn\'t support|does not support|no evidence|cannot support|is not true|evidence (?:that |for |of )(?:the claim|it) is false)\b'
+         insufficient_pattern = r'\b(uncertain|insufficient|not enough|inconclusive|cannot determine|unable to determine|lack of evidence)\b'
+ 
+         supports_match = re.search(supports_pattern, final_assessment, re.IGNORECASE)
+         contradicts_match = re.search(contradicts_pattern, final_assessment, re.IGNORECASE)
+         insufficient_match = re.search(insufficient_pattern, final_assessment, re.IGNORECASE)
+ 
+         # Direct determination based on final assessment keywords
+         if supports_match and not contradicts_match and not insufficient_match:
+             # Get max confidence from supporting evidence
+             confidence = max([item.get("confidence", 0) for item in support_items]) if support_items else 0.7
+ 
+             # Adjust confidence for claims with strong assertions
+             if claim_has_strong_assertions:
+                 confidence = min(confidence, 0.8)  # Cap confidence for strong assertions
+ 
+             return "True (Based on Evidence)", max(0.6, confidence)  # Minimum 0.6 confidence
+ 
+         if contradicts_match and not supports_match:
+             # Get max confidence from contradicting evidence
+             confidence = max([item.get("confidence", 0) for item in contradict_items]) if contradict_items else 0.7
+ 
+             # For claims with strong assertions, increase confidence in contradiction
+             if claim_has_strong_assertions:
+                 confidence = max(confidence, 0.7)  # Minimum 0.7 confidence for contradicting strong assertions
+ 
+             return "False (Based on Evidence)", max(0.6, confidence)  # Minimum 0.6 confidence
+ 
+         if insufficient_match:
+             # For claims with strong assertions without confirming evidence,
+             # change "Uncertain" to a clearer negative verdict
+             if claim_has_strong_assertions:
+                 return "False (Based on Evidence)", 0.7
+             return "Uncertain", 0.4  # Medium-low confidence
+ 
+     # If we have distinct classifications, weigh them by confidence and quantity
+     if support_items and (not contradict_items or all(item.get("confidence", 0) < 0.95 for item in contradict_items)):
+         # Check if there's high confidence support evidence (greater than 0.95)
+         high_confidence_support = [item for item in support_items if item.get("confidence", 0) > 0.95]
+ 
+         if high_confidence_support:
+             # High confidence support evidence exists, use it even if there are some contradictions
+             confidence = max([item.get("confidence", 0) for item in high_confidence_support])
+             # For claims with strong assertions, be more conservative with pure support
+             if claim_has_strong_assertions:
+                 confidence = min(confidence, 0.8)
+             return "True (Based on Evidence)", max(0.7, confidence)
+         elif not contradict_items:
+             # All supportive evidence with no contradictions (standard case)
+             confidence = max([item.get("confidence", 0) for item in support_items])
+ 
+             # For claims with strong assertions, be more conservative with pure support
+             if claim_has_strong_assertions:
+                 # For strong assertions with only support but no contradictions, be cautious
+                 confidence = min(confidence, 0.7)
+                 # If the support is from low-quality evidence, consider it uncertain
+                 support_reasons = [item.get("reason", "").lower() for item in support_items]
+                 weak_supports = sum(1 for reason in support_reasons if
+                                     "general information" in reason or
+                                     "doesn't specify" in reason or
+                                     "does not directly" in reason)
+                 if weak_supports / max(1, len(support_items)) > 0.5:
+                     return "Uncertain", 0.6
+ 
+             return "True (Based on Evidence)", max(0.6, confidence)
+ 
+     if contradict_items and not support_items:
+         # All contradicting evidence
+         confidence = max([item.get("confidence", 0) for item in contradict_items])
+ 
+         # For claims with strong assertions, increase confidence in contradiction
+         if claim_has_strong_assertions:
+             confidence = max(confidence, 0.7)
+ 
+         return "False (Based on Evidence)", max(0.6, confidence)
+ 
+     if insufficient_items and len(insufficient_items) > len(support_items) + len(contradict_items):
+         # Mostly insufficient evidence.
+         # For claims with strong assertions and mainly insufficient evidence, lean toward "False"
+         if claim_has_strong_assertions:
+             return "False (Based on Evidence)", 0.7
+         return "Uncertain", 0.5  # Medium confidence for explicitly uncertain
+ 
+     if support_items and contradict_items:
+         # Competing evidence - compare confidence and quantity
+         support_confidence = max([item.get("confidence", 0) for item in support_items])
+         contradict_confidence = max([item.get("confidence", 0) for item in contradict_items])
+ 
+         # For claims with strong assertions, require stronger support to overcome contradiction
+         if claim_has_strong_assertions:
+             # Higher threshold for strong assertions
+             if support_confidence > contradict_confidence + 0.3:
+                 return "True (Based on Evidence)", support_confidence * 0.9  # Apply a confidence penalty
+             elif contradict_confidence >= support_confidence - 0.1:  # Lower threshold for contradiction
+                 return "False (Based on Evidence)", max(contradict_confidence, 0.7)  # Minimum 0.7 confidence
+             else:
+                 # Default to uncertain for close calls on strong assertions
+                 return "Uncertain", 0.6
+         else:
+             # Standard threshold for regular claims
+             if support_confidence > contradict_confidence + 0.2:
+                 return "True (Based on Evidence)", support_confidence
+             elif contradict_confidence > support_confidence + 0.2:
+                 return "False (Based on Evidence)", contradict_confidence
+             else:
+                 # Close call - check quantity of evidence
+                 if len(support_items) > len(contradict_items) * 2:
+                     return "True (Based on Evidence)", support_confidence * 0.9  # Slight confidence penalty
+                 elif len(contradict_items) > len(support_items) * 2:
+                     return "False (Based on Evidence)", contradict_confidence * 0.9  # Slight confidence penalty
+                 else:
+                     # Truly conflicting evidence
+                     return "Uncertain", 0.5  # Medium confidence
+ 
+     # Check for evidence quality issues
+     all_unknown = all(item.get("label") == "unknown" for item in classification_results)
+     evidence_text = " ".join([str(item.get("evidence", "")) for item in classification_results])
+ 
+     # General case: for any claims with all unknown labels that contain markers of strong assertions
+     if all_unknown and claim_has_strong_assertions:
+         # Absence of clear supporting evidence for claims with strong assertions points to "False"
+         return "False (Based on Evidence)", 0.7
+ 
+     # For general claims, if all items are unknown but evidence clearly mentions the claim
+     if all_unknown:
+         # Examples of direct evidence matching as fallback
+         if re.search(r'\bllama\s*4\b', evidence_text, re.IGNORECASE) and re.search(r'\bmeta\b|\bfacebook\b', evidence_text, re.IGNORECASE) and re.search(r'\breleas', evidence_text, re.IGNORECASE):
+             return "True (Based on Evidence)", 0.7
+         elif re.search(r'\bnot\s+releas', evidence_text, re.IGNORECASE) or re.search(r'\bdenies\b|\bdenied\b', evidence_text, re.IGNORECASE):
+             return "False (Based on Evidence)", 0.7
+ 
+     # Default to uncertain if no clear pattern - but with special case for claims with strong assertions
+     if claim_has_strong_assertions:
+         # For claims with strong assertions with no clear evidence, default to false
+         return "False (Based on Evidence)", 0.7
+ 
+     return "Uncertain", 0.3
modules/evidence_retrieval.py ADDED
@@ -0,0 +1,944 @@
+ import logging
+ import time
+ import re
+ import random
+ import requests
+ import json
+ import ssl
+ from urllib.parse import urlencode
+ from bs4 import BeautifulSoup
+ from SPARQLWrapper import SPARQLWrapper, JSON
+ from datetime import datetime, timedelta
+ from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED
+ 
+ from utils.api_utils import api_error_handler, safe_json_parse
+ from utils.models import get_nlp_model
+ from modules.claim_extraction import shorten_claim_for_evidence, extract_claims
+ from modules.rss_feed import retrieve_evidence_from_rss
+ from modules.semantic_analysis import analyze_evidence_relevance, select_diverse_evidence
+ from config import SOURCE_CREDIBILITY, NEWS_API_KEY, FACTCHECK_API_KEY
+ 
+ # Import the performance tracker
+ from utils.performance import PerformanceTracker
+ performance_tracker = PerformanceTracker()
+ 
+ logger = logging.getLogger("misinformation_detector")
+ 
+ # Define early analysis function at the module level so it's available everywhere
+ def analyze_early_evidence(claim, source_name, source_evidence):
+     """Pre-analyze evidence while waiting for other sources to complete"""
+     try:
+         if not source_evidence:
+             return None
+ 
+         logger.info(f"Pre-analyzing {len(source_evidence)} evidence items from {source_name}")
+ 
+         # Do a quick relevance check using similarity scoring
+         nlp_model = get_nlp_model()
+         claim_doc = nlp_model(claim)
+ 
+         relevant_evidence = []
+         for evidence in source_evidence:
+             if not isinstance(evidence, str):
+                 continue
+ 
+             # Look for direct keyword matches first (fast check)
+             is_related = False
+             keywords = [word.lower() for word in claim.split() if len(word) > 3]
+             for keyword in keywords:
+                 if keyword in evidence.lower():
+                     is_related = True
+                     break
+ 
+             # If no keywords match, do a basic entity check
+             if not is_related:
+                 # Check if claim and evidence share any entities
+                 evidence_doc = nlp_model(evidence[:500])  # Limit for speed
+                 claim_entities = [ent.text.lower() for ent in claim_doc.ents]
+                 evidence_entities = [ent.text.lower() for ent in evidence_doc.ents]
+ 
+                 common_entities = set(claim_entities).intersection(set(evidence_entities))
+                 if common_entities:
+                     is_related = True
+ 
+             if is_related:
+                 relevant_evidence.append(evidence)
+ 
+         logger.info(f"Found {len(relevant_evidence)} relevant items out of {len(source_evidence)} from {source_name}")
+         return relevant_evidence
+     except Exception as e:
+         logger.error(f"Error in early evidence analysis: {e}")
+         return source_evidence  # On error, return original evidence
+ 
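
The entity-overlap fallback above is what catches paraphrased evidence that shares no long keywords with the claim. A standalone sketch of the same idea, assuming a spaCy pipeline such as en_core_web_sm (the project actually uses whatever get_nlp_model() loads):

# Assumes en_core_web_sm is installed; get_nlp_model() may load a different pipeline
import spacy

nlp = spacy.load("en_core_web_sm")
claim_doc = nlp("Justin Trudeau visited Washington")
evidence_doc = nlp("The Canadian PM, Justin Trudeau, arrived in Washington on Monday")

claim_ents = {ent.text.lower() for ent in claim_doc.ents}
evidence_ents = {ent.text.lower() for ent in evidence_doc.ents}
print(claim_ents & evidence_ents)  # shared entities mark the evidence as related
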
+ # New function to get a recent date range for filtering news
+ def get_recent_date_range():
+     """Return date range for recent news filtering - last 3 days"""
+     today = datetime.now()
+     three_days_ago = today - timedelta(days=3)
+     return three_days_ago.strftime('%Y-%m-%d'), today.strftime('%Y-%m-%d')
+ 
+ @api_error_handler("wikipedia")
+ def retrieve_evidence_from_wikipedia(claim):
+     """Retrieve evidence from Wikipedia for a given claim"""
+     logger.info(f"Retrieving evidence from Wikipedia for: {claim}")
+ 
+     # Ensure shortened_claim is a string
+     try:
+         shortened_claim = shorten_claim_for_evidence(claim)
+     except Exception as e:
+         logger.error(f"Error in claim shortening: {e}")
+         shortened_claim = claim  # Fallback to original claim
+ 
+     # Ensure query_parts is a list of strings
+     query_parts = str(shortened_claim).split()
+     evidence = []
+     source_count = {"wikipedia": 0}
+ 
+     for i in range(len(query_parts), 0, -1):  # Start with full query, shorten iteratively
+         try:
+             # Safely join and encode query
+             current_query = "+".join(query_parts[:i])
+             search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={current_query}&format=json"
+             logger.info(f"Wikipedia search URL: {search_url}")
+ 
+             headers = {
+                 "User-Agent": "MisinformationDetectionResearchBot/1.0 (Research Project)"
+             }
+ 
+             # Make the search request with reduced timeout
+             response = requests.get(search_url, headers=headers, timeout=7)
+             response.raise_for_status()
+ 
+             # Safely parse JSON
+             search_data = safe_json_parse(response, "wikipedia")
+ 
+             # Safely extract search results
+             search_results = search_data.get("query", {}).get("search", [])
+ 
+             # Ensure search_results is a list
+             if not isinstance(search_results, list):
+                 logger.warning(f"Unexpected search results type: {type(search_results)}")
+                 search_results = []
+ 
+             # Use ThreadPoolExecutor to fetch page content in parallel
+             with ThreadPoolExecutor(max_workers=3) as executor:
+                 # Submit up to 3 page requests in parallel
+                 futures = []
+                 for idx, result in enumerate(search_results[:3]):
+                     # Ensure result is a dictionary
+                     if not isinstance(result, dict):
+                         logger.warning(f"Skipping non-dictionary result: {type(result)}")
+                         continue
+ 
+                     # Safely extract title
+                     page_title = result.get("title", "")
+                     if not page_title:
+                         continue
+ 
+                     page_url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
+ 
+                     # Submit the page request task to executor
+                     futures.append(executor.submit(
+                         fetch_wikipedia_page_content,
+                         page_url,
+                         page_title,
+                         headers
+                     ))
+ 
+                 # Process completed futures as they finish
+                 for future in as_completed(futures):
+                     try:
+                         page_result = future.result()
+                         if page_result:
+                             evidence.append(page_result)
+                             source_count["wikipedia"] += 1
+                     except Exception as e:
+                         logger.error(f"Error processing Wikipedia page: {e}")
+ 
+             # Stop if we found any evidence
+             if evidence:
+                 break
+ 
+         except Exception as e:
+             logger.error(f"Error retrieving from Wikipedia: {str(e)}")
+             continue
+ 
+     # Ensure success is a boolean
+     success = bool(evidence)
+ 
+     # Safely log evidence retrieval
+     try:
+         performance_tracker.log_evidence_retrieval(success, source_count)
+     except Exception as e:
+         logger.error(f"Error logging evidence retrieval: {e}")
+ 
+     if not evidence:
+         logger.warning("No evidence found from Wikipedia.")
+ 
+     return evidence
+ 
+ def fetch_wikipedia_page_content(page_url, page_title, headers):
+     """Helper function to fetch and parse Wikipedia page content"""
+     try:
+         # Get page content with reduced timeout
+         page_response = requests.get(page_url, headers=headers, timeout=5)
+         page_response.raise_for_status()
+ 
+         # Extract relevant sections using BeautifulSoup
+         soup = BeautifulSoup(page_response.text, 'html.parser')
+         paragraphs = soup.find_all('p', limit=3)  # Limit to first 3 paragraphs
+         content = " ".join([para.get_text(strip=True) for para in paragraphs])
+ 
+         # Truncate content to reduce token usage earlier in the pipeline
+         if len(content) > 300:
+             content = content[:297] + "..."
+ 
+         if content.strip():  # Ensure content is not empty
+             return f"Title: {page_title}, URL: {page_url}, Content: {content}"
+         return None
+     except Exception as e:
+         logger.error(f"Error fetching Wikipedia page {page_url}: {e}")
+         return None
+ 
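
The Wikipedia loop above degrades the query one token at a time until a search succeeds; a tiny illustration with invented terms:

# Invented terms; shows the query sequence the loop would try
query_parts = ["Trudeau", "prime", "minister"]
for i in range(len(query_parts), 0, -1):
    print("+".join(query_parts[:i]))
# Trudeau+prime+minister
# Trudeau+prime
# Trudeau
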
+ # WikiData retrieval, updated to work around SSL certificate issues
+ @api_error_handler("wikidata")
+ def retrieve_evidence_from_wikidata(claim):
+     """Retrieve evidence from WikiData for a given claim"""
+     logger.info(f"Retrieving evidence from WikiData for: {claim}")
+ 
+     # Prepare entities for SPARQL query
+     shortened_claim = shorten_claim_for_evidence(claim)
+     query_terms = shortened_claim.split()
+ 
+     # Initialize SPARQLWrapper for WikiData
+     sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
+ 
+     # Use a more conservative user agent to avoid blocks
+     sparql.addCustomHttpHeader("User-Agent", "MisinformationDetectionResearchBot/1.0")
+ 
+     # Fix SSL issues by disabling SSL verification for this specific request
+     # (a verified alternative is sketched after this function)
+     try:
+         import urllib.request
+ 
+         # Create a context that doesn't verify certificates
+         ssl_context = ssl._create_unverified_context()
+ 
+         # Monkey patch the opener for SPARQLWrapper
+         opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=ssl_context))
+         urllib.request.install_opener(opener)
+     except Exception as e:
+         logger.error(f"Error setting up SSL context: {str(e)}")
+ 
+     # Construct basic SPARQL query for relevant entities
+     query = """
+     SELECT ?item ?itemLabel ?description ?article WHERE {
+       SERVICE wikibase:mwapi {
+         bd:serviceParam wikibase:api "EntitySearch" .
+         bd:serviceParam wikibase:endpoint "www.wikidata.org" .
+         bd:serviceParam mwapi:search "%s" .
+         bd:serviceParam mwapi:language "en" .
+         ?item wikibase:apiOutputItem mwapi:item .
+       }
+       ?item schema:description ?description .
+       FILTER(LANG(?description) = "en")
+       OPTIONAL {
+         ?article schema:about ?item .
+         ?article schema:isPartOf <https://en.wikipedia.org/> .
+       }
+       SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
+     }
+     LIMIT 5
+     """ % " ".join(query_terms)
+ 
+     sparql.setQuery(query)
+     sparql.setReturnFormat(JSON)
+ 
+     try:
+         results = sparql.query().convert()
+ 
+         wikidata_evidence = []
+ 
+         for result in results["results"]["bindings"]:
+             entity_label = result.get("itemLabel", {}).get("value", "Unknown")
+             description = result.get("description", {}).get("value", "No description")
+             article_url = result.get("article", {}).get("value", "")
+ 
+             # Truncate description to reduce token usage
+             if len(description) > 200:
+                 description = description[:197] + "..."
+ 
+             evidence_text = f"Entity: {entity_label}, Description: {description}"
+             if article_url:
+                 evidence_text += f", URL: {article_url}"
+ 
+             wikidata_evidence.append(evidence_text)
+ 
+         logger.info(f"Retrieved {len(wikidata_evidence)} WikiData entities")
+         return wikidata_evidence
+ 
+     except Exception as e:
+         logger.error(f"Error retrieving from WikiData: {str(e)}")
+         return []
+ 
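
Disabling certificate verification unblocks the query service but weakens transport security. Where the underlying failure is a missing CA bundle rather than a broken certificate chain, a safer sketch is to point the SSL context at certifi's bundle instead (this assumes the certifi package is available; it ships as a dependency of requests):

# Alternative sketch: keep verification on, but use certifi's CA bundle
import ssl
import urllib.request
import certifi

ssl_context = ssl.create_default_context(cafile=certifi.where())
opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=ssl_context))
urllib.request.install_opener(opener)
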
+ @api_error_handler("openalex")
+ def retrieve_evidence_from_openalex(claim):
+     """Retrieve evidence from OpenAlex for a given claim (replacement for Semantic Scholar)"""
+     logger.info(f"Retrieving evidence from OpenAlex for: {claim}")
+ 
+     try:
+         shortened_claim = shorten_claim_for_evidence(claim)
+         query = shortened_claim.replace(" ", "+")
+ 
+         # OpenAlex API endpoint
+         api_url = f"https://api.openalex.org/works?search={query}&filter=is_paratext:false&per_page=3"
+ 
+         headers = {
+             "Accept": "application/json",
+             "User-Agent": "MisinformationDetectionResearchBot/1.0 ([email protected])",
+         }
+ 
+         scholarly_evidence = []
+ 
+         try:
+             # Request with reduced timeout
+             response = requests.get(api_url, headers=headers, timeout=8)
+ 
+             # Check response status
+             if response.status_code == 200:
+                 # Successfully retrieved data
+                 data = safe_json_parse(response, "openalex")
+                 papers = data.get("results", [])
+ 
+                 for paper in papers:
+                     title = paper.get("title", "Unknown Title")
+                     abstract = paper.get("abstract_inverted_index", None)
+ 
+                     # OpenAlex stores abstracts in an inverted index format, so we need to reconstruct the text
+                     abstract_text = "No abstract available"
+                     if abstract:
+                         try:
+                             # Simple approach to reconstruct from inverted index.
+                             # For a production app, implement a proper reconstruction
+                             # algorithm (see the sketch after this function).
+                             words = list(abstract.keys())
+                             abstract_text = " ".join(words[:30]) + "..."
+                         except Exception as e:
+                             logger.error(f"Error reconstructing abstract: {e}")
+ 
+                     url = paper.get("doi", "")
+                     if url and not url.startswith("http"):
+                         url = f"https://doi.org/{url}"
+ 
+                     year = ""
+                     publication_date = paper.get("publication_date", "")
+                     if publication_date:
+                         year = publication_date.split("-")[0]
+ 
+                     # Truncate abstract to reasonable length
+                     if len(abstract_text) > 250:
+                         abstract_text = abstract_text[:247] + "..."
+ 
+                     evidence_text = f"Title: {title}, Year: {year}, Abstract: {abstract_text}, URL: {url}"
+                     scholarly_evidence.append(evidence_text)
+ 
+             else:
+                 logger.error(f"OpenAlex API error: {response.status_code}")
+ 
+         except requests.exceptions.Timeout:
+             logger.warning("OpenAlex request timed out")
+         except requests.exceptions.ConnectionError:
+             logger.warning("OpenAlex connection error")
+         except Exception as e:
+             logger.error(f"Unexpected error in OpenAlex request: {str(e)}")
+ 
+         logger.info(f"Retrieved {len(scholarly_evidence)} scholarly papers from OpenAlex")
+         return scholarly_evidence
+ 
+     except Exception as e:
+         logger.error(f"Fatal error in OpenAlex retrieval: {str(e)}")
+         return []
+ 
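
OpenAlex's abstract_inverted_index maps each word to the list of token positions where it occurs, so a faithful reconstruction sorts words by position instead of concatenating dictionary keys. A minimal sketch of the "proper reconstruction algorithm" the comment above defers to:

# Position-aware reconstruction of an OpenAlex abstract_inverted_index
def reconstruct_abstract(inverted_index):
    """Rebuild abstract text from {word: [positions]} by sorting on position."""
    positioned_words = []
    for word, positions in inverted_index.items():
        for pos in positions:
            positioned_words.append((pos, word))
    return " ".join(word for _, word in sorted(positioned_words))

print(reconstruct_abstract({"Deep": [0], "works": [2], "learning": [1]}))
# Deep learning works
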
+ @api_error_handler("factcheck")
+ def retrieve_evidence_from_claimreview(claim):
+     """Retrieve evidence from Google's ClaimReview for a given claim"""
+     logger.info(f"Retrieving evidence from ClaimReview for: {claim}")
+     factcheck_api_key = FACTCHECK_API_KEY
+ 
+     # Safely shorten claim
+     try:
+         shortened_claim = shorten_claim_for_evidence(claim)
+     except Exception as e:
+         logger.error(f"Error shortening claim: {e}")
+         shortened_claim = claim
+ 
+     query_parts = str(shortened_claim).split()
+     factcheck_results = []
+     source_count = {"factcheck": 0}
+ 
+     for i in range(len(query_parts), 0, -1):  # Iteratively try shorter queries
+         try:
+             current_query = " ".join(query_parts[:i])
+             encoded_query = urlencode({"query": current_query})
+             factcheck_url = f"https://factchecktools.googleapis.com/v1alpha1/claims:search?{encoded_query}&key={factcheck_api_key}"
+             logger.info(f"Factcheck URL: {factcheck_url}")
+ 
+             # Make request with reduced timeout
+             response = requests.get(factcheck_url, timeout=7)
+             response.raise_for_status()
+             data = safe_json_parse(response, "factcheck")
+ 
+             # Safely extract claims
+             claims = data.get("claims", [])
+             if not isinstance(claims, list):
+                 logger.warning(f"Unexpected claims type: {type(claims)}")
+                 claims = []
+ 
+             if claims:  # If results found
+                 logger.info(f"Results found for query '{current_query}'.")
+                 for item in claims:
+                     try:
+                         # Ensure item is a dictionary
+                         if not isinstance(item, dict):
+                             logger.warning(f"Skipping non-dictionary item: {type(item)}")
+                             continue
+ 
+                         claim_text = str(item.get("text", ""))
+                         # Truncate claim text
+                         if len(claim_text) > 200:
+                             claim_text = claim_text[:197] + "..."
+ 
+                         reviews = item.get("claimReview", [])
+ 
+                         # Ensure reviews is a list
+                         if not isinstance(reviews, list):
+                             logger.warning(f"Unexpected reviews type: {type(reviews)}")
+                             reviews = []
+ 
+                         for review in reviews:
+                             # Ensure review is a dictionary
+                             if not isinstance(review, dict):
+                                 logger.warning(f"Skipping non-dictionary review: {type(review)}")
+                                 continue
+ 
+                             publisher = str(review.get("publisher", {}).get("name", "Unknown Source"))
+                             rating = str(review.get("textualRating", "Unknown"))
+                             review_url = str(review.get("url", ""))
+ 
+                             if claim_text:
+                                 factcheck_results.append(
+                                     f"Claim: {claim_text}, Rating: {rating}, " +
+                                     f"Source: {publisher}, URL: {review_url}"
+                                 )
+                                 source_count["factcheck"] += 1
+ 
+                     except Exception as e:
+                         logger.error(f"Error processing FactCheck result: {e}")
+ 
+                 break  # Break once we have results
+             else:
+                 logger.info(f"No results for query '{current_query}', trying shorter version.")
+ 
+         except Exception as e:
+             logger.error(f"Error in FactCheck retrieval: {e}")
+ 
+     # Safely log evidence retrieval
+     try:
+         success = bool(factcheck_results)
+         performance_tracker.log_evidence_retrieval(success, source_count)
+     except Exception as e:
+         logger.error(f"Error logging evidence retrieval: {e}")
+ 
+     if not factcheck_results:
+         logger.warning("No factcheck evidence found after trying all query variants.")
+ 
+     return factcheck_results
+ 
+ @api_error_handler("newsapi")
+ def retrieve_news_articles(claim):
+     """Retrieve evidence from NewsAPI for a given claim with improved single-request approach"""
+     logger.info(f"Retrieving evidence from News API for: {claim}")
+ 
+     # Get API key
+     news_api_key = NEWS_API_KEY
+     if not news_api_key:
+         logger.error("No NewsAPI key available")
+         return []
+ 
+     news_results = []
+     source_count = {"news": 0}
+ 
+     # Get date range for recent news
+     from_date, to_date = get_recent_date_range()
+     logger.info(f"Filtering for news from {from_date} to {to_date}")
+ 
+     try:
+         # Extract a simplified claim for better matching
+         shortened_claim = shorten_claim_for_evidence(claim)
+ 
+         # Use a single endpoint with proper parameters
+         encoded_query = urlencode({"q": shortened_claim})
+ 
+         # Use the 'everything' endpoint as it's more comprehensive
+         news_api_url = f"https://newsapi.org/v2/everything?{encoded_query}&apiKey={news_api_key}&language=en&pageSize=5&sortBy=publishedAt&from={from_date}&to={to_date}"
+ 
+         log_url = news_api_url.replace(news_api_key, "API_KEY_REDACTED")
+         logger.info(f"Requesting: {log_url}")
+ 
+         # Make a single request with proper headers and reduced timeout
+         headers = {
+             "User-Agent": "MisinformationDetectionResearchBot/1.0",
+             "X-Api-Key": news_api_key,
+             "Accept": "application/json"
+         }
+ 
+         response = requests.get(
+             news_api_url,
+             headers=headers,
+             timeout=8
+         )
+ 
+         logger.info(f"Response status: {response.status_code}")
+ 
+         if response.status_code == 200:
+             data = safe_json_parse(response, "newsapi")
+ 
+             if data.get("status") == "ok":
+                 articles = data.get("articles", [])
+                 logger.info(f"Found {len(articles)} articles")
+ 
+                 for article in articles:
+                     try:
+                         # Robust article parsing
+                         title = str(article.get("title", ""))
+                         description = str(article.get("description", ""))
+                         content = str(article.get("content", ""))
+                         source_name = str(article.get("source", {}).get("name", "Unknown"))
+                         url = str(article.get("url", ""))
+                         published_at = str(article.get("publishedAt", ""))
+ 
+                         # Parse date to prioritize recent content
+                         article_date = None
+                         try:
+                             if published_at:
+                                 article_date = datetime.strptime(published_at.split('T')[0], '%Y-%m-%d')
+                         except Exception as date_error:
+                             logger.warning(f"Could not parse date: {published_at}")
+ 
+                         # Calculate recency score (higher = more recent)
+                         recency_score = 1.0  # Default
+                         if article_date:
+                             days_old = (datetime.now() - article_date).days
+                             if days_old == 0:  # Today
+                                 recency_score = 3.0
+                             elif days_old == 1:  # Yesterday
+                                 recency_score = 2.0
+ 
+                         # Use description if content is empty or too short
+                         if not content or len(content) < 50:
+                             content = description
+ 
+                         # Truncate content to reduce token usage
+                         if len(content) > 250:
+                             content = content[:247] + "..."
+ 
+                         # Ensure meaningful content
+                         if title and (content or description):
+                             news_item = {
+                                 "text": (
+                                     f"Title: {title}, " +
+                                     f"Source: {source_name}, " +
+                                     f"Date: {published_at}, " +
+                                     f"URL: {url}, " +
+                                     f"Content: {content}"
+                                 ),
+                                 "recency_score": recency_score,
+                                 "date": article_date
+                             }
+                             news_results.append(news_item)
+                             source_count["news"] += 1
+                             logger.info(f"Added article: {title}")
+ 
+                     except Exception as article_error:
+                         logger.error(f"Error processing article: {article_error}")
+ 
+         # Sort results by recency
+         if news_results:
+             news_results.sort(key=lambda x: x.get('recency_score', 0), reverse=True)
+ 
+     except Exception as query_error:
+         logger.error(f"Error processing query: {query_error}")
+ 
+     # Convert to plain text list for compatibility with existing code
+     news_texts = [item["text"] for item in news_results]
+ 
+     # Log evidence retrieval
+     try:
+         success = bool(news_texts)
+         performance_tracker.log_evidence_retrieval(success, source_count)
+     except Exception as log_error:
+         logger.error(f"Error logging evidence retrieval: {log_error}")
+ 
+     # Log results
+     if news_texts:
+         logger.info(f"Retrieved {len(news_texts)} news articles")
+     else:
+         logger.warning("No news articles found")
+ 
+     return news_texts
+ 
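
The recency weighting above is deliberately coarse: 3.0 for today, 2.0 for yesterday, 1.0 otherwise. A tiny sketch with invented articles showing the resulting order before the items are flattened to text:

# Invented items; demonstrates the recency sort used above
articles = [
    {"text": "older piece", "recency_score": 1.0},
    {"text": "published today", "recency_score": 3.0},
    {"text": "published yesterday", "recency_score": 2.0},
]
articles.sort(key=lambda x: x.get("recency_score", 0), reverse=True)
print([a["text"] for a in articles])
# ['published today', 'published yesterday', 'older piece']
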
590
+ def retrieve_combined_evidence(claim):
591
+ """
592
+ Retrieve evidence from multiple sources in parallel and analyze relevance using semantic similarity
593
+ with category-aware source prioritization and optimized parallel processing
594
+ """
595
+ logger.info(f"Starting evidence retrieval for: {claim}")
596
+ start_time = time.time()
597
+
598
+ # Use the category detector to prioritize sources
599
+ from modules.category_detection import get_prioritized_sources, get_category_specific_rss_feeds
600
+
601
+ # Get source priorities based on claim category
602
+ priorities = get_prioritized_sources(claim)
603
+ claim_category = priorities.get("category", "general")
604
+ requires_recent_evidence = priorities.get("requires_recent", False)
605
+
606
+ logger.info(f"Detected claim category: {claim_category} (recent: {requires_recent_evidence})")
607
+
608
+ # Initialize results dictionary
609
+ results = {
610
+ "wikipedia": [],
611
+ "wikidata": [],
612
+ "claimreview": [],
613
+ "news": [],
614
+ "scholarly": [],
615
+ "rss": []
616
+ }
617
+
618
+ # Track source counts and relevant evidence
619
+ source_counts = {}
620
+ relevant_evidence = {}
621
+ total_evidence_count = 0
622
+ relevant_evidence_count = 0
623
+
624
+ # Define primary and secondary sources outside the try block
625
+ # so they're available in the except block
626
+ primary_sources = []
627
+ for source_name in priorities.get("primary", []):
628
+ if source_name == "wikipedia":
629
+ primary_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
630
+ elif source_name == "wikidata":
631
+ primary_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
632
+ elif source_name == "claimreview":
633
+ primary_sources.append(("claimreview", retrieve_evidence_from_claimreview, claim))
634
+ elif source_name == "news":
635
+ primary_sources.append(("news", retrieve_news_articles, claim))
636
+ elif source_name == "scholarly":
637
+ primary_sources.append(("scholarly", retrieve_evidence_from_openalex, claim))
638
+ elif source_name == "rss":
639
+ # Get category-specific RSS max count
640
+ max_results = 8 if requires_recent_evidence else 5
641
+
642
+ # If the claim is science or technology related and we need to optimize
643
+ # use category-specific RSS feeds
644
+ if claim_category in ["science", "technology", "politics"]:
645
+ # Get specialized RSS module to temporarily use category-specific feeds
646
+ category_feeds = get_category_specific_rss_feeds(claim_category)
647
+ if category_feeds:
648
+ primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results, category_feeds))
649
+ else:
650
+ primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
651
+ else:
652
+ primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
653
+
654
+ # Prepare secondary sources
655
+ secondary_sources = []
656
+ for source_name in priorities.get("secondary", []):
657
+ if source_name == "wikipedia":
658
+ secondary_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
659
+ elif source_name == "wikidata":
660
+ secondary_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
661
+ elif source_name == "claimreview":
662
+ secondary_sources.append(("claimreview", retrieve_evidence_from_claimreview, claim))
663
+ elif source_name == "news":
664
+ secondary_sources.append(("news", retrieve_news_articles, claim))
665
+ elif source_name == "scholarly":
666
+ secondary_sources.append(("scholarly", retrieve_evidence_from_openalex, claim))
667
+ elif source_name == "rss":
668
+ max_results = 5 if requires_recent_evidence else 3
669
+ # Use category-specific feeds if available
670
+ if claim_category in ["science", "technology", "politics"]:
671
+ category_feeds = get_category_specific_rss_feeds(claim_category)
672
+ if category_feeds:
673
+ secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results, category_feeds))
674
+ else:
675
+ secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
676
+ else:
677
+ secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
678
+
679
+ # Optimize parallel processing for evidence retrieval with early results processing
680
+ try:
681
+ # Define function to safely retrieve evidence
682
+ def safe_retrieve(source_name, retrieval_func, *args):
683
+ try:
684
+ source_result = retrieval_func(*args) or []
685
+ return source_name, source_result
686
+ except Exception as e:
687
+ logger.error(f"Error retrieving from {source_name}: {str(e)}")
688
+ return source_name, []
689
+
690
+ # Define function to analyze evidence relevance
691
+ def analyze_evidence_quick(evidence_items, claim_text):
692
+ if not evidence_items or not claim_text:
693
+ return []
694
+
695
+ # Extract important keywords from claim
696
+ keywords = [word.lower() for word in claim_text.split() if len(word) > 3]
697
+
698
+ # Check for direct relevance
699
+ relevant_items = []
700
+ for evidence in evidence_items:
701
+ if not isinstance(evidence, str):
702
+ continue
703
+
704
+ evidence_lower = evidence.lower()
705
+
706
+ # Check if evidence contains any important keywords from claim
707
+ if any(keyword in evidence_lower for keyword in keywords):
708
+ relevant_items.append(evidence)
709
+ continue
710
+
711
+ # Check for claim subject in evidence (e.g. "earth" in "earth is flat")
712
+ claim_parts = claim_text.split()
713
+ if len(claim_parts) > 0 and claim_parts[0].lower() in evidence_lower:
714
+ relevant_items.append(evidence)
715
+ continue
716
+
717
+ return relevant_items
718
+
719
+ # Use ThreadPoolExecutor with a reasonable number of workers
720
+ # Start with primary sources first - use all available sources in parallel
721
+ with ThreadPoolExecutor(max_workers=min(4, len(primary_sources))) as executor:
722
+ # Submit all primary source tasks
723
+ futures_to_source = {
724
+ executor.submit(safe_retrieve, source_name, func, *args): source_name
725
+ for source_name, func, *args in primary_sources
726
+ }
727
+
728
+ # Track completed sources
729
+ completed_sources = set()
730
+
731
+ # Process results as they complete using as_completed for early processing
732
+ for future in as_completed(futures_to_source):
733
+ try:
734
+ source_name, source_results = future.result()
735
+ results[source_name] = source_results
736
+ source_counts[source_name] = len(source_results)
737
+ completed_sources.add(source_name)
738
+ logger.info(f"Retrieved {len(source_results)} results from {source_name}")
739
+
740
+ # Quick relevance analysis
741
+ if source_results:
742
+ relevant_items = analyze_evidence_quick(source_results, claim)
743
+ relevant_evidence[source_name] = relevant_items
744
+ total_evidence_count += len(source_results)
745
+ relevant_evidence_count += len(relevant_items)
746
+ logger.info(f"Found {len(relevant_items)} relevant items out of {len(source_results)} from {source_name}")
747
+
748
+ # Start background pre-analysis while waiting for other sources
749
+ try:
750
+ executor.submit(
751
+ analyze_early_evidence,
752
+ claim,
753
+ source_name,
754
+ source_results
755
+ )
756
+ except Exception as e:
757
+ logger.error(f"Error in early evidence analysis: {e}")
758
+
759
+ except Exception as e:
760
+ logger.error(f"Error processing future result: {str(e)}")
761
+
762
+ # Check if we have sufficient RELEVANT evidence from primary sources
763
+ # If not enough relevant evidence, query secondary sources
764
+ # in parallel even if we have a lot of total evidence
765
+ if relevant_evidence_count < 2:
766
+ logger.info(f"Only found {relevant_evidence_count} relevant evidence items, querying secondary sources")
767
+
768
+ # Add Wikipedia and Wikidata if they weren't in primary sources and haven't been queried yet
769
+ must_check_sources = []
770
+ if "wikipedia" not in completed_sources:
771
+ must_check_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
772
+
773
+ if "wikidata" not in completed_sources:
774
+ must_check_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
775
+
776
+ # Combine with other secondary sources
777
+ remaining_sources = must_check_sources + [
778
+ (source_name, func, *args) for source_name, func, *args in secondary_sources
779
+ if source_name not in completed_sources
780
+ ]
781
+
782
+ with ThreadPoolExecutor(max_workers=min(3, max(1, len(remaining_sources)))) as executor:
783
+ # Submit all secondary source tasks
784
+ futures_to_source = {
785
+ executor.submit(safe_retrieve, source_name, func, *args): source_name
786
+ for source_name, func, *args in remaining_sources
787
+ }
788
+
789
+ # Process results as they complete
790
+ for future in as_completed(futures_to_source):
791
+ try:
792
+ source_name, source_results = future.result()
793
+ results[source_name] = source_results
794
+ source_counts[source_name] = len(source_results)
795
+ logger.info(f"Retrieved {len(source_results)} results from {source_name}")
796
+
797
+ # Quick relevance analysis for these as well
798
+ if source_results:
799
+ relevant_items = analyze_evidence_quick(source_results, claim)
800
+ relevant_evidence[source_name] = relevant_items
801
+ total_evidence_count += len(source_results)
802
+ relevant_evidence_count += len(relevant_items)
803
+ logger.info(f"Found {len(relevant_items)} relevant items out of {len(source_results)} from {source_name}")
804
+ except Exception as e:
805
+ logger.error(f"Error processing future result: {str(e)}")
806
+
807
+ except Exception as e:
808
+ logger.error(f"Error in parallel evidence retrieval: {str(e)}")
809
+ # Fall back to sequential retrieval as a last resort
810
+ try:
811
+ logger.warning("Falling back to sequential retrieval due to parallel execution failure")
812
+ # Sequential retrieval as a fallback, reusing the primary_sources list built above
813
+ for source_name, func, *args in primary_sources:
814
+ try:
815
+ results[source_name] = func(*args) or []
816
+ source_counts[source_name] = len(results[source_name])
817
+ except Exception as source_error:
818
+ logger.error(f"Error in sequential {source_name} retrieval: {str(source_error)}")
819
+
820
+ # For sequential retrieval, always check Wikipedia and Wikidata as fallbacks; test the results dict, since completed_sources may never have been set if the parallel stage failed early
821
+ if "wikipedia" not in results:
822
+ try:
823
+ results["wikipedia"] = retrieve_evidence_from_wikipedia(claim) or []
824
+ source_counts["wikipedia"] = len(results["wikipedia"])
825
+ except Exception as e:
826
+ logger.error(f"Error in fallback Wikipedia retrieval: {e}")
827
+
828
+ if "wikidata" not in results:
829
+ try:
830
+ results["wikidata"] = retrieve_evidence_from_wikidata(claim) or []
831
+ source_counts["wikidata"] = len(results["wikidata"])
832
+ except Exception as e:
833
+ logger.error(f"Error in fallback Wikidata retrieval: {e}")
834
+
835
+ except Exception as fallback_error:
836
+ logger.error(f"Error in fallback sequential retrieval: {str(fallback_error)}")
837
+
838
+ # Gather all evidence
839
+ all_evidence = []
840
+ for source, items in results.items():
841
+ if isinstance(items, list):
842
+ for item in items:
843
+ if item and isinstance(item, str):
844
+ all_evidence.append(item)
845
+
846
+ # Skip processing if no evidence
847
+ if not all_evidence:
848
+ logger.warning("No evidence collected")
849
+
850
+ # Fallback: try direct search for the claim subject
851
+ try:
852
+ logger.info("No evidence found, trying fallback subject search")
853
+
854
+ # Extract the main subject using NLP
855
+ nlp = get_nlp_model()
856
+ doc = nlp(claim)
857
+
858
+ # Find main subject entities or nouns
859
+ subjects = []
860
+ for ent in doc.ents:
861
+ if ent.label_ in ["PERSON", "ORG", "GPE"]:
862
+ subjects.append(ent.text)
863
+
864
+ # If no entities found, use first noun phrase
865
+ if not subjects:
866
+ for chunk in doc.noun_chunks:
867
+ subjects.append(chunk.text)
868
+ break
869
+
870
+ if subjects:
871
+ # Try a direct search with just the subject
872
+ logger.info(f"Trying fallback search with subject: {subjects[0]}")
873
+
874
+ # Make sure we try Wikipedia for the subject regardless of priorities
875
+ try:
876
+ wiki_evidence = retrieve_evidence_from_wikipedia(subjects[0]) or []
877
+ all_evidence.extend(wiki_evidence)
878
+ logger.info(f"Retrieved {len(wiki_evidence)} results from fallback Wikipedia search")
879
+ except Exception as e:
880
+ logger.error(f"Error in fallback Wikipedia search: {e}")
881
+
882
+ # If still no evidence, try other sources
883
+ if not all_evidence:
884
+ # Do fallback searches in parallel
885
+ with ThreadPoolExecutor(max_workers=2) as executor:
886
+ fallback_futures = {
887
+ "news": executor.submit(retrieve_news_articles, subjects[0]),
888
+ "wikidata": executor.submit(retrieve_evidence_from_wikidata, subjects[0])
889
+ }
890
+
891
+ # Process results as they complete
892
+ for source, future in fallback_futures.items():
893
+ try:
894
+ fallback_results = future.result() or []
895
+ if fallback_results:
896
+ all_evidence.extend(fallback_results[:2]) # Add up to 2 results from each
897
+ logger.info(f"Retrieved {len(fallback_results)} results from fallback {source} search")
898
+ except Exception as e:
899
+ logger.error(f"Error in fallback {source} search: {str(e)}")
900
+
901
+ except Exception as subj_error:
902
+ logger.error(f"Error in fallback subject search: {str(subj_error)}")
903
+
904
+ # If still no evidence, return empty list
905
+ if not all_evidence:
906
+ return []
907
+
908
+ # Use semantic analysis to score and select the most relevant evidence
909
+ try:
910
+ # For science and technology claims, boost the weight of scholarly sources
911
+ if claim_category in ["science", "technology"]:
912
+ from config import SOURCE_CREDIBILITY
913
+ # Create a temporary copy with boosted reliability for relevant sources
914
+ enhanced_credibility = dict(SOURCE_CREDIBILITY)
915
+
916
+ # Add enhanced weights for scientific sources
917
+ from modules.category_detection import SOURCE_RELIABILITY_BY_CATEGORY
918
+ for domain, reliability in SOURCE_RELIABILITY_BY_CATEGORY.get(claim_category, {}).items():
919
+ enhanced_credibility[domain] = reliability
920
+
921
+ # Use the enhanced credibility for evidence analysis
922
+ analyzed_evidence = analyze_evidence_relevance(claim, all_evidence, enhanced_credibility)
923
+ else:
924
+ # Analyze evidence relevance using semantic similarity with default weights
925
+ from config import SOURCE_CREDIBILITY
926
+ analyzed_evidence = analyze_evidence_relevance(claim, all_evidence, SOURCE_CREDIBILITY)
927
+
928
+ # Log evidence scoring
929
+ logger.info(f"Analyzed {len(analyzed_evidence)} evidence items")
930
+
931
+ # Select diverse, relevant evidence items
932
+ final_evidence = select_diverse_evidence(analyzed_evidence, max_items=5)
933
+
934
+ # Log source distribution and selected count
935
+ logger.info(f"Evidence source distribution: {source_counts}")
936
+ logger.info(f"Selected evidence count: {len(final_evidence)}")
937
+
938
+ # Return maximum 5 evidence items (to control API costs)
939
+ return final_evidence[:5]
940
+
941
+ except Exception as e:
942
+ logger.error(f"Error in evidence analysis: {str(e)}")
943
+ # Fallback to simple selection (top 5 items)
944
+ return all_evidence[:5]
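The routine above closes the combined evidence retrieval path, returning at most five formatted evidence strings per claim. A minimal usage sketch (hypothetical driver script; the function name and claim-string signature are assumed from this package's exports):

    # sketch only, not part of the upload
    from modules.evidence_retrieval import retrieve_combined_evidence

    claim = "The Eiffel Tower is located in Paris"
    for i, item in enumerate(retrieve_combined_evidence(claim), start=1):
        print(f"[{i}] {item[:120]}")  # each item is a formatted evidence string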
modules/explanation.py ADDED
@@ -0,0 +1,181 @@
1
+ import logging
2
+ import re
3
+ import ast
4
+ from utils.models import get_llm_model
5
+
6
+ logger = logging.getLogger("misinformation_detector")
7
+
8
+ def extract_most_relevant_evidence(evidence_results):
9
+ """
10
+ Intelligently extract the most relevant piece of evidence
11
+
12
+ Args:
13
+ evidence_results (list): List of evidence items
14
+
15
+ Returns:
16
+ str: Most relevant evidence piece
17
+ """
18
+ if not evidence_results:
19
+ return None
20
+
21
+ # If evidence is a dictionary with 'evidence' key
22
+ if isinstance(evidence_results[0], dict):
23
+ # Sort by confidence if available
24
+ sorted_evidence = sorted(
25
+ evidence_results,
26
+ key=lambda x: x.get('confidence', 0),
27
+ reverse=True
28
+ )
29
+
30
+ # Return the evidence from the highest confidence item
31
+ for item in sorted_evidence:
32
+ evidence = item.get('evidence')
33
+ if evidence:
34
+ return evidence
35
+
36
+ # If plain list of evidence
37
+ return next((ev for ev in evidence_results if ev and isinstance(ev, str)), None)
38
+
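+ # Illustrative example: given [{"evidence": "A", "confidence": 0.9},
+ # {"evidence": "B", "confidence": 0.4}] this returns "A"; for a plain
+ # list like ["X", "", "Y"] it returns the first non-empty string, "X"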
39
+ def generate_explanation(claim, evidence_results, truth_label, confidence=None):
40
+ """
41
+ Generate an explanation for the claim's classification
42
+
43
+ Args:
44
+ claim (str): The original claim
45
+ evidence_results (list/str): Evidence supporting the classification
46
+ truth_label (str): Classification of the claim
47
+ confidence (float): Confidence level (0-1)
48
+
49
+ Returns:
50
+ str: Explanation of the claim's classification
51
+ """
52
+ logger.info(f"Generating explanation for claim with verdict: {truth_label}")
53
+
54
+ try:
55
+ # Normalize evidence_results to a list
56
+ if not isinstance(evidence_results, list):
57
+ try:
58
+ evidence_results = ast.literal_eval(str(evidence_results)) if evidence_results else []
59
+ except (ValueError, SyntaxError):
60
+ evidence_results = [evidence_results] if evidence_results else []
61
+
62
+ # Get the LLM model
63
+ explanation_model = get_llm_model()
64
+
65
+ # Extract most relevant evidence
66
+ most_relevant_evidence = extract_most_relevant_evidence(evidence_results)
67
+
68
+ # Prepare evidence text for prompt
69
+ evidence_text = "\n".join([
70
+ f"Evidence {i+1}: {str(ev)[:200] + '...' if len(str(ev)) > 200 else str(ev)}"
71
+ for i, ev in enumerate(evidence_results[:3])
72
+ ])
73
+
74
+ # Convert confidence to percentage and description
75
+ confidence_desc = ""
76
+ if confidence is not None:
77
+ confidence_pct = int(confidence * 100)
78
+ if confidence < 0.3:
79
+ confidence_desc = f"very low confidence ({confidence_pct}%)"
80
+ elif confidence < 0.5:
81
+ confidence_desc = f"low confidence ({confidence_pct}%)"
82
+ elif confidence < 0.7:
83
+ confidence_desc = f"moderate confidence ({confidence_pct}%)"
84
+ elif confidence < 0.9:
85
+ confidence_desc = f"high confidence ({confidence_pct}%)"
86
+ else:
87
+ confidence_desc = f"very high confidence ({confidence_pct}%)"
88
+ else:
89
+ # Determine confidence context from label if not explicitly provided
90
+ confidence_desc = (
91
+ "high confidence" if "High Confidence" in truth_label else
92
+ "moderate confidence" if "Likely" in truth_label else
93
+ "low confidence"
94
+ )
95
+
96
+ # Create prompt with specific instructions based on the type of claim
97
+ has_negation = bool(re.search(r"\b(not|no longer|isn't|doesn't|won't|cannot)\b", claim.lower()))
98
+
99
+ # For claims with "True" verdict
100
+ if "True" in truth_label:
101
+ prompt = f"""
102
+ Claim: "{claim}"
103
+
104
+ Verdict: {truth_label} (with {confidence_desc})
105
+
106
+ Available Evidence:
107
+ {evidence_text}
108
+
109
+ Task: Generate a clear explanation that:
110
+ 1. Clearly states that the claim IS TRUE based on the evidence
111
+ 2. {"Pays special attention to the logical relationship, since the claim contains negation" if has_negation else "Explains why the evidence supports the claim"}
112
+ 3. Uses confidence level of {confidence_desc}
113
+ 4. Highlights the most relevant supporting evidence
114
+ 5. Is factual and precise
115
+ """
116
+
117
+ # For claims with "False" verdict
118
+ elif "False" in truth_label:
119
+ prompt = f"""
120
+ Claim: "{claim}"
121
+
122
+ Verdict: {truth_label} (with {confidence_desc})
123
+
124
+ Available Evidence:
125
+ {evidence_text}
126
+
127
+ Task: Generate a clear explanation that:
128
+ 1. Clearly states that the claim IS FALSE based on the evidence
129
+ 2. {"Pays special attention to the logical relationship, since the claim contains negation" if has_negation else "Explains why the evidence contradicts the claim"}
130
+ 3. Uses confidence level of {confidence_desc}
131
+ 4. Highlights the contradicting evidence
132
+ 5. Is factual and precise
133
+
134
+ IMPORTANT: If the claim contains negation (words like 'not', 'no longer', etc.), be extra careful with the logical relationship between the evidence and the claim.
135
+ """
136
+
137
+ # For uncertain claims
138
+ else:
139
+ prompt = f"""
140
+ Claim: "{claim}"
141
+
142
+ Verdict: {truth_label} (with {confidence_desc})
143
+
144
+ Available Evidence:
145
+ {evidence_text}
146
+
147
+ Task: Generate a clear explanation that:
148
+ 1. Clearly states that there is insufficient evidence to determine if the claim is true or false
149
+ 2. Explains what information is missing or why the available evidence is insufficient
150
+ 3. Uses confidence level of {confidence_desc}
151
+ 4. Makes NO speculation about whether the claim might be true or false
152
+ 5. Mentions that the user should seek information from other reliable sources
153
+ """
154
+
155
+ # Generate explanation with multiple attempts
156
+ max_attempts = 3
157
+ for attempt in range(max_attempts):
158
+ try:
159
+ # Invoke the model
160
+ response = explanation_model.invoke(prompt)
161
+ explanation = response.content.strip()
162
+
163
+ # Validate explanation length
164
+ if explanation and len(explanation.split()) >= 5:
165
+ return explanation
166
+
167
+ except Exception as attempt_error:
168
+ logger.error(f"Explanation generation attempt {attempt+1} failed: {str(attempt_error)}")
169
+
170
+ # Ultimate fallback explanation
171
+ if "Uncertain" in truth_label:
172
+ return f"The claim '{claim}' cannot be verified due to insufficient evidence. The available information does not provide clear support for or against this claim. Consider consulting reliable sources for verification."
173
+ elif "True" in truth_label:
174
+ return f"The claim '{claim}' is supported by the evidence with {confidence_desc}. {most_relevant_evidence or 'The evidence indicates this claim is accurate.'}"
175
+ else:
176
+ return f"The claim '{claim}' is contradicted by the evidence with {confidence_desc}. {most_relevant_evidence or 'The evidence indicates this claim is not accurate.'}"
177
+
178
+ except Exception as e:
179
+ logger.error(f"Comprehensive error in explanation generation: {str(e)}")
180
+ # Final fallback
181
+ return f"The claim is classified as {truth_label} based on the available evidence."
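A quick way to exercise this module, assuming a chat model is configured behind get_llm_model (sketch only; if the model call fails, the hard-coded fallback strings above are returned instead):

    from modules.explanation import generate_explanation

    text = generate_explanation(
        claim="The earth is flat",
        evidence_results=["Satellite imagery consistently shows a spherical Earth"],
        truth_label="False (High Confidence)",
        confidence=0.95,
    )
    print(text)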
modules/rss_feed.py ADDED
@@ -0,0 +1,391 @@
1
+ import feedparser
2
+ import time
3
+ import logging
4
+ import re
5
+ import ssl
6
+ import requests
7
+ from datetime import datetime, timedelta
9
+ from urllib.parse import urlparse
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+
12
+ logger = logging.getLogger("misinformation_detector")
13
+
14
+ # Disable SSL certificate verification globally so feeds with self-signed or misconfigured certificates can still be fetched (trades security for feed availability)
15
+ ssl._create_default_https_context = ssl._create_unverified_context
16
+
17
+ # List of RSS feeds to check for news
18
+ # These are popular news sources with reliable and frequently updated RSS feeds
19
+ RSS_FEEDS = [
20
+ # --------------------
21
+ # 🌐 General World News
22
+ # --------------------
23
+ # "http://rss.cnn.com/rss/cnn_topstories.rss", # CNN Top Stories; Removed in round 2
24
+ "http://rss.cnn.com/rss/cnn_world.rss", # CNN World News; Duplicate with category_detection
25
+ # "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml", # NYT Home Page
26
+ "https://rss.nytimes.com/services/xml/rss/nyt/World.xml", # NYT World News; Duplicate with category_detection
27
+ # "https://rss.nytimes.com/services/xml/rss/nyt/US.xml", # NYT US News
28
+ "https://feeds.washingtonpost.com/rss/world", # The Washington Post World News; Removed in round 2
29
+ # "https://feeds.washingtonpost.com/rss/national", # The Washington Post National News
30
+ # "https://feeds.bbci.co.uk/news/rss.xml", # BBC News - Top Stories; Removed in round 2
31
+ "https://feeds.bbci.co.uk/news/world/rss.xml", # BBC News - World
32
+ # "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=w&hl=en-IN", # Google News India - World; Removed in round 2
33
+ # "https://news.google.com/rss?gl=US&ceid=US:en&topic=w&hl=en-US", # Google News US - World; Removed in round 2
34
+
35
+ # --------------------
36
+ # 🧠 Tech & Startup News (Global)
37
+ # --------------------
38
+ "https://techcrunch.com/feed/", # TechCrunch - Startup and Technology News; Duplicate with category_detection
39
+ "https://venturebeat.com/feed/", # VentureBeat - Tech News
40
+ # "https://www.theverge.com/rss/index.xml", # The Verge - Technology News
41
+ "https://www.wired.com/feed/rss", # Wired - Technology News
42
+ "https://www.cnet.com/rss/news/", # CNET - Technology News
43
+ # "https://sifted.eu/feed/", # Sifted - European Startups and Tech
44
+ # "https://feeds.feedburner.com/fastcompany/headlines", # Fast Company - Business Innovation
45
+ # "https://feeds.bbci.co.uk/news/technology/rss.xml", # BBC News - Technology
46
+ "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=t&hl=en-IN", # Google News India - Technology
47
+ "https://news.google.com/rss?gl=US&ceid=US:en&topic=t&hl=en-US", # Google News US - Technology
48
+
49
+ # --------------------
50
+ # 💼 Startup & VC Focused
51
+ # --------------------
52
+ "https://news.crunchbase.com/feed/", # Crunchbase News - Startup Funding
53
+ # "https://avc.com/feed/", # AVC - Musings of a VC in NYC
54
+ "https://techstartups.com/feed/", # Tech Startups - Startup News
55
+ # "https://tech.eu/feed/", # Tech.eu - European Tech News
56
+ # "https://www.menabytes.com/feed/", # MENAbytes - Middle East & North Africa Startups
57
+ # "http://feeds.feedburner.com/venturebeat/SZYF", # VentureBeat - Deals
58
+
59
+ # --------------------
60
+ # 📰 Global Business & Corporate Feeds
61
+ # --------------------
62
+ "https://feeds.bloomberg.com/technology/news.rss", # Bloomberg Technology News
63
+ "https://www.ft.com/technology?format=rss", # Financial Times Technology News
64
+ # "https://ir.thomsonreuters.com/rss/news-releases.xml", # Thomson Reuters Press Releases
65
+ # "https://feeds.bbci.co.uk/news/business/rss.xml", # BBC News - Business
66
+ "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=b&hl=en-IN", # Google News India - Business
67
+ # "https://news.google.com/rss?gl=US&ceid=US:en&topic=b&hl=en-US", # Google News US - Business; Removed in round 2
68
+
69
+ # --------------------
70
+ # 🇮🇳 India-specific News
71
+ # --------------------
72
+ "https://inc42.com/feed/", # Inc42 - Indian Startups and Technology
73
+ # "https://yourstory.com/rss", # YourStory - Indian Startup Stories
74
+ # "https://economictimes.indiatimes.com/startups/rssfeeds/49979279.cms", # Economic Times - Startups
75
+ "https://timesofindia.indiatimes.com/rssfeedstopstories.cms", # TOI - Top Stories
76
+ "https://timesofindia.indiatimes.com/rssfeedmostrecent.cms", # TOI - Most Recent Stories
77
+ "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms", # TOI - India News
78
+ "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms", # TOI - World News
79
+ "https://timesofindia.indiatimes.com/rssfeeds/1898055.cms", # TOI - Business News
80
+ "https://timesofindia.indiatimes.com/rssfeeds/54829575.cms", # TOI - Cricket News
81
+ "https://timesofindia.indiatimes.com/rssfeeds/4719148.cms", # TOI - Sports News
82
+ "https://timesofindia.indiatimes.com/rssfeeds/-2128672765.cms", # TOI - Science News
83
+ # "https://timesofindia.indiatimes.com/rssfeeds/66949542.cms", # TOI - Technology News
84
+ # "https://timesofindia.indiatimes.com/rssfeeds/1081479906.cms", # TOI - Education News
85
+
86
+ # --------------------
87
+ # 🏏 Sports News (Global + Cricket)
88
+ # --------------------
89
+ "https://www.espn.com/espn/rss/news", # ESPN - Top Sports News; Duplicate with category_detection
90
+ # "https://api.foxsports.com/v2/content/optimized-rss?partnerKey=MB0Wehpmuj2lUhuRhQaafhBjAJqaPU244mlTDK1i&size=30", # Fox Sports; Removed in round 2
91
+ "https://feeds.skynews.com/feeds/rss/sports.xml", # Sky News - Sports
92
+ "https://sports.ndtv.com/rss/all", # NDTV Sports
93
+ "https://www.espncricinfo.com/rss/content/story/feeds/0.xml", # ESPN Cricinfo - Cricket News; Duplicate with category_detection
94
+ # "https://crickettimes.com/feed/", # Cricket Times - Cricket News
95
+
96
+ # --------------------
97
+ # ✅ Fact-Checking Sources
98
+ # --------------------
99
+ "https://www.snopes.com/feed/", # Snopes - Fact Checking; Duplicate with category_detection
100
+ "https://www.politifact.com/rss/all/", # PolitiFact - Fact Checking; Duplicate with category_detection
101
+
102
+ # --------------------
103
+ # 🗳️ Politics & Policy (General)
104
+ # --------------------
105
+ "https://feeds.bbci.co.uk/news/politics/rss.xml", # BBC News - Politics; Duplicate with category_detection
106
+ "https://feeds.bbci.co.uk/news/science_and_environment/rss.xml", # BBC - Science & Environment
107
+
108
+ # --------------------
109
+ # 🗳️ Science
110
+ # --------------------
111
+ "https://www.nature.com/nature.rss", # Nature science; Duplicate with category_detection
112
+ "https://feeds.science.org/rss/science-advances.xml" # science.org
113
+ ]
114
+
115
+ def clean_html(raw_html):
116
+ """Remove HTML tags from text"""
117
+ if not raw_html:
118
+ return ""
119
+ clean_regex = re.compile('<.*?>')
120
+ clean_text = re.sub(clean_regex, '', raw_html)
121
+ # Remove extra whitespace
122
+ clean_text = re.sub(r'\s+', ' ', clean_text).strip()
123
+ return clean_text
124
+
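+ # Example: clean_html("<p>Breaking: <b>markets</b> rally</p>") returns
+ # "Breaking: markets rally"; note that only <...> tags are stripped, so
+ # HTML entities such as &amp; are left in place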
125
+ def parse_feed(feed_url, timeout=5):
126
+ """
127
+ Parse a single RSS feed with proper timeout handling
128
+ Uses requests with timeout first, then passes content to feedparser
129
+ """
130
+ try:
131
+ # Use requests with timeout to fetch the RSS content
132
+ response = requests.get(feed_url, timeout=timeout)
133
+ response.raise_for_status()
134
+
135
+ # Then parse the content with feedparser (which doesn't support timeout)
136
+ feed = feedparser.parse(response.content)
137
+
138
+ # Basic validation of the feed
139
+ if hasattr(feed, 'entries') and feed.entries:
140
+ return feed
141
+ else:
142
+ logger.warning(f"Feed {feed_url} parsed but contains no entries")
143
+ return None
144
+
145
+ except requests.exceptions.Timeout:
146
+ logger.warning(f"Timeout while fetching feed {feed_url}")
147
+ return None
148
+ except requests.exceptions.RequestException as e:
149
+ logger.error(f"Request error fetching feed {feed_url}: {str(e)}")
150
+ return None
151
+ except Exception as e:
152
+ logger.error(f"Error parsing feed {feed_url}: {str(e)}")
153
+ return None
154
+
155
+ def fetch_all_feeds(feeds_list=None, max_workers=5, timeout=5):
156
+ """
157
+ Fetch multiple RSS feeds with proper timeout handling
158
+ Returns a list of (domain, feed) tuples for successfully fetched feeds
159
+ """
160
+ # Use default RSS_FEEDS list if none provided
161
+ if feeds_list is None:
162
+ feeds_list = RSS_FEEDS
163
+
164
+ results = []
165
+
166
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
167
+ future_to_url = {executor.submit(parse_feed, url, timeout): url for url in feeds_list}
168
+ for future in as_completed(future_to_url):
169
+ url = future_to_url[future]
170
+ try:
171
+ feed = future.result()
172
+ if feed and hasattr(feed, 'entries') and feed.entries:
173
+ # Extract domain for source attribution
174
+ domain = urlparse(url).netloc
175
+ results.append((domain, feed))
176
+ logger.info(f"Successfully fetched {domain} with {len(feed.entries)} entries")
177
+ except Exception as e:
178
+ logger.error(f"Error processing {url}: {str(e)}")
179
+
180
+ return results
181
+
182
+ def extract_date(entry):
183
+ """Extract and normalize publication date from entry"""
184
+ for date_field in ['published_parsed', 'updated_parsed', 'created_parsed']:
185
+ if hasattr(entry, date_field) and getattr(entry, date_field):
186
+ try:
187
+ # Convert time tuple to datetime
188
+ time_tuple = getattr(entry, date_field)
189
+ return datetime(time_tuple[0], time_tuple[1], time_tuple[2],
190
+ time_tuple[3], time_tuple[4], time_tuple[5])
191
+ except Exception as e:
192
+ logger.debug(f"Error parsing {date_field}: {e}")
193
+ continue
194
+
195
+ # Try string dates
196
+ for date_field in ['published', 'updated', 'pubDate']:
197
+ if hasattr(entry, date_field) and getattr(entry, date_field):
198
+ try:
199
+ date_str = getattr(entry, date_field)
200
+ # Try various formats
201
+ for fmt in ['%a, %d %b %Y %H:%M:%S %z', '%a, %d %b %Y %H:%M:%S %Z',
202
+ '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z']:
203
+ try:
204
+ return datetime.strptime(date_str, fmt)
205
+ except:
206
+ continue
207
+ except Exception as e:
208
+ logger.debug(f"Error parsing date string {date_field}: {e}")
209
+ continue
210
+
211
+ # Default to current time if parsing fails
212
+ return datetime.now()
213
+
214
+ def is_recent(entry_date, max_days=3):
215
+ """Check if an entry is recent (within the last few days)"""
216
+ if not entry_date:
217
+ return False
218
+ cutoff = datetime.now() - timedelta(days=max_days)
219
+ return entry_date > cutoff
220
+
221
+ def get_entry_relevance(entry, query_terms, domain):
222
+ """Calculate relevance score for an entry based on query match and recency"""
223
+ if not hasattr(entry, 'title') or not entry.title:
224
+ return 0
225
+
226
+ # Extract text content
227
+ title = entry.title or ""
228
+ description = clean_html(entry.description) if hasattr(entry, 'description') else ""
229
+ content = ""
230
+ if hasattr(entry, 'content'):
231
+ for content_item in entry.content:
232
+ if 'value' in content_item:
233
+ content += clean_html(content_item['value']) + " "
234
+
235
+ # Extract published date
236
+ pub_date = extract_date(entry)
237
+
238
+ # Calculate recency score (0-1)
239
+ recency_score = 0
240
+ if pub_date:
241
+ days_old = (datetime.now() - pub_date).days
242
+ if days_old <= 1: # Today or yesterday
243
+ recency_score = 1.0
244
+ elif days_old <= 2:
245
+ recency_score = 0.8
246
+ elif days_old <= 3:
247
+ recency_score = 0.5
248
+ else:
249
+ recency_score = 0.2
250
+
251
+ # Calculate relevance score based on keyword matches
252
+ text = f"{title} {description} {content}".lower()
253
+
254
+ # Count how many query terms appear in the content
255
+ query_terms_lower = [term.lower() for term in query_terms]
256
+ matches = sum(1 for term in query_terms_lower if term in text)
257
+
258
+ # Calculate match score (0-1)
259
+ match_score = min(1.0, matches / max(1, len(query_terms) * 0.7))
260
+
261
+ # Boost score for exact phrase matches
262
+ query_phrase = " ".join(query_terms_lower)
263
+ if query_phrase in text:
264
+ match_score += 0.5
265
+
266
+ # Additional boost for title matches (they're more relevant)
267
+ title_matches = sum(1 for term in query_terms_lower if term in title.lower())
268
+ if title_matches > 0:
269
+ match_score += 0.2 * (title_matches / len(query_terms_lower))
270
+
271
+ # Source quality factor (can be adjusted based on source reliability)
272
+ source_factor = 1.0
273
+ high_quality_domains = ['bbc.co.uk', 'nytimes.com', 'reuters.com', 'washingtonpost.com',
274
+ 'espncricinfo.com', 'cricbuzz.com', 'snopes.com']
275
+ if any(quality_domain in domain for quality_domain in high_quality_domains):
276
+ source_factor = 1.2
277
+
278
+ # Calculate final score; the source quality factor scales the combined score
279
+ final_score = ((match_score * 0.6) + (recency_score * 0.4)) * source_factor
280
+
281
+ return min(1.0, final_score) # Cap at 1.0
282
+
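+ # Worked example: an entry from bbc.co.uk (source_factor 1.2) published
+ # yesterday (recency 1.0) that matches most query terms (match_score 0.8)
+ # scores ((0.8 * 0.6) + (1.0 * 0.4)) * 1.2 = 1.056, capped to 1.0 by min()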
283
+ def retrieve_evidence_from_rss(claim, max_results=3, category_feeds=None):
284
+ """
285
+ Retrieve evidence from RSS feeds for a given claim
286
+
287
+ Args:
288
+ claim (str): The claim to verify
289
+ max_results (int): Maximum number of results to return
290
+ category_feeds (list, optional): List of category-specific RSS feeds to check
291
+
292
+ Returns:
293
+ list: List of relevant evidence items
294
+ """
295
+ start_time = time.time()
296
+ logger.info(f"Retrieving evidence from RSS feeds for: {claim}")
297
+
298
+ # Extract key terms from claim
299
+ terms = [term.strip() for term in re.findall(r'\b\w+\b', claim) if len(term.strip()) > 2]
300
+
301
+ try:
302
+ # Use category-specific feeds if provided
303
+ feeds_to_use = category_feeds if category_feeds else RSS_FEEDS
304
+
305
+ # Log which feeds we're using
306
+ if category_feeds:
307
+ logger.info(f"Using {len(category_feeds)} category-specific RSS feeds")
308
+ else:
309
+ logger.info(f"Using {len(RSS_FEEDS)} default RSS feeds")
310
+
311
+ # Limit the number of feeds to process for efficiency
312
+ if len(feeds_to_use) > 10:
313
+ # If we have too many feeds, select a subset
314
+ # Prioritize fact-checking sources
315
+ fact_check_feeds = [feed for feed in feeds_to_use if "fact" in feed.lower() or "snopes" in feed.lower() or "politifact" in feed.lower()]
316
+ other_feeds = [feed for feed in feeds_to_use if feed not in fact_check_feeds]
317
+
318
+ # Take all fact-checking feeds plus a random selection of others
319
+ import random
320
+ selected_feeds = fact_check_feeds + random.sample(other_feeds, min(10 - len(fact_check_feeds), len(other_feeds)))
321
+ else:
322
+ selected_feeds = feeds_to_use
323
+
324
+ # Fetch all feeds in parallel with the selected feeds
325
+ feeds = fetch_all_feeds(selected_feeds)
326
+
327
+ if not feeds:
328
+ logger.warning("No RSS feeds could be fetched")
329
+ return []
330
+
331
+ all_entries = []
332
+
333
+ # Process all feed entries
334
+ for domain, feed in feeds:
335
+ for entry in feed.entries:
336
+ # Calculate relevance score
337
+ relevance = get_entry_relevance(entry, terms, domain)
338
+
339
+ if relevance > 0.3: # Only consider somewhat relevant entries
340
+ # Extract entry details
341
+ title = entry.title if hasattr(entry, 'title') else "No title"
342
+ link = entry.link if hasattr(entry, 'link') else ""
343
+
344
+ # Extract and clean description/content
345
+ description = ""
346
+ if hasattr(entry, 'description'):
347
+ description = clean_html(entry.description)
348
+ elif hasattr(entry, 'summary'):
349
+ description = clean_html(entry.summary)
350
+ elif hasattr(entry, 'content'):
351
+ for content_item in entry.content:
352
+ if 'value' in content_item:
353
+ description += clean_html(content_item['value']) + " "
354
+
355
+ # Truncate description if too long
356
+ if len(description) > 250:
357
+ description = description[:247] + "..."
358
+
359
+ # Get publication date
360
+ pub_date = extract_date(entry)
361
+ date_str = pub_date.strftime('%Y-%m-%d') if pub_date else "Unknown date"
362
+
363
+ # Format as evidence text
364
+ evidence_text = (
365
+ f"Title: {title}, "
366
+ f"Source: {domain} (RSS), "
367
+ f"Date: {date_str}, "
368
+ f"URL: {link}, "
369
+ f"Content: {description}"
370
+ )
371
+
372
+ all_entries.append({
373
+ "text": evidence_text,
374
+ "relevance": relevance,
375
+ "date": pub_date or datetime.now()
376
+ })
377
+
378
+ # Sort entries by relevance
379
+ all_entries.sort(key=lambda x: x["relevance"], reverse=True)
380
+
381
+ # Take top results
382
+ top_entries = all_entries[:max_results]
383
+
384
+ logger.info(f"Retrieved {len(top_entries)} relevant RSS items from {len(feeds)} feeds in {time.time() - start_time:.2f}s")
385
+
386
+ # Return just the text portion
387
+ return [entry["text"] for entry in top_entries]
388
+
389
+ except Exception as e:
390
+ logger.error(f"Error in RSS retrieval: {str(e)}")
391
+ return []
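A usage sketch for this module (live network access is assumed, so results vary with the feeds of the day):

    from modules.rss_feed import retrieve_evidence_from_rss

    items = retrieve_evidence_from_rss("India won the Cricket World Cup", max_results=3)
    for item in items:
        # each item reads "Title: ..., Source: <domain> (RSS), Date: ..., URL: ..., Content: ..."
        print(item)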
modules/semantic_analysis.py ADDED
@@ -0,0 +1,503 @@
1
+ import logging
2
+ import numpy as np
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ from datetime import datetime, timedelta
5
+ import re
6
+
7
+ # Import the centralized NLP model handler
8
+ from utils.models import get_nlp_model
9
+
10
+ logger = logging.getLogger("misinformation_detector")
11
+
12
+ def extract_entities(text):
13
+ """Extract named entities from text"""
14
+ if not text:
15
+ return []
16
+
17
+ try:
18
+ # Use centralized NLP model
19
+ nlp_model = get_nlp_model()
20
+ doc = nlp_model(text)
21
+ entities = [
22
+ {
23
+ "text": ent.text,
24
+ "label": ent.label_,
25
+ "start": ent.start_char,
26
+ "end": ent.end_char
27
+ }
28
+ for ent in doc.ents
29
+ ]
30
+ return entities
31
+ except Exception as e:
32
+ logger.error(f"Error extracting entities: {str(e)}")
33
+ return []
34
+
35
+ def get_vector_representation(text):
36
+ """Get vector representation of text using spaCy"""
37
+ if not text:
38
+ return None
39
+
40
+ try:
41
+ # Use centralized NLP model
42
+ nlp_model = get_nlp_model()
43
+ doc = nlp_model(text)
44
+
45
+ # Return document vector if available
46
+ if doc.has_vector:
47
+ return doc.vector
48
+
49
+ # Fallback: average of token vectors
50
+ vectors = [token.vector for token in doc if token.has_vector]
51
+ if vectors:
52
+ return np.mean(vectors, axis=0)
53
+
54
+ return None
55
+ except Exception as e:
56
+ logger.error(f"Error getting vector representation: {str(e)}")
57
+ return None
58
+
59
+ def calculate_similarity(text1, text2):
60
+ """Calculate semantic similarity between two texts"""
61
+ if not text1 or not text2:
62
+ return 0.0
63
+
64
+ try:
65
+ vec1 = get_vector_representation(text1)
66
+ vec2 = get_vector_representation(text2)
67
+
68
+ if vec1 is None or vec2 is None:
69
+ return 0.0
70
+
71
+ # Reshape vectors for cosine_similarity
72
+ vec1 = vec1.reshape(1, -1)
73
+ vec2 = vec2.reshape(1, -1)
74
+
75
+ # Calculate cosine similarity
76
+ similarity = cosine_similarity(vec1, vec2)[0][0]
77
+ return float(similarity)
78
+ except Exception as e:
79
+ logger.error(f"Error calculating similarity: {str(e)}")
80
+ return 0.0
81
+
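+ # Note: score quality depends on the loaded spaCy pipeline; small models
+ # such as en_core_web_sm ship without static word vectors, so similarities
+ # from them are weak or zero, while *_md / *_lg models give meaningful values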
82
+ def extract_date_from_evidence(evidence_text):
83
+ """Extract date from evidence text"""
84
+ if not evidence_text:
85
+ return None
86
+
87
+ try:
88
+ # Look for date patterns in text
89
+ date_patterns = [
90
+ r'Date: (\d{4}-\d{2}-\d{2})', # ISO format
91
+ r'published.*?(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})', # published on MM/DD/YYYY
92
+ r'(\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})', # DD Month YYYY
93
+ r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}' # Month DD, YYYY
94
+ ]
95
+
96
+ for pattern in date_patterns:
97
+ match = re.search(pattern, evidence_text)
98
+ if match:
99
+ date_str = match.group(1)
100
+ # Parse date string based on format
101
+ try:
102
+ if re.match(r'^\d{4}-\d{2}-\d{2}$', date_str):
103
+ return datetime.strptime(date_str, '%Y-%m-%d')
104
+ elif '/' in date_str or '-' in date_str:
105
+ formats = ['%m/%d/%Y', '%d/%m/%Y', '%m-%d-%Y', '%d-%m-%Y']
106
+ for fmt in formats:
107
+ try:
108
+ return datetime.strptime(date_str, fmt)
109
+ except ValueError:
110
+ continue
111
+ else:
112
+ # Try different month formats
113
+ formats = ['%d %B %Y', '%B %d, %Y', '%B %d %Y']
114
+ for fmt in formats:
115
+ try:
116
+ return datetime.strptime(date_str, fmt)
117
+ except ValueError:
118
+ continue
119
+ except Exception:
120
+ pass
121
+
122
+ return None
123
+ except Exception as e:
124
+ logger.error(f"Error extracting date from evidence: {str(e)}")
125
+ return None
126
+
127
+ def is_temporally_relevant(evidence_text, claim_text, max_days_old=30):
128
+ """Check if evidence is temporally relevant to the claim"""
129
+ # Check if claim seems to require recent evidence
130
+ temporal_terms = ["today", "now", "current", "currently", "recent", "recently", "latest", "just", "this week", "this month", "this year"]
131
+ requires_recent = any(term in claim_text.lower() for term in temporal_terms)
132
+
133
+ # If claim doesn't specify temporality, consider evidence relevant
134
+ if not requires_recent:
135
+ return True
136
+
137
+ # Extract date from evidence
138
+ date = extract_date_from_evidence(evidence_text)
139
+ if not date:
140
+ return True # If we can't determine date, assume it's relevant
141
+
142
+ # Check if evidence is recent enough
143
+ cutoff = datetime.now() - timedelta(days=max_days_old)
144
+ return date >= cutoff
145
+
146
+ def has_authority_signal(evidence_text):
147
+ """Check if evidence contains authority signals"""
148
+ authority_signals = {
149
+ "scientific_consensus": ["consensus", "scientists agree", "research shows", "studies confirm", "experts agree"],
150
+ "fact_check": ["fact check", "rated false", "rated true", "debunked", "confirmed", "verification"],
151
+ "high_authority": ["nasa", "world health organization", "who", "cdc", "national academy",
152
+ "oxford", "harvard", "stanford", "mit", "cambridge", "yale",
153
+ "princeton", "government", "official", "authorities", "minister",
154
+ "ministry", "department", "administration", "university", "professor"]
155
+ }
156
+
157
+ evidence_lower = evidence_text.lower()
158
+
159
+ authority_type = None
160
+ authority_score = 1.0
161
+
162
+ for signal_type, phrases in authority_signals.items():
163
+ if any(phrase in evidence_lower for phrase in phrases):
164
+ if signal_type == "scientific_consensus":
165
+ authority_score = 1.8
166
+ authority_type = "scientific_consensus"
167
+ elif signal_type == "fact_check":
168
+ authority_score = 1.5
169
+ authority_type = "fact_check"
170
+ elif signal_type == "high_authority":
171
+ authority_score = 1.3
172
+ authority_type = "high_authority"
173
+ break
174
+
175
+ return authority_score, authority_type
176
+
177
+ def analyze_evidence_relevance(claim, evidence_list, source_credibility=None):
178
+ """
179
+ Analyze evidence relevance to claim using semantic similarity with improved handling
180
+ for claims requiring strong evidence
181
+
182
+ Args:
183
+ claim (str): The claim being verified
184
+ evidence_list (list): List of evidence items
185
+ source_credibility (dict): Dictionary mapping source domains to credibility scores
186
+
187
+ Returns:
188
+ list: Sorted list of evidence items with relevance scores
189
+ """
190
+ if not evidence_list:
191
+ return []
192
+
193
+ # Ensure evidence_list is a list of strings
194
+ if not isinstance(evidence_list, list):
195
+ evidence_list = [str(evidence_list)]
196
+
197
+ # Filter out None or empty items
198
+ evidence_list = [item for item in evidence_list if item]
199
+
200
+ # Check if claim contains strong assertions that would require specific evidence
201
+ strong_assertion_markers = [
202
+ "solved", "cured", "discovered", "breakthrough", "revolutionary",
203
+ "first ever", "confirmed", "definitive", "conclusive", "proven",
204
+ "groundbreaking", "unprecedented", "remarkable", "extends lifespan",
205
+ "extends life", "definitively", "successfully"
206
+ ]
207
+
208
+ # Determine if claim contains strong assertions
209
+ claim_has_strong_assertions = any(marker in claim.lower() for marker in strong_assertion_markers)
210
+
211
+ # Log detection result
212
+ if claim_has_strong_assertions:
213
+ logger.info(f"Evidence analysis: Detected claim with strong assertions requiring specific evidence")
214
+
215
+ # Extract named entities from claim
216
+ claim_entities = extract_entities(claim)
217
+ claim_entity_texts = [entity["text"].lower() for entity in claim_entities]
218
+
219
+ # Process each evidence item
220
+ analyzed_evidence = []
221
+
222
+ # Track domains found in evidence to identify source diversity
223
+ found_domains = set()
224
+
225
+ for evidence in evidence_list:
226
+ if not isinstance(evidence, str):
227
+ continue
228
+
229
+ # Calculate semantic similarity
230
+ similarity = calculate_similarity(claim, evidence)
231
+
232
+ # Check for entity overlap
233
+ evidence_entities = extract_entities(evidence)
234
+ evidence_entity_texts = [entity["text"].lower() for entity in evidence_entities]
235
+
236
+ # Calculate entity overlap
237
+ common_entities = set(claim_entity_texts).intersection(set(evidence_entity_texts))
238
+ entity_overlap = len(common_entities) / max(1, len(claim_entity_texts))
239
+
240
+ # Check temporal relevance
241
+ temporal_relevance = 1.0
242
+ if is_temporally_relevant(evidence, claim):
243
+ temporal_relevance = 1.2
244
+ else:
245
+ # Penalty for temporally irrelevant evidence
246
+ temporal_relevance = 0.7
247
+
248
+ # Check for authority signals
249
+ authority_score, authority_type = has_authority_signal(evidence)
250
+
251
+ # Extract source from evidence if available
252
+ source_boost = 1.0
253
+ domain = None
254
+
255
+ if source_credibility:
256
+ # Try to extract domain from URL in evidence
257
+ domain_match = re.search(r'URL: https?://(?:www\.)?([^/]+)', evidence)
258
+ if domain_match:
259
+ domain = domain_match.group(1)
260
+ # Check if domain or its parent domain is in credibility list
261
+ for cred_domain, cred_score in source_credibility.items():
262
+ if cred_domain in domain:
263
+ try:
264
+ source_boost = float(cred_score)
265
+ break
266
+ except (ValueError, TypeError):
267
+ pass
268
+
269
+ # Track this domain for source diversity
270
+ if domain:
271
+ found_domains.add(domain)
272
+
273
+ # For claims with strong assertions: check if evidence specifically addresses assertions
274
+ claim_specificity_match = 1.0
275
+ evidence_specificity_match = 1.0
276
+
277
+ if claim_has_strong_assertions:
278
+ # Check if evidence provides specific confirmation or contradiction
279
+ direct_contradiction_terms = [
280
+ "not yet", "has not", "have not", "cannot", "can't", "doesn't", "don't",
281
+ "unlikely", "challenging", "remains a challenge", "in the future",
282
+ "experimental", "in development", "proposed", "theoretical",
283
+ "preliminary", "hypothesized", "potential", "promising but"
284
+ ]
285
+
286
+ # Check for contradictions to strong assertions
287
+ if any(term in evidence.lower() for term in direct_contradiction_terms):
288
+ # This evidence likely contradicts the strong assertion
289
+ evidence_specificity_match = 2.0 # Boost relevance of contradicting evidence
290
+ logger.debug(f"Found contradiction to strong assertion in evidence")
291
+
292
+ # For claims with strong assertions, check if evidence specifically confirms
293
+ direct_confirmation_terms = [
294
+ "successfully demonstrated", "breakthrough", "solved", "cured",
295
+ "confirmed", "definitive evidence", "conclusive results", "proven",
296
+ "revolutionary results", "milestone achievement", "groundbreaking results"
297
+ ]
298
+
299
+ # If evidence confirms the strong assertion, adjust relevance
300
+ if any(term in evidence.lower() for term in direct_confirmation_terms):
301
+ # Apply higher scoring for evidence that specifically confirms
302
+ evidence_specificity_match = 1.8
303
+ logger.debug(f"Found confirmation of strong assertion in evidence")
304
+
305
+ # For claims with strong assertions, check for high-quality sources
306
+ high_quality_source_markers = [
307
+ "journal", "doi.org", "research", "university", "institute",
308
+ "laboratory", "professor", "study", "publication", "published in"
309
+ ]
310
+
311
+ is_high_quality = any(term in evidence.lower() for term in high_quality_source_markers)
312
+ quality_boost = 1.4 if is_high_quality else 1.0
313
+
314
+ # Apply the quality boost
315
+ source_boost *= quality_boost
316
+
317
+ # Calculate final relevance score with improvements for all claim types
318
+ if claim_has_strong_assertions:
319
+ relevance_score = (
320
+ (similarity * 0.35) + # Semantic similarity
321
+ (entity_overlap * 0.25) + # Entity overlap
322
+ (0.25) # Base value to ensure all evidence has some relevance
323
+ ) * temporal_relevance * authority_score * source_boost * claim_specificity_match * evidence_specificity_match
324
+ else:
325
+ # Original formula for regular claims
326
+ relevance_score = (
327
+ (similarity * 0.4) + # Semantic similarity
328
+ (entity_overlap * 0.3) + # Entity overlap
329
+ (0.3) # Base value to ensure all evidence has some relevance
330
+ ) * temporal_relevance * authority_score * source_boost
331
+
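+ # Worked example for a regular claim: similarity 0.5 and entity_overlap 0.5
+ # give (0.5 * 0.4) + (0.5 * 0.3) + 0.3 = 0.65 before the multipliers; a
+ # temporally relevant (1.2), fact-check flavoured (1.5) item from a credible
+ # source (1.1) would then land at roughly 1.29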
332
+ # Add metadata and relevance score
333
+ analyzed_evidence.append({
334
+ "text": evidence,
335
+ "relevance_score": relevance_score,
336
+ "similarity": similarity,
337
+ "entity_overlap": entity_overlap,
338
+ "temporal_relevance": temporal_relevance,
339
+ "authority_score": authority_score,
340
+ "authority_type": authority_type,
341
+ "source_boost": source_boost,
342
+ "domain": domain
343
+ })
344
+
345
+ # Sort by relevance score (descending)
346
+ analyzed_evidence.sort(key=lambda x: x["relevance_score"], reverse=True)
347
+
348
+ # Ensure we have diverse sources in top results for all claims
349
+ if len(found_domains) > 1:
350
+ # Try to promote evidence from reliable sources if we haven't selected any yet
351
+ reliable_sources_seen = False
352
+
353
+ # Check if top 3 results contain any reliable sources
354
+ for item in analyzed_evidence[:3]:
355
+ domain = item.get("domain", "")
356
+ if domain and source_credibility and any(cred_domain in domain for cred_domain in source_credibility):
357
+ reliable_sources_seen = True
358
+ break
359
+
360
+ # If no reliable sources in top results, promote one if available
361
+ if not reliable_sources_seen:
362
+ for i, item in enumerate(analyzed_evidence[3:]):
363
+ domain = item.get("domain", "")
364
+ if domain and source_credibility and any(cred_domain in domain for cred_domain in source_credibility):
365
+ # Swap this item into the top 3
366
+ analyzed_evidence.insert(2, analyzed_evidence.pop(i+3))
367
+ break
368
+
369
+ return analyzed_evidence
370
+
371
+ def select_diverse_evidence(analyzed_evidence, max_items=5):
372
+ """
373
+ Select diverse evidence items based on relevance, source diversity and claim characteristics
374
+
375
+ Args:
376
+ analyzed_evidence (list): List of evidence items with relevance scores
377
+ max_items (int): Maximum number of evidence items to return
378
+
379
+ Returns:
380
+ list: Selected diverse evidence items
381
+ """
382
+ if not analyzed_evidence:
383
+ return []
384
+
385
+ # Check if top evidence suggests claim has strong assertions
386
+ strong_assertion_markers = [
387
+ "solved", "cured", "discovered", "breakthrough", "revolutionary",
388
+ "first ever", "confirmed", "definitive", "conclusive", "proven",
389
+ "groundbreaking", "unprecedented", "extends lifespan", "definitively"
390
+ ]
391
+
392
+ # Determine if this is a claim with strong assertions by checking evidence text
393
+ has_strong_assertions = False
394
+
395
+ for item in analyzed_evidence[:3]: # Check just the top items for efficiency
396
+ if "text" in item:
397
+ item_text = item["text"].lower()
398
+ if any(f"claim {marker}" in item_text or f"claim has {marker}" in item_text
399
+ for marker in strong_assertion_markers):
400
+ has_strong_assertions = True
401
+ break
402
+
403
+ # Also check for contradiction markers in evidence which can indicate a strong assertion
404
+ contradiction_markers = [
405
+ "not yet solved", "hasn't been proven", "no evidence that",
406
+ "remains unsolved", "has not been confirmed", "remains theoretical"
407
+ ]
408
+
409
+ if not has_strong_assertions:
410
+ for item in analyzed_evidence[:3]:
411
+ if "text" in item:
412
+ item_text = item["text"].lower()
413
+ if any(marker in item_text for marker in contradiction_markers):
414
+ has_strong_assertions = True
415
+ break
416
+
417
+ # Ensure we don't select more than available
418
+ max_items = min(max_items, len(analyzed_evidence))
419
+
420
+ # Initialize selected items with the most relevant item
421
+ selected = [analyzed_evidence[0]]
422
+ remaining = analyzed_evidence[1:]
423
+
424
+ # Track sources to ensure diversity
425
+ selected_sources = set()
426
+ for item in selected:
427
+ # Try to extract source from evidence
428
+ source_match = re.search(r'Source: ([^,]+)', item["text"])
429
+ if source_match:
430
+ selected_sources.add(source_match.group(1))
431
+
432
+ # For all claims, track if we have high-quality sources yet
433
+ has_quality_source = False
434
+ quality_source_markers = ["journal", "doi.org", "research", "university",
435
+ "institute", "laboratory", "professor", "study"]
436
+
437
+ # Check if our top item is already from a quality source
438
+ if any(marker in selected[0]["text"].lower() for marker in quality_source_markers):
439
+ has_quality_source = True
440
+
441
+ # Select remaining items balancing relevance and diversity
442
+ while len(selected) < max_items and remaining:
443
+ best_item = None
444
+ best_score = -1
445
+
446
+ for i, item in enumerate(remaining):
447
+ # Base score is the item's relevance
448
+ score = item["relevance_score"]
449
+
450
+ # Extract source if available
451
+ source = None
452
+ source_match = re.search(r'Source: ([^,]+)', item["text"])
453
+ if source_match:
454
+ source = source_match.group(1)
455
+
456
+ # Apply diversity bonus if source is new
457
+ if source and source not in selected_sources:
458
+ score *= 1.2 # Diversity bonus
459
+
460
+ # For claims with strong assertions, apply bonus for contradicting evidence
461
+ if has_strong_assertions:
462
+ # Check for contradiction markers in the text
463
+ if any(marker in item["text"].lower() for marker in contradiction_markers):
464
+ score *= 1.3 # Bonus for evidence that may contradict strong assertions
465
+
466
+ # For any claim, apply bonus for high-quality sources if we don't have one yet
467
+ if not has_quality_source:
468
+ is_item_quality = any(marker in item["text"].lower() for marker in quality_source_markers)
469
+ if is_item_quality:
470
+ score *= 1.5 # Significant bonus for quality sources
471
+
472
+ if score > best_score:
473
+ best_score = score
474
+ best_item = (i, item)
475
+
476
+ if best_item:
477
+ idx, item = best_item
478
+ selected.append(item)
479
+ remaining.pop(idx)
480
+
481
+ # Add source to selected sources
482
+ source_match = re.search(r'Source: ([^,]+)', item["text"])
483
+ if source_match:
484
+ selected_sources.add(source_match.group(1))
485
+
486
+ # Check if we found a quality source
487
+ if not has_quality_source:
488
+ if any(marker in item["text"].lower() for marker in quality_source_markers):
489
+ has_quality_source = True
490
+ else:
491
+ break
492
+
493
+ # For any claim with strong assertions, ensure we have at least one quality source if available
494
+ if has_strong_assertions and not has_quality_source and remaining:
495
+ for i, item in enumerate(remaining):
496
+ if any(marker in item["text"].lower() for marker in quality_source_markers):
497
+ # Replace the least relevant selected item with this quality one
498
+ selected.sort(key=lambda x: x["relevance_score"])
499
+ selected[0] = item
500
+ break
501
+
502
+ # Return only the text portion
503
+ return [item["text"] for item in selected]
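An end-to-end sketch for this module (the credibility mapping below is illustrative; in the application it comes from config.SOURCE_CREDIBILITY):

    from modules.semantic_analysis import analyze_evidence_relevance, select_diverse_evidence

    claim = "NASA confirmed liquid water on Mars"
    evidence = ["Title: Mars findings, Source: nasa.gov, URL: https://www.nasa.gov/mars, Content: ..."]
    scored = analyze_evidence_relevance(claim, evidence, {"nasa.gov": 0.95})
    top_items = select_diverse_evidence(scored, max_items=5)  # plain evidence strings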
utils/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ """
2
+ Utils package initialization.
3
+
4
+ This package provides utility functions for the AskVeracity fact-checking system.
5
+ """
6
+
7
+ from .api_utils import api_error_handler, safe_json_parse, RateLimiter
8
+ from .performance import PerformanceTracker
9
+ from .models import initialize_models, get_nlp_model, get_llm_model
10
+
11
+
12
+ __all__ = [
13
+ 'api_error_handler',
14
+ 'safe_json_parse',
15
+ 'RateLimiter',
16
+ 'PerformanceTracker',
17
+ 'initialize_models',
18
+ 'get_nlp_model',
19
+ 'get_llm_model'
20
+ ]
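These re-exports let call sites import from the package root (sketch; the "news_api" rate-limit key is an assumption, not a value confirmed by this commit):

    from utils import api_error_handler, get_llm_model

    @api_error_handler("news_api")
    def fetch_news(query):
        ...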
utils/api_utils.py ADDED
@@ -0,0 +1,229 @@
+ """
+ API utilities for the Fake News Detector application.
+ 
+ This module provides utilities for handling API calls, rate limiting,
+ error handling, and exponential backoff for retrying failed requests.
+ """
+ 
+ import time
+ import functools
+ import random
+ import logging
+ import requests
+ from datetime import datetime, timedelta
+ from collections import deque
+ 
+ from config import RATE_LIMITS, ERROR_BACKOFF
+ 
+ logger = logging.getLogger("misinformation_detector")
+ 
+ class RateLimiter:
+     """
+     Rate limiter for API calls with support for different APIs.
+ 
+     This class implements a sliding-window algorithm for rate limiting,
+     with support for different rate limits for different APIs.
+     It also provides exponential backoff for error handling.
+     """
+ 
+     def __init__(self):
+         """Initialize the rate limiter with configuration from settings."""
+         # Store rate limits for different APIs
+         self.limits = {}
+ 
+         # Initialize limits from config
+         for api_name, limit_info in RATE_LIMITS.items():
+             self.limits[api_name] = {
+                 "requests": limit_info["requests"],
+                 "period": limit_info["period"],
+                 "timestamps": deque()
+             }
+ 
+         # Error backoff settings
+         self.max_retries = ERROR_BACKOFF["max_retries"]
+         self.initial_backoff = ERROR_BACKOFF["initial_backoff"]
+         self.backoff_factor = ERROR_BACKOFF["backoff_factor"]
+ 
+     def check_and_update(self, api_name):
+         """
+         Check if a request is allowed and update timestamps.
+ 
+         Args:
+             api_name (str): Name of the API to check
+ 
+         Returns:
+             tuple: (allowed, wait_time)
+                 - allowed (bool): Whether the request is allowed
+                 - wait_time (float): Time to wait if not allowed
+         """
+         if api_name not in self.limits:
+             return True, 0  # Unknown API, allow by default
+ 
+         now = datetime.now()
+         limit_info = self.limits[api_name]
+ 
+         # Remove timestamps older than the period
+         cutoff = now - timedelta(seconds=limit_info["period"])
+         while limit_info["timestamps"] and limit_info["timestamps"][0] < cutoff:
+             limit_info["timestamps"].popleft()
+ 
+         # Check if we're at the rate limit
+         if len(limit_info["timestamps"]) >= limit_info["requests"]:
+             # Calculate wait time until the oldest timestamp expires
+             wait_time = (limit_info["timestamps"][0] + timedelta(seconds=limit_info["period"]) - now).total_seconds()
+             return False, max(0, wait_time)
+ 
+         # Add current timestamp and allow request
+         limit_info["timestamps"].append(now)
+         return True, 0
+ 
+     def wait_if_needed(self, api_name):
+         """
+         Wait if the rate limit has been reached.
+ 
+         Args:
+             api_name (str): Name of the API to check
+ 
+         Returns:
+             bool: True if we waited, False otherwise
+         """
+         allowed, wait_time = self.check_and_update(api_name)
+         if not allowed:
+             logger.info(f"Rate limit reached for {api_name}. Waiting {wait_time:.2f} seconds...")
+             time.sleep(wait_time + 0.1)  # Add a small buffer
+             return True
+         return False
+ 
+     def get_backoff_time(self, attempt):
+         """
+         Calculate exponential backoff time with jitter.
+ 
+         Args:
+             attempt (int): Current attempt number (0-based)
+ 
+         Returns:
+             float: Backoff time in seconds
+         """
+         backoff = self.initial_backoff * (self.backoff_factor ** attempt)
+         # Add jitter to prevent the thundering herd problem
+         jitter = random.uniform(0, 0.1 * backoff)
+         return backoff + jitter
+ 
+ 
+ # Create rate limiter instance
+ rate_limiter = RateLimiter()
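A behavioral sketch of the limiter, assuming a hypothetical RATE_LIMITS entry allowing two requests per ten seconds for an API named "example_api":

    # Assumes RATE_LIMITS = {"example_api": {"requests": 2, "period": 10}} in config
    limiter = RateLimiter()

    limiter.check_and_update("example_api")   # (True, 0) - first request allowed
    limiter.check_and_update("example_api")   # (True, 0) - second request allowed
    allowed, wait = limiter.check_and_update("example_api")
    # allowed is False; wait is the number of seconds until the oldest
    # timestamp falls out of the 10-second window

    limiter.wait_if_needed("example_api")     # sleeps wait + 0.1s, returns True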
+ 
+ # API error handler decorator
+ def api_error_handler(api_name):
+     """
+     Decorator for API calls with error handling and rate limiting.
+ 
+     This decorator handles rate limiting, retries with exponential
+     backoff, and error handling for API calls.
+ 
+     Args:
+         api_name (str): Name of the API being called
+ 
+     Returns:
+         callable: Decorated function
+     """
+     def decorator(func):
+         @functools.wraps(func)
+         def wrapper(*args, **kwargs):
+             try:
+                 # Apply rate limiting - make sure rate_limiter exists and has the method
+                 if hasattr(rate_limiter, 'wait_if_needed'):
+                     rate_limiter.wait_if_needed(api_name)
+ 
+                 # Track retries
+                 for attempt in range(rate_limiter.max_retries):
+                     try:
+                         return func(*args, **kwargs)
+                     except requests.exceptions.HTTPError as e:
+                         status_code = e.response.status_code if e.response is not None else 0
+ 
+                         # Handle specific HTTP errors
+                         if status_code == 429:  # Too Many Requests
+                             logger.warning(f"{api_name} rate limit exceeded (429). Attempt {attempt+1}/{rate_limiter.max_retries}")
+                             # Honor the Retry-After header if present, else use exponential backoff
+                             retry_after = e.response.headers.get('Retry-After')
+                             if retry_after and retry_after.isdigit():
+                                 wait_time = int(retry_after)
+                             else:
+                                 wait_time = rate_limiter.get_backoff_time(attempt)
+                             logger.info(f"Waiting {wait_time} seconds before retry...")
+                             time.sleep(wait_time)
+                         elif status_code >= 500:  # Server errors
+                             logger.warning(f"{api_name} server error ({status_code}). Attempt {attempt+1}/{rate_limiter.max_retries}")
+                             time.sleep(rate_limiter.get_backoff_time(attempt))
+                         elif status_code == 403:  # Forbidden - likely an API key issue
+                             logger.error(f"{api_name} access forbidden (403). Check API key.")
+                             return None  # Don't retry on auth errors
+                         elif status_code == 404:  # Not Found
+                             logger.warning(f"{api_name} resource not found (404).")
+                             return None  # Don't retry when the resource doesn't exist
+                         else:
+                             logger.error(f"{api_name} HTTP error: {e}")
+                             if attempt < rate_limiter.max_retries - 1:
+                                 wait_time = rate_limiter.get_backoff_time(attempt)
+                                 logger.info(f"Waiting {wait_time} seconds before retry...")
+                                 time.sleep(wait_time)
+                             else:
+                                 return None
+ 
+                     except requests.exceptions.ConnectionError as e:
+                         logger.error(f"{api_name} connection error: {e}")
+                         if attempt < rate_limiter.max_retries - 1:
+                             wait_time = rate_limiter.get_backoff_time(attempt)
+                             logger.info(f"Waiting {wait_time} seconds before retry...")
+                             time.sleep(wait_time)
+                         else:
+                             return None
+ 
+                     except requests.exceptions.Timeout as e:
+                         logger.error(f"{api_name} timeout error: {e}")
+                         if attempt < rate_limiter.max_retries - 1:
+                             wait_time = rate_limiter.get_backoff_time(attempt)
+                             logger.info(f"Waiting {wait_time} seconds before retry...")
+                             time.sleep(wait_time)
+                         else:
+                             return None
+ 
+                     except Exception as e:
+                         logger.error(f"{api_name} unexpected error: {str(e)}")
+                         if attempt < rate_limiter.max_retries - 1:
+                             wait_time = rate_limiter.get_backoff_time(attempt)
+                             logger.info(f"Waiting {wait_time} seconds before retry...")
+                             time.sleep(wait_time)
+                         else:
+                             return None
+ 
+                 # If we've exhausted all retries
+                 logger.error(f"{api_name} call failed after {rate_limiter.max_retries} attempts")
+                 return None
+ 
+             except Exception as e:
+                 # Catch any unexpected errors in the decorator itself
+                 logger.error(f"{api_name} decorator error: {str(e)}")
+                 return None
+ 
+         return wrapper
+     return decorator
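As a usage sketch (the function, endpoint, and API name here are hypothetical; "example_api" would also need an entry in RATE_LIMITS for rate limiting to apply):

    @api_error_handler("example_api")
    def fetch_articles(query):
        # Any HTTPError raised here is routed through the retry logic above
        response = requests.get("https://api.example.com/search",
                                params={"q": query}, timeout=10)
        response.raise_for_status()
        return response.json()

    articles = fetch_articles("climate policy")  # parsed JSON, or None once retries are exhausted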
+ 
+ def safe_json_parse(response, api_name):
+     """
+     Safely parse a JSON response with error handling.
+ 
+     Args:
+         response (requests.Response): Response object to parse
+         api_name (str): Name of the API for logging
+ 
+     Returns:
+         dict: Parsed JSON, or an empty dict on error
+     """
+     try:
+         return response.json()
+     except ValueError as e:
+         logger.error(f"Error parsing {api_name} JSON response: {e}")
+         logger.debug(f"Response content: {response.text[:500]}...")
+         return {}
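A quick failure-path sketch; constructing the Response by hand via its private `_content` attribute is a test-only shortcut, not something the application does:

    resp = requests.models.Response()
    resp._content = b"<html>Service Unavailable</html>"  # not valid JSON

    data = safe_json_parse(resp, "example_api")  # logs the parse error, returns {}
    articles = data.get("articles", [])          # callers can chain .get() safely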
utils/models.py ADDED
@@ -0,0 +1,157 @@
+ """
+ Model management utility for the Fake News Detector application.
+ 
+ This module provides functions for initializing, caching, and
+ retrieving the language models used throughout the application.
+ It ensures models are loaded efficiently and reused appropriately.
+ """
+ 
+ import os
+ import logging
+ import functools
+ from langchain_openai import ChatOpenAI
+ import spacy
+ 
+ logger = logging.getLogger("misinformation_detector")
+ 
+ # Global variables for models
+ nlp = None
+ model = None
+ models_initialized = False
+ 
+ # Caching decorator
+ def cached_model(func):
+     """
+     Decorator to cache model loading for improved performance.
+ 
+     This decorator ensures that models are only loaded once and
+     then reused for subsequent calls, improving performance by
+     avoiding redundant model loading.
+ 
+     Args:
+         func (callable): Function that loads a model
+ 
+     Returns:
+         callable: Wrapped function that returns a cached model
+     """
+     cache = {}
+ 
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs):
+         # Use the function name as the cache key
+         key = func.__name__
+         if key not in cache:
+             logger.info(f"Model not in cache, calling {key}...")
+             cache[key] = func(*args, **kwargs)
+         return cache[key]
+ 
+     return wrapper
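A sketch of the caching behavior with a hypothetical loader: the first call runs the function, later calls return the cached object:

    @cached_model
    def load_dummy_model():
        logger.info("Loading expensive dummy model...")
        return object()  # stand-in for a real model

    a = load_dummy_model()  # logs, loads, and caches under 'load_dummy_model'
    b = load_dummy_model()  # served from the cache
    assert a is b

Note that the cache key is the function name alone, so a decorated loader that took arguments would return the same cached object regardless of the arguments passed.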
+ 
+ def initialize_models():
+     """
+     Initialize all required models.
+ 
+     This function loads and initializes all the language models
+     needed by the application, including spaCy for NLP tasks and
+     OpenAI for LLM-based processing.
+ 
+     Returns:
+         str: Initialization status message
+ 
+     Raises:
+         ValueError: If the OpenAI API key is not set
+     """
+     global nlp, model, models_initialized
+ 
+     # Skip initialization if already done
+     if models_initialized:
+         logger.info("Models already initialized, skipping initialization")
+         return "Models already initialized"
+ 
+     # Check OpenAI API key
+     if "OPENAI_API_KEY" not in os.environ or not os.environ["OPENAI_API_KEY"].strip():
+         logger.error("OPENAI_API_KEY environment variable not set or empty")
+         raise ValueError("OpenAI API key is required. Please set it in the Hugging Face Space secrets.")
+ 
+     try:
+         # Load NLP model
+         try:
+             logger.info("Loading spaCy NLP model...")
+             nlp = spacy.load("en_core_web_sm")
+             logger.info("Loaded spaCy NLP model")
+         except OSError as e:
+             # This handles the case where the model wasn't installed correctly
+             logger.warning(f"Could not load spaCy model: {str(e)}")
+             logger.info("Attempting to download spaCy model...")
+             try:
+                 import subprocess
+                 import sys
+                 # This downloads the model if it's missing
+                 subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
+                 # Try loading again
+                 nlp = spacy.load("en_core_web_sm")
+                 logger.info("Successfully downloaded and loaded spaCy model")
+             except Exception as download_err:
+                 logger.error(f"Failed to download spaCy model: {str(download_err)}")
+                 # Continue with other initialization; a missing NLP model is handled elsewhere
+ 
+         # Set up OpenAI model
+         logger.info("Initializing ChatOpenAI model...")
+         model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+         logger.info("Initialized ChatOpenAI model")
+ 
+         # Mark initialization as complete
+         models_initialized = True
+         return "Models initialized successfully"
+ 
+     except Exception as e:
+         logger.error(f"Error initializing models: {str(e)}")
+         raise
+ 
+ @cached_model
+ def get_nlp_model():
+     """
+     Get the spaCy NLP model, initializing it if needed.
+ 
+     This function returns a cached spaCy model for NLP tasks.
+     If the model hasn't been loaded yet, it will be loaded.
+ 
+     Returns:
+         spacy.Language: Loaded spaCy model
+     """
+     global nlp
+     if nlp is None:
+         try:
+             # Try to load just the spaCy model if not loaded yet
+             logger.info("Loading spaCy NLP model...")
+             nlp = spacy.load("en_core_web_sm")
+             logger.info("Loaded spaCy NLP model")
+         except Exception as e:
+             logger.error(f"Error loading spaCy model: {str(e)}")
+             # Fall back to full initialization
+             initialize_models()
+     return nlp
+ 
+ @cached_model
+ def get_llm_model():
+     """
+     Get the ChatOpenAI model, initializing it if needed.
+ 
+     This function returns a cached OpenAI LLM model.
+     If the model hasn't been loaded yet, it will be loaded.
+ 
+     Returns:
+         ChatOpenAI: Loaded LLM model
+     """
+     global model
+     if model is None:
+         try:
+             # Try to load just the LLM model if not loaded yet
+             logger.info("Initializing ChatOpenAI model...")
+             model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+             logger.info("Initialized ChatOpenAI model")
+         except Exception as e:
+             logger.error(f"Error initializing ChatOpenAI model: {str(e)}")
+             # Fall back to full initialization
+             initialize_models()
+     return model
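Typical call pattern elsewhere in the application; thanks to @cached_model and the lazy checks above, repeated calls are cheap. This sketch assumes OPENAI_API_KEY is set in the environment:

    nlp = get_nlp_model()
    doc = nlp("NASA confirmed the mission launch date.")
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    llm = get_llm_model()
    reply = llm.invoke("Briefly assess: 'NASA confirmed the mission launch date.'")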
utils/performance.py ADDED
@@ -0,0 +1,135 @@
+ """
+ Performance tracking utility for the Fake News Detector application.
+ 
+ This module provides functionality to track and analyze the
+ performance of the application, including processing times,
+ success rates, and resource utilization.
+ """
+ 
+ import time
+ import logging
+ 
+ logger = logging.getLogger("misinformation_detector")
+ 
+ class PerformanceTracker:
+     """
+     Tracks and logs performance metrics for the fact-checking system.
+ 
+     This class maintains counters and statistics for various performance
+     metrics, such as processing times, evidence retrieval success rates,
+     and confidence scores.
+     """
+ 
+     def __init__(self):
+         """Initialize the performance tracker with empty metrics."""
+         self.metrics = {
+             "claims_processed": 0,
+             "evidence_retrieval_success_rate": [],
+             "processing_times": [],
+             "confidence_scores": [],
+             "source_types_used": {},
+             "temporal_relevance": []
+         }
+ 
+     def log_claim_processed(self):
+         """
+         Increment the counter for processed claims.
+         This should be called whenever a claim is processed successfully.
+         """
+         self.metrics["claims_processed"] += 1
+ 
+     def log_evidence_retrieval(self, success, sources_count):
+         """
+         Log the success or failure of evidence retrieval.
+ 
+         Args:
+             success (bool): Whether evidence retrieval was successful
+             sources_count (dict): Count of evidence items by source type
+         """
+         # Record success as 0 or 1
+         success_value = 1 if success else 0
+         self.metrics["evidence_retrieval_success_rate"].append(success_value)
+ 
+         # Safely process source types
+         if isinstance(sources_count, dict):
+             for source_type, count in sources_count.items():
+                 # Ensure source_type is a string and count is an integer
+                 source_type = str(source_type)
+                 try:
+                     count = int(count)
+                 except (ValueError, TypeError):
+                     count = 1
+ 
+                 # Update source types used
+                 self.metrics["source_types_used"][source_type] = \
+                     self.metrics["source_types_used"].get(source_type, 0) + count
+ 
+     def log_processing_time(self, start_time):
+         """
+         Log the processing time for an operation.
+ 
+         Args:
+             start_time (float): Start time obtained from time.time()
+         """
+         end_time = time.time()
+         processing_time = end_time - start_time
+         self.metrics["processing_times"].append(processing_time)
+ 
+     def log_confidence_score(self, score):
+         """
+         Log a confidence score.
+ 
+         Args:
+             score (float): Confidence score between 0 and 1
+         """
+         # Ensure the score is a float between 0 and 1
+         try:
+             score = float(score)
+             if 0 <= score <= 1:
+                 self.metrics["confidence_scores"].append(score)
+         except (ValueError, TypeError):
+             logger.warning(f"Invalid confidence score: {score}")
+ 
+     def log_temporal_relevance(self, relevance_score):
+         """
+         Log a temporal relevance score.
+ 
+         Args:
+             relevance_score (float): Temporal relevance score between 0 and 1
+         """
+         # Ensure the relevance score is a float between 0 and 1
+         try:
+             relevance_score = float(relevance_score)
+             if 0 <= relevance_score <= 1:
+                 self.metrics["temporal_relevance"].append(relevance_score)
+         except (ValueError, TypeError):
+             logger.warning(f"Invalid temporal relevance score: {relevance_score}")
+ 
+     def get_summary(self):
+         """
+         Get a summary of all performance metrics.
+ 
+         Returns:
+             dict: Summary of performance metrics
+         """
+         # Safely calculate averages with error handling
+         def safe_avg(metric_list):
+             try:
+                 return sum(metric_list) / max(len(metric_list), 1)
+             except (TypeError, ValueError):
+                 return 0.0
+ 
+         return {
+             "claims_processed": self.metrics["claims_processed"],
+             "avg_evidence_retrieval_success_rate": safe_avg(self.metrics["evidence_retrieval_success_rate"]),
+             "avg_processing_time": safe_avg(self.metrics["processing_times"]),
+             "avg_confidence_score": safe_avg(self.metrics["confidence_scores"]),
+             "source_types_used": dict(self.metrics["source_types_used"]),
+             "avg_temporal_relevance": safe_avg(self.metrics["temporal_relevance"])
+         }
+ 
+     def reset(self):
+         """Reset all performance metrics."""
+         self.__init__()
+         logger.info("Performance metrics have been reset")
+         return "Performance metrics reset successfully"