ankanghosh committed (verified)
Commit 87591ba · Parent(s): 5b9788e

Delete modules

modules/__init__.py DELETED
@@ -1,19 +0,0 @@
- """
- Modules package initialization.
-
- This package contains the core modules for the AskVeracity fact-checking system.
- """
-
- from .claim_extraction import extract_claims, shorten_claim_for_evidence
- from .evidence_retrieval import retrieve_combined_evidence
- from .classification import classify_with_llm, aggregate_evidence
- from .explanation import generate_explanation
-
- __all__ = [
-     'extract_claims',
-     'shorten_claim_for_evidence',
-     'retrieve_combined_evidence',
-     'classify_with_llm',
-     'aggregate_evidence',
-     'generate_explanation'
- ]
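
Note: the four imports above define the public pipeline of the deleted package. A minimal usage sketch follows, assuming the package were still importable as `modules`; the signatures of retrieve_combined_evidence, aggregate_evidence, and generate_explanation are inferred from their names and are assumptions, not shown in this commit.

# Hypothetical usage sketch of the deleted package's public API.
from modules import (
    extract_claims,
    retrieve_combined_evidence,  # assumed: claim string -> list of evidence items
    classify_with_llm,           # signature shown in classification.py below
    aggregate_evidence,          # assumed: classification results -> overall verdict
    generate_explanation,        # assumed: produces a user-facing rationale
)

claim = extract_claims("Long user-submitted text containing a factual claim...")
evidence = retrieve_combined_evidence(claim)
results = classify_with_llm(claim, evidence)
verdict = aggregate_evidence(results)
explanation = generate_explanation(claim, verdict)
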
modules/category_detection.py DELETED
@@ -1,880 +0,0 @@
- import logging
- import re
- from typing import Tuple, List, Dict, Optional
- import os
- import time
-
- # Set up logging
- logger = logging.getLogger("misinformation_detector")
-
- # Define categories and their keywords
- CLAIM_CATEGORIES = {
-     "ai": [
-         # General AI terms
-         "AI", "artificial intelligence", "machine learning", "ML", "deep learning", "DL",
-         "neural network", "neural nets", "generative AI", "GenAI", "AGI", "artificial general intelligence",
-         "transformer", "attention mechanism", "fine-tuning", "pre-training", "training", "inference",
-
-         # AI Models and Architectures
-         "language model", "large language model", "LLM", "foundation model", "multimodal model",
-         "vision language model", "VLM", "text-to-speech", "TTS", "speech-to-text", "STT",
-         "text-to-image", "image-to-text", "diffusion model", "generative model", "discriminative model",
-         "GPT", "BERT", "T5", "PaLM", "Claude", "Llama", "Gemini", "Mistral", "Mixtral", "Stable Diffusion",
-         "Dall-E", "Midjourney", "Sora", "transformer", "MoE", "mixture of experts", "sparse model",
-         "dense model", "encoder", "decoder", "encoder-decoder", "autoencoder", "VAE",
-         "mixture of experts", "MoE", "sparse MoE", "switch transformer", "gated experts",
-         "routing network", "expert routing", "pathways", "multi-query attention", "multi-head attention",
-         "rotary position embedding", "RoPE", "grouped-query attention", "GQA", "flash attention",
-         "state space model", "SSM", "mamba", "recurrent neural network", "RNN", "LSTM", "GRU",
-         "convolutional neural network", "CNN", "residual connection", "skip connection", "normalization",
-         "layer norm", "group norm", "batch norm", "parameter efficient fine-tuning", "PEFT",
-         "LoRA", "low-rank adaptation", "QLoRA", "adapters", "prompt tuning", "prefix tuning",
-
-         # AI Learning Paradigms
-         "supervised learning", "unsupervised learning", "reinforcement learning", "RL",
-         "meta-learning", "transfer learning", "federated learning", "self-supervised learning",
-         "semi-supervised learning", "few-shot learning", "zero-shot learning", "one-shot learning",
-         "contrastive learning", "curriculum learning", "imitation learning", "active learning",
-         "reinforcement learning from human feedback", "RLHF", "direct preference optimization", "DPO",
-         "constitutional AI", "red teaming", "adversarial training", "GAN", "generative adversarial network",
-         "diffusion", "latent diffusion", "flow-based model", "variational autoencoder", "VAE",
-
-         # AI Capabilities and Applications
-         "natural language processing", "NLP", "computer vision", "CV", "speech recognition",
-         "text generation", "image generation", "video generation", "multimodal", "multi-modal",
-         "recommendation system", "recommender system", "chatbot", "conversational AI",
-         "sentiment analysis", "entity recognition", "semantic search", "vector search", "embedding",
-         "classification", "regression", "clustering", "anomaly detection", "agent", "AI agent",
-         "autonomous agent", "agentic", "RAG", "retrieval augmented generation", "tool use",
-         "function calling", "reasoning", "chain-of-thought", "CoT", "tree-of-thought", "ToT",
-         "planning", "decision making", "multi-agent", "agent swarm", "multi-agent simulation",
-
-         # AI Technical Terms
-         "token", "tokenizer", "tokenization", "embedding", "vector", "prompt", "prompt engineering",
-         "context window", "parameter", "weights", "bias", "activation function", "loss function",
-         "gradient descent", "backpropagation", "epoch", "batch", "mini-batch", "regularization",
-         "dropout", "overfitting", "underfitting", "hyperparameter", "latent space", "latent variable",
-         "feature extraction", "dimensionality reduction", "optimization", "quantization", "pruning",
-         "fine-tuning", "transfer learning", "knowledge distillation", "int4", "int8", "bfloat16",
-         "float16", "mixed precision", "GPTQ", "AWQ", "GGUF", "GGML", "KV cache", "speculative decoding",
-         "beam search", "greedy decoding", "temperature", "top-k", "top-p", "nucleus sampling",
-
-         # AI Tools and Frameworks
-         "TensorFlow", "PyTorch", "JAX", "Keras", "Hugging Face", "Transformers", "Diffusers",
-         "LangChain", "Llama Index", "OpenAI", "Anthropic", "NVIDIA", "GPU", "TPU", "IPU", "NPU", "CUDA",
-         "MLOps", "model monitoring", "model deployment", "model serving", "inference endpoint",
-         "vLLM", "TGI", "text generation inference", "triton", "onnx", "tensorRT",
-
-         # AI Ethics and Concerns
-         "AI ethics", "responsible AI", "AI safety", "AI alignment", "AI governance",
-         "bias", "fairness", "interpretability", "explainability", "XAI", "transparency",
-         "hallucination", "toxicity", "safe deployment", "AI risk", "AI capabilities",
-         "alignment tax", "red teaming", "jailbreak", "prompt injection", "data poisoning",
-
-         # AI Companies and Organizations
-         "OpenAI", "Anthropic", "Google DeepMind", "Meta AI", "Microsoft", "NVIDIA",
-         "Hugging Face", "Mistral AI", "Cohere", "AI21 Labs", "Stability AI", "Midjourney",
-         "EleutherAI", "Allen AI", "DeepMind", "Character AI", "Inflection AI", "xAI"
-     ],
-
-     "science": [
-         # General scientific terms
-         "study", "research", "scientist", "scientific", "discovered", "experiment",
-         "laboratory", "clinical", "trial", "hypothesis", "theory", "evidence-based",
-         "peer-reviewed", "journal", "publication", "finding", "breakthrough", "innovation",
-         "discovery", "analysis", "data", "measurement", "observation", "empirical",
-
-         # Biology and medicine
-         "biology", "chemistry", "physics", "genetics", "genomics", "DNA", "RNA",
-         "medicine", "gene", "protein", "molecule", "cell", "brain", "neuro",
-         "cancer", "disease", "cure", "treatment", "vaccine", "health", "medical",
-         "pharmaceutical", "drug", "therapy", "symptom", "diagnosis", "prognosis",
-         "patient", "doctor", "hospital", "clinic", "surgery", "immune", "antibody",
-         "virus", "bacteria", "pathogen", "infection", "epidemic", "pandemic",
-         "organism", "evolution", "mutation", "chromosome", "enzyme", "hormone",
-
-         # Physics and astronomy
-         "quantum", "particle", "atom", "nuclear", "electron", "neutron", "proton",
-         "atomic", "subatomic", "molecular", "energy", "matter", "mass", "force",
-         "space", "NASA", "telescope", "planet", "exoplanet", "moon", "lunar", "mars",
-         "star", "galaxy", "cosmic", "astronomical", "universe", "solar", "celestial",
-         "orbit", "gravitational", "gravity", "relativity", "quantum mechanics",
-         "string theory", "dark matter", "dark energy", "black hole", "supernova",
-         "radiation", "radioactive", "isotope", "fission", "fusion", "accelerator",
-
-         # Environmental science
-         "climate", "carbon", "environment", "ecosystem", "species", "extinct",
-         "endangered", "biodiversity", "conservation", "sustainable", "renewable",
-         "fossil fuel", "greenhouse", "global warming", "polar", "ice cap", "glacier",
-         "ozone", "atmosphere", "weather", "meteorology", "geology", "earthquake",
-         "volcanic", "ocean", "marine", "coral reef", "deforestation", "pollution",
-
-         # Math and computer science (non-AI specific)
-         "equation", "formula", "theorem", "calculus", "statistical", "probability",
-         "dataset", "parameter", "variable", "function", "matrix", "optimization",
-
-         # Organizations
-         "CERN", "NIH", "CDC", "WHO", "NOAA", "ESA", "SpaceX", "Blue Origin", "JPL",
-         "laboratory", "institute", "university", "academic", "faculty", "professor",
-
-         # Science tools
-         "Matlab", "SPSS", "SAS", "ImageJ", "LabVIEW", "ANSYS", "Cadence", "Origin",
-         "Avogadro", "ChemDraw", "Mathematica", "Wolfram Alpha", "COMSOL", "LAMMPS",
-         "VASP", "Gaussian", "GIS", "ArcGIS", "QGIS", "Maple", "R Studio"
-     ],
-
-     "technology": [
-         # General tech terms
-         "computer", "software", "hardware", "internet", "cyber", "digital", "tech",
-         "robot", "automation", "autonomous", "code", "programming", "data", "cloud",
-         "server", "network", "encryption", "blockchain", "crypto", "bitcoin", "ethereum",
-         "technology", "innovation", "breakthrough", "prototype", "development",
-         "engineering", "technical", "specification", "feature", "functionality",
-         "interface", "system", "infrastructure", "integration", "implementation",
-
-         # Devices and hardware
-         "smartphone", "device", "gadget", "laptop", "desktop", "tablet", "wearable",
-         "smartwatch", "IoT", "internet of things", "sensor", "chip", "semiconductor",
-         "processor", "CPU", "GPU", "memory", "RAM", "storage", "hard drive", "SSD",
-         "electronic", "circuit", "motherboard", "component", "peripheral", "accessory",
-         "display", "screen", "touchscreen", "camera", "lens", "microphone", "speaker",
-         "battery", "charger", "wireless", "bluetooth", "WiFi", "router", "modem",
-
-         # Software and internet
-         "app", "application", "platform", "website", "online", "web", "browser",
-         "operating system", "Windows", "macOS", "Linux", "Android", "iOS", "software",
-         "program", "code", "coding", "development", "framework", "library", "API",
-         "interface", "backend", "frontend", "full-stack", "developer", "programmer",
-         "database", "SQL", "NoSQL", "cloud computing", "SaaS", "PaaS", "IaaS",
-         "DevOps", "agile", "scrum", "sprint", "version control", "git", "repository",
-
-         # Communications and networking
-         "5G", "6G", "broadband", "fiber", "network", "wireless", "cellular", "mobile",
-         "telecommunications", "telecom", "transmission", "bandwidth", "latency",
-         "protocol", "IP address", "DNS", "server", "hosting", "data center",
-
-         # Company and product names
-         "Apple", "Google", "Microsoft", "Amazon", "Facebook", "Meta", "Tesla",
-         "IBM", "Intel", "AMD", "Nvidia", "Qualcomm", "Cisco", "Oracle", "SAP",
-         "Huawei", "Samsung", "Sony", "LG", "Dell", "HP", "Lenovo", "Xiaomi",
-         "iPhone", "iPad", "MacBook", "Surface", "Galaxy", "Pixel", "Windows",
-         "Android", "iOS", "Chrome", "Firefox", "Edge", "Safari", "Office",
-         "Azure", "AWS", "Google Cloud", "Gmail", "Outlook", "Teams", "Zoom",
-
-         # Advanced technologies
-         "VR", "AR", "XR", "virtual reality", "augmented reality", "mixed reality",
-         "metaverse", "3D printing", "additive manufacturing", "quantum computing",
-         "nanotechnology", "biotechnology", "electric vehicle", "self-driving",
-         "autonomous vehicle", "drone", "UAV", "robotics", "cybersecurity",
-
-         # Social media
-         "social media", "social network", "Facebook", "Instagram", "Twitter", "X",
-         "LinkedIn", "TikTok", "Snapchat", "YouTube", "Pinterest", "Reddit",
-         "streaming", "content creator", "influencer", "follower", "like", "share",
-         "post", "tweet", "user-generated", "viral", "trending", "engagement",
-
-         # Technology tools
-         "NumPy", "Pandas", "Matplotlib", "Seaborn", "Scikit-learn", "Jupyter",
-         "Visual Studio", "VS Code", "IntelliJ", "PyCharm", "Eclipse", "Android Studio",
-         "Xcode", "Docker", "Kubernetes", "Jenkins", "Ansible", "Terraform", "Vagrant",
-         "AWS CLI", "Azure CLI", "GCP CLI", "PowerShell", "Bash", "npm", "pip", "conda",
-         "React", "Angular", "Vue.js", "Node.js", "Django", "Flask", "Spring", "Laravel",
-         "PostgreSQL", "MySQL", "MongoDB", "Redis", "Elasticsearch", "Kafka", "RabbitMQ",
-
-         # Optimization terms
-         "optimization", "efficiency", "performance tuning", "benchmarking", "profiling",
-         "refactoring", "scaling", "bottleneck", "throughput", "latency reduction",
-         "response time", "caching", "load balancing", "distributed computing",
-         "parallel processing", "concurrency", "asynchronous", "memory management"
-     ],
-
-     "politics": [
-         # Government structure
-         "president", "prime minister", "government", "parliament", "congress",
-         "senate", "house", "representative", "minister", "secretary", "cabinet",
-         "administration", "mayor", "governor", "politician", "official", "authority",
-         "federal", "state", "local", "municipal", "county", "city", "town",
-         "constituency", "district", "precinct", "ward", "judiciary", "executive",
-         "legislative", "branch", "checks and balances", "separation of powers",
-
-         # Political activities
-         "policy", "election", "campaign", "vote", "voter", "ballot", "polling",
-         "political", "politics", "debate", "speech", "address", "press conference",
-         "approval rating", "opinion poll", "candidate", "incumbent", "challenger",
-         "primary", "caucus", "convention", "delegate", "nomination", "campaign trail",
-         "fundraising", "lobbying", "advocacy", "activism", "protest", "demonstration",
-
-         # Political ideologies
-         "democracy", "democratic", "republican", "conservative", "liberal",
-         "progressive", "left-wing", "right-wing", "centrist", "moderate",
-         "socialist", "capitalist", "communist", "libertarian", "populist",
-         "nationalist", "globalist", "isolationist", "hawk", "dove",
-         "ideology", "partisan", "bipartisan", "coalition", "majority", "minority",
-
-         # Laws and regulations
-         "bill", "law", "legislation", "regulation", "policy", "statute", "code",
-         "amendment", "reform", "repeal", "enact", "implement", "enforce",
-         "constitutional", "unconstitutional", "legal", "illegal", "legalize",
-         "criminalize", "deregulate", "regulatory", "compliance", "mandate",
-
-         # Judicial and legal
-         "court", "supreme", "justice", "judge", "ruling", "decision", "opinion",
-         "case", "lawsuit", "litigation", "plaintiff", "defendant", "prosecutor",
-         "attorney", "lawyer", "advocate", "judicial review", "precedent",
-         "constitution", "amendment", "rights", "civil rights", "human rights",
-
-         # International relations
-         "treaty", "international", "diplomatic", "diplomacy", "relations",
-         "foreign policy", "domestic policy", "UN", "NATO", "EU", "United Nations",
-         "sanctions", "embargo", "tariff", "trade war", "diplomat", "embassy",
-         "consulate", "ambassador", "delegation", "summit", "bilateral", "multilateral",
-         "alliance", "ally", "adversary", "geopolitical", "sovereignty", "regime",
-
-         # Security and defense
-         "national security", "homeland security", "defense", "military", "armed forces",
-         "army", "navy", "air force", "marines", "coast guard", "intelligence",
-         "CIA", "FBI", "NSA", "Pentagon", "war", "conflict", "peacekeeping",
-         "terrorism", "counterterrorism", "insurgency", "nuclear weapon", "missile",
-         "disarmament", "nonproliferation", "surveillance", "espionage",
-
-         # Political institutions
-         "White House", "Kremlin", "Downing Street", "Capitol Hill", "Westminster",
-         "United Nations", "European Union", "NATO", "World Bank", "IMF", "WTO",
-         "ASEAN", "African Union", "BRICS", "G7", "G20",
-
-         # Political parties and movements
-         "Democrat", "Republican", "Labour", "Conservative", "Green Party",
-         "Socialist", "Communist", "Libertarian", "Independent", "Tea Party",
-         "progressive movement", "civil rights movement", "womens rights",
-         "LGBTQ rights", "Black Lives Matter", "environmental movement"
-     ],
-
-     "business": [
-         # Companies and organization types
-         "company", "corporation", "business", "startup", "firm", "enterprise",
-         "corporate", "industry", "sector", "conglomerate", "multinational",
-         "organization", "entity", "private", "public", "incorporated", "LLC",
-         "partnership", "proprietorship", "franchise", "subsidiary", "parent company",
-         "headquarters", "office", "facility", "plant", "factory", "warehouse",
-         "retail", "wholesale", "ecommerce", "brick-and-mortar", "chain", "outlet",
-
-         # Business roles and management
-         "executive", "CEO", "CFO", "CTO", "COO", "CMO", "CIO", "CHRO", "chief",
-         "director", "board", "chairman", "chairwoman", "chairperson", "president",
-         "vice president", "senior", "junior", "manager", "management", "supervisor",
-         "founder", "entrepreneur", "owner", "shareholder", "stakeholder",
-         "employee", "staff", "workforce", "personnel", "human resources", "HR",
-         "recruit", "hire", "layoff", "downsizing", "restructuring", "reorganization",
-
-         # Financial terms
-         "profit", "revenue", "sales", "income", "earnings", "EBITDA", "turnover",
-         "loss", "deficit", "expense", "cost", "overhead", "margin", "markup",
-         "budget", "forecast", "projection", "estimate", "actual", "variance",
-         "balance sheet", "income statement", "cash flow", "P&L", "liquidity",
-         "solvency", "asset", "liability", "equity", "debt", "leverage", "capital",
-         "working capital", "cash", "funds", "money", "payment", "transaction",
-
-         # Markets and trading
-         "market", "stock", "share", "bond", "security", "commodity", "futures",
-         "option", "derivative", "forex", "foreign exchange", "currency", "crypto",
-         "trader", "trading", "buy", "sell", "long", "short", "position", "portfolio",
-         "diversification", "hedge", "risk", "return", "yield", "dividend", "interest",
-         "bull market", "bear market", "correction", "crash", "rally", "volatile",
-         "volatility", "index", "benchmark", "Dow Jones", "NASDAQ", "S&P 500", "NYSE",
-
-         # Investment and funding
-         "investor", "investment", "fund", "mutual fund", "ETF", "hedge fund",
-         "private equity", "venture", "venture capital", "VC", "angel investor",
-         "seed", "Series A", "Series B", "Series C", "funding", "financing",
-         "loan", "credit", "debt", "equity", "fundraising", "crowdfunding",
-         "IPO", "initial public offering", "going public", "listed", "delisted",
-         "merger", "acquisition", "M&A", "takeover", "buyout", "divestiture",
-         "valuation", "billion", "million", "trillion", "unicorn", "decacorn",
-
-         # Economic terms
-         "economy", "economic", "economics", "macro", "micro", "fiscal", "monetary",
-         "supply", "demand", "market forces", "competition", "competitive", "monopoly",
-         "oligopoly", "antitrust", "regulation", "deregulation", "growth", "decline",
-         "recession", "depression", "recovery", "expansion", "contraction", "cycle",
-         "inflation", "deflation", "stagflation", "hyperinflation", "CPI", "price",
-         "GDP", "gross domestic product", "GNP", "productivity", "output", "input",
-
-         # Banking and finance
-         "finance", "financial", "bank", "banking", "commercial bank", "investment bank",
-         "central bank", "Federal Reserve", "Fed", "ECB", "Bank of England", "BOJ",
-         "interest rate", "prime rate", "discount rate", "basis point", "monetary policy",
-         "quantitative easing", "tightening", "loosening", "credit", "lending",
-         "borrowing", "loan", "mortgage", "consumer credit", "credit card", "debit card",
-         "checking", "savings", "deposit", "withdrawal", "ATM", "branch", "online banking",
-
-         # Currencies and payments
-         "dollar", "euro", "pound", "yen", "yuan", "rupee", "ruble", "real", "peso",
-         "currency", "money", "fiat", "exchange rate", "remittance", "transfer",
-         "payment", "transaction", "wire", "ACH", "SWIFT", "clearing", "settlement",
-         "cryptocurrency", "bitcoin", "ethereum", "blockchain", "fintech", "paytech",
-
-         # Business operations
-         "product", "service", "solution", "offering", "launch", "rollout", "release",
-         "operation", "production", "manufacturing", "supply chain", "logistics",
-         "procurement", "inventory", "distribution", "shipping", "delivery",
-         "quality", "control", "assurance", "standard", "certification", "compliance",
-         "process", "procedure", "workflow", "efficiency", "optimization",
-
-         # Marketing and sales
-         "marketing", "advertise", "advertising", "campaign", "promotion", "publicity",
-         "PR", "public relations", "brand", "branding", "identity", "image", "reputation",
-         "sales", "selling", "deal", "transaction", "pipeline", "lead", "prospect",
-         "customer", "client", "consumer", "buyer", "purchaser", "target market",
-         "segment", "demographic", "psychographic", "B2B", "B2C", "retail", "wholesale",
-         "price", "pricing", "discount", "premium", "luxury", "value", "bargain"
-     ],
-
-     "world": [
-         # General international terms
-         "country", "nation", "state", "republic", "kingdom", "global", "international",
-         "foreign", "world", "worldwide", "domestic", "abroad", "overseas",
-         "developed", "developing", "industrialized", "emerging", "third world",
-         "global south", "global north", "east", "west", "western", "eastern",
-         "bilateral", "multilateral", "transnational", "multinational", "sovereignty",
-
-         # Regions and continents
-         "Europe", "European", "Asia", "Asian", "Africa", "African", "North America",
-         "South America", "Latin America", "Australia", "Oceania", "Antarctica",
-         "Middle East", "Central Asia", "Southeast Asia", "East Asia", "South Asia",
-         "Eastern Europe", "Western Europe", "Northern Europe", "Southern Europe",
-         "Mediterranean", "Scandinavia", "Nordic", "Baltic", "Balkans", "Caucasus",
-         "Caribbean", "Central America", "South Pacific", "Polynesia", "Micronesia",
-
-         # Major countries and regions
-         "China", "Chinese", "Russia", "Russian", "India", "Indian", "Japan", "Japanese",
-         "UK", "British", "England", "English", "Scotland", "Scottish", "Wales", "Welsh",
-         "Germany", "German", "France", "French", "Italy", "Italian", "Spain", "Spanish",
-         "Canada", "Canadian", "Brazil", "Brazilian", "Mexico", "Mexican", "Turkey", "Turkish",
-         "United States", "US", "USA", "American", "Britain", "Korea", "Korean",
-         "North Korea", "South Korea", "Saudi", "Saudi Arabia", "Saudi Arabian",
-         "Iran", "Iranian", "Iraq", "Iraqi", "Israel", "Israeli", "Palestine", "Palestinian",
-         "Egypt", "Egyptian", "Pakistan", "Pakistani", "Indonesia", "Indonesian",
-         "Australia", "Australian", "New Zealand", "Nigeria", "Nigerian", "South Africa",
-         "Argentina", "Argentinian", "Colombia", "Colombian", "Venezuela", "Venezuelan",
-         "Ukraine", "Ukrainian", "Poland", "Polish", "Switzerland", "Swiss",
-         "Netherlands", "Dutch", "Belgium", "Belgian", "Sweden", "Swedish", "Norway", "Norwegian",
-
-         # International issues and topics
-         "war", "conflict", "crisis", "tension", "dispute", "hostility", "peace",
-         "peacekeeping", "ceasefire", "truce", "armistice", "treaty", "agreement",
-         "compromise", "negotiation", "mediation", "resolution", "settlement",
-         "refugee", "migrant", "asylum seeker", "displacement", "humanitarian",
-         "border", "frontier", "territory", "territorial", "sovereignty", "jurisdiction",
-         "terror", "terrorism", "extremism", "radicalism", "insurgency", "militant",
-         "sanction", "embargo", "restriction", "isolation", "blockade",
-
-         # International trade and economy
-         "trade", "import", "export", "tariff", "duty", "quota", "subsidy",
-         "protectionism", "free trade", "fair trade", "globalization", "trade war",
-         "trade agreement", "trade deal", "trade deficit", "trade surplus",
-         "supply chain", "outsourcing", "offshoring", "reshoring", "nearshoring",
-
-         # Diplomacy and international relations
-         "embassy", "consulate", "diplomatic", "diplomacy", "diplomat", "ambassador",
-         "consul", "attaché", "envoy", "emissary", "delegation", "mission",
-         "foreign policy", "international relations", "geopolitics", "geopolitical",
-         "influence", "power", "superpower", "hegemony", "alliance", "coalition",
-         "bloc", "axis", "sphere of influence", "buffer state", "proxy",
-
-         # International organizations
-         "UN", "United Nations", "EU", "European Union", "NATO", "NAFTA", "USMCA",
-         "ASEAN", "OPEC", "Commonwealth", "Arab League", "African Union", "AU",
-         "BRICS", "G7", "G20", "IMF", "World Bank", "WTO", "WHO", "UNESCO",
-         "Security Council", "General Assembly", "International Court of Justice",
-
-         # Travel and cultural exchange
-         "visa", "passport", "immigration", "emigration", "migration", "travel",
-         "tourism", "tourist", "visitor", "foreigner", "expatriate", "expat",
-         "citizenship", "nationality", "dual citizen", "naturalization",
-         "cultural", "tradition", "heritage", "indigenous", "native", "local",
-         "language", "dialect", "translation", "interpreter", "cross-cultural"
-     ],
-
-     "sports": [
-         # General sports terms
-         "game", "match", "tournament", "championship", "league", "cup", "Olympics",
-         "olympic", "world cup", "competition", "contest", "event", "series",
-         "sport", "sporting", "athletics", "physical", "play", "compete", "competition",
-         "amateur", "professional", "pro", "season", "preseason", "regular season",
-         "postseason", "playoff", "final", "semifinal", "quarterfinal", "qualifying",
-
-         # Team sports
-         "football", "soccer", "American football", "rugby", "basketball", "baseball",
-         "cricket", "hockey", "ice hockey", "field hockey", "volleyball", "handball",
-         "water polo", "lacrosse", "ultimate frisbee", "netball", "kabaddi",
-         "team", "club", "franchise", "squad", "roster", "lineup", "formation",
-         "player", "coach", "manager", "trainer", "captain", "starter", "substitute",
-         "bench", "draft", "trade", "free agent", "contract", "transfer", "loan",
-
-         # Individual sports
-         "tennis", "golf", "boxing", "wrestling", "martial arts", "MMA", "UFC",
-         "athletics", "track and field", "swimming", "diving", "gymnastics",
-         "skiing", "snowboarding", "skating", "figure skating", "speed skating",
-         "cycling", "mountain biking", "BMX", "motorsport", "F1", "Formula 1",
-         "NASCAR", "IndyCar", "MotoGP", "rally", "marathon", "triathlon", "decathlon",
-         "archery", "shooting", "fencing", "equestrian", "rowing", "canoeing", "kayaking",
-         "surfing", "skateboarding", "climbing", "bouldering", "weightlifting",
-
-         # Scoring and results
-         "score", "point", "goal", "touchdown", "basket", "run", "wicket", "try",
-         "win", "lose", "draw", "tie", "defeat", "victory", "champion", "winner",
-         "loser", "runner-up", "finalist", "semifinalist", "eliminated", "advance",
-         "qualify", "record", "personal best", "world record", "Olympic record",
-         "streak", "undefeated", "unbeaten", "perfect season", "comeback",
-
-         # Performance and training
-         "fitness", "training", "practice", "drill", "workout", "exercise", "regime",
-         "conditioning", "strength", "endurance", "speed", "agility", "flexibility",
-         "skill", "technique", "form", "style", "strategy", "tactic", "playbook",
-         "offense", "defense", "attack", "counter", "press", "formation",
-         "injury", "rehabilitation", "recovery", "physiotherapy", "sports medicine",
-
-         # Sports infrastructure
-         "stadium", "arena", "court", "field", "pitch", "rink", "pool", "track",
-         "course", "gymnasium", "gym", "complex", "venue", "facility", "locker room",
-         "dugout", "bench", "sideline", "grandstand", "spectator", "fan", "supporter",
-
-         # Sports organizations and competitions
-         "medal", "gold", "silver", "bronze", "podium", "Olympics", "Paralympic",
-         "commonwealth games", "Asian games", "Pan American games", "world championship",
-         "grand slam", "masters", "open", "invitational", "classic", "tour", "circuit",
-         "IPL", "Indian Premier League", "MLB", "Major League Baseball",
-         "NBA", "National Basketball Association", "NFL", "National Football League",
-         "NHL", "National Hockey League", "FIFA", "UEFA", "ATP", "WTA", "ICC",
-         "Premier League", "La Liga", "Bundesliga", "Serie A", "Ligue 1", "MLS",
-         "Champions League", "Europa League", "Super Bowl", "World Series", "Stanley Cup",
-         "NCAA", "collegiate", "college", "university", "varsity", "intramural",
-
-         # Sports media and business
-         "broadcast", "coverage", "commentator", "announcer", "pundit", "analyst",
-         "highlight", "replay", "sports network", "ESPN", "Sky Sports", "Fox Sports",
-         "sponsorship", "endorsement", "advertisement", "merchandise", "jersey", "kit",
-         "ticket", "season ticket", "box seat", "premium", "concession", "vendor",
-     ],
-
-     "entertainment": [
-         # Film and cinema
-         "movie", "film", "cinema", "feature", "short film", "documentary", "animation",
-         "blockbuster", "indie", "independent film", "foreign film", "box office",
-         "screening", "premiere", "release", "theatrical", "stream", "streaming",
-         "director", "producer", "screenwriter", "script", "screenplay", "adaptation",
-         "cinematography", "cinematographer", "editing", "editor", "visual effects",
-         "special effects", "CGI", "motion capture", "sound design", "soundtrack",
-         "score", "composer", "scene", "shot", "take", "cut", "sequel", "prequel",
-         "trilogy", "franchise", "universe", "reboot", "remake", "spin-off",
-         "genre", "action", "comedy", "drama", "thriller", "horror", "sci-fi",
-         "science fiction", "fantasy", "romance", "romantic comedy", "rom-com",
-         "mystery", "crime", "western", "historical", "biographical", "biopic",
-
-         # Television
-         "TV", "television", "show", "series", "episode", "season", "pilot",
-         "finale", "midseason", "sitcom", "drama series", "miniseries", "limited series",
-         "anthology", "reality TV", "game show", "talk show", "variety show",
-         "network", "cable", "premium cable", "broadcast", "channel", "program",
-         "primetime", "daytime", "syndication", "rerun", "renewed", "cancelled",
-         "showrunner", "creator", "writer", "TV writer", "episode writer", "staff writer",
-
-         # Performing arts
-         "actor", "actress", "performer", "cast", "casting", "star", "co-star",
-         "supporting", "lead", "protagonist", "antagonist", "villain", "hero", "anti-hero",
-         "character", "role", "performance", "portrayal", "acting", "dialogue",
-         "monologue", "line", "script", "improv", "improvisation", "stand-up",
-         "comedian", "comic", "sketch", "theater", "theatre", "stage", "Broadway",
-         "West End", "play", "musical", "opera", "ballet", "dance", "choreography",
-         "production", "rehearsal", "audition", "understudy", "troupe", "ensemble",
-
-         # Music
-         "music", "song", "track", "single", "album", "EP", "LP", "record",
-         "release", "drop", "artist", "musician", "singer", "vocalist", "band",
-         "group", "duo", "trio", "soloist", "frontman", "frontwoman", "lead singer",
-         "songwriter", "composer", "producer", "DJ", "rapper", "MC", "beatmaker",
-         "guitarist", "bassist", "drummer", "pianist", "keyboardist", "violinist",
-         "instrumentalist", "orchestra", "symphony", "philharmonic", "conductor",
-         "genre", "rock", "pop", "hip-hop", "rap", "R&B", "soul", "funk", "jazz",
-         "blues", "country", "folk", "electronic", "EDM", "dance", "techno", "house",
-         "metal", "punk", "alternative", "indie", "classical", "reggae", "latin",
-         "hit", "chart", "Billboard", "Grammy", "award-winning", "platinum", "gold",
-         "concert", "tour", "gig", "show", "performance", "live", "venue", "arena",
-         "stadium", "festival", "Coachella", "Glastonbury", "Lollapalooza", "Bonnaroo",
-
-         # Celebrity culture
-         "celebrity", "star", "fame", "famous", "A-list", "B-list", "icon", "iconic",
-         "superstar", "public figure", "household name", "stardom", "limelight",
-         "popular", "popularity", "fan", "fanbase", "followers", "stan", "groupie",
-         "paparazzi", "tabloid", "gossip", "rumor", "scandal", "controversy",
-         "interview", "press conference", "red carpet", "premiere", "gala", "award show",
-
-         # Awards and recognition
-         "award", "nominee", "nomination", "winner", "recipient", "honor", "accolade",
-         "Oscar", "Academy Award", "Emmy", "Grammy", "Tony", "Golden Globe", "BAFTA",
-         "MTV Award", "People's Choice", "Critics' Choice", "SAG Award", "Billboard Award",
-         "best actor", "best actress", "best director", "best picture", "best film",
-         "best album", "best song", "hall of fame", "lifetime achievement", "legacy",
-
-         # Media and publishing
-         "book", "novel", "fiction", "non-fiction", "memoir", "biography", "autobiography",
-         "bestseller", "bestselling", "author", "writer", "novelist", "literary",
-         "literature", "publisher", "publishing", "imprint", "edition", "volume",
-         "chapter", "page", "paragraph", "prose", "narrative", "plot", "storyline",
-         "character", "protagonist", "antagonist", "setting", "theme", "genre",
-         "mystery", "thriller", "romance", "sci-fi", "fantasy", "young adult", "YA",
-         "comic", "comic book", "graphic novel", "manga", "anime", "cartoon",
-
-         # Digital entertainment
-         "streaming", "stream", "subscription", "platform", "service", "content",
-         "Netflix", "Disney+", "Amazon Prime", "Hulu", "HBO", "HBO Max", "Apple TV+",
-         "Peacock", "Paramount+", "YouTube", "YouTube Premium", "TikTok", "Instagram",
-         "influencer", "content creator", "vlogger", "blogger", "podcaster", "podcast",
-         "episode", "download", "subscriber", "follower", "like", "share", "viral",
-         "trending", "binge-watch", "marathon", "spoiler", "recap", "review", "trailer",
-         "teaser", "behind the scenes", "BTS", "exclusive", "original"
-     ]
- }
-
- # Add domain-specific RSS feeds for different categories
- CATEGORY_SPECIFIC_FEEDS = {
-     "science": [
-         # "https://www.science.org/rss/news_feeds/carousel.xml",
-         "https://www.science.org/rss/news_current.xml",
-         "https://www.nature.com/nature.rss",
-         # "https://www.scientificamerican.com/rss/",
-         "http://rss.sciam.com/basic-science",
-         # "https://rss.sciam.com/ScientificAmerican-Global",
-         "http://rss.sciam.com/ScientificAmerican-Global",
-         # "https://feeds.newscientist.com/science-news",
-         "https://www.newscientist.com/feed/home/?cmpid=RSS|NSNS-Home",
-         "https://phys.org/rss-feed/"
-     ],
-     "technology": [
-         # "https://feed.wired.com/rss/category/business/feed.rss",
-         "https://www.wired.com/feed/category/business/latest/rss",
-         "https://techcrunch.com/feed/",
-         "https://www.technologyreview.com/feed/",
-         "https://arstechnica.com/feed/",
-         "https://www.theverge.com/rss/index.xml",
-         "https://news.ycombinator.com/rss"
-     ],
-     "politics": [
-         "https://feeds.washingtonpost.com/rss/politics",
-         "https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml",
-         "https://feeds.bbci.co.uk/news/politics/rss.xml",
-         "https://www.politico.com/rss/politicopicks.xml",
-         "https://www.realclearpolitics.com/index.xml"
-     ],
-     "business": [
-         "https://www.ft.com/rss/home",
-         "https://feeds.bloomberg.com/markets/news.rss",
-         # "https://www.forbes.com/business/feed/",
-         "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
-         "https://feeds.washingtonpost.com/rss/business",
-         "https://www.entrepreneur.com/latest.rss",
-         # "https://www.cnbc.com/id/10001147/device/rss/rss.htm",
-         "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10001147",
-         "https://feeds.content.dowjones.io/public/rss/WSJcomUSBusiness",
-         "https://feeds.a.dj.com/rss/RSSMarketsMain.xml"
-     ],
-     "world": [
-         "https://feeds.bbci.co.uk/news/world/rss.xml",
-         "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
-         "https://www.aljazeera.com/xml/rss/all.xml",
-         "https://feeds.washingtonpost.com/rss/world",
-         # "https://rss.cnn.com/rss/edition_world.rss"
-         "http://rss.cnn.com/rss/cnn_world.rss"
-     ],
-     "sports": [
-         "https://www.espn.com/espn/rss/news",
-         "https://www.cbssports.com/rss/headlines/",
-         # "https://feeds.skysports.com/feeds/rss/latest.xml",
-         "https://www.espncricinfo.com/rss/content/story/feeds/0.xml",
-         "https://api.foxsports.com/v1/rss",
-         "https://www.sportingnews.com/us/rss",
-         "https://www.theguardian.com/sport/rss",
-     ],
-     "entertainment": [
-         "https://www.hollywoodreporter.com/feed/",
-         "https://variety.com/feed/",
-         # "https://feeds.eonline.com/mrss/article/",
-         "https://www.eonline.com/syndication/feeds/rssfeeds/topstories.xml",
-         "https://www.rollingstone.com/feed/",
-         "https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml"
-     ],
-     "fact_checking": [
-         "https://www.snopes.com/feed/",
-         "https://www.politifact.com/rss/all/",
-         "https://www.factcheck.org/feed/",
-         "https://leadstories.com/atom.xml",
-         # "https://apnews.com/hub/fact-check/rss",
-         # "https://apnews.com/apf-fact-check"
-         "https://fullfact.org/feed/all/",
-         "https://www.truthorfiction.com/feed/"
-     ]
- }
-
- # Reliability boosts for sources by category
- SOURCE_RELIABILITY_BY_CATEGORY = {
-     "science": {
-         "nature.com": 0.95,
-         "science.org": 0.95,
-         "nih.gov": 0.95,
-         "nasa.gov": 0.95,
-         "scientificamerican.com": 0.9,
-         "newscientist.com": 0.9,
-         "pnas.org": 0.95,
-         "cell.com": 0.95,
-         "sciencedirect.com": 0.9,
-         "plos.org": 0.9,
-         "arxiv.org": 0.85
-     },
-     "technology": {
-         "wired.com": 0.9,
-         "techcrunch.com": 0.85,
-         "arstechnica.com": 0.9,
-         "technologyreview.com": 0.9,
-         "theverge.com": 0.85,
-         "cnet.com": 0.85,
-         "engadget.com": 0.85
-     },
-     "fact_checking": {
-         "snopes.com": 0.95,
-         "politifact.com": 0.9,
-         "factcheck.org": 0.9,
-         "apnews.com/hub/fact-check": 0.95,
-         "reuters.com/fact-check": 0.95
-     }
- }
-
- def detect_claim_category(claim: str) -> Tuple[str, float]:
-     """
-     Detect the most likely category of a claim and its confidence score
-
-     Args:
-         claim (str): The claim text
-
-     Returns:
-         tuple: (category_name, confidence_score)
-     """
-     if not claim:
-         return "general", 0.3
-
-     # Lowercase for better matching
-     claim_lower = claim.lower()
-
-     # Count matches for each category
-     category_scores = {}
-
-     for category, keywords in CLAIM_CATEGORIES.items():
-         # Count how many keywords from this category appear in the claim
-         matches = sum(1 for keyword in keywords if keyword.lower() in claim_lower)
-
-         # Calculate a simple score based on matches
-         if matches > 0:
-             # Calculate a more significant score based on number of matches
-             score = min(0.9, 0.3 + (matches * 0.1))  # Base 0.3 + 0.1 per match, max 0.9
-             category_scores[category] = score
-
-     # Find category with highest score
-     if not category_scores:
-         return "general", 0.3
-
-     top_category = max(category_scores.items(), key=lambda x: x[1])
-     category_name, confidence = top_category
-
-     # If the top score is too low, return general
-     if confidence < 0.3:
-         return "general", 0.3
-
-     return category_name, confidence
-
- def get_topic_specific_sources(claim: str, existing_sources: Dict) -> Dict:
-     """
-     Enrich existing sources dict with topic-specific sources
-
-     Args:
-         claim (str): The claim text
-         existing_sources (dict): Current sources configuration
-
-     Returns:
-         dict: Updated sources with topic-specific priorities
-     """
-     # Detect claim category
-     category, confidence = detect_claim_category(claim)
-     logger.info(f"Claim category detected: {category} (confidence: {confidence:.2f})")
-
-     # If confidence is low, keep existing sources
-     if confidence < 0.4:
-         return existing_sources
-
-     # Get specific feeds for the category
-     category_feeds = CATEGORY_SPECIFIC_FEEDS.get(category, [])
-
-     # Only proceed if we have category-specific feeds
-     if not category_feeds:
-         return existing_sources
-
-     # Create a new sources dictionary with category-specific modifications
-     updated_sources = existing_sources.copy()
-
-     # If the category has dedicated feeds, add them to the list
-     # and prioritize them by putting them first in RSS feeds
-     if category in CATEGORY_SPECIFIC_FEEDS:
-         # Add up to 5 category-specific RSS feeds (if we have them)
-         category_feeds_sample = category_feeds[:min(5, len(category_feeds))]
-
-         # Add or update source reliability data
-         if category in SOURCE_RELIABILITY_BY_CATEGORY:
-             for domain, reliability in SOURCE_RELIABILITY_BY_CATEGORY[category].items():
-                 updated_sources["source_credibility"] = updated_sources.get("source_credibility", {})
-                 updated_sources["source_credibility"][domain] = reliability
-
-         # Return updated sources with prioritized feeds
-         return {
-             "category": category,
-             "confidence": confidence,
-             "rss_feeds": category_feeds_sample + (updated_sources.get("rss_feeds", []) or []),
-             "source_credibility": updated_sources.get("source_credibility", {})
-         }
-
-     return existing_sources
-
- def get_prioritized_sources(claim: str, claim_category: Optional[str] = None) -> Dict[str, List[str]]:
-     """
-     Get prioritized sources for a claim based on its category
-
-     Args:
-         claim (str): The claim to check
-         claim_category (str, optional): Override detected category
-
-     Returns:
-         dict: Dictionary with source types prioritized by relevance
-     """
-     # Detect category if not provided
-     if not claim_category:
-         category, confidence = detect_claim_category(claim)
-     else:
-         category = claim_category
-         confidence = 0.8  # Assume high confidence if category is explicitly provided
-
-     # Log detected category
-     logger.info(f"Using claim category: {category} for source prioritization")
-
-     # Default priorities
-     priorities = {
-         "primary": ["wikipedia", "news", "claimreview"],
-         "secondary": ["rss", "scholarly", "wikidata"]
-     }
-
-     # Needs recent evidence check (existing logic)
-     temporal_terms = ["is", "are", "remains", "continues", "still", "currently",
-                       "now", "today", "recent", "latest"]
-     negation_terms = ["not", "no longer", "isn't", "aren't", "doesn't", "don't",
-                       "can't", "cannot", "anymore"]
-
-     requires_recent = any(term in claim.lower() for term in temporal_terms) or \
-                       any(term in claim.lower() for term in negation_terms)
-
-     # Adjust priorities based on category
-     if category == "science":
-         if requires_recent:
-             priorities = {
-                 "primary": ["scholarly", "rss", "wikipedia"],
-                 "secondary": ["news", "claimreview", "wikidata"]
-             }
-         else:
-             priorities = {
-                 "primary": ["scholarly", "wikipedia", "rss"],
-                 "secondary": ["claimreview", "news", "wikidata"]
-             }
-
-     elif category == "technology":
-         if requires_recent:
-             priorities = {
-                 "primary": ["rss", "news", "scholarly"],
-                 "secondary": ["wikipedia", "claimreview", "wikidata"]
-             }
-         else:
-             priorities = {
-                 "primary": ["news", "scholarly", "wikipedia"],
-                 "secondary": ["rss", "claimreview", "wikidata"]
-             }
-
-     elif category == "politics":
-         if requires_recent:
-             priorities = {
-                 "primary": ["rss", "news", "claimreview"],
-                 "secondary": ["wikipedia", "wikidata", "scholarly"]
-             }
-         else:
-             priorities = {
-                 "primary": ["claimreview", "news", "wikipedia"],
-                 "secondary": ["rss", "wikidata", "scholarly"]
-             }
-
-     elif category == "business" or category == "world":
-         if requires_recent:
-             priorities = {
-                 "primary": ["rss", "news", "wikipedia"],
-                 "secondary": ["claimreview", "wikidata", "scholarly"]
-             }
-         else:
-             priorities = {
-                 "primary": ["news", "wikipedia", "rss"],
-                 "secondary": ["claimreview", "wikidata", "scholarly"]
-             }
-
-     elif category == "sports":
-         if requires_recent:
-             priorities = {
-                 "primary": ["rss", "news", "wikipedia"],
-                 "secondary": ["wikidata", "claimreview", "scholarly"]
-             }
-         else:
-             priorities = {
-                 "primary": ["wikipedia", "news", "rss"],
-                 "secondary": ["wikidata", "claimreview", "scholarly"]
-             }
-
-     elif category == "entertainment":
-         if requires_recent:
-             priorities = {
-                 "primary": ["rss", "news", "claimreview"],
-                 "secondary": ["wikipedia", "wikidata", "scholarly"]
-             }
-         else:
-             priorities = {
-                 "primary": ["news", "wikipedia", "claimreview"],
-                 "secondary": ["rss", "wikidata", "scholarly"]
-             }
-
-     # Add category and confidence for reference
-     priorities["category"] = category
-     priorities["confidence"] = confidence
-     priorities["requires_recent"] = requires_recent
-
-     return priorities
-
- def get_category_specific_rss_feeds(category: str, max_feeds: int = 5) -> List[str]:
-     """
-     Get a list of RSS feeds specific to a category
-
-     Args:
-         category (str): The claim category
-         max_feeds (int): Maximum number of feeds to return
-
-     Returns:
-         list: List of RSS feed URLs
-     """
-     # Get category-specific feeds
-     category_feeds = CATEGORY_SPECIFIC_FEEDS.get(category, [])
-
-     # Limit to max_feeds
-     return category_feeds[:min(max_feeds, len(category_feeds))]
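
For reference, a minimal sketch of how these helpers compose, based only on the functions above (the module path `modules.category_detection` is assumed; the example claim and values are illustrative, with the confidence computed from the scoring rule shown above):

# Sketch: category detection driving source prioritization (illustrative).
from modules.category_detection import (
    detect_claim_category,
    get_prioritized_sources,
    get_category_specific_rss_feeds,
)

claim = "NASA discovered a new exoplanet this week."
category, confidence = detect_claim_category(claim)   # e.g. ("science", 0.7)
priorities = get_prioritized_sources(claim, claim_category=category)
feeds = get_category_specific_rss_feeds(category, max_feeds=3)
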
modules/claim_extraction.py DELETED
@@ -1,236 +0,0 @@
1
- import logging
2
- import time
3
- import re
4
- from langdetect import detect
5
- import spacy
6
-
7
- from utils.performance import PerformanceTracker
8
- from utils.models import get_nlp_model, get_llm_model
9
-
10
- logger = logging.getLogger("misinformation_detector")
11
-
12
- performance_tracker = PerformanceTracker()
13
-
14
- def extract_claims(text):
15
- """
16
- Extract the main factual claim from the provided text.
17
- For concise claims (<20 words), preserves them exactly.
18
- For longer text, uses OpenAI to extract the claim.
19
- """
20
- logger.info(f"Extracting claims from: {text}")
21
- start_time = time.time()
22
-
23
- # First, check if the input already appears to be a concise claim
24
- if len(text.split()) < 20:
25
- logger.info("Input appears to be a concise claim already, preserving as-is")
26
- performance_tracker.log_processing_time(start_time)
27
- performance_tracker.log_claim_processed()
28
- return text
29
-
30
- try:
31
- # For longer text, use OpenAI for extraction
32
- extracted_claim = extract_with_openai(text)
33
-
34
- # Log processing time
35
- performance_tracker.log_processing_time(start_time)
36
- performance_tracker.log_claim_processed()
37
-
38
- logger.info(f"Extracted claim: {extracted_claim}")
39
- return extracted_claim
40
- except Exception as e:
41
- logger.error(f"Error extracting claims: {str(e)}")
42
- # Fallback to original text on error
43
- return text
44
-
45
- def extract_with_openai(text):
46
- """
47
- Use OpenAI model for claim extraction
48
- """
49
- try:
50
- # Get LLM model
51
- llm_model = get_llm_model()
52
-
53
- # Create a very explicit prompt to avoid hallucination
54
- prompt = f"""
55
- Extract the main factual claim from the following text.
56
- DO NOT add any information not present in the original text.
57
- DO NOT add locations, dates, or other details.
58
- ONLY extract what is explicitly stated.
59
-
60
- Text: {text}
61
-
62
- Main factual claim:
63
- """
64
-
65
- # Call OpenAI with temperature=0 for deterministic output
66
- response = llm_model.invoke(prompt, temperature=0)
67
- extracted_claim = response.content.strip()
68
-
69
- # Further clean up any explanations or extra text
70
- if ":" in extracted_claim:
71
- parts = extracted_claim.split(":")
72
- if len(parts) > 1:
73
- extracted_claim = parts[-1].strip()
74
-
75
- logger.info(f"OpenAI extraction: {extracted_claim}")
76
-
77
- # Validate that we're not adding info not in the original
78
- nlp = get_nlp_model()
79
- extracted_claim = validate_extraction(text, extracted_claim, nlp)
80
-
81
- return extracted_claim
82
- except Exception as e:
83
- logger.error(f"Error in OpenAI claim extraction: {str(e)}")
84
- return text # Fallback to original
85
-
86
- def validate_extraction(original_text, extracted_claim, nlp):
87
- """
88
- Validate that the extracted claim doesn't add information not present in the original text
89
- """
90
- # If extraction fails or is empty, return original
91
- if not extracted_claim or extracted_claim.strip() == "":
92
- logger.warning("Empty extraction result, using original text")
93
- return original_text
94
-
95
- # Check for added location information
96
- location_terms = ["united states", "america", "u.s.", "usa", "china", "india", "europe",
97
- "russia", "japan", "uk", "germany", "france", "australia"]
98
- for term in location_terms:
99
- if term in extracted_claim.lower() and term not in original_text.lower():
100
- logger.warning(f"Extraction added location '{term}' not in original, using original text")
101
- return original_text
102
-
103
- # Check for entity preservation/addition using spaCy
104
- try:
105
- # Get entities from extracted text
106
- extracted_doc = nlp(extracted_claim)
107
- extracted_entities = [ent.text.lower() for ent in extracted_doc.ents]
108
-
109
- # Get entities from original text
110
- original_doc = nlp(original_text)
111
- original_entities = [ent.text.lower() for ent in original_doc.ents]
112
-
113
- # Check for new entities that don't exist in original
114
- for entity in extracted_entities:
115
- if not any(entity in orig_entity or orig_entity in entity for orig_entity in original_entities):
116
- logger.warning(f"Extraction added new entity '{entity}', using original text")
117
- return original_text
118
-
119
- return extracted_claim
120
- except Exception as e:
121
- logger.error(f"Error in extraction validation: {str(e)}")
122
- return original_text # On error, safer to return original
123
-
124
- def shorten_claim_for_evidence(claim):
125
- """
126
- Shorten a claim to use for evidence retrieval by preserving important keywords
127
-     while maintaining claim context
-     """
-     try:
-         # Get NLP model
-         nlp = get_nlp_model()
-
-         # Use NER to extract key entities
-         doc = nlp(claim)
-
-         # Extract all entities for search
-         entities = [ent.text for ent in doc.ents]
-
-         # Extract key proper nouns, entities, and important context words
-         important_words = []
-
-         # Add all named entities
-         for ent in doc.ents:
-             important_words.append(ent.text)
-
-         # Add important nouns and adjectives not already added
-         for token in doc:
-             if token.pos_ in ["NOUN", "PROPN"] and token.text not in important_words:
-                 important_words.append(token.text)
-
-         # Make sure we include key terms like "prime minister", "president", etc.
-         title_terms = ["president", "prime minister", "minister", "chancellor", "premier", "governor", "mayor", "senator"]
-         for term in title_terms:
-             if term in claim.lower() and not any(term in word.lower() for word in important_words):
-                 # Find the full phrase (e.g., "Canadian Prime Minister")
-                 matches = re.finditer(r'(?i)(?:\w+\s+)*\b' + re.escape(term) + r'\b(?:\s+\w+)*', claim)
-                 for match in matches:
-                     phrase = match.group(0)
-                     if phrase not in important_words:
-                         important_words.append(phrase)
-
-         # Add country names or important place references
-         country_terms = ["canada", "canadian", "us", "united states", "american", "uk", "british", "australia", "china", "russian"]
-         for term in country_terms:
-             if term in claim.lower() and not any(term in word.lower() for word in important_words):
-                 for token in doc:
-                     if token.text.lower() == term and token.text not in important_words:
-                         important_words.append(token.text)
-
-         # Always include negation words as they're critical for meaning
-         negation_terms = ["not", "no longer", "former", "ex-", "isn't", "aren't", "doesn't", "don't"]
-         negation_found = False
-         for term in negation_terms:
-             if term in claim.lower():
-                 # Find the context around the negation (3 words before and after)
-                 matches = re.finditer(r'(?i)(?:\w+\s+){0,3}\b' + re.escape(term) + r'\b(?:\s+\w+){0,3}', claim)
-                 for match in matches:
-                     phrase = match.group(0)
-                     if phrase not in important_words:
-                         important_words.append(phrase)
-                         negation_found = True
-
-         # Special handling for time-sensitive claims with negations
-         is_time_sensitive = any(term in claim.lower() for term in ["anymore", "still", "currently", "now", "today", "recent"])
-
-         # If we have both negation and time sensitivity, ensure we keep those key aspects
-         if negation_found and is_time_sensitive:
-             # Ensure we keep time-sensitive terms
-             time_terms = ["anymore", "still", "currently", "now", "today", "recent"]
-             for term in time_terms:
-                 if term in claim.lower() and not any(term in word.lower() for word in important_words):
-                     # Add the context around the time term
-                     matches = re.finditer(r'(?i)(?:\w+\s+){0,2}\b' + re.escape(term) + r'\b(?:\s+\w+){0,2}', claim)
-                     for match in matches:
-                         phrase = match.group(0)
-                         if phrase not in important_words:
-                             important_words.append(phrase)
-
-         # If entities plus titles don't give us enough, include key parts of claim
-         if len(entities) < 2 and not any("minister" in word.lower() for word in important_words):
-             words = claim.split()
-             # Use first 8 words
-             return " ".join(words[:min(8, len(words))])
-
-         # Remove duplicates while preserving order
-         seen = set()
-         unique_terms = []
-         for word in important_words:
-             if word.lower() not in seen:
-                 seen.add(word.lower())
-                 unique_terms.append(word)
-
-         # Ensure we have a reasonable number of search terms (maintain more for complex claims)
-         search_terms = unique_terms[:min(6, len(unique_terms))]
-
-         # Sort search terms to try to maintain original word order from claim
-         def get_position(term):
-             return claim.lower().find(term.lower())
-
-         search_terms.sort(key=get_position)
-
-         # Join terms to create search query
-         shortened_claim = " ".join(search_terms)
-
-         # If the shortened claim is too short compared to original, use more of original
-         if len(shortened_claim.split()) < 3 and len(claim.split()) > 5:
-             words = claim.split()
-             shortened_claim = " ".join(words[:min(8, len(words))])
-
-         logger.info(f"Shortened Claim: {shortened_claim}")
-
-         return shortened_claim
-     except Exception as e:
-         logger.error(f"Error in shortening claim: {str(e)}")
-         # Return original claim on error
-         return claim
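For reference, a minimal sketch of how this helper was typically invoked (the claim text is hypothetical, and it assumes the spaCy model behind get_nlp_model() is installed):

    from modules.claim_extraction import shorten_claim_for_evidence

    claim = "The Canadian Prime Minister is not attending the G7 summit anymore"
    query = shorten_claim_for_evidence(claim)
    # Expect a compact keyword query that keeps entities, titles, and negation,
    # e.g. roughly "Canadian Prime Minister not attending G7 summit anymore"
    print(query)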
modules/classification.py DELETED
@@ -1,521 +0,0 @@
- import logging
- import re
- from utils.models import get_llm_model
- from utils.performance import PerformanceTracker
-
- logger = logging.getLogger("misinformation_detector")
-
- performance_tracker = PerformanceTracker()
-
- def classify_with_llm(claim, evidence):
-     """
-     Optimized classification function that handles evidence classification
-     and verdict generation in a single LLM call with robust parsing
-     """
-     logger.info(f"Classifying evidence for claim: {claim}")
-
-     # Get the LLM model
-     llm_model = get_llm_model()
-
-     # Skip if no evidence
-     if not evidence:
-         logger.warning("No evidence provided for classification")
-         return []
-
-     # Normalize evidence to a list
-     if not isinstance(evidence, list):
-         if evidence:
-             try:
-                 evidence = [evidence]
-             except Exception as e:
-                 logger.error(f"Could not convert evidence to list: {e}")
-                 return []
-         else:
-             return []
-
-     # Does the claim contain strong assertions that require specific evidence?
-     strong_assertion_markers = [
-         "solved", "cured", "discovered", "confirmed", "proven", "definitive",
-         "breakthrough", "revolutionary", "successfully", "first ever", "extends",
-         "conclusive", "unprecedented", "remarkable", "definitively"
-     ]
-
-     # Check if the claim contains strong assertions that would require specific supporting evidence
-     contains_strong_assertions = any(marker in claim.lower() for marker in strong_assertion_markers)
-
-     # Limit to top 5 evidence items to reduce token usage
-     evidence = evidence[:5]
-
-     try:
-         # Format evidence items
-         evidence_text = ""
-         for idx, chunk in enumerate(evidence):
-             # Truncate long evidence
-             chunk_text = str(chunk)
-             if len(chunk_text) > 300:
-                 chunk_text = chunk_text[:297] + "..."
-
-             evidence_text += f"EVIDENCE {idx+1}:\n{chunk_text}\n\n"
-
-         # Create a structured prompt with explicit formatting instructions
-         # Adjust instructions based on claim characteristics
-         if contains_strong_assertions:
-             prompt = f"""
- CLAIM: {claim}
-
- EVIDENCE:
- {evidence_text}
-
- TASK: Evaluate if the evidence supports, contradicts, or is irrelevant to the claim.
-
- IMPORTANT CONTEXT: This claim makes strong assertions that require specific supporting evidence.
-
- When evaluating such claims:
- 1. Strong assertions require strong, direct evidence - look for specific confirmation from credible sources
- 2. General information about the topic is not sufficient to support specific assertions
- 3. Evidence of ongoing work or research is not sufficient to support claims of completion or success
- 4. If the evidence doesn't directly confirm the specific assertion, classify it as "insufficient" rather than "support"
-
- INSTRUCTIONS:
- 1. For each evidence, provide your analysis in EXACTLY this format:
-
- EVIDENCE 1 ANALYSIS:
- Relevance: [relevant/irrelevant]
- Classification: [support/contradict/insufficient/irrelevant]
- Confidence: [number between 0-100]
- Reason: [brief explanation focusing on whether evidence directly confirms the specific assertion]
-
- 2. After analyzing all evidence pieces, provide a final verdict in this format:
-
- FINAL VERDICT: [clear statement if evidence collectively supports or contradicts the claim]
-
- Without specific, direct supporting evidence, default to "The evidence does not support the claim" rather than "insufficient evidence."
-
- CRITICAL INSTRUCTION: FOCUS ON THE EXACT CLAIM. Evaluate ONLY the specific claim, not related topics
- """
-         else:
-             prompt = f"""
- CLAIM: {claim}
-
- EVIDENCE:
- {evidence_text}
-
- TASK: Evaluate if the evidence supports, contradicts, or is irrelevant to the claim.
-
- INSTRUCTIONS:
- 1. For each evidence, provide your analysis in EXACTLY this format:
-
- EVIDENCE 1 ANALYSIS:
- Relevance: [relevant/irrelevant]
- Classification: [support/contradict/insufficient/irrelevant]
- Confidence: [number between 0-100]
- Reason: [brief explanation]
-
- 2. After analyzing all evidence pieces, provide a final verdict in this format:
-
- FINAL VERDICT: [clear statement if evidence collectively supports or contradicts the claim]
-
- CRITICAL INSTRUCTION: FOCUS ON THE EXACT CLAIM. Evaluate ONLY the specific claim, not related topics
- """
-
-         # Get response with temperature=0 for consistency
-         result = llm_model.invoke(prompt, temperature=0)
-         result_text = result.content.strip()
-
-         # Extract final verdict first since it's most important
-         final_verdict = None
-         final_match = re.search(r'FINAL VERDICT:\s*(.*?)(?=\s*$|\n\n)', result_text, re.DOTALL | re.IGNORECASE)
-         if final_match:
-             final_verdict = final_match.group(1).strip()
-             logger.info(f"Final assessment: {final_verdict}")
-
-         # Define a precise regex pattern matching the requested format
-         analysis_pattern = r'EVIDENCE\s+(\d+)\s+ANALYSIS:\s*\n+Relevance:\s*(relevant|irrelevant)\s*\n+Classification:\s*(support|contradict|neutral|irrelevant|insufficient)\s*\n+Confidence:\s*(\d+)\s*\n+Reason:\s*(.*?)(?=\s*EVIDENCE\s+\d+\s+ANALYSIS:|\s*FINAL VERDICT:|\s*$)'
-
-         # Parse each evidence analysis
-         classification_results = []
-         matched_evidence = set()
-
-         # Try matching with our strict pattern first
-         matches = list(re.finditer(analysis_pattern, result_text, re.IGNORECASE | re.DOTALL))
-
-         # If no matches, try a more flexible pattern
-         if not matches:
-             flexible_pattern = r'(?:EVIDENCE|Evidence)\s+(\d+)(?:\s+ANALYSIS)?:?\s*\n+(?:Relevance|relevance):\s*(relevant|irrelevant|unknown)\s*\n+(?:Classification|classification):\s*(support|contradict|neutral|irrelevant|insufficient|unknown)\s*\n+(?:Confidence|confidence):\s*(\d+)\s*\n+(?:Reason|reason|Brief reason):\s*(.*?)(?=\s*(?:EVIDENCE|Evidence)\s+\d+|FINAL VERDICT:|$)'
-             matches = list(re.finditer(flexible_pattern, result_text, re.IGNORECASE | re.DOTALL))
-
-         # Process matches
-         for match in matches:
-             try:
-                 evidence_idx = int(match.group(1)) - 1
-                 relevance = match.group(2).lower()
-                 classification = match.group(3).lower()
-                 confidence = int(match.group(4))
-                 reason = match.group(5).strip()
-
-                 # Normalize classification terms
-                 if classification == "neutral":
-                     classification = "insufficient"
-
-                 # For strong assertions, apply confidence adjustments based on classification
-                 if contains_strong_assertions:
-                     if classification == "support":
-                         # Check if the reasoning indicates direct or indirect support
-                         indirect_support_markers = ["general", "doesn't directly", "does not directly",
-                                                     "doesn't specifically", "not specific", "related to",
-                                                     "doesn't confirm"]
-                         if any(marker in reason.lower() for marker in indirect_support_markers):
-                             # Downgrade support confidence for indirect evidence
-                             confidence = max(5, confidence - 20)
-                     elif classification == "contradict":
-                         # For contradictions of strong assertions, slightly boost confidence
-                         confidence = min(95, confidence + 5)
-
-                 # Ensure index is valid
-                 if 0 <= evidence_idx < len(evidence):
-                     matched_evidence.add(evidence_idx)
-
-                     # Create result entry
-                     classification_results.append({
-                         "label": classification,
-                         "confidence": confidence / 100.0,
-                         "evidence": evidence[evidence_idx],
-                         "relevance": relevance,
-                         "reason": reason,
-                         "final_assessment": final_verdict
-                     })
-             except (ValueError, IndexError) as e:
-                 logger.error(f"Error parsing evidence analysis: {e}")
-
-         # Handle any unmatched evidence items
-         if matches:  # Only add defaults if we successfully matched some
-             for idx, ev in enumerate(evidence):
-                 if idx not in matched_evidence:
-                     # Check if the evidence text itself suggests a classification
-                     contains_support = bool(re.search(r'support|confirm|verify|true|correct|released', final_verdict or "", re.IGNORECASE))
-                     contains_contradicting = bool(re.search(r'not yet|hasn\'t|have not|doesn\'t|don\'t|cannot|preliminary|proposed', str(ev).lower()))
-
-                     # For claims with strong assertions without explicit evidence, be more cautious
-                     if contains_strong_assertions:
-                         if contains_contradicting:
-                             label = "contradict"
-                             confidence = 0.6
-                         elif contains_support:
-                             label = "insufficient"  # Default to insufficient for strong assertions without clear analysis
-                             confidence = 0.5
-                         else:
-                             label = "insufficient"
-                             confidence = 0.5
-                     else:
-                         label = "support" if contains_support else "unknown"
-                         confidence = 0.7 if contains_support else 0.5
-
-                     classification_results.append({
-                         "label": label,
-                         "confidence": confidence,
-                         "evidence": ev,
-                         "relevance": "relevant" if (contains_support or contains_contradicting) else "unknown",
-                         "reason": "Based on overall assessment",
-                         "final_assessment": final_verdict
-                     })
-         else:
-             # No structured parsing worked, use final verdict to create simple results
-             contains_support = bool(re.search(r'support|confirm|verify|true|correct|released', final_verdict or "", re.IGNORECASE))
-             contains_contradict = bool(re.search(r'contradict|against|false|incorrect|not support|does not support|insufficient evidence|does not confirm|no evidence', final_verdict or "", re.IGNORECASE))
-             contains_insufficient = bool(re.search(r'insufficient|not enough|cannot determine|no evidence|lack of evidence', final_verdict or "", re.IGNORECASE))
-
-             # For claims with strong assertions, be more stringent
-             if contains_strong_assertions:
-                 if contains_support and not contains_insufficient and not contains_contradict:
-                     label = "support"
-                     confidence = 0.6  # Lower confidence even for support of strong assertions
-                 elif contains_contradict:
-                     label = "contradict"
-                     confidence = 0.8  # Higher confidence for contradiction of strong assertions
-                 else:
-                     label = "insufficient"
-                     confidence = 0.7  # Good confidence for insufficient judgment
-             else:
-                 label = "support" if contains_support else "contradict" if contains_contradict else "unknown"
-                 confidence = 0.7 if (contains_support or contains_contradict) else 0.5
-
-             # Create basic results based on final verdict
-             for ev in evidence:
-                 classification_results.append({
-                     "label": label,
-                     "confidence": confidence,
-                     "evidence": ev,
-                     "relevance": "relevant" if (contains_support or contains_contradict) else "unknown",
-                     "reason": final_verdict or "Based on collective evidence",
-                     "final_assessment": final_verdict
-                 })
-
-         logger.info(f"Classified {len(classification_results)} evidence items")
-         return classification_results
-
-     except Exception as e:
-         logger.error(f"Error in evidence classification: {str(e)}")
-         # Provide a basic fallback that checks for keywords in evidence
-         try:
-             fallback_results = []
-             for ev in evidence:
-                 ev_text = str(ev).lower()
-                 supports = False
-                 contradicts = False
-
-                 # Basic keyword checking as last resort
-                 if claim.lower() in ev_text:
-                     keywords = [word for word in claim.lower().split() if len(word) > 3]
-                     matching_keywords = [k for k in keywords if k in ev_text]
-
-                     # If substantial keywords match, consider it support
-                     supports = len(matching_keywords) >= max(1, len(keywords) // 2)
-
-                 # Check for contradiction terms
-                 contradiction_terms = ["not yet", "hasn't", "haven't", "cannot", "can't",
-                                        "doesn't", "don't", "no evidence", "insufficient",
-                                        "preliminary", "proposed", "in development", "future"]
-                 contradicts = any(term in ev_text for term in contradiction_terms)
-
-                 # For claims with strong assertions, be more conservative in the fallback case
-                 if contains_strong_assertions:
-                     if contradicts:
-                         fallback_results.append({
-                             "label": "contradict",
-                             "confidence": 0.6,
-                             "evidence": ev,
-                             "relevance": "relevant",
-                             "reason": "Evidence suggests the claim is not yet proven (fallback method)"
-                         })
-                     elif supports:
-                         fallback_results.append({
-                             "label": "insufficient",
-                             "confidence": 0.6,
-                             "evidence": ev,
-                             "relevance": "relevant",
-                             "reason": "Evidence is related but doesn't conclusively confirm the assertion (fallback method)"
-                         })
-                     else:
-                         fallback_results.append({
-                             "label": "unknown",
-                             "confidence": 0.5,
-                             "evidence": ev,
-                             "relevance": "unknown",
-                             "reason": "Cannot determine relevance (fallback method)"
-                         })
-                 else:
-                     fallback_results.append({
-                         "label": "support" if supports else "unknown",
-                         "confidence": 0.6 if supports else 0.5,
-                         "evidence": ev,
-                         "relevance": "relevant" if supports else "unknown",
-                         "reason": "Based on keyword matching (fallback method)"
-                     })
-
-             return fallback_results
-         except:
-             # Absolute last resort
-             return [{"label": "unknown", "confidence": 0.5, "evidence": ev} for ev in evidence]
-
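A minimal usage sketch for the classifier above (the evidence strings are hypothetical, and it assumes an LLM is configured behind get_llm_model() exposing a LangChain-style invoke()):

    from modules.classification import classify_with_llm

    claim = "Meta released Llama 4"
    evidence = [
        "Title: Meta announces new model, Source: Example News, Content: ...",
        "Entity: LLaMA, Description: family of large language models by Meta AI",
    ]
    for item in classify_with_llm(claim, evidence):
        print(item["label"], item["confidence"], item.get("reason", ""))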
- def aggregate_evidence(classification_results):
-     """
-     Aggregate evidence classifications to determine overall verdict
-     with robust fallback mechanisms for reliable results
-     """
-     logger.info(f"Aggregating evidence from {len(classification_results) if classification_results else 0} results")
-
-     if not classification_results:
-         logger.warning("No classification results to aggregate")
-         return "Uncertain", 0.3  # Default with low confidence
-
-     # Assess the claim's characteristics (without relying on explicit category detection)
-     # Does the claim contain strong assertions that require specific evidence?
-     strong_assertion_markers = [
-         "solved", "cured", "discovered", "confirmed", "proven", "definitive",
-         "breakthrough", "revolutionary", "successfully", "first ever", "extends",
-         "conclusive", "unprecedented", "remarkable", "definitively"
-     ]
-
-     # Check if claim text is available in final assessment
-     claim_text = None
-     claim_has_strong_assertions = False
-
-     # Extract claim from final assessment if available
-     for item in classification_results:
-         if "final_assessment" in item and item["final_assessment"]:
-             match = re.search(r'the claim (?:that )?"?([^"]+)"?', item["final_assessment"], re.IGNORECASE)
-             if match:
-                 claim_text = match.group(1)
-                 claim_has_strong_assertions = any(marker in claim_text.lower() for marker in strong_assertion_markers)
-                 break
-
-     # If we couldn't extract the claim, check evidence context for assertion indicators
-     if not claim_text:
-         # Check if evidence reasons suggest dealing with strong assertions
-         assertion_context_indicators = ["conclusive evidence", "definitive proof", "solved", "breakthrough",
-                                         "revolutionary", "directly confirms", "specific confirmation"]
-
-         reasons = [item.get("reason", "").lower() for item in classification_results if "reason" in item]
-         assertion_indicators_count = sum(1 for indicator in assertion_context_indicators
-                                          for reason in reasons if indicator in reason)
-
-         claim_has_strong_assertions = assertion_indicators_count >= 2
-
-     # Extract final assessment if present
-     final_assessment = None
-     for item in classification_results:
-         if "final_assessment" in item and item["final_assessment"]:
-             final_assessment = item["final_assessment"]
-             break
-
-     # Count evidence by classification
-     support_items = [item for item in classification_results if item.get("label") == "support"]
-     contradict_items = [item for item in classification_results if item.get("label") == "contradict"]
-     insufficient_items = [item for item in classification_results if item.get("label") in ["insufficient", "neutral"]]
-     relevant_items = [item for item in classification_results
-                       if item.get("relevance") == "relevant" or item.get("label") in ["support", "contradict"]]
-
-     # Calculate the proportion of supported evidence
-     total_relevant = len(relevant_items)
-
-     # Direct keyword detection from final assessment or evidence
-     if final_assessment:
-         # Check for support indicators in final assessment
-         supports_pattern = r'\b(support|confirm|verify|true|correct|released|proves|validates|evidence (?:that |for |of )(?:the claim|it) is true)\b'
-         contradicts_pattern = r'\b(contradict|refute|deny|false|incorrect|not released|doesn\'t support|does not support|no evidence|cannot support|is not true|evidence (?:that |for |of )(?:the claim|it) is false)\b'
-         insufficient_pattern = r'\b(uncertain|insufficient|not enough|inconclusive|cannot determine|unable to determine|lack of evidence)\b'
-
-         supports_match = re.search(supports_pattern, final_assessment, re.IGNORECASE)
-         contradicts_match = re.search(contradicts_pattern, final_assessment, re.IGNORECASE)
-         insufficient_match = re.search(insufficient_pattern, final_assessment, re.IGNORECASE)
-
-         # Direct determination based on final assessment keywords
-         if supports_match and not contradicts_match and not insufficient_match:
-             # Get max confidence from supporting evidence
-             confidence = max([item.get("confidence", 0) for item in support_items]) if support_items else 0.7
-
-             # Adjust confidence for claims with strong assertions
-             if claim_has_strong_assertions:
-                 confidence = min(confidence, 0.8)  # Cap confidence for strong assertions
-
-             return "True (Based on Evidence)", max(0.6, confidence)  # Minimum 0.6 confidence
-
-         if contradicts_match and not supports_match:
-             # Get max confidence from contradicting evidence
-             confidence = max([item.get("confidence", 0) for item in contradict_items]) if contradict_items else 0.7
-
-             # For claims with strong assertions, increase confidence in contradiction
-             if claim_has_strong_assertions:
-                 confidence = max(confidence, 0.7)  # Minimum 0.7 confidence for contradicting strong assertions
-
-             return "False (Based on Evidence)", max(0.6, confidence)  # Minimum 0.6 confidence
-
-         if insufficient_match:
-             # For claims with strong assertions without confirming evidence,
-             # change "Uncertain" to a clearer negative verdict
-             if claim_has_strong_assertions:
-                 return "False (Based on Evidence)", 0.7
-             return "Uncertain", 0.4  # Medium-low confidence
-
-     # If we have distinct classifications, weigh them by confidence and quantity
-     if support_items and (not contradict_items or all(item.get("confidence", 0) < 0.95 for item in contradict_items)):
-         # Check if there's high confidence support evidence (greater than 0.95)
-         high_confidence_support = [item for item in support_items if item.get("confidence", 0) > 0.95]
-
-         if high_confidence_support:
-             # High confidence support evidence exists, use it even if there are some contradictions
-             confidence = max([item.get("confidence", 0) for item in high_confidence_support])
-             # For claims with strong assertions, be more conservative with pure support
-             if claim_has_strong_assertions:
-                 confidence = min(confidence, 0.8)
-             return "True (Based on Evidence)", max(0.7, confidence)
-         elif not contradict_items:
-             # All supportive evidence with no contradictions (standard case)
-             confidence = max([item.get("confidence", 0) for item in support_items])
-
-             # For claims with strong assertions, be more conservative with pure support
-             if claim_has_strong_assertions:
-                 # For strong assertions with only support but no contradictions, be cautious
-                 confidence = min(confidence, 0.7)
-                 # If the support is from low-quality evidence, consider it uncertain
-                 support_reasons = [item.get("reason", "").lower() for item in support_items]
-                 weak_supports = sum(1 for reason in support_reasons if
-                                     "general information" in reason or
-                                     "doesn't specify" in reason or
-                                     "does not directly" in reason)
-                 if weak_supports / max(1, len(support_items)) > 0.5:
-                     return "Uncertain", 0.6
-
-             return "True (Based on Evidence)", max(0.6, confidence)
-
-     if contradict_items and not support_items:
-         # All contradicting evidence
-         confidence = max([item.get("confidence", 0) for item in contradict_items])
-
-         # For claims with strong assertions, increase confidence in contradiction
-         if claim_has_strong_assertions:
-             confidence = max(confidence, 0.7)
-
-         return "False (Based on Evidence)", max(0.6, confidence)
-
-     if insufficient_items and len(insufficient_items) > len(support_items) + len(contradict_items):
-         # Mostly insufficient evidence
-         # For claims with strong assertions and mainly insufficient evidence, lean toward "False"
-         if claim_has_strong_assertions:
-             return "False (Based on Evidence)", 0.7
-         return "Uncertain", 0.5  # Medium confidence for explicitly uncertain
-
-     if support_items and contradict_items:
-         # Competing evidence - compare confidence and quantity
-         support_confidence = max([item.get("confidence", 0) for item in support_items])
-         contradict_confidence = max([item.get("confidence", 0) for item in contradict_items])
-
-         # For claims with strong assertions, require stronger support to overcome contradiction
-         if claim_has_strong_assertions:
-             # Higher threshold for strong assertions
-             if support_confidence > contradict_confidence + 0.3:
-                 return "True (Based on Evidence)", support_confidence * 0.9  # Apply a confidence penalty
-             elif contradict_confidence >= support_confidence - 0.1:  # Lower threshold for contradiction
-                 return "False (Based on Evidence)", max(contradict_confidence, 0.7)  # Minimum 0.7 confidence
-             else:
-                 # Default to uncertain for close calls on strong assertions
-                 return "Uncertain", 0.6
-         else:
-             # Standard threshold for regular claims
-             if support_confidence > contradict_confidence + 0.2:
-                 return "True (Based on Evidence)", support_confidence
-             elif contradict_confidence > support_confidence + 0.2:
-                 return "False (Based on Evidence)", contradict_confidence
-             else:
-                 # Close call - check quantity of evidence
-                 if len(support_items) > len(contradict_items) * 2:
-                     return "True (Based on Evidence)", support_confidence * 0.9  # Slight confidence penalty
-                 elif len(contradict_items) > len(support_items) * 2:
-                     return "False (Based on Evidence)", contradict_confidence * 0.9  # Slight confidence penalty
-                 else:
-                     # Truly conflicting evidence
-                     return "Uncertain", 0.5  # Medium confidence
-
-     # Check for evidence quality issues
-     all_unknown = all(item.get("label") == "unknown" for item in classification_results)
-     evidence_text = " ".join([str(item.get("evidence", "")) for item in classification_results])
-
-     # General case: For any claims with all unknown labels that contain markers of strong assertions
-     if all_unknown and claim_has_strong_assertions:
-         # Absence of clear supporting evidence for claims with strong assertions points to "False"
-         return "False (Based on Evidence)", 0.7
-
-     # For general claims, if all items are unknown but evidence clearly mentions the claim
-     if all_unknown:
-         # Examples of direct evidence matching as fallback
-         if re.search(r'\bllama\s*4\b', evidence_text, re.IGNORECASE) and re.search(r'\bmeta\b|\bfacebook\b', evidence_text, re.IGNORECASE) and re.search(r'\breleas', evidence_text, re.IGNORECASE):
-             return "True (Based on Evidence)", 0.7
-         elif re.search(r'\bnot\s+releas', evidence_text, re.IGNORECASE) or re.search(r'\bdenies\b|\bdenied\b', evidence_text, re.IGNORECASE):
-             return "False (Based on Evidence)", 0.7
-
-     # Default to uncertain if no clear pattern - but with special case for claims with strong assertions
-     if claim_has_strong_assertions:
-         # For claims with strong assertions with no clear evidence, default to false
-         return "False (Based on Evidence)", 0.7
-
-     return "Uncertain", 0.3
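The two functions in this file were used together; a sketch of the intended flow, continuing the hypothetical inputs above:

    results = classify_with_llm(claim, evidence)
    verdict, confidence = aggregate_evidence(results)
    print(f"{verdict} (confidence {confidence:.2f})")
    # e.g. True (Based on Evidence) (confidence 0.75)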
modules/evidence_retrieval.py DELETED
@@ -1,944 +0,0 @@
- import logging
- import time
- import re
- import random
- import requests
- import json
- import ssl
- from urllib.parse import urlencode
- from bs4 import BeautifulSoup
- from SPARQLWrapper import SPARQLWrapper, JSON
- from datetime import datetime, timedelta
- from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED
-
- from utils.api_utils import api_error_handler, safe_json_parse
- from utils.models import get_nlp_model
- from modules.claim_extraction import shorten_claim_for_evidence, extract_claims
- from modules.rss_feed import retrieve_evidence_from_rss
- from modules.semantic_analysis import analyze_evidence_relevance, select_diverse_evidence
- from config import SOURCE_CREDIBILITY, NEWS_API_KEY, FACTCHECK_API_KEY
-
- # Import the performance tracker
- from utils.performance import PerformanceTracker
- performance_tracker = PerformanceTracker()
-
- logger = logging.getLogger("misinformation_detector")
-
- # Define early analysis function at the module level so it's available everywhere
- def analyze_early_evidence(claim, source_name, source_evidence):
-     """Pre-analyze evidence while waiting for other sources to complete"""
-     try:
-         if not source_evidence:
-             return None
-
-         logger.info(f"Pre-analyzing {len(source_evidence)} evidence items from {source_name}")
-
-         # Do a quick relevance check using similarity scoring
-         nlp_model = get_nlp_model()
-         claim_doc = nlp_model(claim)
-
-         relevant_evidence = []
-         for evidence in source_evidence:
-             if not isinstance(evidence, str):
-                 continue
-
-             # Look for direct keyword matches first (fast check)
-             is_related = False
-             keywords = [word.lower() for word in claim.split() if len(word) > 3]
-             for keyword in keywords:
-                 if keyword in evidence.lower():
-                     is_related = True
-                     break
-
-             # If no keywords match, do a basic entity check
-             if not is_related:
-                 # Check if claim and evidence share any entities
-                 evidence_doc = nlp_model(evidence[:500])  # Limit for speed
-                 claim_entities = [ent.text.lower() for ent in claim_doc.ents]
-                 evidence_entities = [ent.text.lower() for ent in evidence_doc.ents]
-
-                 common_entities = set(claim_entities).intersection(set(evidence_entities))
-                 if common_entities:
-                     is_related = True
-
-             if is_related:
-                 relevant_evidence.append(evidence)
-
-         logger.info(f"Found {len(relevant_evidence)} relevant items out of {len(source_evidence)} from {source_name}")
-         return relevant_evidence
-     except Exception as e:
-         logger.error(f"Error in early evidence analysis: {e}")
-         return source_evidence  # On error, return original evidence
-
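The pre-analysis above is a two-stage relevance heuristic: cheap keyword overlap first, then a spaCy entity-overlap check only when keywords miss. A condensed sketch of the same idea (illustrative only; names are not from the deleted module):

    def quick_is_related(claim_doc, claim, evidence, nlp):
        # Stage 1: any claim keyword (>3 chars) appears in the evidence text
        if any(w.lower() in evidence.lower() for w in claim.split() if len(w) > 3):
            return True
        # Stage 2: claim and evidence share at least one named entity
        evidence_ents = {e.text.lower() for e in nlp(evidence[:500]).ents}
        claim_ents = {e.text.lower() for e in claim_doc.ents}
        return bool(claim_ents & evidence_ents)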
- # New function to get recent date for filtering news
- def get_recent_date_range():
-     """Return date range for recent news filtering - last 3 days"""
-     today = datetime.now()
-     three_days_ago = today - timedelta(days=3)
-     return three_days_ago.strftime('%Y-%m-%d'), today.strftime('%Y-%m-%d')
-
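The returned bounds are ISO-formatted strings that feed the NewsAPI from/to parameters further down, for example:

    from_date, to_date = get_recent_date_range()
    # e.g. ('2025-04-07', '2025-04-10') if called on 2025-04-10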
- @api_error_handler("wikipedia")
- def retrieve_evidence_from_wikipedia(claim):
-     """Retrieve evidence from Wikipedia for a given claim"""
-     logger.info(f"Retrieving evidence from Wikipedia for: {claim}")
-
-     # Ensure shortened_claim is a string
-     try:
-         shortened_claim = shorten_claim_for_evidence(claim)
-     except Exception as e:
-         logger.error(f"Error in claim shortening: {e}")
-         shortened_claim = claim  # Fallback to original claim
-
-     # Ensure query_parts is a list of strings
-     query_parts = str(shortened_claim).split()
-     evidence = []
-     source_count = {"wikipedia": 0}
-
-     for i in range(len(query_parts), 0, -1):  # Start with full query, shorten iteratively
-         try:
-             # Safely join and encode query
-             current_query = "+".join(query_parts[:i])
-             search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={current_query}&format=json"
-             logger.info(f"Wikipedia search URL: {search_url}")
-
-             headers = {
-                 "User-Agent": "MisinformationDetectionResearchBot/1.0 (Research Project)"
-             }
-
-             # Make the search request with reduced timeout
-             response = requests.get(search_url, headers=headers, timeout=7)
-             response.raise_for_status()
-
-             # Safely parse JSON
-             search_data = safe_json_parse(response, "wikipedia")
-
-             # Safely extract search results
-             search_results = search_data.get("query", {}).get("search", [])
-
-             # Ensure search_results is a list
-             if not isinstance(search_results, list):
-                 logger.warning(f"Unexpected search results type: {type(search_results)}")
-                 search_results = []
-
-             # Use ThreadPoolExecutor to fetch page content in parallel
-             with ThreadPoolExecutor(max_workers=3) as executor:
-                 # Submit up to 3 page requests in parallel
-                 futures = []
-                 for idx, result in enumerate(search_results[:3]):
-                     # Ensure result is a dictionary
-                     if not isinstance(result, dict):
-                         logger.warning(f"Skipping non-dictionary result: {type(result)}")
-                         continue
-
-                     # Safely extract title
-                     page_title = result.get("title", "")
-                     if not page_title:
-                         continue
-
-                     page_url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
-
-                     # Submit the page request task to executor
-                     futures.append(executor.submit(
-                         fetch_wikipedia_page_content,
-                         page_url,
-                         page_title,
-                         headers
-                     ))
-
-                 # Process completed futures as they finish
-                 for future in as_completed(futures):
-                     try:
-                         page_result = future.result()
-                         if page_result:
-                             evidence.append(page_result)
-                             source_count["wikipedia"] += 1
-                     except Exception as e:
-                         logger.error(f"Error processing Wikipedia page: {e}")
-
-             # Stop if we found any evidence
-             if evidence:
-                 break
-
-         except Exception as e:
-             logger.error(f"Error retrieving from Wikipedia: {str(e)}")
-             continue
-
-     # Ensure success is a boolean
-     success = bool(evidence)
-
-     # Safely log evidence retrieval
-     try:
-         performance_tracker.log_evidence_retrieval(success, source_count)
-     except Exception as e:
-         logger.error(f"Error logging evidence retrieval: {e}")
-
-     if not evidence:
-         logger.warning("No evidence found from Wikipedia.")
-
-     return evidence
-
- def fetch_wikipedia_page_content(page_url, page_title, headers):
-     """Helper function to fetch and parse Wikipedia page content"""
-     try:
-         # Get page content with reduced timeout
-         page_response = requests.get(page_url, headers=headers, timeout=5)
-         page_response.raise_for_status()
-
-         # Extract relevant sections using BeautifulSoup
-         soup = BeautifulSoup(page_response.text, 'html.parser')
-         paragraphs = soup.find_all('p', limit=3)  # Limit to first 3 paragraphs
-         content = " ".join([para.get_text(strip=True) for para in paragraphs])
-
-         # Truncate content to reduce token usage earlier in the pipeline
-         if len(content) > 300:
-             content = content[:297] + "..."
-
-         if content.strip():  # Ensure content is not empty
-             return f"Title: {page_title}, URL: {page_url}, Content: {content}"
-         return None
-     except Exception as e:
-         logger.error(f"Error fetching Wikipedia page {page_url}: {e}")
-         return None
-
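A minimal sketch of exercising the Wikipedia retriever on its own (hypothetical claim; requires network access):

    evidence = retrieve_evidence_from_wikipedia("The Eiffel Tower is in Paris")
    for item in evidence:
        # "Title: ..., URL: ..., Content: ..." strings, truncated to ~300 chars
        print(item)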
- # Update the WikiData function to fix SSL issues
- @api_error_handler("wikidata")
- def retrieve_evidence_from_wikidata(claim):
-     """Retrieve evidence from WikiData for a given claim"""
-     logger.info(f"Retrieving evidence from WikiData for: {claim}")
-
-     # Prepare entities for SPARQL query
-     shortened_claim = shorten_claim_for_evidence(claim)
-     query_terms = shortened_claim.split()
-
-     # Initialize SPARQLWrapper for WikiData
-     sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
-
-     # Use a more conservative user agent to avoid blocks
-     sparql.addCustomHttpHeader("User-Agent", "MisinformationDetectionResearchBot/1.0")
-
-     # Fix SSL issues by disabling SSL verification for this specific request
-     try:
-         # Create a context where we don't verify SSL certs
-         import ssl
-         import urllib.request
-
-         # Create a context that doesn't verify certificates
-         ssl_context = ssl._create_unverified_context()
-
-         # Monkey patch the opener for SPARQLWrapper
-         opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=ssl_context))
-         urllib.request.install_opener(opener)
-     except Exception as e:
-         logger.error(f"Error setting up SSL context: {str(e)}")
-
-     # Construct basic SPARQL query for relevant entities
-     query = """
-     SELECT ?item ?itemLabel ?description ?article WHERE {
-       SERVICE wikibase:mwapi {
-         bd:serviceParam wikibase:api "EntitySearch" .
-         bd:serviceParam wikibase:endpoint "www.wikidata.org" .
-         bd:serviceParam mwapi:search "%s" .
-         bd:serviceParam mwapi:language "en" .
-         ?item wikibase:apiOutputItem mwapi:item .
-       }
-       ?item schema:description ?description .
-       FILTER(LANG(?description) = "en")
-       OPTIONAL {
-         ?article schema:about ?item .
-         ?article schema:isPartOf <https://en.wikipedia.org/> .
-       }
-       SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
-     }
-     LIMIT 5
-     """ % " ".join(query_terms)
-
-     sparql.setQuery(query)
-     sparql.setReturnFormat(JSON)
-
-     try:
-         results = sparql.query().convert()
-
-         wikidata_evidence = []
-
-         for result in results["results"]["bindings"]:
-             entity_label = result.get("itemLabel", {}).get("value", "Unknown")
-             description = result.get("description", {}).get("value", "No description")
-             article_url = result.get("article", {}).get("value", "")
-
-             # Truncate description to reduce token usage
-             if len(description) > 200:
-                 description = description[:197] + "..."
-
-             evidence_text = f"Entity: {entity_label}, Description: {description}"
-             if article_url:
-                 evidence_text += f", URL: {article_url}"
-
-             wikidata_evidence.append(evidence_text)
-
-         logger.info(f"Retrieved {len(wikidata_evidence)} WikiData entities")
-         return wikidata_evidence
-
-     except Exception as e:
-         logger.error(f"Error retrieving from WikiData: {str(e)}")
-         return []
-
- @api_error_handler("openalex")
- def retrieve_evidence_from_openalex(claim):
-     """Retrieve evidence from OpenAlex for a given claim (replacement for Semantic Scholar)"""
-     logger.info(f"Retrieving evidence from OpenAlex for: {claim}")
-
-     try:
-         shortened_claim = shorten_claim_for_evidence(claim)
-         query = shortened_claim.replace(" ", "+")
-
-         # OpenAlex API endpoint
-         api_url = f"https://api.openalex.org/works?search={query}&filter=is_paratext:false&per_page=3"
-
-         headers = {
-             "Accept": "application/json",
-             "User-Agent": "MisinformationDetectionResearchBot/1.0 ([email protected])",
-         }
-
-         scholarly_evidence = []
-
-         try:
-             # Request with reduced timeout
-             response = requests.get(api_url, headers=headers, timeout=8)
-
-             # Check response status
-             if response.status_code == 200:
-                 # Successfully retrieved data
-                 data = safe_json_parse(response, "openalex")
-                 papers = data.get("results", [])
-
-                 for paper in papers:
-                     title = paper.get("title", "Unknown Title")
-                     abstract = paper.get("abstract_inverted_index", None)
-
-                     # OpenAlex stores abstracts in an inverted index format, so we need to reconstruct it
-                     abstract_text = "No abstract available"
-                     if abstract:
-                         try:
-                             # Simple approach to reconstruct from inverted index
-                             # For a production app, implement a proper reconstruction algorithm
-                             words = list(abstract.keys())
-                             abstract_text = " ".join(words[:30]) + "..."
-                         except Exception as e:
-                             logger.error(f"Error reconstructing abstract: {e}")
-
-                     url = paper.get("doi", "")
-                     if url and not url.startswith("http"):
-                         url = f"https://doi.org/{url}"
-
-                     year = ""
-                     publication_date = paper.get("publication_date", "")
-                     if publication_date:
-                         year = publication_date.split("-")[0]
-
-                     # Truncate abstract to reasonable length
-                     if len(abstract_text) > 250:
-                         abstract_text = abstract_text[:247] + "..."
-
-                     evidence_text = f"Title: {title}, Year: {year}, Abstract: {abstract_text}, URL: {url}"
-                     scholarly_evidence.append(evidence_text)
-
-             else:
-                 logger.error(f"OpenAlex API error: {response.status_code}")
-
-         except requests.exceptions.Timeout:
-             logger.warning("OpenAlex request timed out")
-         except requests.exceptions.ConnectionError:
-             logger.warning("OpenAlex connection error")
-         except Exception as e:
-             logger.error(f"Unexpected error in OpenAlex request: {str(e)}")
-
-         logger.info(f"Retrieved {len(scholarly_evidence)} scholarly papers from OpenAlex")
-         return scholarly_evidence
-
-     except Exception as e:
-         logger.error(f"Fatal error in OpenAlex retrieval: {str(e)}")
-         return []
-
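The abstract handling above only joins the first 30 index keys, which scrambles word order; the inline comment asks for a proper reconstruction. Since abstract_inverted_index maps each word to the list of positions where it occurs in the abstract, a faithful rebuild is a short sort (a sketch, not part of the deleted code):

    def reconstruct_abstract(inverted_index):
        # Place every word at each of its recorded positions, then sort by position
        positions = [
            (pos, word)
            for word, pos_list in inverted_index.items()
            for pos in pos_list
        ]
        return " ".join(word for _, word in sorted(positions))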
- @api_error_handler("factcheck")
- def retrieve_evidence_from_claimreview(claim):
-     """Retrieve evidence from Google's ClaimReview for a given claim"""
-     logger.info(f"Retrieving evidence from ClaimReview for: {claim}")
-     factcheck_api_key = FACTCHECK_API_KEY
-
-     # Safely shorten claim
-     try:
-         shortened_claim = shorten_claim_for_evidence(claim)
-     except Exception as e:
-         logger.error(f"Error shortening claim: {e}")
-         shortened_claim = claim
-
-     query_parts = str(shortened_claim).split()
-     factcheck_results = []
-     source_count = {"factcheck": 0}
-
-     for i in range(len(query_parts), 0, -1):  # Iteratively try shorter queries
-         try:
-             current_query = " ".join(query_parts[:i])
-             encoded_query = urlencode({"query": current_query})
-             factcheck_url = f"https://factchecktools.googleapis.com/v1alpha1/claims:search?{encoded_query}&key={factcheck_api_key}"
-             logger.info(f"Factcheck URL: {factcheck_url}")
-
-             # Make request with reduced timeout
-             response = requests.get(factcheck_url, timeout=7)
-             response.raise_for_status()
-             data = safe_json_parse(response, "factcheck")
-
-             # Safely extract claims
-             claims = data.get("claims", [])
-             if not isinstance(claims, list):
-                 logger.warning(f"Unexpected claims type: {type(claims)}")
-                 claims = []
-
-             if claims:  # If results found
-                 logger.info(f"Results found for query '{current_query}'.")
-                 for item in claims:
-                     try:
-                         # Ensure item is a dictionary
-                         if not isinstance(item, dict):
-                             logger.warning(f"Skipping non-dictionary item: {type(item)}")
-                             continue
-
-                         claim_text = str(item.get("text", ""))
-                         # Truncate claim text
-                         if len(claim_text) > 200:
-                             claim_text = claim_text[:197] + "..."
-
-                         reviews = item.get("claimReview", [])
-
-                         # Ensure reviews is a list
-                         if not isinstance(reviews, list):
-                             logger.warning(f"Unexpected reviews type: {type(reviews)}")
-                             reviews = []
-
-                         for review in reviews:
-                             # Ensure review is a dictionary
-                             if not isinstance(review, dict):
-                                 logger.warning(f"Skipping non-dictionary review: {type(review)}")
-                                 continue
-
-                             publisher = str(review.get("publisher", {}).get("name", "Unknown Source"))
-                             rating = str(review.get("textualRating", "Unknown"))
-                             review_url = str(review.get("url", ""))
-
-                             if claim_text:
-                                 factcheck_results.append(
-                                     f"Claim: {claim_text}, Rating: {rating}, " +
-                                     f"Source: {publisher}, URL: {review_url}"
-                                 )
-                                 source_count["factcheck"] += 1
-
-                     except Exception as e:
-                         logger.error(f"Error processing FactCheck result: {e}")
-
-                 break  # Break once we have results
-             else:
-                 logger.info(f"No results for query '{current_query}', trying shorter version.")
-
-         except Exception as e:
-             logger.error(f"Error in FactCheck retrieval: {e}")
-
-     # Safely log evidence retrieval
-     try:
-         success = bool(factcheck_results)
-         performance_tracker.log_evidence_retrieval(success, source_count)
-     except Exception as e:
-         logger.error(f"Error logging evidence retrieval: {e}")
-
-     if not factcheck_results:
-         logger.warning("No factcheck evidence found after trying all query variants.")
-
-     return factcheck_results
-
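The parser above reads exactly these fields from the Fact Check Tools response; an illustrative item with invented values, written as the Python dict the code would see after JSON parsing:

    sample_response = {
        "claims": [
            {
                "text": "Example claim being checked...",
                "claimReview": [
                    {
                        "publisher": {"name": "Example Fact Checker"},
                        "textualRating": "False",
                        "url": "https://example.org/fact-checks/123",
                    }
                ],
            }
        ]
    }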
457
- @api_error_handler("newsapi")
458
- def retrieve_news_articles(claim):
459
- """Retrieve evidence from NewsAPI for a given claim with improved single request approach"""
460
- logger.info(f"Retrieving evidence from News API for: {claim}")
461
-
462
- # Get API key
463
- news_api_key = NEWS_API_KEY
464
- if not news_api_key:
465
- logger.error("No NewsAPI key available")
466
- return []
467
-
468
- news_results = []
469
- source_count = {"news": 0}
470
-
471
- # Get date range for recent news
472
- from_date, to_date = get_recent_date_range()
473
- logger.info(f"Filtering for news from {from_date} to {to_date}")
474
-
475
- try:
476
- # Extract a simplified claim for better matching
477
- shortened_claim = shorten_claim_for_evidence(claim)
478
-
479
- # Use a single endpoint with proper parameters
480
- encoded_query = urlencode({"q": shortened_claim})
481
-
482
- # Use the 'everything' endpoint as it's more comprehensive
483
- news_api_url = f"https://newsapi.org/v2/everything?{encoded_query}&apiKey={news_api_key}&language=en&pageSize=5&sortBy=publishedAt&from={from_date}&to={to_date}"
484
-
485
- log_url = news_api_url.replace(news_api_key, "API_KEY_REDACTED")
486
- logger.info(f"Requesting: {log_url}")
487
-
488
- # Make a single request with proper headers and reduced timeout
489
- headers = {
490
- "User-Agent": "MisinformationDetectionResearchBot/1.0",
491
- "X-Api-Key": news_api_key,
492
- "Accept": "application/json"
493
- }
494
-
495
- response = requests.get(
496
- news_api_url,
497
- headers=headers,
498
- timeout=8
499
- )
500
-
501
- logger.info(f"Response status: {response.status_code}")
502
-
503
- if response.status_code == 200:
504
- data = safe_json_parse(response, "newsapi")
505
-
506
- if data.get("status") == "ok":
507
- articles = data.get("articles", [])
508
- logger.info(f"Found {len(articles)} articles")
509
-
510
- for article in articles:
511
- try:
512
- # Robust article parsing
513
- title = str(article.get("title", ""))
514
- description = str(article.get("description", ""))
515
- content = str(article.get("content", ""))
516
- source_name = str(article.get("source", {}).get("name", "Unknown"))
517
- url = str(article.get("url", ""))
518
- published_at = str(article.get("publishedAt", ""))
519
-
520
- # Parse date to prioritize recent content
521
- article_date = None
522
- try:
523
- if published_at:
524
- article_date = datetime.strptime(published_at.split('T')[0], '%Y-%m-%d')
525
- except Exception as date_error:
526
- logger.warning(f"Could not parse date: {published_at}")
527
-
528
- # Calculate recency score (higher = more recent)
529
- recency_score = 1.0 # Default
530
- if article_date:
531
- days_old = (datetime.now() - article_date).days
532
- if days_old == 0: # Today
533
- recency_score = 3.0
534
- elif days_old == 1: # Yesterday
535
- recency_score = 2.0
536
-
537
- # Use description if content is empty or too short
538
- if not content or len(content) < 50:
539
- content = description
540
-
541
- # Truncate content to reduce token usage
542
- if len(content) > 250:
543
- content = content[:247] + "..."
544
-
545
- # Ensure meaningful content
546
- if title and (content or description):
547
- news_item = {
548
- "text": (
549
- f"Title: {title}, " +
550
- f"Source: {source_name}, " +
551
- f"Date: {published_at}, " +
552
- f"URL: {url}, " +
553
- f"Content: {content}"
554
- ),
555
- "recency_score": recency_score,
556
- "date": article_date
557
- }
558
- news_results.append(news_item)
559
- source_count["news"] += 1
560
- logger.info(f"Added article: {title}")
561
-
562
- except Exception as article_error:
563
- logger.error(f"Error processing article: {article_error}")
564
-
565
- # Sort results by recency
566
- if news_results:
567
- news_results.sort(key=lambda x: x.get('recency_score', 0), reverse=True)
568
-
569
- except Exception as query_error:
570
- logger.error(f"Error processing query: {query_error}")
571
-
572
- # Convert to plain text list for compatibility with existing code
573
- news_texts = [item["text"] for item in news_results]
574
-
575
- # Log evidence retrieval
576
- try:
577
- success = bool(news_texts)
578
- performance_tracker.log_evidence_retrieval(success, source_count)
579
- except Exception as log_error:
580
- logger.error(f"Error logging evidence retrieval: {log_error}")
581
-
582
- # Log results
583
- if news_texts:
584
- logger.info(f"Retrieved {len(news_texts)} news articles")
585
- else:
586
- logger.warning("No news articles found")
587
-
588
- return news_texts
589
-
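The recency weighting above is a simple step function, and results are sorted by it before being flattened to strings; a brief sketch of the observable behavior (claim text hypothetical, requires a NewsAPI key):

    # recency_score: published today -> 3.0, yesterday -> 2.0, older (within window) -> 1.0
    articles = retrieve_news_articles("Example recent-news claim")
    # returns plain "Title: ..., Source: ..., Date: ..." strings, most recent first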
590
- def retrieve_combined_evidence(claim):
591
- """
592
- Retrieve evidence from multiple sources in parallel and analyze relevance using semantic similarity
593
- with category-aware source prioritization and optimized parallel processing
594
- """
595
- logger.info(f"Starting evidence retrieval for: {claim}")
596
- start_time = time.time()
597
-
598
- # Use the category detector to prioritize sources
599
- from modules.category_detection import get_prioritized_sources, get_category_specific_rss_feeds
600
-
601
- # Get source priorities based on claim category
602
- priorities = get_prioritized_sources(claim)
603
- claim_category = priorities.get("category", "general")
604
- requires_recent_evidence = priorities.get("requires_recent", False)
605
-
606
- logger.info(f"Detected claim category: {claim_category} (recent: {requires_recent_evidence})")
607
-
608
- # Initialize results dictionary
609
- results = {
610
- "wikipedia": [],
611
- "wikidata": [],
612
- "claimreview": [],
613
- "news": [],
614
- "scholarly": [],
615
- "rss": []
616
- }
617
-
618
- # Track source counts and relevant evidence
619
- source_counts = {}
620
- relevant_evidence = {}
621
- total_evidence_count = 0
622
- relevant_evidence_count = 0
623
-
624
- # Define primary and secondary sources outside the try block
625
- # so they're available in the except block
626
- primary_sources = []
627
- for source_name in priorities.get("primary", []):
628
- if source_name == "wikipedia":
629
- primary_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
630
- elif source_name == "wikidata":
631
- primary_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
632
- elif source_name == "claimreview":
633
- primary_sources.append(("claimreview", retrieve_evidence_from_claimreview, claim))
634
- elif source_name == "news":
635
- primary_sources.append(("news", retrieve_news_articles, claim))
636
- elif source_name == "scholarly":
637
- primary_sources.append(("scholarly", retrieve_evidence_from_openalex, claim))
638
- elif source_name == "rss":
639
- # Get category-specific RSS max count
640
- max_results = 8 if requires_recent_evidence else 5
641
-
642
- # If the claim is science or technology related and we need to optimize
643
- # use category-specific RSS feeds
644
- if claim_category in ["science", "technology", "politics"]:
645
- # Get specialized RSS module to temporarily use category-specific feeds
646
- category_feeds = get_category_specific_rss_feeds(claim_category)
647
- if category_feeds:
648
- primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results, category_feeds))
649
- else:
650
- primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
651
- else:
652
- primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
653
-
654
- # Prepare secondary sources
655
- secondary_sources = []
656
- for source_name in priorities.get("secondary", []):
657
- if source_name == "wikipedia":
658
- secondary_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
659
- elif source_name == "wikidata":
660
- secondary_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
661
- elif source_name == "claimreview":
662
- secondary_sources.append(("claimreview", retrieve_evidence_from_claimreview, claim))
663
- elif source_name == "news":
664
- secondary_sources.append(("news", retrieve_news_articles, claim))
665
- elif source_name == "scholarly":
666
- secondary_sources.append(("scholarly", retrieve_evidence_from_openalex, claim))
667
- elif source_name == "rss":
668
- max_results = 5 if requires_recent_evidence else 3
669
- # Use category-specific feeds if available
670
- if claim_category in ["science", "technology", "politics"]:
671
- category_feeds = get_category_specific_rss_feeds(claim_category)
672
- if category_feeds:
673
- secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results, category_feeds))
674
- else:
675
- secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
676
- else:
677
- secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
678
-
679
- # Optimize parallel processing for evidence retrieval with early results processing
680
- try:
681
- # Define function to safely retrieve evidence
682
- def safe_retrieve(source_name, retrieval_func, *args):
683
- try:
684
- source_result = retrieval_func(*args) or []
685
- return source_name, source_result
686
- except Exception as e:
687
- logger.error(f"Error retrieving from {source_name}: {str(e)}")
688
- return source_name, []
689
-
690
- # Define function to analyze evidence relevance
691
- def analyze_evidence_quick(evidence_items, claim_text):
692
- if not evidence_items or not claim_text:
693
- return []
694
-
695
- # Extract important keywords from claim
696
- keywords = [word.lower() for word in claim_text.split() if len(word) > 3]
697
-
698
- # Check for direct relevance
699
- relevant_items = []
700
- for evidence in evidence_items:
701
- if not isinstance(evidence, str):
702
- continue
703
-
704
- evidence_lower = evidence.lower()
705
-
706
- # Check if evidence contains any important keywords from claim
707
- if any(keyword in evidence_lower for keyword in keywords):
708
- relevant_items.append(evidence)
709
- continue
710
-
711
- # Check for claim subject in evidence (e.g. "earth" in "earth is flat")
712
- claim_parts = claim_text.split()
713
- if len(claim_parts) > 0 and claim_parts[0].lower() in evidence_lower:
714
- relevant_items.append(evidence)
715
- continue
716
-
717
- return relevant_items
718
-
719
- # Use ThreadPoolExecutor with a reasonable number of workers
720
- # Start with primary sources first - use all available sources in parallel
721
- with ThreadPoolExecutor(max_workers=min(4, len(primary_sources))) as executor:
722
- # Submit all primary source tasks
723
- futures_to_source = {
724
- executor.submit(safe_retrieve, source_name, func, *args): source_name
725
- for source_name, func, *args in primary_sources
726
- }
727
-
728
- # Track completed sources
729
- completed_sources = set()
730
-
731
- # Process results as they complete using as_completed for early processing
732
- for future in as_completed(futures_to_source):
733
- try:
734
- source_name, source_results = future.result()
735
- results[source_name] = source_results
736
- source_counts[source_name] = len(source_results)
737
- completed_sources.add(source_name)
738
- logger.info(f"Retrieved {len(source_results)} results from {source_name}")
739
-
740
- # Quick relevance analysis
741
- if source_results:
742
- relevant_items = analyze_evidence_quick(source_results, claim)
743
- relevant_evidence[source_name] = relevant_items
744
- total_evidence_count += len(source_results)
745
- relevant_evidence_count += len(relevant_items)
746
- logger.info(f"Found {len(relevant_items)} relevant items out of {len(source_results)} from {source_name}")
747
-
748
- # Start background pre-analysis while waiting for other sources
749
- try:
750
- executor.submit(
751
- analyze_early_evidence,
752
- claim,
753
- source_name,
754
- source_results
755
- )
756
- except Exception as e:
757
- logger.error(f"Error in early evidence analysis: {e}")
758
-
759
- except Exception as e:
760
- logger.error(f"Error processing future result: {str(e)}")
761
-
762
- # Check if we have sufficient RELEVANT evidence from primary sources
763
- # If not enough relevant evidence, query secondary sources
764
- # in parallel even if we have a lot of total evidence
765
- if relevant_evidence_count < 2:
766
- logger.info(f"Only found {relevant_evidence_count} relevant evidence items, querying secondary sources")
767
-
768
- # Add Wikipedia and Wikidata if they weren't in primary sources and haven't been queried yet
769
- must_check_sources = []
770
- if "wikipedia" not in completed_sources:
771
- must_check_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
772
-
773
- if "wikidata" not in completed_sources:
774
- must_check_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
775
-
776
- # Combine with other secondary sources
777
- remaining_sources = must_check_sources + [
778
- (source_name, func, *args) for source_name, func, *args in secondary_sources
779
- if source_name not in completed_sources
780
- ]
781
-
782
- with ThreadPoolExecutor(max_workers=max(1, min(3, len(remaining_sources)))) as executor:
783
- # Submit all secondary source tasks
784
- futures_to_source = {
785
- executor.submit(safe_retrieve, source_name, func, *args): source_name
786
- for source_name, func, *args in remaining_sources
787
- }
788
-
789
- # Process results as they complete
790
- for future in as_completed(futures_to_source):
791
- try:
792
- source_name, source_results = future.result()
793
- results[source_name] = source_results
794
- source_counts[source_name] = len(source_results)
795
- logger.info(f"Retrieved {len(source_results)} results from {source_name}")
796
-
797
- # Quick relevance analysis for these as well
798
- if source_results:
799
- relevant_items = analyze_evidence_quick(source_results, claim)
800
- relevant_evidence[source_name] = relevant_items
801
- total_evidence_count += len(source_results)
802
- relevant_evidence_count += len(relevant_items)
803
- logger.info(f"Found {len(relevant_items)} relevant items out of {len(source_results)} from {source_name}")
804
- except Exception as e:
805
- logger.error(f"Error processing future result: {str(e)}")
806
-
807
- except Exception as e:
808
- logger.error(f"Error in parallel evidence retrieval: {str(e)}")
809
- # Fall back to sequential retrieval as a last resort
810
- try:
811
- logger.warning("Falling back to sequential retrieval due to parallel execution failure")
812
- # Sequential retrieval over the primary sources as a last-resort fallback
813
- for source_name, func, *args in primary_sources:
814
- try:
815
- results[source_name] = func(*args) or []
816
- source_counts[source_name] = len(results[source_name])
817
- except Exception as source_error:
818
- logger.error(f"Error in sequential {source_name} retrieval: {str(source_error)}")
819
-
820
- # For sequential retrieval, always check Wikipedia and Wikidata as fallbacks
821
- if not results.get("wikipedia"):  # completed_sources may be undefined if the parallel block failed early
822
- try:
823
- results["wikipedia"] = retrieve_evidence_from_wikipedia(claim) or []
824
- source_counts["wikipedia"] = len(results["wikipedia"])
825
- except Exception as e:
826
- logger.error(f"Error in fallback Wikipedia retrieval: {e}")
827
-
828
- if not results.get("wikidata"):
829
- try:
830
- results["wikidata"] = retrieve_evidence_from_wikidata(claim) or []
831
- source_counts["wikidata"] = len(results["wikidata"])
832
- except Exception as e:
833
- logger.error(f"Error in fallback Wikidata retrieval: {e}")
834
-
835
- except Exception as fallback_error:
836
- logger.error(f"Error in fallback sequential retrieval: {str(fallback_error)}")
837
-
838
- # Gather all evidence
839
- all_evidence = []
840
- for source, items in results.items():
841
- if isinstance(items, list):
842
- for item in items:
843
- if item and isinstance(item, str):
844
- all_evidence.append(item)
845
-
846
- # Skip processing if no evidence
847
- if not all_evidence:
848
- logger.warning("No evidence collected")
849
-
850
- # Fallback: try direct search for the claim subject
851
- try:
852
- logger.info("No evidence found, trying fallback subject search")
853
-
854
- # Extract the main subject using NLP
855
- nlp = get_nlp_model()
856
- doc = nlp(claim)
857
-
858
- # Find main subject entities or nouns
859
- subjects = []
860
- for ent in doc.ents:
861
- if ent.label_ in ["PERSON", "ORG", "GPE"]:
862
- subjects.append(ent.text)
863
-
864
- # If no entities found, use first noun phrase
865
- if not subjects:
866
- for chunk in doc.noun_chunks:
867
- subjects.append(chunk.text)
868
- break
869
-
870
- if subjects:
871
- # Try a direct search with just the subject
872
- logger.info(f"Trying fallback search with subject: {subjects[0]}")
873
-
874
- # Make sure we try Wikipedia for the subject regardless of priorities
875
- try:
876
- wiki_evidence = retrieve_evidence_from_wikipedia(subjects[0]) or []
877
- all_evidence.extend(wiki_evidence)
878
- logger.info(f"Retrieved {len(wiki_evidence)} results from fallback Wikipedia search")
879
- except Exception as e:
880
- logger.error(f"Error in fallback Wikipedia search: {e}")
881
-
882
- # If still no evidence, try other sources
883
- if not all_evidence:
884
- # Do fallback searches in parallel
885
- with ThreadPoolExecutor(max_workers=2) as executor:
886
- fallback_futures = {
887
- "news": executor.submit(retrieve_news_articles, subjects[0]),
888
- "wikidata": executor.submit(retrieve_evidence_from_wikidata, subjects[0])
889
- }
890
-
891
- # Process results as they complete
892
- for source, future in fallback_futures.items():
893
- try:
894
- fallback_results = future.result() or []
895
- if fallback_results:
896
- all_evidence.extend(fallback_results[:2]) # Add up to 2 results from each
897
- logger.info(f"Retrieved {len(fallback_results)} results from fallback {source} search")
898
- except Exception as e:
899
- logger.error(f"Error in fallback {source} search: {str(e)}")
900
-
901
- except Exception as subj_error:
902
- logger.error(f"Error in fallback subject search: {str(subj_error)}")
903
-
904
- # If still no evidence, return empty list
905
- if not all_evidence:
906
- return []
907
-
908
- # Use semantic analysis to score and select the most relevant evidence
909
- try:
910
- # For science and technology claims, boost the weight of scholarly sources
911
- if claim_category in ["science", "technology"]:
912
- from config import SOURCE_CREDIBILITY
913
- # Create a temporary copy with boosted reliability for relevant sources
914
- enhanced_credibility = dict(SOURCE_CREDIBILITY)
915
-
916
- # Add enhanced weights for scientific sources
917
- from modules.category_detection import SOURCE_RELIABILITY_BY_CATEGORY
918
- for domain, reliability in SOURCE_RELIABILITY_BY_CATEGORY.get(claim_category, {}).items():
919
- enhanced_credibility[domain] = reliability
920
-
921
- # Use the enhanced credibility for evidence analysis
922
- analyzed_evidence = analyze_evidence_relevance(claim, all_evidence, enhanced_credibility)
923
- else:
924
- # Analyze evidence relevance using semantic similarity with default weights
925
- from config import SOURCE_CREDIBILITY
926
- analyzed_evidence = analyze_evidence_relevance(claim, all_evidence, SOURCE_CREDIBILITY)
927
-
928
- # Log evidence scoring
929
- logger.info(f"Analyzed {len(analyzed_evidence)} evidence items")
930
-
931
- # Select diverse, relevant evidence items
932
- final_evidence = select_diverse_evidence(analyzed_evidence, max_items=5)
933
-
934
- # Log source distribution and selected count
935
- logger.info(f"Evidence source distribution: {source_counts}")
936
- logger.info(f"Selected evidence count: {len(final_evidence)}")
937
-
938
- # Return maximum 5 evidence items (to control API costs)
939
- return final_evidence[:5]
940
-
941
- except Exception as e:
942
- logger.error(f"Error in evidence analysis: {str(e)}")
943
- # Fallback to simple selection (top 5 items)
944
- return all_evidence[:5]
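
The retrieval flow above is essentially the standard ThreadPoolExecutor + as_completed pattern: submit every source, handle whichever finishes first, and never let one failing source break the rest. A minimal, self-contained sketch of that pattern (the two fetchers are hypothetical stand-ins, not the module's real retrievers):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def fetch_a(query):
        # Hypothetical retriever; the real module queries Wikipedia, Wikidata, news APIs, etc.
        return [f"A: evidence about {query}"]

    def fetch_b(query):
        return [f"B: evidence about {query}"]

    def retrieve_parallel(query, sources):
        """Run retrievers concurrently; collect results as each completes."""
        results = {}
        with ThreadPoolExecutor(max_workers=max(1, min(4, len(sources)))) as executor:
            futures = {executor.submit(func, query): name for name, func in sources}
            for future in as_completed(futures):
                name = futures[future]
                try:
                    results[name] = future.result() or []
                except Exception:
                    results[name] = []  # one failed source must not break the others
        return results

    print(retrieve_parallel("earth orbits the sun", [("a", fetch_a), ("b", fetch_b)]))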
 
modules/explanation.py DELETED
@@ -1,181 +0,0 @@
1
- import logging
2
- import re
3
- import ast
4
- from utils.models import get_llm_model
5
-
6
- logger = logging.getLogger("misinformation_detector")
7
-
8
- def extract_most_relevant_evidence(evidence_results):
9
- """
10
- Intelligently extract the most relevant piece of evidence
11
-
12
- Args:
13
- evidence_results (list): List of evidence items
14
-
15
- Returns:
16
- str: Most relevant evidence piece
17
- """
18
- if not evidence_results:
19
- return None
20
-
21
- # If evidence is a dictionary with 'evidence' key
22
- if isinstance(evidence_results[0], dict):
23
- # Sort by confidence if available
24
- sorted_evidence = sorted(
25
- evidence_results,
26
- key=lambda x: x.get('confidence', 0),
27
- reverse=True
28
- )
29
-
30
- # Return the evidence from the highest confidence item
31
- for item in sorted_evidence:
32
- evidence = item.get('evidence')
33
- if evidence:
34
- return evidence
35
-
36
- # If plain list of evidence
37
- return next((ev for ev in evidence_results if ev and isinstance(ev, str)), None)
38
-
39
- def generate_explanation(claim, evidence_results, truth_label, confidence=None):
40
- """
41
- Generate an explanation for the claim's classification
42
-
43
- Args:
44
- claim (str): The original claim
45
- evidence_results (list/str): Evidence supporting the classification
46
- truth_label (str): Classification of the claim
47
- confidence (float): Confidence level (0-1)
48
-
49
- Returns:
50
- str: Explanation of the claim's classification
51
- """
52
- logger.info(f"Generating explanation for claim with verdict: {truth_label}")
53
-
54
- try:
55
- # Normalize evidence_results to a list
56
- if not isinstance(evidence_results, list):
57
- try:
58
- evidence_results = ast.literal_eval(str(evidence_results)) if evidence_results else []
59
- except (ValueError, SyntaxError, TypeError):
60
- evidence_results = [evidence_results] if evidence_results else []
61
-
62
- # Get the LLM model
63
- explanation_model = get_llm_model()
64
-
65
- # Extract most relevant evidence
66
- most_relevant_evidence = extract_most_relevant_evidence(evidence_results)
67
-
68
- # Prepare evidence text for prompt
69
- evidence_text = "\n".join([
70
- f"Evidence {i+1}: {str(ev)[:200] + '...' if len(str(ev)) > 200 else str(ev)}"
71
- for i, ev in enumerate(evidence_results[:3])
72
- ])
73
-
74
- # Convert confidence to percentage and description
75
- confidence_desc = ""
76
- if confidence is not None:
77
- confidence_pct = int(confidence * 100)
78
- if confidence < 0.3:
79
- confidence_desc = f"very low confidence ({confidence_pct}%)"
80
- elif confidence < 0.5:
81
- confidence_desc = f"low confidence ({confidence_pct}%)"
82
- elif confidence < 0.7:
83
- confidence_desc = f"moderate confidence ({confidence_pct}%)"
84
- elif confidence < 0.9:
85
- confidence_desc = f"high confidence ({confidence_pct}%)"
86
- else:
87
- confidence_desc = f"very high confidence ({confidence_pct}%)"
88
- else:
89
- # Determine confidence context from label if not explicitly provided
90
- confidence_desc = (
91
- "high confidence" if "High Confidence" in truth_label else
92
- "moderate confidence" if "Likely" in truth_label else
93
- "low confidence"
94
- )
95
-
96
- # Create prompt with specific instructions based on the type of claim
97
- has_negation = any(neg in claim.lower() for neg in ["not", "no longer", "isn't", "doesn't", "won't", "cannot"])
98
-
99
- # For claims with "True" verdict
100
- if "True" in truth_label:
101
- prompt = f"""
102
- Claim: "{claim}"
103
-
104
- Verdict: {truth_label} (with {confidence_desc})
105
-
106
- Available Evidence:
107
- {evidence_text}
108
-
109
- Task: Generate a clear explanation that:
110
- 1. Clearly states that the claim IS TRUE based on the evidence
111
- 2. {"Pay special attention to the logical relationship since the claim contains negation" if has_negation else "Explains why the evidence supports the claim"}
112
- 3. Uses confidence level of {confidence_desc}
113
- 4. Highlights the most relevant supporting evidence
114
- 5. Is factual and precise
115
- """
116
-
117
- # For claims with "False" verdict
118
- elif "False" in truth_label:
119
- prompt = f"""
120
- Claim: "{claim}"
121
-
122
- Verdict: {truth_label} (with {confidence_desc})
123
-
124
- Available Evidence:
125
- {evidence_text}
126
-
127
- Task: Generate a clear explanation that:
128
- 1. Clearly states that the claim IS FALSE based on the evidence
129
- 2. {"Pay special attention to the logical relationship since the claim contains negation" if has_negation else "Explains why the evidence contradicts the claim"}
130
- 3. Uses confidence level of {confidence_desc}
131
- 4. Highlights the contradicting evidence
132
- 5. Is factual and precise
133
-
134
- IMPORTANT: If the claim contains negation (words like 'not', 'no longer', etc.), be extra careful with the logical relationship between the evidence and the claim.
135
- """
136
-
137
- # For uncertain claims
138
- else:
139
- prompt = f"""
140
- Claim: "{claim}"
141
-
142
- Verdict: {truth_label} (with {confidence_desc})
143
-
144
- Available Evidence:
145
- {evidence_text}
146
-
147
- Task: Generate a clear explanation that:
148
- 1. Clearly states that there is insufficient evidence to determine if the claim is true or false
149
- 2. Explains what information is missing or why the available evidence is insufficient
150
- 3. Uses confidence level of {confidence_desc}
151
- 4. Makes NO speculation about whether the claim might be true or false
152
- 5. Mentions that the user should seek information from other reliable sources
153
- """
154
-
155
- # Generate explanation with multiple attempts
156
- max_attempts = 3
157
- for attempt in range(max_attempts):
158
- try:
159
- # Invoke the model
160
- response = explanation_model.invoke(prompt)
161
- explanation = response.content.strip()
162
-
163
- # Validate explanation length
164
- if explanation and len(explanation.split()) >= 5:
165
- return explanation
166
-
167
- except Exception as attempt_error:
168
- logger.error(f"Explanation generation attempt {attempt+1} failed: {str(attempt_error)}")
169
-
170
- # Ultimate fallback explanation
171
- if "Uncertain" in truth_label:
172
- return f"The claim '{claim}' cannot be verified due to insufficient evidence. The available information does not provide clear support for or against this claim. Consider consulting reliable sources for verification."
173
- elif "True" in truth_label:
174
- return f"The claim '{claim}' is supported by the evidence with {confidence_desc}. {most_relevant_evidence or 'The evidence indicates this claim is accurate.'}"
175
- else:
176
- return f"The claim '{claim}' is contradicted by the evidence with {confidence_desc}. {most_relevant_evidence or 'The evidence indicates this claim is not accurate.'}"
177
-
178
- except Exception as e:
179
- logger.error(f"Comprehensive error in explanation generation: {str(e)}")
180
- # Final fallback
181
- return f"The claim is classified as {truth_label} based on the available evidence."
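
The confidence wording in generate_explanation maps a 0-1 score onto five fixed buckets. The same mapping, pulled out as a standalone helper for illustration (thresholds copied from the function above):

    def describe_confidence(confidence):
        """Map a 0-1 confidence score to the descriptive buckets used above."""
        pct = int(confidence * 100)
        if confidence < 0.3:
            return f"very low confidence ({pct}%)"
        elif confidence < 0.5:
            return f"low confidence ({pct}%)"
        elif confidence < 0.7:
            return f"moderate confidence ({pct}%)"
        elif confidence < 0.9:
            return f"high confidence ({pct}%)"
        return f"very high confidence ({pct}%)"

    assert describe_confidence(0.82) == "high confidence (82%)"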
 
modules/rss_feed.py DELETED
@@ -1,391 +0,0 @@
1
- import feedparser
2
- import time
3
- import logging
4
- import re
5
- import ssl
6
- import requests
7
- from datetime import datetime, timedelta
8
- from threading import Timer
9
- from urllib.parse import urlparse
10
- from concurrent.futures import ThreadPoolExecutor, as_completed
11
-
12
- logger = logging.getLogger("misinformation_detector")
13
-
14
- # Disable SSL certificate verification for feeds with self-signed certs
15
- ssl._create_default_https_context = ssl._create_unverified_context
16
-
17
- # List of RSS feeds to check for news
18
- # These are popular news sources with reliable and frequently updated RSS feeds
19
- RSS_FEEDS = [
20
- # --------------------
21
- # 🌐 General World News
22
- # --------------------
23
- # "http://rss.cnn.com/rss/cnn_topstories.rss", # CNN Top Stories; Removed in round 2
24
- "http://rss.cnn.com/rss/cnn_world.rss", # CNN World News; Duplicate with category_detection
25
- # "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml", # NYT Home Page
26
- "https://rss.nytimes.com/services/xml/rss/nyt/World.xml", # NYT World News; Duplicate with category_detection
27
- # "https://rss.nytimes.com/services/xml/rss/nyt/US.xml", # NYT US News
28
- "https://feeds.washingtonpost.com/rss/world", # The Washington Post World News; Removed in round 2
29
- # "https://feeds.washingtonpost.com/rss/national", # The Washington Post National News
30
- # "https://feeds.bbci.co.uk/news/rss.xml", # BBC News - Top Stories; Removed in round 2
31
- "https://feeds.bbci.co.uk/news/world/rss.xml", # BBC News - World
32
- # "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=w&hl=en-IN", # Google News India - World; Removed in round 2
33
- # "https://news.google.com/rss?gl=US&ceid=US:en&topic=w&hl=en-US", # Google News US - World; Removed in round 2
34
-
35
- # --------------------
36
- # 🧠 Tech & Startup News (Global)
37
- # --------------------
38
- "https://techcrunch.com/feed/", # TechCrunch - Startup and Technology News; Duplicate with category_detection
39
- "https://venturebeat.com/feed/", # VentureBeat - Tech News
40
- # "https://www.theverge.com/rss/index.xml", # The Verge - Technology News
41
- "https://www.wired.com/feed/rss", # Wired - Technology News
42
- "https://www.cnet.com/rss/news/", # CNET - Technology News
43
- # "https://sifted.eu/feed/", # Sifted - European Startups and Tech
44
- # "https://feeds.feedburner.com/fastcompany/headlines", # Fast Company - Business Innovation
45
- # "https://feeds.bbci.co.uk/news/technology/rss.xml", # BBC News - Technology
46
- "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=t&hl=en-IN", # Google News India - Technology
47
- "https://news.google.com/rss?gl=US&ceid=US:en&topic=t&hl=en-US", # Google News US - Technology
48
-
49
- # --------------------
50
- # 💼 Startup & VC Focused
51
- # --------------------
52
- "https://news.crunchbase.com/feed/", # Crunchbase News - Startup Funding
53
- # "https://avc.com/feed/", # AVC - Musings of a VC in NYC
54
- "https://techstartups.com/feed/", # Tech Startups - Startup News
55
- # "https://tech.eu/feed/", # Tech.eu - European Tech News
56
- # "https://www.menabytes.com/feed/", # MENAbytes - Middle East & North Africa Startups
57
- # "http://feeds.feedburner.com/venturebeat/SZYF", # VentureBeat - Deals
58
-
59
- # --------------------
60
- # 📰 Global Business & Corporate Feeds
61
- # --------------------
62
- "https://feeds.bloomberg.com/technology/news.rss", # Bloomberg Technology News
63
- "https://www.ft.com/technology?format=rss", # Financial Times Technology News
64
- # "https://ir.thomsonreuters.com/rss/news-releases.xml", # Thomson Reuters Press Releases
65
- # "https://feeds.bbci.co.uk/news/business/rss.xml", # BBC News - Business
66
- "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=b&hl=en-IN", # Google News India - Business
67
- # "https://news.google.com/rss?gl=US&ceid=US:en&topic=b&hl=en-US", # Google News US - Business; Removed in round 2
68
-
69
- # --------------------
70
- # 🇮🇳 India-specific News
71
- # --------------------
72
- "https://inc42.com/feed/", # Inc42 - Indian Startups and Technology
73
- # "https://yourstory.com/rss", # YourStory - Indian Startup Stories
74
- # "https://economictimes.indiatimes.com/startups/rssfeeds/49979279.cms", # Economic Times - Startups
75
- "https://timesofindia.indiatimes.com/rssfeedstopstories.cms", # TOI - Top Stories
76
- "https://timesofindia.indiatimes.com/rssfeedmostrecent.cms", # TOI - Most Recent Stories
77
- "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms", # TOI - India News
78
- "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms", # TOI - World News
79
- "https://timesofindia.indiatimes.com/rssfeeds/1898055.cms", # TOI - Business News
80
- "https://timesofindia.indiatimes.com/rssfeeds/54829575.cms", # TOI - Cricket News
81
- "https://timesofindia.indiatimes.com/rssfeeds/4719148.cms", # TOI - Sports News
82
- "https://timesofindia.indiatimes.com/rssfeeds/-2128672765.cms", # TOI - Science News
83
- # "https://timesofindia.indiatimes.com/rssfeeds/66949542.cms", # TOI - Technology News
84
- # "https://timesofindia.indiatimes.com/rssfeeds/1081479906.cms", # TOI - Education News
85
-
86
- # --------------------
87
- # 🏏 Sports News (Global + Cricket)
88
- # --------------------
89
- "https://www.espn.com/espn/rss/news", # ESPN - Top Sports News; Duplicate with category_detection
90
- # "https://api.foxsports.com/v2/content/optimized-rss?partnerKey=MB0Wehpmuj2lUhuRhQaafhBjAJqaPU244mlTDK1i&size=30", # Fox Sports; Removed in round 2
91
- "https://feeds.skynews.com/feeds/rss/sports.xml", # Sky News - Sports
92
- "https://sports.ndtv.com/rss/all", # NDTV Sports
93
- "https://www.espncricinfo.com/rss/content/story/feeds/0.xml", # ESPN Cricinfo - Cricket News; Duplicate with category_detection
94
- # "https://crickettimes.com/feed/", # Cricket Times - Cricket News
95
-
96
- # --------------------
97
- # ✅ Fact-Checking Sources
98
- # --------------------
99
- "https://www.snopes.com/feed/", # Snopes - Fact Checking; Duplicate with category_detection
100
- "https://www.politifact.com/rss/all/", # PolitiFact - Fact Checking; Duplicate with category_detection
101
-
102
- # --------------------
103
- # 🗳️ Politics & Policy (General)
104
- # --------------------
105
- "https://feeds.bbci.co.uk/news/politics/rss.xml", # BBC News - Politics; Duplicate with category_detection
106
- "https://feeds.bbci.co.uk/news/science_and_environment/rss.xml", # BBC - Science & Environment
107
-
108
- # --------------------
109
- # 🗳️ Science
110
- # --------------------
111
- "https://www.nature.com/nature.rss", # Nature science; Duplicate with category_detection
112
- "https://feeds.science.org/rss/science-advances.xml" # science.org
113
- ]
114
-
115
- def clean_html(raw_html):
116
- """Remove HTML tags from text"""
117
- if not raw_html:
118
- return ""
119
- clean_regex = re.compile('<.*?>')
120
- clean_text = re.sub(clean_regex, '', raw_html)
121
- # Remove extra whitespace
122
- clean_text = re.sub(r'\s+', ' ', clean_text).strip()
123
- return clean_text
124
-
125
- def parse_feed(feed_url, timeout=5):
126
- """
127
- Parse a single RSS feed with proper timeout handling
128
- Uses requests with timeout first, then passes content to feedparser
129
- """
130
- try:
131
- # Use requests with timeout to fetch the RSS content
132
- response = requests.get(feed_url, timeout=timeout)
133
- response.raise_for_status()
134
-
135
- # Then parse the content with feedparser (which doesn't support timeout)
136
- feed = feedparser.parse(response.content)
137
-
138
- # Basic validation of the feed
139
- if hasattr(feed, 'entries') and feed.entries:
140
- return feed
141
- else:
142
- logger.warning(f"Feed {feed_url} parsed but contains no entries")
143
- return None
144
-
145
- except requests.exceptions.Timeout:
146
- logger.warning(f"Timeout while fetching feed {feed_url}")
147
- return None
148
- except requests.exceptions.RequestException as e:
149
- logger.error(f"Request error fetching feed {feed_url}: {str(e)}")
150
- return None
151
- except Exception as e:
152
- logger.error(f"Error parsing feed {feed_url}: {str(e)}")
153
- return None
154
-
155
- def fetch_all_feeds(feeds_list=None, max_workers=5, timeout=5):
156
- """
157
- Fetch multiple RSS feeds with proper timeout handling
158
- Returns a list of (domain, feed) tuples for successfully fetched feeds
159
- """
160
- # Use default RSS_FEEDS list if none provided
161
- if feeds_list is None:
162
- feeds_list = RSS_FEEDS
163
-
164
- results = []
165
-
166
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
167
- future_to_url = {executor.submit(parse_feed, url, timeout): url for url in feeds_list}
168
- for future in as_completed(future_to_url):
169
- url = future_to_url[future]
170
- try:
171
- feed = future.result()
172
- if feed and hasattr(feed, 'entries') and feed.entries:
173
- # Extract domain for source attribution
174
- domain = urlparse(url).netloc
175
- results.append((domain, feed))
176
- logger.info(f"Successfully fetched {domain} with {len(feed.entries)} entries")
177
- except Exception as e:
178
- logger.error(f"Error processing {url}: {str(e)}")
179
-
180
- return results
181
-
182
- def extract_date(entry):
183
- """Extract and normalize publication date from entry"""
184
- for date_field in ['published_parsed', 'updated_parsed', 'created_parsed']:
185
- if hasattr(entry, date_field) and getattr(entry, date_field):
186
- try:
187
- # Convert time tuple to datetime
188
- time_tuple = getattr(entry, date_field)
189
- return datetime(time_tuple[0], time_tuple[1], time_tuple[2],
190
- time_tuple[3], time_tuple[4], time_tuple[5])
191
- except Exception as e:
192
- logger.debug(f"Error parsing {date_field}: {e}")
193
- continue
194
-
195
- # Try string dates
196
- for date_field in ['published', 'updated', 'pubDate']:
197
- if hasattr(entry, date_field) and getattr(entry, date_field):
198
- try:
199
- date_str = getattr(entry, date_field)
200
- # Try various formats
201
- for fmt in ['%a, %d %b %Y %H:%M:%S %z', '%a, %d %b %Y %H:%M:%S %Z',
202
- '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z']:
203
- try:
204
- return datetime.strptime(date_str, fmt)
205
- except ValueError:
206
- continue
207
- except Exception as e:
208
- logger.debug(f"Error parsing date string {date_field}: {e}")
209
- continue
210
-
211
- # Default to current time if parsing fails
212
- return datetime.now()
213
-
214
- def is_recent(entry_date, max_days=3):
215
- """Check if an entry is recent (within the last few days)"""
216
- if not entry_date:
217
- return False
218
- cutoff = datetime.now() - timedelta(days=max_days)
219
- return entry_date > cutoff
220
-
221
- def get_entry_relevance(entry, query_terms, domain):
222
- """Calculate relevance score for an entry based on query match and recency"""
223
- if not hasattr(entry, 'title') or not entry.title:
224
- return 0
225
-
226
- # Extract text content
227
- title = entry.title or ""
228
- description = clean_html(entry.description) if hasattr(entry, 'description') else ""
229
- content = ""
230
- if hasattr(entry, 'content'):
231
- for content_item in entry.content:
232
- if 'value' in content_item:
233
- content += clean_html(content_item['value']) + " "
234
-
235
- # Extract published date
236
- pub_date = extract_date(entry)
237
-
238
- # Calculate recency score (0-1)
239
- recency_score = 0
240
- if pub_date:
241
- days_old = (datetime.now() - pub_date).days
242
- if days_old <= 1: # Today or yesterday
243
- recency_score = 1.0
244
- elif days_old <= 2:
245
- recency_score = 0.8
246
- elif days_old <= 3:
247
- recency_score = 0.5
248
- else:
249
- recency_score = 0.2
250
-
251
- # Calculate relevance score based on keyword matches
252
- text = f"{title} {description} {content}".lower()
253
-
254
- # Count how many query terms appear in the content
255
- query_terms_lower = [term.lower() for term in query_terms]
256
- matches = sum(1 for term in query_terms_lower if term in text)
257
-
258
- # Calculate match score (0-1)
259
- match_score = min(1.0, matches / max(1, len(query_terms) * 0.7))
260
-
261
- # Boost score for exact phrase matches
262
- query_phrase = " ".join(query_terms_lower)
263
- if query_phrase in text:
264
- match_score += 0.5
265
-
266
- # Additional boost for title matches (they're more relevant)
267
- title_matches = sum(1 for term in query_terms_lower if term in title.lower())
268
- if title_matches > 0:
269
- match_score += 0.2 * (title_matches / len(query_terms_lower))
270
-
271
- # Source quality factor (can be adjusted based on source reliability)
272
- source_factor = 1.0
273
- high_quality_domains = ['bbc.co.uk', 'nytimes.com', 'reuters.com', 'washingtonpost.com',
274
- 'espncricinfo.com', 'cricbuzz.com', 'snopes.com']
275
- if any(quality_domain in domain for quality_domain in high_quality_domains):
276
- source_factor = 1.2
277
-
278
- # Calculate final score; the source quality factor scales the combined score
279
- final_score = ((match_score * 0.6) + (recency_score * 0.4)) * source_factor
280
-
281
- return min(1.0, final_score) # Cap at 1.0
282
-
283
- def retrieve_evidence_from_rss(claim, max_results=3, category_feeds=None):
284
- """
285
- Retrieve evidence from RSS feeds for a given claim
286
-
287
- Args:
288
- claim (str): The claim to verify
289
- max_results (int): Maximum number of results to return
290
- category_feeds (list, optional): List of category-specific RSS feeds to check
291
-
292
- Returns:
293
- list: List of relevant evidence items
294
- """
295
- start_time = time.time()
296
- logger.info(f"Retrieving evidence from RSS feeds for: {claim}")
297
-
298
- # Extract key terms from claim
299
- terms = [term.strip() for term in re.findall(r'\b\w+\b', claim) if len(term.strip()) > 2]
300
-
301
- try:
302
- # Use category-specific feeds if provided
303
- feeds_to_use = category_feeds if category_feeds else RSS_FEEDS
304
-
305
- # Log which feeds we're using
306
- if category_feeds:
307
- logger.info(f"Using {len(category_feeds)} category-specific RSS feeds")
308
- else:
309
- logger.info(f"Using {len(RSS_FEEDS)} default RSS feeds")
310
-
311
- # Limit the number of feeds to process for efficiency
312
- if len(feeds_to_use) > 10:
313
- # If we have too many feeds, select a subset
314
- # Prioritize fact-checking sources
315
- fact_check_feeds = [feed for feed in feeds_to_use if "fact" in feed.lower() or "snopes" in feed.lower() or "politifact" in feed.lower()]
316
- other_feeds = [feed for feed in feeds_to_use if feed not in fact_check_feeds]
317
-
318
- # Take all fact-checking feeds plus a random selection of others
319
- import random
320
- selected_feeds = fact_check_feeds + random.sample(other_feeds, max(0, min(10 - len(fact_check_feeds), len(other_feeds))))
321
- else:
322
- selected_feeds = feeds_to_use
323
-
324
- # Fetch all feeds in parallel with the selected feeds
325
- feeds = fetch_all_feeds(selected_feeds)
326
-
327
- if not feeds:
328
- logger.warning("No RSS feeds could be fetched")
329
- return []
330
-
331
- all_entries = []
332
-
333
- # Process all feed entries
334
- for domain, feed in feeds:
335
- for entry in feed.entries:
336
- # Calculate relevance score
337
- relevance = get_entry_relevance(entry, terms, domain)
338
-
339
- if relevance > 0.3: # Only consider somewhat relevant entries
340
- # Extract entry details
341
- title = entry.title if hasattr(entry, 'title') else "No title"
342
- link = entry.link if hasattr(entry, 'link') else ""
343
-
344
- # Extract and clean description/content
345
- description = ""
346
- if hasattr(entry, 'description'):
347
- description = clean_html(entry.description)
348
- elif hasattr(entry, 'summary'):
349
- description = clean_html(entry.summary)
350
- elif hasattr(entry, 'content'):
351
- for content_item in entry.content:
352
- if 'value' in content_item:
353
- description += clean_html(content_item['value']) + " "
354
-
355
- # Truncate description if too long
356
- if len(description) > 250:
357
- description = description[:247] + "..."
358
-
359
- # Get publication date
360
- pub_date = extract_date(entry)
361
- date_str = pub_date.strftime('%Y-%m-%d') if pub_date else "Unknown date"
362
-
363
- # Format as evidence text
364
- evidence_text = (
365
- f"Title: {title}, "
366
- f"Source: {domain} (RSS), "
367
- f"Date: {date_str}, "
368
- f"URL: {link}, "
369
- f"Content: {description}"
370
- )
371
-
372
- all_entries.append({
373
- "text": evidence_text,
374
- "relevance": relevance,
375
- "date": pub_date or datetime.now()
376
- })
377
-
378
- # Sort entries by relevance
379
- all_entries.sort(key=lambda x: x["relevance"], reverse=True)
380
-
381
- # Take top results
382
- top_entries = all_entries[:max_results]
383
-
384
- logger.info(f"Retrieved {len(top_entries)} relevant RSS items from {len(feeds)} feeds in {time.time() - start_time:.2f}s")
385
-
386
- # Return just the text portion
387
- return [entry["text"] for entry in top_entries]
388
-
389
- except Exception as e:
390
- logger.error(f"Error in RSS retrieval: {str(e)}")
391
- return []
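
The requests-then-feedparser split in parse_feed exists because feedparser.parse() itself has no timeout parameter; fetching the bytes with requests first gives a hard timeout. A minimal standalone sketch of the same pattern, using one feed URL from the list above:

    import feedparser
    import requests

    def fetch_feed(url, timeout=5):
        """Fetch feed bytes with a hard timeout, then hand them to feedparser."""
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return feedparser.parse(response.content)

    feed = fetch_feed("https://feeds.bbci.co.uk/news/world/rss.xml")
    for entry in feed.entries[:3]:
        print(entry.get("title", "No title"))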
 
modules/semantic_analysis.py DELETED
@@ -1,503 +0,0 @@
1
- import logging
2
- import numpy as np
3
- from sklearn.metrics.pairwise import cosine_similarity
4
- from datetime import datetime, timedelta
5
- import re
6
-
7
- # Import the centralized NLP model handler
8
- from utils.models import get_nlp_model
9
-
10
- logger = logging.getLogger("misinformation_detector")
11
-
12
- def extract_entities(text):
13
- """Extract named entities from text"""
14
- if not text:
15
- return []
16
-
17
- try:
18
- # Use centralized NLP model
19
- nlp_model = get_nlp_model()
20
- doc = nlp_model(text)
21
- entities = [
22
- {
23
- "text": ent.text,
24
- "label": ent.label_,
25
- "start": ent.start_char,
26
- "end": ent.end_char
27
- }
28
- for ent in doc.ents
29
- ]
30
- return entities
31
- except Exception as e:
32
- logger.error(f"Error extracting entities: {str(e)}")
33
- return []
34
-
35
- def get_vector_representation(text):
36
- """Get vector representation of text using spaCy"""
37
- if not text:
38
- return None
39
-
40
- try:
41
- # Use centralized NLP model
42
- nlp_model = get_nlp_model()
43
- doc = nlp_model(text)
44
-
45
- # Return document vector if available
46
- if doc.has_vector:
47
- return doc.vector
48
-
49
- # Fallback: average of token vectors
50
- vectors = [token.vector for token in doc if token.has_vector]
51
- if vectors:
52
- return np.mean(vectors, axis=0)
53
-
54
- return None
55
- except Exception as e:
56
- logger.error(f"Error getting vector representation: {str(e)}")
57
- return None
58
-
59
- def calculate_similarity(text1, text2):
60
- """Calculate semantic similarity between two texts"""
61
- if not text1 or not text2:
62
- return 0.0
63
-
64
- try:
65
- vec1 = get_vector_representation(text1)
66
- vec2 = get_vector_representation(text2)
67
-
68
- if vec1 is None or vec2 is None:
69
- return 0.0
70
-
71
- # Reshape vectors for cosine_similarity
72
- vec1 = vec1.reshape(1, -1)
73
- vec2 = vec2.reshape(1, -1)
74
-
75
- # Calculate cosine similarity
76
- similarity = cosine_similarity(vec1, vec2)[0][0]
77
- return float(similarity)
78
- except Exception as e:
79
- logger.error(f"Error calculating similarity: {str(e)}")
80
- return 0.0
81
-
82
- def extract_date_from_evidence(evidence_text):
83
- """Extract date from evidence text"""
84
- if not evidence_text:
85
- return None
86
-
87
- try:
88
- # Look for date patterns in text
89
- date_patterns = [
90
- r'Date: (\d{4}-\d{2}-\d{2})', # ISO format
91
- r'published.*?(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})', # published on MM/DD/YYYY
92
- r'(\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})', # DD Month YYYY
93
- r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}' # Month DD, YYYY
94
- ]
95
-
96
- for pattern in date_patterns:
97
- match = re.search(pattern, evidence_text)
98
- if match:
99
- date_str = match.group(1)
100
- # Parse date string based on format
101
- try:
102
- if '-' in date_str:
103
- # Try ISO format first; on failure fall through to the other formats below
- try:
- return datetime.strptime(date_str, '%Y-%m-%d')
- except ValueError:
- pass
104
- if '/' in date_str or '-' in date_str:
105
- formats = ['%m/%d/%Y', '%d/%m/%Y', '%m-%d-%Y', '%d-%m-%Y']
106
- for fmt in formats:
107
- try:
108
- return datetime.strptime(date_str, fmt)
109
- except ValueError:
110
- continue
111
- else:
112
- # Try different month formats
113
- formats = ['%d %B %Y', '%B %d, %Y', '%B %d %Y']
114
- for fmt in formats:
115
- try:
116
- return datetime.strptime(date_str, fmt)
117
- except ValueError:
118
- continue
119
- except Exception:
120
- pass
121
-
122
- return None
123
- except Exception as e:
124
- logger.error(f"Error extracting date from evidence: {str(e)}")
125
- return None
126
-
127
- def is_temporally_relevant(evidence_text, claim_text, max_days_old=30):
128
- """Check if evidence is temporally relevant to the claim"""
129
- # Check if claim seems to require recent evidence
130
- temporal_terms = ["today", "now", "current", "currently", "recent", "recently", "latest", "just", "this week", "this month", "this year"]
131
- requires_recent = any(term in claim_text.lower() for term in temporal_terms)
132
-
133
- # If claim doesn't specify temporality, consider evidence relevant
134
- if not requires_recent:
135
- return True
136
-
137
- # Extract date from evidence
138
- date = extract_date_from_evidence(evidence_text)
139
- if not date:
140
- return True # If we can't determine date, assume it's relevant
141
-
142
- # Check if evidence is recent enough
143
- cutoff = datetime.now() - timedelta(days=max_days_old)
144
- return date >= cutoff
145
-
146
- def has_authority_signal(evidence_text):
147
- """Check if evidence contains authority signals"""
148
- authority_signals = {
149
- "scientific_consensus": ["consensus", "scientists agree", "research shows", "studies confirm", "experts agree"],
150
- "fact_check": ["fact check", "rated false", "rated true", "debunked", "confirmed", "verification"],
151
- "high_authority": ["nasa", "world health organization", "who", "cdc", "national academy",
152
- "oxford", "harvard", "stanford", "mit", "cambridge", "yale",
153
- "princeton", "government", "official", "authorities", "minister",
154
- "ministry", "department", "administration", "university", "professor"]
155
- }
156
-
157
- evidence_lower = evidence_text.lower()
158
-
159
- authority_type = None
160
- authority_score = 1.0
161
-
162
- for signal_type, phrases in authority_signals.items():
163
- if any(phrase in evidence_lower for phrase in phrases):
164
- if signal_type == "scientific_consensus":
165
- authority_score = 1.8
166
- authority_type = "scientific_consensus"
167
- elif signal_type == "fact_check":
168
- authority_score = 1.5
169
- authority_type = "fact_check"
170
- elif signal_type == "high_authority":
171
- authority_score = 1.3
172
- authority_type = "high_authority"
173
- break
174
-
175
- return authority_score, authority_type
176
-
177
- def analyze_evidence_relevance(claim, evidence_list, source_credibility=None):
178
- """
179
- Analyze evidence relevance to claim using semantic similarity with improved handling
180
- for claims requiring strong evidence
181
-
182
- Args:
183
- claim (str): The claim being verified
184
- evidence_list (list): List of evidence items
185
- source_credibility (dict): Dictionary mapping source domains to credibility scores
186
-
187
- Returns:
188
- list: Sorted list of evidence items with relevance scores
189
- """
190
- if not evidence_list:
191
- return []
192
-
193
- # Ensure evidence_list is a list of strings
194
- if not isinstance(evidence_list, list):
195
- evidence_list = [str(evidence_list)]
196
-
197
- # Filter out None or empty items
198
- evidence_list = [item for item in evidence_list if item]
199
-
200
- # Check if claim contains strong assertions that would require specific evidence
201
- strong_assertion_markers = [
202
- "solved", "cured", "discovered", "breakthrough", "revolutionary",
203
- "first ever", "confirmed", "definitive", "conclusive", "proven",
204
- "groundbreaking", "unprecedented", "remarkable", "extends lifespan",
205
- "extends life", "definitively", "successfully"
206
- ]
207
-
208
- # Determine if claim contains strong assertions
209
- claim_has_strong_assertions = any(marker in claim.lower() for marker in strong_assertion_markers)
210
-
211
- # Log detection result
212
- if claim_has_strong_assertions:
213
- logger.info(f"Evidence analysis: Detected claim with strong assertions requiring specific evidence")
214
-
215
- # Extract named entities from claim
216
- claim_entities = extract_entities(claim)
217
- claim_entity_texts = [entity["text"].lower() for entity in claim_entities]
218
-
219
- # Process each evidence item
220
- analyzed_evidence = []
221
-
222
- # Track domains found in evidence to identify source diversity
223
- found_domains = set()
224
-
225
- for evidence in evidence_list:
226
- if not isinstance(evidence, str):
227
- continue
228
-
229
- # Calculate semantic similarity
230
- similarity = calculate_similarity(claim, evidence)
231
-
232
- # Check for entity overlap
233
- evidence_entities = extract_entities(evidence)
234
- evidence_entity_texts = [entity["text"].lower() for entity in evidence_entities]
235
-
236
- # Calculate entity overlap
237
- common_entities = set(claim_entity_texts).intersection(set(evidence_entity_texts))
238
- entity_overlap = len(common_entities) / max(1, len(claim_entity_texts))
239
-
240
- # Check temporal relevance
241
- temporal_relevance = 1.0
242
- if is_temporally_relevant(evidence, claim):
243
- temporal_relevance = 1.2
244
- else:
245
- # Penalty for temporally irrelevant evidence
246
- temporal_relevance = 0.7
247
-
248
- # Check for authority signals
249
- authority_score, authority_type = has_authority_signal(evidence)
250
-
251
- # Extract source from evidence if available
252
- source_boost = 1.0
253
- domain = None
254
-
255
- if source_credibility:
256
- # Try to extract domain from URL in evidence
257
- domain_match = re.search(r'URL: https?://(?:www\.)?([^/]+)', evidence)
258
- if domain_match:
259
- domain = domain_match.group(1)
260
- # Check if domain or its parent domain is in credibility list
261
- for cred_domain, cred_score in source_credibility.items():
262
- if cred_domain in domain:
263
- try:
264
- source_boost = float(cred_score)
265
- break
266
- except (ValueError, TypeError):
267
- pass
268
-
269
- # Track this domain for source diversity
270
- if domain:
271
- found_domains.add(domain)
272
-
273
- # For claims with strong assertions: check if evidence specifically addresses assertions
274
- claim_specificity_match = 1.0
275
- evidence_specificity_match = 1.0
276
-
277
- if claim_has_strong_assertions:
278
- # Check if evidence provides specific confirmation or contradiction
279
- direct_contradiction_terms = [
280
- "not yet", "has not", "have not", "cannot", "can't", "doesn't", "don't",
281
- "unlikely", "challenging", "remains a challenge", "in the future",
282
- "experimental", "in development", "proposed", "theoretical",
283
- "preliminary", "hypothesized", "potential", "promising but"
284
- ]
285
-
286
- # Check for contradictions to strong assertions
287
- if any(term in evidence.lower() for term in direct_contradiction_terms):
288
- # This evidence likely contradicts the strong assertion
289
- evidence_specificity_match = 2.0 # Boost relevance of contradicting evidence
290
- logger.debug(f"Found contradiction to strong assertion in evidence")
291
-
292
- # For claims with strong assertions, check if evidence specifically confirms
293
- direct_confirmation_terms = [
294
- "successfully demonstrated", "breakthrough", "solved", "cured",
295
- "confirmed", "definitive evidence", "conclusive results", "proven",
296
- "revolutionary results", "milestone achievement", "groundbreaking results"
297
- ]
298
-
299
- # If evidence confirms the strong assertion, adjust relevance
300
- if any(term in evidence.lower() for term in direct_confirmation_terms):
301
- # Apply higher scoring for evidence that specifically confirms
302
- evidence_specificity_match = 1.8
303
- logger.debug(f"Found confirmation of strong assertion in evidence")
304
-
305
- # For claims with strong assertions, check for high-quality sources
306
- high_quality_source_markers = [
307
- "journal", "doi.org", "research", "university", "institute",
308
- "laboratory", "professor", "study", "publication", "published in"
309
- ]
310
-
311
- is_high_quality = any(term in evidence.lower() for term in high_quality_source_markers)
312
- quality_boost = 1.4 if is_high_quality else 1.0
313
-
314
- # Apply the quality boost
315
- source_boost *= quality_boost
316
-
317
- # Calculate final relevance score with improvements for all claim types
318
- if claim_has_strong_assertions:
319
- relevance_score = (
320
- (similarity * 0.35) + # Semantic similarity
321
- (entity_overlap * 0.25) + # Entity overlap
322
- (0.25) # Base value to ensure all evidence has some relevance
323
- ) * temporal_relevance * authority_score * source_boost * claim_specificity_match * evidence_specificity_match
324
- else:
325
- # Original formula for regular claims
326
- relevance_score = (
327
- (similarity * 0.4) + # Semantic similarity
328
- (entity_overlap * 0.3) + # Entity overlap
329
- (0.3) # Base value to ensure all evidence has some relevance
330
- ) * temporal_relevance * authority_score * source_boost
331
-
332
- # Add metadata and relevance score
333
- analyzed_evidence.append({
334
- "text": evidence,
335
- "relevance_score": relevance_score,
336
- "similarity": similarity,
337
- "entity_overlap": entity_overlap,
338
- "temporal_relevance": temporal_relevance,
339
- "authority_score": authority_score,
340
- "authority_type": authority_type,
341
- "source_boost": source_boost,
342
- "domain": domain
343
- })
344
-
345
- # Sort by relevance score (descending)
346
- analyzed_evidence.sort(key=lambda x: x["relevance_score"], reverse=True)
347
-
348
- # Ensure we have diverse sources in top results for all claims
349
- if len(found_domains) > 1:
350
- # Try to promote evidence from reliable sources if we haven't selected any yet
351
- reliable_sources_seen = False
352
-
353
- # Check if top 3 results contain any reliable sources
354
- for item in analyzed_evidence[:3]:
355
- domain = item.get("domain", "")
356
- if domain and source_credibility and any(cred_domain in domain for cred_domain in source_credibility):
357
- reliable_sources_seen = True
358
- break
359
-
360
- # If no reliable sources in top results, promote one if available
361
- if not reliable_sources_seen:
362
- for i, item in enumerate(analyzed_evidence[3:]):
363
- domain = item.get("domain", "")
364
- if domain and source_credibility and any(cred_domain in domain for cred_domain in source_credibility):
365
- # Swap this item into the top 3
366
- analyzed_evidence.insert(2, analyzed_evidence.pop(i+3))
367
- break
368
-
369
- return analyzed_evidence
370
-
371
- def select_diverse_evidence(analyzed_evidence, max_items=5):
372
- """
373
- Select diverse evidence items based on relevance, source diversity and claim characteristics
374
-
375
- Args:
376
- analyzed_evidence (list): List of evidence items with relevance scores
377
- max_items (int): Maximum number of evidence items to return
378
-
379
- Returns:
380
- list: Selected diverse evidence items
381
- """
382
- if not analyzed_evidence:
383
- return []
384
-
385
- # Check if top evidence suggests claim has strong assertions
386
- strong_assertion_markers = [
387
- "solved", "cured", "discovered", "breakthrough", "revolutionary",
388
- "first ever", "confirmed", "definitive", "conclusive", "proven",
389
- "groundbreaking", "unprecedented", "extends lifespan", "definitively"
390
- ]
391
-
392
- # Determine if this is a claim with strong assertions by checking evidence text
393
- has_strong_assertions = False
394
-
395
- for item in analyzed_evidence[:3]: # Check just the top items for efficiency
396
- if "text" in item:
397
- item_text = item["text"].lower()
398
- if any(f"claim {marker}" in item_text or f"claim has {marker}" in item_text
399
- for marker in strong_assertion_markers):
400
- has_strong_assertions = True
401
- break
402
-
403
- # Also check for contradiction markers in evidence which can indicate a strong assertion
404
- contradiction_markers = [
405
- "not yet solved", "hasn't been proven", "no evidence that",
406
- "remains unsolved", "has not been confirmed", "remains theoretical"
407
- ]
408
-
409
- if not has_strong_assertions:
410
- for item in analyzed_evidence[:3]:
411
- if "text" in item:
412
- item_text = item["text"].lower()
413
- if any(marker in item_text for marker in contradiction_markers):
414
- has_strong_assertions = True
415
- break
416
-
417
- # Ensure we don't select more than available
418
- max_items = min(max_items, len(analyzed_evidence))
419
-
420
- # Initialize selected items with the most relevant item
421
- selected = [analyzed_evidence[0]]
422
- remaining = analyzed_evidence[1:]
423
-
424
- # Track sources to ensure diversity
425
- selected_sources = set()
426
- for item in selected:
427
- # Try to extract source from evidence
428
- source_match = re.search(r'Source: ([^,]+)', item["text"])
429
- if source_match:
430
- selected_sources.add(source_match.group(1))
431
-
432
- # For all claims, track if we have high-quality sources yet
433
- has_quality_source = False
434
- quality_source_markers = ["journal", "doi.org", "research", "university",
435
- "institute", "laboratory", "professor", "study"]
436
-
437
- # Check if our top item is already from a quality source
438
- if any(marker in selected[0]["text"].lower() for marker in quality_source_markers):
439
- has_quality_source = True
440
-
441
- # Select remaining items balancing relevance and diversity
442
- while len(selected) < max_items and remaining:
443
- best_item = None
444
- best_score = -1
445
-
446
- for i, item in enumerate(remaining):
447
- # Base score is the item's relevance
448
- score = item["relevance_score"]
449
-
450
- # Extract source if available
451
- source = None
452
- source_match = re.search(r'Source: ([^,]+)', item["text"])
453
- if source_match:
454
- source = source_match.group(1)
455
-
456
- # Apply diversity bonus if source is new
457
- if source and source not in selected_sources:
458
- score *= 1.2 # Diversity bonus
459
-
460
- # For claims with strong assertions, apply bonus for contradicting evidence
461
- if has_strong_assertions:
462
- # Check for contradiction markers in the text
463
- if any(marker in item["text"].lower() for marker in contradiction_markers):
464
- score *= 1.3 # Bonus for evidence that may contradict strong assertions
465
-
466
- # For any claim, apply bonus for high-quality sources if we don't have one yet
467
- if not has_quality_source:
468
- is_item_quality = any(marker in item["text"].lower() for marker in quality_source_markers)
469
- if is_item_quality:
470
- score *= 1.5 # Significant bonus for quality sources
471
-
472
- if score > best_score:
473
- best_score = score
474
- best_item = (i, item)
475
-
476
- if best_item:
477
- idx, item = best_item
478
- selected.append(item)
479
- remaining.pop(idx)
480
-
481
- # Add source to selected sources
482
- source_match = re.search(r'Source: ([^,]+)', item["text"])
483
- if source_match:
484
- selected_sources.add(source_match.group(1))
485
-
486
- # Check if we found a quality source
487
- if not has_quality_source:
488
- if any(marker in item["text"].lower() for marker in quality_source_markers):
489
- has_quality_source = True
490
- else:
491
- break
492
-
493
- # For any claim with strong assertions, ensure we have at least one quality source if available
494
- if has_strong_assertions and not has_quality_source and remaining:
495
- for i, item in enumerate(remaining):
496
- if any(marker in item["text"].lower() for marker in quality_source_markers):
497
- # Replace the least relevant selected item with this quality one
498
- selected.sort(key=lambda x: x["relevance_score"])
499
- selected[0] = item
500
- break
501
-
502
- # Return only the text portion
503
- return [item["text"] for item in selected]
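
The similarity machinery in this module reduces to spaCy document vectors plus scikit-learn's cosine_similarity. A minimal sketch, assuming a spaCy model with word vectors is installed (en_core_web_md here; the small en_core_web_sm model ships without static vectors, so its similarities are unreliable):

    import spacy
    from sklearn.metrics.pairwise import cosine_similarity

    nlp = spacy.load("en_core_web_md")  # assumes this vectored model is installed

    def similarity(text1, text2):
        """Cosine similarity between spaCy document vectors."""
        v1 = nlp(text1).vector.reshape(1, -1)
        v2 = nlp(text2).vector.reshape(1, -1)
        if not v1.any() or not v2.any():
            return 0.0  # guard against all-zero vectors (out-of-vocabulary text)
        return float(cosine_similarity(v1, v2)[0][0])

    print(similarity("The earth orbits the sun", "Our planet revolves around the sun"))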