Upload 12 files
- modules/__init__.py +19 -0
- modules/category_detection.py +880 -0
- modules/claim_extraction.py +236 -0
- modules/classification.py +521 -0
- modules/evidence_retrieval.py +944 -0
- modules/explanation.py +181 -0
- modules/rss_feed.py +391 -0
- modules/semantic_analysis.py +503 -0
- utils/__init__.py +20 -0
- utils/api_utils.py +229 -0
- utils/models.py +157 -0
- utils/performance.py +135 -0
modules/__init__.py
ADDED
@@ -0,0 +1,19 @@
"""
Modules package initialization.

This package contains the core modules for the AskVeracity fact-checking system.
"""

from .claim_extraction import extract_claims, shorten_claim_for_evidence
from .evidence_retrieval import retrieve_combined_evidence
from .classification import classify_with_llm, aggregate_evidence
from .explanation import generate_explanation

__all__ = [
    'extract_claims',
    'shorten_claim_for_evidence',
    'retrieve_combined_evidence',
    'classify_with_llm',
    'aggregate_evidence',
    'generate_explanation'
]
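For orientation, a minimal sketch of how a caller might wire these exports together. Only the names are confirmed by this __init__.py; the call shapes below are assumptions for illustration, since the real signatures live in the modules added in this upload:

from modules import (
    extract_claims, retrieve_combined_evidence,
    classify_with_llm, aggregate_evidence, generate_explanation,
)

claim = extract_claims("A long news paragraph claiming something happened...")
evidence = retrieve_combined_evidence(claim)          # assumed: claim -> evidence list
classifications = classify_with_llm(claim, evidence)  # assumed argument order
verdict = aggregate_evidence(classifications)         # assumed: labels -> verdict
explanation = generate_explanation(claim, verdict)    # assumed argument order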
modules/category_detection.py
ADDED
@@ -0,0 +1,880 @@
import logging
import re
from typing import Tuple, List, Dict, Optional
import os
import time

# Set up logging
logger = logging.getLogger("misinformation_detector")

# Define categories and their keywords
CLAIM_CATEGORIES = {
    "ai": [
        # General AI terms
        "AI", "artificial intelligence", "machine learning", "ML", "deep learning", "DL",
        "neural network", "neural nets", "generative AI", "GenAI", "AGI", "artificial general intelligence",
        "transformer", "attention mechanism", "fine-tuning", "pre-training", "training", "inference",

        # AI Models and Architectures
        "language model", "large language model", "LLM", "foundation model", "multimodal model",
        "vision language model", "VLM", "text-to-speech", "TTS", "speech-to-text", "STT",
        "text-to-image", "image-to-text", "diffusion model", "generative model", "discriminative model",
        "GPT", "BERT", "T5", "PaLM", "Claude", "Llama", "Gemini", "Mistral", "Mixtral", "Stable Diffusion",
        "Dall-E", "Midjourney", "Sora", "transformer", "MoE", "mixture of experts", "sparse model",
        "dense model", "encoder", "decoder", "encoder-decoder", "autoencoder", "VAE",
        "mixture of experts", "MoE", "sparse MoE", "switch transformer", "gated experts",
        "routing network", "expert routing", "pathways", "multi-query attention", "multi-head attention",
        "rotary position embedding", "RoPE", "grouped-query attention", "GQA", "flash attention",
        "state space model", "SSM", "mamba", "recurrent neural network", "RNN", "LSTM", "GRU",
        "convolutional neural network", "CNN", "residual connection", "skip connection", "normalization",
        "layer norm", "group norm", "batch norm", "parameter efficient fine-tuning", "PEFT",
        "LoRA", "low-rank adaptation", "QLoRA", "adapters", "prompt tuning", "prefix tuning",

        # AI Learning Paradigms
        "supervised learning", "unsupervised learning", "reinforcement learning", "RL",
        "meta-learning", "transfer learning", "federated learning", "self-supervised learning",
        "semi-supervised learning", "few-shot learning", "zero-shot learning", "one-shot learning",
        "contrastive learning", "curriculum learning", "imitation learning", "active learning",
        "reinforcement learning from human feedback", "RLHF", "direct preference optimization", "DPO",
        "constitutional AI", "red teaming", "adversarial training", "GAN", "generative adversarial network",
        "diffusion", "latent diffusion", "flow-based model", "variational autoencoder", "VAE",

        # AI Capabilities and Applications
        "natural language processing", "NLP", "computer vision", "CV", "speech recognition",
        "text generation", "image generation", "video generation", "multimodal", "multi-modal",
        "recommendation system", "recommender system", "chatbot", "conversational AI",
        "sentiment analysis", "entity recognition", "semantic search", "vector search", "embedding",
        "classification", "regression", "clustering", "anomaly detection", "agent", "AI agent",
        "autonomous agent", "agentic", "RAG", "retrieval augmented generation", "tool use",
        "function calling", "reasoning", "chain-of-thought", "CoT", "tree-of-thought", "ToT",
        "planning", "decision making", "multi-agent", "agent swarm", "multi-agent simulation",

        # AI Technical Terms
        "token", "tokenizer", "tokenization", "embedding", "vector", "prompt", "prompt engineering",
        "context window", "parameter", "weights", "bias", "activation function", "loss function",
        "gradient descent", "backpropagation", "epoch", "batch", "mini-batch", "regularization",
        "dropout", "overfitting", "underfitting", "hyperparameter", "latent space", "latent variable",
        "feature extraction", "dimensionality reduction", "optimization", "quantization", "pruning",
        "fine-tuning", "transfer learning", "knowledge distillation", "int4", "int8", "bfloat16",
        "float16", "mixed precision", "GPTQ", "AWQ", "GGUF", "GGML", "KV cache", "speculative decoding",
        "beam search", "greedy decoding", "temperature", "top-k", "top-p", "nucleus sampling",

        # AI Tools and Frameworks
        "TensorFlow", "PyTorch", "JAX", "Keras", "Hugging Face", "Transformers", "Diffusers",
        "LangChain", "Llama Index", "OpenAI", "Anthropic", "NVIDIA", "GPU", "TPU", "IPU", "NPU", "CUDA",
        "MLOps", "model monitoring", "model deployment", "model serving", "inference endpoint",
        "vLLM", "TGI", "text generation inference", "triton", "onnx", "tensorRT",

        # AI Ethics and Concerns
        "AI ethics", "responsible AI", "AI safety", "AI alignment", "AI governance",
        "bias", "fairness", "interpretability", "explainability", "XAI", "transparency",
        "hallucination", "toxicity", "safe deployment", "AI risk", "AI capabilities",
        "alignment tax", "red teaming", "jailbreak", "prompt injection", "data poisoning",

        # AI Companies and Organizations
        "OpenAI", "Anthropic", "Google DeepMind", "Meta AI", "Microsoft", "NVIDIA",
        "Hugging Face", "Mistral AI", "Cohere", "AI21 Labs", "Stability AI", "Midjourney",
        "EleutherAI", "Allen AI", "DeepMind", "Character AI", "Inflection AI", "xAI"
    ],

    "science": [
        # General scientific terms
        "study", "research", "scientist", "scientific", "discovered", "experiment",
        "laboratory", "clinical", "trial", "hypothesis", "theory", "evidence-based",
        "peer-reviewed", "journal", "publication", "finding", "breakthrough", "innovation",
        "discovery", "analysis", "data", "measurement", "observation", "empirical",

        # Biology and medicine
        "biology", "chemistry", "physics", "genetics", "genomics", "DNA", "RNA",
        "medicine", "gene", "protein", "molecule", "cell", "brain", "neuro",
        "cancer", "disease", "cure", "treatment", "vaccine", "health", "medical",
        "pharmaceutical", "drug", "therapy", "symptom", "diagnosis", "prognosis",
        "patient", "doctor", "hospital", "clinic", "surgery", "immune", "antibody",
        "virus", "bacteria", "pathogen", "infection", "epidemic", "pandemic",
        "organism", "evolution", "mutation", "chromosome", "enzyme", "hormone",

        # Physics and astronomy
        "quantum", "particle", "atom", "nuclear", "electron", "neutron", "proton",
        "atomic", "subatomic", "molecular", "energy", "matter", "mass", "force",
        "space", "NASA", "telescope", "planet", "exoplanet", "moon", "lunar", "mars",
        "star", "galaxy", "cosmic", "astronomical", "universe", "solar", "celestial",
        "orbit", "gravitational", "gravity", "relativity", "quantum mechanics",
        "string theory", "dark matter", "dark energy", "black hole", "supernova",
        "radiation", "radioactive", "isotope", "fission", "fusion", "accelerator",

        # Environmental science
        "climate", "carbon", "environment", "ecosystem", "species", "extinct",
        "endangered", "biodiversity", "conservation", "sustainable", "renewable",
        "fossil fuel", "greenhouse", "global warming", "polar", "ice cap", "glacier",
        "ozone", "atmosphere", "weather", "meteorology", "geology", "earthquake",
        "volcanic", "ocean", "marine", "coral reef", "deforestation", "pollution",

        # Math and computer science (non-AI specific)
        "equation", "formula", "theorem", "calculus", "statistical", "probability",
        "dataset", "parameter", "variable", "function", "matrix", "optimization",

        # Organizations
        "CERN", "NIH", "CDC", "WHO", "NOAA", "ESA", "SpaceX", "Blue Origin", "JPL",
        "laboratory", "institute", "university", "academic", "faculty", "professor",

        # Science tools
        "Matlab", "SPSS", "SAS", "ImageJ", "LabVIEW", "ANSYS", "Cadence", "Origin",
        "Avogadro", "ChemDraw", "Mathematica", "Wolfram Alpha", "COMSOL", "LAMMPS",
        "VASP", "Gaussian", "GIS", "ArcGIS", "QGIS", "Maple", "R Studio"
    ],

    "technology": [
        # General tech terms
        "computer", "software", "hardware", "internet", "cyber", "digital", "tech",
        "robot", "automation", "autonomous", "code", "programming", "data", "cloud",
        "server", "network", "encryption", "blockchain", "crypto", "bitcoin", "ethereum",
        "technology", "innovation", "breakthrough", "prototype", "development",
        "engineering", "technical", "specification", "feature", "functionality",
        "interface", "system", "infrastructure", "integration", "implementation",

        # Devices and hardware
        "smartphone", "device", "gadget", "laptop", "desktop", "tablet", "wearable",
        "smartwatch", "IoT", "internet of things", "sensor", "chip", "semiconductor",
        "processor", "CPU", "GPU", "memory", "RAM", "storage", "hard drive", "SSD",
        "electronic", "circuit", "motherboard", "component", "peripheral", "accessory",
        "display", "screen", "touchscreen", "camera", "lens", "microphone", "speaker",
        "battery", "charger", "wireless", "bluetooth", "WiFi", "router", "modem",

        # Software and internet
        "app", "application", "platform", "website", "online", "web", "browser",
        "operating system", "Windows", "macOS", "Linux", "Android", "iOS", "software",
        "program", "code", "coding", "development", "framework", "library", "API",
        "interface", "backend", "frontend", "full-stack", "developer", "programmer",
        "database", "SQL", "NoSQL", "cloud computing", "SaaS", "PaaS", "IaaS",
        "DevOps", "agile", "scrum", "sprint", "version control", "git", "repository",

        # Communications and networking
        "5G", "6G", "broadband", "fiber", "network", "wireless", "cellular", "mobile",
        "telecommunications", "telecom", "transmission", "bandwidth", "latency",
        "protocol", "IP address", "DNS", "server", "hosting", "data center",

        # Company and product names
        "Apple", "Google", "Microsoft", "Amazon", "Facebook", "Meta", "Tesla",
        "IBM", "Intel", "AMD", "Nvidia", "Qualcomm", "Cisco", "Oracle", "SAP",
        "Huawei", "Samsung", "Sony", "LG", "Dell", "HP", "Lenovo", "Xiaomi",
        "iPhone", "iPad", "MacBook", "Surface", "Galaxy", "Pixel", "Windows",
        "Android", "iOS", "Chrome", "Firefox", "Edge", "Safari", "Office",
        "Azure", "AWS", "Google Cloud", "Gmail", "Outlook", "Teams", "Zoom",

        # Advanced technologies
        "VR", "AR", "XR", "virtual reality", "augmented reality", "mixed reality",
        "metaverse", "3D printing", "additive manufacturing", "quantum computing",
        "nanotechnology", "biotechnology", "electric vehicle", "self-driving",
        "autonomous vehicle", "drone", "UAV", "robotics", "cybersecurity",

        # Social media
        "social media", "social network", "Facebook", "Instagram", "Twitter", "X",
        "LinkedIn", "TikTok", "Snapchat", "YouTube", "Pinterest", "Reddit",
        "streaming", "content creator", "influencer", "follower", "like", "share",
        "post", "tweet", "user-generated", "viral", "trending", "engagement",

        # Technology tools
        "NumPy", "Pandas", "Matplotlib", "Seaborn", "Scikit-learn", "Jupyter",
        "Visual Studio", "VS Code", "IntelliJ", "PyCharm", "Eclipse", "Android Studio",
        "Xcode", "Docker", "Kubernetes", "Jenkins", "Ansible", "Terraform", "Vagrant",
        "AWS CLI", "Azure CLI", "GCP CLI", "PowerShell", "Bash", "npm", "pip", "conda",
        "React", "Angular", "Vue.js", "Node.js", "Django", "Flask", "Spring", "Laravel",
        "PostgreSQL", "MySQL", "MongoDB", "Redis", "Elasticsearch", "Kafka", "RabbitMQ",

        # Optimization terms
        "optimization", "efficiency", "performance tuning", "benchmarking", "profiling",
        "refactoring", "scaling", "bottleneck", "throughput", "latency reduction",
        "response time", "caching", "load balancing", "distributed computing",
        "parallel processing", "concurrency", "asynchronous", "memory management"
    ],

    "politics": [
        # Government structure
        "president", "prime minister", "government", "parliament", "congress",
        "senate", "house", "representative", "minister", "secretary", "cabinet",
        "administration", "mayor", "governor", "politician", "official", "authority",
        "federal", "state", "local", "municipal", "county", "city", "town",
        "constituency", "district", "precinct", "ward", "judiciary", "executive",
        "legislative", "branch", "checks and balances", "separation of powers",

        # Political activities
        "policy", "election", "campaign", "vote", "voter", "ballot", "polling",
        "political", "politics", "debate", "speech", "address", "press conference",
        "approval rating", "opinion poll", "candidate", "incumbent", "challenger",
        "primary", "caucus", "convention", "delegate", "nomination", "campaign trail",
        "fundraising", "lobbying", "advocacy", "activism", "protest", "demonstration",

        # Political ideologies
        "democracy", "democratic", "republican", "conservative", "liberal",
        "progressive", "left-wing", "right-wing", "centrist", "moderate",
        "socialist", "capitalist", "communist", "libertarian", "populist",
        "nationalist", "globalist", "isolationist", "hawk", "dove",
        "ideology", "partisan", "bipartisan", "coalition", "majority", "minority",

        # Laws and regulations
        "bill", "law", "legislation", "regulation", "policy", "statute", "code",
        "amendment", "reform", "repeal", "enact", "implement", "enforce",
        "constitutional", "unconstitutional", "legal", "illegal", "legalize",
        "criminalize", "deregulate", "regulatory", "compliance", "mandate",

        # Judicial and legal
        "court", "supreme", "justice", "judge", "ruling", "decision", "opinion",
        "case", "lawsuit", "litigation", "plaintiff", "defendant", "prosecutor",
        "attorney", "lawyer", "advocate", "judicial review", "precedent",
        "constitution", "amendment", "rights", "civil rights", "human rights",

        # International relations
        "treaty", "international", "diplomatic", "diplomacy", "relations",
        "foreign policy", "domestic policy", "UN", "NATO", "EU", "United Nations",
        "sanctions", "embargo", "tariff", "trade war", "diplomat", "embassy",
        "consulate", "ambassador", "delegation", "summit", "bilateral", "multilateral",
        "alliance", "ally", "adversary", "geopolitical", "sovereignty", "regime",

        # Security and defense
        "national security", "homeland security", "defense", "military", "armed forces",
        "army", "navy", "air force", "marines", "coast guard", "intelligence",
        "CIA", "FBI", "NSA", "Pentagon", "war", "conflict", "peacekeeping",
        "terrorism", "counterterrorism", "insurgency", "nuclear weapon", "missile",
        "disarmament", "nonproliferation", "surveillance", "espionage",

        # Political institutions
        "White House", "Kremlin", "Downing Street", "Capitol Hill", "Westminster",
        "United Nations", "European Union", "NATO", "World Bank", "IMF", "WTO",
        "ASEAN", "African Union", "BRICS", "G7", "G20",

        # Political parties and movements
        "Democrat", "Republican", "Labour", "Conservative", "Green Party",
        "Socialist", "Communist", "Libertarian", "Independent", "Tea Party",
        "progressive movement", "civil rights movement", "womens rights",
        "LGBTQ rights", "Black Lives Matter", "environmental movement"
    ],

    "business": [
        # Companies and organization types
        "company", "corporation", "business", "startup", "firm", "enterprise",
        "corporate", "industry", "sector", "conglomerate", "multinational",
        "organization", "entity", "private", "public", "incorporated", "LLC",
        "partnership", "proprietorship", "franchise", "subsidiary", "parent company",
        "headquarters", "office", "facility", "plant", "factory", "warehouse",
        "retail", "wholesale", "ecommerce", "brick-and-mortar", "chain", "outlet",

        # Business roles and management
        "executive", "CEO", "CFO", "CTO", "COO", "CMO", "CIO", "CHRO", "chief",
        "director", "board", "chairman", "chairwoman", "chairperson", "president",
        "vice president", "senior", "junior", "manager", "management", "supervisor",
        "founder", "entrepreneur", "owner", "shareholder", "stakeholder",
        "employee", "staff", "workforce", "personnel", "human resources", "HR",
        "recruit", "hire", "layoff", "downsizing", "restructuring", "reorganization",

        # Financial terms
        "profit", "revenue", "sales", "income", "earnings", "EBITDA", "turnover",
        "loss", "deficit", "expense", "cost", "overhead", "margin", "markup",
        "budget", "forecast", "projection", "estimate", "actual", "variance",
        "balance sheet", "income statement", "cash flow", "P&L", "liquidity",
        "solvency", "asset", "liability", "equity", "debt", "leverage", "capital",
        "working capital", "cash", "funds", "money", "payment", "transaction",

        # Markets and trading
        "market", "stock", "share", "bond", "security", "commodity", "futures",
        "option", "derivative", "forex", "foreign exchange", "currency", "crypto",
        "trader", "trading", "buy", "sell", "long", "short", "position", "portfolio",
        "diversification", "hedge", "risk", "return", "yield", "dividend", "interest",
        "bull market", "bear market", "correction", "crash", "rally", "volatile",
        "volatility", "index", "benchmark", "Dow Jones", "NASDAQ", "S&P 500", "NYSE",

        # Investment and funding
        "investor", "investment", "fund", "mutual fund", "ETF", "hedge fund",
        "private equity", "venture", "venture capital", "VC", "angel investor",
        "seed", "Series A", "Series B", "Series C", "funding", "financing",
        "loan", "credit", "debt", "equity", "fundraising", "crowdfunding",
        "IPO", "initial public offering", "going public", "listed", "delisted",
        "merger", "acquisition", "M&A", "takeover", "buyout", "divestiture",
        "valuation", "billion", "million", "trillion", "unicorn", "decacorn",

        # Economic terms
        "economy", "economic", "economics", "macro", "micro", "fiscal", "monetary",
        "supply", "demand", "market forces", "competition", "competitive", "monopoly",
        "oligopoly", "antitrust", "regulation", "deregulation", "growth", "decline",
        "recession", "depression", "recovery", "expansion", "contraction", "cycle",
        "inflation", "deflation", "stagflation", "hyperinflation", "CPI", "price",
        "GDP", "gross domestic product", "GNP", "productivity", "output", "input",

        # Banking and finance
        "finance", "financial", "bank", "banking", "commercial bank", "investment bank",
        "central bank", "Federal Reserve", "Fed", "ECB", "Bank of England", "BOJ",
        "interest rate", "prime rate", "discount rate", "basis point", "monetary policy",
        "quantitative easing", "tightening", "loosening", "credit", "lending",
        "borrowing", "loan", "mortgage", "consumer credit", "credit card", "debit card",
        "checking", "savings", "deposit", "withdrawal", "ATM", "branch", "online banking",

        # Currencies and payments
        "dollar", "euro", "pound", "yen", "yuan", "rupee", "ruble", "real", "peso",
        "currency", "money", "fiat", "exchange rate", "remittance", "transfer",
        "payment", "transaction", "wire", "ACH", "SWIFT", "clearing", "settlement",
        "cryptocurrency", "bitcoin", "ethereum", "blockchain", "fintech", "paytech",

        # Business operations
        "product", "service", "solution", "offering", "launch", "rollout", "release",
        "operation", "production", "manufacturing", "supply chain", "logistics",
        "procurement", "inventory", "distribution", "shipping", "delivery",
        "quality", "control", "assurance", "standard", "certification", "compliance",
        "process", "procedure", "workflow", "efficiency", "optimization",

        # Marketing and sales
        "marketing", "advertise", "advertising", "campaign", "promotion", "publicity",
        "PR", "public relations", "brand", "branding", "identity", "image", "reputation",
        "sales", "selling", "deal", "transaction", "pipeline", "lead", "prospect",
        "customer", "client", "consumer", "buyer", "purchaser", "target market",
        "segment", "demographic", "psychographic", "B2B", "B2C", "retail", "wholesale",
        "price", "pricing", "discount", "premium", "luxury", "value", "bargain"
    ],

    "world": [
        # General international terms
        "country", "nation", "state", "republic", "kingdom", "global", "international",
        "foreign", "world", "worldwide", "domestic", "abroad", "overseas",
        "developed", "developing", "industrialized", "emerging", "third world",
        "global south", "global north", "east", "west", "western", "eastern",
        "bilateral", "multilateral", "transnational", "multinational", "sovereignty",

        # Regions and continents
        "Europe", "European", "Asia", "Asian", "Africa", "African", "North America",
        "South America", "Latin America", "Australia", "Oceania", "Antarctica",
        "Middle East", "Central Asia", "Southeast Asia", "East Asia", "South Asia",
        "Eastern Europe", "Western Europe", "Northern Europe", "Southern Europe",
        "Mediterranean", "Scandinavia", "Nordic", "Baltic", "Balkans", "Caucasus",
        "Caribbean", "Central America", "South Pacific", "Polynesia", "Micronesia",

        # Major countries and regions
        "China", "Chinese", "Russia", "Russian", "India", "Indian", "Japan", "Japanese",
        "UK", "British", "England", "English", "Scotland", "Scottish", "Wales", "Welsh",
        "Germany", "German", "France", "French", "Italy", "Italian", "Spain", "Spanish",
        "Canada", "Canadian", "Brazil", "Brazilian", "Mexico", "Mexican", "Turkey", "Turkish",
        "United States", "US", "USA", "American", "Britain", "Korea", "Korean",
        "North Korea", "South Korea", "Saudi", "Saudi Arabia", "Saudi Arabian",
        "Iran", "Iranian", "Iraq", "Iraqi", "Israel", "Israeli", "Palestine", "Palestinian",
        "Egypt", "Egyptian", "Pakistan", "Pakistani", "Indonesia", "Indonesian",
        "Australia", "Australian", "New Zealand", "Nigeria", "Nigerian", "South Africa",
        "Argentina", "Argentinian", "Colombia", "Colombian", "Venezuela", "Venezuelan",
        "Ukraine", "Ukrainian", "Poland", "Polish", "Switzerland", "Swiss",
        "Netherlands", "Dutch", "Belgium", "Belgian", "Sweden", "Swedish", "Norway", "Norwegian",

        # International issues and topics
        "war", "conflict", "crisis", "tension", "dispute", "hostility", "peace",
        "peacekeeping", "ceasefire", "truce", "armistice", "treaty", "agreement",
        "compromise", "negotiation", "mediation", "resolution", "settlement",
        "refugee", "migrant", "asylum seeker", "displacement", "humanitarian",
        "border", "frontier", "territory", "territorial", "sovereignty", "jurisdiction",
        "terror", "terrorism", "extremism", "radicalism", "insurgency", "militant",
        "sanction", "embargo", "restriction", "isolation", "blockade",

        # International trade and economy
        "trade", "import", "export", "tariff", "duty", "quota", "subsidy",
        "protectionism", "free trade", "fair trade", "globalization", "trade war",
        "trade agreement", "trade deal", "trade deficit", "trade surplus",
        "supply chain", "outsourcing", "offshoring", "reshoring", "nearshoring",

        # Diplomacy and international relations
        "embassy", "consulate", "diplomatic", "diplomacy", "diplomat", "ambassador",
        "consul", "attaché", "envoy", "emissary", "delegation", "mission",
        "foreign policy", "international relations", "geopolitics", "geopolitical",
        "influence", "power", "superpower", "hegemony", "alliance", "coalition",
        "bloc", "axis", "sphere of influence", "buffer state", "proxy",

        # International organizations
        "UN", "United Nations", "EU", "European Union", "NATO", "NAFTA", "USMCA",
        "ASEAN", "OPEC", "Commonwealth", "Arab League", "African Union", "AU",
        "BRICS", "G7", "G20", "IMF", "World Bank", "WTO", "WHO", "UNESCO",
        "Security Council", "General Assembly", "International Court of Justice",

        # Travel and cultural exchange
        "visa", "passport", "immigration", "emigration", "migration", "travel",
        "tourism", "tourist", "visitor", "foreigner", "expatriate", "expat",
        "citizenship", "nationality", "dual citizen", "naturalization",
        "cultural", "tradition", "heritage", "indigenous", "native", "local",
        "language", "dialect", "translation", "interpreter", "cross-cultural"
    ],

    "sports": [
        # General sports terms
        "game", "match", "tournament", "championship", "league", "cup", "Olympics",
        "olympic", "world cup", "competition", "contest", "event", "series",
        "sport", "sporting", "athletics", "physical", "play", "compete", "competition",
        "amateur", "professional", "pro", "season", "preseason", "regular season",
        "postseason", "playoff", "final", "semifinal", "quarterfinal", "qualifying",

        # Team sports
        "football", "soccer", "American football", "rugby", "basketball", "baseball",
        "cricket", "hockey", "ice hockey", "field hockey", "volleyball", "handball",
        "water polo", "lacrosse", "ultimate frisbee", "netball", "kabaddi",
        "team", "club", "franchise", "squad", "roster", "lineup", "formation",
        "player", "coach", "manager", "trainer", "captain", "starter", "substitute",
        "bench", "draft", "trade", "free agent", "contract", "transfer", "loan",

        # Individual sports
        "tennis", "golf", "boxing", "wrestling", "martial arts", "MMA", "UFC",
        "athletics", "track and field", "swimming", "diving", "gymnastics",
        "skiing", "snowboarding", "skating", "figure skating", "speed skating",
        "cycling", "mountain biking", "BMX", "motorsport", "F1", "Formula 1",
        "NASCAR", "IndyCar", "MotoGP", "rally", "marathon", "triathlon", "decathlon",
        "archery", "shooting", "fencing", "equestrian", "rowing", "canoeing", "kayaking",
        "surfing", "skateboarding", "climbing", "bouldering", "weightlifting",

        # Scoring and results
        "score", "point", "goal", "touchdown", "basket", "run", "wicket", "try",
        "win", "lose", "draw", "tie", "defeat", "victory", "champion", "winner",
        "loser", "runner-up", "finalist", "semifinalist", "eliminated", "advance",
        "qualify", "record", "personal best", "world record", "Olympic record",
        "streak", "undefeated", "unbeaten", "perfect season", "comeback",

        # Performance and training
        "fitness", "training", "practice", "drill", "workout", "exercise", "regime",
        "conditioning", "strength", "endurance", "speed", "agility", "flexibility",
        "skill", "technique", "form", "style", "strategy", "tactic", "playbook",
        "offense", "defense", "attack", "counter", "press", "formation",
        "injury", "rehabilitation", "recovery", "physiotherapy", "sports medicine",

        # Sports infrastructure
        "stadium", "arena", "court", "field", "pitch", "rink", "pool", "track",
        "course", "gymnasium", "gym", "complex", "venue", "facility", "locker room",
        "dugout", "bench", "sideline", "grandstand", "spectator", "fan", "supporter",

        # Sports organizations and competitions
        "medal", "gold", "silver", "bronze", "podium", "Olympics", "Paralympic",
        "commonwealth games", "Asian games", "Pan American games", "world championship",
        "grand slam", "masters", "open", "invitational", "classic", "tour", "circuit",
        "IPL", "Indian Premier League", "MLB", "Major League Baseball",
        "NBA", "National Basketball Association", "NFL", "National Football League",
        "NHL", "National Hockey League", "FIFA", "UEFA", "ATP", "WTA", "ICC",
        "Premier League", "La Liga", "Bundesliga", "Serie A", "Ligue 1", "MLS",
        "Champions League", "Europa League", "Super Bowl", "World Series", "Stanley Cup",
        "NCAA", "collegiate", "college", "university", "varsity", "intramural",

        # Sports media and business
        "broadcast", "coverage", "commentator", "announcer", "pundit", "analyst",
        "highlight", "replay", "sports network", "ESPN", "Sky Sports", "Fox Sports",
        "sponsorship", "endorsement", "advertisement", "merchandise", "jersey", "kit",
        "ticket", "season ticket", "box seat", "premium", "concession", "vendor"
    ],

    "entertainment": [
        # Film and cinema
        "movie", "film", "cinema", "feature", "short film", "documentary", "animation",
        "blockbuster", "indie", "independent film", "foreign film", "box office",
        "screening", "premiere", "release", "theatrical", "stream", "streaming",
        "director", "producer", "screenwriter", "script", "screenplay", "adaptation",
        "cinematography", "cinematographer", "editing", "editor", "visual effects",
        "special effects", "CGI", "motion capture", "sound design", "soundtrack",
        "score", "composer", "scene", "shot", "take", "cut", "sequel", "prequel",
        "trilogy", "franchise", "universe", "reboot", "remake", "spin-off",
        "genre", "action", "comedy", "drama", "thriller", "horror", "sci-fi",
        "science fiction", "fantasy", "romance", "romantic comedy", "rom-com",
        "mystery", "crime", "western", "historical", "biographical", "biopic",

        # Television
        "TV", "television", "show", "series", "episode", "season", "pilot",
        "finale", "midseason", "sitcom", "drama series", "miniseries", "limited series",
        "anthology", "reality TV", "game show", "talk show", "variety show",
        "network", "cable", "premium cable", "broadcast", "channel", "program",
        "primetime", "daytime", "syndication", "rerun", "renewed", "cancelled",
        "showrunner", "creator", "writer", "TV writer", "episode writer", "staff writer",

        # Performing arts
        "actor", "actress", "performer", "cast", "casting", "star", "co-star",
        "supporting", "lead", "protagonist", "antagonist", "villain", "hero", "anti-hero",
        "character", "role", "performance", "portrayal", "acting", "dialogue",
        "monologue", "line", "script", "improv", "improvisation", "stand-up",
        "comedian", "comic", "sketch", "theater", "theatre", "stage", "Broadway",
        "West End", "play", "musical", "opera", "ballet", "dance", "choreography",
        "production", "rehearsal", "audition", "understudy", "troupe", "ensemble",

        # Music
        "music", "song", "track", "single", "album", "EP", "LP", "record",
        "release", "drop", "artist", "musician", "singer", "vocalist", "band",
        "group", "duo", "trio", "soloist", "frontman", "frontwoman", "lead singer",
        "songwriter", "composer", "producer", "DJ", "rapper", "MC", "beatmaker",
        "guitarist", "bassist", "drummer", "pianist", "keyboardist", "violinist",
        "instrumentalist", "orchestra", "symphony", "philharmonic", "conductor",
        "genre", "rock", "pop", "hip-hop", "rap", "R&B", "soul", "funk", "jazz",
        "blues", "country", "folk", "electronic", "EDM", "dance", "techno", "house",
        "metal", "punk", "alternative", "indie", "classical", "reggae", "latin",
        "hit", "chart", "Billboard", "Grammy", "award-winning", "platinum", "gold",
        "concert", "tour", "gig", "show", "performance", "live", "venue", "arena",
        "stadium", "festival", "Coachella", "Glastonbury", "Lollapalooza", "Bonnaroo",

        # Celebrity culture
        "celebrity", "star", "fame", "famous", "A-list", "B-list", "icon", "iconic",
        "superstar", "public figure", "household name", "stardom", "limelight",
        "popular", "popularity", "fan", "fanbase", "followers", "stan", "groupie",
        "paparazzi", "tabloid", "gossip", "rumor", "scandal", "controversy",
        "interview", "press conference", "red carpet", "premiere", "gala", "award show",

        # Awards and recognition
        "award", "nominee", "nomination", "winner", "recipient", "honor", "accolade",
        "Oscar", "Academy Award", "Emmy", "Grammy", "Tony", "Golden Globe", "BAFTA",
        "MTV Award", "People's Choice", "Critics' Choice", "SAG Award", "Billboard Award",
        "best actor", "best actress", "best director", "best picture", "best film",
        "best album", "best song", "hall of fame", "lifetime achievement", "legacy",

        # Media and publishing
        "book", "novel", "fiction", "non-fiction", "memoir", "biography", "autobiography",
        "bestseller", "bestselling", "author", "writer", "novelist", "literary",
        "literature", "publisher", "publishing", "imprint", "edition", "volume",
        "chapter", "page", "paragraph", "prose", "narrative", "plot", "storyline",
        "character", "protagonist", "antagonist", "setting", "theme", "genre",
        "mystery", "thriller", "romance", "sci-fi", "fantasy", "young adult", "YA",
        "comic", "comic book", "graphic novel", "manga", "anime", "cartoon",

        # Digital entertainment
        "streaming", "stream", "subscription", "platform", "service", "content",
        "Netflix", "Disney+", "Amazon Prime", "Hulu", "HBO", "HBO Max", "Apple TV+",
        "Peacock", "Paramount+", "YouTube", "YouTube Premium", "TikTok", "Instagram",
        "influencer", "content creator", "vlogger", "blogger", "podcaster", "podcast",
        "episode", "download", "subscriber", "follower", "like", "share", "viral",
        "trending", "binge-watch", "marathon", "spoiler", "recap", "review", "trailer",
        "teaser", "behind the scenes", "BTS", "exclusive", "original"
    ]
}

# Domain-specific RSS feeds for different categories
CATEGORY_SPECIFIC_FEEDS = {
    "science": [
        # "https://www.science.org/rss/news_feeds/carousel.xml",
        "https://www.science.org/rss/news_current.xml",
        "https://www.nature.com/nature.rss",
        # "https://www.scientificamerican.com/rss/",
        "http://rss.sciam.com/basic-science",
        # "https://rss.sciam.com/ScientificAmerican-Global",
        "http://rss.sciam.com/ScientificAmerican-Global",
        # "https://feeds.newscientist.com/science-news",
        "https://www.newscientist.com/feed/home/?cmpid=RSS|NSNS-Home",
        "https://phys.org/rss-feed/"
    ],
    "technology": [
        # "https://feed.wired.com/rss/category/business/feed.rss",
        "https://www.wired.com/feed/category/business/latest/rss",
        "https://techcrunch.com/feed/",
        "https://www.technologyreview.com/feed/",
        "https://arstechnica.com/feed/",
        "https://www.theverge.com/rss/index.xml",
        "https://news.ycombinator.com/rss"
    ],
    "politics": [
        "https://feeds.washingtonpost.com/rss/politics",
        "https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml",
        "https://feeds.bbci.co.uk/news/politics/rss.xml",
        "https://www.politico.com/rss/politicopicks.xml",
        "https://www.realclearpolitics.com/index.xml"
    ],
    "business": [
        "https://www.ft.com/rss/home",
        "https://feeds.bloomberg.com/markets/news.rss",
        # "https://www.forbes.com/business/feed/",
        "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "https://feeds.washingtonpost.com/rss/business",
        "https://www.entrepreneur.com/latest.rss",
        # "https://www.cnbc.com/id/10001147/device/rss/rss.htm",
        "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10001147",
        "https://feeds.content.dowjones.io/public/rss/WSJcomUSBusiness",
        "https://feeds.a.dj.com/rss/RSSMarketsMain.xml"
    ],
    "world": [
        "https://feeds.bbci.co.uk/news/world/rss.xml",
        "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "https://www.aljazeera.com/xml/rss/all.xml",
        "https://feeds.washingtonpost.com/rss/world",
        # "https://rss.cnn.com/rss/edition_world.rss"
        "http://rss.cnn.com/rss/cnn_world.rss"
    ],
    "sports": [
        "https://www.espn.com/espn/rss/news",
        "https://www.cbssports.com/rss/headlines/",
        # "https://feeds.skysports.com/feeds/rss/latest.xml",
        "https://www.espncricinfo.com/rss/content/story/feeds/0.xml",
        "https://api.foxsports.com/v1/rss",
        "https://www.sportingnews.com/us/rss",
        "https://www.theguardian.com/sport/rss",
    ],
    "entertainment": [
        "https://www.hollywoodreporter.com/feed/",
        "https://variety.com/feed/",
        # "https://feeds.eonline.com/mrss/article/",
        "https://www.eonline.com/syndication/feeds/rssfeeds/topstories.xml",
        "https://www.rollingstone.com/feed/",
        "https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml"
    ],
    "fact_checking": [
        "https://www.snopes.com/feed/",
        "https://www.politifact.com/rss/all/",
        "https://www.factcheck.org/feed/",
        "https://leadstories.com/atom.xml",
        # "https://apnews.com/hub/fact-check/rss",
        # "https://apnews.com/apf-fact-check"
        "https://fullfact.org/feed/all/",
        "https://www.truthorfiction.com/feed/"
    ]
}

# Reliability boosts for sources by category
SOURCE_RELIABILITY_BY_CATEGORY = {
    "science": {
        "nature.com": 0.95,
        "science.org": 0.95,
        "nih.gov": 0.95,
        "nasa.gov": 0.95,
        "scientificamerican.com": 0.9,
        "newscientist.com": 0.9,
        "pnas.org": 0.95,
        "cell.com": 0.95,
        "sciencedirect.com": 0.9,
        "plos.org": 0.9,
        "arxiv.org": 0.85
    },
    "technology": {
        "wired.com": 0.9,
        "techcrunch.com": 0.85,
        "arstechnica.com": 0.9,
        "technologyreview.com": 0.9,
        "theverge.com": 0.85,
        "cnet.com": 0.85,
        "engadget.com": 0.85
    },
    "fact_checking": {
        "snopes.com": 0.95,
        "politifact.com": 0.9,
        "factcheck.org": 0.9,
        "apnews.com/hub/fact-check": 0.95,
        "reuters.com/fact-check": 0.95
    }
}

def detect_claim_category(claim: str) -> Tuple[str, float]:
    """
    Detect the most likely category of a claim and its confidence score

    Args:
        claim (str): The claim text

    Returns:
        tuple: (category_name, confidence_score)
    """
    if not claim:
        return "general", 0.3

    # Lowercase for better matching
    claim_lower = claim.lower()

    # Count matches for each category
    category_scores = {}

    for category, keywords in CLAIM_CATEGORIES.items():
        # Count how many keywords from this category appear in the claim
        matches = sum(1 for keyword in keywords if keyword.lower() in claim_lower)

        # Calculate a simple score based on matches
        if matches > 0:
            score = min(0.9, 0.3 + (matches * 0.1))  # base 0.3 + 0.1 per match, capped at 0.9
            category_scores[category] = score

    # Find the category with the highest score
    if not category_scores:
        return "general", 0.3

    top_category = max(category_scores.items(), key=lambda x: x[1])
    category_name, confidence = top_category

    # If the top score is too low, return general
    if confidence < 0.3:
        return "general", 0.3

    return category_name, confidence

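To make the scoring rule above concrete (base 0.3, plus 0.1 per matched keyword, capped at 0.9), a small sketch with an invented claim; the exact value depends on how many keywords happen to occur as substrings:

category, confidence = detect_claim_category(
    "NASA confirmed the telescope discovered a new exoplanet"
)
# "NASA", "telescope", "discovered", "exoplanet" (and "planet", as a
# substring of "exoplanet") all match the science list, so this should
# return ("science", 0.8) under the formula above.
print(category, confidence)

Note that matching is plain lowercase substring containment, so very short keywords such as "AI" or "ML" can also fire inside unrelated words; the 0.3 base and small per-match increments soften, but do not eliminate, that effect.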
def get_topic_specific_sources(claim: str, existing_sources: Dict) -> Dict:
    """
    Enrich existing sources dict with topic-specific sources

    Args:
        claim (str): The claim text
        existing_sources (dict): Current sources configuration

    Returns:
        dict: Updated sources with topic-specific priorities
    """
    # Detect claim category
    category, confidence = detect_claim_category(claim)
    logger.info(f"Claim category detected: {category} (confidence: {confidence:.2f})")

    # If confidence is low, keep existing sources
    if confidence < 0.4:
        return existing_sources

    # Get specific feeds for the category
    category_feeds = CATEGORY_SPECIFIC_FEEDS.get(category, [])

    # Only proceed if we have category-specific feeds
    if not category_feeds:
        return existing_sources

    # Create a new sources dictionary with category-specific modifications
    updated_sources = existing_sources.copy()

    # If feeds exist for this category, add them to the configuration and
    # prioritize them by putting them first in the RSS feed list
    if category in CATEGORY_SPECIFIC_FEEDS:
        # Add up to 5 category-specific RSS feeds (if we have them)
        category_feeds_sample = category_feeds[:min(5, len(category_feeds))]

        # Add or update source reliability data
        if category in SOURCE_RELIABILITY_BY_CATEGORY:
            updated_sources.setdefault("source_credibility", {})
            for domain, reliability in SOURCE_RELIABILITY_BY_CATEGORY[category].items():
                updated_sources["source_credibility"][domain] = reliability

        # Return updated sources with prioritized feeds
        return {
            "category": category,
            "confidence": confidence,
            "rss_feeds": category_feeds_sample + (updated_sources.get("rss_feeds", []) or []),
            "source_credibility": updated_sources.get("source_credibility", {})
        }

    return existing_sources

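A usage sketch, assuming a minimal existing_sources dict (the keys mirror the ones the function reads, and the feed URL is a placeholder):

sources = {"rss_feeds": ["https://example.org/general.rss"], "source_credibility": {}}
enriched = get_topic_specific_sources("CERN observed a new particle", sources)
# For a confident "science" match, "rss_feeds" in the returned dict should
# begin with the science feeds above, and "source_credibility" should carry
# the per-domain boosts (e.g. "nature.com": 0.95).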
def get_prioritized_sources(claim: str, claim_category: Optional[str] = None) -> Dict[str, List[str]]:
    """
    Get prioritized sources for a claim based on its category

    Args:
        claim (str): The claim to check
        claim_category (str, optional): Override detected category

    Returns:
        dict: Dictionary with source types prioritized by relevance
    """
    # Detect category if not provided
    if not claim_category:
        category, confidence = detect_claim_category(claim)
    else:
        category = claim_category
        confidence = 0.8  # Assume high confidence if category is explicitly provided

    # Log detected category
    logger.info(f"Using claim category: {category} for source prioritization")

    # Default priorities
    priorities = {
        "primary": ["wikipedia", "news", "claimreview"],
        "secondary": ["rss", "scholarly", "wikidata"]
    }

    # Needs recent evidence check (existing logic)
    temporal_terms = ["is", "are", "remains", "continues", "still", "currently",
                      "now", "today", "recent", "latest"]
    negation_terms = ["not", "no longer", "isn't", "aren't", "doesn't", "don't",
                      "can't", "cannot", "anymore"]

    requires_recent = any(term in claim.lower() for term in temporal_terms) or \
                      any(term in claim.lower() for term in negation_terms)

    # Adjust priorities based on category
    if category == "science":
        if requires_recent:
            priorities = {
                "primary": ["scholarly", "rss", "wikipedia"],
                "secondary": ["news", "claimreview", "wikidata"]
            }
        else:
            priorities = {
                "primary": ["scholarly", "wikipedia", "rss"],
                "secondary": ["claimreview", "news", "wikidata"]
            }

    elif category == "technology":
        if requires_recent:
            priorities = {
                "primary": ["rss", "news", "scholarly"],
                "secondary": ["wikipedia", "claimreview", "wikidata"]
            }
        else:
            priorities = {
                "primary": ["news", "scholarly", "wikipedia"],
                "secondary": ["rss", "claimreview", "wikidata"]
            }

    elif category == "politics":
        if requires_recent:
            priorities = {
                "primary": ["rss", "news", "claimreview"],
                "secondary": ["wikipedia", "wikidata", "scholarly"]
            }
        else:
            priorities = {
                "primary": ["claimreview", "news", "wikipedia"],
                "secondary": ["rss", "wikidata", "scholarly"]
            }

    elif category == "business" or category == "world":
        if requires_recent:
            priorities = {
                "primary": ["rss", "news", "wikipedia"],
                "secondary": ["claimreview", "wikidata", "scholarly"]
            }
        else:
            priorities = {
                "primary": ["news", "wikipedia", "rss"],
                "secondary": ["claimreview", "wikidata", "scholarly"]
            }

    elif category == "sports":
        if requires_recent:
            priorities = {
                "primary": ["rss", "news", "wikipedia"],
                "secondary": ["wikidata", "claimreview", "scholarly"]
            }
        else:
            priorities = {
                "primary": ["wikipedia", "news", "rss"],
                "secondary": ["wikidata", "claimreview", "scholarly"]
            }

    elif category == "entertainment":
        if requires_recent:
            priorities = {
                "primary": ["rss", "news", "claimreview"],
                "secondary": ["wikipedia", "wikidata", "scholarly"]
            }
        else:
            priorities = {
                "primary": ["news", "wikipedia", "claimreview"],
                "secondary": ["rss", "wikidata", "scholarly"]
            }

    # Add category and confidence for reference
    priorities["category"] = category
    priorities["confidence"] = confidence
    priorities["requires_recent"] = requires_recent

    return priorities

def get_category_specific_rss_feeds(category: str, max_feeds: int = 5) -> List[str]:
    """
    Get a list of RSS feeds specific to a category

    Args:
        category (str): The claim category
        max_feeds (int): Maximum number of feeds to return

    Returns:
        list: List of RSS feed URLs
    """
    # Get category-specific feeds
    category_feeds = CATEGORY_SPECIFIC_FEEDS.get(category, [])

    # Limit to max_feeds
    return category_feeds[:min(max_feeds, len(category_feeds))]
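And a sketch of the two prioritization entry points, with an invented claim whose temporal wording ("is", "today") should trigger the requires_recent check:

priorities = get_prioritized_sources("The president is visiting Japan today")
print(priorities["primary"])          # expected to lead with "rss"/"news"
print(priorities["requires_recent"])  # expected: True

feeds = get_category_specific_rss_feeds("politics", max_feeds=3)
# expected: the first three politics feed URLs from CATEGORY_SPECIFIC_FEEDS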
modules/claim_extraction.py
ADDED
@@ -0,0 +1,236 @@
import logging
import time
import re
from langdetect import detect
import spacy

from utils.performance import PerformanceTracker
from utils.models import get_nlp_model, get_llm_model

logger = logging.getLogger("misinformation_detector")

performance_tracker = PerformanceTracker()

def extract_claims(text):
    """
    Extract the main factual claim from the provided text.
    For concise claims (<20 words), preserves them exactly.
    For longer text, uses OpenAI to extract the claim.
    """
    logger.info(f"Extracting claims from: {text}")
    start_time = time.time()

    # First, check if the input already appears to be a concise claim
    if len(text.split()) < 20:
        logger.info("Input appears to be a concise claim already, preserving as-is")
        performance_tracker.log_processing_time(start_time)
        performance_tracker.log_claim_processed()
        return text

    try:
        # For longer text, use OpenAI for extraction
        extracted_claim = extract_with_openai(text)

        # Log processing time
        performance_tracker.log_processing_time(start_time)
        performance_tracker.log_claim_processed()

        logger.info(f"Extracted claim: {extracted_claim}")
        return extracted_claim
    except Exception as e:
        logger.error(f"Error extracting claims: {str(e)}")
        # Fall back to the original text on error
        return text

45 |
+
def extract_with_openai(text):
|
46 |
+
"""
|
47 |
+
Use OpenAI model for claim extraction
|
48 |
+
"""
|
49 |
+
try:
|
50 |
+
# Get LLM model
|
51 |
+
llm_model = get_llm_model()
|
52 |
+
|
53 |
+
# Create a very explicit prompt to avoid hallucination
|
54 |
+
prompt = f"""
|
55 |
+
Extract the main factual claim from the following text.
|
56 |
+
DO NOT add any information not present in the original text.
|
57 |
+
DO NOT add locations, dates, or other details.
|
58 |
+
ONLY extract what is explicitly stated.
|
59 |
+
|
60 |
+
Text: {text}
|
61 |
+
|
62 |
+
Main factual claim:
|
63 |
+
"""
|
64 |
+
|
65 |
+
# Call OpenAI with temperature=0 for deterministic output
|
66 |
+
response = llm_model.invoke(prompt, temperature=0)
|
67 |
+
extracted_claim = response.content.strip()
|
68 |
+
|
69 |
+
# Further clean up any explanations or extra text
|
70 |
+
if ":" in extracted_claim:
|
71 |
+
parts = extracted_claim.split(":")
|
72 |
+
if len(parts) > 1:
|
73 |
+
extracted_claim = parts[-1].strip()
|
74 |
+
|
75 |
+
logger.info(f"OpenAI extraction: {extracted_claim}")
|
76 |
+
|
77 |
+
# Validate that we're not adding info not in the original
|
78 |
+
nlp = get_nlp_model()
|
79 |
+
extracted_claim = validate_extraction(text, extracted_claim, nlp)
|
80 |
+
|
81 |
+
return extracted_claim
|
82 |
+
except Exception as e:
|
83 |
+
logger.error(f"Error in OpenAI claim extraction: {str(e)}")
|
84 |
+
return text # Fallback to original
|
85 |
+
|
86 |
+
def validate_extraction(original_text, extracted_claim, nlp):
|
87 |
+
"""
|
88 |
+
Validate that the extracted claim doesn't add information not present in the original text
|
89 |
+
"""
|
90 |
+
# If extraction fails or is empty, return original
|
91 |
+
if not extracted_claim or extracted_claim.strip() == "":
|
92 |
+
logger.warning("Empty extraction result, using original text")
|
93 |
+
return original_text
|
94 |
+
|
95 |
+
# Check for added location information
|
96 |
+
location_terms = ["united states", "america", "u.s.", "usa", "china", "india", "europe",
|
97 |
+
"russia", "japan", "uk", "germany", "france", "australia"]
|
98 |
+
for term in location_terms:
|
99 |
+
if term in extracted_claim.lower() and term not in original_text.lower():
|
100 |
+
logger.warning(f"Extraction added location '{term}' not in original, using original text")
|
101 |
+
return original_text
|
102 |
+
|
103 |
+
# Check for entity preservation/addition using spaCy
|
104 |
+
try:
|
105 |
+
# Get entities from extracted text
|
106 |
+
extracted_doc = nlp(extracted_claim)
|
107 |
+
extracted_entities = [ent.text.lower() for ent in extracted_doc.ents]
|
108 |
+
|
109 |
+
# Get entities from original text
|
110 |
+
original_doc = nlp(original_text)
|
111 |
+
original_entities = [ent.text.lower() for ent in original_doc.ents]
|
112 |
+
|
113 |
+
# Check for new entities that don't exist in original
|
114 |
+
for entity in extracted_entities:
|
115 |
+
if not any(entity in orig_entity or orig_entity in entity for orig_entity in original_entities):
|
116 |
+
logger.warning(f"Extraction added new entity '{entity}', using original text")
|
117 |
+
return original_text
|
118 |
+
|
119 |
+
return extracted_claim
|
120 |
+
except Exception as e:
|
121 |
+
logger.error(f"Error in extraction validation: {str(e)}")
|
122 |
+
return original_text # On error, safer to return original
|
123 |
+
|
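
To make the guard above concrete, here is a small standalone sketch of the entity check rejecting a hallucinated extraction. The sentences are invented; `get_nlp_model` is the project helper imported at the top of this module:

```python
# Hypothetical demo of the entity-addition guard in validate_extraction.
from modules.claim_extraction import validate_extraction
from utils.models import get_nlp_model

nlp = get_nlp_model()

original = "The prime minister announced a new housing policy."
hallucinated = "Justin Trudeau announced a new housing policy in Canada."

# "Canada" trips the location check and "Justin Trudeau" is an entity absent
# from the original, so the function should fall back to the original text.
print(validate_extraction(original, hallucinated, nlp))
```
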
+def shorten_claim_for_evidence(claim):
+    """
+    Shorten a claim to use for evidence retrieval by preserving important keywords
+    while maintaining claim context
+    """
+    try:
+        # Get NLP model
+        nlp = get_nlp_model()
+
+        # Use NER to extract key entities
+        doc = nlp(claim)
+
+        # Extract all entities for search
+        entities = [ent.text for ent in doc.ents]
+
+        # Extract key proper nouns, entities, and important context words
+        important_words = []
+
+        # Add all named entities
+        for ent in doc.ents:
+            important_words.append(ent.text)
+
+        # Add important nouns and adjectives not already added
+        for token in doc:
+            if token.pos_ in ["NOUN", "PROPN"] and token.text not in important_words:
+                important_words.append(token.text)
+
+        # Make sure we include key terms like "prime minister", "president", etc.
+        title_terms = ["president", "prime minister", "minister", "chancellor", "premier", "governor", "mayor", "senator"]
+        for term in title_terms:
+            if term in claim.lower() and not any(term in word.lower() for word in important_words):
+                # Find the full phrase (e.g., "Canadian Prime Minister")
+                matches = re.finditer(r'(?i)(?:\w+\s+)*\b' + re.escape(term) + r'\b(?:\s+\w+)*', claim)
+                for match in matches:
+                    phrase = match.group(0)
+                    if phrase not in important_words:
+                        important_words.append(phrase)
+
+        # Add country names or important place references
+        country_terms = ["canada", "canadian", "us", "united states", "american", "uk", "british", "australia", "china", "russian"]
+        for term in country_terms:
+            if term in claim.lower() and not any(term in word.lower() for word in important_words):
+                for token in doc:
+                    if token.text.lower() == term and token.text not in important_words:
+                        important_words.append(token.text)
+
+        # Always include negation words as they're critical for meaning
+        negation_terms = ["not", "no longer", "former", "ex-", "isn't", "aren't", "doesn't", "don't"]
+        negation_found = False
+        for term in negation_terms:
+            if term in claim.lower():
+                # Find the context around the negation (3 words before and after)
+                matches = re.finditer(r'(?i)(?:\w+\s+){0,3}\b' + re.escape(term) + r'\b(?:\s+\w+){0,3}', claim)
+                for match in matches:
+                    phrase = match.group(0)
+                    if phrase not in important_words:
+                        important_words.append(phrase)
+                        negation_found = True
+
+        # Special handling for time-sensitive claims with negations
+        is_time_sensitive = any(term in claim.lower() for term in ["anymore", "still", "currently", "now", "today", "recent"])
+
+        # If we have both negation and time sensitivity, ensure we keep those key aspects
+        if negation_found and is_time_sensitive:
+            # Ensure we keep time-sensitive terms
+            time_terms = ["anymore", "still", "currently", "now", "today", "recent"]
+            for term in time_terms:
+                if term in claim.lower() and not any(term in word.lower() for word in important_words):
+                    # Add the context around the time term
+                    matches = re.finditer(r'(?i)(?:\w+\s+){0,2}\b' + re.escape(term) + r'\b(?:\s+\w+){0,2}', claim)
+                    for match in matches:
+                        phrase = match.group(0)
+                        if phrase not in important_words:
+                            important_words.append(phrase)
+
+        # If entities plus titles don't give us enough, include key parts of claim
+        if len(entities) < 2 and not any("minister" in word.lower() for word in important_words):
+            words = claim.split()
+            # Use first 8 words
+            return " ".join(words[:min(8, len(words))])
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_terms = []
+        for word in important_words:
+            if word.lower() not in seen:
+                seen.add(word.lower())
+                unique_terms.append(word)
+
+        # Ensure we have a reasonable number of search terms (maintain more for complex claims)
+        search_terms = unique_terms[:min(6, len(unique_terms))]
+
+        # Sort search terms to try to maintain original word order from claim
+        def get_position(term):
+            return claim.lower().find(term.lower())
+
+        search_terms.sort(key=get_position)
+
+        # Join terms to create search query
+        shortened_claim = " ".join(search_terms)
+
+        # If the shortened claim is too short compared to original, use more of original
+        if len(shortened_claim.split()) < 3 and len(claim.split()) > 5:
+            words = claim.split()
+            shortened_claim = " ".join(words[:min(8, len(words))])
+
+        logger.info(f"Shortened Claim: {shortened_claim}")
+
+        return shortened_claim
+    except Exception as e:
+        logger.error(f"Error in shortening claim: {str(e)}")
+        # Return original claim on error
+        return claim
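
A minimal sketch of the module's two public entry points. It assumes the OpenAI credentials and spaCy model are already configured via `utils.models`; the sample text is invented:

```python
# Usage sketch for claim_extraction (hypothetical input text).
from modules.claim_extraction import extract_claims, shorten_claim_for_evidence

text = (
    "In a long press conference yesterday, officials discussed many topics, "
    "and among other things stated that the city will ban diesel buses by 2030."
)

claim = extract_claims(text)               # LLM extraction kicks in for 20+ words
query = shorten_claim_for_evidence(claim)  # compact keyword query for retrieval
print(claim)
print(query)
```
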
modules/classification.py
ADDED
@@ -0,0 +1,521 @@
+import logging
+import re
+from utils.models import get_llm_model
+from utils.performance import PerformanceTracker
+
+logger = logging.getLogger("misinformation_detector")
+
+performance_tracker = PerformanceTracker()
+
+def classify_with_llm(claim, evidence):
+    """
+    Optimized classification function that handles evidence classification
+    and verdict generation in a single LLM call with robust parsing
+    """
+    logger.info(f"Classifying evidence for claim: {claim}")
+
+    # Get the LLM model
+    llm_model = get_llm_model()
+
+    # Skip if no evidence
+    if not evidence:
+        logger.warning("No evidence provided for classification")
+        return []
+
+    # Normalize evidence to a list
+    if not isinstance(evidence, list):
+        if evidence:
+            try:
+                evidence = [evidence]
+            except Exception as e:
+                logger.error(f"Could not convert evidence to list: {e}")
+                return []
+        else:
+            return []
+
+    # Does the claim contain strong assertions that require specific evidence?
+    strong_assertion_markers = [
+        "solved", "cured", "discovered", "confirmed", "proven", "definitive",
+        "breakthrough", "revolutionary", "successfully", "first ever", "extends",
+        "conclusive", "unprecedented", "remarkable", "definitively"
+    ]
+
+    # Check if the claim contains strong assertions that would require specific supporting evidence
+    contains_strong_assertions = any(marker in claim.lower() for marker in strong_assertion_markers)
+
+    # Limit to top 5 evidence items to reduce token usage
+    evidence = evidence[:5]
+
+    try:
+        # Format evidence items
+        evidence_text = ""
+        for idx, chunk in enumerate(evidence):
+            # Truncate long evidence
+            chunk_text = str(chunk)
+            if len(chunk_text) > 300:
+                chunk_text = chunk_text[:297] + "..."
+
+            evidence_text += f"EVIDENCE {idx+1}:\n{chunk_text}\n\n"
+
+        # Create a structured prompt with explicit formatting instructions
+        # Adjust instructions based on claim characteristics
+        if contains_strong_assertions:
+            prompt = f"""
+            CLAIM: {claim}
+
+            EVIDENCE:
+            {evidence_text}
+
+            TASK: Evaluate if the evidence supports, contradicts, or is irrelevant to the claim.
+
+            IMPORTANT CONTEXT: This claim makes strong assertions that require specific supporting evidence.
+
+            When evaluating such claims:
+            1. Strong assertions require strong, direct evidence - look for specific confirmation from credible sources
+            2. General information about the topic is not sufficient to support specific assertions
+            3. Evidence of ongoing work or research is not sufficient to support claims of completion or success
+            4. If the evidence doesn't directly confirm the specific assertion, classify it as "insufficient" rather than "support"
+
+            INSTRUCTIONS:
+            1. For each evidence, provide your analysis in EXACTLY this format:
+
+            EVIDENCE 1 ANALYSIS:
+            Relevance: [relevant/irrelevant]
+            Classification: [support/contradict/insufficient/irrelevant]
+            Confidence: [number between 0-100]
+            Reason: [brief explanation focusing on whether evidence directly confirms the specific assertion]
+
+            2. After analyzing all evidence pieces, provide a final verdict in this format:
+
+            FINAL VERDICT: [clear statement if evidence collectively supports or contradicts the claim]
+
+            Without specific, direct supporting evidence, default to "The evidence does not support the claim" rather than "insufficient evidence."
+
+            CRITICAL INSTRUCTION: FOCUS ON THE EXACT CLAIM. Evaluate ONLY the specific claim, not related topics
+            """
+        else:
+            prompt = f"""
+            CLAIM: {claim}
+
+            EVIDENCE:
+            {evidence_text}
+
+            TASK: Evaluate if the evidence supports, contradicts, or is irrelevant to the claim.
+
+            INSTRUCTIONS:
+            1. For each evidence, provide your analysis in EXACTLY this format:
+
+            EVIDENCE 1 ANALYSIS:
+            Relevance: [relevant/irrelevant]
+            Classification: [support/contradict/insufficient/irrelevant]
+            Confidence: [number between 0-100]
+            Reason: [brief explanation]
+
+            2. After analyzing all evidence pieces, provide a final verdict in this format:
+
+            FINAL VERDICT: [clear statement if evidence collectively supports or contradicts the claim]
+
+            CRITICAL INSTRUCTION: FOCUS ON THE EXACT CLAIM. Evaluate ONLY the specific claim, not related topics
+            """
+
+        # Get response with temperature=0 for consistency
+        result = llm_model.invoke(prompt, temperature=0)
+        result_text = result.content.strip()
+
+        # Extract final verdict first since it's most important
+        final_verdict = None
+        final_match = re.search(r'FINAL VERDICT:\s*(.*?)(?=\s*$|\n\n)', result_text, re.DOTALL | re.IGNORECASE)
+        if final_match:
+            final_verdict = final_match.group(1).strip()
+            logger.info(f"Final assessment: {final_verdict}")
+
+        # Define a precise regex pattern matching the requested format
+        analysis_pattern = r'EVIDENCE\s+(\d+)\s+ANALYSIS:\s*\n+Relevance:\s*(relevant|irrelevant)\s*\n+Classification:\s*(support|contradict|neutral|irrelevant|insufficient)\s*\n+Confidence:\s*(\d+)\s*\n+Reason:\s*(.*?)(?=\s*EVIDENCE\s+\d+\s+ANALYSIS:|\s*FINAL VERDICT:|\s*$)'
+
+        # Parse each evidence analysis
+        classification_results = []
+        matched_evidence = set()
+
+        # Try matching with our strict pattern first
+        matches = list(re.finditer(analysis_pattern, result_text, re.IGNORECASE | re.DOTALL))
+
+        # If no matches, try a more flexible pattern
+        if not matches:
+            flexible_pattern = r'(?:EVIDENCE|Evidence)\s+(\d+)(?:\s+ANALYSIS)?:?\s*\n+(?:Relevance|relevance):\s*(relevant|irrelevant|unknown)\s*\n+(?:Classification|classification):\s*(support|contradict|neutral|irrelevant|insufficient|unknown)\s*\n+(?:Confidence|confidence):\s*(\d+)\s*\n+(?:Reason|reason|Brief reason):\s*(.*?)(?=\s*(?:EVIDENCE|Evidence)\s+\d+|FINAL VERDICT:|$)'
+            matches = list(re.finditer(flexible_pattern, result_text, re.IGNORECASE | re.DOTALL))
+
+        # Process matches
+        for match in matches:
+            try:
+                evidence_idx = int(match.group(1)) - 1
+                relevance = match.group(2).lower()
+                classification = match.group(3).lower()
+                confidence = int(match.group(4))
+                reason = match.group(5).strip()
+
+                # Normalize classification terms
+                if classification == "neutral":
+                    classification = "insufficient"
+
+                # For strong assertions, apply confidence adjustments based on classification
+                if contains_strong_assertions:
+                    if classification == "support":
+                        # Check if the reasoning indicates direct or indirect support
+                        indirect_support_markers = ["general", "doesn't directly", "does not directly",
+                                                    "doesn't specifically", "not specific", "related to",
+                                                    "doesn't confirm"]
+                        if any(marker in reason.lower() for marker in indirect_support_markers):
+                            # Downgrade support confidence for indirect evidence
+                            confidence = max(5, confidence - 20)
+                    elif classification == "contradict":
+                        # For contradictions of strong assertions, slightly boost confidence
+                        confidence = min(95, confidence + 5)
+
+                # Ensure index is valid
+                if 0 <= evidence_idx < len(evidence):
+                    matched_evidence.add(evidence_idx)
+
+                    # Create result entry
+                    classification_results.append({
+                        "label": classification,
+                        "confidence": confidence / 100.0,
+                        "evidence": evidence[evidence_idx],
+                        "relevance": relevance,
+                        "reason": reason,
+                        "final_assessment": final_verdict
+                    })
+            except (ValueError, IndexError) as e:
+                logger.error(f"Error parsing evidence analysis: {e}")
+
+        # Handle any unmatched evidence items
+        if matches:  # Only add defaults if we successfully matched some
+            for idx, ev in enumerate(evidence):
+                if idx not in matched_evidence:
+                    # Check if the evidence text itself suggests a classification
+                    contains_support = bool(re.search(r'support|confirm|verify|true|correct|released', final_verdict or "", re.IGNORECASE))
+                    contains_contradicting = bool(re.search(r'not yet|hasn\'t|have not|doesn\'t|don\'t|cannot|preliminary|proposed', str(ev).lower()))
+
+                    # For claims with strong assertions without explicit evidence, be more cautious
+                    if contains_strong_assertions:
+                        if contains_contradicting:
+                            label = "contradict"
+                            confidence = 0.6
+                        elif contains_support:
+                            label = "insufficient"  # Default to insufficient for strong assertions without clear analysis
+                            confidence = 0.5
+                        else:
+                            label = "insufficient"
+                            confidence = 0.5
+                    else:
+                        label = "support" if contains_support else "unknown"
+                        confidence = 0.7 if contains_support else 0.5
+
+                    classification_results.append({
+                        "label": label,
+                        "confidence": confidence,
+                        "evidence": ev,
+                        "relevance": "relevant" if (contains_support or contains_contradicting) else "unknown",
+                        "reason": "Based on overall assessment",
+                        "final_assessment": final_verdict
+                    })
+        else:
+            # No structured parsing worked, use final verdict to create simple results
+            contains_support = bool(re.search(r'support|confirm|verify|true|correct|released', final_verdict or "", re.IGNORECASE))
+            contains_contradict = bool(re.search(r'contradict|against|false|incorrect|not support|does not support|insufficient evidence|does not confirm|no evidence', final_verdict or "", re.IGNORECASE))
+            contains_insufficient = bool(re.search(r'insufficient|not enough|cannot determine|no evidence|lack of evidence', final_verdict or "", re.IGNORECASE))
+
+            # For claims with strong assertions, be more stringent
+            if contains_strong_assertions:
+                if contains_support and not contains_insufficient and not contains_contradict:
+                    label = "support"
+                    confidence = 0.6  # Lower confidence even for support of strong assertions
+                elif contains_contradict:
+                    label = "contradict"
+                    confidence = 0.8  # Higher confidence for contradiction of strong assertions
+                else:
+                    label = "insufficient"
+                    confidence = 0.7  # Good confidence for insufficient judgment
+            else:
+                label = "support" if contains_support else "contradict" if contains_contradict else "unknown"
+                confidence = 0.7 if (contains_support or contains_contradict) else 0.5
+
+            # Create basic results based on final verdict
+            for ev in evidence:
+                classification_results.append({
+                    "label": label,
+                    "confidence": confidence,
+                    "evidence": ev,
+                    "relevance": "relevant" if (contains_support or contains_contradict) else "unknown",
+                    "reason": final_verdict or "Based on collective evidence",
+                    "final_assessment": final_verdict
+                })
+
+        logger.info(f"Classified {len(classification_results)} evidence items")
+        return classification_results
+
+    except Exception as e:
+        logger.error(f"Error in evidence classification: {str(e)}")
+        # Provide a basic fallback that checks for keywords in evidence
+        try:
+            fallback_results = []
+            for ev in evidence:
+                ev_text = str(ev).lower()
+                supports = False
+                contradicts = False
+
+                # Basic keyword checking as last resort
+                if claim.lower() in ev_text:
+                    keywords = [word for word in claim.lower().split() if len(word) > 3]
+                    matching_keywords = [k for k in keywords if k in ev_text]
+
+                    # If substantial keywords match, consider it support
+                    supports = len(matching_keywords) >= max(1, len(keywords) // 2)
+
+                # Check for contradiction terms
+                contradiction_terms = ["not yet", "hasn't", "haven't", "cannot", "can't",
+                                       "doesn't", "don't", "no evidence", "insufficient",
+                                       "preliminary", "proposed", "in development", "future"]
+                contradicts = any(term in ev_text for term in contradiction_terms)
+
+                # For claims with strong assertions, be more conservative in the fallback case
+                if contains_strong_assertions:
+                    if contradicts:
+                        fallback_results.append({
+                            "label": "contradict",
+                            "confidence": 0.6,
+                            "evidence": ev,
+                            "relevance": "relevant",
+                            "reason": "Evidence suggests the claim is not yet proven (fallback method)"
+                        })
+                    elif supports:
+                        fallback_results.append({
+                            "label": "insufficient",
+                            "confidence": 0.6,
+                            "evidence": ev,
+                            "relevance": "relevant",
+                            "reason": "Evidence is related but doesn't conclusively confirm the assertion (fallback method)"
+                        })
+                    else:
+                        fallback_results.append({
+                            "label": "unknown",
+                            "confidence": 0.5,
+                            "evidence": ev,
+                            "relevance": "unknown",
+                            "reason": "Cannot determine relevance (fallback method)"
+                        })
+                else:
+                    fallback_results.append({
+                        "label": "support" if supports else "unknown",
+                        "confidence": 0.6 if supports else 0.5,
+                        "evidence": ev,
+                        "relevance": "relevant" if supports else "unknown",
+                        "reason": "Based on keyword matching (fallback method)"
+                    })
+
+            return fallback_results
+        except:
+            # Absolute last resort
+            return [{"label": "unknown", "confidence": 0.5, "evidence": ev} for ev in evidence]
+
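
For reference, this is the reply shape the strict pattern above expects. The sample reply is invented; the snippet runs standalone with the same regex used in `classify_with_llm`:

```python
# Invented sample of the structured LLM reply, parsed with the strict pattern.
import re

sample = """EVIDENCE 1 ANALYSIS:
Relevance: relevant
Classification: support
Confidence: 85
Reason: The article directly confirms the release.

FINAL VERDICT: The evidence collectively supports the claim."""

analysis_pattern = (
    r'EVIDENCE\s+(\d+)\s+ANALYSIS:\s*\n+Relevance:\s*(relevant|irrelevant)\s*\n+'
    r'Classification:\s*(support|contradict|neutral|irrelevant|insufficient)\s*\n+'
    r'Confidence:\s*(\d+)\s*\n+Reason:\s*(.*?)'
    r'(?=\s*EVIDENCE\s+\d+\s+ANALYSIS:|\s*FINAL VERDICT:|\s*$)'
)

for m in re.finditer(analysis_pattern, sample, re.IGNORECASE | re.DOTALL):
    idx, relevance, label, conf, reason = m.groups()
    print(idx, relevance, label, int(conf) / 100.0, reason.strip())
    # -> 1 relevant support 0.85 The article directly confirms the release.
```
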
+def aggregate_evidence(classification_results):
+    """
+    Aggregate evidence classifications to determine overall verdict
+    with robust fallback mechanisms for reliable results
+    """
+    logger.info(f"Aggregating evidence from {len(classification_results) if classification_results else 0} results")
+
+    if not classification_results:
+        logger.warning("No classification results to aggregate")
+        return "Uncertain", 0.3  # Default with low confidence
+
+    # Assess the claim's characteristics (without relying on explicit category detection)
+    # Does the claim contain strong assertions that require specific evidence?
+    strong_assertion_markers = [
+        "solved", "cured", "discovered", "confirmed", "proven", "definitive",
+        "breakthrough", "revolutionary", "successfully", "first ever", "extends",
+        "conclusive", "unprecedented", "remarkable", "definitively"
+    ]
+
+    # Check if claim text is available in final assessment
+    claim_text = None
+    claim_has_strong_assertions = False
+
+    # Extract claim from final assessment if available
+    for item in classification_results:
+        if "final_assessment" in item and item["final_assessment"]:
+            match = re.search(r'the claim (?:that )?"?([^"]+)"?', item["final_assessment"], re.IGNORECASE)
+            if match:
+                claim_text = match.group(1)
+                claim_has_strong_assertions = any(marker in claim_text.lower() for marker in strong_assertion_markers)
+                break
+
+    # If we couldn't extract the claim, check evidence context for assertion indicators
+    if not claim_text:
+        # Check if evidence reasons suggest dealing with strong assertions
+        assertion_context_indicators = ["conclusive evidence", "definitive proof", "solved", "breakthrough",
+                                        "revolutionary", "directly confirms", "specific confirmation"]
+
+        reasons = [item.get("reason", "").lower() for item in classification_results if "reason" in item]
+        assertion_indicators_count = sum(1 for indicator in assertion_context_indicators
+                                         for reason in reasons if indicator in reason)
+
+        claim_has_strong_assertions = assertion_indicators_count >= 2
+
+    # Extract final assessment if present
+    final_assessment = None
+    for item in classification_results:
+        if "final_assessment" in item and item["final_assessment"]:
+            final_assessment = item["final_assessment"]
+            break
+
+    # Count evidence by classification
+    support_items = [item for item in classification_results if item.get("label") == "support"]
+    contradict_items = [item for item in classification_results if item.get("label") == "contradict"]
+    insufficient_items = [item for item in classification_results if item.get("label") in ["insufficient", "neutral"]]
+    relevant_items = [item for item in classification_results
+                      if item.get("relevance") == "relevant" or item.get("label") in ["support", "contradict"]]
+
+    # Calculate the proportion of supported evidence
+    total_relevant = len(relevant_items)
+
+    # Direct keyword detection from final assessment or evidence
+    if final_assessment:
+        # Check for support indicators in final assessment
+        supports_pattern = r'\b(support|confirm|verify|true|correct|released|proves|validates|evidence (?:that |for |of )(?:the claim|it) is true)\b'
+        contradicts_pattern = r'\b(contradict|refute|deny|false|incorrect|not released|doesn\'t support|does not support|no evidence|cannot support|is not true|evidence (?:that |for |of )(?:the claim|it) is false)\b'
+        insufficient_pattern = r'\b(uncertain|insufficient|not enough|inconclusive|cannot determine|unable to determine|lack of evidence)\b'
+
+        supports_match = re.search(supports_pattern, final_assessment, re.IGNORECASE)
+        contradicts_match = re.search(contradicts_pattern, final_assessment, re.IGNORECASE)
+        insufficient_match = re.search(insufficient_pattern, final_assessment, re.IGNORECASE)
+
+        # Direct determination based on final assessment keywords
+        if supports_match and not contradicts_match and not insufficient_match:
+            # Get max confidence from supporting evidence
+            confidence = max([item.get("confidence", 0) for item in support_items]) if support_items else 0.7
+
+            # Adjust confidence for claims with strong assertions
+            if claim_has_strong_assertions:
+                confidence = min(confidence, 0.8)  # Cap confidence for strong assertions
+
+            return "True (Based on Evidence)", max(0.6, confidence)  # Minimum 0.6 confidence
+
+        if contradicts_match and not supports_match:
+            # Get max confidence from contradicting evidence
+            confidence = max([item.get("confidence", 0) for item in contradict_items]) if contradict_items else 0.7
+
+            # For claims with strong assertions, increase confidence in contradiction
+            if claim_has_strong_assertions:
+                confidence = max(confidence, 0.7)  # Minimum 0.7 confidence for contradicting strong assertions
+
+            return "False (Based on Evidence)", max(0.6, confidence)  # Minimum 0.6 confidence
+
+        if insufficient_match:
+            # For claims with strong assertions without confirming evidence,
+            # change "Uncertain" to a clearer negative verdict
+            if claim_has_strong_assertions:
+                return "False (Based on Evidence)", 0.7
+            return "Uncertain", 0.4  # Medium-low confidence
+
+    # If we have distinct classifications, weigh them by confidence and quantity
+    if support_items and (not contradict_items or all(item.get("confidence", 0) < 0.95 for item in contradict_items)):
+        # Check if there's high confidence support evidence (greater than 0.95)
+        high_confidence_support = [item for item in support_items if item.get("confidence", 0) > 0.95]
+
+        if high_confidence_support:
+            # High confidence support evidence exists, use it even if there are some contradictions
+            confidence = max([item.get("confidence", 0) for item in high_confidence_support])
+            # For claims with strong assertions, be more conservative with pure support
+            if claim_has_strong_assertions:
+                confidence = min(confidence, 0.8)
+            return "True (Based on Evidence)", max(0.7, confidence)
+        elif not contradict_items:
+            # All supportive evidence with no contradictions (standard case)
+            confidence = max([item.get("confidence", 0) for item in support_items])
+
+            # For claims with strong assertions, be more conservative with pure support
+            if claim_has_strong_assertions:
+                # For strong assertions with only support but no contradictions, be cautious
+                confidence = min(confidence, 0.7)
+                # If the support is from low-quality evidence, consider it uncertain
+                support_reasons = [item.get("reason", "").lower() for item in support_items]
+                weak_supports = sum(1 for reason in support_reasons if
+                                    "general information" in reason or
+                                    "doesn't specify" in reason or
+                                    "does not directly" in reason)
+                if weak_supports / max(1, len(support_items)) > 0.5:
+                    return "Uncertain", 0.6
+
+            return "True (Based on Evidence)", max(0.6, confidence)
+
+    if contradict_items and not support_items:
+        # All contradicting evidence
+        confidence = max([item.get("confidence", 0) for item in contradict_items])
+
+        # For claims with strong assertions, increase confidence in contradiction
+        if claim_has_strong_assertions:
+            confidence = max(confidence, 0.7)
+
+        return "False (Based on Evidence)", max(0.6, confidence)
+
+    if insufficient_items and len(insufficient_items) > len(support_items) + len(contradict_items):
+        # Mostly insufficient evidence
+        # For claims with strong assertions and mainly insufficient evidence, lean toward "False"
+        if claim_has_strong_assertions:
+            return "False (Based on Evidence)", 0.7
+        return "Uncertain", 0.5  # Medium confidence for explicitly uncertain
+
+    if support_items and contradict_items:
+        # Competing evidence - compare confidence and quantity
+        support_confidence = max([item.get("confidence", 0) for item in support_items])
+        contradict_confidence = max([item.get("confidence", 0) for item in contradict_items])
+
+        # For claims with strong assertions, require stronger support to overcome contradiction
+        if claim_has_strong_assertions:
+            # Higher threshold for strong assertions
+            if support_confidence > contradict_confidence + 0.3:
+                return "True (Based on Evidence)", support_confidence * 0.9  # Apply a confidence penalty
+            elif contradict_confidence >= support_confidence - 0.1:  # Lower threshold for contradiction
+                return "False (Based on Evidence)", max(contradict_confidence, 0.7)  # Minimum 0.7 confidence
+            else:
+                # Default to uncertain for close calls on strong assertions
+                return "Uncertain", 0.6
+        else:
+            # Standard threshold for regular claims
+            if support_confidence > contradict_confidence + 0.2:
+                return "True (Based on Evidence)", support_confidence
+            elif contradict_confidence > support_confidence + 0.2:
+                return "False (Based on Evidence)", contradict_confidence
+            else:
+                # Close call - check quantity of evidence
+                if len(support_items) > len(contradict_items) * 2:
+                    return "True (Based on Evidence)", support_confidence * 0.9  # Slight confidence penalty
+                elif len(contradict_items) > len(support_items) * 2:
+                    return "False (Based on Evidence)", contradict_confidence * 0.9  # Slight confidence penalty
+                else:
+                    # Truly conflicting evidence
+                    return "Uncertain", 0.5  # Medium confidence
+
+    # Check for evidence quality issues
+    all_unknown = all(item.get("label") == "unknown" for item in classification_results)
+    evidence_text = " ".join([str(item.get("evidence", "")) for item in classification_results])
+
+    # General case: For any claims with all unknown labels that contain markers of strong assertions
+    if all_unknown and claim_has_strong_assertions:
+        # Absence of clear supporting evidence for claims with strong assertions points to "False"
+        return "False (Based on Evidence)", 0.7
+
+    # For general claims, if all items are unknown but evidence clearly mentions the claim
+    if all_unknown:
+        # Examples of direct evidence matching as fallback
+        if re.search(r'\bllama\s*4\b', evidence_text, re.IGNORECASE) and re.search(r'\bmeta\b|\bfacebook\b', evidence_text, re.IGNORECASE) and re.search(r'\breleas', evidence_text, re.IGNORECASE):
+            return "True (Based on Evidence)", 0.7
+        elif re.search(r'\bnot\s+releas', evidence_text, re.IGNORECASE) or re.search(r'\bdenies\b|\bdenied\b', evidence_text, re.IGNORECASE):
+            return "False (Based on Evidence)", 0.7
+
+    # Default to uncertain if no clear pattern - but with special case for claims with strong assertions
+    if claim_has_strong_assertions:
+        # For claims with strong assertions with no clear evidence, default to false
+        return "False (Based on Evidence)", 0.7
+
+    return "Uncertain", 0.3
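
An end-to-end sketch of this module's pipeline, under the assumption that the LLM credentials are configured; the claim and evidence strings are invented and the printed verdict is only a plausible outcome, not a guaranteed one:

```python
# Usage sketch for classification (hypothetical claim and evidence).
from modules.classification import classify_with_llm, aggregate_evidence

claim = "Meta has released Llama 4."
evidence = [
    "Title: Meta ships Llama 4, Content: Meta announced the release of Llama 4 today...",
    "Title: Model roadmap, Content: A successor model has been proposed but not yet released...",
]

results = classify_with_llm(claim, evidence)   # one LLM call, parsed per evidence item
verdict, confidence = aggregate_evidence(results)
print(verdict, round(confidence, 2))           # e.g. "True (Based on Evidence)" 0.8
```
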
modules/evidence_retrieval.py
ADDED
@@ -0,0 +1,944 @@
1 |
+
import logging
|
2 |
+
import time
|
3 |
+
import re
|
4 |
+
import random
|
5 |
+
import requests
|
6 |
+
import json
|
7 |
+
import ssl
|
8 |
+
from urllib.parse import urlencode
|
9 |
+
from bs4 import BeautifulSoup
|
10 |
+
from SPARQLWrapper import SPARQLWrapper, JSON
|
11 |
+
from datetime import datetime, timedelta
|
12 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED
|
13 |
+
|
14 |
+
from utils.api_utils import api_error_handler, safe_json_parse
|
15 |
+
from utils.models import get_nlp_model
|
16 |
+
from modules.claim_extraction import shorten_claim_for_evidence, extract_claims
|
17 |
+
from modules.rss_feed import retrieve_evidence_from_rss
|
18 |
+
from modules.semantic_analysis import analyze_evidence_relevance, select_diverse_evidence
|
19 |
+
from config import SOURCE_CREDIBILITY, NEWS_API_KEY, FACTCHECK_API_KEY
|
20 |
+
|
21 |
+
# Import the performance tracker
|
22 |
+
from utils.performance import PerformanceTracker
|
23 |
+
performance_tracker = PerformanceTracker()
|
24 |
+
|
25 |
+
logger = logging.getLogger("misinformation_detector")
|
26 |
+
|
27 |
+
# Define early analysis function at the module level so it's available everywhere
|
28 |
+
def analyze_early_evidence(claim, source_name, source_evidence):
|
29 |
+
"""Pre-analyze evidence while waiting for other sources to complete"""
|
30 |
+
try:
|
31 |
+
if not source_evidence:
|
32 |
+
return None
|
33 |
+
|
34 |
+
logger.info(f"Pre-analyzing {len(source_evidence)} evidence items from {source_name}")
|
35 |
+
|
36 |
+
# Do a quick relevance check using similarity scoring
|
37 |
+
nlp_model = get_nlp_model()
|
38 |
+
claim_doc = nlp_model(claim)
|
39 |
+
|
40 |
+
relevant_evidence = []
|
41 |
+
for evidence in source_evidence:
|
42 |
+
if not isinstance(evidence, str):
|
43 |
+
continue
|
44 |
+
|
45 |
+
# Look for direct keyword matches first (fast check)
|
46 |
+
is_related = False
|
47 |
+
keywords = [word.lower() for word in claim.split() if len(word) > 3]
|
48 |
+
for keyword in keywords:
|
49 |
+
if keyword in evidence.lower():
|
50 |
+
is_related = True
|
51 |
+
break
|
52 |
+
|
53 |
+
# If no keywords match, do a basic entity check
|
54 |
+
if not is_related:
|
55 |
+
# Check if claim and evidence share any entities
|
56 |
+
evidence_doc = nlp_model(evidence[:500]) # Limit for speed
|
57 |
+
claim_entities = [ent.text.lower() for ent in claim_doc.ents]
|
58 |
+
evidence_entities = [ent.text.lower() for ent in evidence_doc.ents]
|
59 |
+
|
60 |
+
common_entities = set(claim_entities).intersection(set(evidence_entities))
|
61 |
+
if common_entities:
|
62 |
+
is_related = True
|
63 |
+
|
64 |
+
if is_related:
|
65 |
+
relevant_evidence.append(evidence)
|
66 |
+
|
67 |
+
logger.info(f"Found {len(relevant_evidence)} relevant items out of {len(source_evidence)} from {source_name}")
|
68 |
+
return relevant_evidence
|
69 |
+
except Exception as e:
|
70 |
+
logger.error(f"Error in early evidence analysis: {e}")
|
71 |
+
return source_evidence # On error, return original evidence
|
72 |
+
|
73 |
+
# New function to get recent date for filtering news
|
74 |
+
def get_recent_date_range():
|
75 |
+
"""Return date range for recent news filtering - last 3 days"""
|
76 |
+
today = datetime.now()
|
77 |
+
three_days_ago = today - timedelta(days=3)
|
78 |
+
return three_days_ago.strftime('%Y-%m-%d'), today.strftime('%Y-%m-%d')
|
79 |
+
|
80 |
+
@api_error_handler("wikipedia")
|
81 |
+
def retrieve_evidence_from_wikipedia(claim):
|
82 |
+
"""Retrieve evidence from Wikipedia for a given claim"""
|
83 |
+
logger.info(f"Retrieving evidence from Wikipedia for: {claim}")
|
84 |
+
|
85 |
+
# Ensure shortened_claim is a string
|
86 |
+
try:
|
87 |
+
shortened_claim = shorten_claim_for_evidence(claim)
|
88 |
+
except Exception as e:
|
89 |
+
logger.error(f"Error in claim shortening: {e}")
|
90 |
+
shortened_claim = claim # Fallback to original claim
|
91 |
+
|
92 |
+
# Ensure query_parts is a list of strings
|
93 |
+
query_parts = str(shortened_claim).split()
|
94 |
+
evidence = []
|
95 |
+
source_count = {"wikipedia": 0}
|
96 |
+
|
97 |
+
for i in range(len(query_parts), 0, -1): # Start with full query, shorten iteratively
|
98 |
+
try:
|
99 |
+
# Safely join and encode query
|
100 |
+
current_query = "+".join(query_parts[:i])
|
101 |
+
search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={current_query}&format=json"
|
102 |
+
logger.info(f"Wikipedia search URL: {search_url}")
|
103 |
+
|
104 |
+
headers = {
|
105 |
+
"User-Agent": "MisinformationDetectionResearchBot/1.0 (Research Project)"
|
106 |
+
}
|
107 |
+
|
108 |
+
# Make the search request with reduced timeout
|
109 |
+
response = requests.get(search_url, headers=headers, timeout=7)
|
110 |
+
response.raise_for_status()
|
111 |
+
|
112 |
+
# Safely parse JSON
|
113 |
+
search_data = safe_json_parse(response, "wikipedia")
|
114 |
+
|
115 |
+
# Safely extract search results
|
116 |
+
search_results = search_data.get("query", {}).get("search", [])
|
117 |
+
|
118 |
+
# Ensure search_results is a list
|
119 |
+
if not isinstance(search_results, list):
|
120 |
+
logger.warning(f"Unexpected search results type: {type(search_results)}")
|
121 |
+
search_results = []
|
122 |
+
|
123 |
+
# Use ThreadPoolExecutor to fetch page content in parallel
|
124 |
+
with ThreadPoolExecutor(max_workers=3) as executor:
|
125 |
+
# Submit up to 3 page requests in parallel
|
126 |
+
futures = []
|
127 |
+
for idx, result in enumerate(search_results[:3]):
|
128 |
+
# Ensure result is a dictionary
|
129 |
+
if not isinstance(result, dict):
|
130 |
+
logger.warning(f"Skipping non-dictionary result: {type(result)}")
|
131 |
+
continue
|
132 |
+
|
133 |
+
# Safely extract title
|
134 |
+
page_title = result.get("title", "")
|
135 |
+
if not page_title:
|
136 |
+
continue
|
137 |
+
|
138 |
+
page_url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
|
139 |
+
|
140 |
+
# Submit the page request task to executor
|
141 |
+
futures.append(executor.submit(
|
142 |
+
fetch_wikipedia_page_content,
|
143 |
+
page_url,
|
144 |
+
page_title,
|
145 |
+
headers
|
146 |
+
))
|
147 |
+
|
148 |
+
# Process completed futures as they finish
|
149 |
+
for future in as_completed(futures):
|
150 |
+
try:
|
151 |
+
page_result = future.result()
|
152 |
+
if page_result:
|
153 |
+
evidence.append(page_result)
|
154 |
+
source_count["wikipedia"] += 1
|
155 |
+
except Exception as e:
|
156 |
+
logger.error(f"Error processing Wikipedia page: {e}")
|
157 |
+
|
158 |
+
# Stop if we found any evidence
|
159 |
+
if evidence:
|
160 |
+
break
|
161 |
+
|
162 |
+
except Exception as e:
|
163 |
+
logger.error(f"Error retrieving from Wikipedia: {str(e)}")
|
164 |
+
continue
|
165 |
+
|
166 |
+
# Ensure success is a boolean
|
167 |
+
success = bool(evidence)
|
168 |
+
|
169 |
+
# Safely log evidence retrieval
|
170 |
+
try:
|
171 |
+
performance_tracker.log_evidence_retrieval(success, source_count)
|
172 |
+
except Exception as e:
|
173 |
+
logger.error(f"Error logging evidence retrieval: {e}")
|
174 |
+
|
175 |
+
if not evidence:
|
176 |
+
logger.warning("No evidence found from Wikipedia.")
|
177 |
+
|
178 |
+
return evidence
|
179 |
+
|
180 |
+
def fetch_wikipedia_page_content(page_url, page_title, headers):
|
181 |
+
"""Helper function to fetch and parse Wikipedia page content"""
|
182 |
+
try:
|
183 |
+
# Get page content with reduced timeout
|
184 |
+
page_response = requests.get(page_url, headers=headers, timeout=5)
|
185 |
+
page_response.raise_for_status()
|
186 |
+
|
187 |
+
# Extract relevant sections using BeautifulSoup
|
188 |
+
soup = BeautifulSoup(page_response.text, 'html.parser')
|
189 |
+
paragraphs = soup.find_all('p', limit=3) # Limit to first 3 paragraphs
|
190 |
+
content = " ".join([para.get_text(strip=True) for para in paragraphs])
|
191 |
+
|
192 |
+
# Truncate content to reduce token usage earlier in the pipeline
|
193 |
+
if len(content) > 300:
|
194 |
+
content = content[:297] + "..."
|
195 |
+
|
196 |
+
if content.strip(): # Ensure content is not empty
|
197 |
+
return f"Title: {page_title}, URL: {page_url}, Content: {content}"
|
198 |
+
return None
|
199 |
+
except Exception as e:
|
200 |
+
logger.error(f"Error fetching Wikipedia page {page_url}: {e}")
|
201 |
+
return None
|
202 |
+
|
203 |
+
# Update the WikiData function to fix SSL issues
|
204 |
+
@api_error_handler("wikidata")
|
205 |
+
def retrieve_evidence_from_wikidata(claim):
|
206 |
+
"""Retrieve evidence from WikiData for a given claim"""
|
207 |
+
logger.info(f"Retrieving evidence from WikiData for: {claim}")
|
208 |
+
|
209 |
+
# Prepare entities for SPARQL query
|
210 |
+
shortened_claim = shorten_claim_for_evidence(claim)
|
211 |
+
query_terms = shortened_claim.split()
|
212 |
+
|
213 |
+
# Initialize SPARQLWrapper for WikiData
|
214 |
+
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
|
215 |
+
|
216 |
+
# Use a more conservative user agent to avoid blocks
|
217 |
+
sparql.addCustomHttpHeader("User-Agent", "MisinformationDetectionResearchBot/1.0")
|
218 |
+
|
219 |
+
# Fix SSL issues by disabling SSL verification for this specific request
|
220 |
+
try:
|
221 |
+
# Create a context where we don't verify SSL certs
|
222 |
+
import ssl
|
223 |
+
import urllib.request
|
224 |
+
|
225 |
+
# Create a context that doesn't verify certificates
|
226 |
+
ssl_context = ssl._create_unverified_context()
|
227 |
+
|
228 |
+
# Monkey patch the opener for SPARQLWrapper
|
229 |
+
opener = urllib.request.build_opener(urllib.request.HTTPSHandler(context=ssl_context))
|
230 |
+
urllib.request.install_opener(opener)
|
231 |
+
except Exception as e:
|
232 |
+
logger.error(f"Error setting up SSL context: {str(e)}")
|
233 |
+
|
234 |
+
# Construct basic SPARQL query for relevant entities
|
235 |
+
query = """
|
236 |
+
SELECT ?item ?itemLabel ?description ?article WHERE {
|
237 |
+
SERVICE wikibase:mwapi {
|
238 |
+
bd:serviceParam wikibase:api "EntitySearch" .
|
239 |
+
bd:serviceParam wikibase:endpoint "www.wikidata.org" .
|
240 |
+
bd:serviceParam mwapi:search "%s" .
|
241 |
+
bd:serviceParam mwapi:language "en" .
|
242 |
+
?item wikibase:apiOutputItem mwapi:item .
|
243 |
+
}
|
244 |
+
?item schema:description ?description .
|
245 |
+
FILTER(LANG(?description) = "en")
|
246 |
+
OPTIONAL {
|
247 |
+
?article schema:about ?item .
|
248 |
+
?article schema:isPartOf <https://en.wikipedia.org/> .
|
249 |
+
}
|
250 |
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
|
251 |
+
}
|
252 |
+
LIMIT 5
|
253 |
+
""" % " ".join(query_terms)
|
254 |
+
|
255 |
+
sparql.setQuery(query)
|
256 |
+
sparql.setReturnFormat(JSON)
|
257 |
+
|
258 |
+
try:
|
259 |
+
results = sparql.query().convert()
|
260 |
+
|
261 |
+
wikidata_evidence = []
|
262 |
+
|
263 |
+
for result in results["results"]["bindings"]:
|
264 |
+
entity_label = result.get("itemLabel", {}).get("value", "Unknown")
|
265 |
+
description = result.get("description", {}).get("value", "No description")
|
266 |
+
article_url = result.get("article", {}).get("value", "")
|
267 |
+
|
268 |
+
# Truncate description to reduce token usage
|
269 |
+
if len(description) > 200:
|
270 |
+
description = description[:197] + "..."
|
271 |
+
|
272 |
+
evidence_text = f"Entity: {entity_label}, Description: {description}"
|
273 |
+
if article_url:
|
274 |
+
evidence_text += f", URL: {article_url}"
|
275 |
+
|
276 |
+
wikidata_evidence.append(evidence_text)
|
277 |
+
|
278 |
+
logger.info(f"Retrieved {len(wikidata_evidence)} WikiData entities")
|
279 |
+
return wikidata_evidence
|
280 |
+
|
281 |
+
except Exception as e:
|
282 |
+
logger.error(f"Error retrieving from WikiData: {str(e)}")
|
283 |
+
return []
|
284 |
+
|
285 |
+
@api_error_handler("openalex")
|
286 |
+
def retrieve_evidence_from_openalex(claim):
|
287 |
+
"""Retrieve evidence from OpenAlex for a given claim (replacement for Semantic Scholar)"""
|
288 |
+
logger.info(f"Retrieving evidence from OpenAlex for: {claim}")
|
289 |
+
|
290 |
+
try:
|
291 |
+
shortened_claim = shorten_claim_for_evidence(claim)
|
292 |
+
query = shortened_claim.replace(" ", "+")
|
293 |
+
|
294 |
+
# OpenAlex API endpoint
|
295 |
+
api_url = f"https://api.openalex.org/works?search={query}&filter=is_paratext:false&per_page=3"
|
296 |
+
|
297 |
+
headers = {
|
298 |
+
"Accept": "application/json",
|
299 |
+
"User-Agent": "MisinformationDetectionResearchBot/1.0 ([email protected])",
|
300 |
+
}
|
301 |
+
|
302 |
+
scholarly_evidence = []
|
303 |
+
|
304 |
+
try:
|
305 |
+
# Request with reduced timeout
|
306 |
+
response = requests.get(api_url, headers=headers, timeout=8)
|
307 |
+
|
308 |
+
# Check response status
|
309 |
+
if response.status_code == 200:
|
310 |
+
# Successfully retrieved data
|
311 |
+
data = safe_json_parse(response, "openalex")
|
312 |
+
papers = data.get("results", [])
|
313 |
+
|
314 |
+
for paper in papers:
|
315 |
+
title = paper.get("title", "Unknown Title")
|
316 |
+
abstract = paper.get("abstract_inverted_index", None)
|
317 |
+
|
318 |
+
# OpenAlex stores abstracts in an inverted index format, so we need to reconstruct it
|
319 |
+
abstract_text = "No abstract available"
|
320 |
+
if abstract:
|
321 |
+
try:
|
322 |
+
# Simple approach to reconstruct from inverted index
|
323 |
+
# For a production app, implement a proper reconstruction algorithm
|
324 |
+
words = list(abstract.keys())
|
325 |
+
abstract_text = " ".join(words[:30]) + "..."
|
326 |
+
except Exception as e:
|
327 |
+
logger.error(f"Error reconstructing abstract: {e}")
|
328 |
+
|
329 |
+
url = paper.get("doi", "")
|
330 |
+
if url and not url.startswith("http"):
|
331 |
+
url = f"https://doi.org/{url}"
|
332 |
+
|
333 |
+
year = ""
|
334 |
+
publication_date = paper.get("publication_date", "")
|
335 |
+
if publication_date:
|
336 |
+
year = publication_date.split("-")[0]
|
337 |
+
|
338 |
+
# Truncate abstract to reasonable length
|
339 |
+
if len(abstract_text) > 250:
|
340 |
+
abstract_text = abstract_text[:247] + "..."
|
341 |
+
|
342 |
+
evidence_text = f"Title: {title}, Year: {year}, Abstract: {abstract_text}, URL: {url}"
|
343 |
+
scholarly_evidence.append(evidence_text)
|
344 |
+
|
345 |
+
else:
|
346 |
+
logger.error(f"OpenAlex API error: {response.status_code}")
|
347 |
+
|
348 |
+
except requests.exceptions.Timeout:
|
349 |
+
logger.warning("OpenAlex request timed out")
|
350 |
+
except requests.exceptions.ConnectionError:
|
351 |
+
logger.warning("OpenAlex connection error")
|
352 |
+
except Exception as e:
|
353 |
+
logger.error(f"Unexpected error in OpenAlex request: {str(e)}")
|
354 |
+
|
355 |
+
logger.info(f"Retrieved {len(scholarly_evidence)} scholarly papers from OpenAlex")
|
356 |
+
return scholarly_evidence
|
357 |
+
|
358 |
+
except Exception as e:
|
359 |
+
logger.error(f"Fatal error in OpenAlex retrieval: {str(e)}")
|
360 |
+
return []
|
361 |
+
|
362 |
+
@api_error_handler("factcheck")
def retrieve_evidence_from_claimreview(claim):
    """Retrieve evidence from Google's ClaimReview for a given claim"""
    logger.info(f"Retrieving evidence from ClaimReview for: {claim}")
    factcheck_api_key = FACTCHECK_API_KEY

    # Safely shorten claim
    try:
        shortened_claim = shorten_claim_for_evidence(claim)
    except Exception as e:
        logger.error(f"Error shortening claim: {e}")
        shortened_claim = claim

    query_parts = str(shortened_claim).split()
    factcheck_results = []
    source_count = {"factcheck": 0}

    for i in range(len(query_parts), 0, -1):  # Iteratively try shorter queries
        try:
            current_query = " ".join(query_parts[:i])
            encoded_query = urlencode({"query": current_query})
            factcheck_url = f"https://factchecktools.googleapis.com/v1alpha1/claims:search?{encoded_query}&key={factcheck_api_key}"
            # Redact the API key before logging, as the NewsAPI helper below does
            logger.info(f"Factcheck URL: {factcheck_url.replace(factcheck_api_key, 'API_KEY_REDACTED')}")

            # Make request with reduced timeout
            response = requests.get(factcheck_url, timeout=7)
            response.raise_for_status()
            data = safe_json_parse(response, "factcheck")

            # Safely extract claims
            claims = data.get("claims", [])
            if not isinstance(claims, list):
                logger.warning(f"Unexpected claims type: {type(claims)}")
                claims = []

            if claims:  # If results found
                logger.info(f"Results found for query '{current_query}'.")
                for item in claims:
                    try:
                        # Ensure item is a dictionary
                        if not isinstance(item, dict):
                            logger.warning(f"Skipping non-dictionary item: {type(item)}")
                            continue

                        claim_text = str(item.get("text", ""))
                        # Truncate claim text
                        if len(claim_text) > 200:
                            claim_text = claim_text[:197] + "..."

                        reviews = item.get("claimReview", [])

                        # Ensure reviews is a list
                        if not isinstance(reviews, list):
                            logger.warning(f"Unexpected reviews type: {type(reviews)}")
                            reviews = []

                        for review in reviews:
                            # Ensure review is a dictionary
                            if not isinstance(review, dict):
                                logger.warning(f"Skipping non-dictionary review: {type(review)}")
                                continue

                            publisher = str(review.get("publisher", {}).get("name", "Unknown Source"))
                            rating = str(review.get("textualRating", "Unknown"))
                            review_url = str(review.get("url", ""))

                            if claim_text:
                                factcheck_results.append(
                                    f"Claim: {claim_text}, Rating: {rating}, " +
                                    f"Source: {publisher}, URL: {review_url}"
                                )
                                source_count["factcheck"] += 1

                    except Exception as e:
                        logger.error(f"Error processing FactCheck result: {e}")

                break  # Break once we have results
            else:
                logger.info(f"No results for query '{current_query}', trying shorter version.")

        except Exception as e:
            logger.error(f"Error in FactCheck retrieval: {e}")

    # Safely log evidence retrieval
    try:
        success = bool(factcheck_results)
        performance_tracker.log_evidence_retrieval(success, source_count)
    except Exception as e:
        logger.error(f"Error logging evidence retrieval: {e}")

    if not factcheck_results:
        logger.warning("No factcheck evidence found after trying all query variants.")

    return factcheck_results
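
# For reference, the parsing loop above reads exactly these fields from the
# Fact Check Tools response; the values here are made up for illustration:
example_factcheck_response = {
    "claims": [
        {
            "text": "Example claim text as reviewed by fact checkers",
            "claimReview": [
                {
                    "publisher": {"name": "Example Fact Check"},
                    "textualRating": "False",
                    "url": "https://example.org/fact-checks/example-claim",
                }
            ],
        }
    ]
}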
@api_error_handler("newsapi")
def retrieve_news_articles(claim):
    """Retrieve evidence from NewsAPI for a given claim with improved single request approach"""
    logger.info(f"Retrieving evidence from News API for: {claim}")

    # Get API key
    news_api_key = NEWS_API_KEY
    if not news_api_key:
        logger.error("No NewsAPI key available")
        return []

    news_results = []
    source_count = {"news": 0}

    # Get date range for recent news
    from_date, to_date = get_recent_date_range()
    logger.info(f"Filtering for news from {from_date} to {to_date}")

    try:
        # Extract a simplified claim for better matching
        shortened_claim = shorten_claim_for_evidence(claim)

        # Use a single endpoint with proper parameters
        encoded_query = urlencode({"q": shortened_claim})

        # Use the 'everything' endpoint as it's more comprehensive
        news_api_url = f"https://newsapi.org/v2/everything?{encoded_query}&apiKey={news_api_key}&language=en&pageSize=5&sortBy=publishedAt&from={from_date}&to={to_date}"

        log_url = news_api_url.replace(news_api_key, "API_KEY_REDACTED")
        logger.info(f"Requesting: {log_url}")

        # Make a single request with proper headers and reduced timeout
        headers = {
            "User-Agent": "MisinformationDetectionResearchBot/1.0",
            "X-Api-Key": news_api_key,
            "Accept": "application/json"
        }

        response = requests.get(
            news_api_url,
            headers=headers,
            timeout=8
        )

        logger.info(f"Response status: {response.status_code}")

        if response.status_code == 200:
            data = safe_json_parse(response, "newsapi")

            if data.get("status") == "ok":
                articles = data.get("articles", [])
                logger.info(f"Found {len(articles)} articles")

                for article in articles:
                    try:
                        # Robust article parsing
                        title = str(article.get("title", ""))
                        description = str(article.get("description", ""))
                        content = str(article.get("content", ""))
                        source_name = str(article.get("source", {}).get("name", "Unknown"))
                        url = str(article.get("url", ""))
                        published_at = str(article.get("publishedAt", ""))

                        # Parse date to prioritize recent content
                        article_date = None
                        try:
                            if published_at:
                                article_date = datetime.strptime(published_at.split('T')[0], '%Y-%m-%d')
                        except Exception as date_error:
                            logger.warning(f"Could not parse date: {published_at}")

                        # Calculate recency score (higher = more recent)
                        recency_score = 1.0  # Default
                        if article_date:
                            days_old = (datetime.now() - article_date).days
                            if days_old == 0:  # Today
                                recency_score = 3.0
                            elif days_old == 1:  # Yesterday
                                recency_score = 2.0

                        # Use description if content is empty or too short
                        if not content or len(content) < 50:
                            content = description

                        # Truncate content to reduce token usage
                        if len(content) > 250:
                            content = content[:247] + "..."

                        # Ensure meaningful content
                        if title and (content or description):
                            news_item = {
                                "text": (
                                    f"Title: {title}, " +
                                    f"Source: {source_name}, " +
                                    f"Date: {published_at}, " +
                                    f"URL: {url}, " +
                                    f"Content: {content}"
                                ),
                                "recency_score": recency_score,
                                "date": article_date
                            }
                            news_results.append(news_item)
                            source_count["news"] += 1
                            logger.info(f"Added article: {title}")

                    except Exception as article_error:
                        logger.error(f"Error processing article: {article_error}")

        # Sort results by recency
        if news_results:
            news_results.sort(key=lambda x: x.get('recency_score', 0), reverse=True)

    except Exception as query_error:
        logger.error(f"Error processing query: {query_error}")

    # Convert to plain text list for compatibility with existing code
    news_texts = [item["text"] for item in news_results]

    # Log evidence retrieval
    try:
        success = bool(news_texts)
        performance_tracker.log_evidence_retrieval(success, source_count)
    except Exception as log_error:
        logger.error(f"Error logging evidence retrieval: {log_error}")

    # Log results
    if news_texts:
        logger.info(f"Retrieved {len(news_texts)} news articles")
    else:
        logger.warning("No news articles found")

    return news_texts
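
# get_recent_date_range() above is defined elsewhere in this module; its
# contract is to return (from_date, to_date) as ISO-format strings for
# NewsAPI's from/to parameters. A hypothetical sketch, assuming a three-day
# window and that timedelta is imported alongside datetime:
#
#     def get_recent_date_range(days_back=3):
#         to_date = datetime.now()
#         from_date = to_date - timedelta(days=days_back)
#         return from_date.strftime("%Y-%m-%d"), to_date.strftime("%Y-%m-%d")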
def retrieve_combined_evidence(claim):
    """
    Retrieve evidence from multiple sources in parallel and analyze relevance using semantic similarity
    with category-aware source prioritization and optimized parallel processing
    """
    logger.info(f"Starting evidence retrieval for: {claim}")
    start_time = time.time()

    # Use the category detector to prioritize sources
    from modules.category_detection import get_prioritized_sources, get_category_specific_rss_feeds

    # Get source priorities based on claim category
    priorities = get_prioritized_sources(claim)
    claim_category = priorities.get("category", "general")
    requires_recent_evidence = priorities.get("requires_recent", False)

    logger.info(f"Detected claim category: {claim_category} (recent: {requires_recent_evidence})")

    # Initialize results dictionary
    results = {
        "wikipedia": [],
        "wikidata": [],
        "claimreview": [],
        "news": [],
        "scholarly": [],
        "rss": []
    }

    # Track source counts and relevant evidence
    source_counts = {}
    relevant_evidence = {}
    total_evidence_count = 0
    relevant_evidence_count = 0

    # Define primary and secondary sources outside the try block
    # so they're available in the except block
    primary_sources = []
    for source_name in priorities.get("primary", []):
        if source_name == "wikipedia":
            primary_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
        elif source_name == "wikidata":
            primary_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
        elif source_name == "claimreview":
            primary_sources.append(("claimreview", retrieve_evidence_from_claimreview, claim))
        elif source_name == "news":
            primary_sources.append(("news", retrieve_news_articles, claim))
        elif source_name == "scholarly":
            primary_sources.append(("scholarly", retrieve_evidence_from_openalex, claim))
        elif source_name == "rss":
            # Get category-specific RSS max count
            max_results = 8 if requires_recent_evidence else 5

            # If the claim is science, technology, or politics related,
            # use category-specific RSS feeds
            if claim_category in ["science", "technology", "politics"]:
                # Get specialized RSS module to temporarily use category-specific feeds
                category_feeds = get_category_specific_rss_feeds(claim_category)
                if category_feeds:
                    primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results, category_feeds))
                else:
                    primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
            else:
                primary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))

    # Prepare secondary sources
    secondary_sources = []
    for source_name in priorities.get("secondary", []):
        if source_name == "wikipedia":
            secondary_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))
        elif source_name == "wikidata":
            secondary_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))
        elif source_name == "claimreview":
            secondary_sources.append(("claimreview", retrieve_evidence_from_claimreview, claim))
        elif source_name == "news":
            secondary_sources.append(("news", retrieve_news_articles, claim))
        elif source_name == "scholarly":
            secondary_sources.append(("scholarly", retrieve_evidence_from_openalex, claim))
        elif source_name == "rss":
            max_results = 5 if requires_recent_evidence else 3
            # Use category-specific feeds if available
            if claim_category in ["science", "technology", "politics"]:
                category_feeds = get_category_specific_rss_feeds(claim_category)
                if category_feeds:
                    secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results, category_feeds))
                else:
                    secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))
            else:
                secondary_sources.append(("rss", retrieve_evidence_from_rss, claim, max_results))

    # Initialize the completed-sources tracker here, outside the try block
    # below, so the fallback path in its except clause cannot hit a NameError
    # if parallel retrieval fails before any source finishes
    completed_sources = set()

    # Optimize parallel processing for evidence retrieval with early results processing
    try:
        # Define function to safely retrieve evidence
        def safe_retrieve(source_name, retrieval_func, *args):
            try:
                source_result = retrieval_func(*args) or []
                return source_name, source_result
            except Exception as e:
                logger.error(f"Error retrieving from {source_name}: {str(e)}")
                return source_name, []

        # Define function to analyze evidence relevance
        def analyze_evidence_quick(evidence_items, claim_text):
            if not evidence_items or not claim_text:
                return []

            # Extract important keywords from claim
            keywords = [word.lower() for word in claim_text.split() if len(word) > 3]

            # Check for direct relevance
            relevant_items = []
            for evidence in evidence_items:
                if not isinstance(evidence, str):
                    continue

                evidence_lower = evidence.lower()

                # Check if evidence contains any important keywords from claim
                if any(keyword in evidence_lower for keyword in keywords):
                    relevant_items.append(evidence)
                    continue

                # Check for claim subject in evidence (e.g. "earth" in "earth is flat")
                claim_parts = claim_text.split()
                if len(claim_parts) > 0 and claim_parts[0].lower() in evidence_lower:
                    relevant_items.append(evidence)
                    continue

            return relevant_items

        # Use ThreadPoolExecutor with a reasonable number of workers.
        # Start with primary sources first - use all available sources in parallel
        with ThreadPoolExecutor(max_workers=min(4, len(primary_sources))) as executor:
            # Submit all primary source tasks
            futures_to_source = {
                executor.submit(safe_retrieve, source_name, func, *args): source_name
                for source_name, func, *args in primary_sources
            }

            # completed_sources was initialized before this try block,
            # so it is also safe to consult in the except fallback below

            # Process results as they complete using as_completed for early processing
            for future in as_completed(futures_to_source):
                try:
                    source_name, source_results = future.result()
                    results[source_name] = source_results
                    source_counts[source_name] = len(source_results)
                    completed_sources.add(source_name)
                    logger.info(f"Retrieved {len(source_results)} results from {source_name}")

                    # Quick relevance analysis
                    if source_results:
                        relevant_items = analyze_evidence_quick(source_results, claim)
                        relevant_evidence[source_name] = relevant_items
                        total_evidence_count += len(source_results)
                        relevant_evidence_count += len(relevant_items)
                        logger.info(f"Found {len(relevant_items)} relevant items out of {len(source_results)} from {source_name}")

                        # Start background pre-analysis while waiting for other sources
                        try:
                            executor.submit(
                                analyze_early_evidence,
                                claim,
                                source_name,
                                source_results
                            )
                        except Exception as e:
                            logger.error(f"Error in early evidence analysis: {e}")

                except Exception as e:
                    logger.error(f"Error processing future result: {str(e)}")

        # Check if we have sufficient RELEVANT evidence from primary sources.
        # If not enough relevant evidence, query secondary sources
        # in parallel even if we have a lot of total evidence
        if relevant_evidence_count < 2:
            logger.info(f"Only found {relevant_evidence_count} relevant evidence items, querying secondary sources")

            # Add Wikipedia and Wikidata if they weren't in primary sources and haven't been queried yet
            must_check_sources = []
            if "wikipedia" not in completed_sources:
                must_check_sources.append(("wikipedia", retrieve_evidence_from_wikipedia, claim))

            if "wikidata" not in completed_sources:
                must_check_sources.append(("wikidata", retrieve_evidence_from_wikidata, claim))

            # Combine with other secondary sources
            remaining_sources = must_check_sources + [
                (source_name, func, *args) for source_name, func, *args in secondary_sources
                if source_name not in completed_sources
            ]

            with ThreadPoolExecutor(max_workers=min(3, len(remaining_sources))) as executor:
                # Submit all secondary source tasks
                futures_to_source = {
                    executor.submit(safe_retrieve, source_name, func, *args): source_name
                    for source_name, func, *args in remaining_sources
                }

                # Process results as they complete
                for future in as_completed(futures_to_source):
                    try:
                        source_name, source_results = future.result()
                        results[source_name] = source_results
                        source_counts[source_name] = len(source_results)
                        logger.info(f"Retrieved {len(source_results)} results from {source_name}")

                        # Quick relevance analysis for these as well
                        if source_results:
                            relevant_items = analyze_evidence_quick(source_results, claim)
                            relevant_evidence[source_name] = relevant_items
                            total_evidence_count += len(source_results)
                            relevant_evidence_count += len(relevant_items)
                            logger.info(f"Found {len(relevant_items)} relevant items out of {len(source_results)} from {source_name}")
                    except Exception as e:
                        logger.error(f"Error processing future result: {str(e)}")

    except Exception as e:
        logger.error(f"Error in parallel evidence retrieval: {str(e)}")
        # Fall back to sequential retrieval as a last resort
        try:
            logger.warning("Falling back to sequential retrieval due to parallel execution failure")
            # Sequential retrieval as fallback; primary_sources was defined
            # before the try block, so it is in scope here
            for source_name, func, *args in primary_sources:
                try:
                    results[source_name] = func(*args) or []
                    source_counts[source_name] = len(results[source_name])
                except Exception as source_error:
                    logger.error(f"Error in sequential {source_name} retrieval: {str(source_error)}")

            # For sequential retrieval, always check Wikipedia and Wikidata as fallbacks
            if "wikipedia" not in completed_sources:
                try:
                    results["wikipedia"] = retrieve_evidence_from_wikipedia(claim) or []
                    source_counts["wikipedia"] = len(results["wikipedia"])
                except Exception as e:
                    logger.error(f"Error in fallback Wikipedia retrieval: {e}")

            if "wikidata" not in completed_sources:
                try:
                    results["wikidata"] = retrieve_evidence_from_wikidata(claim) or []
                    source_counts["wikidata"] = len(results["wikidata"])
                except Exception as e:
                    logger.error(f"Error in fallback Wikidata retrieval: {e}")

        except Exception as fallback_error:
            logger.error(f"Error in fallback sequential retrieval: {str(fallback_error)}")

    # Gather all evidence
    all_evidence = []
    for source, items in results.items():
        if isinstance(items, list):
            for item in items:
                if item and isinstance(item, str):
                    all_evidence.append(item)

    # Skip processing if no evidence
    if not all_evidence:
        logger.warning("No evidence collected")

        # Fallback: try direct search for the claim subject
        try:
            logger.info("No evidence found, trying fallback subject search")

            # Extract the main subject using NLP
            nlp = get_nlp_model()
            doc = nlp(claim)

            # Find main subject entities or nouns
            subjects = []
            for ent in doc.ents:
                if ent.label_ in ["PERSON", "ORG", "GPE"]:
                    subjects.append(ent.text)

            # If no entities found, use first noun phrase
            if not subjects:
                for chunk in doc.noun_chunks:
                    subjects.append(chunk.text)
                    break

            if subjects:
                # Try a direct search with just the subject
                logger.info(f"Trying fallback search with subject: {subjects[0]}")

                # Make sure we try Wikipedia for the subject regardless of priorities
                try:
                    wiki_evidence = retrieve_evidence_from_wikipedia(subjects[0]) or []
                    all_evidence.extend(wiki_evidence)
                    logger.info(f"Retrieved {len(wiki_evidence)} results from fallback Wikipedia search")
                except Exception as e:
                    logger.error(f"Error in fallback Wikipedia search: {e}")

                # If still no evidence, try other sources
                if not all_evidence:
                    # Do fallback searches in parallel
                    with ThreadPoolExecutor(max_workers=2) as executor:
                        fallback_futures = {
                            "news": executor.submit(retrieve_news_articles, subjects[0]),
                            "wikidata": executor.submit(retrieve_evidence_from_wikidata, subjects[0])
                        }

                        # Process results as they complete
                        for source, future in fallback_futures.items():
                            try:
                                fallback_results = future.result() or []
                                if fallback_results:
                                    all_evidence.extend(fallback_results[:2])  # Add up to 2 results from each
                                    logger.info(f"Retrieved {len(fallback_results)} results from fallback {source} search")
                            except Exception as e:
                                logger.error(f"Error in fallback {source} search: {str(e)}")

        except Exception as subj_error:
            logger.error(f"Error in fallback subject search: {str(subj_error)}")

        # If still no evidence, return empty list
        if not all_evidence:
            return []

    # Use semantic analysis to score and select the most relevant evidence
    try:
        # For science and technology claims, boost the weight of scholarly sources
        if claim_category in ["science", "technology"]:
            from config import SOURCE_CREDIBILITY
            # Create a temporary copy with boosted reliability for relevant sources
            enhanced_credibility = dict(SOURCE_CREDIBILITY)

            # Add enhanced weights for scientific sources
            from modules.category_detection import SOURCE_RELIABILITY_BY_CATEGORY
            for domain, reliability in SOURCE_RELIABILITY_BY_CATEGORY.get(claim_category, {}).items():
                enhanced_credibility[domain] = reliability

            # Use the enhanced credibility for evidence analysis
            analyzed_evidence = analyze_evidence_relevance(claim, all_evidence, enhanced_credibility)
        else:
            # Analyze evidence relevance using semantic similarity with default weights
            from config import SOURCE_CREDIBILITY
            analyzed_evidence = analyze_evidence_relevance(claim, all_evidence, SOURCE_CREDIBILITY)

        # Log evidence scoring
        logger.info(f"Analyzed {len(analyzed_evidence)} evidence items")

        # Select diverse, relevant evidence items
        final_evidence = select_diverse_evidence(analyzed_evidence, max_items=5)

        # Log source distribution and selected count
        logger.info(f"Evidence source distribution: {source_counts}")
        logger.info(f"Selected evidence count: {len(final_evidence)}")

        # Return maximum 5 evidence items (to control API costs)
        return final_evidence[:5]

    except Exception as e:
        logger.error(f"Error in evidence analysis: {str(e)}")
        # Fallback to simple selection (top 5 items)
        return all_evidence[:5]
modules/explanation.py
ADDED
@@ -0,0 +1,181 @@
import logging
import re
import ast
from utils.models import get_llm_model

logger = logging.getLogger("misinformation_detector")

def extract_most_relevant_evidence(evidence_results):
    """
    Intelligently extract the most relevant piece of evidence

    Args:
        evidence_results (list): List of evidence items

    Returns:
        str: Most relevant evidence piece
    """
    if not evidence_results:
        return None

    # If evidence is a dictionary with 'evidence' key
    if isinstance(evidence_results[0], dict):
        # Sort by confidence if available
        sorted_evidence = sorted(
            evidence_results,
            key=lambda x: x.get('confidence', 0),
            reverse=True
        )

        # Return the evidence from the highest confidence item
        for item in sorted_evidence:
            evidence = item.get('evidence')
            if evidence:
                return evidence

    # If plain list of evidence
    return next((ev for ev in evidence_results if ev and isinstance(ev, str)), None)
def generate_explanation(claim, evidence_results, truth_label, confidence=None):
    """
    Generate an explanation for the claim's classification

    Args:
        claim (str): The original claim
        evidence_results (list/str): Evidence supporting the classification
        truth_label (str): Classification of the claim
        confidence (float): Confidence level (0-1)

    Returns:
        str: Explanation of the claim's classification
    """
    logger.info(f"Generating explanation for claim with verdict: {truth_label}")

    try:
        # Normalize evidence_results to a list
        if not isinstance(evidence_results, list):
            try:
                evidence_results = ast.literal_eval(str(evidence_results)) if evidence_results else []
            except (ValueError, SyntaxError):
                # literal_eval could not parse the value; wrap it as a single item
                evidence_results = [evidence_results] if evidence_results else []

        # Get the LLM model
        explanation_model = get_llm_model()

        # Extract most relevant evidence
        most_relevant_evidence = extract_most_relevant_evidence(evidence_results)

        # Prepare evidence text for prompt
        evidence_text = "\n".join([
            f"Evidence {i+1}: {str(ev)[:200] + '...' if len(str(ev)) > 200 else str(ev)}"
            for i, ev in enumerate(evidence_results[:3])
        ])

        # Convert confidence to percentage and description
        confidence_desc = ""
        if confidence is not None:
            confidence_pct = int(confidence * 100)
            if confidence < 0.3:
                confidence_desc = f"very low confidence ({confidence_pct}%)"
            elif confidence < 0.5:
                confidence_desc = f"low confidence ({confidence_pct}%)"
            elif confidence < 0.7:
                confidence_desc = f"moderate confidence ({confidence_pct}%)"
            elif confidence < 0.9:
                confidence_desc = f"high confidence ({confidence_pct}%)"
            else:
                confidence_desc = f"very high confidence ({confidence_pct}%)"
        else:
            # Determine confidence context from label if not explicitly provided
            confidence_desc = (
                "high confidence" if "High Confidence" in truth_label else
                "moderate confidence" if "Likely" in truth_label else
                "low confidence"
            )

        # Create prompt with specific instructions based on the type of claim
        has_negation = any(neg in claim.lower() for neg in ["not", "no longer", "isn't", "doesn't", "won't", "cannot"])

        # For claims with "True" verdict
        if "True" in truth_label:
            prompt = f"""
            Claim: "{claim}"

            Verdict: {truth_label} (with {confidence_desc})

            Available Evidence:
            {evidence_text}

            Task: Generate a clear explanation that:
            1. Clearly states that the claim IS TRUE based on the evidence
            2. {"Pays special attention to the logical relationship since the claim contains negation" if has_negation else "Explains why the evidence supports the claim"}
            3. Uses confidence level of {confidence_desc}
            4. Highlights the most relevant supporting evidence
            5. Is factual and precise
            """

        # For claims with "False" verdict
        elif "False" in truth_label:
            prompt = f"""
            Claim: "{claim}"

            Verdict: {truth_label} (with {confidence_desc})

            Available Evidence:
            {evidence_text}

            Task: Generate a clear explanation that:
            1. Clearly states that the claim IS FALSE based on the evidence
            2. {"Pays special attention to the logical relationship since the claim contains negation" if has_negation else "Explains why the evidence contradicts the claim"}
            3. Uses confidence level of {confidence_desc}
            4. Highlights the contradicting evidence
            5. Is factual and precise

            IMPORTANT: If the claim contains negation (words like 'not', 'no longer', etc.), be extra careful with the logical relationship between the evidence and the claim.
            """

        # For uncertain claims
        else:
            prompt = f"""
            Claim: "{claim}"

            Verdict: {truth_label} (with {confidence_desc})

            Available Evidence:
            {evidence_text}

            Task: Generate a clear explanation that:
            1. Clearly states that there is insufficient evidence to determine if the claim is true or false
            2. Explains what information is missing or why the available evidence is insufficient
            3. Uses confidence level of {confidence_desc}
            4. Makes NO speculation about whether the claim might be true or false
            5. Mentions that the user should seek information from other reliable sources
            """

        # Generate explanation with multiple attempts
        max_attempts = 3
        for attempt in range(max_attempts):
            try:
                # Invoke the model
                response = explanation_model.invoke(prompt)
                explanation = response.content.strip()

                # Validate explanation length
                if explanation and len(explanation.split()) >= 5:
                    return explanation

            except Exception as attempt_error:
                logger.error(f"Explanation generation attempt {attempt+1} failed: {str(attempt_error)}")

        # Ultimate fallback explanation
        if "Uncertain" in truth_label:
            return f"The claim '{claim}' cannot be verified due to insufficient evidence. The available information does not provide clear support for or against this claim. Consider consulting reliable sources for verification."
        elif "True" in truth_label:
            return f"The claim '{claim}' is supported by the evidence with {confidence_desc}. {most_relevant_evidence or 'The evidence indicates this claim is accurate.'}"
        else:
            return f"The claim '{claim}' is contradicted by the evidence with {confidence_desc}. {most_relevant_evidence or 'The evidence indicates this claim is not accurate.'}"

    except Exception as e:
        logger.error(f"Comprehensive error in explanation generation: {str(e)}")
        # Final fallback
        return f"The claim is classified as {truth_label} based on the available evidence."
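
# Minimal usage sketch, assuming an evidence string in the format the
# retrieval module emits and a verdict label containing the "True" and
# "High Confidence" substrings the function checks for:
if __name__ == "__main__":
    sample_evidence = [
        "Title: Example article, Source: example.org, Date: 2024-01-01, "
        "URL: https://example.org/a, Content: Example supporting content"
    ]
    print(generate_explanation(
        claim="The Eiffel Tower is located in Paris",
        evidence_results=sample_evidence,
        truth_label="True (High Confidence)",
        confidence=0.9,
    ))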
modules/rss_feed.py
ADDED
@@ -0,0 +1,391 @@
import feedparser
import time
import logging
import re
import ssl
import requests
from datetime import datetime, timedelta
from threading import Timer
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

logger = logging.getLogger("misinformation_detector")

# Disable SSL certificate verification for feeds with self-signed certs
ssl._create_default_https_context = ssl._create_unverified_context

# List of RSS feeds to check for news
# These are popular news sources with reliable and frequently updated RSS feeds
RSS_FEEDS = [
    # --------------------
    # 🌐 General World News
    # --------------------
    # "http://rss.cnn.com/rss/cnn_topstories.rss",  # CNN Top Stories; Removed in round 2
    "http://rss.cnn.com/rss/cnn_world.rss",  # CNN World News; Duplicate with category_detection
    # "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",  # NYT Home Page
    "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",  # NYT World News; Duplicate with category_detection
    # "https://rss.nytimes.com/services/xml/rss/nyt/US.xml",  # NYT US News
    "https://feeds.washingtonpost.com/rss/world",  # The Washington Post World News; Removed in round 2
    # "https://feeds.washingtonpost.com/rss/national",  # The Washington Post National News
    # "https://feeds.bbci.co.uk/news/rss.xml",  # BBC News - Top Stories; Removed in round 2
    "https://feeds.bbci.co.uk/news/world/rss.xml",  # BBC News - World
    # "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=w&hl=en-IN",  # Google News India - World; Removed in round 2
    # "https://news.google.com/rss?gl=US&ceid=US:en&topic=w&hl=en-US",  # Google News US - World; Removed in round 2

    # --------------------
    # 🧠 Tech & Startup News (Global)
    # --------------------
    "https://techcrunch.com/feed/",  # TechCrunch - Startup and Technology News; Duplicate with category_detection
    "https://venturebeat.com/feed/",  # VentureBeat - Tech News
    # "https://www.theverge.com/rss/index.xml",  # The Verge - Technology News
    "https://www.wired.com/feed/rss",  # Wired - Technology News
    "https://www.cnet.com/rss/news/",  # CNET - Technology News
    # "https://sifted.eu/feed/",  # Sifted - European Startups and Tech
    # "https://feeds.feedburner.com/fastcompany/headlines",  # Fast Company - Business Innovation
    # "https://feeds.bbci.co.uk/news/technology/rss.xml",  # BBC News - Technology
    "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=t&hl=en-IN",  # Google News India - Technology
    "https://news.google.com/rss?gl=US&ceid=US:en&topic=t&hl=en-US",  # Google News US - Technology

    # --------------------
    # 💼 Startup & VC Focused
    # --------------------
    "https://news.crunchbase.com/feed/",  # Crunchbase News - Startup Funding
    # "https://avc.com/feed/",  # AVC - Musings of a VC in NYC
    "https://techstartups.com/feed/",  # Tech Startups - Startup News
    # "https://tech.eu/feed/",  # Tech.eu - European Tech News
    # "https://www.menabytes.com/feed/",  # MENAbytes - Middle East & North Africa Startups
    # "http://feeds.feedburner.com/venturebeat/SZYF",  # VentureBeat - Deals

    # --------------------
    # 📰 Global Business & Corporate Feeds
    # --------------------
    "https://feeds.bloomberg.com/technology/news.rss",  # Bloomberg Technology News
    "https://www.ft.com/technology?format=rss",  # Financial Times Technology News
    # "https://ir.thomsonreuters.com/rss/news-releases.xml",  # Thomson Reuters Press Releases
    # "https://feeds.bbci.co.uk/news/business/rss.xml",  # BBC News - Business
    "https://news.google.com/rss?gl=IN&ceid=IN:en&topic=b&hl=en-IN",  # Google News India - Business
    # "https://news.google.com/rss?gl=US&ceid=US:en&topic=b&hl=en-US",  # Google News US - Business; Removed in round 2

    # --------------------
    # 🇮🇳 India-specific News
    # --------------------
    "https://inc42.com/feed/",  # Inc42 - Indian Startups and Technology
    # "https://yourstory.com/rss",  # YourStory - Indian Startup Stories
    # "https://economictimes.indiatimes.com/startups/rssfeeds/49979279.cms",  # Economic Times - Startups
    "https://timesofindia.indiatimes.com/rssfeedstopstories.cms",  # TOI - Top Stories
    "https://timesofindia.indiatimes.com/rssfeedmostrecent.cms",  # TOI - Most Recent Stories
    "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms",  # TOI - India News
    "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",  # TOI - World News
    "https://timesofindia.indiatimes.com/rssfeeds/1898055.cms",  # TOI - Business News
    "https://timesofindia.indiatimes.com/rssfeeds/54829575.cms",  # TOI - Cricket News
    "https://timesofindia.indiatimes.com/rssfeeds/4719148.cms",  # TOI - Sports News
    "https://timesofindia.indiatimes.com/rssfeeds/-2128672765.cms",  # TOI - Science News
    # "https://timesofindia.indiatimes.com/rssfeeds/66949542.cms",  # TOI - Technology News
    # "https://timesofindia.indiatimes.com/rssfeeds/1081479906.cms",  # TOI - Education News

    # --------------------
    # 🏏 Sports News (Global + Cricket)
    # --------------------
    "https://www.espn.com/espn/rss/news",  # ESPN - Top Sports News; Duplicate with category_detection
    # "https://api.foxsports.com/v2/content/optimized-rss?partnerKey=MB0Wehpmuj2lUhuRhQaafhBjAJqaPU244mlTDK1i&size=30",  # Fox Sports; Removed in round 2
    "https://feeds.skynews.com/feeds/rss/sports.xml",  # Sky News - Sports
    "https://sports.ndtv.com/rss/all",  # NDTV Sports
    "https://www.espncricinfo.com/rss/content/story/feeds/0.xml",  # ESPN Cricinfo - Cricket News; Duplicate with category_detection
    # "https://crickettimes.com/feed/",  # Cricket Times - Cricket News

    # --------------------
    # ✅ Fact-Checking Sources
    # --------------------
    "https://www.snopes.com/feed/",  # Snopes - Fact Checking; Duplicate with category_detection
    "https://www.politifact.com/rss/all/",  # PolitiFact - Fact Checking; Duplicate with category_detection

    # --------------------
    # 🗳️ Politics & Policy (General)
    # --------------------
    "https://feeds.bbci.co.uk/news/politics/rss.xml",  # BBC News - Politics; Duplicate with category_detection
    "https://feeds.bbci.co.uk/news/science_and_environment/rss.xml",  # BBC - Science & Environment

    # --------------------
    # 🔬 Science
    # --------------------
    "https://www.nature.com/nature.rss",  # Nature science; Duplicate with category_detection
    "https://feeds.science.org/rss/science-advances.xml"  # science.org
]
def clean_html(raw_html):
    """Remove HTML tags from text"""
    if not raw_html:
        return ""
    clean_regex = re.compile('<.*?>')
    clean_text = re.sub(clean_regex, '', raw_html)
    # Remove extra whitespace
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    return clean_text

def parse_feed(feed_url, timeout=5):
    """
    Parse a single RSS feed with proper timeout handling.
    Uses requests with timeout first, then passes content to feedparser.
    """
    try:
        # Use requests with timeout to fetch the RSS content
        response = requests.get(feed_url, timeout=timeout)
        response.raise_for_status()

        # Then parse the content with feedparser (which doesn't support timeout)
        feed = feedparser.parse(response.content)

        # Basic validation of the feed
        if hasattr(feed, 'entries') and feed.entries:
            return feed
        else:
            logger.warning(f"Feed {feed_url} parsed but contains no entries")
            return None

    except requests.exceptions.Timeout:
        logger.warning(f"Timeout while fetching feed {feed_url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error fetching feed {feed_url}: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Error parsing feed {feed_url}: {str(e)}")
        return None
def fetch_all_feeds(feeds_list=None, max_workers=5, timeout=5):
    """
    Fetch multiple RSS feeds with proper timeout handling.
    Returns a list of (domain, feed) tuples for successfully fetched feeds.
    """
    # Use default RSS_FEEDS list if none provided
    if feeds_list is None:
        feeds_list = RSS_FEEDS

    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(parse_feed, url, timeout): url for url in feeds_list}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                feed = future.result()
                if feed and hasattr(feed, 'entries') and feed.entries:
                    # Extract domain for source attribution
                    domain = urlparse(url).netloc
                    results.append((domain, feed))
                    logger.info(f"Successfully fetched {domain} with {len(feed.entries)} entries")
            except Exception as e:
                logger.error(f"Error processing {url}: {str(e)}")

    return results

def extract_date(entry):
    """Extract and normalize publication date from entry"""
    for date_field in ['published_parsed', 'updated_parsed', 'created_parsed']:
        if hasattr(entry, date_field) and getattr(entry, date_field):
            try:
                # Convert time tuple to datetime
                time_tuple = getattr(entry, date_field)
                return datetime(time_tuple[0], time_tuple[1], time_tuple[2],
                                time_tuple[3], time_tuple[4], time_tuple[5])
            except Exception as e:
                logger.debug(f"Error parsing {date_field}: {e}")
                continue

    # Try string dates
    for date_field in ['published', 'updated', 'pubDate']:
        if hasattr(entry, date_field) and getattr(entry, date_field):
            try:
                date_str = getattr(entry, date_field)
                # Try various formats
                for fmt in ['%a, %d %b %Y %H:%M:%S %z', '%a, %d %b %Y %H:%M:%S %Z',
                            '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z']:
                    try:
                        return datetime.strptime(date_str, fmt)
                    except ValueError:
                        continue
            except Exception as e:
                logger.debug(f"Error parsing date string {date_field}: {e}")
                continue

    # Default to current time if parsing fails
    return datetime.now()

def is_recent(entry_date, max_days=3):
    """Check if an entry is recent (within the last few days)"""
    if not entry_date:
        return False
    cutoff = datetime.now() - timedelta(days=max_days)
    return entry_date > cutoff
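
# Minimal usage sketch for the fetch helpers above, using two feeds from the
# RSS_FEEDS list; it needs network access, so it is shown as a comment:
#
#     for domain, feed in fetch_all_feeds(
#             ["https://feeds.bbci.co.uk/news/world/rss.xml",
#              "https://www.snopes.com/feed/"], max_workers=2):
#         print(domain, len(feed.entries), is_recent(extract_date(feed.entries[0])))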
def get_entry_relevance(entry, query_terms, domain):
    """Calculate relevance score for an entry based on query match and recency"""
    if not hasattr(entry, 'title') or not entry.title:
        return 0

    # Extract text content
    title = entry.title or ""
    description = clean_html(entry.description) if hasattr(entry, 'description') else ""
    content = ""
    if hasattr(entry, 'content'):
        for content_item in entry.content:
            if 'value' in content_item:
                content += clean_html(content_item['value']) + " "

    # Extract published date
    pub_date = extract_date(entry)

    # Calculate recency score (0-1)
    recency_score = 0
    if pub_date:
        days_old = (datetime.now() - pub_date).days
        if days_old <= 1:  # Today or yesterday
            recency_score = 1.0
        elif days_old <= 2:
            recency_score = 0.8
        elif days_old <= 3:
            recency_score = 0.5
        else:
            recency_score = 0.2

    # Calculate relevance score based on keyword matches
    text = f"{title} {description} {content}".lower()

    # Count how many query terms appear in the content
    query_terms_lower = [term.lower() for term in query_terms]
    matches = sum(1 for term in query_terms_lower if term in text)

    # Calculate match score (0-1)
    match_score = min(1.0, matches / max(1, len(query_terms) * 0.7))

    # Boost score for exact phrase matches
    query_phrase = " ".join(query_terms_lower)
    if query_phrase in text:
        match_score += 0.5

    # Additional boost for title matches (they're more relevant)
    title_matches = sum(1 for term in query_terms_lower if term in title.lower())
    if title_matches > 0:
        match_score += 0.2 * (title_matches / len(query_terms_lower))

    # Source quality factor (can be adjusted based on source reliability)
    source_factor = 1.0
    high_quality_domains = ['bbc.co.uk', 'nytimes.com', 'reuters.com', 'washingtonpost.com',
                            'espncricinfo.com', 'cricbuzz.com', 'snopes.com']
    if any(quality_domain in domain for quality_domain in high_quality_domains):
        source_factor = 1.2

    # Calculate final score; parenthesized so the source quality factor
    # boosts the combined score rather than only the recency component
    final_score = ((match_score * 0.6) + (recency_score * 0.4)) * source_factor

    return min(1.0, final_score)  # Cap at 1.0
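
# Worked example of the scoring above, assuming an entry whose text matches
# 2 of 6 query terms (match_score = min(1.0, 2 / (6 * 0.7)) ≈ 0.476), that is
# 3 days old (recency_score = 0.5), from a domain without the quality boost
# (source_factor = 1.0):
#
#     final_score = min(1.0, ((0.476 * 0.6) + (0.5 * 0.4)) * 1.0) ≈ 0.486
#
# which clears the 0.3 relevance threshold used in retrieve_evidence_from_rss below.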
def retrieve_evidence_from_rss(claim, max_results=3, category_feeds=None):
    """
    Retrieve evidence from RSS feeds for a given claim

    Args:
        claim (str): The claim to verify
        max_results (int): Maximum number of results to return
        category_feeds (list, optional): List of category-specific RSS feeds to check

    Returns:
        list: List of relevant evidence items
    """
    start_time = time.time()
    logger.info(f"Retrieving evidence from RSS feeds for: {claim}")

    # Extract key terms from claim
    terms = [term.strip() for term in re.findall(r'\b\w+\b', claim) if len(term.strip()) > 2]

    try:
        # Use category-specific feeds if provided
        feeds_to_use = category_feeds if category_feeds else RSS_FEEDS

        # Log which feeds we're using
        if category_feeds:
            logger.info(f"Using {len(category_feeds)} category-specific RSS feeds")
        else:
            logger.info(f"Using {len(RSS_FEEDS)} default RSS feeds")

        # Limit the number of feeds to process for efficiency
        if len(feeds_to_use) > 10:
            # If we have too many feeds, select a subset,
            # prioritizing fact-checking sources
            fact_check_feeds = [feed for feed in feeds_to_use if "fact" in feed.lower() or "snopes" in feed.lower() or "politifact" in feed.lower()]
            other_feeds = [feed for feed in feeds_to_use if feed not in fact_check_feeds]

            # Take all fact-checking feeds plus a random selection of others
            import random
            selected_feeds = fact_check_feeds + random.sample(other_feeds, min(10 - len(fact_check_feeds), len(other_feeds)))
        else:
            selected_feeds = feeds_to_use

        # Fetch all feeds in parallel with the selected feeds
        feeds = fetch_all_feeds(selected_feeds)

        if not feeds:
            logger.warning("No RSS feeds could be fetched")
            return []

        all_entries = []

        # Process all feed entries
        for domain, feed in feeds:
            for entry in feed.entries:
                # Calculate relevance score
                relevance = get_entry_relevance(entry, terms, domain)

                if relevance > 0.3:  # Only consider somewhat relevant entries
                    # Extract entry details
                    title = entry.title if hasattr(entry, 'title') else "No title"
                    link = entry.link if hasattr(entry, 'link') else ""

                    # Extract and clean description/content
                    description = ""
                    if hasattr(entry, 'description'):
                        description = clean_html(entry.description)
                    elif hasattr(entry, 'summary'):
                        description = clean_html(entry.summary)
                    elif hasattr(entry, 'content'):
                        for content_item in entry.content:
                            if 'value' in content_item:
                                description += clean_html(content_item['value']) + " "

                    # Truncate description if too long
                    if len(description) > 250:
                        description = description[:247] + "..."

                    # Get publication date
                    pub_date = extract_date(entry)
                    date_str = pub_date.strftime('%Y-%m-%d') if pub_date else "Unknown date"

                    # Format as evidence text
                    evidence_text = (
                        f"Title: {title}, "
                        f"Source: {domain} (RSS), "
                        f"Date: {date_str}, "
                        f"URL: {link}, "
                        f"Content: {description}"
                    )

                    all_entries.append({
                        "text": evidence_text,
                        "relevance": relevance,
                        "date": pub_date or datetime.now()
                    })

        # Sort entries by relevance
        all_entries.sort(key=lambda x: x["relevance"], reverse=True)

        # Take top results
        top_entries = all_entries[:max_results]

        logger.info(f"Retrieved {len(top_entries)} relevant RSS items from {len(feeds)} feeds in {time.time() - start_time:.2f}s")

        # Return just the text portion
        return [entry["text"] for entry in top_entries]

    except Exception as e:
        logger.error(f"Error in RSS retrieval: {str(e)}")
        return []
modules/semantic_analysis.py
ADDED
@@ -0,0 +1,503 @@
import logging
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta
import re

# Import the centralized NLP model handler
from utils.models import get_nlp_model

logger = logging.getLogger("misinformation_detector")

def extract_entities(text):
    """Extract named entities from text"""
    if not text:
        return []

    try:
        # Use centralized NLP model
        nlp_model = get_nlp_model()
        doc = nlp_model(text)
        entities = [
            {
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char
            }
            for ent in doc.ents
        ]
        return entities
    except Exception as e:
        logger.error(f"Error extracting entities: {str(e)}")
        return []

def get_vector_representation(text):
    """Get vector representation of text using spaCy"""
    if not text:
        return None

    try:
        # Use centralized NLP model
        nlp_model = get_nlp_model()
        doc = nlp_model(text)

        # Return document vector if available
        if doc.has_vector:
            return doc.vector

        # Fallback: average of token vectors
        vectors = [token.vector for token in doc if token.has_vector]
        if vectors:
            return np.mean(vectors, axis=0)

        return None
    except Exception as e:
        logger.error(f"Error getting vector representation: {str(e)}")
        return None
def calculate_similarity(text1, text2):
|
60 |
+
"""Calculate semantic similarity between two texts"""
|
61 |
+
if not text1 or not text2:
|
62 |
+
return 0.0
|
63 |
+
|
64 |
+
try:
|
65 |
+
vec1 = get_vector_representation(text1)
|
66 |
+
vec2 = get_vector_representation(text2)
|
67 |
+
|
68 |
+
if vec1 is None or vec2 is None:
|
69 |
+
return 0.0
|
70 |
+
|
71 |
+
# Reshape vectors for cosine_similarity
|
72 |
+
vec1 = vec1.reshape(1, -1)
|
73 |
+
vec2 = vec2.reshape(1, -1)
|
74 |
+
|
75 |
+
# Calculate cosine similarity
|
76 |
+
similarity = cosine_similarity(vec1, vec2)[0][0]
|
77 |
+
return float(similarity)
|
78 |
+
except Exception as e:
|
79 |
+
logger.error(f"Error calculating similarity: {str(e)}")
|
80 |
+
return 0.0
|
81 |
+
|
def extract_date_from_evidence(evidence_text):
    """Extract date from evidence text"""
    if not evidence_text:
        return None

    try:
        # Look for date patterns in text
        date_patterns = [
            r'Date: (\d{4}-\d{2}-\d{2})',  # ISO format
            r'published.*?(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',  # published on MM/DD/YYYY
            r'(\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})',  # DD Month YYYY
            r'((?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4})'  # Month DD, YYYY (captured so group(1) is available)
        ]

        for pattern in date_patterns:
            match = re.search(pattern, evidence_text)
            if match:
                date_str = match.group(1)
                # Parse date string based on format
                try:
                    if re.fullmatch(r'\d{4}-\d{2}-\d{2}', date_str):
                        # Match the full ISO pattern; a plain '-' check would
                        # also swallow MM-DD-YYYY dates and never parse them
                        return datetime.strptime(date_str, '%Y-%m-%d')
                    elif '/' in date_str or '-' in date_str:
                        formats = ['%m/%d/%Y', '%d/%m/%Y', '%m-%d-%Y', '%d-%m-%Y']
                        for fmt in formats:
                            try:
                                return datetime.strptime(date_str, fmt)
                            except ValueError:
                                continue
                    else:
                        # Try different month formats
                        formats = ['%d %B %Y', '%B %d, %Y', '%B %d %Y']
                        for fmt in formats:
                            try:
                                return datetime.strptime(date_str, fmt)
                            except ValueError:
                                continue
                except Exception:
                    pass

        return None
    except Exception as e:
        logger.error(f"Error extracting date from evidence: {str(e)}")
        return None

def is_temporally_relevant(evidence_text, claim_text, max_days_old=30):
    """Check if evidence is temporally relevant to the claim"""
    # Check if claim seems to require recent evidence
    temporal_terms = ["today", "now", "current", "currently", "recent", "recently", "latest", "just", "this week", "this month", "this year"]
    requires_recent = any(term in claim_text.lower() for term in temporal_terms)

    # If claim doesn't specify temporality, consider evidence relevant
    if not requires_recent:
        return True

    # Extract date from evidence
    date = extract_date_from_evidence(evidence_text)
    if not date:
        return True  # If we can't determine date, assume it's relevant

    # Check if evidence is recent enough
    cutoff = datetime.now() - timedelta(days=max_days_old)
    return date >= cutoff

def has_authority_signal(evidence_text):
    """Check if evidence contains authority signals"""
    authority_signals = {
        "scientific_consensus": ["consensus", "scientists agree", "research shows", "studies confirm", "experts agree"],
        "fact_check": ["fact check", "rated false", "rated true", "debunked", "confirmed", "verification"],
        "high_authority": ["nasa", "world health organization", "who", "cdc", "national academy",
                           "oxford", "harvard", "stanford", "mit", "cambridge", "yale",
                           "princeton", "government", "official", "authorities", "minister",
                           "ministry", "department", "administration", "university", "professor"]
    }

    evidence_lower = evidence_text.lower()

    authority_type = None
    authority_score = 1.0

    for signal_type, phrases in authority_signals.items():
        if any(phrase in evidence_lower for phrase in phrases):
            if signal_type == "scientific_consensus":
                authority_score = 1.8
                authority_type = "scientific_consensus"
            elif signal_type == "fact_check":
                authority_score = 1.5
                authority_type = "fact_check"
            elif signal_type == "high_authority":
                authority_score = 1.3
                authority_type = "high_authority"
            break

    return authority_score, authority_type

def analyze_evidence_relevance(claim, evidence_list, source_credibility=None):
    """
    Analyze evidence relevance to claim using semantic similarity with improved handling
    for claims requiring strong evidence

    Args:
        claim (str): The claim being verified
        evidence_list (list): List of evidence items
        source_credibility (dict): Dictionary mapping source domains to credibility scores

    Returns:
        list: Sorted list of evidence items with relevance scores
    """
    if not evidence_list:
        return []

    # Ensure evidence_list is a list of strings
    if not isinstance(evidence_list, list):
        evidence_list = [str(evidence_list)]

    # Filter out None or empty items
    evidence_list = [item for item in evidence_list if item]

    # Check if claim contains strong assertions that would require specific evidence
    strong_assertion_markers = [
        "solved", "cured", "discovered", "breakthrough", "revolutionary",
        "first ever", "confirmed", "definitive", "conclusive", "proven",
        "groundbreaking", "unprecedented", "remarkable", "extends lifespan",
        "extends life", "definitively", "successfully"
    ]

    # Determine if claim contains strong assertions
    claim_has_strong_assertions = any(marker in claim.lower() for marker in strong_assertion_markers)

    # Log detection result
    if claim_has_strong_assertions:
        logger.info("Evidence analysis: Detected claim with strong assertions requiring specific evidence")

    # Extract named entities from claim
    claim_entities = extract_entities(claim)
    claim_entity_texts = [entity["text"].lower() for entity in claim_entities]

    # Process each evidence item
    analyzed_evidence = []

    # Track domains found in evidence to identify source diversity
    found_domains = set()

    for evidence in evidence_list:
        if not isinstance(evidence, str):
            continue

        # Calculate semantic similarity
        similarity = calculate_similarity(claim, evidence)

        # Check for entity overlap
        evidence_entities = extract_entities(evidence)
        evidence_entity_texts = [entity["text"].lower() for entity in evidence_entities]

        # Calculate entity overlap
        common_entities = set(claim_entity_texts).intersection(set(evidence_entity_texts))
        entity_overlap = len(common_entities) / max(1, len(claim_entity_texts))

        # Check temporal relevance
        if is_temporally_relevant(evidence, claim):
            temporal_relevance = 1.2
        else:
            # Penalty for temporally irrelevant evidence
            temporal_relevance = 0.7

        # Check for authority signals
        authority_score, authority_type = has_authority_signal(evidence)

        # Extract source from evidence if available
        source_boost = 1.0
        domain = None

        if source_credibility:
            # Try to extract domain from URL in evidence
            domain_match = re.search(r'URL: https?://(?:www\.)?([^/]+)', evidence)
            if domain_match:
                domain = domain_match.group(1)
                # Check if domain or its parent domain is in credibility list
                for cred_domain, cred_score in source_credibility.items():
                    if cred_domain in domain:
                        try:
                            source_boost = float(cred_score)
                            break
                        except (ValueError, TypeError):
                            pass

        # Track this domain for source diversity
        if domain:
            found_domains.add(domain)

        # For claims with strong assertions: check if evidence specifically addresses assertions
        claim_specificity_match = 1.0
        evidence_specificity_match = 1.0

        if claim_has_strong_assertions:
            # Check if evidence provides specific confirmation or contradiction
            direct_contradiction_terms = [
                "not yet", "has not", "have not", "cannot", "can't", "doesn't", "don't",
                "unlikely", "challenging", "remains a challenge", "in the future",
                "experimental", "in development", "proposed", "theoretical",
                "preliminary", "hypothesized", "potential", "promising but"
            ]

            # Check for contradictions to strong assertions
            if any(term in evidence.lower() for term in direct_contradiction_terms):
                # This evidence likely contradicts the strong assertion
                evidence_specificity_match = 2.0  # Boost relevance of contradicting evidence
                logger.debug("Found contradiction to strong assertion in evidence")

            # For claims with strong assertions, check if evidence specifically confirms
            direct_confirmation_terms = [
                "successfully demonstrated", "breakthrough", "solved", "cured",
                "confirmed", "definitive evidence", "conclusive results", "proven",
                "revolutionary results", "milestone achievement", "groundbreaking results"
            ]

            # If evidence confirms the strong assertion, adjust relevance
            if any(term in evidence.lower() for term in direct_confirmation_terms):
                # Apply higher scoring for evidence that specifically confirms
                evidence_specificity_match = 1.8
                logger.debug("Found confirmation of strong assertion in evidence")

            # For claims with strong assertions, check for high-quality sources
            high_quality_source_markers = [
                "journal", "doi.org", "research", "university", "institute",
                "laboratory", "professor", "study", "publication", "published in"
            ]

            is_high_quality = any(term in evidence.lower() for term in high_quality_source_markers)
            quality_boost = 1.4 if is_high_quality else 1.0

            # Apply the quality boost
            source_boost *= quality_boost

        # Calculate final relevance score with improvements for all claim types
        if claim_has_strong_assertions:
            relevance_score = (
                (similarity * 0.35) +      # Semantic similarity
                (entity_overlap * 0.25) +  # Entity overlap
                0.25                       # Base value to ensure all evidence has some relevance
            ) * temporal_relevance * authority_score * source_boost * claim_specificity_match * evidence_specificity_match
        else:
            # Original formula for regular claims
            relevance_score = (
                (similarity * 0.4) +      # Semantic similarity
                (entity_overlap * 0.3) +  # Entity overlap
                0.3                       # Base value to ensure all evidence has some relevance
            ) * temporal_relevance * authority_score * source_boost

        # Add metadata and relevance score
        analyzed_evidence.append({
            "text": evidence,
            "relevance_score": relevance_score,
            "similarity": similarity,
            "entity_overlap": entity_overlap,
            "temporal_relevance": temporal_relevance,
            "authority_score": authority_score,
            "authority_type": authority_type,
            "source_boost": source_boost,
            "domain": domain
        })

    # Sort by relevance score (descending)
    analyzed_evidence.sort(key=lambda x: x["relevance_score"], reverse=True)

    # Ensure we have diverse sources in top results for all claims
    if len(found_domains) > 1:
        # Try to promote evidence from reliable sources if we haven't selected any yet
        reliable_sources_seen = False

        # Check if top 3 results contain any reliable sources
        for item in analyzed_evidence[:3]:
            domain = item.get("domain", "")
            if domain and source_credibility and any(cred_domain in domain for cred_domain in source_credibility):
                reliable_sources_seen = True
                break

        # If no reliable sources in top results, promote one if available
        if not reliable_sources_seen:
            for i, item in enumerate(analyzed_evidence[3:]):
                domain = item.get("domain", "")
                if domain and source_credibility and any(cred_domain in domain for cred_domain in source_credibility):
                    # Swap this item into the top 3
                    analyzed_evidence.insert(2, analyzed_evidence.pop(i + 3))
                    break

    return analyzed_evidence

def select_diverse_evidence(analyzed_evidence, max_items=5):
    """
    Select diverse evidence items based on relevance, source diversity and claim characteristics

    Args:
        analyzed_evidence (list): List of evidence items with relevance scores
        max_items (int): Maximum number of evidence items to return

    Returns:
        list: Selected diverse evidence items
    """
    if not analyzed_evidence:
        return []

    # Check if top evidence suggests claim has strong assertions
    strong_assertion_markers = [
        "solved", "cured", "discovered", "breakthrough", "revolutionary",
        "first ever", "confirmed", "definitive", "conclusive", "proven",
        "groundbreaking", "unprecedented", "extends lifespan", "definitively"
    ]

    # Determine if this is a claim with strong assertions by checking evidence text
    has_strong_assertions = False

    for item in analyzed_evidence[:3]:  # Check just the top items for efficiency
        if "text" in item:
            item_text = item["text"].lower()
            if any(f"claim {marker}" in item_text or f"claim has {marker}" in item_text
                   for marker in strong_assertion_markers):
                has_strong_assertions = True
                break

    # Also check for contradiction markers in evidence which can indicate a strong assertion
    contradiction_markers = [
        "not yet solved", "hasn't been proven", "no evidence that",
        "remains unsolved", "has not been confirmed", "remains theoretical"
    ]

    if not has_strong_assertions:
        for item in analyzed_evidence[:3]:
            if "text" in item:
                item_text = item["text"].lower()
                if any(marker in item_text for marker in contradiction_markers):
                    has_strong_assertions = True
                    break

    # Ensure we don't select more than available
    max_items = min(max_items, len(analyzed_evidence))

    # Initialize selected items with the most relevant item
    selected = [analyzed_evidence[0]]
    remaining = analyzed_evidence[1:]

    # Track sources to ensure diversity
    selected_sources = set()
    for item in selected:
        # Try to extract source from evidence
        source_match = re.search(r'Source: ([^,]+)', item["text"])
        if source_match:
            selected_sources.add(source_match.group(1))

    # For all claims, track if we have high-quality sources yet
    has_quality_source = False
    quality_source_markers = ["journal", "doi.org", "research", "university",
                              "institute", "laboratory", "professor", "study"]

    # Check if our top item is already from a quality source
    if any(marker in selected[0]["text"].lower() for marker in quality_source_markers):
        has_quality_source = True

    # Select remaining items balancing relevance and diversity
    while len(selected) < max_items and remaining:
        best_item = None
        best_score = -1

        for i, item in enumerate(remaining):
            # Base score is the item's relevance
            score = item["relevance_score"]

            # Extract source if available
            source = None
            source_match = re.search(r'Source: ([^,]+)', item["text"])
            if source_match:
                source = source_match.group(1)

            # Apply diversity bonus if source is new
            if source and source not in selected_sources:
                score *= 1.2  # Diversity bonus

            # For claims with strong assertions, apply bonus for contradicting evidence
            if has_strong_assertions:
                # Check for contradiction markers in the text
                if any(marker in item["text"].lower() for marker in contradiction_markers):
                    score *= 1.3  # Bonus for evidence that may contradict strong assertions

            # For any claim, apply bonus for high-quality sources if we don't have one yet
            if not has_quality_source:
                is_item_quality = any(marker in item["text"].lower() for marker in quality_source_markers)
                if is_item_quality:
                    score *= 1.5  # Significant bonus for quality sources

            if score > best_score:
                best_score = score
                best_item = (i, item)

        if best_item:
            idx, item = best_item
            selected.append(item)
            remaining.pop(idx)

            # Add source to selected sources
            source_match = re.search(r'Source: ([^,]+)', item["text"])
            if source_match:
                selected_sources.add(source_match.group(1))

            # Check if we found a quality source
            if not has_quality_source:
                if any(marker in item["text"].lower() for marker in quality_source_markers):
                    has_quality_source = True
        else:
            break

    # For any claim with strong assertions, ensure we have at least one quality source if available
    if has_strong_assertions and not has_quality_source and remaining:
        for i, item in enumerate(remaining):
            if any(marker in item["text"].lower() for marker in quality_source_markers):
                # Replace the least relevant selected item with this quality one
                selected.sort(key=lambda x: x["relevance_score"])
                selected[0] = item
                break

    # Return only the text portion
    return [item["text"] for item in selected]
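A minimal usage sketch of the pipeline this file defines: score a small evidence list against a claim, then pick a diverse subset. The claim, evidence strings, and credibility map below are illustrative only; in the app these come from claim extraction, the retrieval modules, and the source-credibility configuration.

    from modules.semantic_analysis import analyze_evidence_relevance, select_diverse_evidence

    # Illustrative inputs only (hypothetical sources, URLs, and scores)
    claim = "NASA confirmed the mission launched this week."
    evidence = [
        "Source: example-news, URL: https://www.nasa.gov/launch - NASA confirmed the launch. Date: 2025-01-15",
        "Source: example-blog, URL: https://blog.example.com/post - The launch has not been confirmed.",
    ]
    credibility = {"nasa.gov": 1.5}  # hypothetical credibility map

    scored = analyze_evidence_relevance(claim, evidence, source_credibility=credibility)
    top_texts = select_diverse_evidence(scored, max_items=2)
    print(top_texts)

Because the claim contains "confirmed", the strong-assertion path applies, and the second item's "has not" phrasing triggers the contradiction boost.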
utils/__init__.py
ADDED
@@ -0,0 +1,20 @@
"""
Utils package initialization.

This package provides utility functions for the AskVeracity fact-checking system.
"""

from .api_utils import api_error_handler, safe_json_parse, RateLimiter
from .performance import PerformanceTracker
from .models import initialize_models, get_nlp_model, get_llm_model


__all__ = [
    'api_error_handler',
    'safe_json_parse',
    'RateLimiter',
    'PerformanceTracker',
    'initialize_models',
    'get_nlp_model',
    'get_llm_model'
]
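With this initializer in place, callers can import the utilities from the package root rather than from the individual modules:

    from utils import PerformanceTracker, api_error_handler, get_nlp_model

    tracker = PerformanceTracker()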
utils/api_utils.py
ADDED
@@ -0,0 +1,229 @@
"""
API utilities for the Fake News Detector application.

This module provides utilities for handling API calls, rate limiting,
error handling, and exponential backoff for retrying failed requests.
"""

import time
import functools
import random
import logging
import requests
from datetime import datetime, timedelta
from collections import deque

from config import RATE_LIMITS, ERROR_BACKOFF

logger = logging.getLogger("misinformation_detector")

class RateLimiter:
    """
    Rate limiter for API calls with support for different APIs.

    This class implements a token bucket algorithm for rate limiting,
    with support for different rate limits for different APIs.
    It also provides exponential backoff for error handling.
    """

    def __init__(self):
        """Initialize the rate limiter with configuration from settings."""
        # Store rate limits for different APIs
        self.limits = {}

        # Initialize limits from config
        for api_name, limit_info in RATE_LIMITS.items():
            self.limits[api_name] = {
                "requests": limit_info["requests"],
                "period": limit_info["period"],
                "timestamps": deque()
            }

        # Error backoff settings
        self.max_retries = ERROR_BACKOFF["max_retries"]
        self.initial_backoff = ERROR_BACKOFF["initial_backoff"]
        self.backoff_factor = ERROR_BACKOFF["backoff_factor"]

    def check_and_update(self, api_name):
        """
        Check if request is allowed and update timestamps.

        Args:
            api_name (str): Name of the API to check

        Returns:
            tuple: (allowed, wait_time)
                - allowed (bool): Whether the request is allowed
                - wait_time (float): Time to wait if not allowed
        """
        if api_name not in self.limits:
            return True, 0  # Unknown API, allow by default

        now = datetime.now()
        limit_info = self.limits[api_name]

        # Remove timestamps older than the period
        cutoff = now - timedelta(seconds=limit_info["period"])
        while limit_info["timestamps"] and limit_info["timestamps"][0] < cutoff:
            limit_info["timestamps"].popleft()

        # Check if we're at the rate limit
        if len(limit_info["timestamps"]) >= limit_info["requests"]:
            # Calculate wait time until oldest timestamp expires
            wait_time = (limit_info["timestamps"][0] + timedelta(seconds=limit_info["period"]) - now).total_seconds()
            return False, max(0, wait_time)

        # Add current timestamp and allow request
        limit_info["timestamps"].append(now)
        return True, 0

    def wait_if_needed(self, api_name):
        """
        Wait if rate limit is reached.

        Args:
            api_name (str): Name of the API to check

        Returns:
            bool: True if waited, False otherwise
        """
        allowed, wait_time = self.check_and_update(api_name)
        if not allowed:
            logger.info(f"Rate limit reached for {api_name}. Waiting {wait_time:.2f} seconds...")
            time.sleep(wait_time + 0.1)  # Add a small buffer
            return True
        return False

    def get_backoff_time(self, attempt):
        """
        Calculate exponential backoff time with jitter.

        Args:
            attempt (int): Current attempt number (0-based)

        Returns:
            float: Backoff time in seconds
        """
        backoff = self.initial_backoff * (self.backoff_factor ** attempt)
        # Add jitter to prevent thundering herd problem
        jitter = random.uniform(0, 0.1 * backoff)
        return backoff + jitter


# Create rate limiter instance
rate_limiter = RateLimiter()

# API Error Handler decorator
def api_error_handler(api_name):
    """
    Decorator for API calls with error handling and rate limiting.

    This decorator handles rate limiting, retries with exponential
    backoff, and error handling for API calls.

    Args:
        api_name (str): Name of the API being called

    Returns:
        callable: Decorated function
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                # Apply rate limiting - make sure rate_limiter exists and has the method
                if hasattr(rate_limiter, 'wait_if_needed'):
                    rate_limiter.wait_if_needed(api_name)

                # Track retries
                for attempt in range(rate_limiter.max_retries):
                    try:
                        return func(*args, **kwargs)
                    except requests.exceptions.HTTPError as e:
                        # Guard against a missing response object before reading the status code
                        status_code = e.response.status_code if getattr(e, 'response', None) is not None else 0

                        # Handle specific HTTP errors
                        if status_code == 429:  # Too Many Requests
                            logger.warning(f"{api_name} rate limit exceeded (429). Attempt {attempt+1}/{rate_limiter.max_retries}")
                            # Get retry-after header or use exponential backoff
                            retry_after = e.response.headers.get('Retry-After')
                            if retry_after and retry_after.isdigit():
                                wait_time = int(retry_after)
                            else:
                                wait_time = rate_limiter.get_backoff_time(attempt)
                            logger.info(f"Waiting {wait_time} seconds before retry...")
                            time.sleep(wait_time)
                        elif status_code >= 500:  # Server errors
                            logger.warning(f"{api_name} server error ({status_code}). Attempt {attempt+1}/{rate_limiter.max_retries}")
                            time.sleep(rate_limiter.get_backoff_time(attempt))
                        elif status_code == 403:  # Forbidden - likely API key issue
                            logger.error(f"{api_name} access forbidden (403). Check API key.")
                            return None  # Don't retry on auth errors
                        elif status_code == 404:  # Not Found
                            logger.warning(f"{api_name} resource not found (404).")
                            return None  # Don't retry on resource not found
                        else:
                            logger.error(f"{api_name} HTTP error: {e}")
                            if attempt < rate_limiter.max_retries - 1:
                                wait_time = rate_limiter.get_backoff_time(attempt)
                                logger.info(f"Waiting {wait_time} seconds before retry...")
                                time.sleep(wait_time)
                            else:
                                return None

                    except requests.exceptions.ConnectionError as e:
                        logger.error(f"{api_name} connection error: {e}")
                        if attempt < rate_limiter.max_retries - 1:
                            wait_time = rate_limiter.get_backoff_time(attempt)
                            logger.info(f"Waiting {wait_time} seconds before retry...")
                            time.sleep(wait_time)
                        else:
                            return None

                    except requests.exceptions.Timeout as e:
                        logger.error(f"{api_name} timeout error: {e}")
                        if attempt < rate_limiter.max_retries - 1:
                            wait_time = rate_limiter.get_backoff_time(attempt)
                            logger.info(f"Waiting {wait_time} seconds before retry...")
                            time.sleep(wait_time)
                        else:
                            return None

                    except Exception as e:
                        logger.error(f"{api_name} unexpected error: {str(e)}")
                        if attempt < rate_limiter.max_retries - 1:
                            wait_time = rate_limiter.get_backoff_time(attempt)
                            logger.info(f"Waiting {wait_time} seconds before retry...")
                            time.sleep(wait_time)
                        else:
                            return None

                # If we've exhausted all retries
                logger.error(f"{api_name} call failed after {rate_limiter.max_retries} attempts")
                return None

            except Exception as e:
                # Catch any unexpected errors in the decorator itself
                logger.error(f"{api_name} decorator error: {str(e)}")
                return None

        return wrapper
    return decorator

def safe_json_parse(response, api_name):
    """
    Safely parse JSON response with error handling.

    Args:
        response (requests.Response): Response object to parse
        api_name (str): Name of the API for logging

    Returns:
        dict: Parsed JSON or empty dict on error
    """
    try:
        return response.json()
    except ValueError as e:
        logger.error(f"Error parsing {api_name} JSON response: {e}")
        logger.debug(f"Response content: {response.text[:500]}...")
        return {}
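A minimal sketch of how the decorator and parser compose around a live HTTP call. The API name "some_api" and the URL are placeholders; names missing from RATE_LIMITS simply bypass rate limiting, since check_and_update allows unknown APIs by default.

    import requests
    from utils.api_utils import api_error_handler, safe_json_parse

    @api_error_handler("some_api")  # "some_api" is a placeholder rate-limit key
    def fetch_items(query):
        response = requests.get("https://api.example.com/search", params={"q": query}, timeout=10)
        response.raise_for_status()  # raise HTTPError so the decorator can classify and retry
        return safe_json_parse(response, "some_api")

    items = fetch_items("climate")  # dict on success, None once retries are exhausted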
utils/models.py
ADDED
@@ -0,0 +1,157 @@
"""
Model management utility for the Fake News Detector application.

This module provides functions for initializing, caching, and
retrieving language models used throughout the application.
It ensures models are loaded efficiently and reused appropriately.
"""

import os
import logging
import functools
from langchain_openai import ChatOpenAI
import spacy

logger = logging.getLogger("misinformation_detector")

# Global variables for models
nlp = None
model = None
models_initialized = False

# Add caching decorator
def cached_model(func):
    """
    Decorator to cache model loading for improved performance.

    This decorator ensures that models are only loaded once and
    then reused for subsequent calls, improving performance by
    avoiding redundant model loading.

    Args:
        func (callable): Function that loads a model

    Returns:
        callable: Wrapped function that returns a cached model
    """
    cache = {}

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Use function name as cache key
        key = func.__name__
        if key not in cache:
            logger.info(f"Model not in cache, calling {key}...")
            cache[key] = func(*args, **kwargs)
        return cache[key]

    return wrapper

def initialize_models():
    """
    Initialize all required models.

    This function loads and initializes all the language models
    needed by the application, including spaCy for NLP tasks and
    OpenAI for LLM-based processing.

    Returns:
        str: Initialization status message

    Raises:
        ValueError: If OpenAI API key is not set
    """
    global nlp, model, models_initialized

    # Skip initialization if already done
    if models_initialized:
        logger.info("Models already initialized, skipping initialization")
        return "Models already initialized"

    # Check OpenAI API key
    if "OPENAI_API_KEY" not in os.environ or not os.environ["OPENAI_API_KEY"].strip():
        logger.error("OPENAI_API_KEY environment variable not set or empty")
        raise ValueError("OpenAI API key is required. Please set it in the Hugging Face Space secrets.")

    try:
        # Load NLP model
        try:
            logger.info("Loading spaCy NLP model...")
            nlp = spacy.load("en_core_web_sm")
            logger.info("Loaded spaCy NLP model")
        except OSError as e:
            # This handles the case if the model wasn't installed correctly
            logger.warning(f"Could not load spaCy model: {str(e)}")
            logger.info("Attempting to download spaCy model...")
            try:
                import subprocess
                import sys
                # This downloads the model if it's missing
                subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
                # Try loading again
                nlp = spacy.load("en_core_web_sm")
                logger.info("Successfully downloaded and loaded spaCy model")
            except Exception as download_err:
                logger.error(f"Failed to download spaCy model: {str(download_err)}")
                # Continue with other initialization, we'll handle missing NLP model elsewhere

        # Set up OpenAI model
        logger.info("Initializing ChatOpenAI model...")
        model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
        logger.info("Initialized ChatOpenAI model")

        # Mark initialization as complete
        models_initialized = True
        return "Models initialized successfully"

    except Exception as e:
        logger.error(f"Error initializing models: {str(e)}")
        raise e

@cached_model
def get_nlp_model():
    """
    Get the spaCy NLP model, initializing if needed.

    This function returns a cached spaCy model for NLP tasks.
    If the model hasn't been loaded yet, it will be loaded.

    Returns:
        spacy.Language: Loaded spaCy model
    """
    global nlp
    if nlp is None:
        try:
            # Try to load just the spaCy model if not loaded yet
            logger.info("Loading spaCy NLP model...")
            nlp = spacy.load("en_core_web_sm")
            logger.info("Loaded spaCy NLP model")
        except Exception as e:
            logger.error(f"Error loading spaCy model: {str(e)}")
            # Fall back to full initialization
            initialize_models()
    return nlp

@cached_model
def get_llm_model():
    """
    Get the ChatOpenAI model, initializing if needed.

    This function returns a cached OpenAI LLM model.
    If the model hasn't been loaded yet, it will be loaded.

    Returns:
        ChatOpenAI: Loaded LLM model
    """
    global model
    if model is None:
        try:
            # Try to load just the LLM model if not loaded yet
            logger.info("Initializing ChatOpenAI model...")
            model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
            logger.info("Initialized ChatOpenAI model")
        except Exception as e:
            logger.error(f"Error initializing ChatOpenAI model: {str(e)}")
            # Fall back to full initialization
            initialize_models()
    return model
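Typical call sites then request models lazily; the first call pays the load cost and later calls hit the cache. A short sketch, assuming the spaCy model is installed and OPENAI_API_KEY is set for the LLM line:

    from utils.models import get_nlp_model, get_llm_model

    nlp = get_nlp_model()   # loaded once; later calls return the cached pipeline
    doc = nlp("NASA launched a new mission this week.")
    print([(ent.text, ent.label_) for ent in doc.ents])

    llm = get_llm_model()   # requires OPENAI_API_KEY in the environment
    # reply = llm.invoke("Summarize this claim in one sentence.")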
utils/performance.py
ADDED
@@ -0,0 +1,135 @@
"""
Performance tracking utility for the Fake News Detector application.

This module provides functionality to track and analyze the
performance of the application, including processing times,
success rates, and resource utilization.
"""

import time
import logging

logger = logging.getLogger("misinformation_detector")

class PerformanceTracker:
    """
    Tracks and logs performance metrics for the fact-checking system.

    This class maintains counters and statistics for various performance
    metrics, such as processing times, evidence retrieval success rates,
    and confidence scores.
    """

    def __init__(self):
        """Initialize the performance tracker with empty metrics."""
        self.metrics = {
            "claims_processed": 0,
            "evidence_retrieval_success_rate": [],
            "processing_times": [],
            "confidence_scores": [],
            "source_types_used": {},
            "temporal_relevance": []
        }

    def log_claim_processed(self):
        """
        Increment the counter for processed claims.
        This should be called whenever a claim is processed successfully.
        """
        self.metrics["claims_processed"] += 1

    def log_evidence_retrieval(self, success, sources_count):
        """
        Log the success or failure of evidence retrieval.

        Args:
            success (bool): Whether evidence retrieval was successful
            sources_count (dict): Count of evidence items by source type
        """
        # Ensure success is a boolean
        success_value = 1 if success else 0
        self.metrics["evidence_retrieval_success_rate"].append(success_value)

        # Safely process source types
        if isinstance(sources_count, dict):
            for source_type, count in sources_count.items():
                # Ensure source_type is a string and count is an integer
                source_type = str(source_type)
                try:
                    count = int(count)
                except (ValueError, TypeError):
                    count = 1

                # Update source types used
                self.metrics["source_types_used"][source_type] = \
                    self.metrics["source_types_used"].get(source_type, 0) + count

    def log_processing_time(self, start_time):
        """
        Log the processing time for an operation.

        Args:
            start_time (float): Start time obtained from time.time()
        """
        end_time = time.time()
        processing_time = end_time - start_time
        self.metrics["processing_times"].append(processing_time)

    def log_confidence_score(self, score):
        """
        Log a confidence score.

        Args:
            score (float): Confidence score between 0 and 1
        """
        # Ensure score is a float between 0 and 1
        try:
            score = float(score)
            if 0 <= score <= 1:
                self.metrics["confidence_scores"].append(score)
        except (ValueError, TypeError):
            logger.warning(f"Invalid confidence score: {score}")

    def log_temporal_relevance(self, relevance_score):
        """
        Log a temporal relevance score.

        Args:
            relevance_score (float): Temporal relevance score between 0 and 1
        """
        # Ensure relevance score is a float between 0 and 1
        try:
            relevance_score = float(relevance_score)
            if 0 <= relevance_score <= 1:
                self.metrics["temporal_relevance"].append(relevance_score)
        except (ValueError, TypeError):
            logger.warning(f"Invalid temporal relevance score: {relevance_score}")

    def get_summary(self):
        """
        Get a summary of all performance metrics.

        Returns:
            dict: Summary of performance metrics
        """
        # Safely calculate averages with error handling
        def safe_avg(metric_list):
            try:
                return sum(metric_list) / max(len(metric_list), 1)
            except (TypeError, ValueError):
                return 0.0

        return {
            "claims_processed": self.metrics["claims_processed"],
            "avg_evidence_retrieval_success_rate": safe_avg(self.metrics["evidence_retrieval_success_rate"]),
            "avg_processing_time": safe_avg(self.metrics["processing_times"]),
            "avg_confidence_score": safe_avg(self.metrics["confidence_scores"]),
            "source_types_used": dict(self.metrics["source_types_used"]),
            "avg_temporal_relevance": safe_avg(self.metrics["temporal_relevance"])
        }

    def reset(self):
        """Reset all performance metrics."""
        self.__init__()
        logger.info("Performance metrics have been reset")
        return "Performance metrics reset successfully"