import logging
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta
import re

# Import the centralized NLP model handler
from utils.models import get_nlp_model

logger = logging.getLogger("misinformation_detector")

def extract_entities(text):
    """Extract named entities from text"""
    if not text:
        return []
    
    try:
        # Use centralized NLP model
        nlp_model = get_nlp_model()
        doc = nlp_model(text)
        entities = [
            {
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char
            }
            for ent in doc.ents
        ]
        return entities
    except Exception as e:
        logger.error(f"Error extracting entities: {str(e)}")
        return []
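
# Illustrative usage sketch (comments only, nothing executed at import time).
# With a standard spaCy English pipeline the call below returns one dict per
# named entity; the exact labels and offsets depend on the model that
# get_nlp_model() loads, so the values shown are only indicative.
#
#   extract_entities("NASA confirmed the result in March 2024.")
#   # -> [{"text": "NASA", "label": "ORG", "start": 0, "end": 4},
#   #     {"text": "March 2024", "label": "DATE", ...}]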

def get_vector_representation(text):
    """Get vector representation of text using spaCy"""
    if not text:
        return None
    
    try:
        # Use centralized NLP model
        nlp_model = get_nlp_model()
        doc = nlp_model(text)
        
        # Return document vector if available
        if doc.has_vector:
            return doc.vector
        
        # Fallback: average of token vectors
        vectors = [token.vector for token in doc if token.has_vector]
        if vectors:
            return np.mean(vectors, axis=0)
        
        return None
    except Exception as e:
        logger.error(f"Error getting vector representation: {str(e)}")
        return None
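
# Illustrative usage sketch (comments only). The result is either None or a
# fixed-size 1-D numpy array; its dimensionality depends on the spaCy model
# returned by get_nlp_model() (for example, 300 for models that ship static
# word vectors).
#
#   vec = get_vector_representation("climate change")
#   # vec is None or a 1-D np.ndarray, e.g. vec.shape == (300,)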

def calculate_similarity(text1, text2):
    """Calculate semantic similarity between two texts"""
    if not text1 or not text2:
        return 0.0
    
    try:
        vec1 = get_vector_representation(text1)
        vec2 = get_vector_representation(text2)
        
        if vec1 is None or vec2 is None:
            return 0.0
        
        # Reshape vectors for cosine_similarity
        vec1 = vec1.reshape(1, -1)
        vec2 = vec2.reshape(1, -1)
        
        # Calculate cosine similarity
        similarity = cosine_similarity(vec1, vec2)[0][0]
        return float(similarity)
    except Exception as e:
        logger.error(f"Error calculating similarity: {str(e)}")
        return 0.0
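
# Illustrative usage sketch (comments only). The score is the cosine similarity
# of the two document vectors, so meaningful values require a spaCy model with
# word vectors; the numbers below are indicative, not exact.
#
#   calculate_similarity("The Earth orbits the Sun",
#                        "The Sun is orbited by the Earth")     # high, e.g. ~0.9
#   calculate_similarity("The Earth orbits the Sun",
#                        "Stock prices fell sharply today")     # much lower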

def extract_date_from_evidence(evidence_text):
    """Extract date from evidence text"""
    if not evidence_text:
        return None
    
    try:
        # Look for date patterns in text
        date_patterns = [
            r'Date: (\d{4}-\d{2}-\d{2})',  # ISO format
            r'published.*?(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',  # published on MM/DD/YYYY
            r'(\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})',  # DD Month YYYY
            r'((?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4})'  # Month DD, YYYY
        ]
        
        for pattern in date_patterns:
            match = re.search(pattern, evidence_text)
            if match:
                date_str = match.group(1)
                # Parse date string based on format
                try:
                    if re.fullmatch(r'\d{4}-\d{2}-\d{2}', date_str):
                        # ISO format (YYYY-MM-DD)
                        return datetime.strptime(date_str, '%Y-%m-%d')
                    elif '/' in date_str or '-' in date_str:
                        # Numeric formats, including 2-digit years
                        formats = ['%m/%d/%Y', '%d/%m/%Y', '%m-%d-%Y', '%d-%m-%Y',
                                   '%m/%d/%y', '%d/%m/%y']
                        for fmt in formats:
                            try:
                                return datetime.strptime(date_str, fmt)
                            except ValueError:
                                continue
                    else:
                        # Try different month formats
                        formats = ['%d %B %Y', '%B %d, %Y', '%B %d %Y']
                        for fmt in formats:
                            try:
                                return datetime.strptime(date_str, fmt)
                            except ValueError:
                                continue
                except Exception:
                    pass
        
        return None
    except Exception as e:
        logger.error(f"Error extracting date from evidence: {str(e)}")
        return None
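
# Illustrative usage sketch (comments only). The first matching pattern wins and
# the result is a naive datetime, or None when no recognizable date is present.
#
#   extract_date_from_evidence("Date: 2024-03-01, Source: example.org")
#   # -> datetime(2024, 3, 1, 0, 0)
#   extract_date_from_evidence("The study was published 15 March 2024.")
#   # -> datetime(2024, 3, 15, 0, 0)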

def is_temporally_relevant(evidence_text, claim_text, max_days_old=30):
    """Check if evidence is temporally relevant to the claim"""
    # Check if claim seems to require recent evidence
    temporal_terms = ["today", "now", "current", "currently", "recent", "recently", "latest", "just", "this week", "this month", "this year"]
    requires_recent = any(term in claim_text.lower() for term in temporal_terms)
    
    # If claim doesn't specify temporality, consider evidence relevant
    if not requires_recent:
        return True
    
    # Extract date from evidence
    date = extract_date_from_evidence(evidence_text)
    if not date:
        return True  # If we can't determine date, assume it's relevant
    
    # Check if evidence is recent enough
    cutoff = datetime.now() - timedelta(days=max_days_old)
    return date >= cutoff
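
# Illustrative usage sketch (comments only). The recency check only applies when
# the claim itself uses temporal language; otherwise evidence passes through.
#
#   is_temporally_relevant("Date: 2020-01-01 ...", "Paris is the capital of France")
#   # -> True (no temporal terms in the claim)
#   is_temporally_relevant("Date: 2020-01-01 ...", "What is the latest news today?")
#   # -> False (evidence dated outside the default 30-day window)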

def has_authority_signal(evidence_text):
    """Check if evidence contains authority signals"""
    authority_signals = {
        "scientific_consensus": ["consensus", "scientists agree", "research shows", "studies confirm", "experts agree"],
        "fact_check": ["fact check", "rated false", "rated true", "debunked", "confirmed", "verification"],
        "high_authority": ["nasa", "world health organization", "who", "cdc", "national academy", 
                          "oxford", "harvard", "stanford", "mit", "cambridge", "yale", 
                          "princeton", "government", "official", "authorities", "minister", 
                          "ministry", "department", "administration", "university", "professor"]
    }
    
    evidence_lower = evidence_text.lower()
    
    authority_type = None
    authority_score = 1.0
    
    for signal_type, phrases in authority_signals.items():
        if any(phrase in evidence_lower for phrase in phrases):
            if signal_type == "scientific_consensus":
                authority_score = 1.8
                authority_type = "scientific_consensus"
            elif signal_type == "fact_check":
                authority_score = 1.5
                authority_type = "fact_check"
            elif signal_type == "high_authority":
                authority_score = 1.3
                authority_type = "high_authority"
            break
    
    return authority_score, authority_type
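
# Illustrative usage sketch (comments only). The multiplier and label correspond
# to the first signal category matched, with a neutral (1.0, None) default.
#
#   has_authority_signal("Scientists agree that vaccines are safe.")
#   # -> (1.8, "scientific_consensus")
#   has_authority_signal("A random blog post with no citations.")
#   # -> (1.0, None)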

def analyze_evidence_relevance(claim, evidence_list, source_credibility=None):
    """
    Analyze evidence relevance to a claim using semantic similarity, entity overlap,
    temporal relevance, and authority/source signals, with special handling for
    claims that make strong assertions
    
    Args:
        claim (str): The claim being verified
        evidence_list (list): List of evidence items
        source_credibility (dict): Dictionary mapping source domains to credibility scores
        
    Returns:
        list: Sorted list of evidence items with relevance scores
    """
    if not evidence_list:
        return []
    
    # Ensure evidence_list is a list of strings
    if not isinstance(evidence_list, list):
        evidence_list = [str(evidence_list)]
    
    # Filter out None or empty items
    evidence_list = [item for item in evidence_list if item]
    
    # Check if claim contains strong assertions that would require specific evidence
    strong_assertion_markers = [
        "solved", "cured", "discovered", "breakthrough", "revolutionary", 
        "first ever", "confirmed", "definitive", "conclusive", "proven",
        "groundbreaking", "unprecedented", "remarkable", "extends lifespan",
        "extends life", "definitively", "successfully"
    ]
    
    # Determine if claim contains strong assertions
    claim_has_strong_assertions = any(marker in claim.lower() for marker in strong_assertion_markers)
    
    # Log detection result
    if claim_has_strong_assertions:
        logger.info(f"Evidence analysis: Detected claim with strong assertions requiring specific evidence")
    
    # Extract named entities from claim
    claim_entities = extract_entities(claim)
    claim_entity_texts = [entity["text"].lower() for entity in claim_entities]
    
    # Process each evidence item
    analyzed_evidence = []
    
    # Track domains found in evidence to identify source diversity
    found_domains = set()
    
    for evidence in evidence_list:
        if not isinstance(evidence, str):
            continue
            
        # Calculate semantic similarity
        similarity = calculate_similarity(claim, evidence)
        
        # Check for entity overlap
        evidence_entities = extract_entities(evidence)
        evidence_entity_texts = [entity["text"].lower() for entity in evidence_entities]
        
        # Calculate entity overlap
        common_entities = set(claim_entity_texts).intersection(set(evidence_entity_texts))
        entity_overlap = len(common_entities) / max(1, len(claim_entity_texts))
        
        # Check temporal relevance: mild boost for relevant evidence,
        # penalty for temporally irrelevant evidence
        temporal_relevance = 1.2 if is_temporally_relevant(evidence, claim) else 0.7
        
        # Check for authority signals
        authority_score, authority_type = has_authority_signal(evidence)
        
        # Extract source from evidence if available
        source_boost = 1.0
        domain = None
        
        if source_credibility:
            # Try to extract domain from URL in evidence
            domain_match = re.search(r'URL: https?://(?:www\.)?([^/]+)', evidence)
            if domain_match:
                domain = domain_match.group(1)
                # Check if domain or its parent domain is in credibility list
                for cred_domain, cred_score in source_credibility.items():
                    if cred_domain in domain:
                        try:
                            source_boost = float(cred_score)
                            break
                        except (ValueError, TypeError):
                            pass
                
                # Track this domain for source diversity
                if domain:
                    found_domains.add(domain)
        
        # For claims with strong assertions: check if evidence specifically addresses assertions
        claim_specificity_match = 1.0
        evidence_specificity_match = 1.0
        
        if claim_has_strong_assertions:
            # Check if evidence provides specific confirmation or contradiction
            direct_contradiction_terms = [
                "not yet", "has not", "have not", "cannot", "can't", "doesn't", "don't",
                "unlikely", "challenging", "remains a challenge", "in the future", 
                "experimental", "in development", "proposed", "theoretical", 
                "preliminary", "hypothesized", "potential", "promising but"
            ]
            
            # Check for contradictions to strong assertions
            if any(term in evidence.lower() for term in direct_contradiction_terms):
                # This evidence likely contradicts the strong assertion
                evidence_specificity_match = 2.0  # Boost relevance of contradicting evidence
                logger.debug(f"Found contradiction to strong assertion in evidence")
            
            # For claims with strong assertions, check if evidence specifically confirms
            direct_confirmation_terms = [
                "successfully demonstrated", "breakthrough", "solved", "cured", 
                "confirmed", "definitive evidence", "conclusive results", "proven",
                "revolutionary results", "milestone achievement", "groundbreaking results"
            ]
            
            # If evidence confirms the strong assertion, adjust relevance
            if any(term in evidence.lower() for term in direct_confirmation_terms):
                # Apply higher scoring for evidence that specifically confirms
                evidence_specificity_match = 1.8
                logger.debug(f"Found confirmation of strong assertion in evidence")
            
            # For claims with strong assertions, check for high-quality sources
            high_quality_source_markers = [
                "journal", "doi.org", "research", "university", "institute",
                "laboratory", "professor", "study", "publication", "published in"
            ]
            
            is_high_quality = any(term in evidence.lower() for term in high_quality_source_markers)
            quality_boost = 1.4 if is_high_quality else 1.0
            
            # Apply the quality boost
            source_boost *= quality_boost
        
        # Calculate final relevance score with improvements for all claim types
        if claim_has_strong_assertions:
            relevance_score = (
                (similarity * 0.35) +  # Semantic similarity
                (entity_overlap * 0.25) +  # Entity overlap
                (0.25)  # Base value to ensure all evidence has some relevance
            ) * temporal_relevance * authority_score * source_boost * claim_specificity_match * evidence_specificity_match
        else:
            # Original formula for regular claims
            relevance_score = (
                (similarity * 0.4) +  # Semantic similarity
                (entity_overlap * 0.3) +  # Entity overlap
                (0.3)  # Base value to ensure all evidence has some relevance
            ) * temporal_relevance * authority_score * source_boost
        
        # Add metadata and relevance score
        analyzed_evidence.append({
            "text": evidence,
            "relevance_score": relevance_score,
            "similarity": similarity,
            "entity_overlap": entity_overlap,
            "temporal_relevance": temporal_relevance,
            "authority_score": authority_score,
            "authority_type": authority_type,
            "source_boost": source_boost,
            "domain": domain
        })
    
    # Sort by relevance score (descending)
    analyzed_evidence.sort(key=lambda x: x["relevance_score"], reverse=True)
    
    # Ensure we have diverse sources in top results for all claims
    if len(found_domains) > 1:
        # Try to promote evidence from reliable sources if we haven't selected any yet
        reliable_sources_seen = False
        
        # Check if top 3 results contain any reliable sources 
        for item in analyzed_evidence[:3]:
            domain = item.get("domain", "")
            if domain and source_credibility and any(cred_domain in domain for cred_domain in source_credibility):
                reliable_sources_seen = True
                break
                
        # If no reliable sources in top results, promote one if available
        if not reliable_sources_seen:
            for i, item in enumerate(analyzed_evidence[3:]):
                domain = item.get("domain", "")
                if domain and source_credibility and any(cred_domain in domain for cred_domain in source_credibility):
                    # Swap this item into the top 3
                    analyzed_evidence.insert(2, analyzed_evidence.pop(i+3))
                    break
    
    return analyzed_evidence

def select_diverse_evidence(analyzed_evidence, max_items=5):
    """
    Select diverse evidence items based on relevance, source diversity and claim characteristics
    
    Args:
        analyzed_evidence (list): List of evidence items with relevance scores
        max_items (int): Maximum number of evidence items to return
        
    Returns:
        list: Selected diverse evidence items
    """
    if not analyzed_evidence:
        return []
    
    # Check if top evidence suggests claim has strong assertions
    strong_assertion_markers = [
        "solved", "cured", "discovered", "breakthrough", "revolutionary", 
        "first ever", "confirmed", "definitive", "conclusive", "proven",
        "groundbreaking", "unprecedented", "extends lifespan", "definitively"
    ]
    
    # Determine if this is a claim with strong assertions by checking evidence text
    has_strong_assertions = False
    
    for item in analyzed_evidence[:3]:  # Check just the top items for efficiency
        if "text" in item:
            item_text = item["text"].lower()
            if any(f"claim {marker}" in item_text or f"claim has {marker}" in item_text 
                  for marker in strong_assertion_markers):
                has_strong_assertions = True
                break
    
    # Also check for contradiction markers in evidence which can indicate a strong assertion
    contradiction_markers = [
        "not yet solved", "hasn't been proven", "no evidence that", 
        "remains unsolved", "has not been confirmed", "remains theoretical"
    ]
    
    if not has_strong_assertions:
        for item in analyzed_evidence[:3]:
            if "text" in item:
                item_text = item["text"].lower()
                if any(marker in item_text for marker in contradiction_markers):
                    has_strong_assertions = True
                    break
    
    # Ensure we don't select more than available
    max_items = min(max_items, len(analyzed_evidence))
    
    # Initialize selected items with the most relevant item
    selected = [analyzed_evidence[0]]
    remaining = analyzed_evidence[1:]
    
    # Track sources to ensure diversity
    selected_sources = set()
    for item in selected:
        # Try to extract source from evidence
        source_match = re.search(r'Source: ([^,]+)', item["text"])
        if source_match:
            selected_sources.add(source_match.group(1))
    
    # For all claims, track if we have high-quality sources yet
    has_quality_source = False
    quality_source_markers = ["journal", "doi.org", "research", "university", 
                             "institute", "laboratory", "professor", "study"]
    
    # Check if our top item is already from a quality source
    if any(marker in selected[0]["text"].lower() for marker in quality_source_markers):
        has_quality_source = True
    
    # Select remaining items balancing relevance and diversity
    while len(selected) < max_items and remaining:
        best_item = None
        best_score = -1
        
        for i, item in enumerate(remaining):
            # Base score is the item's relevance
            score = item["relevance_score"]
            
            # Extract source if available
            source = None
            source_match = re.search(r'Source: ([^,]+)', item["text"])
            if source_match:
                source = source_match.group(1)
            
            # Apply diversity bonus if source is new
            if source and source not in selected_sources:
                score *= 1.2  # Diversity bonus
            
            # For claims with strong assertions, apply bonus for contradicting evidence
            if has_strong_assertions:
                # Check for contradiction markers in the text
                if any(marker in item["text"].lower() for marker in contradiction_markers):
                    score *= 1.3  # Bonus for evidence that may contradict strong assertions
            
            # For any claim, apply bonus for high-quality sources if we don't have one yet
            if not has_quality_source:
                is_item_quality = any(marker in item["text"].lower() for marker in quality_source_markers)
                if is_item_quality:
                    score *= 1.5  # Significant bonus for quality sources
            
            if score > best_score:
                best_score = score
                best_item = (i, item)
        
        if best_item:
            idx, item = best_item
            selected.append(item)
            remaining.pop(idx)
            
            # Add source to selected sources
            source_match = re.search(r'Source: ([^,]+)', item["text"])
            if source_match:
                selected_sources.add(source_match.group(1))
            
            # Check if we found a quality source
            if not has_quality_source:
                if any(marker in item["text"].lower() for marker in quality_source_markers):
                    has_quality_source = True
        else:
            break
    
    # For any claim with strong assertions, ensure we have at least one quality source if available
    if has_strong_assertions and not has_quality_source and remaining:
        for item in remaining:
            if any(marker in item["text"].lower() for marker in quality_source_markers):
                # Replace the least relevant selected item with this quality one,
                # preserving the order of the remaining selections
                least_relevant_idx = min(
                    range(len(selected)),
                    key=lambda j: selected[j]["relevance_score"]
                )
                selected[least_relevant_idx] = item
                break
    
    # Return only the text portion
    return [item["text"] for item in selected]
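

if __name__ == "__main__":
    # Minimal end-to-end sketch of the ranking pipeline, assuming the project's
    # utils.models and a spaCy model are available. The claim, evidence strings,
    # and credibility map below are made up for illustration; real callers pass
    # retrieved snippets that typically carry the "Source: ..." / "URL: ..." /
    # "Date: ..." metadata the scoring and diversity logic look for.
    logging.basicConfig(level=logging.INFO)

    demo_claim = "Scientists have definitively proven that compound X extends lifespan."
    demo_evidence = [
        "Source: Example Journal, URL: https://example.org/study, Date: 2024-03-01. "
        "Researchers report preliminary results; the effect has not been confirmed in humans.",
        "Source: Example Blog, URL: https://blog.example.com/post. "
        "Breakthrough! Compound X solved aging, experts agree.",
    ]
    demo_credibility = {"example.org": 1.3}  # hypothetical credibility scores

    ranked = analyze_evidence_relevance(demo_claim, demo_evidence, demo_credibility)
    for item in ranked:
        print(f"{item['relevance_score']:.3f}  {item['text'][:60]}...")

    selected = select_diverse_evidence(ranked, max_items=2)
    print(f"Selected {len(selected)} evidence item(s)")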