Spaces:
Running
Running
import logging | |
import time | |
import re | |
from langdetect import detect | |
import spacy | |
from utils.performance import PerformanceTracker | |
from utils.models import get_nlp_model, get_llm_model | |
# Module-level logger; the functions in this file report through the
# "misinformation_detector" logger name.
logger = logging.getLogger("misinformation_detector")
# Tracker instance that extract_claims() reports processing time and
# processed-claim counts to.
performance_tracker = PerformanceTracker()
def extract_claims(text):
    """
    Extract the main factual claim from the provided text.

    Inputs shorter than 20 whitespace-separated words are treated as already
    being a single concise claim and are returned unchanged. Longer inputs
    are handed to extract_with_openai().

    Args:
        text: Raw input text. ``None`` is treated as "nothing to extract".

    Returns:
        The extracted claim. Falls back to the input text itself when the
        input is short or when anything raises during extraction, and to an
        empty string when ``text`` is ``None``.
    """
    if text is None:
        # ``None`` used to crash on ``text.split()`` before the fallback
        # handler below could apply.
        logger.warning("No text provided for claim extraction")
        return ""

    logger.info("Extracting claims from: %s", text)
    start_time = time.time()

    try:
        if len(text.split()) < 20:
            # Short inputs are assumed to be a claim already; sending them
            # through the LLM could only distort them.
            logger.info("Input appears to be a concise claim already, preserving as-is")
            claim = text
        else:
            claim = extract_with_openai(text)
            logger.info("Extracted claim: %s", claim)

        # Both paths record the same metrics, so do it once here, inside the
        # try so a tracker failure cannot break claim extraction.
        performance_tracker.log_processing_time(start_time)
        performance_tracker.log_claim_processed()
        return claim
    except Exception as e:
        # Deliberate best-effort boundary: never fail the caller, fall back
        # to the unmodified input instead.
        logger.error("Error extracting claims: %s", e)
        return text
# Leading answer labels the model sometimes echoes back, such as
# "Main factual claim:" or "The claim is:".
_CLAIM_LABEL_RE = re.compile(
    r"^\s*(?:here(?:\s+is|'s)\s+)?(?:the\s+)?(?:(?:main|extracted|key)\s+)?"
    r"(?:factual\s+)?claim(?:\s+is)?\s*:\s*",
    re.IGNORECASE,
)


def _strip_claim_label(raw_answer):
    """Remove one leading "...claim:" label from the model answer, if present."""
    return _CLAIM_LABEL_RE.sub("", raw_answer, count=1).strip()


def extract_with_openai(text):
    """
    Use the LLM returned by get_llm_model() to extract the main claim.

    Args:
        text: The (long) input text to extract a claim from.

    Returns:
        The extracted claim after validate_extraction() has checked it
        against the input, or ``text`` itself if anything raises.
    """
    try:
        llm_model = get_llm_model()

        # Very explicit instructions to discourage the model from adding
        # details that are not in the input.
        prompt = f"""
        Extract the main factual claim from the following text.
        DO NOT add any information not present in the original text.
        DO NOT add locations, dates, or other details.
        ONLY extract what is explicitly stated.
        Text: {text}
        Main factual claim:
        """

        # temperature=0 is requested to keep the output deterministic.
        response = llm_model.invoke(prompt, temperature=0)

        # Only an echoed leading label is removed. Splitting on every ":"
        # (the previous behaviour) truncated legitimate claims that contain
        # a colon, e.g. "The meeting starts at 10:30" became "30".
        extracted_claim = _strip_claim_label(response.content)
        logger.info("OpenAI extraction: %s", extracted_claim)

        # Reject extractions that introduce information absent from the input.
        nlp = get_nlp_model()
        return validate_extraction(text, extracted_claim, nlp)
    except Exception as e:
        # Best-effort: any failure falls back to the original text.
        logger.error("Error in OpenAI claim extraction: %s", e)
        return text
# Place names whose appearance in the extraction, but not in the source
# text, is treated as added information.
_LOCATION_TERMS = (
    "united states", "america", "u.s.", "usa", "china", "india", "europe",
    "russia", "japan", "uk", "germany", "france", "australia",
)


def _mentions_term(term, lowered_text):
    """Return True if `term` occurs in `lowered_text` as a whole word/phrase."""
    if not term:
        return False
    pattern = r"(?<!\w)" + re.escape(term)
    # Terms ending in punctuation ("u.s.") get no trailing assertion so that
    # they still match when directly followed by more text.
    if term[-1].isalnum():
        pattern += r"(?!\w)"
    return re.search(pattern, lowered_text) is not None


def validate_extraction(original_text, extracted_claim, nlp):
    """
    Validate that the extracted claim doesn't add information not present
    in the original text.

    Args:
        original_text: The text the claim was extracted from.
        extracted_claim: The candidate claim to validate.
        nlp: Callable that takes a string and returns an object whose
            ``ents`` iterable yields items with a ``text`` attribute.

    Returns:
        ``extracted_claim`` when it passes every check; otherwise
        ``original_text`` (empty extraction, added location, added entity,
        or an error while running ``nlp``).
    """
    if not extracted_claim or not extracted_claim.strip():
        logger.warning("Empty extraction result, using original text")
        return original_text

    extracted_lower = extracted_claim.lower()
    original_lower = original_text.lower()

    # Whole-word matching: plain substring tests made "usa" fire inside
    # "thousand" and let "ukraine" in the source hide an added "uk".
    for term in _LOCATION_TERMS:
        if _mentions_term(term, extracted_lower) and not _mentions_term(term, original_lower):
            logger.warning(f"Extraction added location '{term}' not in original, using original text")
            return original_text

    try:
        extracted_entities = [ent.text.lower() for ent in nlp(extracted_claim).ents]
        original_entities = [ent.text.lower() for ent in nlp(original_text).ents]

        for entity in extracted_entities:
            overlaps_known_entity = any(
                entity in orig_entity or orig_entity in entity
                for orig_entity in original_entities
            )
            # An entity that literally appears in the source is not "added",
            # even if the NER model did not tag it there.
            if not overlaps_known_entity and not _mentions_term(entity, original_lower):
                logger.warning(f"Extraction added new entity '{entity}', using original text")
                return original_text

        return extracted_claim
    except Exception as e:
        # On error it is safer to return the original than an unchecked claim.
        logger.error("Error in extraction validation: %s", e)
        return original_text
# Role titles to keep in the query when no collected term already covers them.
_TITLE_TERMS = ("president", "prime minister", "minister", "chancellor",
                "premier", "governor", "mayor", "senator")
# Country / nationality words to keep when present as a standalone token.
_COUNTRY_TERMS = ("canada", "canadian", "us", "united states", "american",
                  "uk", "british", "australia", "china", "russian")
# Words that flip or qualify the meaning of a claim.
_NEGATION_TERMS = ("not", "no longer", "former", "ex-", "isn't", "aren't",
                   "doesn't", "don't")
# Words that make a negated claim time-dependent.
_TIME_TERMS = ("anymore", "still", "currently", "now", "today", "recent")
_MAX_SEARCH_TERMS = 6
_FALLBACK_WORD_COUNT = 8


def _context_phrases(claim, term, quantifier):
    """Return each whole-word match of `term` in `claim` plus neighbouring words.

    `quantifier` is the regex repetition applied to the words on either side,
    e.g. "{0,3}" for up to three words before and after.
    """
    pattern = (r'(?i)(?:\w+\s+)' + quantifier + r'\b' + re.escape(term)
               + r'\b(?:\s+\w+)' + quantifier)
    return [match.group(0) for match in re.finditer(pattern, claim)]


def _is_covered(term, phrases):
    """Return True if `term` is a substring of any already-collected phrase."""
    return any(term in phrase.lower() for phrase in phrases)


def shorten_claim_for_evidence(claim):
    """
    Shorten a claim to use for evidence retrieval by preserving important
    keywords while maintaining claim context.

    Terms are collected in this order: named entities, NOUN/PROPN tokens,
    title phrases, country tokens, negation phrases and (for negated claims)
    time phrases. At most six distinct terms are kept and joined in the order
    they first appear in the claim.

    Args:
        claim: The claim text to shorten.

    Returns:
        The shortened query string. The first eight words of the claim are
        returned instead when fewer than two entities were found and no
        collected term contains "minister", or when the query ends up under
        three words for a claim longer than five words. On any error the
        claim is returned unchanged.
    """
    try:
        nlp = get_nlp_model()
        doc = nlp(claim)
        claim_lower = claim.lower()

        entities = [ent.text for ent in doc.ents]
        important_words = list(entities)
        # Negation/time phrases that must survive the term cap below.
        must_keep = []

        for token in doc:
            if token.pos_ in ("NOUN", "PROPN") and token.text not in important_words:
                important_words.append(token.text)

        # NOTE(review): the unbounded "*" makes this match the whole run of
        # words around the title, not just an adjacent modifier.
        for term in _TITLE_TERMS:
            if term in claim_lower and not _is_covered(term, important_words):
                for phrase in _context_phrases(claim, term, "*"):
                    if phrase not in important_words:
                        important_words.append(phrase)

        for term in _COUNTRY_TERMS:
            if term in claim_lower and not _is_covered(term, important_words):
                for token in doc:
                    if token.text.lower() == term and token.text not in important_words:
                        important_words.append(token.text)

        # Negations are critical for meaning: keep up to three words of
        # context on each side.
        for term in _NEGATION_TERMS:
            if term in claim_lower:
                for phrase in _context_phrases(claim, term, "{0,3}"):
                    must_keep.append(phrase)
                    if phrase not in important_words:
                        important_words.append(phrase)
        negation_found = bool(must_keep)

        # For negated claims, also keep time qualifiers ("anymore", "still").
        if negation_found:
            for term in _TIME_TERMS:
                if term in claim_lower and not _is_covered(term, important_words):
                    for phrase in _context_phrases(claim, term, "{0,2}"):
                        must_keep.append(phrase)
                        if phrase not in important_words:
                            important_words.append(phrase)

        # Too little structure to build a keyword query: use the claim's start.
        if len(entities) < 2 and not _is_covered("minister", important_words):
            return " ".join(claim.split()[:_FALLBACK_WORD_COUNT])

        # Case-insensitive de-duplication, preserving first occurrence.
        seen = set()
        unique_terms = []
        for word in important_words:
            key = word.lower()
            if key not in seen:
                seen.add(key)
                unique_terms.append(word)

        # Negation/time phrases are collected last, so a plain "first six"
        # cut used to drop them whenever six entity/noun terms came first.
        # The stable sort moves them to the front before the cap is applied.
        protected = {phrase.lower() for phrase in must_keep}
        prioritized = sorted(unique_terms, key=lambda term: term.lower() not in protected)
        search_terms = prioritized[:_MAX_SEARCH_TERMS]

        # Restore the order in which the terms appear in the claim.
        search_terms.sort(key=lambda term: claim_lower.find(term.lower()))
        shortened_claim = " ".join(search_terms)

        # A query this short loses too much context; use the claim's start.
        if len(shortened_claim.split()) < 3 and len(claim.split()) > 5:
            shortened_claim = " ".join(claim.split()[:_FALLBACK_WORD_COUNT])

        logger.info(f"Shortened Claim: {shortened_claim}")
        return shortened_claim
    except Exception as e:
        # Best-effort: search with the full claim rather than fail.
        logger.error("Error in shortening claim: %s", e)
        return claim