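"""
Claim extraction utilities for the misinformation detector.

Extracts the main factual claim from input text (short inputs pass through
unchanged; longer text is condensed with an OpenAI model), validates that
extraction introduces no new entities, and shortens claims into compact
search queries for evidence retrieval.
"""
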
import logging
import time
import re

from utils.performance import PerformanceTracker
from utils.models import get_nlp_model, get_llm_model

logger = logging.getLogger("misinformation_detector")

performance_tracker = PerformanceTracker()

def extract_claims(text):
    """
    Extract the main factual claim from the provided text.
    For concise claims (<20 words), preserves them exactly.
    For longer text, uses OpenAI to extract the claim.
    """
    logger.info(f"Extracting claims from: {text}")
    start_time = time.time()

    # First, check if the input already appears to be a concise claim
    if len(text.split()) < 20:
        logger.info("Input appears to be a concise claim already, preserving as-is")
        performance_tracker.log_processing_time(start_time)
        performance_tracker.log_claim_processed()
        return text

    try:
        # For longer text, use OpenAI for extraction
        extracted_claim = extract_with_openai(text)
        
        # Log processing time
        performance_tracker.log_processing_time(start_time)
        performance_tracker.log_claim_processed()
        
        logger.info(f"Extracted claim: {extracted_claim}")
        return extracted_claim
    except Exception as e:
        logger.error(f"Error extracting claims: {str(e)}")
        # Fallback to original text on error
        return text
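
# Illustrative behavior of extract_claims (hypothetical inputs): a short claim
# such as "The Eiffel Tower is 330 metres tall" is returned verbatim, while a
# multi-sentence paragraph is condensed to its central claim via
# extract_with_openai() below.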

def extract_with_openai(text):
    """
    Use an OpenAI model for claim extraction
    """
    try:
        # Get LLM model
        llm_model = get_llm_model()
        
        # Create a very explicit prompt to avoid hallucination
        prompt = f"""
        Extract the main factual claim from the following text. 
        DO NOT add any information not present in the original text.
        DO NOT add locations, dates, or other details.
        ONLY extract what is explicitly stated.
        
        Text: {text}
        
        Main factual claim:
        """
        
        # Call the model with temperature=0 to minimize randomness in the output
        response = llm_model.invoke(prompt, temperature=0)
        extracted_claim = response.content.strip()
        
        # Strip any leading label (e.g., "Main factual claim:") that the model echoed back
        if ":" in extracted_claim:
            parts = extracted_claim.split(":")
            if len(parts) > 1:
                extracted_claim = parts[-1].strip()
        
        logger.info(f"OpenAI extraction: {extracted_claim}")
        
        # Validate that we're not adding info not in the original
        nlp = get_nlp_model()
        extracted_claim = validate_extraction(text, extracted_claim, nlp)
        
        return extracted_claim
    except Exception as e:
        logger.error(f"Error in OpenAI claim extraction: {str(e)}")
        return text  # Fallback to original

def validate_extraction(original_text, extracted_claim, nlp):
    """
    Validate that the extracted claim doesn't add information not present in the original text
    """
    # If extraction fails or is empty, return original
    if not extracted_claim or extracted_claim.strip() == "":
        logger.warning("Empty extraction result, using original text")
        return original_text
    
    # Check for added location information
    location_terms = ["united states", "america", "u.s.", "usa", "china", "india", "europe", 
                      "russia", "japan", "uk", "germany", "france", "australia"]
    for term in location_terms:
        if term in extracted_claim.lower() and term not in original_text.lower():
            logger.warning(f"Extraction added location '{term}' not in original, using original text")
            return original_text
    
    # Check for entity preservation/addition using spaCy
    try:
        # Get entities from extracted text
        extracted_doc = nlp(extracted_claim)
        extracted_entities = [ent.text.lower() for ent in extracted_doc.ents]
        
        # Get entities from original text
        original_doc = nlp(original_text)
        original_entities = [ent.text.lower() for ent in original_doc.ents]
        
        # Check for new entities that don't exist in original
        for entity in extracted_entities:
            if not any(entity in orig_entity or orig_entity in entity for orig_entity in original_entities):
                logger.warning(f"Extraction added new entity '{entity}', using original text")
                return original_text
        
        return extracted_claim
    except Exception as e:
        logger.error(f"Error in extraction validation: {str(e)}")
        return original_text  # On error, safer to return original
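
# Illustrative check for validate_extraction (hypothetical example): if the
# original text reads "The president signed the bill" and the model returns
# "President Biden signed the bill in Washington", then "Biden" and
# "Washington" are entities absent from the original, so the original text
# is returned instead.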

def shorten_claim_for_evidence(claim):
    """
    Shorten a claim to use for evidence retrieval by preserving important keywords
    while maintaining claim context
    """
    try:
        # Get NLP model
        nlp = get_nlp_model()
        
        # Use NER to extract key entities
        doc = nlp(claim)
        
        # Extract all named entities; these anchor the search query
        entities = [ent.text for ent in doc.ents]

        # Collect important context words, starting with the named entities
        important_words = list(entities)
        
        # Add important nouns and adjectives not already added
        for token in doc:
            if token.pos_ in ["NOUN", "PROPN"] and token.text not in important_words:
                important_words.append(token.text)
        
        # Make sure we include key terms like "prime minister", "president", etc.
        title_terms = ["president", "prime minister", "minister", "chancellor", "premier", "governor", "mayor", "senator"]
        for term in title_terms:
            if term in claim.lower() and not any(term in word.lower() for word in important_words):
                # Find the full phrase (e.g., "Canadian Prime Minister")
                matches = re.finditer(r'(?i)(?:\w+\s+)*\b' + re.escape(term) + r'\b(?:\s+\w+)*', claim)
                for match in matches:
                    phrase = match.group(0)
                    if phrase not in important_words:
                        important_words.append(phrase)
        
        # Add country names or important place references
        country_terms = ["canada", "canadian", "us", "united states", "american", "uk", "british", "australia", "china", "russian"]
        for term in country_terms:
            if term in claim.lower() and not any(term in word.lower() for word in important_words):
                for token in doc:
                    if token.text.lower() == term and token.text not in important_words:
                        important_words.append(token.text)
        
        # Always include negation words as they're critical for meaning
        negation_terms = ["not", "no longer", "former", "ex-", "isn't", "aren't", "doesn't", "don't"]
        negation_found = False
        for term in negation_terms:
            if term in claim.lower():
                # Find the context around the negation (3 words before and after)
                matches = re.finditer(r'(?i)(?:\w+\s+){0,3}\b' + re.escape(term) + r'\b(?:\s+\w+){0,3}', claim)
                for match in matches:
                    # A word-bounded negation match exists; record it even if the
                    # surrounding phrase was already captured above
                    negation_found = True
                    phrase = match.group(0)
                    if phrase not in important_words:
                        important_words.append(phrase)

        # Special handling for time-sensitive claims with negations
        is_time_sensitive = any(term in claim.lower() for term in ["anymore", "still", "currently", "now", "today", "recent"])
        
        # If we have both negation and time sensitivity, ensure we keep those key aspects
        if negation_found and is_time_sensitive:
            # Ensure we keep time-sensitive terms
            time_terms = ["anymore", "still", "currently", "now", "today", "recent"]
            for term in time_terms:
                if term in claim.lower() and not any(term in word.lower() for word in important_words):
                    # Add the context around the time term
                    matches = re.finditer(r'(?i)(?:\w+\s+){0,2}\b' + re.escape(term) + r'\b(?:\s+\w+){0,2}', claim)
                    for match in matches:
                        phrase = match.group(0)
                        if phrase not in important_words:
                            important_words.append(phrase)

        # If entities plus titles don't give us enough, include key parts of claim
        if len(entities) < 2 and not any("minister" in word.lower() for word in important_words):
            words = claim.split()
            # Use first 8 words
            return " ".join(words[:min(8, len(words))])
        
        # Remove duplicates while preserving order
        seen = set()
        unique_terms = []
        for word in important_words:
            if word.lower() not in seen:
                seen.add(word.lower())
                unique_terms.append(word)
        
        # Cap the query at six unique terms to keep the search focused
        search_terms = unique_terms[:min(6, len(unique_terms))]
        
        # Sort search terms to try to maintain original word order from claim
        def get_position(term):
            return claim.lower().find(term.lower())
        
        search_terms.sort(key=get_position)
        
        # Join terms to create search query
        shortened_claim = " ".join(search_terms)
        
        # If the shortened claim is too short compared to original, use more of original
        if len(shortened_claim.split()) < 3 and len(claim.split()) > 5:
            words = claim.split()
            shortened_claim = " ".join(words[:min(8, len(words))])
        
        logger.info(f"Shortened Claim: {shortened_claim}")

        return shortened_claim
    except Exception as e:
        logger.error(f"Error in shortening claim: {str(e)}")
        # Return original claim on error
        return claim
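
if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the pipeline. Assumes the utils
    # package and a spaCy model are available (and OpenAI credentials for
    # inputs of 20+ words); the sample claim below is hypothetical.
    logging.basicConfig(level=logging.INFO)
    sample = "Justin Trudeau is not the Prime Minister of Canada anymore"
    claim = extract_claims(sample)  # short input, returned as-is
    query = shorten_claim_for_evidence(claim)
    print(f"Claim:        {claim}")
    print(f"Search query: {query}")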