ankanghosh committed
Commit 3b7eec8 · verified · 1 Parent(s): e3cde99

Delete utils

Files changed (4)
  1. utils/__init__.py +0 -20
  2. utils/api_utils.py +0 -229
  3. utils/models.py +0 -157
  4. utils/performance.py +0 -135
utils/__init__.py DELETED
@@ -1,20 +0,0 @@
- """
- Utils package initialization.
-
- This package provides utility functions for the AskVeracity fact-checking system.
- """
-
- from .api_utils import api_error_handler, safe_json_parse, RateLimiter
- from .performance import PerformanceTracker
- from .models import initialize_models, get_nlp_model, get_llm_model
-
-
- __all__ = [
-     'api_error_handler',
-     'safe_json_parse',
-     'RateLimiter',
-     'PerformanceTracker',
-     'initialize_models',
-     'get_nlp_model',
-     'get_llm_model'
- ]
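For context, the package's public surface was consumed roughly as in the sketch below. This is a hypothetical call site, not part of the commit; it assumes the package is importable and that OPENAI_API_KEY is set.

# Hypothetical call site for the deleted utils package.
from utils import PerformanceTracker, initialize_models, get_nlp_model

initialize_models()                    # loads spaCy and ChatOpenAI once
tracker = PerformanceTracker()
doc = get_nlp_model()("NASA confirmed the launch date.")
print([(ent.text, ent.label_) for ent in doc.ents])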
utils/api_utils.py DELETED
@@ -1,229 +0,0 @@
- """
- API utilities for the Fake News Detector application.
-
- This module provides utilities for handling API calls, rate limiting,
- error handling, and exponential backoff for retrying failed requests.
- """
-
- import time
- import functools
- import random
- import logging
- import requests
- from datetime import datetime, timedelta
- from collections import deque
-
- from config import RATE_LIMITS, ERROR_BACKOFF
-
- logger = logging.getLogger("misinformation_detector")
-
- class RateLimiter:
-     """
-     Rate limiter for API calls with support for different APIs.
-
-     This class implements a token bucket algorithm for rate limiting,
-     with support for different rate limits for different APIs.
-     It also provides exponential backoff for error handling.
-     """
-
-     def __init__(self):
-         """Initialize the rate limiter with configuration from settings."""
-         # Store rate limits for different APIs
-         self.limits = {}
-
-         # Initialize limits from config
-         for api_name, limit_info in RATE_LIMITS.items():
-             self.limits[api_name] = {
-                 "requests": limit_info["requests"],
-                 "period": limit_info["period"],
-                 "timestamps": deque()
-             }
-
-         # Error backoff settings
-         self.max_retries = ERROR_BACKOFF["max_retries"]
-         self.initial_backoff = ERROR_BACKOFF["initial_backoff"]
-         self.backoff_factor = ERROR_BACKOFF["backoff_factor"]
-
-     def check_and_update(self, api_name):
-         """
-         Check if request is allowed and update timestamps.
-
-         Args:
-             api_name (str): Name of the API to check
-
-         Returns:
-             tuple: (allowed, wait_time)
-                 - allowed (bool): Whether the request is allowed
-                 - wait_time (float): Time to wait if not allowed
-         """
-         if api_name not in self.limits:
-             return True, 0  # Unknown API, allow by default
-
-         now = datetime.now()
-         limit_info = self.limits[api_name]
-
-         # Remove timestamps older than the period
-         cutoff = now - timedelta(seconds=limit_info["period"])
-         while limit_info["timestamps"] and limit_info["timestamps"][0] < cutoff:
-             limit_info["timestamps"].popleft()
-
-         # Check if we're at the rate limit
-         if len(limit_info["timestamps"]) >= limit_info["requests"]:
-             # Calculate wait time until oldest timestamp expires
-             wait_time = (limit_info["timestamps"][0] + timedelta(seconds=limit_info["period"]) - now).total_seconds()
-             return False, max(0, wait_time)
-
-         # Add current timestamp and allow request
-         limit_info["timestamps"].append(now)
-         return True, 0
-
-     def wait_if_needed(self, api_name):
-         """
-         Wait if rate limit is reached.
-
-         Args:
-             api_name (str): Name of the API to check
-
-         Returns:
-             bool: True if waited, False otherwise
-         """
-         allowed, wait_time = self.check_and_update(api_name)
-         if not allowed:
-             logger.info(f"Rate limit reached for {api_name}. Waiting {wait_time:.2f} seconds...")
-             time.sleep(wait_time + 0.1)  # Add a small buffer
-             return True
-         return False
-
-     def get_backoff_time(self, attempt):
-         """
-         Calculate exponential backoff time with jitter.
-
-         Args:
-             attempt (int): Current attempt number (0-based)
-
-         Returns:
-             float: Backoff time in seconds
-         """
-         backoff = self.initial_backoff * (self.backoff_factor ** attempt)
-         # Add jitter to prevent thundering herd problem
-         jitter = random.uniform(0, 0.1 * backoff)
-         return backoff + jitter
-
-
- # Create rate limiter instance
- rate_limiter = RateLimiter()
-
- # API Error Handler decorator
- def api_error_handler(api_name):
-     """
-     Decorator for API calls with error handling and rate limiting.
-
-     This decorator handles rate limiting, retries with exponential
-     backoff, and error handling for API calls.
-
-     Args:
-         api_name (str): Name of the API being called
-
-     Returns:
-         callable: Decorated function
-     """
-     def decorator(func):
-         @functools.wraps(func)
-         def wrapper(*args, **kwargs):
-             try:
-                 # Apply rate limiting - make sure rate_limiter exists and has the method
-                 if hasattr(rate_limiter, 'wait_if_needed'):
-                     rate_limiter.wait_if_needed(api_name)
-
-                 # Track retries
-                 for attempt in range(rate_limiter.max_retries):
-                     try:
-                         return func(*args, **kwargs)
-                     except requests.exceptions.HTTPError as e:
-                         status_code = e.response.status_code if hasattr(e, 'response') else 0
-
-                         # Handle specific HTTP errors
-                         if status_code == 429:  # Too Many Requests
-                             logger.warning(f"{api_name} rate limit exceeded (429). Attempt {attempt+1}/{rate_limiter.max_retries}")
-                             # Get retry-after header or use exponential backoff
-                             retry_after = e.response.headers.get('Retry-After')
-                             if retry_after and retry_after.isdigit():
-                                 wait_time = int(retry_after)
-                             else:
-                                 wait_time = rate_limiter.get_backoff_time(attempt)
-                             logger.info(f"Waiting {wait_time} seconds before retry...")
-                             time.sleep(wait_time)
-                         elif status_code >= 500:  # Server errors
-                             logger.warning(f"{api_name} server error ({status_code}). Attempt {attempt+1}/{rate_limiter.max_retries}")
-                             time.sleep(rate_limiter.get_backoff_time(attempt))
-                         elif status_code == 403:  # Forbidden - likely API key issue
-                             logger.error(f"{api_name} access forbidden (403). Check API key.")
-                             return None  # Don't retry on auth errors
-                         elif status_code == 404:  # Not Found
-                             logger.warning(f"{api_name} resource not found (404).")
-                             return None  # Don't retry on resource not found
-                         else:
-                             logger.error(f"{api_name} HTTP error: {e}")
-                             if attempt < rate_limiter.max_retries - 1:
-                                 wait_time = rate_limiter.get_backoff_time(attempt)
-                                 logger.info(f"Waiting {wait_time} seconds before retry...")
-                                 time.sleep(wait_time)
-                             else:
-                                 return None
-
-                     except requests.exceptions.ConnectionError as e:
-                         logger.error(f"{api_name} connection error: {e}")
-                         if attempt < rate_limiter.max_retries - 1:
-                             wait_time = rate_limiter.get_backoff_time(attempt)
-                             logger.info(f"Waiting {wait_time} seconds before retry...")
-                             time.sleep(wait_time)
-                         else:
-                             return None
-
-                     except requests.exceptions.Timeout as e:
-                         logger.error(f"{api_name} timeout error: {e}")
-                         if attempt < rate_limiter.max_retries - 1:
-                             wait_time = rate_limiter.get_backoff_time(attempt)
-                             logger.info(f"Waiting {wait_time} seconds before retry...")
-                             time.sleep(wait_time)
-                         else:
-                             return None
-
-                     except Exception as e:
-                         logger.error(f"{api_name} unexpected error: {str(e)}")
-                         if attempt < rate_limiter.max_retries - 1:
-                             wait_time = rate_limiter.get_backoff_time(attempt)
-                             logger.info(f"Waiting {wait_time} seconds before retry...")
-                             time.sleep(wait_time)
-                         else:
-                             return None
-
-                 # If we've exhausted all retries
-                 logger.error(f"{api_name} call failed after {rate_limiter.max_retries} attempts")
-                 return None
-
-             except Exception as e:
-                 # Catch any unexpected errors in the decorator itself
-                 logger.error(f"{api_name} decorator error: {str(e)}")
-                 return None
-
-         return wrapper
-     return decorator
-
- def safe_json_parse(response, api_name):
-     """
-     Safely parse JSON response with error handling.
-
-     Args:
-         response (requests.Response): Response object to parse
-         api_name (str): Name of the API for logging
-
-     Returns:
-         dict: Parsed JSON or empty dict on error
-     """
-     try:
-         return response.json()
-     except ValueError as e:
-         logger.error(f"Error parsing {api_name} JSON response: {e}")
-         logger.debug(f"Response content: {response.text[:500]}...")
-         return {}
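For context, the decorator and parser composed roughly as in the sketch below. This is a hypothetical consumer, not part of the commit: the "newsapi" key and the endpoint are illustrative assumptions, and in practice the key must match an entry in config.RATE_LIMITS.

# Hypothetical consumer of api_error_handler and safe_json_parse.
import requests
from utils.api_utils import api_error_handler, safe_json_parse

@api_error_handler("newsapi")          # assumed RATE_LIMITS key
def fetch_articles(query):
    response = requests.get(
        "https://newsapi.org/v2/everything",   # illustrative endpoint
        params={"q": query, "pageSize": 10},
        timeout=10,
    )
    response.raise_for_status()        # raise HTTPError so the decorator can back off and retry
    return safe_json_parse(response, "newsapi")

articles = fetch_articles("climate change") or {}  # decorator returns None once retries are exhausted

With ERROR_BACKOFF values of, say, initial_backoff=1 and backoff_factor=2, successive retries wait roughly 1s, 2s, and 4s, each plus up to 10% random jitter from get_backoff_time.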
utils/models.py DELETED
@@ -1,157 +0,0 @@
- """
- Model management utility for the Fake News Detector application.
-
- This module provides functions for initializing, caching, and
- retrieving language models used throughout the application.
- It ensures models are loaded efficiently and reused appropriately.
- """
-
- import os
- import logging
- import functools
- from langchain_openai import ChatOpenAI
- import spacy
-
- logger = logging.getLogger("misinformation_detector")
-
- # Global variables for models
- nlp = None
- model = None
- models_initialized = False
-
- # Add caching decorator
- def cached_model(func):
-     """
-     Decorator to cache model loading for improved performance.
-
-     This decorator ensures that models are only loaded once and
-     then reused for subsequent calls, improving performance by
-     avoiding redundant model loading.
-
-     Args:
-         func (callable): Function that loads a model
-
-     Returns:
-         callable: Wrapped function that returns a cached model
-     """
-     cache = {}
-
-     @functools.wraps(func)
-     def wrapper(*args, **kwargs):
-         # Use function name as cache key
-         key = func.__name__
-         if key not in cache:
-             logger.info(f"Model not in cache, calling {key}...")
-             cache[key] = func(*args, **kwargs)
-         return cache[key]
-
-     return wrapper
-
- def initialize_models():
-     """
-     Initialize all required models.
-
-     This function loads and initializes all the language models
-     needed by the application, including spaCy for NLP tasks and
-     OpenAI for LLM-based processing.
-
-     Returns:
-         str: Initialization status message
-
-     Raises:
-         ValueError: If OpenAI API key is not set
-     """
-     global nlp, model, models_initialized
-
-     # Skip initialization if already done
-     if models_initialized:
-         logger.info("Models already initialized, skipping initialization")
-         return "Models already initialized"
-
-     # Check OpenAI API key
-     if "OPENAI_API_KEY" not in os.environ or not os.environ["OPENAI_API_KEY"].strip():
-         logger.error("OPENAI_API_KEY environment variable not set or empty")
-         raise ValueError("OpenAI API key is required. Please set it in the Hugging Face Space secrets.")
-
-     try:
-         # Load NLP model
-         try:
-             logger.info("Loading spaCy NLP model...")
-             nlp = spacy.load("en_core_web_sm")
-             logger.info("Loaded spaCy NLP model")
-         except OSError as e:
-             # This handles the case if the model wasn't installed correctly
-             logger.warning(f"Could not load spaCy model: {str(e)}")
-             logger.info("Attempting to download spaCy model...")
-             try:
-                 import subprocess
-                 import sys
-                 # This downloads the model if it's missing
-                 subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
-                 # Try loading again
-                 nlp = spacy.load("en_core_web_sm")
-                 logger.info("Successfully downloaded and loaded spaCy model")
-             except Exception as download_err:
-                 logger.error(f"Failed to download spaCy model: {str(download_err)}")
-                 # Continue with other initialization, we'll handle missing NLP model elsewhere
-
-         # Set up OpenAI model
-         logger.info("Initializing ChatOpenAI model...")
-         model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
-         logger.info("Initialized ChatOpenAI model")
-
-         # Mark initialization as complete
-         models_initialized = True
-         return "Models initialized successfully"
-
-     except Exception as e:
-         logger.error(f"Error initializing models: {str(e)}")
-         raise e
-
- @cached_model
- def get_nlp_model():
-     """
-     Get the spaCy NLP model, initializing if needed.
-
-     This function returns a cached spaCy model for NLP tasks.
-     If the model hasn't been loaded yet, it will be loaded.
-
-     Returns:
-         spacy.Language: Loaded spaCy model
-     """
-     global nlp
-     if nlp is None:
-         try:
-             # Try to load just the spaCy model if not loaded yet
-             logger.info("Loading spaCy NLP model...")
-             nlp = spacy.load("en_core_web_sm")
-             logger.info("Loaded spaCy NLP model")
-         except Exception as e:
-             logger.error(f"Error loading spaCy model: {str(e)}")
-             # Fall back to full initialization
-             initialize_models()
-     return nlp
-
- @cached_model
- def get_llm_model():
-     """
-     Get the ChatOpenAI model, initializing if needed.
-
-     This function returns a cached OpenAI LLM model.
-     If the model hasn't been loaded yet, it will be loaded.
-
-     Returns:
-         ChatOpenAI: Loaded LLM model
-     """
-     global model
-     if model is None:
-         try:
-             # Try to load just the LLM model if not loaded yet
-             logger.info("Initializing ChatOpenAI model...")
-             model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
-             logger.info("Initialized ChatOpenAI model")
-         except Exception as e:
-             logger.error(f"Error initializing ChatOpenAI model: {str(e)}")
-             # Fall back to full initialization
-             initialize_models()
-     return model
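For context, a minimal usage sketch for the cached accessors above. This is a hypothetical call site, not part of the commit; it requires a real OPENAI_API_KEY in the environment, and the prompt text is illustrative.

# Hypothetical consumer of the deleted model accessors.
from utils.models import get_nlp_model, get_llm_model

nlp = get_nlp_model()                  # first call loads en_core_web_sm; later calls hit the cache
doc = nlp("Reuters reported the merger on Monday.")
entities = [ent.text for ent in doc.ents]

llm = get_llm_model()                  # returns the same ChatOpenAI instance every time
reply = llm.invoke(f"List the named entities in: {doc.text}")
print(entities, reply.content)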
utils/performance.py DELETED
@@ -1,135 +0,0 @@
- """
- Performance tracking utility for the Fake News Detector application.
-
- This module provides functionality to track and analyze the
- performance of the application, including processing times,
- success rates, and resource utilization.
- """
-
- import time
- import logging
-
- logger = logging.getLogger("misinformation_detector")
-
- class PerformanceTracker:
-     """
-     Tracks and logs performance metrics for the fact-checking system.
-
-     This class maintains counters and statistics for various performance
-     metrics, such as processing times, evidence retrieval success rates,
-     and confidence scores.
-     """
-
-     def __init__(self):
-         """Initialize the performance tracker with empty metrics."""
-         self.metrics = {
-             "claims_processed": 0,
-             "evidence_retrieval_success_rate": [],
-             "processing_times": [],
-             "confidence_scores": [],
-             "source_types_used": {},
-             "temporal_relevance": []
-         }
-
-     def log_claim_processed(self):
-         """
-         Increment the counter for processed claims.
-         This should be called whenever a claim is processed successfully.
-         """
-         self.metrics["claims_processed"] += 1
-
-     def log_evidence_retrieval(self, success, sources_count):
-         """
-         Log the success or failure of evidence retrieval.
-
-         Args:
-             success (bool): Whether evidence retrieval was successful
-             sources_count (dict): Count of evidence items by source type
-         """
-         # Ensure success is a boolean
-         success_value = 1 if success else 0
-         self.metrics["evidence_retrieval_success_rate"].append(success_value)
-
-         # Safely process source types
-         if isinstance(sources_count, dict):
-             for source_type, count in sources_count.items():
-                 # Ensure source_type is a string and count is an integer
-                 source_type = str(source_type)
-                 try:
-                     count = int(count)
-                 except (ValueError, TypeError):
-                     count = 1
-
-                 # Update source types used
-                 self.metrics["source_types_used"][source_type] = \
-                     self.metrics["source_types_used"].get(source_type, 0) + count
-
-     def log_processing_time(self, start_time):
-         """
-         Log the processing time for an operation.
-
-         Args:
-             start_time (float): Start time obtained from time.time()
-         """
-         end_time = time.time()
-         processing_time = end_time - start_time
-         self.metrics["processing_times"].append(processing_time)
-
-     def log_confidence_score(self, score):
-         """
-         Log a confidence score.
-
-         Args:
-             score (float): Confidence score between 0 and 1
-         """
-         # Ensure score is a float between 0 and 1
-         try:
-             score = float(score)
-             if 0 <= score <= 1:
-                 self.metrics["confidence_scores"].append(score)
-         except (ValueError, TypeError):
-             logger.warning(f"Invalid confidence score: {score}")
-
-     def log_temporal_relevance(self, relevance_score):
-         """
-         Log a temporal relevance score.
-
-         Args:
-             relevance_score (float): Temporal relevance score between 0 and 1
-         """
-         # Ensure relevance score is a float between 0 and 1
-         try:
-             relevance_score = float(relevance_score)
-             if 0 <= relevance_score <= 1:
-                 self.metrics["temporal_relevance"].append(relevance_score)
-         except (ValueError, TypeError):
-             logger.warning(f"Invalid temporal relevance score: {relevance_score}")
-
-     def get_summary(self):
-         """
-         Get a summary of all performance metrics.
-
-         Returns:
-             dict: Summary of performance metrics
-         """
-         # Safely calculate averages with error handling
-         def safe_avg(metric_list):
-             try:
-                 return sum(metric_list) / max(len(metric_list), 1)
-             except (TypeError, ValueError):
-                 return 0.0
-
-         return {
-             "claims_processed": self.metrics["claims_processed"],
-             "avg_evidence_retrieval_success_rate": safe_avg(self.metrics["evidence_retrieval_success_rate"]),
-             "avg_processing_time": safe_avg(self.metrics["processing_times"]),
-             "avg_confidence_score": safe_avg(self.metrics["confidence_scores"]),
-             "source_types_used": dict(self.metrics["source_types_used"]),
-             "avg_temporal_relevance": safe_avg(self.metrics["temporal_relevance"])
-         }
-
-     def reset(self):
-         """Reset all performance metrics."""
-         self.__init__()
-         logger.info("Performance metrics have been reset")
-         return "Performance metrics reset successfully"