ankanghosh committed
Commit 3b7eec8 · verified · 1 Parent(s): e3cde99

Delete utils

Files changed (4)
  1. utils/__init__.py +0 -20
  2. utils/api_utils.py +0 -229
  3. utils/models.py +0 -157
  4. utils/performance.py +0 -135
utils/__init__.py DELETED
@@ -1,20 +0,0 @@
- """
- Utils package initialization.
-
- This package provides utility functions for the AskVeracity fact-checking system.
- """
-
- from .api_utils import api_error_handler, safe_json_parse, RateLimiter
- from .performance import PerformanceTracker
- from .models import initialize_models, get_nlp_model, get_llm_model
-
-
- __all__ = [
-     'api_error_handler',
-     'safe_json_parse',
-     'RateLimiter',
-     'PerformanceTracker',
-     'initialize_models',
-     'get_nlp_model',
-     'get_llm_model'
- ]
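For context, the package's public surface was consumed roughly as in the sketch below. This is a hypothetical call site, not part of the commit; it assumes the package is importable and that OPENAI_API_KEY is set.

# Hypothetical call site for the deleted utils package.
from utils import PerformanceTracker, initialize_models, get_nlp_model

initialize_models()                    # loads spaCy and ChatOpenAI once
tracker = PerformanceTracker()
doc = get_nlp_model()("NASA confirmed the launch date.")
print([(ent.text, ent.label_) for ent in doc.ents])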
utils/api_utils.py DELETED
@@ -1,229 +0,0 @@
- """
- API utilities for the Fake News Detector application.
-
- This module provides utilities for handling API calls, rate limiting,
- error handling, and exponential backoff for retrying failed requests.
- """
-
- import time
- import functools
- import random
- import logging
- import requests
- from datetime import datetime, timedelta
- from collections import deque
-
- from config import RATE_LIMITS, ERROR_BACKOFF
-
- logger = logging.getLogger("misinformation_detector")
-
- class RateLimiter:
-     """
-     Rate limiter for API calls with support for different APIs.
-
-     This class implements a token bucket algorithm for rate limiting,
-     with support for different rate limits for different APIs.
-     It also provides exponential backoff for error handling.
-     """
-
-     def __init__(self):
-         """Initialize the rate limiter with configuration from settings."""
-         # Store rate limits for different APIs
-         self.limits = {}
-
-         # Initialize limits from config
-         for api_name, limit_info in RATE_LIMITS.items():
-             self.limits[api_name] = {
-                 "requests": limit_info["requests"],
-                 "period": limit_info["period"],
-                 "timestamps": deque()
-             }
-
-         # Error backoff settings
-         self.max_retries = ERROR_BACKOFF["max_retries"]
-         self.initial_backoff = ERROR_BACKOFF["initial_backoff"]
-         self.backoff_factor = ERROR_BACKOFF["backoff_factor"]
-
-     def check_and_update(self, api_name):
-         """
-         Check if request is allowed and update timestamps.
-
-         Args:
-             api_name (str): Name of the API to check
-
-         Returns:
-             tuple: (allowed, wait_time)
-                 - allowed (bool): Whether the request is allowed
-                 - wait_time (float): Time to wait if not allowed
-         """
-         if api_name not in self.limits:
-             return True, 0  # Unknown API, allow by default
-
-         now = datetime.now()
-         limit_info = self.limits[api_name]
-
-         # Remove timestamps older than the period
-         cutoff = now - timedelta(seconds=limit_info["period"])
-         while limit_info["timestamps"] and limit_info["timestamps"][0] < cutoff:
-             limit_info["timestamps"].popleft()
-
-         # Check if we're at the rate limit
-         if len(limit_info["timestamps"]) >= limit_info["requests"]:
-             # Calculate wait time until oldest timestamp expires
-             wait_time = (limit_info["timestamps"][0] + timedelta(seconds=limit_info["period"]) - now).total_seconds()
-             return False, max(0, wait_time)
-
-         # Add current timestamp and allow request
-         limit_info["timestamps"].append(now)
-         return True, 0
-
-     def wait_if_needed(self, api_name):
-         """
-         Wait if rate limit is reached.
-
-         Args:
-             api_name (str): Name of the API to check
-
-         Returns:
-             bool: True if waited, False otherwise
-         """
-         allowed, wait_time = self.check_and_update(api_name)
-         if not allowed:
-             logger.info(f"Rate limit reached for {api_name}. Waiting {wait_time:.2f} seconds...")
-             time.sleep(wait_time + 0.1)  # Add a small buffer
-             return True
-         return False
-
-     def get_backoff_time(self, attempt):
-         """
-         Calculate exponential backoff time with jitter.
-
-         Args:
-             attempt (int): Current attempt number (0-based)
-
-         Returns:
-             float: Backoff time in seconds
-         """
-         backoff = self.initial_backoff * (self.backoff_factor ** attempt)
-         # Add jitter to prevent thundering herd problem
-         jitter = random.uniform(0, 0.1 * backoff)
-         return backoff + jitter
-
-
- # Create rate limiter instance
- rate_limiter = RateLimiter()
-
- # API Error Handler decorator
- def api_error_handler(api_name):
-     """
-     Decorator for API calls with error handling and rate limiting.
-
-     This decorator handles rate limiting, retries with exponential
-     backoff, and error handling for API calls.
-
-     Args:
-         api_name (str): Name of the API being called
-
-     Returns:
-         callable: Decorated function
-     """
-     def decorator(func):
-         @functools.wraps(func)
-         def wrapper(*args, **kwargs):
-             try:
-                 # Apply rate limiting - make sure rate_limiter exists and has the method
-                 if hasattr(rate_limiter, 'wait_if_needed'):
-                     rate_limiter.wait_if_needed(api_name)
-
-                 # Track retries
-                 for attempt in range(rate_limiter.max_retries):
-                     try:
-                         return func(*args, **kwargs)
-                     except requests.exceptions.HTTPError as e:
-                         status_code = e.response.status_code if hasattr(e, 'response') else 0
-
-                         # Handle specific HTTP errors
-                         if status_code == 429:  # Too Many Requests
-                             logger.warning(f"{api_name} rate limit exceeded (429). Attempt {attempt+1}/{rate_limiter.max_retries}")
-                             # Get retry-after header or use exponential backoff
-                             retry_after = e.response.headers.get('Retry-After')
-                             if retry_after and retry_after.isdigit():
-                                 wait_time = int(retry_after)
-                             else:
-                                 wait_time = rate_limiter.get_backoff_time(attempt)
-                             logger.info(f"Waiting {wait_time} seconds before retry...")
-                             time.sleep(wait_time)
-                         elif status_code >= 500:  # Server errors
-                             logger.warning(f"{api_name} server error ({status_code}). Attempt {attempt+1}/{rate_limiter.max_retries}")
-                             time.sleep(rate_limiter.get_backoff_time(attempt))
-                         elif status_code == 403:  # Forbidden - likely API key issue
-                             logger.error(f"{api_name} access forbidden (403). Check API key.")
-                             return None  # Don't retry on auth errors
-                         elif status_code == 404:  # Not Found
-                             logger.warning(f"{api_name} resource not found (404).")
-                             return None  # Don't retry on resource not found
-                         else:
-                             logger.error(f"{api_name} HTTP error: {e}")
-                             if attempt < rate_limiter.max_retries - 1:
-                                 wait_time = rate_limiter.get_backoff_time(attempt)
-                                 logger.info(f"Waiting {wait_time} seconds before retry...")
-                                 time.sleep(wait_time)
-                             else:
-                                 return None
-
-                     except requests.exceptions.ConnectionError as e:
-                         logger.error(f"{api_name} connection error: {e}")
-                         if attempt < rate_limiter.max_retries - 1:
-                             wait_time = rate_limiter.get_backoff_time(attempt)
-                             logger.info(f"Waiting {wait_time} seconds before retry...")
-                             time.sleep(wait_time)
-                         else:
-                             return None
-
-                     except requests.exceptions.Timeout as e:
-                         logger.error(f"{api_name} timeout error: {e}")
-                         if attempt < rate_limiter.max_retries - 1:
-                             wait_time = rate_limiter.get_backoff_time(attempt)
-                             logger.info(f"Waiting {wait_time} seconds before retry...")
-                             time.sleep(wait_time)
-                         else:
-                             return None
-
-                     except Exception as e:
-                         logger.error(f"{api_name} unexpected error: {str(e)}")
-                         if attempt < rate_limiter.max_retries - 1:
-                             wait_time = rate_limiter.get_backoff_time(attempt)
-                             logger.info(f"Waiting {wait_time} seconds before retry...")
-                             time.sleep(wait_time)
-                         else:
-                             return None
-
-                 # If we've exhausted all retries
-                 logger.error(f"{api_name} call failed after {rate_limiter.max_retries} attempts")
-                 return None
-
-             except Exception as e:
-                 # Catch any unexpected errors in the decorator itself
-                 logger.error(f"{api_name} decorator error: {str(e)}")
-                 return None
-
-         return wrapper
-     return decorator
-
- def safe_json_parse(response, api_name):
-     """
-     Safely parse JSON response with error handling.
-
-     Args:
-         response (requests.Response): Response object to parse
-         api_name (str): Name of the API for logging
-
-     Returns:
-         dict: Parsed JSON or empty dict on error
-     """
-     try:
-         return response.json()
-     except ValueError as e:
-         logger.error(f"Error parsing {api_name} JSON response: {e}")
-         logger.debug(f"Response content: {response.text[:500]}...")
-         return {}
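For context, the decorator and parser composed roughly as in the sketch below. This is a hypothetical consumer, not part of the commit: the "newsapi" key and the endpoint are illustrative assumptions, and in practice the key must match an entry in config.RATE_LIMITS.

# Hypothetical consumer of api_error_handler and safe_json_parse.
import requests
from utils.api_utils import api_error_handler, safe_json_parse

@api_error_handler("newsapi")          # assumed RATE_LIMITS key
def fetch_articles(query):
    response = requests.get(
        "https://newsapi.org/v2/everything",   # illustrative endpoint
        params={"q": query, "pageSize": 10},
        timeout=10,
    )
    response.raise_for_status()        # raise HTTPError so the decorator can back off and retry
    return safe_json_parse(response, "newsapi")

articles = fetch_articles("climate change") or {}  # decorator returns None once retries are exhausted

With ERROR_BACKOFF values of, say, initial_backoff=1 and backoff_factor=2, successive retries wait roughly 1s, 2s, and 4s, each plus up to 10% random jitter from get_backoff_time.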
utils/models.py DELETED
@@ -1,157 +0,0 @@
- """
- Model management utility for the Fake News Detector application.
-
- This module provides functions for initializing, caching, and
- retrieving language models used throughout the application.
- It ensures models are loaded efficiently and reused appropriately.
- """
-
- import os
- import logging
- import functools
- from langchain_openai import ChatOpenAI
- import spacy
-
- logger = logging.getLogger("misinformation_detector")
-
- # Global variables for models
- nlp = None
- model = None
- models_initialized = False
-
- # Add caching decorator
- def cached_model(func):
-     """
-     Decorator to cache model loading for improved performance.
-
-     This decorator ensures that models are only loaded once and
-     then reused for subsequent calls, improving performance by
-     avoiding redundant model loading.
-
-     Args:
-         func (callable): Function that loads a model
-
-     Returns:
-         callable: Wrapped function that returns a cached model
-     """
-     cache = {}
-
-     @functools.wraps(func)
-     def wrapper(*args, **kwargs):
-         # Use function name as cache key
-         key = func.__name__
-         if key not in cache:
-             logger.info(f"Model not in cache, calling {key}...")
-             cache[key] = func(*args, **kwargs)
-         return cache[key]
-
-     return wrapper
-
- def initialize_models():
-     """
-     Initialize all required models.
-
-     This function loads and initializes all the language models
-     needed by the application, including spaCy for NLP tasks and
-     OpenAI for LLM-based processing.
-
-     Returns:
-         str: Initialization status message
-
-     Raises:
-         ValueError: If OpenAI API key is not set
-     """
-     global nlp, model, models_initialized
-
-     # Skip initialization if already done
-     if models_initialized:
-         logger.info("Models already initialized, skipping initialization")
-         return "Models already initialized"
-
-     # Check OpenAI API key
-     if "OPENAI_API_KEY" not in os.environ or not os.environ["OPENAI_API_KEY"].strip():
-         logger.error("OPENAI_API_KEY environment variable not set or empty")
-         raise ValueError("OpenAI API key is required. Please set it in the Hugging Face Space secrets.")
-
-     try:
-         # Load NLP model
-         try:
-             logger.info("Loading spaCy NLP model...")
-             nlp = spacy.load("en_core_web_sm")
-             logger.info("Loaded spaCy NLP model")
-         except OSError as e:
-             # This handles the case if the model wasn't installed correctly
-             logger.warning(f"Could not load spaCy model: {str(e)}")
-             logger.info("Attempting to download spaCy model...")
-             try:
-                 import subprocess
-                 import sys
-                 # This downloads the model if it's missing
-                 subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
-                 # Try loading again
-                 nlp = spacy.load("en_core_web_sm")
-                 logger.info("Successfully downloaded and loaded spaCy model")
-             except Exception as download_err:
-                 logger.error(f"Failed to download spaCy model: {str(download_err)}")
-                 # Continue with other initialization, we'll handle missing NLP model elsewhere
-
-         # Set up OpenAI model
-         logger.info("Initializing ChatOpenAI model...")
-         model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
-         logger.info("Initialized ChatOpenAI model")
-
-         # Mark initialization as complete
-         models_initialized = True
-         return "Models initialized successfully"
-
-     except Exception as e:
-         logger.error(f"Error initializing models: {str(e)}")
-         raise e
-
- @cached_model
- def get_nlp_model():
-     """
-     Get the spaCy NLP model, initializing if needed.
-
-     This function returns a cached spaCy model for NLP tasks.
-     If the model hasn't been loaded yet, it will be loaded.
-
-     Returns:
-         spacy.Language: Loaded spaCy model
-     """
-     global nlp
-     if nlp is None:
-         try:
-             # Try to load just the spaCy model if not loaded yet
-             logger.info("Loading spaCy NLP model...")
-             nlp = spacy.load("en_core_web_sm")
-             logger.info("Loaded spaCy NLP model")
-         except Exception as e:
-             logger.error(f"Error loading spaCy model: {str(e)}")
-             # Fall back to full initialization
-             initialize_models()
-     return nlp
-
- @cached_model
- def get_llm_model():
-     """
-     Get the ChatOpenAI model, initializing if needed.
-
-     This function returns a cached OpenAI LLM model.
-     If the model hasn't been loaded yet, it will be loaded.
-
-     Returns:
-         ChatOpenAI: Loaded LLM model
-     """
-     global model
-     if model is None:
-         try:
-             # Try to load just the LLM model if not loaded yet
-             logger.info("Initializing ChatOpenAI model...")
-             model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
-             logger.info("Initialized ChatOpenAI model")
-         except Exception as e:
-             logger.error(f"Error initializing ChatOpenAI model: {str(e)}")
-             # Fall back to full initialization
-             initialize_models()
-     return model
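For context, a minimal usage sketch for the cached accessors above. This is a hypothetical call site, not part of the commit; it requires a real OPENAI_API_KEY in the environment, and the prompt text is illustrative.

# Hypothetical consumer of the deleted model accessors.
from utils.models import get_nlp_model, get_llm_model

nlp = get_nlp_model()                  # first call loads en_core_web_sm; later calls hit the cache
doc = nlp("Reuters reported the merger on Monday.")
entities = [ent.text for ent in doc.ents]

llm = get_llm_model()                  # returns the same ChatOpenAI instance every time
reply = llm.invoke(f"List the named entities in: {doc.text}")
print(entities, reply.content)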
utils/performance.py DELETED
@@ -1,135 +0,0 @@
- """
- Performance tracking utility for the Fake News Detector application.
-
- This module provides functionality to track and analyze the
- performance of the application, including processing times,
- success rates, and resource utilization.
- """
-
- import time
- import logging
-
- logger = logging.getLogger("misinformation_detector")
-
- class PerformanceTracker:
-     """
-     Tracks and logs performance metrics for the fact-checking system.
-
-     This class maintains counters and statistics for various performance
-     metrics, such as processing times, evidence retrieval success rates,
-     and confidence scores.
-     """
-
-     def __init__(self):
-         """Initialize the performance tracker with empty metrics."""
-         self.metrics = {
-             "claims_processed": 0,
-             "evidence_retrieval_success_rate": [],
-             "processing_times": [],
-             "confidence_scores": [],
-             "source_types_used": {},
-             "temporal_relevance": []
-         }
-
-     def log_claim_processed(self):
-         """
-         Increment the counter for processed claims.
-         This should be called whenever a claim is processed successfully.
-         """
-         self.metrics["claims_processed"] += 1
-
-     def log_evidence_retrieval(self, success, sources_count):
-         """
-         Log the success or failure of evidence retrieval.
-
-         Args:
-             success (bool): Whether evidence retrieval was successful
-             sources_count (dict): Count of evidence items by source type
-         """
-         # Ensure success is a boolean
-         success_value = 1 if success else 0
-         self.metrics["evidence_retrieval_success_rate"].append(success_value)
-
-         # Safely process source types
-         if isinstance(sources_count, dict):
-             for source_type, count in sources_count.items():
-                 # Ensure source_type is a string and count is an integer
-                 source_type = str(source_type)
-                 try:
-                     count = int(count)
-                 except (ValueError, TypeError):
-                     count = 1
-
-                 # Update source types used
-                 self.metrics["source_types_used"][source_type] = \
-                     self.metrics["source_types_used"].get(source_type, 0) + count
-
-     def log_processing_time(self, start_time):
-         """
-         Log the processing time for an operation.
-
-         Args:
-             start_time (float): Start time obtained from time.time()
-         """
-         end_time = time.time()
-         processing_time = end_time - start_time
-         self.metrics["processing_times"].append(processing_time)
-
-     def log_confidence_score(self, score):
-         """
-         Log a confidence score.
-
-         Args:
-             score (float): Confidence score between 0 and 1
-         """
-         # Ensure score is a float between 0 and 1
-         try:
-             score = float(score)
-             if 0 <= score <= 1:
-                 self.metrics["confidence_scores"].append(score)
-         except (ValueError, TypeError):
-             logger.warning(f"Invalid confidence score: {score}")
-
-     def log_temporal_relevance(self, relevance_score):
-         """
-         Log a temporal relevance score.
-
-         Args:
-             relevance_score (float): Temporal relevance score between 0 and 1
-         """
-         # Ensure relevance score is a float between 0 and 1
-         try:
-             relevance_score = float(relevance_score)
-             if 0 <= relevance_score <= 1:
-                 self.metrics["temporal_relevance"].append(relevance_score)
-         except (ValueError, TypeError):
-             logger.warning(f"Invalid temporal relevance score: {relevance_score}")
-
-     def get_summary(self):
-         """
-         Get a summary of all performance metrics.
-
-         Returns:
-             dict: Summary of performance metrics
-         """
-         # Safely calculate averages with error handling
-         def safe_avg(metric_list):
-             try:
-                 return sum(metric_list) / max(len(metric_list), 1)
-             except (TypeError, ValueError):
-                 return 0.0
-
-         return {
-             "claims_processed": self.metrics["claims_processed"],
-             "avg_evidence_retrieval_success_rate": safe_avg(self.metrics["evidence_retrieval_success_rate"]),
-             "avg_processing_time": safe_avg(self.metrics["processing_times"]),
-             "avg_confidence_score": safe_avg(self.metrics["confidence_scores"]),
-             "source_types_used": dict(self.metrics["source_types_used"]),
-             "avg_temporal_relevance": safe_avg(self.metrics["temporal_relevance"])
-         }
-
-     def reset(self):
-         """Reset all performance metrics."""
-         self.__init__()
-         logger.info("Performance metrics have been reset")
-         return "Performance metrics reset successfully"