#!/usr/bin/env python3
"""
Performance Evaluation Script for AskVeracity.

This script evaluates the performance of the AskVeracity fact-checking system
using a predefined set of test claims with known ground truth labels.
It collects metrics on accuracy, safety rate, processing time, and confidence scores
without modifying the core codebase.

Usage:
    python evaluate_performance.py [--limit N] [--output FILE]

Options:
    --limit N      Limit evaluation to first N claims (default: all)
    --output FILE  Save results to FILE (default: performance_results.json)
"""

import os
import sys
import json
import time
import argparse
from datetime import datetime

import matplotlib.pyplot as plt
from tabulate import tabulate

# Add the parent directory to sys.path if this script is run directly
if __name__ == "__main__":
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import the agent and performance tracker
import agent
from utils.performance import PerformanceTracker
from utils.models import initialize_models

# IMPORTANT NOTE FOR DEVELOPERS:
# The test claims below include many recent events that will become outdated.
# When using this script for testing or evaluation, please update these claims
# with relevant and up-to-date examples to ensure meaningful results.
# Performance metrics are heavily influenced by the recency and verifiability
# of these claims, so using outdated claims will likely lead to poor results.

# Define the test claims with ground truth labels
TEST_CLAIMS = [
    # True claims
    {"claim": "Dozens killed as gunmen massacre tourists in Kashmir beauty spot.", "expected": "True"},
    {"claim": "Pope Francis dies at 88.", "expected": "True"},
    {"claim": "OpenAI released new reasoning models called o3 and o4-mini.", "expected": "True"},
    {"claim": "Trump And Zelensky Clash Again As US Says Crimea Now Russian Territory.", "expected": "True"},
    {"claim": "Twelve states sue Donald Trump administration in trade court over chaotic and illegal tariff policy.", "expected": "True"},
    {"claim": "Zomato has been renamed to Eternal Limited.", "expected": "True"},
    {"claim": "The Taj Mahal is located in Agra.", "expected": "True"},
    {"claim": "ISRO achieves second docking with SpaDeX satellites.", "expected": "True"},
    {"claim": "The TV series Adolescence is streaming on Netflix.", "expected": "True"},
    {"claim": "Vladimir Putin offers to halt Ukraine invasion.", "expected": "True"},
    {"claim": "Meta released its Llama 4 language model.", "expected": "True"},
    {"claim": "Google launched Gemini 2.5 Pro Experimental, the first model in the Gemini 2.5 family.", "expected": "True"},
    {"claim": "Microsoft is rolling out improved Recall feature for Windows Insiders.", "expected": "True"},
    {"claim": "Microsoft announced a 1-bit language model that can run on CPU.", "expected": "True"},
    {"claim": "Royal Challengers Bengaluru beat Rajasthan Royals by 11 runs in yesterday's IPL match.", "expected": "True"},
    {"claim": "Anthropic introduced Claude Research.", "expected": "True"},
    {"claim": "The IMF has lowered India's growth projection for the fiscal year 2025-26 to 6.2 per cent.", "expected": "True"},
    {"claim": "In Bundesliga, Bayern Munich beat Heidenheim 4-0 last week.", "expected": "True"},
    {"claim": "Manchester United in Europa League semi-finals.", "expected": "True"},

    # False claims
    {"claim": "The Eiffel Tower is in Rome.", "expected": "False"},
    {"claim": "The earth is flat.", "expected": "False"},
    {"claim": "Rishi Sunak is the current Prime Minister of the UK.", "expected": "False"},
    {"claim": "New Zealand won the ICC Champions Trophy in 2025.", "expected": "False"},
| {"claim": "US President Donald trump to visit India next week.", "expected": "False"}, | |
| {"claim": "Quantum computers have definitively solved the protein folding problem.", "expected": "False"}, | |
| {"claim": "CRISPR gene editing has successfully cured type 1 diabetes in human clinical trials.", "expected": "False"}, | |
| {"claim": "Google's new quantum computer, Willow, has demonstrated remarkable capabilities by solving mathematical problems far beyond the reach of the fastest supercomputers.", "expected": "False"}, | |
| {"claim": "NASA confirmed that the James Webb Space Telescope has found definitive evidence of alien life on an exoplanet.", "expected": "False"}, | |
| {"claim": "Google launched Gemini 3.", "expected": "False"}, | |
| {"claim": "A solar eclipse was be seen in India on October 17, 2024.", "expected": "False"}, | |
| {"claim": "Tom Cruise and Shah Rukh Khan have starred in a Bollywood movie in the past.", "expected": "False"}, | |
| {"claim": "Germany has the highest GDP in the world.", "expected": "False"}, | |
| # Uncertain claims | |
| {"claim": "Aliens have visited the Earth.", "expected": "Uncertain"}, | |
| {"claim": "Information that falls into a black hole is permanently lost or destroyed.", "expected": "Uncertain"}, | |
| {"claim": "Time travel into the past is possible.", "expected": "Uncertain"}, | |
| {"claim": "Bigfoot (or Yeti) exists in remote wilderness areas.", "expected": "Uncertain"}, | |
| {"claim": "Intelligent life exists elsewhere in the universe.", "expected": "Uncertain"}, | |
| {"claim": "Yogi Adityanath will be the next Prime Minister of India.", "expected": "Uncertain"}, | |
| {"claim": "Consciousness continues to exist after biological death.", "expected": "Uncertain"}, | |
| {"claim": "There are multiple parallel universes.", "expected": "Uncertain"} | |
| ] | |
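
# Each test case is a dict with a "claim" string and an "expected" label, where
# "expected" must be exactly one of "True", "False", or "Uncertain" (the per-class
# counters in evaluate_claims() are keyed on these three values). A hypothetical
# new entry would look like:
#
#     {"claim": "Some recent, independently verifiable statement.", "expected": "True"},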


def setup_argument_parser():
    """
    Set up command line argument parsing.

    Returns:
        argparse.Namespace: Parsed command line arguments
    """
    parser = argparse.ArgumentParser(description="Evaluate AskVeracity performance")
    parser.add_argument("--limit", type=int, help="Limit evaluation to first N claims")
    parser.add_argument("--output", type=str, default="performance_results.json",
                        help="Output file for results (default: performance_results.json)")
    return parser.parse_args()


def initialize_system():
    """
    Initialize the system for evaluation.

    Returns:
        object: Initialized LangGraph agent
    """
    print("Initializing models and agent...")
    initialize_models()
    eval_agent = agent.setup_agent()
    return eval_agent


def normalize_classification(classification):
    """
    Normalize classification labels for consistent comparison.

    Args:
        classification (str): Classification label from the system

    Returns:
        str: Normalized classification label ("True", "False", or "Uncertain")
    """
    if not classification:
        return "Uncertain"

    if "true" in classification.lower():
        return "True"
    elif "false" in classification.lower():
        return "False"
    else:
        return "Uncertain"


def is_correct(actual, expected):
    """
    Determine if the actual classification matches the expected classification.

    Args:
        actual (str): Actual classification from the system
        expected (str): Expected (ground truth) classification

    Returns:
        bool: True if classifications match, False otherwise
    """
    # Normalize both for comparison
    normalized_actual = normalize_classification(actual)
    normalized_expected = expected

    return normalized_actual == normalized_expected


def is_safe(actual, expected):
    """
    Determine if the classification is "safe" - either correct or abstained (Uncertain)
    instead of making an incorrect assertion.

    Args:
        actual (str): Actual classification from the system
        expected (str): Expected (ground truth) classification

    Returns:
        bool: True if the classification is safe, False otherwise
    """
    # Normalize both for comparison
    normalized_actual = normalize_classification(actual)
    normalized_expected = expected

    # If the classification is correct, it's definitely safe
    if normalized_actual == normalized_expected:
        return True

    # If the system classified as "Uncertain", that's safe (abstaining rather than wrong assertion)
    if normalized_actual == "Uncertain":
        return True

    # Otherwise, the system made an incorrect assertion (False as True or True as False)
    return False
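
# Safety semantics at a glance (follows directly from is_correct/is_safe above):
#
#     actual \ expected |  True    False   Uncertain
#     ------------------+----------------------------
#     True              |  safe    unsafe  unsafe
#     False             |  unsafe  safe    unsafe
#     Uncertain         |  safe    safe    safe
#
# i.e. abstaining ("Uncertain") never hurts the safety rate, while asserting
# the wrong definite label does.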


def evaluate_claims(test_claims, eval_agent, limit=None):
    """
    Evaluate a list of claims using the fact-checking system.

    Args:
        test_claims (list): List of test claims with expected classifications
        eval_agent (object): Initialized LangGraph agent
        limit (int, optional): Maximum number of claims to evaluate

    Returns:
        tuple: (results, metrics)
            - results (list): Detailed results for each claim
            - metrics (dict): Aggregated performance metrics
    """
    # Initialize performance tracker
    performance_tracker = PerformanceTracker()

    # Limit the number of claims if requested
    if limit and limit > 0:
        claims_to_evaluate = test_claims[:limit]
    else:
        claims_to_evaluate = test_claims

    results = []
    total_count = len(claims_to_evaluate)
    correct_count = 0
    safe_count = 0

    # Classification counts
    classification_counts = {"True": 0, "False": 0, "Uncertain": 0}

    # Track processing times by expected classification
    processing_times = {"True": [], "False": [], "Uncertain": []}

    # Confidence scores by expected classification
    confidence_scores = {"True": [], "False": [], "Uncertain": []}

    # Track correct classifications by expected classification
    correct_by_class = {"True": 0, "False": 0, "Uncertain": 0}
    safe_by_class = {"True": 0, "False": 0, "Uncertain": 0}
    total_by_class = {"True": 0, "False": 0, "Uncertain": 0}

    print(f"Evaluating {len(claims_to_evaluate)} claims...")

    # Process each claim
    for idx, test_case in enumerate(claims_to_evaluate):
        claim = test_case["claim"]
        expected = test_case["expected"]

        print(f"\nProcessing claim {idx+1}/{len(claims_to_evaluate)}: {claim}")

        try:
            # Process the claim and measure time
            start_time = time.time()
            result = agent.process_claim(claim, eval_agent)
            total_time = time.time() - start_time

            # Extract classification and confidence
            classification = result.get("classification", "Uncertain")
            confidence = result.get("confidence", 0.0)

            # Normalize classification for comparison
            normalized_classification = normalize_classification(classification)

            # Check if classification is correct
            correct = is_correct(normalized_classification, expected)
            if correct:
                correct_count += 1
                correct_by_class[expected] += 1

            # Check if classification is safe
            safe = is_safe(normalized_classification, expected)
            if safe:
                safe_count += 1
                safe_by_class[expected] += 1

            # Update classification count
            classification_counts[normalized_classification] = classification_counts.get(normalized_classification, 0) + 1

            # Update counts by expected class
            total_by_class[expected] += 1

            # Update processing times
            processing_times[expected].append(total_time)

            # Update confidence scores
            confidence_scores[expected].append(confidence)

            # Save detailed result
            detail_result = {
                "claim": claim,
                "expected": expected,
                "actual": normalized_classification,
                "correct": correct,
                "safe": safe,
                "confidence": confidence,
                "processing_time": total_time
            }
            results.append(detail_result)

            # Print progress indicator
            outcome = "✓" if correct else "✗"
            safety = "(safe)" if safe and not correct else ""
            print(f" Result: {normalized_classification} (Expected: {expected}) {outcome} {safety}")
            print(f" Time: {total_time:.2f}s, Confidence: {confidence:.2f}")

        except Exception as e:
            print(f"Error processing claim: {str(e)}")
            results.append({
                "claim": claim,
                "expected": expected,
                "error": str(e)
            })

    # Calculate performance metrics
    accuracy = correct_count / total_count if total_count > 0 else 0
    safety_rate = safe_count / total_count if total_count > 0 else 0

    # Calculate per-class metrics
    class_metrics = {}
    for cls in ["True", "False", "Uncertain"]:
        class_accuracy = correct_by_class[cls] / total_by_class[cls] if total_by_class[cls] > 0 else 0
        class_safety_rate = safe_by_class[cls] / total_by_class[cls] if total_by_class[cls] > 0 else 0
        avg_time = sum(processing_times[cls]) / len(processing_times[cls]) if processing_times[cls] else 0
        avg_confidence = sum(confidence_scores[cls]) / len(confidence_scores[cls]) if confidence_scores[cls] else 0

        class_metrics[cls] = {
            "accuracy": class_accuracy,
            "safety_rate": class_safety_rate,
            "count": total_by_class[cls],
            "correct": correct_by_class[cls],
            "safe": safe_by_class[cls],
            "avg_processing_time": avg_time,
            "avg_confidence": avg_confidence
        }

    # Calculate overall metrics
    all_times = [r.get("processing_time", 0) for r in results if "processing_time" in r]
    all_confidence = [r.get("confidence", 0) for r in results if "confidence" in r]

    metrics = {
        "total_claims": total_count,
        "correct_claims": correct_count,
        "safe_claims": safe_count,
        "accuracy": accuracy,
        "safety_rate": safety_rate,
        "avg_processing_time": sum(all_times) / len(all_times) if all_times else 0,
        "avg_confidence": sum(all_confidence) / len(all_confidence) if all_confidence else 0,
        "classification_counts": classification_counts,
        "per_class_metrics": class_metrics
    }

    return results, metrics
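
# Minimal usage sketch (hypothetical values; assumes initialize_system() has run):
#
#     eval_agent = initialize_system()
#     results, metrics = evaluate_claims(TEST_CLAIMS, eval_agent, limit=3)
#     print(metrics["accuracy"], metrics["safety_rate"])
#
# With limit=3 only the first three entries of TEST_CLAIMS are evaluated.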


def save_results(results, metrics, output_file):
    """
    Save evaluation results to a JSON file.

    Args:
        results (list): Detailed results for each claim
        metrics (dict): Aggregated performance metrics
        output_file (str): Path to output file
    """
    output_data = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "metrics": metrics,
        "detailed_results": results
    }

    with open(output_file, 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"\nResults saved to {output_file}")


def print_summary(metrics):
    """
    Print a summary of performance metrics.

    Args:
        metrics (dict): Aggregated performance metrics
    """
    print("\n" + "="*70)
    print("PERFORMANCE SUMMARY")
    print("="*70)

    # Overall metrics
    print("\nOverall Metrics:")
    print(f"Total Claims: {metrics['total_claims']}")
    print(f"Correctly Classified: {metrics['correct_claims']}")
    print(f"Safely Classified: {metrics['safe_claims']}")
    print(f"Accuracy: {metrics['accuracy']:.2%}")
    print(f"Safety Rate: {metrics['safety_rate']:.2%}")
    print(f"Average Processing Time: {metrics['avg_processing_time']:.2f} seconds")
    print(f"Average Confidence Score: {metrics['avg_confidence']:.2f}")

    # Per-class metrics as table
    print("\nPer-Class Performance:")
    table_data = []
    headers = ["Class", "Count", "Correct", "Safe", "Accuracy", "Safety Rate", "Avg Time", "Avg Confidence"]

    for cls, cls_metrics in metrics['per_class_metrics'].items():
        table_data.append([
            cls,
            cls_metrics['count'],
            cls_metrics['correct'],
            cls_metrics['safe'],
            f"{cls_metrics['accuracy']:.2%}",
            f"{cls_metrics['safety_rate']:.2%}",
            f"{cls_metrics['avg_processing_time']:.2f}s",
            f"{cls_metrics['avg_confidence']:.2f}"
        ])

    print(tabulate(table_data, headers=headers, tablefmt="grid"))


def create_charts(metrics, output_dir="."):
    """
    Create visualizations of performance metrics.

    Args:
        metrics (dict): Aggregated performance metrics
        output_dir (str): Directory to save charts
    """
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Plot 1: Accuracy by class
        plt.figure(figsize=(10, 6))
        classes = list(metrics['per_class_metrics'].keys())
        accuracies = [metrics['per_class_metrics'][cls]['accuracy'] for cls in classes]
        plt.bar(classes, accuracies, color=['green', 'red', 'gray'])
        plt.title('Accuracy by Classification Type')
        plt.xlabel('Classification')
        plt.ylabel('Accuracy')
        plt.ylim(0, 1)
        for i, v in enumerate(accuracies):
            plt.text(i, v + 0.02, f"{v:.2%}", ha='center')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'accuracy_by_class.png'))
        plt.close()  # Close the figure to free memory

        # Plot 2: Safety rate by class
        plt.figure(figsize=(10, 6))
        safety_rates = [metrics['per_class_metrics'][cls]['safety_rate'] for cls in classes]
        plt.bar(classes, safety_rates, color=['green', 'red', 'gray'])
        plt.title('Safety Rate by Classification Type')
        plt.xlabel('Classification')
        plt.ylabel('Safety Rate')
        plt.ylim(0, 1)
        for i, v in enumerate(safety_rates):
            plt.text(i, v + 0.02, f"{v:.2%}", ha='center')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'safety_rate_by_class.png'))
        plt.close()  # Close the figure to free memory

        # Plot 3: Processing time by class
        plt.figure(figsize=(10, 6))
        times = [metrics['per_class_metrics'][cls]['avg_processing_time'] for cls in classes]
        plt.bar(classes, times, color=['green', 'red', 'gray'])
        plt.title('Average Processing Time by Classification Type')
        plt.xlabel('Classification')
        plt.ylabel('Time (seconds)')
        for i, v in enumerate(times):
            plt.text(i, v + 0.5, f"{v:.2f}s", ha='center')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'processing_time_by_class.png'))
        plt.close()  # Close the figure to free memory

        # Plot 4: Confidence scores by class
        plt.figure(figsize=(10, 6))
        confidence = [metrics['per_class_metrics'][cls]['avg_confidence'] for cls in classes]
        plt.bar(classes, confidence, color=['green', 'red', 'gray'])
        plt.title('Average Confidence Score by Classification Type')
        plt.xlabel('Classification')
        plt.ylabel('Confidence Score')
        plt.ylim(0, 1)
        for i, v in enumerate(confidence):
            plt.text(i, v + 0.02, f"{v:.2f}", ha='center')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'confidence_by_class.png'))
        plt.close()  # Close the figure to free memory

        print(f"\nCharts created in {output_dir}")

    except Exception as e:
        print(f"Error creating charts: {str(e)}")
        print("Continuing without charts.")


def main():
    """Main evaluation function that runs the entire evaluation process."""
    # Parse arguments
    args = setup_argument_parser()

    # Initialize the agent
    eval_agent = initialize_system()

    # Create results directory if it doesn't exist
    results_dir = "results"
    os.makedirs(results_dir, exist_ok=True)

    # Set output file path
    output_file = args.output
    if not os.path.isabs(output_file):
        output_file = os.path.join(results_dir, output_file)

    # Evaluate claims
    results, metrics = evaluate_claims(TEST_CLAIMS, eval_agent, args.limit)

    # Print summary
    print_summary(metrics)

    # Save results
    save_results(results, metrics, output_file)

    # Create charts (matplotlib and tabulate are imported at module level,
    # and create_charts catches and reports its own errors)
    create_charts(metrics, results_dir)


if __name__ == "__main__":
    main()