#!/usr/bin/env python3
"""
Performance Evaluation Script for AskVeracity.

This script evaluates the performance of the AskVeracity fact-checking system
using a predefined set of test claims with known ground truth labels.
It collects metrics on accuracy, safety rate, processing time, and confidence scores
without modifying the core codebase.

Usage:
    python evaluate_performance.py [--limit N] [--output FILE]

Options:
    --limit N      Limit evaluation to first N claims (default: all)
    --output FILE  Save results to FILE (default: performance_results.json)
"""

import os
import sys
import json
import time
import argparse
from datetime import datetime

import matplotlib.pyplot as plt
from tabulate import tabulate

# Add the parent directory to sys.path if this script is run directly
if __name__ == "__main__":
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import the agent and performance tracker
import agent
from utils.performance import PerformanceTracker
from utils.models import initialize_models

# IMPORTANT NOTE FOR DEVELOPERS:
# The test claims below include many recent events that will become outdated.
# When using this script for testing or evaluation, please update these claims
# with relevant and up-to-date examples to ensure meaningful results.
# Performance metrics are heavily influenced by the recency and verifiability
# of these claims, so using outdated claims will likely lead to poor results.

# Define the test claims with ground truth labels
TEST_CLAIMS = [
    # True claims
    {"claim": "Dozens killed as gunmen massacre tourists in Kashmir beauty spot.", "expected": "True"},
    {"claim": "Pope Francis dies at 88.", "expected": "True"},
    {"claim": "OpenAI released new reasoning models called o3 and o4-mini.", "expected": "True"},
    {"claim": "Trump And Zelensky Clash Again As US Says Crimea Now Russian Territory.", "expected": "True"},
    {"claim": "Twelve states sue Donald Trump administration in trade court over chaotic and illegal tariff policy.", "expected": "True"},
    {"claim": "Zomato has been renamed to Eternal Limited.", "expected": "True"},
    {"claim": "The Taj Mahal is located in Agra.", "expected": "True"},
    {"claim": "ISRO achieves second docking with SpaDeX satellites.", "expected": "True"},
    {"claim": "The TV series Adolescence is streaming on Netflix.", "expected": "True"},
    {"claim": "Vladimir Putin offers to halt Ukraine invasion.", "expected": "True"},
    {"claim": "Meta released its Llama 4 language model.", "expected": "True"},
    {"claim": "Google launched Gemini 2.5 Pro Experimental, the first model in the Gemini 2.5 family.", "expected": "True"},
    {"claim": "Microsoft is rolling out improved Recall feature for Windows Insiders.", "expected": "True"},
    {"claim": "Microsoft announced a 1-bit language model that can run on CPU.", "expected": "True"},
    {"claim": "Royal Challengers Bengaluru beat Rajasthan Royals by 11 runs in yesterday's IPL match.", "expected": "True"},
    {"claim": "Anthropic introduced Claude Research.", "expected": "True"},
    {"claim": "The IMF has lowered India's growth projection for the fiscal year 2025-26 to 6.2 per cent.", "expected": "True"},
    {"claim": "In Bundesliga, Bayern Munich beat Heidenheim 4-0 last week.", "expected": "True"},
    {"claim": "Manchester United in Europa League semi-finals.", "expected": "True"},

    # False claims
    {"claim": "The Eiffel Tower is in Rome.", "expected": "False"},
    {"claim": "The earth is flat.", "expected": "False"},
    {"claim": "Rishi Sunak is the current Prime Minister of the UK.", "expected": "False"},
    {"claim": "New Zealand won the ICC Champions Trophy in 2025.", "expected": "False"},
| {"claim": "US President Donald trump to visit India next week.", "expected": "False"}, | |
| {"claim": "Quantum computers have definitively solved the protein folding problem.", "expected": "False"}, | |
| {"claim": "CRISPR gene editing has successfully cured type 1 diabetes in human clinical trials.", "expected": "False"}, | |
| {"claim": "Google's new quantum computer, Willow, has demonstrated remarkable capabilities by solving mathematical problems far beyond the reach of the fastest supercomputers.", "expected": "False"}, | |
| {"claim": "NASA confirmed that the James Webb Space Telescope has found definitive evidence of alien life on an exoplanet.", "expected": "False"}, | |
| {"claim": "Google launched Gemini 3.", "expected": "False"}, | |
| {"claim": "A solar eclipse was be seen in India on October 17, 2024.", "expected": "False"}, | |
| {"claim": "Tom Cruise and Shah Rukh Khan have starred in a Bollywood movie in the past.", "expected": "False"}, | |
| {"claim": "Germany has the highest GDP in the world.", "expected": "False"}, | |
| # Uncertain claims | |
| {"claim": "Aliens have visited the Earth.", "expected": "Uncertain"}, | |
| {"claim": "Information that falls into a black hole is permanently lost or destroyed.", "expected": "Uncertain"}, | |
| {"claim": "Time travel into the past is possible.", "expected": "Uncertain"}, | |
| {"claim": "Bigfoot (or Yeti) exists in remote wilderness areas.", "expected": "Uncertain"}, | |
| {"claim": "Intelligent life exists elsewhere in the universe.", "expected": "Uncertain"}, | |
| {"claim": "Yogi Adityanath will be the next Prime Minister of India.", "expected": "Uncertain"}, | |
| {"claim": "Consciousness continues to exist after biological death.", "expected": "Uncertain"}, | |
| {"claim": "There are multiple parallel universes.", "expected": "Uncertain"} | |
| ] | |
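
# Each test case is a dict with a "claim" string and an "expected" label, where
# "expected" must be exactly one of "True", "False", or "Uncertain" (the per-class
# counters in evaluate_claims() are keyed on these three values). A hypothetical
# new entry would look like:
#
#     {"claim": "Some recent, independently verifiable statement.", "expected": "True"},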


def setup_argument_parser():
    """
    Set up command line argument parsing.

    Returns:
        argparse.Namespace: Parsed command line arguments
    """
    parser = argparse.ArgumentParser(description="Evaluate AskVeracity performance")
    parser.add_argument("--limit", type=int, help="Limit evaluation to first N claims")
    parser.add_argument("--output", type=str, default="performance_results.json",
                        help="Output file for results (default: performance_results.json)")
    return parser.parse_args()


def initialize_system():
    """
    Initialize the system for evaluation.

    Returns:
        object: Initialized LangGraph agent
    """
    print("Initializing models and agent...")
    initialize_models()
    eval_agent = agent.setup_agent()
    return eval_agent


def normalize_classification(classification):
    """
    Normalize classification labels for consistent comparison.

    Args:
        classification (str): Classification label from the system

    Returns:
        str: Normalized classification label ("True", "False", or "Uncertain")
    """
    if not classification:
        return "Uncertain"

    if "true" in classification.lower():
        return "True"
    elif "false" in classification.lower():
        return "False"
    else:
        return "Uncertain"


def is_correct(actual, expected):
    """
    Determine if the actual classification matches the expected classification.

    Args:
        actual (str): Actual classification from the system
        expected (str): Expected (ground truth) classification

    Returns:
        bool: True if classifications match, False otherwise
    """
    # Normalize both for comparison
    normalized_actual = normalize_classification(actual)
    normalized_expected = expected

    return normalized_actual == normalized_expected


def is_safe(actual, expected):
    """
    Determine if the classification is "safe" - either correct or abstained (Uncertain)
    instead of making an incorrect assertion.

    Args:
        actual (str): Actual classification from the system
        expected (str): Expected (ground truth) classification

    Returns:
        bool: True if the classification is safe, False otherwise
    """
    # Normalize both for comparison
    normalized_actual = normalize_classification(actual)
    normalized_expected = expected

    # If the classification is correct, it's definitely safe
    if normalized_actual == normalized_expected:
        return True

    # If the system classified as "Uncertain", that's safe (abstaining rather than wrong assertion)
    if normalized_actual == "Uncertain":
        return True

    # Otherwise, the system made an incorrect assertion (False as True or True as False)
    return False
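
# Safety semantics at a glance (follows directly from is_correct/is_safe above):
#
#     actual \ expected |  True    False   Uncertain
#     ------------------+----------------------------
#     True              |  safe    unsafe  unsafe
#     False             |  unsafe  safe    unsafe
#     Uncertain         |  safe    safe    safe
#
# i.e. abstaining ("Uncertain") never hurts the safety rate, while asserting
# the wrong definite label does.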


def evaluate_claims(test_claims, eval_agent, limit=None):
    """
    Evaluate a list of claims using the fact-checking system.

    Args:
        test_claims (list): List of test claims with expected classifications
        eval_agent (object): Initialized LangGraph agent
        limit (int, optional): Maximum number of claims to evaluate

    Returns:
        tuple: (results, metrics)
            - results (list): Detailed results for each claim
            - metrics (dict): Aggregated performance metrics
    """
    # Initialize performance tracker
    performance_tracker = PerformanceTracker()

    # Limit the number of claims if requested
    if limit and limit > 0:
        claims_to_evaluate = test_claims[:limit]
    else:
        claims_to_evaluate = test_claims

    results = []
    total_count = len(claims_to_evaluate)
    correct_count = 0
    safe_count = 0

    # Classification counts
    classification_counts = {"True": 0, "False": 0, "Uncertain": 0}

    # Track processing times by expected classification
    processing_times = {"True": [], "False": [], "Uncertain": []}

    # Confidence scores by expected classification
    confidence_scores = {"True": [], "False": [], "Uncertain": []}

    # Track correct classifications by expected classification
    correct_by_class = {"True": 0, "False": 0, "Uncertain": 0}
    safe_by_class = {"True": 0, "False": 0, "Uncertain": 0}
    total_by_class = {"True": 0, "False": 0, "Uncertain": 0}

    print(f"Evaluating {len(claims_to_evaluate)} claims...")

    # Process each claim
    for idx, test_case in enumerate(claims_to_evaluate):
        claim = test_case["claim"]
        expected = test_case["expected"]

        print(f"\nProcessing claim {idx+1}/{len(claims_to_evaluate)}: {claim}")

        try:
            # Process the claim and measure time
            start_time = time.time()
            result = agent.process_claim(claim, eval_agent)
            total_time = time.time() - start_time

            # Extract classification and confidence
            classification = result.get("classification", "Uncertain")
            confidence = result.get("confidence", 0.0)

            # Normalize classification for comparison
            normalized_classification = normalize_classification(classification)

            # Check if classification is correct
            correct = is_correct(normalized_classification, expected)
            if correct:
                correct_count += 1
                correct_by_class[expected] += 1

            # Check if classification is safe
            safe = is_safe(normalized_classification, expected)
            if safe:
                safe_count += 1
                safe_by_class[expected] += 1

            # Update classification count
            classification_counts[normalized_classification] = classification_counts.get(normalized_classification, 0) + 1

            # Update counts by expected class
            total_by_class[expected] += 1

            # Update processing times
            processing_times[expected].append(total_time)

            # Update confidence scores
            confidence_scores[expected].append(confidence)

            # Save detailed result
            detail_result = {
                "claim": claim,
                "expected": expected,
                "actual": normalized_classification,
                "correct": correct,
                "safe": safe,
                "confidence": confidence,
                "processing_time": total_time
            }
            results.append(detail_result)

            # Print progress indicator
            outcome = "✓" if correct else "✗"
            safety = "(safe)" if safe and not correct else ""
            print(f" Result: {normalized_classification} (Expected: {expected}) {outcome} {safety}")
            print(f" Time: {total_time:.2f}s, Confidence: {confidence:.2f}")

        except Exception as e:
            print(f"Error processing claim: {str(e)}")
            results.append({
                "claim": claim,
                "expected": expected,
                "error": str(e)
            })

    # Calculate performance metrics
    accuracy = correct_count / total_count if total_count > 0 else 0
    safety_rate = safe_count / total_count if total_count > 0 else 0

    # Calculate per-class metrics
    class_metrics = {}
    for cls in ["True", "False", "Uncertain"]:
        class_accuracy = correct_by_class[cls] / total_by_class[cls] if total_by_class[cls] > 0 else 0
        class_safety_rate = safe_by_class[cls] / total_by_class[cls] if total_by_class[cls] > 0 else 0
        avg_time = sum(processing_times[cls]) / len(processing_times[cls]) if processing_times[cls] else 0
        avg_confidence = sum(confidence_scores[cls]) / len(confidence_scores[cls]) if confidence_scores[cls] else 0

        class_metrics[cls] = {
            "accuracy": class_accuracy,
            "safety_rate": class_safety_rate,
            "count": total_by_class[cls],
            "correct": correct_by_class[cls],
            "safe": safe_by_class[cls],
            "avg_processing_time": avg_time,
            "avg_confidence": avg_confidence
        }

    # Calculate overall metrics
    all_times = [r.get("processing_time", 0) for r in results if "processing_time" in r]
    all_confidence = [r.get("confidence", 0) for r in results if "confidence" in r]

    metrics = {
        "total_claims": total_count,
        "correct_claims": correct_count,
        "safe_claims": safe_count,
        "accuracy": accuracy,
        "safety_rate": safety_rate,
        "avg_processing_time": sum(all_times) / len(all_times) if all_times else 0,
        "avg_confidence": sum(all_confidence) / len(all_confidence) if all_confidence else 0,
        "classification_counts": classification_counts,
        "per_class_metrics": class_metrics
    }

    return results, metrics
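
# Minimal usage sketch (hypothetical values; assumes initialize_system() has run):
#
#     eval_agent = initialize_system()
#     results, metrics = evaluate_claims(TEST_CLAIMS, eval_agent, limit=3)
#     print(metrics["accuracy"], metrics["safety_rate"])
#
# With limit=3 only the first three entries of TEST_CLAIMS are evaluated.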


def save_results(results, metrics, output_file):
    """
    Save evaluation results to a JSON file.

    Args:
        results (list): Detailed results for each claim
        metrics (dict): Aggregated performance metrics
        output_file (str): Path to output file
    """
    output_data = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "metrics": metrics,
        "detailed_results": results
    }

    with open(output_file, 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"\nResults saved to {output_file}")


def print_summary(metrics):
    """
    Print a summary of performance metrics.

    Args:
        metrics (dict): Aggregated performance metrics
    """
    print("\n" + "="*70)
    print("PERFORMANCE SUMMARY")
    print("="*70)

    # Overall metrics
    print("\nOverall Metrics:")
    print(f"Total Claims: {metrics['total_claims']}")
    print(f"Correctly Classified: {metrics['correct_claims']}")
    print(f"Safely Classified: {metrics['safe_claims']}")
    print(f"Accuracy: {metrics['accuracy']:.2%}")
    print(f"Safety Rate: {metrics['safety_rate']:.2%}")
    print(f"Average Processing Time: {metrics['avg_processing_time']:.2f} seconds")
    print(f"Average Confidence Score: {metrics['avg_confidence']:.2f}")

    # Per-class metrics as table
    print("\nPer-Class Performance:")
    table_data = []
    headers = ["Class", "Count", "Correct", "Safe", "Accuracy", "Safety Rate", "Avg Time", "Avg Confidence"]

    for cls, cls_metrics in metrics['per_class_metrics'].items():
        table_data.append([
            cls,
            cls_metrics['count'],
            cls_metrics['correct'],
            cls_metrics['safe'],
            f"{cls_metrics['accuracy']:.2%}",
            f"{cls_metrics['safety_rate']:.2%}",
            f"{cls_metrics['avg_processing_time']:.2f}s",
            f"{cls_metrics['avg_confidence']:.2f}"
        ])

    print(tabulate(table_data, headers=headers, tablefmt="grid"))


def create_charts(metrics, output_dir="."):
    """
    Create visualizations of performance metrics.

    Args:
        metrics (dict): Aggregated performance metrics
        output_dir (str): Directory to save charts
    """
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Plot 1: Accuracy by class
        plt.figure(figsize=(10, 6))
        classes = list(metrics['per_class_metrics'].keys())
        accuracies = [metrics['per_class_metrics'][cls]['accuracy'] for cls in classes]
        plt.bar(classes, accuracies, color=['green', 'red', 'gray'])
        plt.title('Accuracy by Classification Type')
        plt.xlabel('Classification')
        plt.ylabel('Accuracy')
        plt.ylim(0, 1)
        for i, v in enumerate(accuracies):
            plt.text(i, v + 0.02, f"{v:.2%}", ha='center')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'accuracy_by_class.png'))
        plt.close()  # Close the figure to free memory

        # Plot 2: Safety rate by class
        plt.figure(figsize=(10, 6))
        safety_rates = [metrics['per_class_metrics'][cls]['safety_rate'] for cls in classes]
        plt.bar(classes, safety_rates, color=['green', 'red', 'gray'])
        plt.title('Safety Rate by Classification Type')
        plt.xlabel('Classification')
        plt.ylabel('Safety Rate')
        plt.ylim(0, 1)
        for i, v in enumerate(safety_rates):
            plt.text(i, v + 0.02, f"{v:.2%}", ha='center')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'safety_rate_by_class.png'))
        plt.close()  # Close the figure to free memory

        # Plot 3: Processing time by class
        plt.figure(figsize=(10, 6))
        times = [metrics['per_class_metrics'][cls]['avg_processing_time'] for cls in classes]
        plt.bar(classes, times, color=['green', 'red', 'gray'])
        plt.title('Average Processing Time by Classification Type')
        plt.xlabel('Classification')
        plt.ylabel('Time (seconds)')
        for i, v in enumerate(times):
            plt.text(i, v + 0.5, f"{v:.2f}s", ha='center')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'processing_time_by_class.png'))
        plt.close()  # Close the figure to free memory

        # Plot 4: Confidence scores by class
        plt.figure(figsize=(10, 6))
        confidence = [metrics['per_class_metrics'][cls]['avg_confidence'] for cls in classes]
        plt.bar(classes, confidence, color=['green', 'red', 'gray'])
        plt.title('Average Confidence Score by Classification Type')
        plt.xlabel('Classification')
        plt.ylabel('Confidence Score')
        plt.ylim(0, 1)
        for i, v in enumerate(confidence):
            plt.text(i, v + 0.02, f"{v:.2f}", ha='center')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'confidence_by_class.png'))
        plt.close()  # Close the figure to free memory

        print(f"\nCharts created in {output_dir}")

    except Exception as e:
        print(f"Error creating charts: {str(e)}")
        print("Continuing without charts.")


def main():
    """Main evaluation function that runs the entire evaluation process."""
    # Parse arguments
    args = setup_argument_parser()

    # Initialize the agent
    eval_agent = initialize_system()

    # Create results directory if it doesn't exist
    results_dir = "results"
    os.makedirs(results_dir, exist_ok=True)

    # Set output file path
    output_file = args.output
    if not os.path.isabs(output_file):
        output_file = os.path.join(results_dir, output_file)

    # Evaluate claims
    results, metrics = evaluate_claims(TEST_CLAIMS, eval_agent, args.limit)

    # Print summary
    print_summary(metrics)

    # Save results
    save_results(results, metrics, output_file)

    # Create charts (matplotlib and tabulate are imported at module level,
    # and create_charts catches and reports its own errors)
    create_charts(metrics, results_dir)


if __name__ == "__main__":
    main()