import os
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from sklearn.metrics import f1_score
import re
from collections import Counter
import string
from huggingface_hub import login
import gradio as gr
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

# Normalization functions (same as extractor)
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score_qa(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)

def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)
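
# Quick sanity checks for the metric helpers (illustrative strings, not CUAD data);
# they run once at import time and are essentially free.
assert normalize_answer("The Agreement.") == "agreement"
assert exact_match_score("30 days", "30 days.")
assert f1_score_qa("the Agreement", "Agreement") == 1.0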

# Identical confidence calculation to extractor
def calculate_confidence(model, tokenizer, question, context):
    inputs = tokenizer(
        question, 
        context, 
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True
    )
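    # Note: without return_overflowing_tokens=True, the stride argument has no
    # effect here; contexts longer than max_length are simply truncated.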
    
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model = model.cuda()

    with torch.no_grad():
        outputs = model(**inputs)

    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
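    # The argmax indices are used as-is (not constrained to start <= end or to the
    # context tokens) so the confidence stays identical to the extractor's calculation.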
    
    start_prob = start_probs[0, answer_start].item()
    end_prob = end_probs[0, answer_end-1].item()
    confidence = np.sqrt(start_prob * end_prob)
    
    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
    
    return answer, float(confidence)
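
# A minimal smoke-test helper (a sketch; the clause below is made up, not CUAD data).
def demo_single_clause(model, tokenizer):
    """Run the confidence calculation on one illustrative clause; returns (answer, confidence)."""
    question = "What is the governing law?"
    context = ("This Agreement shall be governed by and construed in accordance "
               "with the laws of the State of New York.")
    return calculate_confidence(model, tokenizer, question, context)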

def run_evaluation(num_samples=100):
    # Authenticate
    if token := os.getenv("HF_TOKEN"):
        login(token=token)
    
    # Load model same as extractor
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    
    # Load CUAD dataset
    dataset = load_dataset("theatticusproject/cuad-qa", token=token)
    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
    
    results = []
    for example in test_data:
        context = example["context"]
        question = example["question"]
        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
        
        pred_answer, confidence = calculate_confidence(model, tokenizer, question, context)
        
        results.append({
            "question": question,
            "prediction": pred_answer,
            "ground_truth": gt_answer,
            "confidence": confidence,
            "exact_match": exact_match_score(pred_answer, gt_answer),
            "f1": f1_score_qa(pred_answer, gt_answer)
        })
    
    # Generate report
    df = pd.DataFrame(results)
    avg_metrics = {
        "exact_match": df["exact_match"].mean() * 100,
        "f1": df["f1"].mean() * 100,
        "confidence": df["confidence"].mean() * 100
    }
    
    # Confidence calibration analysis
    high_conf_correct = df[(df["confidence"] > 0.8) & (df["exact_match"] == 1)].shape[0]
    high_conf_total = df[df["confidence"] > 0.8].shape[0]
    
    report = f"""
    CUAD Evaluation Report (n={len(df)})
    ====================================
    Accuracy:
    - Exact Match: {avg_metrics['exact_match']:.2f}%
    - F1 Score: {avg_metrics['f1']:.2f}%
    
    Confidence Analysis:
    - Avg Confidence: {avg_metrics['confidence']:.2f}%
    - High-Confidence (>80%) Accuracy: {high_conf_correct}/{high_conf_total} ({high_conf_correct/max(1,high_conf_total)*100:.1f}%)
    
    Confidence vs Accuracy:
    {df[['confidence', 'exact_match']].corr().iloc[0,1]:.3f} correlation
    """
    
    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"cuad_eval_{timestamp}.json"
    with open(results_file, "w") as f:
        json.dump({
            "metrics": avg_metrics,
            "samples": results,
            "config": {
                "model": model_name,
                "confidence_method": "geometric_mean_start_end_probs"
            }
        }, f, indent=2)
    
    return report, df, results_file
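
# Optional: a minimal sketch of a confidence-calibration plot built from the
# DataFrame returned by run_evaluation (e.g. plot_confidence_calibration(df)).
# The bin edges and output filename are illustrative choices, not part of the
# original evaluation flow.
def plot_confidence_calibration(df, out_path="confidence_calibration.png"):
    bins = np.linspace(0.0, 1.0, 11)  # ten equal-width confidence bins
    binned = df.assign(conf_bin=pd.cut(df["confidence"], bins, include_lowest=True))
    calib = binned.groupby("conf_bin", observed=True)["exact_match"].mean()
    fig, ax = plt.subplots()
    calib.plot(kind="bar", ax=ax, xlabel="Confidence bin", ylabel="Exact-match rate")
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)
    return out_path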

if __name__ == "__main__":
    report, df, _ = run_evaluation()
    print(report)
    print("\nSample predictions:")
    print(df.head())