import os
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import torch
from sklearn.metrics import f1_score
import re
from collections import Counter
import string
from huggingface_hub import login
import gradio as gr
import pandas as pd
from datetime import datetime
# Normalization functions (identical to extractor)
def normalize_answer(s):
def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text): return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text): return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_score_qa(prediction, ground_truth):
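    """Token-level F1 between the normalized prediction and ground truth."""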
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0: return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
return (2 * precision * recall) / (precision + recall)
def exact_match_score(prediction, ground_truth):
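    """True if prediction and ground truth are identical after normalization."""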
return normalize_answer(prediction) == normalize_answer(ground_truth)
# Confidence calculation (identical to the extractor's)
def get_qa_confidence(model, tokenizer, question, context):
inputs = tokenizer(
question, context,
return_tensors="pt",
truncation=True,
max_length=512,
stride=128,
padding=True
)
if torch.cuda.is_available():
inputs = {k:v.cuda() for k,v in inputs.items()}
model = model.cuda()
with torch.no_grad():
outputs = model(**inputs)
start_probs = torch.softmax(outputs.start_logits, dim=1)
end_probs = torch.softmax(outputs.end_logits, dim=1)
answer_start = torch.argmax(outputs.start_logits)
answer_end = torch.argmax(outputs.end_logits) + 1
confidence = np.sqrt(
start_probs[0, answer_start].item() *
end_probs[0, answer_end-1].item()
)
answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
return answer.strip(), float(confidence)
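# Illustrative usage (hypothetical contract snippet, not part of the evaluation run):
#   answer, conf = get_qa_confidence(
#       model, tokenizer,
#       "What is the governing law of this agreement?",
#       "This Agreement shall be governed by the laws of the State of New York.",
#   )
#   # answer -> a span such as "the laws of the State of New York"; conf is in [0, 1]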
def run_evaluation(num_samples, progress=gr.Progress()):
# Authentication
hf_token = os.getenv("EVAL_TOKEN")
if hf_token:
login(token=hf_token)
    # Load the same model the extractor uses
model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
progress(0.1, desc="Loading CUAD dataset...")
try:
dataset = load_dataset(
"theatticusproject/cuad-qa",
trust_remote_code=True,
token=hf_token
)
test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
        print(f"✓ Loaded {len(test_data)} samples")
except Exception as e:
        return f"✗ Dataset load failed: {str(e)}", pd.DataFrame(), None
results = []
    for i, example in enumerate(test_data):
        progress(0.2 + 0.7 * i / len(test_data), desc=f"Evaluating {i+1}/{len(test_data)}")
context = example["context"]
question = example["question"]
gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)
results.append({
"Question": question[:100] + "..." if len(question) > 100 else question,
"Prediction": pred_answer,
"Truth": gt_answer,
"Confidence": confidence,
"Exact Match": exact_match_score(pred_answer, gt_answer),
"F1": f1_score_qa(pred_answer, gt_answer)
})
# Generate report
df = pd.DataFrame(results)
    # Aggregate metrics; the high-confidence slice may be empty, so guard it
    high_conf = df[df["Confidence"] > 0.8]
    high_conf_acc = high_conf["Exact Match"].mean() if len(high_conf) else float("nan")
    report = f"""
Evaluation Results (n={len(df)})
================================
- Exact Match: {df['Exact Match'].mean():.1%}
- F1 Score: {df['F1'].mean():.1%}
- Avg Confidence: {df['Confidence'].mean():.1%}
- High-Confidence (>80%) Accuracy: {high_conf_acc:.1%}
"""
# Save results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = f"eval_results_{timestamp}.json"
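    # The JSON file mirrors the on-screen report: model id, aggregate metrics,
    # and the full per-sample rows for offline inspection.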
with open(results_file, 'w') as f:
json.dump({
"model": model_name,
"metrics": {
"exact_match": float(df['Exact Match'].mean()),
"f1": float(df['F1'].mean()),
"avg_confidence": float(df['Confidence'].mean())
},
"samples": results
}, f, indent=2)
return report, df, results_file
def create_gradio_interface():
with gr.Blocks(title="CUAD Evaluator") as demo:
        gr.Markdown("## CUAD QA Model Evaluation")
with gr.Row():
num_samples = gr.Slider(10, 500, value=100, step=10,
label="Number of Samples")
            eval_btn = gr.Button("Run Evaluation", variant="primary")
with gr.Row():
report = gr.Markdown("Results will appear here...")
results_table = gr.Dataframe(headers=["Question", "Prediction", "Confidence", "Exact Match"])
download = gr.File(label="Download Results", visible=False)
        def run_and_display(num_samples):
            report_text, df, file = run_evaluation(num_samples)
            # Guard against a failed run (empty results, no file to offer)
            if df.empty or file is None:
                return report_text, df, gr.File(visible=False)
            return (
                report_text,
                df[["Question", "Prediction", "Confidence", "Exact Match"]],
                gr.File(visible=True, value=file)
            )
eval_btn.click(
fn=run_and_display,
inputs=num_samples,
outputs=[report, results_table, download]
)
return demo
if __name__ == "__main__":
# Verify CUDA
if torch.cuda.is_available():
        print(f"✓ CUDA available: {torch.cuda.get_device_name(0)}")
else:
print("! Using CPU")
# Launch Gradio
demo = create_gradio_interface()
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=True
    )