Update app.py
app.py
CHANGED
@@ -2,37 +2,41 @@ import os
 import json
 import numpy as np
 from datasets import load_dataset
-from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
 import torch
+from sklearn.metrics import f1_score
+import re
 from collections import Counter
 import string
+from huggingface_hub import login
+import gradio as gr
 import pandas as pd
 from datetime import datetime

-# Normalization functions
+# Normalization functions (identical to extractor)
 def normalize_answer(s):
     def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
     def white_space_fix(text): return ' '.join(text.split())
     def remove_punc(text):
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
     def lower(text): return text.lower()
-    return white_space_fix(remove_articles(remove_punc(lower(s)))
+    return white_space_fix(remove_articles(remove_punc(lower(s))))

-def f1_score_qa(pred, truth):
-    pred_tokens = normalize_answer(pred).split()
-    truth_tokens = normalize_answer(truth).split()
-    common = Counter(pred_tokens) & Counter(truth_tokens)
+def f1_score_qa(prediction, ground_truth):
+    prediction_tokens = normalize_answer(prediction).split()
+    ground_truth_tokens = normalize_answer(ground_truth).split()
+    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
     num_same = sum(common.values())
     if num_same == 0: return 0
-    precision = num_same / len(pred_tokens)
-    recall = num_same / len(truth_tokens)
+    precision = 1.0 * num_same / len(prediction_tokens)
+    recall = 1.0 * num_same / len(ground_truth_tokens)
     return (2 * precision * recall) / (precision + recall)

+def exact_match_score(prediction, ground_truth):
+    return normalize_answer(prediction) == normalize_answer(ground_truth)
+
+# Identical confidence calculation to extractor
 def get_qa_confidence(model, tokenizer, question, context):
     inputs = tokenizer(
         question, context,
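Note: the scoring helpers added in this hunk are the standard SQuAD-style metrics: exact match compares normalized strings, and F1 measures token overlap between prediction and ground truth. A quick, illustrative sanity check (assumes app.py is importable as a module; values worked out by hand):

from app import normalize_answer, exact_match_score, f1_score_qa

normalize_answer("The Governing Law.")                    # -> "governing law"
exact_match_score("The Governing Law.", "governing law")  # -> True
f1_score_qa("governing law of New York", "New York law")  # -> 0.75 (3 shared tokens; precision 3/5, recall 3/3)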
@@ -63,35 +67,46 @@ def get_qa_confidence(model, tokenizer, question, context):
     answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
     return answer.strip(), float(confidence)

-def run_evaluation(num_samples=100):
-    # Load dataset
-    dataset = load_dataset(
-        token=os.getenv("HF_TOKEN", True)  # True allows anonymous access
-    )
-    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
+def run_evaluation(num_samples, progress=gr.Progress()):
+    # Authentication
+    hf_token = os.getenv("EVAL_TOKEN")
+    if hf_token:
+        login(token=hf_token)

-    # Load model
+    # Load model same as extractor
     model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+    model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
+
+    progress(0.1, desc="Loading CUAD dataset...")
+    try:
+        dataset = load_dataset(
+            "theatticusproject/cuad-qa",
+            trust_remote_code=True,
+            token=hf_token
+        )
+        test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
+        print(f"✓ Loaded {len(test_data)} samples")
+    except Exception as e:
+        return f"✗ Dataset load failed: {str(e)}", pd.DataFrame(), None

     results = []
-    for example in test_data:
+    for i, example in enumerate(test_data):
+        progress(0.2 + 0.7*i/num_samples, desc=f"Evaluating {i+1}/{num_samples}")
+
         context = example["context"]
         question = example["question"]
         gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""

+        pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)

         results.append({
+            "Question": question[:100] + "..." if len(question) > 100 else question,
+            "Prediction": pred_answer,
+            "Truth": gt_answer,
+            "Confidence": confidence,
+            "Exact Match": exact_match_score(pred_answer, gt_answer),
+            "F1": f1_score_qa(pred_answer, gt_answer)
         })

     # Generate report
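Note: the pipeline import added at the top of the file also allows spot-checking the same checkpoint on a single clause outside run_evaluation; the pipeline's score is a rough analogue of (not identical to) the confidence computed by get_qa_confidence. A minimal sketch, with a made-up question/context pair:

from transformers import pipeline

qa = pipeline("question-answering", model="AvocadoMuffin/roberta-cuad-qa-v2")
result = qa(
    question="What law governs this agreement?",
    context="This Agreement shall be governed by the laws of the State of New York.",
)
print(result["answer"], result["score"])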
@@ -99,31 +114,71 @@ def run_evaluation(num_samples=100):
     report = f"""
 Evaluation Results (n={len(df)})
 =================
-Exact Match: {df['
-F1 Score: {df['
-Avg Confidence: {df['
-High-Confidence Accuracy: {
-    df[df['
+- Exact Match: {df['Exact Match'].mean():.1%}
+- F1 Score: {df['F1'].mean():.1%}
+- Avg Confidence: {df['Confidence'].mean():.1%}
+- High-Confidence (>80%) Accuracy: {
+    df[df['Confidence'] > 0.8]['Exact Match'].mean():.1%}
 """

-    # Save
+    # Save results
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     results_file = f"eval_results_{timestamp}.json"
     with open(results_file, 'w') as f:
         json.dump({
+            "model": model_name,
             "metrics": {
-                "exact_match": float(df['
-                "f1": float(df['
+                "exact_match": float(df['Exact Match'].mean()),
+                "f1": float(df['F1'].mean()),
+                "avg_confidence": float(df['Confidence'].mean())
             },
             "samples": results
         }, f, indent=2)

     return report, df, results_file

+def create_gradio_interface():
+    with gr.Blocks(title="CUAD Evaluator") as demo:
+        gr.Markdown("## CUAD QA Model Evaluation")
+
+        with gr.Row():
+            num_samples = gr.Slider(10, 500, value=100, step=10,
+                                    label="Number of Samples")
+            eval_btn = gr.Button("Run Evaluation", variant="primary")
+
+        with gr.Row():
+            report = gr.Markdown("Results will appear here...")
+            results_table = gr.Dataframe(headers=["Question", "Prediction", "Confidence", "Exact Match"])
+
+        download = gr.File(label="Download Results", visible=False)
+
+        def run_and_display(num_samples):
+            report_text, df, file = run_evaluation(num_samples)
+            return (
+                report_text,
+                df[["Question", "Prediction", "Confidence", "Exact Match"]],
+                gr.File(visible=True, value=file)
+            )
+
+        eval_btn.click(
+            fn=run_and_display,
+            inputs=num_samples,
+            outputs=[report, results_table, download]
+        )
+
+    return demo
+
 if __name__ == "__main__":
+    # Verify CUDA
+    if torch.cuda.is_available():
+        print(f"✓ CUDA available: {torch.cuda.get_device_name(0)}")
+    else:
+        print("! Using CPU")
+
+    # Launch Gradio
+    demo = create_gradio_interface()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True
+    )
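Note: each run also writes a timestamped eval_results_<timestamp>.json next to the app, with the structure shown in the json.dump call above. A small sketch of loading one back for further analysis (the filename below is only an example of the pattern, not a real file):

import json
import pandas as pd

with open("eval_results_20250101_120000.json") as f:  # example name; actual files follow eval_results_<timestamp>.json
    data = json.load(f)

print(data["model"])
print(data["metrics"])                   # exact_match, f1, avg_confidence
samples = pd.DataFrame(data["samples"])  # per-question rows: Question, Prediction, Truth, Confidence, Exact Match, F1
print(samples.sort_values("Confidence").head())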