AvocadoMuffin committed on
Commit cb1cf5c · verified · 1 Parent(s): e010197

Update app.py

Files changed (1)
  1. app.py +103 -48
app.py CHANGED
@@ -2,37 +2,41 @@ import os
 import json
 import numpy as np
 from datasets import load_dataset
-from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
 import torch
+from sklearn.metrics import f1_score
+import re
 from collections import Counter
 import string
+from huggingface_hub import login
+import gradio as gr
 import pandas as pd
 from datetime import datetime

-# Normalization functions
+# Normalization functions (identical to extractor)
 def normalize_answer(s):
     def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
     def white_space_fix(text): return ' '.join(text.split())
     def remove_punc(text):
-        return ''.join(ch for ch in text if ch not in set(string.punctuation))
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
     def lower(text): return text.lower()
-    return white_space_fix(remove_articles(remove_punc(lower(s))))
+    return white_space_fix(remove_articles(remove_punc(lower(s))))

-# Metrics
-def exact_match_score(pred, truth):
-    return int(normalize_answer(pred) == normalize_answer(truth))
-
-def f1_score_qa(pred, truth):
-    pred_tokens = normalize_answer(pred).split()
-    truth_tokens = normalize_answer(truth).split()
-    common = Counter(pred_tokens) & Counter(truth_tokens)
+def f1_score_qa(prediction, ground_truth):
+    prediction_tokens = normalize_answer(prediction).split()
+    ground_truth_tokens = normalize_answer(ground_truth).split()
+    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
     num_same = sum(common.values())
     if num_same == 0: return 0
-    precision = num_same / len(pred_tokens)
-    recall = num_same / len(truth_tokens)
+    precision = 1.0 * num_same / len(prediction_tokens)
+    recall = 1.0 * num_same / len(ground_truth_tokens)
     return (2 * precision * recall) / (precision + recall)

-# Identical to extractor's QA confidence
+def exact_match_score(prediction, ground_truth):
+    return normalize_answer(prediction) == normalize_answer(ground_truth)
+
+# Identical confidence calculation to extractor
 def get_qa_confidence(model, tokenizer, question, context):
     inputs = tokenizer(
         question, context,
@@ -63,35 +67,46 @@ def get_qa_confidence(model, tokenizer, question, context):
     answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
     return answer.strip(), float(confidence)

-def run_evaluation(num_samples=100):
-    # Load CUAD with remote code trust
-    dataset = load_dataset(
-        "theatticusproject/cuad-qa",
-        trust_remote_code=True,
-        token=os.getenv("HF_TOKEN", True) # True allows anonymous access
-    )
-    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
+def run_evaluation(num_samples, progress=gr.Progress()):
+    # Authentication
+    hf_token = os.getenv("EVAL_TOKEN")
+    if hf_token:
+        login(token=hf_token)

-    # Load model
+    # Load model same as extractor
     model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+    model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
+
+    progress(0.1, desc="Loading CUAD dataset...")
+    try:
+        dataset = load_dataset(
+            "theatticusproject/cuad-qa",
+            trust_remote_code=True,
+            token=hf_token
+        )
+        test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
+        print(f"✓ Loaded {len(test_data)} samples")
+    except Exception as e:
+        return f"❌ Dataset load failed: {str(e)}", pd.DataFrame(), None

     results = []
-    for example in test_data:
+    for i, example in enumerate(test_data):
+        progress(0.2 + 0.7*i/num_samples, desc=f"Evaluating {i+1}/{num_samples}")
+
         context = example["context"]
         question = example["question"]
         gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""

-        pred, conf = get_qa_confidence(model, tokenizer, question, context)
+        pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)

         results.append({
-            "question": question[:100] + "..." if len(question) > 100 else question,
-            "prediction": pred,
-            "confidence": conf,
-            "exact_match": exact_match_score(pred, gt_answer),
-            "f1": f1_score_qa(pred, gt_answer),
-            "ground_truth": gt_answer
+            "Question": question[:100] + "..." if len(question) > 100 else question,
+            "Prediction": pred_answer,
+            "Truth": gt_answer,
+            "Confidence": confidence,
+            "Exact Match": exact_match_score(pred_answer, gt_answer),
+            "F1": f1_score_qa(pred_answer, gt_answer)
         })

     # Generate report
@@ -99,31 +114,71 @@ def run_evaluation(num_samples=100):
     report = f"""
     Evaluation Results (n={len(df)})
     =================
-    Exact Match: {df['exact_match'].mean():.1%}
-    F1 Score: {df['f1'].mean():.1%}
-    Avg Confidence: {df['confidence'].mean():.1%}
-    High-Confidence Accuracy: {
-        df[df['confidence'] > 0.8]['exact_match'].mean():.1%}
+    - Exact Match: {df['Exact Match'].mean():.1%}
+    - F1 Score: {df['F1'].mean():.1%}
+    - Avg Confidence: {df['Confidence'].mean():.1%}
+    - High-Confidence (>80%) Accuracy: {
+        df[df['Confidence'] > 0.8]['Exact Match'].mean():.1%}
     """

-    # Save
+    # Save results
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     results_file = f"eval_results_{timestamp}.json"
     with open(results_file, 'w') as f:
         json.dump({
-            "config": {"model": model_name, "dataset": "cuad-qa"},
+            "model": model_name,
             "metrics": {
-                "exact_match": float(df['exact_match'].mean()),
-                "f1": float(df['f1'].mean()),
-                "confidence": float(df['confidence'].mean())
+                "exact_match": float(df['Exact Match'].mean()),
+                "f1": float(df['F1'].mean()),
+                "avg_confidence": float(df['Confidence'].mean())
             },
             "samples": results
         }, f, indent=2)

     return report, df, results_file

+def create_gradio_interface():
+    with gr.Blocks(title="CUAD Evaluator") as demo:
+        gr.Markdown("## 🏛️ CUAD QA Model Evaluation")
+
+        with gr.Row():
+            num_samples = gr.Slider(10, 500, value=100, step=10,
+                                    label="Number of Samples")
+            eval_btn = gr.Button("🚀 Run Evaluation", variant="primary")
+
+        with gr.Row():
+            report = gr.Markdown("Results will appear here...")
+            results_table = gr.Dataframe(headers=["Question", "Prediction", "Confidence", "Exact Match"])
+
+        download = gr.File(label="Download Results", visible=False)
+
+        def run_and_display(num_samples):
+            report_text, df, file = run_evaluation(num_samples)
+            return (
+                report_text,
+                df[["Question", "Prediction", "Confidence", "Exact Match"]],
+                gr.File(visible=True, value=file)
+            )
+
+        eval_btn.click(
+            fn=run_and_display,
+            inputs=num_samples,
+            outputs=[report, results_table, download]
+        )
+
+    return demo
+
 if __name__ == "__main__":
-    report, df, _ = run_evaluation(num_samples=50)
-    print(report)
-    print("\nSample predictions:")
-    print(df[["question", "confidence", "exact_match"]].head())
+    # Verify CUDA
+    if torch.cuda.is_available():
+        print(f"✓ CUDA available: {torch.cuda.get_device_name(0)}")
+    else:
+        print("! Using CPU")
+
+    # Launch Gradio
+    demo = create_gradio_interface()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True
+    )
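
For reference, the exact-match and F1 helpers touched in this diff follow the standard SQuAD-style recipe: lowercase, strip punctuation, drop articles, collapse whitespace, then compare normalized strings (EM) or token bags (F1). Below is a minimal, self-contained sketch, not part of the commit, that re-implements the same logic and scores a hypothetical prediction against a hypothetical ground-truth span; the example strings are illustrative only.

# sketch_metrics.py — toy illustration of the SQuAD-style metrics used in app.py
import re
import string
from collections import Counter

def normalize_answer(s):
    # same order as the app: lower -> remove punctuation -> remove articles -> fix whitespace
    s = s.lower()
    s = "".join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())

def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)

def f1_score_qa(prediction, ground_truth):
    pred_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(pred_tokens) & Counter(truth_tokens)  # multiset overlap
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    return 2 * precision * recall / (precision + recall)

# hypothetical CUAD-flavoured pair
pred = "the Governing Law clause"
truth = "Governing Law"
print(exact_match_score(pred, truth))        # False: extra token "clause" survives normalization
print(round(f1_score_qa(pred, truth), 2))    # 0.8: 2 shared tokens, precision 2/3, recall 1.0

This also shows why the report tracks both metrics: EM is all-or-nothing on the normalized string, while F1 gives partial credit when the predicted span overlaps the annotated clause.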