AvocadoMuffin commited on
Commit
98d17bf
Β·
verified Β·
1 Parent(s): cb1cf5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -72
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import json
3
  import numpy as np
4
  from datasets import load_dataset
5
- from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
6
  import torch
7
  from sklearn.metrics import f1_score
8
  import re
@@ -13,17 +13,17 @@ import gradio as gr
13
  import pandas as pd
14
  from datetime import datetime
15
 
16
- # Normalization functions (identical to extractor)
17
  def normalize_answer(s):
 
18
  def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
19
  def white_space_fix(text): return ' '.join(text.split())
20
  def remove_punc(text):
21
- exclude = set(string.punctuation)
22
- return ''.join(ch for ch in text if ch not in exclude)
23
  def lower(text): return text.lower()
24
- return white_space_fix(remove_articles(remove_punc(lower(s)))
25
 
26
  def f1_score_qa(prediction, ground_truth):
 
27
  prediction_tokens = normalize_answer(prediction).split()
28
  ground_truth_tokens = normalize_answer(ground_truth).split()
29
  common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
@@ -34,10 +34,11 @@ def f1_score_qa(prediction, ground_truth):
34
  return (2 * precision * recall) / (precision + recall)
35
 
36
  def exact_match_score(prediction, ground_truth):
 
37
  return normalize_answer(prediction) == normalize_answer(ground_truth)
38
 
39
- # Identical confidence calculation to extractor
40
  def get_qa_confidence(model, tokenizer, question, context):
 
41
  inputs = tokenizer(
42
  question, context,
43
  return_tensors="pt",
@@ -48,7 +49,6 @@ def get_qa_confidence(model, tokenizer, question, context):
48
  )
49
  if torch.cuda.is_available():
50
  inputs = {k:v.cuda() for k,v in inputs.items()}
51
- model = model.cuda()
52
 
53
  with torch.no_grad():
54
  outputs = model(**inputs)
@@ -64,20 +64,30 @@ def get_qa_confidence(model, tokenizer, question, context):
64
  )
65
 
66
  answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
67
- answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
68
- return answer.strip(), float(confidence)
69
 
70
  def run_evaluation(num_samples, progress=gr.Progress()):
 
71
  # Authentication
72
  hf_token = os.getenv("EVAL_TOKEN")
73
  if hf_token:
74
- login(token=hf_token)
 
 
 
75
 
76
- # Load model same as extractor
77
  model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
78
- tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
79
- model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
 
 
 
 
 
80
 
 
81
  progress(0.1, desc="Loading CUAD dataset...")
82
  try:
83
  dataset = load_dataset(
@@ -86,96 +96,111 @@ def run_evaluation(num_samples, progress=gr.Progress()):
86
  token=hf_token
87
  )
88
  test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
89
- print(f"βœ“ Loaded {len(test_data)} samples")
90
  except Exception as e:
91
- return f"❌ Dataset load failed: {str(e)}", pd.DataFrame(), None
92
 
93
- results = []
94
  for i, example in enumerate(test_data):
95
- progress(0.2 + 0.7*i/num_samples, desc=f"Evaluating {i+1}/{num_samples}")
96
-
97
- context = example["context"]
98
- question = example["question"]
99
- gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
100
 
101
- pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)
102
-
103
- results.append({
104
- "Question": question[:100] + "..." if len(question) > 100 else question,
105
- "Prediction": pred_answer,
106
- "Truth": gt_answer,
107
- "Confidence": confidence,
108
- "Exact Match": exact_match_score(pred_answer, gt_answer),
109
- "F1": f1_score_qa(pred_answer, gt_answer)
110
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
- # Generate report
113
- df = pd.DataFrame(results)
114
- report = f"""
115
- Evaluation Results (n={len(df)})
116
- =================
117
- - Exact Match: {df['Exact Match'].mean():.1%}
118
- - F1 Score: {df['F1'].mean():.1%}
119
- - Avg Confidence: {df['Confidence'].mean():.1%}
 
 
 
120
  - High-Confidence (>80%) Accuracy: {
121
- df[df['Confidence'] > 0.8]['Exact Match'].mean():.1%}
122
  """
123
 
124
- # Save results
125
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
126
- results_file = f"eval_results_{timestamp}.json"
127
- with open(results_file, 'w') as f:
128
  json.dump({
129
  "model": model_name,
130
  "metrics": {
131
- "exact_match": float(df['Exact Match'].mean()),
132
- "f1": float(df['F1'].mean()),
133
  "avg_confidence": float(df['Confidence'].mean())
134
  },
135
- "samples": results
136
  }, f, indent=2)
137
 
138
- return report, df, results_file
139
 
 
140
  def create_gradio_interface():
141
- with gr.Blocks(title="CUAD Evaluator") as demo:
142
- gr.Markdown("## πŸ›οΈ CUAD QA Model Evaluation")
143
-
144
- with gr.Row():
145
- num_samples = gr.Slider(10, 500, value=100, step=10,
146
- label="Number of Samples")
147
- eval_btn = gr.Button("πŸš€ Run Evaluation", variant="primary")
 
148
 
149
  with gr.Row():
150
- report = gr.Markdown("Results will appear here...")
151
- results_table = gr.Dataframe(headers=["Question", "Prediction", "Confidence", "Exact Match"])
 
 
 
 
 
152
 
153
- download = gr.File(label="Download Results", visible=False)
 
 
154
 
155
- def run_and_display(num_samples):
156
- report_text, df, file = run_evaluation(num_samples)
157
  return (
158
- report_text,
159
- df[["Question", "Prediction", "Confidence", "Exact Match"]],
160
- gr.File(visible=True, value=file)
161
  )
162
 
163
- eval_btn.click(
164
- fn=run_and_display,
165
  inputs=num_samples,
166
- outputs=[report, results_table, download]
 
167
  )
168
 
169
  return demo
170
 
171
  if __name__ == "__main__":
172
- # Verify CUDA
173
- if torch.cuda.is_available():
174
- print(f"βœ“ CUDA available: {torch.cuda.get_device_name(0)}")
175
- else:
176
- print("! Using CPU")
177
-
178
- # Launch Gradio
179
  demo = create_gradio_interface()
180
  demo.launch(
181
  server_name="0.0.0.0",
 
2
  import json
3
  import numpy as np
4
  from datasets import load_dataset
5
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering
6
  import torch
7
  from sklearn.metrics import f1_score
8
  import re
 
13
  import pandas as pd
14
  from datetime import datetime
15
 
 
16
  def normalize_answer(s):
17
+ """Identical to extractor's normalization"""
18
  def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
19
  def white_space_fix(text): return ' '.join(text.split())
20
  def remove_punc(text):
21
+ return ''.join(ch for ch in text if ch not in set(string.punctuation))
 
22
  def lower(text): return text.lower()
23
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
24
 
25
  def f1_score_qa(prediction, ground_truth):
26
+ """Identical to original"""
27
  prediction_tokens = normalize_answer(prediction).split()
28
  ground_truth_tokens = normalize_answer(ground_truth).split()
29
  common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
 
34
  return (2 * precision * recall) / (precision + recall)
35
 
36
  def exact_match_score(prediction, ground_truth):
37
+ """Identical to original"""
38
  return normalize_answer(prediction) == normalize_answer(ground_truth)
39
 
 
40
  def get_qa_confidence(model, tokenizer, question, context):
41
+ """Identical to extractor's confidence calculation"""
42
  inputs = tokenizer(
43
  question, context,
44
  return_tensors="pt",
 
49
  )
50
  if torch.cuda.is_available():
51
  inputs = {k:v.cuda() for k,v in inputs.items()}
 
52
 
53
  with torch.no_grad():
54
  outputs = model(**inputs)
 
64
  )
65
 
66
  answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
67
+ answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
68
+ return answer, float(confidence)
69
 
70
  def run_evaluation(num_samples, progress=gr.Progress()):
71
+ """Modified to use extractor's confidence calculation"""
72
  # Authentication
73
  hf_token = os.getenv("EVAL_TOKEN")
74
  if hf_token:
75
+ try:
76
+ login(token=hf_token)
77
+ except Exception as e:
78
+ print(f"Auth error: {e}")
79
 
80
+ # Load model (raw instead of pipeline)
81
  model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
82
+ try:
83
+ tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
84
+ model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
85
+ if torch.cuda.is_available():
86
+ model = model.cuda()
87
+ except Exception as e:
88
+ return f"❌ Model load failed: {e}", pd.DataFrame(), None
89
 
90
+ # Load dataset
91
  progress(0.1, desc="Loading CUAD dataset...")
92
  try:
93
  dataset = load_dataset(
 
96
  token=hf_token
97
  )
98
  test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
 
99
  except Exception as e:
100
+ return f"❌ Dataset load failed: {e}", pd.DataFrame(), None
101
 
102
+ predictions = []
103
  for i, example in enumerate(test_data):
104
+ progress((0.2 + 0.7 * i / num_samples), desc=f"Processing {i+1}/{num_samples}")
 
 
 
 
105
 
106
+ try:
107
+ context = example["context"]
108
+ question = example["question"]
109
+ gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
110
+
111
+ # Use extractor-style confidence
112
+ pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)
113
+
114
+ predictions.append({
115
+ "Sample_ID": i+1,
116
+ "Question": question[:100] + "..." if len(question) > 100 else question,
117
+ "Predicted_Answer": pred_answer,
118
+ "Ground_Truth": gt_answer,
119
+ "Exact_Match": exact_match_score(pred_answer, gt_answer),
120
+ "F1_Score": round(f1_score_qa(pred_answer, gt_answer), 3),
121
+ "Confidence": round(confidence, 3) # Now matches extractor
122
+ })
123
+ except Exception as e:
124
+ print(f"Error sample {i}: {e}")
125
+ continue
126
+
127
+ # Generate report (identical to original)
128
+ if not predictions:
129
+ return "❌ No valid predictions", pd.DataFrame(), None
130
 
131
+ df = pd.DataFrame(predictions)
132
+ avg_em = df["Exact_Match"].mean() * 100
133
+ avg_f1 = df["F1_Score"].mean() * 100
134
+
135
+ results_summary = f"""
136
+ # πŸ“Š Evaluation Results (n={len(df)})
137
+ ## 🎯 Metrics
138
+ - Exact Match: {avg_em:.2f}%
139
+ - F1 Score: {avg_f1:.2f}%
140
+ - Avg Confidence: {df['Confidence'].mean():.2%}
141
+ ## πŸ” Confidence Analysis
142
  - High-Confidence (>80%) Accuracy: {
143
+ df[df['Confidence'] > 0.8]['Exact_Match'].mean():.1%}
144
  """
145
 
146
+ # Save results (identical to original)
147
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
148
+ results_file = f"cuad_eval_{timestamp}.json"
149
+ with open(results_file, "w") as f:
150
  json.dump({
151
  "model": model_name,
152
  "metrics": {
153
+ "exact_match": float(avg_em),
154
+ "f1_score": float(avg_f1),
155
  "avg_confidence": float(df['Confidence'].mean())
156
  },
157
+ "samples": predictions
158
  }, f, indent=2)
159
 
160
+ return results_summary, df, results_file
161
 
162
+ # YOUR ORIGINAL GRADIO INTERFACE (COMPLETELY UNCHANGED)
163
  def create_gradio_interface():
164
+ with gr.Blocks(title="CUAD Model Evaluator", theme=gr.themes.Soft()) as demo:
165
+ gr.HTML("""
166
+ <div style="text-align: center; padding: 20px;">
167
+ <h1>πŸ›οΈ CUAD Model Evaluation Dashboard</h1>
168
+ <p>Evaluate your CUAD (Contract Understanding Atticus Dataset) Question Answering model</p>
169
+ <p><strong>Model:</strong> AvocadoMuffin/roberta-cuad-qa-v2</p>
170
+ </div>
171
+ """)
172
 
173
  with gr.Row():
174
+ with gr.Column(scale=1):
175
+ gr.HTML("<h3>βš™οΈ Evaluation Settings</h3>")
176
+ num_samples = gr.Slider(10, 500, value=100, step=10, label="Number of samples")
177
+ evaluate_btn = gr.Button("πŸš€ Start Evaluation", variant="primary")
178
+
179
+ with gr.Column(scale=2):
180
+ results_summary = gr.Markdown("Click 'πŸš€ Start Evaluation' to begin...")
181
 
182
+ gr.HTML("<hr>")
183
+ detailed_results = gr.Dataframe(interactive=False, wrap=True)
184
+ download_file = gr.File(visible=False)
185
 
186
+ def handle_eval(num_samples):
187
+ summary, df, file = run_evaluation(num_samples)
188
  return (
189
+ summary,
190
+ df[["Sample_ID", "Question", "Predicted_Answer", "Confidence", "Exact_Match"]],
191
+ gr.File(visible=True, value=file) if file else gr.File(visible=False)
192
  )
193
 
194
+ evaluate_btn.click(
195
+ fn=handle_eval,
196
  inputs=num_samples,
197
+ outputs=[results_summary, detailed_results, download_file],
198
+ show_progress=True
199
  )
200
 
201
  return demo
202
 
203
  if __name__ == "__main__":
 
 
 
 
 
 
 
204
  demo = create_gradio_interface()
205
  demo.launch(
206
  server_name="0.0.0.0",