AvocadoMuffin committed
Commit e010197 · verified · 1 Parent(s): 380b5ba

Update app.py

Files changed (1):
  1. app.py +63 -87
app.py CHANGED
@@ -4,151 +4,127 @@ import numpy as np
  from datasets import load_dataset
  from transformers import AutoTokenizer, AutoModelForQuestionAnswering
  import torch
- from sklearn.metrics import f1_score
  import re
  from collections import Counter
  import string
- from huggingface_hub import login
- import gradio as gr
  import pandas as pd
  from datetime import datetime
- import matplotlib.pyplot as plt

- # Normalization functions (same as extractor)
+ # Normalization functions
  def normalize_answer(s):
-     def remove_articles(text):
-         return re.sub(r'\b(a|an|the)\b', ' ', text)
-     def white_space_fix(text):
-         return ' '.join(text.split())
-     def remove_punc(text):
-         exclude = set(string.punctuation)
-         return ''.join(ch for ch in text if ch not in exclude)
-     def lower(text):
-         return text.lower()
+     def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
+     def white_space_fix(text): return ' '.join(text.split())
+     def remove_punc(text):
+         return ''.join(ch for ch in text if ch not in set(string.punctuation))
+     def lower(text): return text.lower()
      return white_space_fix(remove_articles(remove_punc(lower(s))))

- def f1_score_qa(prediction, ground_truth):
-     prediction_tokens = normalize_answer(prediction).split()
-     ground_truth_tokens = normalize_answer(ground_truth).split()
-     common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
+ # Metrics
+ def exact_match_score(pred, truth):
+     return int(normalize_answer(pred) == normalize_answer(truth))
+
+ def f1_score_qa(pred, truth):
+     pred_tokens = normalize_answer(pred).split()
+     truth_tokens = normalize_answer(truth).split()
+     common = Counter(pred_tokens) & Counter(truth_tokens)
      num_same = sum(common.values())
-     if num_same == 0:
-         return 0
-     precision = 1.0 * num_same / len(prediction_tokens)
-     recall = 1.0 * num_same / len(ground_truth_tokens)
+     if num_same == 0: return 0
+     precision = num_same / len(pred_tokens)
+     recall = num_same / len(truth_tokens)
      return (2 * precision * recall) / (precision + recall)

- def exact_match_score(prediction, ground_truth):
-     return normalize_answer(prediction) == normalize_answer(ground_truth)
-
- # Identical confidence calculation to extractor
- def calculate_confidence(model, tokenizer, question, context):
+ # Identical to extractor's QA confidence
+ def get_qa_confidence(model, tokenizer, question, context):
      inputs = tokenizer(
-         question,
-         context,
+         question, context,
          return_tensors="pt",
          truncation=True,
          max_length=512,
          stride=128,
          padding=True
      )
-
      if torch.cuda.is_available():
-         inputs = {k: v.cuda() for k, v in inputs.items()}
+         inputs = {k:v.cuda() for k,v in inputs.items()}
          model = model.cuda()
-
+
      with torch.no_grad():
          outputs = model(**inputs)
-
+
      start_probs = torch.softmax(outputs.start_logits, dim=1)
      end_probs = torch.softmax(outputs.end_logits, dim=1)
      answer_start = torch.argmax(outputs.start_logits)
      answer_end = torch.argmax(outputs.end_logits) + 1

-     start_prob = start_probs[0, answer_start].item()
-     end_prob = end_probs[0, answer_end-1].item()
-     confidence = np.sqrt(start_prob * end_prob)
+     confidence = np.sqrt(
+         start_probs[0, answer_start].item() *
+         end_probs[0, answer_end-1].item()
+     )

      answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
-     answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
-
-     return answer, float(confidence)
+     answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
+     return answer.strip(), float(confidence)

  def run_evaluation(num_samples=100):
-     # Authenticate
-     if token := os.getenv("HF_TOKEN"):
-         login(token=token)
+     # Load CUAD with remote code trust
+     dataset = load_dataset(
+         "theatticusproject/cuad-qa",
+         trust_remote_code=True,
+         token=os.getenv("HF_TOKEN", True)  # falls back to a locally cached token if the env var is unset
+     )
+     test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))

-     # Load model same as extractor
+     # Load model
      model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
      tokenizer = AutoTokenizer.from_pretrained(model_name)
      model = AutoModelForQuestionAnswering.from_pretrained(model_name)

-     # Load CUAD dataset
-     dataset = load_dataset("theatticusproject/cuad-qa", token=token)
-     test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
-
      results = []
      for example in test_data:
          context = example["context"]
          question = example["question"]
          gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""

-         pred_answer, confidence = calculate_confidence(model, tokenizer, question, context)
+         pred, conf = get_qa_confidence(model, tokenizer, question, context)

          results.append({
-             "question": question,
-             "prediction": pred_answer,
-             "ground_truth": gt_answer,
-             "confidence": confidence,
-             "exact_match": exact_match_score(pred_answer, gt_answer),
-             "f1": f1_score_qa(pred_answer, gt_answer)
+             "question": question[:100] + "..." if len(question) > 100 else question,
+             "prediction": pred,
+             "confidence": conf,
+             "exact_match": exact_match_score(pred, gt_answer),
+             "f1": f1_score_qa(pred, gt_answer),
+             "ground_truth": gt_answer
          })

      # Generate report
      df = pd.DataFrame(results)
-     avg_metrics = {
-         "exact_match": df["exact_match"].mean() * 100,
-         "f1": df["f1"].mean() * 100,
-         "confidence": df["confidence"].mean() * 100
-     }
-
-     # Confidence calibration analysis
-     high_conf_correct = df[(df["confidence"] > 0.8) & (df["exact_match"] == 1)].shape[0]
-     high_conf_total = df[df["confidence"] > 0.8].shape[0]
-
      report = f"""
- CUAD Evaluation Report (n={len(df)})
- ========================
- Accuracy:
- - Exact Match: {avg_metrics['exact_match']:.2f}%
- - F1 Score: {avg_metrics['f1']:.2f}%
-
- Confidence Analysis:
- - Avg Confidence: {avg_metrics['confidence']:.2f}%
- - High-Confidence (>80%) Accuracy: {high_conf_correct}/{high_conf_total} ({high_conf_correct/max(1,high_conf_total)*100:.1f}%)
-
- Confidence vs Accuracy:
- {df[['confidence', 'exact_match']].corr().iloc[0,1]:.3f} correlation
+ Evaluation Results (n={len(df)})
+ =================
+ Exact Match: {df['exact_match'].mean():.1%}
+ F1 Score: {df['f1'].mean():.1%}
+ Avg Confidence: {df['confidence'].mean():.1%}
+ High-Confidence Accuracy: {
+     df[df['confidence'] > 0.8]['exact_match'].mean():.1%}
      """

-     # Save results
+     # Save
      timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-     results_file = f"cuad_eval_{timestamp}.json"
-     with open(results_file, "w") as f:
+     results_file = f"eval_results_{timestamp}.json"
+     with open(results_file, 'w') as f:
          json.dump({
-             "metrics": avg_metrics,
-             "samples": results,
-             "config": {
-                 "model": model_name,
-                 "confidence_method": "geometric_mean_start_end_probs"
-             }
+             "config": {"model": model_name, "dataset": "cuad-qa"},
+             "metrics": {
+                 "exact_match": float(df['exact_match'].mean()),
+                 "f1": float(df['f1'].mean()),
+                 "confidence": float(df['confidence'].mean())
+             },
+             "samples": results
          }, f, indent=2)

      return report, df, results_file

  if __name__ == "__main__":
-     report, df, _ = run_evaluation()
+     report, df, _ = run_evaluation(num_samples=50)
      print(report)
      print("\nSample predictions:")
-     print(df.head())
+     print(df[["question", "confidence", "exact_match"]].head())
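
For anyone wanting to smoke-test this revision locally, a minimal sketch is given below. It assumes the updated app.py is importable as app from the working directory and that the AvocadoMuffin/roberta-cuad-qa-v2 checkpoint can be downloaded from the Hub; the sample clause, question, and script name are invented for illustration and are not taken from CUAD or from this commit.

# sanity_check.py (illustrative only; not part of this commit)
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

from app import exact_match_score, f1_score_qa, get_qa_confidence

# SQuAD-style token-overlap metrics on a toy prediction/ground-truth pair
print(exact_match_score("30 days", "within 30 days"))      # 0: normalized strings differ
print(round(f1_score_qa("30 days", "within 30 days"), 2))  # 0.8: precision 1.0, recall 2/3

# Single-passage confidence check (geometric mean of start/end probabilities)
model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

context = ("This Agreement may be terminated by either party upon "
           "thirty (30) days prior written notice to the other party.")
question = "How much notice is required to terminate the agreement?"

answer, confidence = get_qa_confidence(model, tokenizer, question, context)
print(f"answer={answer!r} confidence={confidence:.3f}")

The metric values in the comments follow from the normalization and F1 definitions in the diff; the extracted answer and its confidence depend on the checkpoint, so they are printed rather than asserted.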