AvocadoMuffin committed on
Commit 4a9c0e7 · verified · 1 Parent(s): 5ad87e4

Update app.py

Files changed (1)
  1. app.py +155 -47
app.py CHANGED
@@ -61,6 +61,139 @@ def max_over_ground_truths(metric_fn, prediction, ground_truths):
61
  scores.append(score)
62
  return max(scores) if scores else 0
63
 
64
  def evaluate_model():
65
  # Authenticate with Hugging Face using the token
66
  hf_token = os.getenv("EVAL_TOKEN")
@@ -86,21 +219,6 @@ def evaluate_model():
86
  print(f"βœ— Error loading model: {e}")
87
  return None, None
88
 
89
- def inspect_dataset_structure(dataset, num_samples=3):
90
- """Inspect dataset structure for debugging"""
91
- print(f"Dataset structure inspection:")
92
- print(f"Dataset type: {type(dataset)}")
93
- print(f"Dataset length: {len(dataset)}")
94
-
95
- if len(dataset) > 0:
96
- sample = dataset[0]
97
- print(f"Sample keys: {list(sample.keys()) if isinstance(sample, dict) else 'Not a dict'}")
98
- print(f"Sample structure:")
99
- for key, value in sample.items():
100
- print(f" {key}: {type(value)} - {str(value)[:100]}...")
101
-
102
- return dataset
103
-
104
  def run_evaluation(num_samples, progress=gr.Progress()):
105
  """Run evaluation and return results for Gradio interface"""
106
 
@@ -111,39 +229,19 @@ def run_evaluation(num_samples, progress=gr.Progress()):
111
 
112
  progress(0.1, desc="Loading CUAD dataset...")
113
 
114
- # Load dataset - try multiple approaches
115
- dataset = None
116
- test_data = None
117
-
118
- try:
119
- # Try cuad dataset directly
120
- print("Attempting to load CUAD dataset...")
121
- dataset = load_dataset("cuad", token=hf_token)
122
- test_data = dataset["test"]
123
- print(f"βœ“ Loaded CUAD dataset with {len(test_data)} samples")
124
-
125
- # Inspect structure
126
- test_data = inspect_dataset_structure(test_data)
127
-
128
- except Exception as e:
129
- print(f"Error loading CUAD dataset: {e}")
130
- try:
131
- # Try squad format as fallback
132
- print("Trying SQuAD format...")
133
- dataset = load_dataset("squad", split="validation", token=hf_token)
134
- test_data = dataset.select(range(min(1000, len(dataset))))
135
- print(f"βœ“ Loaded SQuAD dataset as fallback with {len(test_data)} samples")
136
- except Exception as e2:
137
- return f"❌ Error loading any dataset: {e2}", pd.DataFrame(), None
138
-
139
  if test_data is None:
140
- return "❌ No test data available", pd.DataFrame(), None
141
 
142
  # Limit samples
143
  num_samples = min(num_samples, len(test_data))
144
  test_subset = test_data.select(range(num_samples))
145
 
146
- progress(0.2, desc=f"Starting evaluation on {num_samples} samples...")
147
 
148
  # Initialize metrics
149
  exact_matches = []
@@ -179,11 +277,11 @@ def run_evaluation(num_samples, progress=gr.Progress()):
179
  if isinstance(answers, dict):
180
  if "text" in answers:
181
  if isinstance(answers["text"], list):
182
- ground_truths = [ans for ans in answers["text"] if ans.strip()]
183
  else:
184
- ground_truths = [answers["text"]] if answers["text"].strip() else []
185
  elif isinstance(answers, list):
186
- ground_truths = answers
187
 
188
  # Skip if no ground truth
189
  if not ground_truths:
@@ -238,9 +336,13 @@ def run_evaluation(num_samples, progress=gr.Progress()):
238
  # Create results summary
239
  results_summary = f"""
240
  # 📊 CUAD Model Evaluation Results
241
  ## 🎯 Overall Performance
242
  - **Model**: AvocadoMuffin/roberta-cuad-qa-v3
243
- - **Dataset**: CUAD (Contract Understanding Atticus Dataset)
244
  - **Samples Evaluated**: {len(exact_matches)}
245
  - **Evaluation Date**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
246
 
@@ -259,6 +361,8 @@ def run_evaluation(num_samples, progress=gr.Progress()):
259
  - **Samples with Multiple Ground Truths**: {len([p for p in predictions if p['Num_Ground_Truths'] > 1])}
260
 
261
  ## 🎯 Evaluation Quality
262
  The evaluation accounts for multiple ground truth answers where available, using the maximum score across all valid answers for each question.
263
  """
264
 
@@ -271,12 +375,13 @@ The evaluation accounts for multiple ground truth answers where available, using
271
 
272
  detailed_results = {
273
  "model_name": "AvocadoMuffin/roberta-cuad-qa-v3",
274
- "dataset": "cuad",
275
  "num_samples": len(exact_matches),
276
  "exact_match_score": avg_exact_match,
277
  "f1_score": avg_f1_score,
278
  "evaluation_date": datetime.now().isoformat(),
279
  "evaluation_methodology": "max_over_ground_truths",
 
280
  "predictions": predictions,
281
  "summary_stats": {
282
  "avg_confidence": float(np.mean([p['Confidence'] for p in predictions])),
@@ -306,6 +411,7 @@ def create_gradio_interface():
306
  <h1>πŸ›οΈ CUAD Model Evaluation Dashboard</h1>
307
  <p>Evaluate your CUAD (Contract Understanding Atticus Dataset) Question Answering model</p>
308
  <p><strong>Model:</strong> AvocadoMuffin/roberta-cuad-qa-v3</p>
 
309
  </div>
310
  """)
311
 
@@ -337,6 +443,7 @@ def create_gradio_interface():
337
  <li><strong>Confidence</strong>: Model's confidence in its predictions</li>
338
  <li><strong>Max-over-GT</strong>: Best score across multiple ground truth answers</li>
339
  </ul>
 
340
  </div>
341
  """)
342
 
@@ -386,6 +493,7 @@ def create_gradio_interface():
386
  <div style="text-align: center; margin-top: 30px; padding: 20px; color: #666;">
387
  <p>πŸ€– Powered by Hugging Face Transformers & Gradio</p>
388
  <p>πŸ“š CUAD Dataset by The Atticus Project</p>
 
389
  </div>
390
  """)
391
 
 
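The hunks above show only the tail of the max_over_ground_truths helper named in the hunk headers. A minimal sketch of what such a SQuAD-style helper typically looks like follows; apart from the scores.append(score) and final return lines that are visible in the diff, the body is an illustrative assumption rather than the file's actual code.

def max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Return the best score of a prediction against every reference answer."""
    # metric_fn is any pairwise scorer (e.g. an exact-match or token-F1 function);
    # only the last two lines below appear verbatim in the diff.
    scores = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores.append(score)
    return max(scores) if scores else 0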
61
  scores.append(score)
62
  return max(scores) if scores else 0
63
 
64
+ def load_cuad_dataset(hf_token=None):
65
+ """Try multiple methods to load CUAD dataset"""
66
+ print("Attempting to load CUAD dataset...")
67
+
68
+ # Method 1: Try theatticusproject/cuad
69
+ try:
70
+ print("Trying theatticusproject/cuad...")
71
+ dataset = load_dataset("theatticusproject/cuad", token=hf_token)
72
+ if "test" in dataset:
73
+ test_data = dataset["test"]
74
+ print(f"βœ“ Loaded theatticusproject/cuad with {len(test_data)} test samples")
75
+ return test_data, "theatticusproject/cuad"
76
+ elif "validation" in dataset:
77
+ test_data = dataset["validation"]
78
+ print(f"βœ“ Loaded theatticusproject/cuad with {len(test_data)} validation samples")
79
+ return test_data, "theatticusproject/cuad"
80
+ else:
81
+ print("No test or validation split found in theatticusproject/cuad")
82
+ except Exception as e:
83
+ print(f"Failed to load theatticusproject/cuad: {e}")
84
+
85
+ # Method 2: Try theatticusproject/cuad-qa
86
+ try:
87
+ print("Trying theatticusproject/cuad-qa...")
88
+ dataset = load_dataset("theatticusproject/cuad-qa", token=hf_token)
89
+ if "test" in dataset:
90
+ test_data = dataset["test"]
91
+ print(f"βœ“ Loaded theatticusproject/cuad-qa with {len(test_data)} test samples")
92
+ return test_data, "theatticusproject/cuad-qa"
93
+ elif "validation" in dataset:
94
+ test_data = dataset["validation"]
95
+ print(f"βœ“ Loaded theatticusproject/cuad-qa with {len(test_data)} validation samples")
96
+ return test_data, "theatticusproject/cuad-qa"
97
+ except Exception as e:
98
+ print(f"Failed to load theatticusproject/cuad-qa: {e}")
99
+
100
+ # Method 3: Try the original cuad identifier
101
+ try:
102
+ print("Trying cuad...")
103
+ dataset = load_dataset("cuad", token=hf_token)
104
+ if "test" in dataset:
105
+ test_data = dataset["test"]
106
+ print(f"βœ“ Loaded cuad with {len(test_data)} test samples")
107
+ return test_data, "cuad"
108
+ elif "validation" in dataset:
109
+ test_data = dataset["validation"]
110
+ print(f"βœ“ Loaded cuad with {len(test_data)} validation samples")
111
+ return test_data, "cuad"
112
+ except Exception as e:
113
+ print(f"Failed to load cuad: {e}")
114
+
115
+ # Method 4: Try with trust_remote_code=True
116
+ try:
117
+ print("Trying with trust_remote_code=True...")
118
+ dataset = load_dataset("theatticusproject/cuad", token=hf_token, trust_remote_code=True)
119
+ if "test" in dataset:
120
+ test_data = dataset["test"]
121
+ print(f"βœ“ Loaded with trust_remote_code, test samples: {len(test_data)}")
122
+ return test_data, "theatticusproject/cuad (trust_remote_code)"
123
+ elif "validation" in dataset:
124
+ test_data = dataset["validation"]
125
+ print(f"βœ“ Loaded with trust_remote_code, validation samples: {len(test_data)}")
126
+ return test_data, "theatticusproject/cuad (trust_remote_code)"
127
+ except Exception as e:
128
+ print(f"Failed with trust_remote_code: {e}")
129
+
130
+ # Method 5: Create a synthetic CUAD-like dataset for testing
131
+ print("⚠️ Creating synthetic CUAD-like test data...")
132
+ synthetic_data = []
133
+
134
+ # Create some contract-like questions and contexts
135
+ contract_samples = [
136
+ {
137
+ "context": "This Agreement shall commence on January 1, 2024 and shall continue for a period of twelve (12) months unless terminated earlier in accordance with the terms hereof. The initial term may be extended for additional periods of twelve (12) months each upon mutual written consent of both parties.",
138
+ "question": "What is the duration of the agreement?",
139
+ "answers": {"text": ["twelve (12) months", "12 months"], "answer_start": [85, 85]}
140
+ },
141
+ {
142
+ "context": "The Company shall pay the Consultant a fee of $50,000 per month for services rendered under this Agreement. Payment shall be made within thirty (30) days of the end of each calendar month.",
143
+ "question": "What is the monthly fee?",
144
+ "answers": {"text": ["$50,000 per month", "$50,000"], "answer_start": [45, 45]}
145
+ },
146
+ {
147
+ "context": "Either party may terminate this Agreement immediately upon written notice in the event of a material breach by the other party that remains uncured for thirty (30) days after written notice of such breach.",
148
+ "question": "What is the cure period for material breach?",
149
+ "answers": {"text": ["thirty (30) days", "30 days"], "answer_start": [125, 132]}
150
+ },
151
+ {
152
+ "context": "The Contractor shall maintain commercial general liability insurance with coverage of not less than $1,000,000 per occurrence and $2,000,000 in the aggregate.",
153
+ "question": "What is the minimum insurance coverage per occurrence?",
154
+ "answers": {"text": ["$1,000,000 per occurrence", "$1,000,000"], "answer_start": [85, 85]}
155
+ },
156
+ {
157
+ "context": "All intellectual property developed under this Agreement shall be owned by the Company. The Contractor hereby assigns all rights, title and interest in such intellectual property to the Company.",
158
+ "question": "Who owns the intellectual property?",
159
+ "answers": {"text": ["the Company", "Company"], "answer_start": [70, 74]}
160
+ }
161
+ ]
162
+
163
+ # Duplicate samples to create a larger dataset
164
+ for i in range(100): # Create 100 samples by cycling through the templates
165
+ sample = contract_samples[i % len(contract_samples)].copy()
166
+ sample["id"] = f"synthetic_{i}"
167
+ synthetic_data.append(sample)
168
+
169
+ # Convert to dataset format
170
+ from datasets import Dataset
171
+ test_data = Dataset.from_list(synthetic_data)
172
+
173
+ print(f"βœ“ Created synthetic CUAD-like dataset with {len(test_data)} samples")
174
+ return test_data, "synthetic_cuad"
175
+
176
+ def inspect_dataset_structure(dataset, dataset_name="dataset"):
177
+ """Inspect dataset structure for debugging"""
178
+ print(f"\n=== {dataset_name} Dataset Structure ===")
179
+ print(f"Dataset type: {type(dataset)}")
180
+ print(f"Dataset length: {len(dataset)}")
181
+
182
+ if len(dataset) > 0:
183
+ sample = dataset[0]
184
+ print(f"Sample keys: {list(sample.keys()) if isinstance(sample, dict) else 'Not a dict'}")
185
+ print(f"Sample structure:")
186
+ for key, value in sample.items():
187
+ if isinstance(value, dict):
188
+ print(f" {key} (dict): {list(value.keys())}")
189
+ for sub_key, sub_value in value.items():
190
+ print(f" {sub_key}: {type(sub_value)} - {str(sub_value)[:50]}...")
191
+ else:
192
+ print(f" {key}: {type(value)} - {str(value)[:100]}...")
193
+ print("=" * 50)
194
+
195
+ return dataset
196
+
197
  def evaluate_model():
198
  # Authenticate with Hugging Face using the token
199
  hf_token = os.getenv("EVAL_TOKEN")
 
219
  print(f"βœ— Error loading model: {e}")
220
  return None, None
221
 
222
  def run_evaluation(num_samples, progress=gr.Progress()):
223
  """Run evaluation and return results for Gradio interface"""
224
 
 
229
 
230
  progress(0.1, desc="Loading CUAD dataset...")
231
 
232
+ # Load dataset
233
+ test_data, dataset_name = load_cuad_dataset(hf_token)
234
  if test_data is None:
235
+ return "❌ Failed to load any dataset", pd.DataFrame(), None
236
+
237
+ # Inspect dataset structure
238
+ test_data = inspect_dataset_structure(test_data, dataset_name)
239
 
240
  # Limit samples
241
  num_samples = min(num_samples, len(test_data))
242
  test_subset = test_data.select(range(num_samples))
243
 
244
+ progress(0.2, desc=f"Starting evaluation on {num_samples} samples from {dataset_name}...")
245
 
246
  # Initialize metrics
247
  exact_matches = []
 
277
  if isinstance(answers, dict):
278
  if "text" in answers:
279
  if isinstance(answers["text"], list):
280
+ ground_truths = [ans for ans in answers["text"] if ans and ans.strip()]
281
  else:
282
+ ground_truths = [answers["text"]] if answers["text"] and answers["text"].strip() else []
283
  elif isinstance(answers, list):
284
+ ground_truths = [ans for ans in answers if ans and ans.strip()]
285
 
286
  # Skip if no ground truth
287
  if not ground_truths:
 
336
  # Create results summary
337
  results_summary = f"""
338
  # 📊 CUAD Model Evaluation Results
339
+
340
+ ## ⚠️ Dataset Information
341
+ - **Dataset Used**: {dataset_name}
342
+ - **Dataset Status**: {"✅ Authentic CUAD" if "cuad" in dataset_name.lower() and "synthetic" not in dataset_name else "⚠️ Fallback/Synthetic Data"}
343
+
344
  ## 🎯 Overall Performance
345
  - **Model**: AvocadoMuffin/roberta-cuad-qa-v3
 
346
  - **Samples Evaluated**: {len(exact_matches)}
347
  - **Evaluation Date**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
348
 
 
361
  - **Samples with Multiple Ground Truths**: {len([p for p in predictions if p['Num_Ground_Truths'] > 1])}
362
 
363
  ## 🎯 Evaluation Quality
364
+ {"✅ This evaluation uses the proper CUAD dataset for contract understanding tasks." if "cuad" in dataset_name.lower() and "synthetic" not in dataset_name else "⚠️ WARNING: This evaluation used fallback data. Results may not be representative of actual CUAD performance."}
365
+
366
  The evaluation accounts for multiple ground truth answers where available, using the maximum score across all valid answers for each question.
367
  """
368
 
 
375
 
376
  detailed_results = {
377
  "model_name": "AvocadoMuffin/roberta-cuad-qa-v3",
378
+ "dataset": dataset_name,
379
  "num_samples": len(exact_matches),
380
  "exact_match_score": avg_exact_match,
381
  "f1_score": avg_f1_score,
382
  "evaluation_date": datetime.now().isoformat(),
383
  "evaluation_methodology": "max_over_ground_truths",
384
+ "dataset_authentic": "cuad" in dataset_name.lower() and "synthetic" not in dataset_name,
385
  "predictions": predictions,
386
  "summary_stats": {
387
  "avg_confidence": float(np.mean([p['Confidence'] for p in predictions])),
 
411
  <h1>πŸ›οΈ CUAD Model Evaluation Dashboard</h1>
412
  <p>Evaluate your CUAD (Contract Understanding Atticus Dataset) Question Answering model</p>
413
  <p><strong>Model:</strong> AvocadoMuffin/roberta-cuad-qa-v3</p>
414
+ <p><em>This tool will attempt to load the authentic CUAD dataset, with fallbacks if needed.</em></p>
415
  </div>
416
  """)
417
 
 
443
  <li><strong>Confidence</strong>: Model's confidence in its predictions</li>
444
  <li><strong>Max-over-GT</strong>: Best score across multiple ground truth answers</li>
445
  </ul>
446
+ <p><strong>Note:</strong> This tool will try to load the authentic CUAD dataset. If that fails, it will use synthetic contract data for testing purposes.</p>
447
  </div>
448
  """)
449
 
 
493
  <div style="text-align: center; margin-top: 30px; padding: 20px; color: #666;">
494
  <p>πŸ€– Powered by Hugging Face Transformers & Gradio</p>
495
  <p>πŸ“š CUAD Dataset by The Atticus Project</p>
496
+ <p><small>⚠️ If authentic CUAD data cannot be loaded, synthetic contract data will be used for testing purposes.</small></p>
497
  </div>
498
  """)
499
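For anyone who wants to exercise the new fallback chain locally, a minimal smoke test is sketched below. It assumes app.py is importable from the working directory and that importing it has no side effects such as launching the Gradio app, and that EVAL_TOKEN may be unset for the public dataset repositories; none of this is guaranteed by the diff itself.

import os
from app import load_cuad_dataset  # assumption: app.py is on the Python path and import-safe

hf_token = os.getenv("EVAL_TOKEN")  # may be None; public datasets usually load without it
test_data, dataset_name = load_cuad_dataset(hf_token)

print(f"Loaded '{dataset_name}' with {len(test_data)} samples")
sample = test_data[0]
print(sample["question"])            # a contract-review question
print(sample["answers"]["text"])     # list of acceptable ground-truth answers

If every remote load fails, dataset_name comes back as "synthetic_cuad" and the summary report flags the run as fallback/synthetic data.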