Quazim0t0 committed on
Commit e3d539f · verified · 1 Parent(s): 423b803

Delete benchmark_selection.py

Files changed (1)
benchmark_selection.py +0 -511
benchmark_selection.py DELETED
@@ -1,511 +0,0 @@
- """
- Benchmark selection module for Dynamic Highscores system.
-
- This module handles browsing, selection, and loading of HuggingFace datasets
- to be used as benchmarks for model evaluation.
- """
-
- import os
- import json
- import gradio as gr
- from huggingface_hub import HfApi, list_datasets
- from datasets import load_dataset, get_dataset_config_names
- from functools import partial
-
- class BenchmarkSelector:
-     """Benchmark selection manager for HuggingFace datasets."""
-
-     def __init__(self, db_manager, auth_manager):
-         """Initialize the benchmark selector.
-
-         Args:
-             db_manager: Database manager instance for benchmark storage
-             auth_manager: Authentication manager instance for access control
-         """
-         self.db_manager = db_manager
-         self.auth_manager = auth_manager
-         self.hf_api = HfApi()
-
-         # Common benchmark categories for filtering
-         self.categories = [
-             "All",
-             "Text Generation",
-             "Question Answering",
-             "Summarization",
-             "Translation",
-             "Classification",
-             "Code Generation",
-             "Reasoning",
-             "Math"
-         ]
-
-         # Common metrics for different benchmark types
-         self.metric_templates = {
-             "Text Generation": ["bleu", "rouge", "meteor"],
-             "Question Answering": ["exact_match", "f1"],
-             "Summarization": ["rouge1", "rouge2", "rougeL"],
-             "Translation": ["bleu", "ter"],
-             "Classification": ["accuracy", "f1", "precision", "recall"],
-             "Code Generation": ["exact_match", "pass@k", "functional_correctness"],
-             "Reasoning": ["accuracy", "consistency"],
-             "Math": ["accuracy", "correct_steps"]
-         }
-
-     def search_datasets(self, query, category="All", limit=50):
-         """Search for datasets on HuggingFace.
-
-         Args:
-             query: Search query string
-             category: Dataset category to filter by
-             limit: Maximum number of results to return
-
-         Returns:
-             list: List of dataset information dictionaries
-         """
-         try:
-             # Apply category filter if not "All"
-             filter_str = None
-             if category != "All":
-                 filter_str = f"task_categories:{category}"
-
-             # Search for datasets
-             datasets = list_datasets(
-                 search=query,
-                 filter=filter_str,
-                 limit=limit
-             )
-
-             # Format results
-             results = []
-             for dataset in datasets:
-                 results.append({
-                     "id": dataset.id,
-                     "name": dataset.id.split("/")[-1],
-                     "author": dataset.author,
-                     "description": dataset.description[:200] + "..." if dataset.description and len(dataset.description) > 200 else dataset.description,
-                     "tags": dataset.tags,
-                     "downloads": dataset.downloads
-                 })
-
-             return results
-         except Exception as e:
-             print(f"Dataset search error: {e}")
-             return []
-
-     def get_dataset_info(self, dataset_id):
-         """Get detailed information about a dataset.
-
-         Args:
-             dataset_id: HuggingFace dataset ID
-
-         Returns:
-             dict: Dataset information
-         """
-         try:
-             # Get dataset info from HuggingFace
-             dataset_info = self.hf_api.dataset_info(dataset_id)
-
-             # Get available configurations
-             configs = get_dataset_config_names(dataset_id)
-
-             # Format result
-             result = {
-                 "id": dataset_info.id,
-                 "name": dataset_info.id.split("/")[-1],
-                 "author": dataset_info.author,
-                 "description": dataset_info.description,
-                 "citation": dataset_info.citation,
-                 "configs": configs,
-                 "tags": dataset_info.tags,
-                 "downloads": dataset_info.downloads
-             }
-
-             return result
-         except Exception as e:
-             print(f"Dataset info error: {e}")
-             return None
-
-     def load_dataset_sample(self, dataset_id, config=None, split="train", sample_size=5):
-         """Load a sample from a dataset.
-
-         Args:
-             dataset_id: HuggingFace dataset ID
-             config: Dataset configuration name
-             split: Dataset split to sample from
-             sample_size: Number of samples to load
-
-         Returns:
-             dict: Dataset sample information
-         """
-         try:
-             # Load dataset
-             if config:
-                 dataset = load_dataset(dataset_id, config, split=split)
-             else:
-                 dataset = load_dataset(dataset_id, split=split)
-
-             # Get sample
-             if len(dataset) > sample_size:
-                 sample = dataset.select(range(sample_size))
-             else:
-                 sample = dataset
-
-             # Get features
-             features = list(sample.features.keys())
-
-             # Convert sample to list of dictionaries
-             sample_data = []
-             for item in sample:
-                 sample_item = {}
-                 for key in features:
-                     # Convert non-serializable values to strings
-                     if isinstance(item[key], (list, dict)):
-                         sample_item[key] = str(item[key])
-                     else:
-                         sample_item[key] = item[key]
-                 sample_data.append(sample_item)
-
-             # Format result
-             result = {
-                 "id": dataset_id,
-                 "config": config,
-                 "split": split,
-                 "features": features,
-                 "sample": sample_data,
-                 "total_size": len(dataset)
-             }
-
-             return result
-         except Exception as e:
-             print(f"Dataset sample error: {e}")
-             return None
-
-     def add_benchmark(self, dataset_id, name=None, description=None, metrics=None, config=None):
-         """Add a dataset as a benchmark.
-
-         Args:
-             dataset_id: HuggingFace dataset ID
-             name: Benchmark name (defaults to dataset name)
-             description: Benchmark description (defaults to dataset description)
-             metrics: Metrics to use for evaluation
-             config: Dataset configuration to use
-
-         Returns:
-             int: Benchmark ID if successful, None otherwise
-         """
-         try:
-             # Get dataset info if name or description not provided
-             if not name or not description:
-                 dataset_info = self.get_dataset_info(dataset_id)
-                 if not dataset_info:
-                     return None
-
-                 if not name:
-                     name = dataset_info["name"]
-
-                 if not description:
-                     description = dataset_info["description"]
-
-             # Format dataset ID with config if provided
-             full_dataset_id = dataset_id
-             if config:
-                 full_dataset_id = f"{dataset_id}:{config}"
-
-             # Add benchmark to database
-             benchmark_id = self.db_manager.add_benchmark(
-                 name=name,
-                 dataset_id=full_dataset_id,
-                 description=description,
-                 metrics=metrics
-             )
-
-             return benchmark_id
-         except Exception as e:
-             print(f"Add benchmark error: {e}")
-             return None
-
-     def get_benchmarks(self):
-         """Get all available benchmarks.
-
-         Returns:
-             list: List of benchmark information dictionaries
-         """
-         return self.db_manager.get_benchmarks()
-
- # Benchmark selection UI components
- def create_benchmark_selection_ui(benchmark_selector, auth_manager):
-     """Create the benchmark selection UI components.
-
-     Args:
-         benchmark_selector: Benchmark selector instance
-         auth_manager: Authentication manager instance
-
-     Returns:
-         gr.Blocks: Gradio Blocks component with benchmark selection UI
-     """
-     with gr.Blocks() as benchmark_ui:
-         gr.Markdown("## 📊 Dynamic Highscores Benchmark Selection")
-         gr.Markdown("""
-         ### Add your own datasets from HuggingFace as benchmarks!
-
-         You can add any dataset from HuggingFace to use as a benchmark for evaluating models.
-         Simply enter the dataset ID (e.g., 'squad', 'glue', 'hellaswag') and add it as a benchmark.
-
-         Other users will be able to select your added benchmarks for their model evaluations.
-         """, elem_classes=["info-text"])
-
-         with gr.Tabs() as tabs:
-             with gr.TabItem("➕ Add New Benchmark", id=0):
-                 with gr.Row():
-                     with gr.Column(scale=3):
-                         search_input = gr.Textbox(
-                             placeholder="Search for datasets on HuggingFace...",
-                             label="Search",
-                             show_label=False
-                         )
-
-                     with gr.Column(scale=1):
-                         category_dropdown = gr.Dropdown(
-                             choices=benchmark_selector.categories,
-                             value="All",
-                             label="Category"
-                         )
-
-                     with gr.Column(scale=1):
-                         search_button = gr.Button("Search")
-
-                 dataset_results = gr.Dataframe(
-                     headers=["Name", "Author", "Description", "Downloads"],
-                     datatype=["str", "str", "str", "number"],
-                     label="Search Results",
-                     interactive=True
-                 )
-
-                 with gr.Row():
-                     with gr.Column(scale=2):
-                         dataset_id_input = gr.Textbox(
-                             placeholder="Enter HuggingFace dataset ID (e.g., 'squad', 'glue', 'hellaswag')",
-                             label="Dataset ID",
-                             info="You can enter any dataset ID from HuggingFace"
-                         )
-
-                     with gr.Column(scale=1):
-                         view_button = gr.Button("View Dataset Details")
-
-                 with gr.Accordion("Dataset Details", open=False):
-                     dataset_info = gr.JSON(label="Dataset Information")
-
-                     with gr.Row():
-                         config_dropdown = gr.Dropdown(
-                             label="Configuration",
-                             choices=[],
-                             interactive=True
-                         )
-
-                         split_dropdown = gr.Dropdown(
-                             label="Split",
-                             choices=["train", "validation", "test"],
-                             value="train",
-                             interactive=True
-                         )
-
-                     sample_button = gr.Button("Load Sample")
-
-                     sample_data = gr.Dataframe(
-                         label="Sample Data",
-                         interactive=False
-                     )
-
-                 gr.Markdown("### Add this dataset as a benchmark")
-                 with gr.Row():
-                     with gr.Column(scale=2):
-                         benchmark_name = gr.Textbox(
-                             placeholder="Enter a name for this benchmark",
-                             label="Benchmark Name",
-                             info="A descriptive name for this benchmark"
-                         )
-
-                         benchmark_description = gr.Textbox(
-                             placeholder="Enter a description for this benchmark",
-                             label="Description",
-                             info="Explain what this benchmark evaluates",
-                             lines=3
-                         )
-
-                     with gr.Column(scale=1):
-                         metrics_input = gr.CheckboxGroup(
-                             label="Evaluation Metrics",
-                             choices=[],
-                             interactive=True,
-                             info="Select metrics to use for evaluation"
-                         )
-
-                 with gr.Row():
-                     add_benchmark_button = gr.Button("Add as Benchmark", size="lg", variant="primary")
-
-                 benchmark_status = gr.Markdown("")
-
-             with gr.TabItem("📋 Available Benchmarks", id=1):
-                 gr.Markdown("### Benchmarks available for model evaluation")
-                 gr.Markdown("These benchmarks can be selected when submitting models for evaluation.")
-
-                 with gr.Row():
-                     refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
-
-                 benchmarks_container = gr.Column()
-                 with benchmarks_container:
-                     no_benchmarks_message = gr.Markdown(
-                         "### No Datasets Added Yet\n\nBe the first to add a benchmark dataset! Go to the 'Add New Benchmark' tab to add a dataset from HuggingFace.",
-                         visible=True
-                     )
-
-                     my_benchmarks = gr.Dataframe(
-                         headers=["ID", "Name", "Dataset", "Description"],
-                         label="Available Benchmarks",
-                         interactive=True,
-                         visible=False
-                     )
-
-         # Event handlers
-         def search_datasets_handler(query, category):
-             if not query:
-                 return None
-
-             results = benchmark_selector.search_datasets(query, category)
-
-             # Format for dataframe
-             formatted_results = []
-             for result in results:
-                 formatted_results.append([
-                     result["name"],
-                     result["author"],
-                     result["description"],
-                     result["downloads"]
-                 ])
-
-             return formatted_results
-
-         def view_dataset_handler(dataset_id):
-             if not dataset_id:
-                 return None, [], None
-
-             dataset_info = benchmark_selector.get_dataset_info(dataset_id)
-
-             if not dataset_info:
-                 return None, [], None
-
-             # Update metrics based on dataset tags
-             metrics = []
-             for category, category_metrics in benchmark_selector.metric_templates.items():
-                 if any(tag.lower() in [t.lower() for t in dataset_info["tags"]] for tag in category.lower().split()):
-                     metrics.extend(category_metrics)
-
-             # Remove duplicates
-             metrics = list(set(metrics))
-
-             return dataset_info, dataset_info["configs"], gr.update(choices=metrics)
-
-         def load_sample_handler(dataset_id, config, split):
-             if not dataset_id:
-                 return None
-
-             sample_info = benchmark_selector.load_dataset_sample(
-                 dataset_id,
-                 config=config if config else None,
-                 split=split
-             )
-
-             if not sample_info:
-                 return None
-
-             return sample_info["sample"]
-
-         def add_benchmark_handler(dataset_id, config, name, description, metrics, request: gr.Request):
-             if not dataset_id:
-                 return "Please enter a dataset ID from HuggingFace."
-
-             # Check if user is logged in
-             user = auth_manager.check_login(request)
-
-             if not user:
-                 return "Please log in to add benchmarks."
-
-             # Add benchmark
-             benchmark_id = benchmark_selector.add_benchmark(
-                 dataset_id=dataset_id,
-                 name=name if name else None,
-                 description=description if description else None,
-                 metrics=metrics if metrics else None,
-                 config=config if config else None
-             )
-
-             if benchmark_id:
-                 return f"✅ Benchmark added successfully with ID: {benchmark_id}\n\nThis dataset is now available for model evaluation. You can view it in the 'Available Benchmarks' tab."
-             else:
-                 return "❌ Failed to add benchmark. Please check the dataset ID and try again."
-
-         def get_benchmarks_handler(request: gr.Request):
-             # Check if user is logged in
-             user = auth_manager.check_login(request)
-
-             if not user:
-                 return gr.update(visible=True), gr.update(visible=False), None
-
-             # Get benchmarks
-             benchmarks = benchmark_selector.get_benchmarks()
-
-             # If no benchmarks, show message
-             if not benchmarks or len(benchmarks) == 0:
-                 return gr.update(visible=True), gr.update(visible=False), None
-
-             # Format for dataframe
-             formatted_benchmarks = []
-             for benchmark in benchmarks:
-                 formatted_benchmarks.append([
-                     benchmark["id"],
-                     benchmark["name"],
-                     benchmark["dataset_id"],
-                     benchmark["description"]
-                 ])
-
-             return gr.update(visible=False), gr.update(visible=True), formatted_benchmarks
-
-         # Connect event handlers
-         search_button.click(
-             fn=search_datasets_handler,
-             inputs=[search_input, category_dropdown],
-             outputs=[dataset_results]
-         )
-
-         view_button.click(
-             fn=view_dataset_handler,
-             inputs=[dataset_id_input],
-             outputs=[dataset_info, config_dropdown, metrics_input]
-         )
-
-         sample_button.click(
-             fn=load_sample_handler,
-             inputs=[dataset_id_input, config_dropdown, split_dropdown],
-             outputs=[sample_data]
-         )
-
-         add_benchmark_button.click(
-             fn=add_benchmark_handler,
-             inputs=[dataset_id_input, config_dropdown, benchmark_name, benchmark_description, metrics_input],
-             outputs=[benchmark_status]
-         )
-
-         refresh_benchmarks_button.click(
-             fn=get_benchmarks_handler,
-             inputs=[],
-             outputs=[no_benchmarks_message, my_benchmarks, my_benchmarks]
-         )
-
-         # Initialize benchmarks on load
-         benchmark_ui.load(
-             fn=get_benchmarks_handler,
-             inputs=[],
-             outputs=[no_benchmarks_message, my_benchmarks, my_benchmarks]
-         )
-
-     return benchmark_ui
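For reference, a minimal sketch of how the deleted module was presumably wired into the Space before this commit. This is not confirmed by the commit itself: the `app.py` location, the `database`/`auth` module names, and the `DatabaseManager`/`AuthManager` classes are assumptions inferred from the constructor arguments and handler calls (`db_manager.add_benchmark`, `auth_manager.check_login`) in the file above.

# Hypothetical wiring (e.g. in app.py); module and class names below are assumed, not from this commit.
from database import DatabaseManager   # assumed: provides add_benchmark() and get_benchmarks()
from auth import AuthManager           # assumed: provides check_login(request)
from benchmark_selection import BenchmarkSelector, create_benchmark_selection_ui

db_manager = DatabaseManager()
auth_manager = AuthManager()

# Build the selector, create its Gradio Blocks UI, and serve it.
selector = BenchmarkSelector(db_manager, auth_manager)
demo = create_benchmark_selection_ui(selector, auth_manager)
demo.launch()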