"""
Benchmark selection module for the Dynamic Highscores system.

This module handles browsing, selection, and loading of HuggingFace datasets
to be used as benchmarks for model evaluation.
"""

import os
import json
import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, list_datasets
from datasets import load_dataset, get_dataset_config_names
from functools import partial
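
# Example wiring (illustrative sketch only -- the concrete manager classes and
# import paths below are assumptions, not part of this module):
#
#     from database import DatabaseManager          # hypothetical
#     from auth import HuggingFaceAuthManager       # hypothetical
#
#     db_manager = DatabaseManager("dynamic_highscores.db")
#     auth_manager = HuggingFaceAuthManager()
#     selector = BenchmarkSelector(db_manager, auth_manager)
#     benchmark_ui = create_benchmark_selection_ui(selector, auth_manager)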
|

class BenchmarkSelector:
    """Benchmark selection manager for HuggingFace datasets."""

    def __init__(self, db_manager, auth_manager):
        """Initialize the benchmark selector.

        Args:
            db_manager: Database manager instance for benchmark storage
            auth_manager: Authentication manager instance for access control
        """
        self.db_manager = db_manager
        self.auth_manager = auth_manager
        self.hf_api = HfApi()

        # Dataset categories offered in the search UI.
        self.categories = [
            "All",
            "Text Generation",
            "Question Answering",
            "Summarization",
            "Translation",
            "Classification",
            "Code Generation",
            "Reasoning",
            "Math"
        ]

        # Default metric suggestions per category, used to pre-populate the
        # metrics checkboxes when a dataset is inspected.
        self.metric_templates = {
            "Text Generation": ["bleu", "rouge", "meteor"],
            "Question Answering": ["exact_match", "f1"],
            "Summarization": ["rouge1", "rouge2", "rougeL"],
            "Translation": ["bleu", "ter"],
            "Classification": ["accuracy", "f1", "precision", "recall"],
            "Code Generation": ["exact_match", "pass@k", "functional_correctness"],
            "Reasoning": ["accuracy", "consistency"],
            "Math": ["accuracy", "correct_steps"]
        }
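
        # Note: these metric names are loose identifiers for display and for the
        # evaluation backend to interpret; several of them (e.g. "rouge",
        # "exact_match", "accuracy") line up with ids accepted by
        # `evaluate.load(...)`, but that mapping is a project convention,
        # not a guarantee.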
|
    def search_datasets(self, query, category="All", limit=50):
        """Search for datasets on HuggingFace.

        Args:
            query: Search query string
            category: Dataset category to filter by
            limit: Maximum number of results to return

        Returns:
            list: List of dataset information dictionaries
        """
        try:
            # Map the display category to the hub's hyphenated task tag, e.g.
            # "Question Answering" -> "task_categories:question-answering".
            # The mapping is approximate; not every category has an exact tag.
            filter_str = None
            if category != "All":
                filter_str = f"task_categories:{category.lower().replace(' ', '-')}"

            datasets = list_datasets(
                search=query,
                filter=filter_str,
                limit=limit
            )

            # Normalize the hub objects into plain dictionaries so the UI layer
            # does not depend on huggingface_hub types. Attributes are optional
            # on DatasetInfo, so guard each access.
            results = []
            for dataset in datasets:
                dataset_description = ""
                if hasattr(dataset, 'description') and dataset.description:
                    dataset_description = dataset.description[:200] + "..." if len(dataset.description) > 200 else dataset.description

                dataset_tags = []
                if hasattr(dataset, 'tags') and dataset.tags:
                    dataset_tags = dataset.tags

                dataset_downloads = 0
                if hasattr(dataset, 'downloads') and dataset.downloads:
                    dataset_downloads = dataset.downloads

                dataset_author = ""
                if hasattr(dataset, 'author') and dataset.author:
                    dataset_author = dataset.author

                results.append({
                    "id": dataset.id,
                    "name": dataset.id.split("/")[-1],
                    "author": dataset_author,
                    "description": dataset_description,
                    "tags": dataset_tags,
                    "downloads": dataset_downloads
                })

            return results
        except Exception as e:
            print(f"Dataset search error: {e}")
            return []
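
    # Example (sketch, `selector` being a BenchmarkSelector instance):
    # selector.search_datasets("question answering", category="Question Answering")
    # returns a list of dicts shaped like
    # {"id": ..., "name": ..., "author": ..., "description": ..., "tags": [...], "downloads": ...}.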
|
    def get_dataset_info(self, dataset_id):
        """Get detailed information about a dataset.

        Args:
            dataset_id: HuggingFace dataset ID

        Returns:
            dict: Dataset information
        """
        try:
            dataset_info = self.hf_api.dataset_info(dataset_id)

            # Configuration names are optional; some datasets expose none.
            configs = []
            try:
                configs = get_dataset_config_names(dataset_id)
            except Exception as e:
                print(f"Error getting dataset configs: {e}")

            # Guard each optional attribute on DatasetInfo.
            dataset_description = ""
            if hasattr(dataset_info, 'description') and dataset_info.description:
                dataset_description = dataset_info.description

            dataset_citation = ""
            if hasattr(dataset_info, 'citation') and dataset_info.citation:
                dataset_citation = dataset_info.citation

            dataset_tags = []
            if hasattr(dataset_info, 'tags') and dataset_info.tags:
                dataset_tags = dataset_info.tags

            dataset_downloads = 0
            if hasattr(dataset_info, 'downloads') and dataset_info.downloads:
                dataset_downloads = dataset_info.downloads

            dataset_author = ""
            if hasattr(dataset_info, 'author') and dataset_info.author:
                dataset_author = dataset_info.author

            result = {
                "id": dataset_info.id,
                "name": dataset_info.id.split("/")[-1],
                "author": dataset_author,
                "description": dataset_description,
                "citation": dataset_citation,
                "configs": configs,
                "tags": dataset_tags,
                "downloads": dataset_downloads
            }

            return result
        except Exception as e:
            print(f"Dataset info error: {e}")
            return None
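
    # Example (sketch): selector.get_dataset_info("squad") returns a dict whose
    # "configs" entry (e.g. ["plain_text"]) populates the configuration dropdown;
    # None is returned when the dataset cannot be resolved.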
|
    def load_dataset_sample(self, dataset_id, config=None, split="train", sample_size=5):
        """Load a sample from a dataset.

        Args:
            dataset_id: HuggingFace dataset ID
            config: Dataset configuration name
            split: Dataset split to sample from
            sample_size: Number of samples to load

        Returns:
            dict: Dataset sample information
        """
        try:
            # Load the requested split, with or without a named configuration.
            if config:
                dataset = load_dataset(dataset_id, config, split=split)
            else:
                dataset = load_dataset(dataset_id, split=split)

            # Take at most `sample_size` rows for the preview.
            if len(dataset) > sample_size:
                sample = dataset.select(range(sample_size))
            else:
                sample = dataset

            features = list(sample.features.keys())

            # Flatten nested values (lists/dicts) to strings so they render
            # cleanly in a Gradio Dataframe.
            sample_data = []
            for item in sample:
                sample_item = {}
                for key in features:
                    if isinstance(item[key], (list, dict)):
                        sample_item[key] = str(item[key])
                    else:
                        sample_item[key] = item[key]
                sample_data.append(sample_item)

            result = {
                "id": dataset_id,
                "config": config,
                "split": split,
                "features": features,
                "sample": sample_data,
                "total_size": len(dataset)
            }

            return result
        except Exception as e:
            print(f"Dataset sample error: {e}")
            return None
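
    # A lighter-weight alternative for very large datasets (sketch, not used above):
    #
    #     from itertools import islice
    #     streamed = load_dataset(dataset_id, split=split, streaming=True)
    #     rows = list(islice(streamed, sample_size))
    #
    # The eager `select(range(...))` above is kept because it also yields the
    # total split size for display.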
|
    def add_benchmark(self, dataset_id, name=None, description=None, metrics=None, config=None):
        """Add a dataset as a benchmark.

        Args:
            dataset_id: HuggingFace dataset ID
            name: Benchmark name (defaults to dataset name)
            description: Benchmark description (defaults to dataset description)
            metrics: Metrics to use for evaluation
            config: Dataset configuration to use

        Returns:
            int: Benchmark ID if successful, None otherwise
        """
        try:
            # Fill in the name and/or description from the dataset card if missing.
            if not name or not description:
                dataset_info = self.get_dataset_info(dataset_id)
                if not dataset_info:
                    return None

                if not name:
                    name = dataset_info["name"]

                if not description:
                    description = dataset_info["description"]

            # Encode the configuration into the stored id ("dataset:config") so it
            # can be recovered at evaluation time.
            full_dataset_id = dataset_id
            if config:
                full_dataset_id = f"{dataset_id}:{config}"

            benchmark_id = self.db_manager.add_benchmark(
                name=name,
                dataset_id=full_dataset_id,
                description=description,
                metrics=metrics
            )

            return benchmark_id
        except Exception as e:
            print(f"Add benchmark error: {e}")
            return None
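
    # Example (sketch): selector.add_benchmark("glue", config="mrpc", metrics=["accuracy", "f1"])
    # stores the dataset id as "glue:mrpc" and returns the new benchmark's database id.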
|
    def get_benchmarks(self):
        """Get all available benchmarks.

        Returns:
            list: List of benchmark information dictionaries
        """
        return self.db_manager.get_benchmarks()
|

def create_benchmark_selection_ui(benchmark_selector, auth_manager):
    """Create the benchmark selection UI components.

    Args:
        benchmark_selector: Benchmark selector instance
        auth_manager: Authentication manager instance

    Returns:
        gr.Blocks: Gradio Blocks component with benchmark selection UI
    """
    with gr.Blocks() as benchmark_ui:
        gr.Markdown("## 📊 Dynamic Highscores Benchmark Selection")
        gr.Markdown("""
        ### Add your own datasets from HuggingFace as benchmarks!

        You can add any dataset from HuggingFace to use as a benchmark for evaluating models.
        Simply enter the dataset ID (e.g., 'squad', 'glue', 'hellaswag') and add it as a benchmark.

        Other users will be able to select your added benchmarks for their model evaluations.
        """, elem_classes=["info-text"])

        with gr.Tabs() as tabs:
            with gr.TabItem("➕ Add New Benchmark", id=0):
                # Dataset search controls.
                with gr.Row():
                    with gr.Column(scale=3):
                        search_input = gr.Textbox(
                            placeholder="Search for datasets on HuggingFace...",
                            label="Search",
                            show_label=False
                        )

                    with gr.Column(scale=1):
                        category_dropdown = gr.Dropdown(
                            choices=benchmark_selector.categories,
                            value="All",
                            label="Category"
                        )

                    with gr.Column(scale=1):
                        search_button = gr.Button("Search")

                dataset_results = gr.Dataframe(
                    headers=["Name", "Author", "Description", "Downloads"],
                    datatype=["str", "str", "str", "number"],
                    label="Search Results",
                    interactive=True
                )

                # Direct dataset-id entry and inspection.
                with gr.Row():
                    with gr.Column(scale=2):
                        dataset_id_input = gr.Textbox(
                            placeholder="Enter HuggingFace dataset ID (e.g., 'squad', 'glue', 'hellaswag')",
                            label="Dataset ID",
                            info="You can enter any dataset ID from HuggingFace"
                        )

                    with gr.Column(scale=1):
                        view_button = gr.Button("View Dataset Details")

                with gr.Accordion("Dataset Details", open=False):
                    dataset_info = gr.JSON(label="Dataset Information")

                with gr.Row():
                    config_dropdown = gr.Dropdown(
                        label="Configuration",
                        choices=[],
                        interactive=True
                    )

                    split_dropdown = gr.Dropdown(
                        label="Split",
                        choices=["train", "validation", "test"],
                        value="train",
                        interactive=True
                    )

                sample_button = gr.Button("Load Sample")

                sample_data = gr.Dataframe(
                    label="Sample Data",
                    interactive=False
                )

                gr.Markdown("### Add this dataset as a benchmark")
                with gr.Row():
                    with gr.Column(scale=2):
                        benchmark_name = gr.Textbox(
                            placeholder="Enter a name for this benchmark",
                            label="Benchmark Name",
                            info="A descriptive name for this benchmark"
                        )

                        benchmark_description = gr.Textbox(
                            placeholder="Enter a description for this benchmark",
                            label="Description",
                            info="Explain what this benchmark evaluates",
                            lines=3
                        )

                    with gr.Column(scale=1):
                        metrics_input = gr.CheckboxGroup(
                            label="Evaluation Metrics",
                            choices=[],
                            interactive=True,
                            info="Select metrics to use for evaluation"
                        )

                with gr.Row():
                    add_benchmark_button = gr.Button("Add as Benchmark", size="lg", variant="primary")

                benchmark_status = gr.Markdown("")

            with gr.TabItem("📋 Available Benchmarks", id=1):
                gr.Markdown("### Benchmarks available for model evaluation")
                gr.Markdown("These benchmarks can be selected when submitting models for evaluation.")

                with gr.Row():
                    refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
                    reload_sample_benchmarks_button = gr.Button("Reload Sample Benchmarks", variant="secondary")

                reload_status = gr.Markdown("")

                with gr.Column() as benchmarks_container:
                    no_benchmarks_message = gr.Markdown(
                        "### No Datasets Added Yet\n\nBe the first to add a benchmark dataset! Go to the 'Add New Benchmark' tab to add a dataset from HuggingFace.",
                        visible=True
                    )

                    my_benchmarks = gr.Dataframe(
                        headers=["ID", "Name", "Dataset", "Description"],
                        label="Available Benchmarks",
                        interactive=True,
                        visible=False
                    )

        # --- Event handlers -----------------------------------------------------

        def search_datasets_handler(query, category):
            if not query:
                return None

            results = benchmark_selector.search_datasets(query, category)

            # Format the results to match the Dataframe headers
            # ["Name", "Author", "Description", "Downloads"].
            formatted_results = []
            for result in results:
                formatted_results.append([
                    result["name"],
                    result["author"],
                    result["description"],
                    result["downloads"]
                ])

            return formatted_results

        def view_dataset_handler(dataset_id):
            if not dataset_id:
                return None, gr.update(choices=[]), gr.update(choices=[])

            info = benchmark_selector.get_dataset_info(dataset_id)

            if not info:
                return None, gr.update(choices=[]), gr.update(choices=[])

            # Suggest metrics whose category name overlaps with the dataset's tags.
            # Hub tags look like "task_categories:question-answering", so a substring
            # match is used rather than exact equality.
            tags_lower = [t.lower() for t in info["tags"]]
            metrics = []
            for category, category_metrics in benchmark_selector.metric_templates.items():
                category_words = category.lower().split()
                if any(word in tag for tag in tags_lower for word in category_words):
                    metrics.extend(category_metrics)

            # De-duplicate while keeping a stable order.
            metrics = sorted(set(metrics))

            configs = info["configs"]
            return (
                info,
                gr.update(choices=configs, value=configs[0] if configs else None),
                gr.update(choices=metrics)
            )

        def load_sample_handler(dataset_id, config, split):
            if not dataset_id:
                return None

            sample_info = benchmark_selector.load_dataset_sample(
                dataset_id,
                config=config if config else None,
                split=split
            )

            if not sample_info:
                return None

            # Convert the list of row dicts to a pandas DataFrame so the Gradio
            # Dataframe renders columns with headers.
            return pd.DataFrame(sample_info["sample"])

        def add_benchmark_handler(dataset_id, config, name, description, metrics, request: gr.Request):
            if not dataset_id:
                return "Please enter a dataset ID from HuggingFace."

            # Only logged-in users may add benchmarks.
            user = auth_manager.check_login(request)

            if not user:
                return "Please log in to add benchmarks."

            benchmark_id = benchmark_selector.add_benchmark(
                dataset_id=dataset_id,
                name=name if name else None,
                description=description if description else None,
                metrics=metrics if metrics else None,
                config=config if config else None
            )

            if benchmark_id:
                return f"✅ Benchmark added successfully with ID: {benchmark_id}\n\nThis dataset is now available for model evaluation. You can view it in the 'Available Benchmarks' tab."
            else:
                return "❌ Failed to add benchmark. Please check the dataset ID and try again."

        def get_benchmarks_handler(request: gr.Request):
            # Viewing benchmarks requires a logged-in user.
            user = auth_manager.check_login(request)

            if not user:
                return gr.update(visible=True), gr.update(visible=False)

            benchmarks = benchmark_selector.get_benchmarks()

            if not benchmarks:
                return gr.update(visible=True), gr.update(visible=False)

            # Format to match the Dataframe headers ["ID", "Name", "Dataset", "Description"].
            formatted_benchmarks = []
            for benchmark in benchmarks:
                formatted_benchmarks.append([
                    benchmark["id"],
                    benchmark["name"],
                    benchmark["dataset_id"],
                    benchmark["description"]
                ])

            return gr.update(visible=False), gr.update(visible=True, value=formatted_benchmarks)

        def reload_sample_benchmarks_handler():
            try:
                # sample_benchmarks is expected to provide add_sample_benchmarks(),
                # which re-adds the default benchmark set and returns a count.
                from sample_benchmarks import add_sample_benchmarks
                num_added = add_sample_benchmarks()
                return f"✅ Successfully reloaded {num_added} sample benchmarks."
            except Exception as e:
                return f"❌ Error reloading benchmarks: {str(e)}"
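
        # --- Event wiring -------------------------------------------------------
        # Handlers that declare a `request: gr.Request` parameter receive the request
        # object from Gradio automatically; it is not listed in `inputs` below.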
|
        search_button.click(
            fn=search_datasets_handler,
            inputs=[search_input, category_dropdown],
            outputs=[dataset_results]
        )

        view_button.click(
            fn=view_dataset_handler,
            inputs=[dataset_id_input],
            outputs=[dataset_info, config_dropdown, metrics_input]
        )

        sample_button.click(
            fn=load_sample_handler,
            inputs=[dataset_id_input, config_dropdown, split_dropdown],
            outputs=[sample_data]
        )

        add_benchmark_button.click(
            fn=add_benchmark_handler,
            inputs=[dataset_id_input, config_dropdown, benchmark_name, benchmark_description, metrics_input],
            outputs=[benchmark_status]
        )

        refresh_benchmarks_button.click(
            fn=get_benchmarks_handler,
            inputs=[],
            outputs=[no_benchmarks_message, my_benchmarks]
        )

        reload_sample_benchmarks_button.click(
            fn=reload_sample_benchmarks_handler,
            inputs=[],
            outputs=[reload_status]
        )

        # Populate the benchmarks list when the UI first loads.
        benchmark_ui.load(
            fn=get_benchmarks_handler,
            inputs=[],
            outputs=[no_benchmarks_message, my_benchmarks]
        )

    return benchmark_ui
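
# The returned Blocks can be launched on its own or mounted inside a larger app.
# Minimal sketch (assumes the selector/auth objects from the example near the top
# of this module):
#
#     demo = create_benchmark_selection_ui(selector, auth_manager)
#     demo.launch()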