""" |
|
Sample benchmarks initialization for Dynamic Highscores system. |
|
|
|
This script adds sample benchmarks to the database to provide initial options for users. |
|
""" |
|
|
|
from database_schema import DynamicHighscoresDB |
|
|
|
def add_sample_benchmarks():
    """Add sample benchmarks to the database."""
    db = DynamicHighscoresDB()

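    # Sample benchmarks to seed the database. Each dict supplies the
    # arguments passed to db.add_benchmark() below.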
    sample_benchmarks = [
        {
            "name": "MMLU (Massive Multitask Language Understanding)",
            "dataset_id": "cais/mmlu",
            "description": "A benchmark for measuring massive multitask language understanding across 57 tasks, including elementary mathematics, US history, computer science, law, and more.",
            "metrics": {"accuracy": 1.0, "consistency": 1.0}
        },
        {
            "name": "HumanEval (Code Generation)",
            "dataset_id": "openai/humaneval",
            "description": "A benchmark for evaluating language models on code generation tasks. It consists of 164 programming problems with unit tests.",
            "metrics": {"pass@1": 1.0, "functional_correctness": 1.0}
        },
        {
            "name": "HellaSwag (Commonsense Reasoning)",
            "dataset_id": "hellaswag",
            "description": "A challenge dataset for evaluating commonsense natural language inference. It consists of multiple-choice questions about grounded situations.",
            "metrics": {"accuracy": 1.0}
        },
        {
            "name": "GSM8K (Grade School Math)",
            "dataset_id": "gsm8k",
            "description": "A dataset of 8.5K high-quality grade school math word problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations.",
            "metrics": {"accuracy": 1.0, "correct_steps": 1.0}
        },
        {
            "name": "TruthfulQA",
            "dataset_id": "truthful_qa",
            "description": "A benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance, and politics.",
            "metrics": {"accuracy": 1.0, "truthfulness": 1.0}
        }
    ]

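    # Add each benchmark, counting successful inserts. A failure for one
    # benchmark is reported and does not stop the remaining inserts.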
    added_count = 0
    for benchmark in sample_benchmarks:
        try:
            benchmark_id = db.add_benchmark(
                name=benchmark["name"],
                dataset_id=benchmark["dataset_id"],
                description=benchmark["description"],
                metrics=benchmark["metrics"]
            )

            if benchmark_id:
                print(f"Added benchmark '{benchmark['name']}' with ID: {benchmark_id}")
                added_count += 1
        except Exception as e:
            print(f"Error adding benchmark '{benchmark['name']}': {e}")

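    # Close the database connection once all inserts have been attempted.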
    db.close()

    return added_count


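# Seed the database with the sample benchmarks when run as a script.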
if __name__ == "__main__": |
|
num_added = add_sample_benchmarks() |
|
print(f"Added {num_added} sample benchmarks to the database.") |