"""
Sample benchmarks initialization for Dynamic Highscores system.
This script adds sample benchmarks to the database to provide initial options for users.
"""

from database_schema import DynamicHighscoresDB
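
# DynamicHighscoresDB is assumed to expose add_benchmark(name, dataset_id,
# description, metrics) and close(), as used below; adjust if the
# database_schema module differs.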


def add_sample_benchmarks():
"""Add sample benchmarks to the database."""
# Initialize database
db = DynamicHighscoresDB()
# Sample benchmarks to add
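    # Each "metrics" dict below maps a metric name to a weight; the 1.0
    # values assume add_benchmark() treats these as equally weighted
    # metrics (adjust if your schema interprets them differently).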
    sample_benchmarks = [
        {
            "name": "MMLU (Massive Multitask Language Understanding)",
            "dataset_id": "cais/mmlu",
            "description": "A benchmark for measuring massive multitask language understanding across 57 tasks including elementary mathematics, US history, computer science, law, and more.",
            "metrics": {"accuracy": 1.0, "consistency": 1.0}
        },
        {
            "name": "HumanEval (Code Generation)",
            "dataset_id": "openai/humaneval",
"description": "A benchmark for evaluating language models on code generation tasks. It consists of 164 programming problems with unit tests.",
"metrics": {"pass@1": 1.0, "functional_correctness": 1.0}
},
{
"name": "HellaSwag (Commonsense Reasoning)",
"dataset_id": "hellaswag",
"description": "A challenge dataset for evaluating commonsense natural language inference. It consists of multiple-choice questions about grounded situations.",
"metrics": {"accuracy": 1.0}
},
{
"name": "GSM8K (Grade School Math)",
"dataset_id": "gsm8k",
"description": "A dataset of 8.5K high quality grade school math word problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations.",
"metrics": {"accuracy": 1.0, "correct_steps": 1.0}
},
{
"name": "TruthfulQA",
"dataset_id": "truthful_qa",
"description": "A benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance and politics.",
"metrics": {"accuracy": 1.0, "truthfulness": 1.0}
}
]

    # Add each benchmark to the database
    added_count = 0
    for benchmark in sample_benchmarks:
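        # add_benchmark() is assumed to return the new row's ID on success;
        # a falsy return is counted as "not added", and per-benchmark errors
        # are reported without aborting the remaining inserts.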
        try:
            benchmark_id = db.add_benchmark(
                name=benchmark["name"],
                dataset_id=benchmark["dataset_id"],
                description=benchmark["description"],
                metrics=benchmark["metrics"]
            )
            if benchmark_id:
                print(f"Added benchmark '{benchmark['name']}' with ID: {benchmark_id}")
                added_count += 1
        except Exception as e:
            print(f"Error adding benchmark '{benchmark['name']}': {e}")

    # Close database connection
    db.close()

    return added_count


if __name__ == "__main__":
    num_added = add_sample_benchmarks()
    print(f"Added {num_added} sample benchmarks to the database.")