qLeaderboard-aBase4Community / sample_benchmarks.py
"""
Sample benchmark initialization for the Dynamic Highscores system.
This script adds a set of sample benchmarks to the database so that users have initial options to choose from.
"""
from database_schema import DynamicHighscoresDB
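
# NOTE: DynamicHighscoresDB is assumed (based on how it is used below) to expose
# add_benchmark(name, dataset_id, description, metrics), returning the new
# benchmark's ID (or a falsy value if nothing was inserted), and close() to
# release the underlying connection. See database_schema.py for the actual
# implementation.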


def add_sample_benchmarks():
    """Add sample benchmarks to the database."""
    # Initialize database
    db = DynamicHighscoresDB()

    # Sample benchmarks to add
    sample_benchmarks = [
        {
            "name": "MMLU (Massive Multitask Language Understanding)",
            "dataset_id": "cais/mmlu",
            "description": "A benchmark for measuring massive multitask language understanding across 57 tasks including elementary mathematics, US history, computer science, law, and more.",
            "metrics": {"accuracy": 1.0, "consistency": 1.0}
        },
        {
            "name": "HumanEval (Code Generation)",
            "dataset_id": "openai/humaneval",
            "description": "A benchmark for evaluating language models on code generation tasks. It consists of 164 programming problems with unit tests.",
            "metrics": {"pass@1": 1.0, "functional_correctness": 1.0}
        },
        {
            "name": "HellaSwag (Commonsense Reasoning)",
            "dataset_id": "hellaswag",
            "description": "A challenge dataset for evaluating commonsense natural language inference. It consists of multiple-choice questions about grounded situations.",
            "metrics": {"accuracy": 1.0}
        },
        {
            "name": "GSM8K (Grade School Math)",
            "dataset_id": "gsm8k",
            "description": "A dataset of 8.5K high-quality grade school math word problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations.",
            "metrics": {"accuracy": 1.0, "correct_steps": 1.0}
        },
        {
            "name": "TruthfulQA",
            "dataset_id": "truthful_qa",
            "description": "A benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance, and politics.",
            "metrics": {"accuracy": 1.0, "truthfulness": 1.0}
        }
    ]

    # Add each benchmark to the database
    added_count = 0
    for benchmark in sample_benchmarks:
        try:
            benchmark_id = db.add_benchmark(
                name=benchmark["name"],
                dataset_id=benchmark["dataset_id"],
                description=benchmark["description"],
                metrics=benchmark["metrics"]
            )
            if benchmark_id:
                print(f"Added benchmark '{benchmark['name']}' with ID: {benchmark_id}")
                added_count += 1
        except Exception as e:
            print(f"Error adding benchmark '{benchmark['name']}': {e}")

    # Close database connection
    db.close()

    return added_count


if __name__ == "__main__":
    num_added = add_sample_benchmarks()
    print(f"Added {num_added} sample benchmarks to the database.")