Delete sample_benchmarks.py
sample_benchmarks.py (DELETED, +0 -66)
@@ -1,66 +0,0 @@
"""
Sample benchmarks initialization for Dynamic Highscores system.

This script adds sample benchmarks to the database to provide initial options for users.
"""

from database_schema import init_db

def add_sample_benchmarks():
    """Add sample benchmarks to the database."""
    # Initialize database
    db = init_db()

    # Sample benchmarks to add
    sample_benchmarks = [
        {
            "name": "MMLU (Massive Multitask Language Understanding)",
            "dataset_id": "cais/mmlu",
            "description": "A benchmark for measuring massive multitask language understanding across 57 tasks including elementary mathematics, US history, computer science, law, and more.",
            "metrics": {"accuracy": 1.0, "consistency": 1.0}
        },
        {
            "name": "HumanEval (Code Generation)",
            "dataset_id": "openai/humaneval",
            "description": "A benchmark for evaluating language models on code generation tasks. It consists of 164 programming problems with unit tests.",
            "metrics": {"pass@1": 1.0, "functional_correctness": 1.0}
        },
        {
            "name": "HellaSwag (Commonsense Reasoning)",
            "dataset_id": "hellaswag",
            "description": "A challenge dataset for evaluating commonsense natural language inference. It consists of multiple-choice questions about grounded situations.",
            "metrics": {"accuracy": 1.0}
        },
        {
            "name": "GSM8K (Grade School Math)",
            "dataset_id": "gsm8k",
            "description": "A dataset of 8.5K high quality grade school math word problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations.",
            "metrics": {"accuracy": 1.0, "correct_steps": 1.0}
        },
        {
            "name": "TruthfulQA",
            "dataset_id": "truthful_qa",
            "description": "A benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance and politics.",
            "metrics": {"accuracy": 1.0, "truthfulness": 1.0}
        }
    ]

    # Add each benchmark to the database
    for benchmark in sample_benchmarks:
        benchmark_id = db.add_benchmark(
            name=benchmark["name"],
            dataset_id=benchmark["dataset_id"],
            description=benchmark["description"],
            metrics=benchmark["metrics"]
        )

        print(f"Added benchmark '{benchmark['name']}' with ID: {benchmark_id}")

    # Close database connection
    db.close()

    return len(sample_benchmarks)

if __name__ == "__main__":
    num_added = add_sample_benchmarks()
    print(f"Added {num_added} sample benchmarks to the database.")