Delete sample_benchmarks.py
sample_benchmarks.py (DELETED, +0 -66)
@@ -1,66 +0,0 @@
"""
Sample benchmarks initialization for Dynamic Highscores system.

This script adds sample benchmarks to the database to provide initial options for users.
"""

from database_schema import init_db

def add_sample_benchmarks():
    """Add sample benchmarks to the database."""
    # Initialize database
    db = init_db()

    # Sample benchmarks to add
    sample_benchmarks = [
        {
            "name": "MMLU (Massive Multitask Language Understanding)",
            "dataset_id": "cais/mmlu",
            "description": "A benchmark for measuring massive multitask language understanding across 57 tasks including elementary mathematics, US history, computer science, law, and more.",
            "metrics": {"accuracy": 1.0, "consistency": 1.0}
        },
        {
            "name": "HumanEval (Code Generation)",
            "dataset_id": "openai/humaneval",
            "description": "A benchmark for evaluating language models on code generation tasks. It consists of 164 programming problems with unit tests.",
            "metrics": {"pass@1": 1.0, "functional_correctness": 1.0}
        },
        {
            "name": "HellaSwag (Commonsense Reasoning)",
            "dataset_id": "hellaswag",
            "description": "A challenge dataset for evaluating commonsense natural language inference. It consists of multiple-choice questions about grounded situations.",
            "metrics": {"accuracy": 1.0}
        },
        {
            "name": "GSM8K (Grade School Math)",
            "dataset_id": "gsm8k",
            "description": "A dataset of 8.5K high quality grade school math word problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations.",
            "metrics": {"accuracy": 1.0, "correct_steps": 1.0}
        },
        {
            "name": "TruthfulQA",
            "dataset_id": "truthful_qa",
            "description": "A benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance and politics.",
            "metrics": {"accuracy": 1.0, "truthfulness": 1.0}
        }
    ]

    # Add each benchmark to the database
    for benchmark in sample_benchmarks:
        benchmark_id = db.add_benchmark(
            name=benchmark["name"],
            dataset_id=benchmark["dataset_id"],
            description=benchmark["description"],
            metrics=benchmark["metrics"]
        )

        print(f"Added benchmark '{benchmark['name']}' with ID: {benchmark_id}")

    # Close database connection
    db.close()

    return len(sample_benchmarks)

if __name__ == "__main__":
    num_added = add_sample_benchmarks()
    print(f"Added {num_added} sample benchmarks to the database.")