| """ | |
| Sample benchmarks initialization for Dynamic Highscores system. | |
| This script adds sample benchmarks to the database to provide initial options for users. | |
| """ | |
| from database_schema import DynamicHighscoresDB | |
| def add_sample_benchmarks(): | |
| """Add sample benchmarks to the database.""" | |
| # Initialize database | |
| db = DynamicHighscoresDB() | |
| # Sample benchmarks to add | |
| sample_benchmarks = [ | |
| { | |
| "name": "MMLU (Massive Multitask Language Understanding)", | |
| "dataset_id": "cais/mmlu", | |
| "description": "A benchmark for measuring massive multitask language understanding across 57 tasks including elementary mathematics, US history, computer science, law, and more.", | |
| "metrics": {"accuracy": 1.0, "consistency": 1.0} | |
| }, | |
| { | |
| "name": "HumanEval (Code Generation)", | |
| "dataset_id": "openai/humaneval", | |
| "description": "A benchmark for evaluating language models on code generation tasks. It consists of 164 programming problems with unit tests.", | |
| "metrics": {"pass@1": 1.0, "functional_correctness": 1.0} | |
| }, | |
| { | |
| "name": "HellaSwag (Commonsense Reasoning)", | |
| "dataset_id": "hellaswag", | |
| "description": "A challenge dataset for evaluating commonsense natural language inference. It consists of multiple-choice questions about grounded situations.", | |
| "metrics": {"accuracy": 1.0} | |
| }, | |
| { | |
| "name": "GSM8K (Grade School Math)", | |
| "dataset_id": "gsm8k", | |
| "description": "A dataset of 8.5K high quality grade school math word problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations.", | |
| "metrics": {"accuracy": 1.0, "correct_steps": 1.0} | |
| }, | |
| { | |
| "name": "TruthfulQA", | |
| "dataset_id": "truthful_qa", | |
| "description": "A benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance and politics.", | |
| "metrics": {"accuracy": 1.0, "truthfulness": 1.0} | |
| } | |
| ] | |
| # Add each benchmark to the database | |
| added_count = 0 | |
| for benchmark in sample_benchmarks: | |
| try: | |
| benchmark_id = db.add_benchmark( | |
| name=benchmark["name"], | |
| dataset_id=benchmark["dataset_id"], | |
| description=benchmark["description"], | |
| metrics=benchmark["metrics"] | |
| ) | |
| if benchmark_id: | |
| print(f"Added benchmark '{benchmark['name']}' with ID: {benchmark_id}") | |
| added_count += 1 | |
| except Exception as e: | |
| print(f"Error adding benchmark '{benchmark['name']}': {e}") | |
| # Close database connection | |
| db.close() | |
| return added_count | |
| if __name__ == "__main__": | |
| num_added = add_sample_benchmarks() | |
| print(f"Added {num_added} sample benchmarks to the database.") |