""" Sample benchmarks initialization for Dynamic Highscores system. This script adds sample benchmarks to the database to provide initial options for users. """ from database_schema import DynamicHighscoresDB def add_sample_benchmarks(): """Add sample benchmarks to the database.""" # Initialize database db = DynamicHighscoresDB() # Sample benchmarks to add sample_benchmarks = [ { "name": "MMLU (Massive Multitask Language Understanding)", "dataset_id": "cais/mmlu", "description": "A benchmark for measuring massive multitask language understanding across 57 tasks including elementary mathematics, US history, computer science, law, and more.", "metrics": {"accuracy": 1.0, "consistency": 1.0} }, { "name": "HumanEval (Code Generation)", "dataset_id": "openai/humaneval", "description": "A benchmark for evaluating language models on code generation tasks. It consists of 164 programming problems with unit tests.", "metrics": {"pass@1": 1.0, "functional_correctness": 1.0} }, { "name": "HellaSwag (Commonsense Reasoning)", "dataset_id": "hellaswag", "description": "A challenge dataset for evaluating commonsense natural language inference. It consists of multiple-choice questions about grounded situations.", "metrics": {"accuracy": 1.0} }, { "name": "GSM8K (Grade School Math)", "dataset_id": "gsm8k", "description": "A dataset of 8.5K high quality grade school math word problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations.", "metrics": {"accuracy": 1.0, "correct_steps": 1.0} }, { "name": "TruthfulQA", "dataset_id": "truthful_qa", "description": "A benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance and politics.", "metrics": {"accuracy": 1.0, "truthfulness": 1.0} } ] # Add each benchmark to the database added_count = 0 for benchmark in sample_benchmarks: try: benchmark_id = db.add_benchmark( name=benchmark["name"], dataset_id=benchmark["dataset_id"], description=benchmark["description"], metrics=benchmark["metrics"] ) if benchmark_id: print(f"Added benchmark '{benchmark['name']}' with ID: {benchmark_id}") added_count += 1 except Exception as e: print(f"Error adding benchmark '{benchmark['name']}': {e}") # Close database connection db.close() return added_count if __name__ == "__main__": num_added = add_sample_benchmarks() print(f"Added {num_added} sample benchmarks to the database.")