| """ | |
| Sample benchmarks initialization for Dynamic Highscores system. | |
| This script adds sample benchmarks to the database to provide initial options for users. | |
| """ | |
| from database_schema import DynamicHighscoresDB | |
| def add_sample_benchmarks(): | |
| """Add sample benchmarks to the database.""" | |
| # Initialize database | |
| db = DynamicHighscoresDB() | |
| # Sample benchmarks to add | |
| sample_benchmarks = [ | |
| { | |
| "name": "MMLU (Massive Multitask Language Understanding)", | |
| "dataset_id": "cais/mmlu", | |
| "description": "A benchmark for measuring massive multitask language understanding across 57 tasks including elementary mathematics, US history, computer science, law, and more.", | |
| "metrics": {"accuracy": 1.0, "consistency": 1.0} | |
| }, | |
| { | |
| "name": "HumanEval (Code Generation)", | |
| "dataset_id": "openai/humaneval", | |
| "description": "A benchmark for evaluating language models on code generation tasks. It consists of 164 programming problems with unit tests.", | |
| "metrics": {"pass@1": 1.0, "functional_correctness": 1.0} | |
| }, | |
| { | |
| "name": "HellaSwag (Commonsense Reasoning)", | |
| "dataset_id": "hellaswag", | |
| "description": "A challenge dataset for evaluating commonsense natural language inference. It consists of multiple-choice questions about grounded situations.", | |
| "metrics": {"accuracy": 1.0} | |
| }, | |
| { | |
| "name": "GSM8K (Grade School Math)", | |
| "dataset_id": "gsm8k", | |
| "description": "A dataset of 8.5K high quality grade school math word problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations.", | |
| "metrics": {"accuracy": 1.0, "correct_steps": 1.0} | |
| }, | |
| { | |
| "name": "TruthfulQA", | |
| "dataset_id": "truthful_qa", | |
| "description": "A benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance and politics.", | |
| "metrics": {"accuracy": 1.0, "truthfulness": 1.0} | |
| } | |
| ] | |
| # Add each benchmark to the database | |
| added_count = 0 | |
| for benchmark in sample_benchmarks: | |
| try: | |
| benchmark_id = db.add_benchmark( | |
| name=benchmark["name"], | |
| dataset_id=benchmark["dataset_id"], | |
| description=benchmark["description"], | |
| metrics=benchmark["metrics"] | |
| ) | |
| if benchmark_id: | |
| print(f"Added benchmark '{benchmark['name']}' with ID: {benchmark_id}") | |
| added_count += 1 | |
| except Exception as e: | |
| print(f"Error adding benchmark '{benchmark['name']}': {e}") | |
| # Close database connection | |
| db.close() | |
| return added_count | |
| if __name__ == "__main__": | |
| num_added = add_sample_benchmarks() | |
| print(f"Added {num_added} sample benchmarks to the database.") |