MLRC_Bench

Running

App Files Files Community

MLRC_Bench / src /utils /config.py

Armeddinosaur

Updating table

06d4ee9 4 months ago

raw

history blame

4.28 kB

	# Theme and configuration settings for the Model Capability Leaderboard application

	# Import task mapping
	from src.utils.task_mapping import task_display_names

	# Dark-mode palette, used as the default theme.
	# Every value is a CSS-style color or gradient string.
	dark_theme = {
	    # General UI colors
	    "bg_color": "#1a202c",
	    "text_color": "#e2e8f0",
	    "card_bg": "#2d3748",
	    "primary": "#818cf8",
	    "secondary": "#a78bfa",
	    "border": "#4a5568",
	    "hover": "#4a5568",
	    "table_header": "#2d3748",
	    "table_border": "#4a5568",
	    "heading_color": "#e2e8f0",
	    "gradient": "linear-gradient(135deg, #818cf8 0%, #a78bfa 100%)",
	    "warning_bg": "#7c2d12",
	    "warning_border": "#f97316",
	    "info_bg": "#1e3a8a",
	    "info_border": "#3b82f6",
	    "footer_color": "#a0aec0",
	    "title_color": "white",
	    "subtitle_color": "rgba(255, 255, 255, 0.9)",
	    "footer_border": "#4a5568",
	    "task_title": "#a5b4fc",
	    "task_border": "#818cf8",
	    # Colors used only by the custom table
	    "table_bg": "#0a0a0a",
	    "table_border_color": "#333",
	    "table_header_bg": "#191919",
	    "table_subheader_bg": "#141414",
	    "table_average_column_bg": "#202020",
	    "table_row_odd": "#0a0a0a",
	    "table_row_even": "#111111",
	    "table_hover_bg": "#1a1a1a",
	    "positive_value_color": "#4ade80",
	    "negative_value_color": "#f87171",
	}

	# Top-level application settings: page title, short description, layout
	# mode and the sidebar's initial state.
	# NOTE(review): 'layout' / 'initial_sidebar_state' look like page-config
	# options of the UI framework — confirm against the code that reads them.
	app_config = {
	    "title": "MLRC-Bench Leaderboard",
	    "description": "Machine Learning Research Challenges Benchmark for AI Agents",
	    "layout": "wide",
	    "initial_sidebar_state": "collapsed",
	}

	# Metrics configuration, keyed by the metric's display name.
	#
	# Each entry provides:
	#   file        - path to the JSON file holding the metric's data
	#   description - human-readable explanation, including the formula
	#   min_value   - approximate lower bound; adjust as needed
	#   max_value   - approximate upper bound; adjust as needed
	#   color_map   - name of the color map to use (e.g. "RdYlGn", "viridis")
	#
	# To add a metric, append another entry with the same five keys.
	metrics_config = {
	    "Relative Improvement to Human": {
	        "file": "src/data/metrics/relative_improvement_to_human.json",
	        "description": (
	            "Measures how much of the performance gap between baseline and "
	            "human the agent has closed. Calculated as: "
	            "(Agent performance - Baseline) / (Human - Baseline) × 100%."
	        ),
	        "min_value": -100,
	        "max_value": 50,
	        "color_map": "RdYlGn",
	    },
	    "Absolute Improvement to Baseline": {
	        "file": "src/data/metrics/absolute_improvement_to_baseline.json",
	        "description": (
	            "Measures the percentage improvement over the baseline performance. "
	            "Calculated as: (Agent performance - Baseline) / Baseline × 100%."
	        ),
	        "min_value": -100,
	        "max_value": 100,
	        "color_map": "RdYlGn",
	    },
	}

	# Category labels, named once so the repeated values cannot drift apart.
	_CLOSED_SOURCE = "Closed Source"
	_OPEN_WEIGHTS = "Open Weights"
	_HUMAN = "Human"

	# Maps each leaderboard entry name to its model-type category.
	# Add new models here as they appear on the leaderboard.
	model_categories = {
	    "MLAB (claude-3-5-sonnet-v2)": _CLOSED_SOURCE,
	    "MLAB (gemini-exp-1206)": _CLOSED_SOURCE,
	    "MLAB (o3-mini)": _CLOSED_SOURCE,
	    "MLAB (gpt-4o)": _CLOSED_SOURCE,
	    "MLAB (llama3-1-405b-instruct)": _OPEN_WEIGHTS,
	    "CoI-Agent (o1) + MLAB (gpt-4o)": _CLOSED_SOURCE,
	    "Human": _HUMAN,
	    "Top Human in Competition": _HUMAN,
	}

	# Rows of (lookup key in task_display_names, fallback display name,
	# task description), in display order.
	_TASK_DESCRIPTION_ROWS = (
	    (
	        "Perception Temporal Action Loc",
	        "Temporal Action Localisation",
	        "Testing the model's ability to understand and localize actions within temporal sequences of events.",
	    ),
	    (
	        "Llm Merging",
	        "LLM Merging",
	        "Assessing the capability to effectively merge knowledge from multiple language models.",
	    ),
	    (
	        "Meta Learning",
	        "Meta Learning",
	        "Evaluating the model's ability to learn how to learn - adapting quickly to new tasks.",
	    ),
	    (
	        "Product Recommendation",
	        "Next Product Recommendation",
	        "Testing the model's ability to recommend relevant products based on user preferences and behavior.",
	    ),
	    (
	        "Machine Unlearning",
	        "Machine Unlearning",
	        "Evaluating how well models can 'unlearn' specific information when required.",
	    ),
	    (
	        "Backdoor Trigger Recovery",
	        "Backdoor Trigger Recovery",
	        "Testing resilience against backdoor attacks and ability to recover from triggered behaviors.",
	    ),
	)

	# Task descriptions keyed by display name. The display name comes from
	# task_display_names, falling back to the hard-coded name when the lookup
	# key is absent.
	tasks_info = {
	    task_display_names.get(lookup_key, fallback_name): description
	    for lookup_key, fallback_name, description in _TASK_DESCRIPTION_ROWS
	}