# Theme and configuration settings for the Model Capability Leaderboard application
# Import task mapping
from src.utils.task_mapping import task_display_names
# Theme colors - using dark mode by default.
# Maps a style-slot name to a CSS value (hex color, named color, rgba() or a
# linear-gradient() expression). All values are plain strings.
dark_theme = {
    # General page palette
    'bg_color': '#1a202c',
    'text_color': '#e2e8f0',
    'card_bg': '#2d3748',
    'primary': '#818cf8',
    'secondary': '#a78bfa',
    'border': '#4a5568',
    'hover': '#4a5568',
    'table_header': '#2d3748',
    'table_border': '#4a5568',
    'heading_color': '#e2e8f0',
    # Gradient running from the 'primary' color to the 'secondary' color
    'gradient': 'linear-gradient(135deg, #818cf8 0%, #a78bfa 100%)',
    # Warning / info box colors
    'warning_bg': '#7c2d12',
    'warning_border': '#f97316',
    'info_bg': '#1e3a8a',
    'info_border': '#3b82f6',
    # Title, subtitle and footer colors
    'footer_color': '#a0aec0',
    'title_color': 'white',
    'subtitle_color': 'rgba(255, 255, 255, 0.9)',
    'footer_border': '#4a5568',
    # Task section colors
    'task_title': '#a5b4fc',
    'task_border': '#818cf8',
    # Table-specific colors for the custom table
    'table_bg': '#0a0a0a',
    'table_border_color': '#333',
    'table_header_bg': '#191919',
    'table_subheader_bg': '#141414',
    'table_average_column_bg': '#202020',
    'table_row_odd': '#0a0a0a',
    'table_row_even': '#111111',
    'table_hover_bg': '#1a1a1a',
    # Text colors for positive / negative values
    'positive_value_color': '#4ade80',
    'negative_value_color': '#f87171',
}
# Application settings: page title, one-line description and page layout.
# NOTE(review): 'layout' and 'initial_sidebar_state' look like Streamlit
# page-config options - confirm against the code that consumes this dict.
app_config = {
    'title': 'MLRC-Bench Leaderboard',
    'description': 'Machine Learning Research Challenges Benchmark for AI Agents',
    'layout': 'wide',
    'initial_sidebar_state': 'collapsed',
}
# Metrics configuration.
# Maps a metric's name to its settings:
#   file        - path of the JSON file associated with the metric
#   description - human-readable explanation, including the formula
#   min_value   - lower bound of the value range (approximate)
#   max_value   - upper bound of the value range (approximate)
#   color_map   - name of the color map for this metric
metrics_config = {
    "Relative Improvement to Human": {
        "file": "src/data/metrics/relative_improvement_to_human.json",
        "description": "Measures how much of the performance gap between baseline and human the agent has closed. Calculated as: (Agent performance - Baseline) / (Human - Baseline) × 100%.",
        "min_value": -100,  # Approximate, adjust as needed
        "max_value": 50,  # Approximate, adjust as needed
        "color_map": "RdYlGn",
    },
    "Absolute Improvement to Baseline": {
        "file": "src/data/metrics/absolute_improvement_to_baseline.json",
        "description": "Measures the percentage improvement over the baseline performance. Calculated as: (Agent performance - Baseline) / Baseline × 100%.",
        "min_value": -100,  # Approximate, adjust as needed
        "max_value": 100,  # Approximate, adjust as needed
        "color_map": "RdYlGn",
    },
    # Future metrics can be added here, following the same shape:
    # "Another Metric": {
    #     "file": "src/data/metrics/another_metric.json",
    #     "description": "Description of another metric",
    #     "min_value": 0,
    #     "max_value": 100,
    #     "color_map": "viridis"
    # }
}
# Model type categories.
# Maps each leaderboard entry's name to one of the category labels
# "Closed Source", "Open Weights" or "Human".
model_categories = {
    "MLAB (claude-3-5-sonnet-v2)": "Closed Source",
    "MLAB (gemini-exp-1206)": "Closed Source",
    "MLAB (o3-mini)": "Closed Source",
    "MLAB (gpt-4o)": "Closed Source",
    "MLAB (llama3-1-405b-instruct)": "Open Weights",
    "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
    "Human": "Human",
    "Top Human in Competition": "Human",
    # More models would be added here as needed
}
# Task descriptions.
# Maps a task's display name to a one-sentence description of the task.
# Each key is looked up in task_display_names by the raw task name; when the
# raw name is missing, the literal second argument is used as the key instead.
# NOTE(review): assumes task_display_names is a dict-like mapping of raw task
# name -> display name; if two raw names resolve to the same display name,
# the later entry silently overwrites the earlier one.
tasks_info = {
    task_display_names.get("Perception Temporal Action Loc", "Temporal Action Localisation"):
        "Testing the model's ability to understand and localize actions within temporal sequences of events.",
    task_display_names.get("Llm Merging", "LLM Merging"):
        "Assessing the capability to effectively merge knowledge from multiple language models.",
    task_display_names.get("Meta Learning", "Meta Learning"):
        "Evaluating the model's ability to learn how to learn - adapting quickly to new tasks.",
    task_display_names.get("Product Recommendation", "Next Product Recommendation"):
        "Testing the model's ability to recommend relevant products based on user preferences and behavior.",
    task_display_names.get("Machine Unlearning", "Machine Unlearning"):
        "Evaluating how well models can 'unlearn' specific information when required.",
    task_display_names.get("Backdoor Trigger Recovery", "Backdoor Trigger Recovery"):
        "Testing resilience against backdoor attacks and ability to recover from triggered behaviors.",
}