# Theme and configuration settings for the Model Capability Leaderboard application

# Import task mapping
from src.utils.task_mapping import task_display_names

# Theme colors - using dark mode by default
dark_theme = {
    'bg_color': '#1a202c',
    'text_color': '#e2e8f0',
    'card_bg': '#2d3748',
    'primary': '#818cf8',
    'secondary': '#a78bfa',
    'border': '#4a5568',
    'hover': '#4a5568',
    'table_header': '#2d3748',
    'table_border': '#4a5568',
    'heading_color': '#e2e8f0',
    'gradient': 'linear-gradient(135deg, #818cf8 0%, #a78bfa 100%)',
    'warning_bg': '#7c2d12',
    'warning_border': '#f97316',
    'info_bg': '#1e3a8a',
    'info_border': '#3b82f6',
    'footer_color': '#a0aec0',
    'title_color': 'white',
    'subtitle_color': 'rgba(255, 255, 255, 0.9)',
    'footer_border': '#4a5568',
    'task_title': '#a5b4fc',
    'task_border': '#818cf8',
    # Table-specific colors for the custom table
    'table_bg': '#0a0a0a',
    'table_border_color': '#333',
    'table_header_bg': '#191919',
    'table_subheader_bg': '#141414',
    'table_average_column_bg': '#202020',
    'table_row_odd': '#0a0a0a',
    'table_row_even': '#111111',
    'table_hover_bg': '#1a1a1a',
    'positive_value_color': '#4ade80',
    'negative_value_color': '#f87171'
}
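
# Illustrative sketch (not part of the original config): one way the palette above
# could reach the UI is by rendering it as CSS custom properties. The helper name
# `theme_to_css_variables` is hypothetical; the app may inject the colors differently.
def theme_to_css_variables(theme: dict) -> str:
    """Render a theme dict as a `:root { --key: value; }` CSS block."""
    lines = [f"    --{key.replace('_', '-')}: {value};" for key, value in theme.items()]
    return ":root {\n" + "\n".join(lines) + "\n}"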

# Application settings
app_config = {
    'title': 'MLRC-Bench Leaderboard',
    'description': 'Machine Learning Research Challenges Benchmark for AI Agents',
    'layout': 'wide',
    'initial_sidebar_state': 'collapsed'
}
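
# Illustrative usage (assumption): the keys in `app_config` mirror the arguments of
# Streamlit's `st.set_page_config`, so the app entry point could apply them roughly
# like this. Kept as a comment because page config must be set from the app itself.
# import streamlit as st
# st.set_page_config(
#     page_title=app_config['title'],
#     layout=app_config['layout'],
#     initial_sidebar_state=app_config['initial_sidebar_state'],
# )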

# Metrics configuration
metrics_config = {
    "Relative Improvement to Human": {
        "file": "src/data/metrics/relative_improvement_to_human.json",
        "description": "Measures how much of the performance gap between baseline and human the agent has closed. Calculated as: (Agent performance - Baseline) / (Human - Baseline) × 100%.",
        "min_value": -100,  # Approximate, adjust as needed
        "max_value": 50,  # Approximate, adjust as needed
        "color_map": "RdYlGn"
    },
    "Absolute Improvement to Baseline": {
        "file": "src/data/metrics/absolute_improvement_to_baseline.json",
        "description": "Measures the percentage improvement over the baseline performance. Calculated as: (Agent performance - Baseline) / Baseline × 100%.",
        "min_value": -100,  # Approximate, adjust as needed
        "max_value": 100,  # Approximate, adjust as needed
        "color_map": "RdYlGn"
    }
    # Future metrics can be added here
    # "Another Metric": {
    #     "file": "src/data/metrics/another_metric.json",
    #     "description": "Description of another metric",
    #     "min_value": 0,
    #     "max_value": 100,
    #     "color_map": "viridis"
    # }
}
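
# Illustrative sketch (not part of the original file): the two formulas quoted in the
# metric descriptions above, written out as plain functions. Argument names are
# hypothetical; the actual computation lives wherever the metric JSON files are produced.
def relative_improvement_to_human(agent: float, baseline: float, human: float) -> float:
    """(Agent performance - Baseline) / (Human - Baseline) × 100%."""
    return (agent - baseline) / (human - baseline) * 100


def absolute_improvement_to_baseline(agent: float, baseline: float) -> float:
    """(Agent performance - Baseline) / Baseline × 100%."""
    return (agent - baseline) / baseline * 100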

# Model type categories
model_categories = {
    "MLAB (claude-3-5-sonnet-v2)": "Closed Source",
    "MLAB (gemini-exp-1206)": "Closed Source",
    "MLAB (o3-mini)": "Closed Source",
    "MLAB (gpt-4o)": "Closed Source",
    "MLAB (llama3-1-405b-instruct)": "Open Weights",
    "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
    "Human": "Human",
    "Top Human in Competition": "Human"
    # More models can be added here as needed
}

# Task descriptions
tasks_info = {
    task_display_names.get("Perception Temporal Action Loc", "Temporal Action Localisation"):
        "Testing the model's ability to understand and localize actions within temporal sequences of events.",
    task_display_names.get("Llm Merging", "LLM Merging"):
        "Assessing the capability to effectively merge knowledge from multiple language models.",
    task_display_names.get("Meta Learning", "Meta Learning"):
        "Evaluating the model's ability to learn how to learn, adapting quickly to new tasks.",
    task_display_names.get("Product Recommendation", "Next Product Recommendation"):
        "Testing the model's ability to recommend relevant products based on user preferences and behavior.",
    task_display_names.get("Machine Unlearning", "Machine Unlearning"):
        "Evaluating how well models can 'unlearn' specific information when required.",
    task_display_names.get("Backdoor Trigger Recovery", "Backdoor Trigger Recovery"):
        "Testing resilience against backdoor attacks and the ability to recover from triggered behaviors."
}