# Theme and configuration settings for the Model Capability Leaderboard application

# Import task mapping
from src.utils.task_mapping import task_display_names

# Theme colors - using dark mode by default
dark_theme = {
    'bg_color': '#1a202c',
    'text_color': '#e2e8f0',
    'card_bg': '#2d3748',
    'primary': '#818cf8',
    'secondary': '#a78bfa',
    'border': '#4a5568',
    'hover': '#4a5568',
    'table_header': '#2d3748',
    'table_border': '#4a5568',
    'heading_color': '#e2e8f0',
    'gradient': 'linear-gradient(135deg, #818cf8 0%, #a78bfa 100%)',
    'warning_bg': '#7c2d12',
    'warning_border': '#f97316',
    'info_bg': '#1e3a8a',
    'info_border': '#3b82f6',
    'footer_color': '#a0aec0',
    'title_color': 'white',
    'subtitle_color': 'rgba(255, 255, 255, 0.9)',
    'footer_border': '#4a5568',
    'task_title': '#a5b4fc',
    'task_border': '#818cf8',
    # Table-specific colors for the custom table
    'table_bg': '#0a0a0a',
    'table_border_color': '#333',
    'table_header_bg': '#191919',
    'table_subheader_bg': '#141414',
    'table_average_column_bg': '#202020',
    'table_row_odd': '#0a0a0a',
    'table_row_even': '#111111',
    'table_hover_bg': '#1a1a1a',
    'positive_value_color': '#4ade80',
    'negative_value_color': '#f87171'
}

# Application settings
app_config = {
    'title': 'MLRC-Bench Leaderboard',
    'description': 'Machine Learning Research Challenges Benchmark for AI Agents',
    'layout': 'wide',
    'initial_sidebar_state': 'collapsed'
}

# Metrics configuration
metrics_config = {
    "Relative Improvement to Human": {
        "file": "src/data/metrics/relative_improvement_to_human.json",
        "description": (
            "Measures how much of the performance gap between baseline and human "
            "the agent has closed. Calculated as: "
            "(Agent performance - Baseline) / (Human - Baseline) × 100%."
        ),
        "min_value": -100,  # Approximate, adjust as needed
        "max_value": 50,    # Approximate, adjust as needed
        "color_map": "RdYlGn"
    },
    "Absolute Improvement to Baseline": {
        "file": "src/data/metrics/absolute_improvement_to_baseline.json",
        "description": (
            "Measures the percentage improvement over the baseline performance. "
            "Calculated as: (Agent performance - Baseline) / Baseline × 100%."
        ),
        "min_value": -100,  # Approximate, adjust as needed
        "max_value": 100,   # Approximate, adjust as needed
        "color_map": "RdYlGn"
    }
    # Future metrics can be added here
    # "Another Metric": {
    #     "file": "src/data/metrics/another_metric.json",
    #     "description": "Description of another metric",
    #     "min_value": 0,
    #     "max_value": 100,
    #     "color_map": "viridis"
    # }
}

# Model type categories
model_categories = {
    "MLAB (claude-3-5-sonnet-v2)": "Closed Source",
    "MLAB (gemini-exp-1206)": "Closed Source",
    "MLAB (o3-mini)": "Closed Source",
    "MLAB (gpt-4o)": "Closed Source",
    "MLAB (llama3-1-405b-instruct)": "Open Weights",
    "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
    "Human": "Human",
    "Top Human in Competition": "Human",
    "Human Idea + MLAB (gpt-4o)": "Closed Source"
    # More models would be added here as needed
}

# Task descriptions
tasks_info = {
    task_display_names.get("Perception Temporal Action Loc", "Temporal Action Localisation"):
        "Testing the model's ability to understand and localize actions within temporal sequences of events.",
    task_display_names.get("Llm Merging", "LLM Merging"):
        "Assessing the capability to effectively merge knowledge from multiple language models.",
    task_display_names.get("Meta Learning", "Meta Learning"):
        "Evaluating the model's ability to learn how to learn - adapting quickly to new tasks.",
    task_display_names.get("Product Recommendation", "Next Product Recommendation"):
        "Testing the model's ability to recommend relevant products based on user preferences and behavior.",
    task_display_names.get("Machine Unlearning", "Machine Unlearning"):
        "Evaluating how well models can 'unlearn' specific information when required.",
    task_display_names.get("Backdoor Trigger Recovery", "Backdoor Trigger Recovery"):
        "Testing resilience against backdoor attacks and ability to recover from triggered behaviors.",
    task_display_names.get("Rainfall Prediction", "Rainfall Prediction"):
        "Testing the model's ability to predict rainfall based on historical data and weather patterns."
}
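
# --- Usage sketch (illustrative only) ----------------------------------------
# A minimal, hypothetical example of how a consumer of this config might load a
# metric file and recompute the "Relative Improvement to Human" formula from the
# description above. The JSON layout (a {model: {task: score}} mapping) and the
# raw score values are assumptions for illustration, not part of this module.
if __name__ == "__main__":
    import json

    metric = metrics_config["Relative Improvement to Human"]
    try:
        with open(metric["file"], encoding="utf-8") as f:
            scores = json.load(f)  # assumed shape: {model_name: {task_name: value}}
        print(f"Loaded {len(scores)} entries from {metric['file']}")
    except FileNotFoundError:
        print(f"Metric file not found: {metric['file']}")

    # Worked example of the documented formula:
    # (Agent performance - Baseline) / (Human - Baseline) × 100%
    agent, baseline, human = 0.62, 0.50, 0.80  # hypothetical raw scores
    relative_improvement = (agent - baseline) / (human - baseline) * 100
    print(f"Relative improvement to human: {relative_improvement:.1f}%")  # 40.0%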