""" GuardBench Leaderboard Application """ import os import json import tempfile import logging import gradio as gr from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns import pandas as pd import plotly.express as px import plotly.graph_objects as go from apscheduler.schedulers.background import BackgroundScheduler from src.about import ( CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE, ) from src.display.css_html_js import custom_css from src.display.utils import ( GUARDBENCH_COLUMN, DISPLAY_COLS, METRIC_COLS, HIDDEN_COLS, NEVER_HIDDEN_COLS, CATEGORIES, TEST_TYPES, ModelType, Precision, WeightType, GuardModelType ) from src.display.formatting import styled_message, styled_error, styled_warning from src.envs import ( ADMIN_USERNAME, ADMIN_PASSWORD, RESULTS_DATASET_ID, SUBMITTER_TOKEN, TOKEN, DATA_PATH ) from src.populate import get_leaderboard_df, get_category_leaderboard_df from src.submission.submit import process_submission # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Ensure data directory exists os.makedirs(DATA_PATH, exist_ok=True) # Available benchmark versions BENCHMARK_VERSIONS = ["v0"] CURRENT_VERSION = "v0" # Initialize leaderboard data try: logger.info("Initializing leaderboard data...") LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION) logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries") except Exception as e: logger.error(f"Error loading leaderboard data: {e}") LEADERBOARD_DF = pd.DataFrame() print(DISPLAY_COLS) def init_leaderboard(dataframe): """ Initialize the leaderboard component. """ if dataframe is None or dataframe.empty: # Create an empty dataframe with the right columns columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS] dataframe = pd.DataFrame(columns=columns) logger.warning("Initializing empty leaderboard") return Leaderboard( value=dataframe, datatype=[getattr(GUARDBENCH_COLUMN, col).type for col in DISPLAY_COLS], select_columns=SelectColumns( default_selection=[getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS], cant_deselect=[getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS], label="Select Columns to Display:", ), search_columns=[GUARDBENCH_COLUMN.model_name.name], hide_columns=[getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS], filter_columns=[ ColumnFilter(GUARDBENCH_COLUMN.model_type.name, type="checkboxgroup", label="Model types"), ], interactive=False, ) def submit_results( model_name: str, base_model: str, revision: str, precision: str, weight_type: str, model_type: str, submission_file: tempfile._TemporaryFileWrapper, version: str, guard_model_type: GuardModelType ): """ Handle submission of results with model metadata. """ if submission_file is None: return styled_error("No submission file provided") if not model_name: return styled_error("Model name is required") if not model_type: return styled_error("Please select a model type") file_path = submission_file.name logger.info(f"Received submission for model {model_name}: {file_path}") # Add metadata to the submission metadata = { "model_name": model_name, "base_model": base_model, "revision": revision if revision else "main", "precision": precision, "weight_type": weight_type, "model_type": model_type, "version": version, "guard_model_type": guard_model_type } # Process the submission result = process_submission(file_path, metadata, version=version) # Refresh the leaderboard data global LEADERBOARD_DF try: logger.info(f"Refreshing leaderboard data after submission for version {version}...") LEADERBOARD_DF = get_leaderboard_df(version=version) logger.info("Refreshed leaderboard data after submission") except Exception as e: logger.error(f"Error refreshing leaderboard data: {e}") return result def refresh_data(version=CURRENT_VERSION): """ Refresh the leaderboard data from HuggingFace. """ global LEADERBOARD_DF try: logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...") new_df = get_leaderboard_df(version=version) if new_df is not None and not new_df.empty: LEADERBOARD_DF = new_df logger.info("Scheduled refresh of leaderboard data completed") else: logger.warning("Refresh returned empty data, keeping existing data") # If empty, create a dataframe with correct columns if LEADERBOARD_DF is None or LEADERBOARD_DF.empty: columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS] LEADERBOARD_DF = pd.DataFrame(columns=columns) except Exception as e: logger.error(f"Error in scheduled refresh: {e}") # Ensure we have at least an empty dataframe with correct columns if LEADERBOARD_DF is None or LEADERBOARD_DF.empty: columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS] LEADERBOARD_DF = pd.DataFrame(columns=columns) return LEADERBOARD_DF def update_leaderboards(version): """ Update all leaderboard components with data for the selected version. """ new_df = get_leaderboard_df(version=version) category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES] return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs] def create_performance_plot(selected_models, category, metric="f1_binary", version=CURRENT_VERSION): """ Create a radar plot comparing model performance for selected models. """ if category == "📊 Overall Performance": df = get_leaderboard_df(version=version) else: df = get_category_leaderboard_df(category, version=version) if df.empty: return go.Figure() # Filter for selected models df = df[df['model_name'].isin(selected_models)] # Get the relevant metric columns metric_cols = [col for col in df.columns if metric in col] # Create figure fig = go.Figure() # Custom colors for different models colors = ['#8FCCCC', '#C2A4B6', '#98B4A6', '#B68F7C'] # Pale Cyan, Pale Pink, Pale Green, Pale Orange # Add traces for each model for idx, model in enumerate(selected_models): model_data = df[df['model_name'] == model] if not model_data.empty: values = model_data[metric_cols].values[0].tolist() # Add the first value again at the end to complete the polygon values = values + [values[0]] # Clean up test type names categories = [col.replace(f'_{metric}', '') for col in metric_cols] # Add the first category again at the end to complete the polygon categories = categories + [categories[0]] fig.add_trace(go.Scatterpolar( r=values, theta=categories, name=model, line_color=colors[idx % len(colors)], fill='toself' )) # Update layout with all settings at once fig.update_layout( paper_bgcolor='#000000', plot_bgcolor='#000000', font={'color': '#ffffff'}, title={ 'text': f'{category} - {metric.upper()} Score Comparison', 'font': {'color': '#ffffff', 'size': 24} }, polar=dict( bgcolor='#000000', radialaxis=dict( visible=True, range=[0, 1], gridcolor='#333333', linecolor='#333333', tickfont={'color': '#ffffff'}, ), angularaxis=dict( gridcolor='#333333', linecolor='#333333', tickfont={'color': '#ffffff'}, ) ), height=600, showlegend=True, legend=dict( yanchor="top", y=0.99, xanchor="right", x=0.99, bgcolor='rgba(0,0,0,0.5)', font={'color': '#ffffff'} ) ) return fig def update_model_choices(version): """ Update the list of available models for the given version. """ df = get_leaderboard_df(version=version) if df.empty: return [] return sorted(df['model_name'].unique().tolist()) def update_visualization(selected_models, selected_category, selected_metric, version): """ Update the visualization based on user selections. """ if not selected_models: return go.Figure() return create_performance_plot(selected_models, selected_category, selected_metric, version) # Create Gradio app demo = gr.Blocks(css=custom_css) with demo: gr.HTML(TITLE) gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") with gr.Row(): tabs = gr.Tabs(elem_classes="tab-buttons") with tabs: with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0): with gr.Row(): refresh_button = gr.Button("Refresh Leaderboard", scale=3) version_selector = gr.Dropdown( choices=BENCHMARK_VERSIONS, label="Benchmark Version", value=CURRENT_VERSION, interactive=True, elem_classes="version-selector", scale=1 ) # Create tabs for each category with gr.Tabs(elem_classes="category-tabs") as category_tabs: # First tab for average metrics across all categories with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"): leaderboard = init_leaderboard(LEADERBOARD_DF) # Create a tab for each category for category in CATEGORIES: with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"): category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION) category_leaderboard = init_leaderboard(category_df) # Refresh button functionality refresh_button.click( fn=lambda: [ init_leaderboard(get_leaderboard_df(version=version_selector.value)), *[init_leaderboard(get_category_leaderboard_df(category, version=version_selector.value)) for category in CATEGORIES] ], inputs=[], outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)] ) with gr.TabItem("📊 Visualize", elem_id="guardbench-viz-tab", id=1): with gr.Row(): with gr.Column(): viz_version_selector = gr.Dropdown( choices=BENCHMARK_VERSIONS, label="Benchmark Version", value=CURRENT_VERSION, interactive=True ) model_selector = gr.Dropdown( choices=update_model_choices(CURRENT_VERSION), label="Select Models to Compare", multiselect=True, interactive=True ) with gr.Column(): # Add Overall Performance to categories viz_categories = ["📊 Overall Performance"] + CATEGORIES category_selector = gr.Dropdown( choices=viz_categories, label="Select Category", value=viz_categories[0], interactive=True ) metric_selector = gr.Dropdown( choices=["f1_binary", "precision_binary", "recall_binary"], label="Select Metric", value="f1_binary", interactive=True ) plot_output = gr.Plot() # Update visualization when any selector changes for control in [viz_version_selector, model_selector, category_selector, metric_selector]: control.change( fn=update_visualization, inputs=[model_selector, category_selector, metric_selector, viz_version_selector], outputs=plot_output ) # Update model choices when version changes viz_version_selector.change( fn=update_model_choices, inputs=[viz_version_selector], outputs=[model_selector] ) with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=2): gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=3): gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") with gr.Row(): with gr.Column(scale=3): gr.Markdown("# âœ‰ī¸âœ¨ Submit your results here!", elem_classes="markdown-text") with gr.Column(scale=1): # Add version selector specifically for the submission tab submission_version_selector = gr.Dropdown( choices=BENCHMARK_VERSIONS, label="Benchmark Version", value=CURRENT_VERSION, interactive=True, elem_classes="version-selector" ) with gr.Row(): with gr.Column(): model_name_textbox = gr.Textbox(label="Model name") revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main") model_type = gr.Dropdown( choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], label="Model type", multiselect=False, value=None, interactive=True, ) guard_model_type = gr.Dropdown( choices=[t.name for t in GuardModelType], label="Guard model type", multiselect=False, value=GuardModelType.LLM_REGEXP.name, interactive=True, ) with gr.Column(): precision = gr.Dropdown( choices=[i.name for i in Precision if i != Precision.Unknown], label="Precision", multiselect=False, value="float16", interactive=True, ) weight_type = gr.Dropdown( choices=[i.name for i in WeightType], label="Weights type", multiselect=False, value="Original", interactive=True, ) base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)") with gr.Row(): file_input = gr.File( label="Upload JSONL Results File", file_types=[".jsonl"] ) submit_button = gr.Button("Submit Results") result_output = gr.Markdown() submit_button.click( fn=submit_results, inputs=[ model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type, file_input, submission_version_selector, guard_model_type ], outputs=result_output ) # Version selector functionality version_selector.change( fn=update_leaderboards, inputs=[version_selector], outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)] ) with gr.Row(): with gr.Accordion("📙 Citation", open=False): citation_button = gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=10, elem_id="citation-button", show_copy_button=True, ) with gr.Accordion("â„šī¸ Dataset Information", open=False): dataset_info = gr.Markdown(f""" ## Dataset Information Results are stored in the HuggingFace dataset: [{RESULTS_DATASET_ID}](https://huggingface.co/datasets/{RESULTS_DATASET_ID}) Last updated: {pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S UTC")} """) scheduler = BackgroundScheduler() scheduler.add_job(lambda: refresh_data(version=CURRENT_VERSION), 'interval', minutes=30) scheduler.start() # Launch the app if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860, share=True)