""" GuardBench Leaderboard Application """ import os import json import tempfile import logging import gradio as gr from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns import pandas as pd from apscheduler.schedulers.background import BackgroundScheduler from src.about import ( CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE, ) from src.display.css_html_js import custom_css from src.display.utils import ( GUARDBENCH_COLUMN, DISPLAY_COLS, METRIC_COLS, HIDDEN_COLS, NEVER_HIDDEN_COLS, CATEGORIES, TEST_TYPES, ModelType, Precision, WeightType ) from src.display.formatting import styled_message, styled_error, styled_warning from src.envs import ( ADMIN_USERNAME, ADMIN_PASSWORD, RESULTS_DATASET_ID, SUBMITTER_TOKEN, TOKEN, DATA_PATH ) from src.populate import get_leaderboard_df, download_leaderboard_data, get_category_leaderboard_df from src.submission.submit import process_submission # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Ensure data directory exists os.makedirs(DATA_PATH, exist_ok=True) # Available benchmark versions BENCHMARK_VERSIONS = ["v0"] CURRENT_VERSION = "v0" # Initialize leaderboard data try: logger.info("Initializing leaderboard data...") LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION) logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries") except Exception as e: logger.error(f"Error loading leaderboard data: {e}") LEADERBOARD_DF = pd.DataFrame() def init_leaderboard(dataframe): """ Initialize the leaderboard component. """ if dataframe is None or dataframe.empty: # Create an empty dataframe with the right columns columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS] dataframe = pd.DataFrame(columns=columns) logger.warning("Initializing empty leaderboard") return Leaderboard( value=dataframe, datatype=[getattr(GUARDBENCH_COLUMN, col).type for col in DISPLAY_COLS], select_columns=SelectColumns( default_selection=[getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS], cant_deselect=[getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS], label="Select Columns to Display:", ), search_columns=[GUARDBENCH_COLUMN.model_name.name], hide_columns=[getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS], filter_columns=[ ColumnFilter(GUARDBENCH_COLUMN.model_type.name, type="checkboxgroup", label="Model types"), ], interactive=False, ) def submit_results( model_name: str, base_model: str, revision: str, precision: str, weight_type: str, model_type: str, submission_file: tempfile._TemporaryFileWrapper, version: str ): """ Handle submission of results with model metadata. """ if submission_file is None: return styled_error("No submission file provided") if not model_name: return styled_error("Model name is required") if not model_type: return styled_error("Please select a model type") file_path = submission_file.name logger.info(f"Received submission for model {model_name}: {file_path}") # Add metadata to the submission metadata = { "model_name": model_name, "base_model": base_model, "revision": revision if revision else "main", "precision": precision, "weight_type": weight_type, "model_type": model_type, "version": version } # Process the submission result = process_submission(file_path, metadata, version=version) # Refresh the leaderboard data global LEADERBOARD_DF try: logger.info(f"Refreshing leaderboard data after submission for version {version}...") LEADERBOARD_DF = get_leaderboard_df(version=version) logger.info("Refreshed leaderboard data after submission") except Exception as e: logger.error(f"Error refreshing leaderboard data: {e}") return result def refresh_data(version=CURRENT_VERSION): """ Refresh the leaderboard data from HuggingFace. """ global LEADERBOARD_DF try: logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...") LEADERBOARD_DF = get_leaderboard_df(version=version) logger.info("Scheduled refresh of leaderboard data completed") except Exception as e: logger.error(f"Error in scheduled refresh: {e}") return LEADERBOARD_DF def update_leaderboards(version): """ Update all leaderboard components with data for the selected version. """ new_df = get_leaderboard_df(version=version) category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES] return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs] # Create Gradio app demo = gr.Blocks(css=custom_css) with demo: gr.HTML(TITLE) with gr.Row(): with gr.Column(scale=3): gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") with gr.Column(scale=1): version_selector = gr.Dropdown( choices=BENCHMARK_VERSIONS, label="Benchmark Version", value=CURRENT_VERSION, interactive=True, elem_classes="version-selector" ) with gr.Tabs(elem_classes="tab-buttons") as tabs: with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0): refresh_button = gr.Button("Refresh Leaderboard") # Create tabs for each category with gr.Tabs(elem_classes="category-tabs") as category_tabs: # First tab for average metrics across all categories with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"): leaderboard = init_leaderboard(LEADERBOARD_DF) # Create a tab for each category for category in CATEGORIES: with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"): category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION) category_leaderboard = init_leaderboard(category_df) # Refresh button functionality refresh_button.click( fn=lambda: [ init_leaderboard(get_leaderboard_df(version=version_selector.value)), *[init_leaderboard(get_category_leaderboard_df(category, version=version_selector.value)) for category in CATEGORIES] ], inputs=[], outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)] ) with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=1): gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=2): gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") with gr.Row(): gr.Markdown("# âœ‰ī¸âœ¨ Submit your results here!", elem_classes="markdown-text") with gr.Row(): with gr.Column(): model_name_textbox = gr.Textbox(label="Model name") revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main") model_type = gr.Dropdown( choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], label="Model type", multiselect=False, value=None, interactive=True, ) with gr.Column(): precision = gr.Dropdown( choices=[i.name for i in Precision if i != Precision.Unknown], label="Precision", multiselect=False, value="float16", interactive=True, ) weight_type = gr.Dropdown( choices=[i.name for i in WeightType], label="Weights type", multiselect=False, value="Original", interactive=True, ) base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)") with gr.Row(): file_input = gr.File( label="Upload JSONL Results File", file_types=[".jsonl"] ) submit_button = gr.Button("Submit Results") result_output = gr.Markdown() submit_button.click( fn=submit_results, inputs=[ model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type, file_input, version_selector ], outputs=result_output ) # Version selector functionality version_selector.change( fn=update_leaderboards, inputs=[version_selector], outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)] ) with gr.Row(): with gr.Accordion("📙 Citation", open=False): citation_button = gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=10, elem_id="citation-button", show_copy_button=True, ) with gr.Accordion("â„šī¸ Dataset Information", open=False): dataset_info = gr.Markdown(f""" ## Dataset Information Results are stored in the HuggingFace dataset: [{RESULTS_DATASET_ID}](https://huggingface.co/datasets/{RESULTS_DATASET_ID}) Last updated: {pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S UTC")} """) scheduler = BackgroundScheduler() scheduler.add_job(lambda: refresh_data(version=CURRENT_VERSION), 'interval', minutes=30) scheduler.start() # Launch the app if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860, share=True)