| """ | |
| Main module for the WhisperKit Evaluation Dashboard. | |
| This module sets up and runs the Gradio interface for the WhisperKit Evaluation Dashboard, | |
| allowing users to explore and compare speech recognition model performance across different | |
| devices, operating systems, and datasets. | |
| """ | |
import json
import os
import re
from math import ceil, floor

import gradio as gr
import pandas as pd
from argmax_gradio_components import RangeSlider
from dotenv import load_dotenv
from huggingface_hub import login
# Import custom constants and utility functions
from constants import (
    BANNER_TEXT,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    COL_NAMES,
    HEADER,
    METHODOLOGY_TEXT,
    PERFORMANCE_TEXT,
)
from utils import (
    add_datasets_to_performance_columns,
    calculate_quality_parity,
    create_initial_performance_column_dict,
    css,
    fields,
    get_os_name_and_version,
    make_model_name_clickable_link,
    plot_metric,
    read_json_line_by_line,
)
# Load environment variables
load_dotenv()

# Get the Hugging Face token from the environment variable
HF_TOKEN = os.getenv("HF_TOKEN")

# Use the token for login
login(token=HF_TOKEN, add_to_git_credential=True)
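# Assumes the token is supplied via a .env file or the environment, e.g.:
#   HF_TOKEN=hf_xxxxxxxxxxxx  (placeholder, not a real token)
# If it is missing, login() receives None and may fail unless credentials are
# already cached locally.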
# Define repository and directory information
repo_id = "argmaxinc/whisperkit-evals-dataset"
directory = "xcresults/benchmark_results"
local_dir = ""
# Load benchmark data from JSON files
PERFORMANCE_DATA = read_json_line_by_line("dashboard_data/performance_data.json")
with open("dashboard_data/version.json", "r") as file:
    VERSION_DATA = json.load(file)

# Load quality data (ground truth WER)
QUALITY_DATA = read_json_line_by_line("dashboard_data/quality_data.json")

SHA_TO_VERSION = dict(zip(VERSION_DATA["releases"], VERSION_DATA["versions"]))
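# SHA_TO_VERSION maps each release commit hash to a human-readable version
# string, e.g. {"1a2b3c4": "0.9.0"} (illustrative pair; the real values come
# from the parallel "releases"/"versions" lists in version.json).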
# Convert JSON data to pandas DataFrames - performance only
benchmark_df = pd.json_normalize(PERFORMANCE_DATA)
releases = VERSION_DATA["releases"]

# Process timestamp data
benchmark_df["timestamp"] = pd.to_datetime(benchmark_df["timestamp"]).dt.tz_localize(None)

# Use average_wer directly from performance data
benchmark_df["english_wer"] = benchmark_df["average_wer"]

sorted_performance_df = (
    benchmark_df.assign(model_len=benchmark_df["model"].str.len())
    .sort_values(
        by=["model_len", "model", "device", "os", "timestamp"],
        ascending=[True, True, True, True, False],
    )
    .drop(columns=["model_len"])
    .drop_duplicates(subset=["model", "device", "os"], keep="first")
    .reset_index(drop=True)
)
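# Sorting timestamps in descending order before drop_duplicates(keep="first")
# retains only the most recent benchmark run per (model, device, os) triple.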
# Identify dataset-specific columns
dataset_speed_columns = [
    col for col in sorted_performance_df.columns if col.startswith("dataset_speed.")
]
dataset_toks_columns = [
    col
    for col in sorted_performance_df.columns
    if col.startswith("dataset_tokens_per_second.")
]

# Extract dataset names
PERFORMANCE_DATASETS = [col.split(".")[-1] for col in dataset_speed_columns]
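# For example, a column named "dataset_speed.librispeech-10mins" yields the
# dataset name "librispeech-10mins".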
# Prepare DataFrames for display
performance_df = sorted_performance_df[
    [
        "model",
        "device",
        "os",
        "english_wer",
        "qoi",
        "speed",
        "tokens_per_second",
        "timestamp",
        "commit_hash",
    ]
    + dataset_speed_columns
    + dataset_toks_columns
].copy()

# Calculate parity (difference between measured WER and ground truth WER)
performance_df["parity"] = performance_df.apply(
    lambda row: calculate_quality_parity(QUALITY_DATA, row), axis=1
)
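# Assuming parity is measured WER minus ground-truth WER (see
# utils.calculate_quality_parity), values near zero mean the on-device run
# matched the reference evaluation.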
# Rename columns for clarity
performance_df = performance_df.rename(
    lambda x: COL_NAMES[x] if x in COL_NAMES else x, axis="columns"
)

# Process dataset-specific columns
for col in dataset_speed_columns:
    dataset_name = col.split(".")[-1]
    performance_df = performance_df.rename(
        columns={
            col: f"{'Short-Form' if dataset_name == 'librispeech-10mins' else 'Long-Form'} Speed"
        }
    )
for col in dataset_toks_columns:
    dataset_name = col.split(".")[-1]
    performance_df = performance_df.rename(
        columns={
            col: f"{'Short-Form' if dataset_name == 'librispeech-10mins' else 'Long-Form'} Tok/s"
        }
    )
# Process model names for display
performance_df["model_raw"] = performance_df["Model"].copy()
performance_df["Model"] = performance_df["Model"].apply(
    lambda x: make_model_name_clickable_link(x)
)

# Extract unique devices and OS versions
initial_release_df = benchmark_df[benchmark_df["commit_hash"] == releases[-1]]
PERFORMANCE_DEVICES = initial_release_df["device"].unique().tolist()
PERFORMANCE_OS = (
    initial_release_df["os"].apply(get_os_name_and_version).unique().tolist()
)
PERFORMANCE_OS.sort()
# Create initial column dictionaries and update with dataset information
initial_performance_column_dict = create_initial_performance_column_dict()
performance_column_info = add_datasets_to_performance_columns(
    initial_performance_column_dict, PERFORMANCE_DATASETS
)

# Unpack the returned dictionaries
updated_performance_column_dict = performance_column_info["column_dict"]
PerformanceAutoEvalColumn = performance_column_info["AutoEvalColumn"]

# Define column sets for different views
PERFORMANCE_COLS = performance_column_info["COLS"]
PERFORMANCE_TYPES = performance_column_info["TYPES"]
PERFORMANCE_ALWAYS_HERE_COLS = performance_column_info["ALWAYS_HERE_COLS"]
PERFORMANCE_TOGGLE_COLS = performance_column_info["TOGGLE_COLS"]
PERFORMANCE_SELECTED_COLS = performance_column_info["SELECTED_COLS"]
def get_release_devices(release):
    """
    Get the list of devices for a specific release.

    :param release: Selected release hash
    :return: List of devices available in the release
    """
    release_df = benchmark_df[benchmark_df["commit_hash"] == release]
    return release_df["device"].unique().tolist()
def performance_filter(
    df,
    columns,
    model_query,
    exclude_models,
    devices,
    os,
    short_speed_slider,
    long_speed_slider,
    short_toks_slider,
    long_toks_slider,
    release,
):
    """
    Filters the performance DataFrame based on specified criteria.

    :param df: The DataFrame to be filtered.
    :param columns: The columns to be included in the filtered DataFrame.
    :param model_query: The query string to filter the 'Model' column.
    :param exclude_models: Models to exclude from the results.
    :param devices: The devices to filter the 'Device' column.
    :param os: The list of operating systems to filter the 'OS' column.
    :param short_speed_slider: The range of values to filter the 'Short-Form Speed' column.
    :param long_speed_slider: The range of values to filter the 'Long-Form Speed' column.
    :param short_toks_slider: The range of values to filter the 'Short-Form Tok/s' column.
    :param long_toks_slider: The range of values to filter the 'Long-Form Tok/s' column.
    :param release: The release commit hash to filter the 'commit_hash' column.
    :return: The filtered DataFrame.
    """
    filtered_df = df[df["commit_hash"] == release]

    # Select columns based on input and always-present columns
    filtered_df = filtered_df[
        PERFORMANCE_ALWAYS_HERE_COLS
        + [c for c in PERFORMANCE_COLS if c in df.columns and c in columns]
    ]

    # Filter models based on query
    if model_query:
        filtered_df = filtered_df[
            filtered_df["Model"].str.contains(
                "|".join(q.strip() for q in model_query.split(";")), case=False
            )
        ]

    # Exclude specified models
    if exclude_models:
        exclude_list = [m.strip() for m in exclude_models.split(";")]
        filtered_df = filtered_df[
            ~filtered_df["Model"].str.contains("|".join(exclude_list), case=False)
        ]

    # Filter by devices
    if devices:
        filtered_df = filtered_df[filtered_df["Device"].isin(devices)]
    else:
        filtered_df = pd.DataFrame(columns=filtered_df.columns)

    # Filter by operating systems
    filtered_df = (
        filtered_df[
            filtered_df["OS"].str.contains(
                "|".join(q.strip() for q in os), case=False
            )
        ]
        if os
        else pd.DataFrame(columns=filtered_df.columns)
    )
    # Apply short-form and long-form speed and tokens-per-second filters
    min_short_speed, max_short_speed = short_speed_slider
    min_long_speed, max_long_speed = long_speed_slider
    min_short_toks, max_short_toks = short_toks_slider
    min_long_toks, max_long_toks = long_toks_slider

    # Coerce the range-filtered columns to numeric on the filtered frame itself
    # (converting the input `df` would not affect `filtered_df`, which was
    # already selected out above).
    for numeric_col in [
        "Short-Form Speed",
        "Long-Form Speed",
        "Short-Form Tok/s",
        "Long-Form Tok/s",
    ]:
        if numeric_col in filtered_df.columns:
            filtered_df[numeric_col] = pd.to_numeric(
                filtered_df[numeric_col], errors="coerce"
            )
| if "Short-Form Speed" in filtered_df.columns: | |
| filtered_df = filtered_df[ | |
| (filtered_df["Short-Form Speed"] >= min_short_speed) | |
| & (filtered_df["Short-Form Speed"] <= max_short_speed) | |
| ] | |
| if "Long-Form Speed" in filtered_df.columns: | |
| filtered_df = filtered_df[ | |
| (filtered_df["Long-Form Speed"] >= min_long_speed) | |
| & (filtered_df["Long-Form Speed"] <= max_long_speed) | |
| ] | |
| if "Short-Form Tok/s" in filtered_df.columns: | |
| filtered_df = filtered_df[ | |
| (filtered_df["Short-Form Tok/s"] >= min_short_toks) | |
| & (filtered_df["Short-Form Tok/s"] <= max_short_toks) | |
| ] | |
| if "Long-Form Tok/s" in filtered_df.columns: | |
| filtered_df = filtered_df[ | |
| (filtered_df["Long-Form Tok/s"] >= min_long_toks) | |
| & (filtered_df["Long-Form Tok/s"] <= max_long_toks) | |
| ] | |
| return filtered_df | |
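# Illustrative call (argument values are placeholders, not real benchmark data):
#   performance_filter(
#       performance_df, PERFORMANCE_SELECTED_COLS, "tiny", "", ["iPhone 15 Pro"],
#       ["iOS 18"], (0, 100), (0, 100), (0, 500), (0, 500), releases[-1],
#   )
# would return latest-release rows for models matching "tiny" on that device.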
def update_performance_filters(release):
    """
    Updates the performance filters (devices and OS) based on the selected release.

    :param release: Selected release hash
    :return: Tuple containing updated device and OS choices
    """
    # Filter benchmark data for the selected release
    release_df = benchmark_df[benchmark_df["commit_hash"] == release]

    # Get unique devices and OS versions for this release
    release_devices = release_df["device"].unique().tolist()
    release_os = release_df["os"].apply(get_os_name_and_version).unique().tolist()
    release_os.sort()

    return (
        gr.update(choices=release_devices, value=release_devices),
        gr.update(choices=release_os, value=release_os),
    )
def update_support_table(release):
    """
    Updates the support table and its column configuration for a given release.

    :param release: Selected release hash
    :return: Tuple containing (updated DataFrame, updated column choices, updated column values)
    """
    # Load new support data
    support_data = pd.read_csv(f"dashboard_data/support_data_{release[:7]}.csv")
    support_data.set_index(support_data.columns[0], inplace=True)

    # Process model names
    support_data["Model"] = support_data["Model"].apply(lambda x: x.replace("_", "/"))
    support_data["Model"] = support_data["Model"].apply(
        lambda x: make_model_name_clickable_link(x)
    )

    # Sort by model name length
    support_data = (
        support_data.assign(model_len=support_data["Model"].str.len())
        .sort_values(
            by=["model_len"],
            ascending=[True],
        )
        .drop(columns=["model_len"])
    )

    # Get new columns (excluding 'Model')
    new_columns = support_data.columns.tolist()[1:]

    return (
        gr.update(value=support_data, datatype=["html" for _ in support_data.columns]),
        gr.update(choices=new_columns, value=new_columns),
        gr.update(value=support_data),
    )
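# Support tables are stored per release under dashboard_data/, keyed by the
# first seven characters of the commit hash (the read_csv pattern above),
# e.g. support_data_<sha7>.csv, where <sha7> stands in for an actual hash.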
# Placeholders for a Difference Checker tab (defined here but not rendered below)
diff_tab = gr.TabItem("Difference Checker", elem_id="diff_checker", id=2)
text_diff_elems = []

tabs = gr.Tabs(elem_id="tab-elems")

font = [
    "Zwizz Regular",  # Local font
    "IBM Plex Mono",  # Monospace font
    "ui-sans-serif",
    "system-ui",
    "sans-serif",
]

# macOS 14, 15, 26
# iOS 17, 18, 26
# Define the Gradio interface
with gr.Blocks(css=css, theme=gr.themes.Base(font=font)) as demo:
    # Add header and banner to the interface
    gr.HTML(HEADER)
    gr.HTML(BANNER_TEXT, elem_classes="markdown-text")

    gr.Markdown("### Release")
    release_dropdown = gr.Dropdown(
        choices=[
            (f"{release} v{SHA_TO_VERSION[release]}", release) for release in releases
        ],
        label="Select Release",
        value=releases[-1] if releases else None,
        elem_id="release-dropdown",
        container=False,
    )
    # Create tabs for different sections of the dashboard
    with tabs.render():
        # Performance Tab
        with gr.TabItem("Benchmark", elem_id="benchmark", id=0):
            with gr.Row():
                with gr.Column(scale=1):
                    with gr.Row():
                        with gr.Column(scale=6, elem_classes="filter_models_column"):
                            filter_performance_models = gr.Textbox(
                                placeholder="🔍 Filter Model (separate multiple queries with ';')",
                                label="Filter Models",
                            )
                        with gr.Column(scale=4, elem_classes="exclude_models_column"):
                            exclude_performance_models = gr.Textbox(
                                placeholder="🔍 Exclude Model",
                                label="Exclude Model",
                            )
                    with gr.Row():
                        with gr.Accordion("See All Columns", open=False):
                            with gr.Row():
                                with gr.Column(scale=9, elem_id="performance_columns"):
                                    performance_shown_columns = gr.CheckboxGroup(
                                        choices=PERFORMANCE_TOGGLE_COLS,
                                        value=PERFORMANCE_SELECTED_COLS,
                                        label="Toggle Columns",
                                        elem_id="column-select",
                                        interactive=True,
                                    )
                                with gr.Column(
                                    scale=1,
                                    min_width=200,
                                    elem_id="performance_select_columns",
                                ):
                                    with gr.Row():
                                        select_all_button = gr.Button(
                                            "Select All",
                                            elem_id="select-all-button",
                                            interactive=True,
                                        )
                                        deselect_all_button = gr.Button(
                                            "Deselect All",
                                            elem_id="deselect-all-button",
                                            interactive=True,
                                        )

                                    def select_all_columns():
                                        return PERFORMANCE_TOGGLE_COLS

                                    def deselect_all_columns():
                                        return []

                                    select_all_button.click(
                                        select_all_columns,
                                        inputs=[],
                                        outputs=performance_shown_columns,
                                    )
                                    deselect_all_button.click(
                                        deselect_all_columns,
                                        inputs=[],
                                        outputs=performance_shown_columns,
                                    )
                    with gr.Row():
                        with gr.Accordion("Filter Devices", open=False):
                            with gr.Row():
                                with gr.Column(scale=9, elem_id="filter_devices_column"):
                                    performance_shown_devices = gr.CheckboxGroup(
                                        choices=get_release_devices(releases[-1]),
                                        value=get_release_devices(releases[-1]),
                                        label="Filter Devices",
                                        interactive=True,
                                    )
                                with gr.Column(
                                    scale=1,
                                    min_width=200,
                                    elem_id="filter_select_devices",
                                ):
                                    with gr.Row():
                                        select_all_devices_button = gr.Button(
                                            "Select All",
                                            elem_id="select-all-devices-button",
                                            interactive=True,
                                        )
                                        deselect_all_devices_button = gr.Button(
                                            "Deselect All",
                                            elem_id="deselect-all-devices-button",
                                            interactive=True,
                                        )

                                    def select_all_devices(release):
                                        """Returns all devices available in the current release"""
                                        return get_release_devices(release)

                                    def deselect_all_devices():
                                        """Returns an empty list for deselecting all devices"""
                                        return []

                                    select_all_devices_button.click(
                                        select_all_devices,
                                        inputs=[release_dropdown],
                                        outputs=performance_shown_devices,
                                    )
                                    deselect_all_devices_button.click(
                                        deselect_all_devices,
                                        inputs=[],
                                        outputs=performance_shown_devices,
                                    )
                    with gr.Row():
                        performance_shown_os = gr.CheckboxGroup(
                            choices=PERFORMANCE_OS,
                            value=PERFORMANCE_OS,
                            label="Filter OS",
                            interactive=True,
                        )
                with gr.Column(scale=1):
                    with gr.Accordion("See Performance Filters"):
                        with gr.Row():
                            with gr.Row():
                                min_short_speed, max_short_speed = floor(
                                    min(performance_df["Short-Form Speed"])
                                ), ceil(max(performance_df["Short-Form Speed"]))
                                short_speed_slider = RangeSlider(
                                    value=[min_short_speed, max_short_speed],
                                    minimum=min_short_speed,
                                    maximum=max_short_speed,
                                    step=0.001,
                                    label="Short-Form Speed",
                                )
                            with gr.Row():
                                min_long_speed, max_long_speed = floor(
                                    min(performance_df["Long-Form Speed"])
                                ), ceil(max(performance_df["Long-Form Speed"]))
                                long_speed_slider = RangeSlider(
                                    value=[min_long_speed, max_long_speed],
                                    minimum=min_long_speed,
                                    maximum=max_long_speed,
                                    step=0.001,
                                    label="Long-Form Speed",
                                )
                        with gr.Row():
                            with gr.Row():
                                min_short_toks, max_short_toks = floor(
                                    min(performance_df["Short-Form Tok/s"])
                                ), ceil(max(performance_df["Short-Form Tok/s"]))
                                short_toks_slider = RangeSlider(
                                    value=[min_short_toks, max_short_toks],
                                    minimum=min_short_toks,
                                    maximum=max_short_toks,
                                    step=0.001,
                                    label="Short-Form Tok/s",
                                )
                            with gr.Row():
                                min_long_toks, max_long_toks = floor(
                                    min(performance_df["Long-Form Tok/s"])
                                ), ceil(max(performance_df["Long-Form Tok/s"]))
                                long_toks_slider = RangeSlider(
                                    value=[min_long_toks, max_long_toks],
                                    minimum=min_long_toks,
                                    maximum=max_long_toks,
                                    step=0.001,
                                    label="Long-Form Tok/s",
                                )
            with gr.Row():
                gr.Markdown(PERFORMANCE_TEXT, elem_classes="markdown-text")
            with gr.Row():
                initial_df = performance_df[
                    performance_df["commit_hash"] == releases[-1]
                ]
                leaderboard_df = gr.components.Dataframe(
                    value=initial_df[
                        PERFORMANCE_ALWAYS_HERE_COLS + performance_shown_columns.value
                    ],
                    # headers must be a flat list of column names, not a
                    # single-element list of lists
                    headers=PERFORMANCE_ALWAYS_HERE_COLS
                    + performance_shown_columns.value,
                    datatype=[
                        c.type
                        for c in fields(PerformanceAutoEvalColumn)
                        if c.name in PERFORMANCE_COLS
                    ],
                    elem_id="leaderboard-table",
                    elem_classes="large-table",
                    interactive=False,
                )
                # Copy of the leaderboard dataframe to apply filters to
                hidden_leaderboard_df = gr.components.Dataframe(
                    value=performance_df,
                    headers=PERFORMANCE_COLS,
                    datatype=[
                        c.type
                        for c in fields(PerformanceAutoEvalColumn)
                        if c.name in PERFORMANCE_COLS
                    ],
                    visible=False,
                )
            # Inputs for the dataframe filter function
            performance_filter_inputs = [
                hidden_leaderboard_df,
                performance_shown_columns,
                filter_performance_models,
                exclude_performance_models,
                performance_shown_devices,
                performance_shown_os,
                short_speed_slider,
                long_speed_slider,
                short_toks_slider,
                long_toks_slider,
                release_dropdown,
            ]
            filter_output = leaderboard_df

            # Re-run the filter whenever any of these controls changes
            for control in [
                filter_performance_models,
                exclude_performance_models,
                performance_shown_columns,
                performance_shown_devices,
                performance_shown_os,
                short_speed_slider,
                long_speed_slider,
                short_toks_slider,
                long_toks_slider,
            ]:
                control.change(
                    performance_filter, performance_filter_inputs, filter_output
                )

            # Changing the release first refreshes the device/OS choices, then
            # re-applies the filter with the new selection.
            release_dropdown.change(
                fn=update_performance_filters,
                inputs=[release_dropdown],
                outputs=[performance_shown_devices, performance_shown_os],
                queue=False,
            ).then(
                fn=performance_filter,
                inputs=performance_filter_inputs,
                outputs=filter_output,
            )
        # Timeline Tab
        with gr.TabItem("Timeline", elem_id="timeline", id=4):
            # Create subtabs for different metrics. Each subtab shares the same
            # filter/exclude/plot layout, so they are built in a loop instead
            # of repeating the block four times.
            timeline_metrics = [
                ("QoI", "qoi"),
                ("Average WER", "average_wer"),
                ("Speed", "speed"),
                ("Tok/s", "tokens_per_second"),
            ]
            with gr.Tabs():
                for tab_id, (metric_label, metric_key) in enumerate(timeline_metrics):
                    with gr.TabItem(metric_label, id=tab_id):
                        with gr.Row():
                            with gr.Column(scale=6):
                                metric_filter = gr.Textbox(
                                    placeholder="🔍 Filter Model-Device-OS (separate multiple queries with ';')",
                                    label="Filter",
                                )
                            with gr.Column(scale=4):
                                metric_exclude = gr.Textbox(
                                    placeholder="🔍 Exclude Model-Device-OS",
                                    label="Exclude",
                                )
                        with gr.Row():
                            with gr.Column():
                                metric_plot = gr.Plot(container=True)

                        # Bind metric_key/metric_label as default arguments so
                        # each subtab's callback keeps its own metric.
                        def plot_fn(
                            df, query, exclude, key=metric_key, label=metric_label
                        ):
                            return plot_metric(
                                df,
                                key,
                                label,
                                f"{label} Over Time for Model-Device-OS Combinations",
                                query,
                                exclude,
                            )

                        plot_inputs = [
                            gr.Dataframe(benchmark_df, visible=False),
                            metric_filter,
                            metric_exclude,
                        ]
                        demo.load(plot_fn, plot_inputs, metric_plot)
                        metric_filter.change(plot_fn, plot_inputs, metric_plot)
                        metric_exclude.change(plot_fn, plot_inputs, metric_plot)
        # Device Support Tab
        with gr.TabItem("Device Support", elem_id="device_support", id=6):
            # Add a clear description of what Device Support means
            gr.Markdown(
                """
## Device Support

This tab shows **test results for SKUs that we actually attempted to test**. It tells you whether tests passed, failed, or couldn't be completed for the devices we tried to run tests on.

### Please Note:
**This tab only shows devices we attempted to test** - it doesn't show the full universe of available devices.

**For comprehensive coverage analysis**, see the **Test Coverage** tab, which shows ALL available SKUs.
""",
                elem_classes="markdown-text",
            )

            # Load device support data from CSV
            support_data = pd.read_csv(
                f"dashboard_data/support_data_{releases[-1][:7]}.csv"
            )
            support_data.set_index(support_data.columns[0], inplace=True)
            support_data["Model"] = support_data["Model"].apply(
                lambda x: x.replace("_", "/")
            )
            support_data["Model"] = support_data["Model"].apply(
                lambda x: make_model_name_clickable_link(x)
            )
            support_data = (
                support_data.assign(model_len=support_data["Model"].str.len())
                .sort_values(
                    by=["model_len"],
                    ascending=[True],
                )
                .drop(columns=["model_len"])
            )
            with gr.Row():
                with gr.Column(scale=1):
                    with gr.Row():
                        with gr.Column(scale=6, elem_id="filter_models_column"):
                            filter_support_models = gr.Textbox(
                                placeholder="🔍 Filter Model (separate multiple queries with ';')",
                                label="Filter Models",
                            )
                        with gr.Column(scale=4, elem_classes="exclude_models_column"):
                            exclude_support_models = gr.Textbox(
                                placeholder="🔍 Exclude Model",
                                label="Exclude Model",
                            )
                    with gr.Row():
                        with gr.Accordion("See All Columns", open=False):
                            with gr.Row():
                                with gr.Column(scale=9):
                                    support_shown_columns = gr.CheckboxGroup(
                                        choices=support_data.columns.tolist()[
                                            1:
                                        ],  # Exclude 'Model' column
                                        value=support_data.columns.tolist()[1:],
                                        label="Toggle Columns",
                                        elem_id="support-column-select",
                                        interactive=True,
                                    )
                                with gr.Column(scale=1, min_width=200):
                                    with gr.Row():
                                        select_all_support_button = gr.Button(
                                            "Select All",
                                            elem_id="select-all-support-button",
                                            interactive=True,
                                        )
                                        deselect_all_support_button = gr.Button(
                                            "Deselect All",
                                            elem_id="deselect-all-support-button",
                                            interactive=True,
                                        )
                with gr.Column():
                    gr.Markdown(
                        """
### Legend
- ✅ Supported: The model is supported and tested on this device.
- ⚠️ Failed: Either the model tests failed on this device or the Speed Factor for the test is less than 1.
- ? Not Tested: The model is supported on this device but no test information is available.
- Not Supported: The model is not supported on this device as per the [WhisperKit configuration](https://huggingface.co/argmaxinc/whisperkit-coreml/blob/main/config.json).
"""
                    )

            # Display device support data in a table
            device_support_table = gr.Dataframe(
                value=support_data,
                headers=support_data.columns.tolist(),
                datatype=["html" for _ in support_data.columns],
                elem_id="device-support-table",
                elem_classes="large-table",
                interactive=False,
            )

            # Hidden dataframe to store the original data
            hidden_support_df = gr.Dataframe(value=support_data, visible=False)
            def filter_support_data(df, columns, model_query, exclude_models):
                """
                Filters the device support data based on specified criteria.

                :param df: The DataFrame to be filtered
                :param columns: Columns to include in the output
                :param model_query: Query string to filter models
                :param exclude_models: Models to exclude
                :return: Filtered DataFrame
                """
                filtered_df = df.copy()

                # Filter models based on query
                if model_query:
                    filtered_df = filtered_df[
                        filtered_df["Model"].str.contains(
                            "|".join(q.strip() for q in model_query.split(";")),
                            case=False,
                            regex=True,
                        )
                    ]

                # Exclude specified models
                if exclude_models:
                    exclude_list = [
                        re.escape(m.strip()) for m in exclude_models.split(";")
                    ]
                    filtered_df = filtered_df[
                        ~filtered_df["Model"].str.contains(
                            "|".join(exclude_list), case=False, regex=True
                        )
                    ]

                # Select columns
                selected_columns = ["Model"] + [
                    col for col in columns if col in df.columns
                ]
                filtered_df = filtered_df[selected_columns]
                return filtered_df
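            # Illustrative call (hypothetical query strings): keep models
            # matching "openai" while hiding "large" variants, with all
            # currently toggled columns:
            #   filter_support_data(
            #       support_data, support_shown_columns.value, "openai", "large"
            #   )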
            def select_all_support_columns(release):
                """
                Returns all support-table columns for the given release.

                :param release: Selected release hash
                :return: List of all available choices
                """
                # Load new support data for the current release
                support_data = pd.read_csv(
                    f"dashboard_data/support_data_{release[:7]}.csv"
                )
                support_data.set_index(support_data.columns[0], inplace=True)
                # Return all columns except 'Model'
                return [col for col in support_data.columns if col != "Model"]

            def deselect_all_support_columns():
                return []

            # Connect select-all and deselect-all buttons
            select_all_support_button.click(
                select_all_support_columns,
                inputs=[release_dropdown],
                outputs=support_shown_columns,
            )
            deselect_all_support_button.click(
                deselect_all_support_columns,
                inputs=[],
                outputs=support_shown_columns,
            )
            # Connect release dropdown to support data update
            release_dropdown.change(
                update_support_table,
                inputs=[release_dropdown],
                outputs=[
                    device_support_table,
                    support_shown_columns,
                    hidden_support_df,
                ],
            ).then(
                filter_support_data,
                inputs=[
                    hidden_support_df,
                    support_shown_columns,
                    filter_support_models,
                    exclude_support_models,
                ],
                outputs=device_support_table,
            )

            # Also connect the filter inputs to update the table
            for input_elem in [
                filter_support_models,
                exclude_support_models,
                support_shown_columns,
            ]:
                input_elem.change(
                    filter_support_data,
                    inputs=[
                        hidden_support_df,
                        support_shown_columns,
                        filter_support_models,
                        exclude_support_models,
                    ],
                    outputs=device_support_table,
                )
        # Test Coverage Tab
        with gr.TabItem("Test Coverage", elem_id="test_coverage", id=7):
            # Add a clear description of what Test Coverage means
            gr.Markdown(
                """
## Test Coverage

This tab shows **ALL available SKUs** and our testing coverage across the entire device ecosystem. It uses chip-based expansion, where testing one device covers all devices with the same chip.
""",
                elem_classes="markdown-text",
            )
            def load_coverage_data(release):
                """Load test coverage data for a specific release."""
                try:
                    with open(
                        f"dashboard_data/test_coverage_{release}.json", "r"
                    ) as f:
                        return json.load(f)
                except FileNotFoundError:
                    return {
                        "commit_hash": release,
                        "total_devices": 0,
                        "tested_devices": 0,
                        "skipped_devices": 0,
                        "coverage_percentage": 0.0,
                        "tested_device_list": [],
                        "skipped_device_list": [],
                        "tested_os_versions": [],
                        "has_target_os_coverage": False,
                        "covered_target_versions": [],
                        "missing_target_versions": [],
                    }
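            # The fallback dict above doubles as a sketch of the expected
            # schema for dashboard_data/test_coverage_<commit_hash>.json; a
            # populated file carries the same keys with real counts and
            # device/version lists.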
            def format_coverage_devices(device_list):
                """Convert a device list to DataFrame format."""
                if not device_list:
                    return pd.DataFrame(columns=["Device"])
                df = pd.DataFrame({"Device": device_list})
                return df.sort_values(["Device"])
            def build_coverage_summary(release, coverage_data):
                """Build the markdown coverage summary for a release."""
                # Check target OS coverage
                target_os_status = ""
                covered_versions = coverage_data.get("covered_target_versions", [])
                missing_versions = coverage_data.get("missing_target_versions", [])
                if covered_versions or missing_versions:
                    target_os_status = "\n- **Target OS Coverage**:\n"
                    if covered_versions:
                        unique_versions = sorted(set(covered_versions))
                        target_os_status += (
                            f"  - ✅ **Tested**: {', '.join(unique_versions)}\n"
                        )
                    if missing_versions:
                        target_os_status += (
                            f"  - ❌ **Missing**: {', '.join(missing_versions)}"
                        )
                return f"""## Test Coverage Summary for Release {release} (v{SHA_TO_VERSION.get(release, 'Unknown')})
- **Total Devices**: {coverage_data['total_devices']}
- **Tested Devices**: {coverage_data['tested_devices']}
- **Skipped Devices**: {coverage_data['skipped_devices']}
- **Coverage Percentage**: {coverage_data['coverage_percentage']:.1f}%
{target_os_status}"""

            def update_coverage_data(release):
                """Update coverage data when the release changes."""
                coverage_data = load_coverage_data(release)

                # Format tested and skipped devices
                tested_df = format_coverage_devices(
                    coverage_data["tested_device_list"]
                )
                skipped_df = format_coverage_devices(
                    coverage_data["skipped_device_list"]
                )

                return (
                    gr.update(value=build_coverage_summary(release, coverage_data)),
                    gr.update(value=tested_df),
                    gr.update(value=skipped_df),
                    tested_df,
                    skipped_df,
                )
            def filter_coverage_devices(df, device_query, exclude_devices):
                """Filter coverage devices based on device queries."""
                if df is None or df.empty:
                    return df
                filtered_df = df.copy()

                # Filter devices based on query
                if device_query:
                    filtered_df = filtered_df[
                        filtered_df["Device"].str.contains(
                            "|".join(q.strip() for q in device_query.split(";")),
                            case=False,
                            regex=True,
                        )
                    ]

                # Exclude specified devices
                if exclude_devices:
                    exclude_list = [
                        re.escape(d.strip()) for d in exclude_devices.split(";")
                    ]
                    filtered_df = filtered_df[
                        ~filtered_df["Device"].str.contains(
                            "|".join(exclude_list), case=False, regex=True
                        )
                    ]
                return filtered_df
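            # Illustrative call (hypothetical queries): keep iPad rows while
            # dropping minis:
            #   filter_coverage_devices(initial_tested_df, "iPad", "mini")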
            # Load initial coverage data
            initial_coverage = load_coverage_data(releases[-1])
            initial_tested_df = format_coverage_devices(
                initial_coverage["tested_device_list"]
            )
            initial_skipped_df = format_coverage_devices(
                initial_coverage["skipped_device_list"]
            )

            # Coverage summary (built with the same helper the release-change
            # callback uses, so the initial view matches later updates)
            coverage_summary_text = gr.Markdown(
                value=build_coverage_summary(releases[-1], initial_coverage),
                elem_classes="markdown-text",
            )
            with gr.Row():
                with gr.Column(scale=1):
                    with gr.Row():
                        with gr.Column(scale=6):
                            filter_coverage_devices_input = gr.Textbox(
                                placeholder="🔍 Filter Device (separate multiple queries with ';')",
                                label="Filter Devices",
                            )
                        with gr.Column(scale=4):
                            exclude_coverage_devices_input = gr.Textbox(
                                placeholder="🔍 Exclude Device",
                                label="Exclude Device",
                            )

            # Create tabs for tested vs skipped devices
            with gr.Tabs():
                with gr.TabItem("Tested Devices", id=0):
                    tested_devices_table = gr.Dataframe(
                        value=initial_tested_df,
                        headers=["Device"],
                        datatype=["str"],
                        elem_id="tested-devices-table",
                        elem_classes="large-table",
                        interactive=False,
                    )
                with gr.TabItem("Skipped Devices", id=1):
                    skipped_devices_table = gr.Dataframe(
                        value=initial_skipped_df,
                        headers=["Device"],
                        datatype=["str"],
                        elem_id="skipped-devices-table",
                        elem_classes="large-table",
                        interactive=False,
                    )

            # Hidden dataframes for filtering
            hidden_tested_df = gr.Dataframe(value=initial_tested_df, visible=False)
            hidden_skipped_df = gr.Dataframe(value=initial_skipped_df, visible=False)
            # Connect release dropdown to coverage data update
            release_dropdown.change(
                update_coverage_data,
                inputs=[release_dropdown],
                outputs=[
                    coverage_summary_text,
                    tested_devices_table,
                    skipped_devices_table,
                    hidden_tested_df,
                    hidden_skipped_df,
                ],
                queue=False,
            )

            # Connect filter inputs to update both tables
            for input_elem in [
                filter_coverage_devices_input,
                exclude_coverage_devices_input,
            ]:
                input_elem.change(
                    lambda tested_df, skipped_df, device_query, exclude_devices: (
                        filter_coverage_devices(
                            tested_df, device_query, exclude_devices
                        ),
                        filter_coverage_devices(
                            skipped_df, device_query, exclude_devices
                        ),
                    ),
                    inputs=[
                        hidden_tested_df,
                        hidden_skipped_df,
                        filter_coverage_devices_input,
                        exclude_coverage_devices_input,
                    ],
                    outputs=[tested_devices_table, skipped_devices_table],
                )
        # Methodology Tab
        with gr.TabItem("Methodology", elem_id="methodology", id=8):
            gr.Markdown(METHODOLOGY_TEXT, elem_id="methodology-text")

    # Citation section
    with gr.Accordion("Citation", open=False):
        citation_button = gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            lines=7,
            elem_id="citation-button",
            show_copy_button=True,
        )

# Launch the Gradio interface
demo.launch(debug=True, share=True)