from pathlib import Path

import altair as alt
import gradio as gr
import polars as pl

DATASETS = []
BENCHMARKS = {
    # Name: (device, AMP, compile, single thread)
    "Parameters": (None, None, None, None),
    "GPU Memory": (None, None, None, None),
    "CPU rate": ("cpu", False, False, False),
    "CPU rate single core": ("cpu", False, False, True),
    "CPU rate with compile": ("cpu", False, True, False),
    "CPU rate AMP with compile": ("cpu", True, True, False),
    "CUDA rate": ("cuda", False, False, False),
    "CUDA rate with compile": ("cuda", False, True, False),
    "CUDA rate AMP with compile": ("cuda", True, True, False),
}


def plot_acc_param(param_compare_results_df: pl.DataFrame, width: int = 1000, height: int = 680) -> alt.LayerChart:
    df = param_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "Parameters (M)",
        "Pareto frontier (p)",
        "Intermediate",
        "MIM",
        "Distilled",
    )
    base = df.plot.point(
        x="Parameters (M)",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
            "Parameters (M)",
            "Accuracy",
            "Top-3 accuracy",
            "Model name",
            "Model type",
            "Resolution",
            "Intermediate",
            "MIM",
            "Distilled",
        ],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="Parameters (M)", y="Pareto frontier (p)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )
    chart = base + text + frontier

    return chart.properties(title="Accuracy vs Parameter Count", width=width, height=height).configure_scale(
        zero=False
    )


def plot_acc_memory(memory_compare_results_df: pl.DataFrame, width: int = 900, height: int = 640) -> alt.LayerChart:
    if len(memory_compare_results_df) > 0:
        batch_size = memory_compare_results_df["max_batch_size"][0]
        amp = memory_compare_results_df["amp"][0]
    else:
        batch_size = ""
        amp = ""

    df = memory_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "Peak GPU memory (MB)",
        "Parameters (M)",
        "Pareto frontier (mem)",
        "Intermediate",
        "MIM",
        "Distilled",
    )
    base = df.plot.point(
        x="Peak GPU memory (MB)",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
            "Peak GPU memory (MB)",
            "Parameters (M)",
            "Accuracy",
            "Top-3 accuracy",
            "Model name",
            "Model type",
            "Resolution",
            "Intermediate",
            "MIM",
            "Distilled",
        ],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="Peak GPU memory (MB)", y="Pareto frontier (mem)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )
    chart = base + text + frontier

    return chart.properties(
        title=f"Accuracy vs GPU Memory (batch size={batch_size}, amp={amp})", width=width, height=height
    ).configure_scale(zero=False)


def plot_acc_rate(rate_compare_results_df: pl.DataFrame, width: int = 1000, height: int = 680) -> alt.LayerChart:
    if len(rate_compare_results_df) > 0:
        device = rate_compare_results_df["device"][0]
        compiled = rate_compare_results_df["compile"][0]
        batch_size = rate_compare_results_df["max_batch_size"][0]
        amp = rate_compare_results_df["amp"][0]
        single_thread = rate_compare_results_df["single_thread"][0]
    else:
        device = ""
        compiled = ""
        batch_size = ""
        amp = ""
        single_thread = False

    df = rate_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "ms / sample",
        "Parameters (M)",
        "Pareto frontier (ms)",
        "Intermediate",
        "MIM",
        "Distilled",
    )
    base = df.plot.point(
        x="ms / sample",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
sample", "Parameters (M)", "Accuracy", "Top-3 accuracy", "Model name", "Model type", "Resolution", "Intermediate", "MIM", "Distilled", ], ) text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name") frontier = df.plot.line(x="ms / sample", y="Pareto frontier (ms)").mark_line( interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2) ) chart = base + text + frontier if single_thread is True: single_thread_title = " Single Core" else: single_thread_title = "" return chart.properties( title=( f"Accuracy vs {device.upper()}{single_thread_title} Rate (compile={compiled}, " f"batch size={batch_size}, amp={amp})" ), width=width, height=height, ).configure_scale(zero=False) def update_data( dataset: str, benchmark: str, intermediate: bool, mim: bool, dist: bool, log_x: bool, search_bar: str ) -> tuple[alt.LayerChart, pl.DataFrame]: compare_results_df = pl.read_csv(f"results_{dataset}.csv") if intermediate is False: compare_results_df = compare_results_df.filter(pl.col("Intermediate") == intermediate) if mim is False: compare_results_df = compare_results_df.filter(pl.col("MIM") == mim) if dist is False: compare_results_df = compare_results_df.filter(pl.col("Distilled") == dist) x_scale_type = "log" if log_x is True else "linear" # Filter models compare_results_df = compare_results_df.filter(pl.col("Model name").str.contains(search_bar)) # Parameter count if benchmark == "Parameters": param_compare_results_df = compare_results_df.unique(subset=["Model name", "Resolution"]).sort( "Parameters (M)", descending=False ) param_compare_results_df = param_compare_results_df.with_columns( pl.col("Accuracy").cum_max().alias("Pareto frontier (p)") ) param_compare_results_df = param_compare_results_df.drop( "Samples / sec", "device", "ms / sample", "Peak GPU memory (MB)" ) chart = plot_acc_param(param_compare_results_df) x_max = param_compare_results_df["Parameters (M)"].quantile(0.9) x_min = param_compare_results_df["Parameters (M)"].quantile(0.1) chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type) output_df = param_compare_results_df # Peak memory elif benchmark == "GPU Memory": memory_compare_results_df = compare_results_df.drop_nulls(subset=["Peak GPU memory (MB)"]) memory_compare_results_df = memory_compare_results_df.unique(subset=["Model name", "Resolution"]).sort( "Peak GPU memory (MB)", descending=False ) memory_compare_results_df = memory_compare_results_df.with_columns( pl.col("Accuracy").cum_max().alias("Pareto frontier (mem)") ) memory_compare_results_df = memory_compare_results_df.drop("Samples / sec", "device", "ms / sample") chart = plot_acc_memory(memory_compare_results_df) x_max = memory_compare_results_df["Peak GPU memory (MB)"].quantile(0.9) x_min = memory_compare_results_df["Peak GPU memory (MB)"].quantile(0.1) chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type) output_df = memory_compare_results_df # Rate else: (device, amp_enabled, compiled, single_thread) = BENCHMARKS[benchmark] df = compare_results_df.drop_nulls(subset=["ms / sample"]) df = df.filter(device=device, amp=amp_enabled, compile=compiled, single_thread=single_thread) device_compare_results_df = df.unique(subset=["Model name", "Resolution"]).sort("ms / sample", descending=False) device_compare_results_df = device_compare_results_df.drop("Peak GPU memory (MB)") device_compare_results_df = device_compare_results_df.with_columns( pl.col("Accuracy").cum_max().alias("Pareto frontier (ms)") ) chart = 
        chart = plot_acc_rate(device_compare_results_df)
        x_max = device_compare_results_df["ms / sample"].quantile(0.95)
        x_min = device_compare_results_df["ms / sample"].min()
        if x_max is not None and x_min is not None:
            x_max = x_max * 1.04
            x_min = x_min * 0.96

        chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
        output_df = device_compare_results_df

    # Round float columns for a cleaner table display
    output_df = output_df.select(
        [
            pl.col(col).round(4) if output_df.schema[col] in [pl.Float32, pl.Float64] else col
            for col in output_df.columns
        ]
    )

    return (chart, output_df.drop("Mistakes", "Samples", "torch_version"))


def app() -> None:
    with gr.Blocks(title="Birder Leaderboard", analytics_enabled=False) as leaderboard:
        gr.HTML("<h1 style='text-align: center'>The Birder Leaderboard</h1>")
") with gr.Row(): with gr.Column(): pass with gr.Column(): gr.Markdown( """ Leaderboard of all the pre-trained Birder models across multiple datasets. ### Benchmark Setup * GPU: A5000 ADA Generation * CPU: AMD Ryzen Threadripper PRO 7975WX * PyTorch version: 2.5.1+cu124 ### Dataset Information | Name | Training samples | Validation samples | Classes | |---------------------|------------------|--------------------|-------------| | arabian-peninsula | 583,868 | 21,634 | 735 | | eu-common | 569,784 | 19,869 | 707 | | il-all | 462,346 | 18,614 | 550 | | il-common | 330,880 | 15,828 | 371 | """ ) with gr.Column(): pass with gr.Row(): with gr.Column(): pass with gr.Column(): dataset_dropdown = gr.Dropdown( choices=DATASETS, label="Select Dataset", value=DATASETS[0] if DATASETS else None, ) benchmark_dropdown = gr.Dropdown( choices=BENCHMARKS.keys(), label="Select Benchmark", value=next(iter(BENCHMARKS.keys())) if BENCHMARKS else None, filterable=False, ) with gr.Column(): intermediate = gr.Checkbox( label="Intermediate", value=True, info="Show models that underwent intermediate training (extra data)", ) mim = gr.Checkbox(label="MIM", value=True, info="Show models with Masked Image Modeling pre-training") dist = gr.Checkbox(label="Distilled", value=True, info="Show distilled models") log_x = gr.Checkbox(label="Log scale X-axis", value=False) with gr.Column(): pass with gr.Row(): with gr.Column(): pass with gr.Column(scale=2): search_bar = gr.Textbox(label="Model Filter", placeholder="e.g. convnext, efficient|mobile") with gr.Column(): pass plot = gr.Plot(container=False) table = gr.Dataframe(show_search="search") inputs = [dataset_dropdown, benchmark_dropdown, intermediate, mim, dist, log_x, search_bar] outputs = [plot, table] leaderboard.load(update_data, inputs=inputs, outputs=outputs) dataset_dropdown.change(update_data, inputs=inputs, outputs=outputs) benchmark_dropdown.change(update_data, inputs=inputs, outputs=outputs) intermediate.change(update_data, inputs=inputs, outputs=outputs) mim.change(update_data, inputs=inputs, outputs=outputs) dist.change(update_data, inputs=inputs, outputs=outputs) log_x.change(update_data, inputs=inputs, outputs=outputs) search_bar.change(update_data, inputs=inputs, outputs=outputs) leaderboard.launch() # Launch the app if __name__ == "__main__": file_info = [] for p in Path.glob(Path("."), "results_*.csv"): file_info.append((p.stat().st_size, p.stem.removeprefix("results_"))) DATASETS = [dataset_name for _, dataset_name in sorted(file_info, key=lambda x: x[1], reverse=True)] app()