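"""Birder Leaderboard.

Gradio app that visualizes benchmark results for the pre-trained Birder models:
accuracy vs parameter count and accuracy vs inference rate, each with its Pareto frontier.
Expects one results CSV per dataset in the working directory (e.g. results_il-common.csv).
Assuming this file is saved as app.py, run it with: python app.py
"""
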
import altair as alt
import polars as pl
import gradio as gr

DATASETS = ["il-common"]

# Benchmark display name -> (device, amp, compile), matching the filter columns in the results CSV.
# The "Parameters" entry carries no benchmark settings: it compares models by parameter count only.
BENCHMARKS = {
    "Parameters": (None, None, None),
    "CPU Rate with compile": ("cpu", False, True),
    "CPU Rate without compile": ("cpu", False, False),
    "CUDA Rate with compile": ("cuda", False, True),
    "CUDA Rate AMP with compile": ("cuda", True, True),
}


def plot_acc_param(param_compare_results_df: pl.DataFrame, width: int = 1000, height: int = 680) -> alt.LayerChart:
    """Scatter plot of accuracy vs parameter count, with model name labels and the Pareto frontier"""
    df = param_compare_results_df.select(
        "Model name", "Model type", "Accuracy", "Top-3 accuracy", "Resolution", "Parameters (M)", "Pareto frontier (p)"
    )
    base = df.plot.point(
        x="Parameters (M)",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=["Parameters (M)", "Accuracy", "Top-3 accuracy", "Model name", "Model type", "Resolution"],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="Parameters (M)", y="Pareto frontier (p)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )
    chart = base + text + frontier

    return chart.properties(title="Accuracy vs Parameter Count", width=width, height=height).configure_scale(zero=False)


def plot_acc_rate(rate_compare_results_df: pl.DataFrame, width: int = 1000, height: int = 680) -> alt.LayerChart:
    """Scatter plot of accuracy vs inference time (ms / sample), with model name labels and the Pareto frontier"""
    device = rate_compare_results_df["device"][0]
    compiled = rate_compare_results_df["compile"][0]
    batch_size = rate_compare_results_df["batch_size"][0]
    amp = rate_compare_results_df["amp"][0]
    df = rate_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "ms / sample",
        "Parameters (M)",
        "Pareto frontier (ms)",
    )
    base = df.plot.point(
        x="ms / sample",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
            "ms / sample",
            "Parameters (M)",
            "Accuracy",
            "Top-3 accuracy",
            "Model name",
            "Model type",
            "Resolution",
        ],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="ms / sample", y="Pareto frontier (ms)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )
    chart = base + text + frontier

    return chart.properties(
        title=f"Accuracy vs {device.upper()} Rate (compile={compiled}, batch size={batch_size}, amp={amp})",
        width=width,
        height=height,
    ).configure_scale(zero=False)


def update_data(
    dataset: str, benchmark: str, intermediate: bool, mim: bool, log_x: bool
) -> tuple[alt.LayerChart, pl.DataFrame]:
    """Load the results for the selected dataset and build the chart and table for the selected benchmark"""
    compare_results_df = pl.read_csv(f"results_{dataset}.csv")
    if intermediate is False:
        compare_results_df = compare_results_df.filter(pl.col("Intermediate") == intermediate)
    if mim is False:
        compare_results_df = compare_results_df.filter(pl.col("MIM") == mim)

    x_scale_type = "log" if log_x is True else "linear"

    # Parameter count
    if benchmark == "Parameters":
        param_compare_results_df = compare_results_df.unique(subset=["Model name"]).sort(
            "Parameters (M)", descending=False
        )
        # Cumulative max of accuracy along the sorted axis gives the Pareto frontier
        param_compare_results_df = param_compare_results_df.with_columns(
            pl.col("Accuracy").cum_max().alias("Pareto frontier (p)")
        )
        param_compare_results_df = param_compare_results_df.drop("Samples / sec", "device", "ms / sample")
        chart = plot_acc_param(param_compare_results_df)
        chart.layer[0].encoding.x.scale = alt.Scale(domain=[0.5, 7.5], type=x_scale_type)
        output_df = param_compare_results_df

    # Rate
    else:
        (device, amp_enabled, compiled) = BENCHMARKS[benchmark]
        df = compare_results_df.filter(device=device, amp=amp_enabled, compile=compiled)
        device_compare_results_df = df.unique(subset=["Model name"]).sort("ms / sample", descending=False)
        device_compare_results_df = device_compare_results_df.with_columns(
            pl.col("Accuracy").cum_max().alias("Pareto frontier (ms)")
        )
        chart = plot_acc_rate(device_compare_results_df)
        # Clip the x-axis near the 95th percentile so outliers do not squash the plot
        x_max = device_compare_results_df["ms / sample"].quantile(0.95) * 1.04
        x_min = device_compare_results_df["ms / sample"].min() * 0.96
        chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
        output_df = device_compare_results_df

    # Round float columns for display
    output_df = output_df.select(
        [
            pl.col(col).round(4) if output_df.schema[col] in [pl.Float32, pl.Float64] else col
            for col in output_df.columns
        ]
    )

    return (chart, output_df.drop("Mistakes", "Samples"))


def app() -> None:
    """Build and launch the Gradio leaderboard UI"""
    with gr.Blocks(title="Birder Leaderboard", analytics_enabled=False) as leaderboard:
        gr.HTML("<center><h1>The Birder Leaderboard</h1></center>")
        with gr.Row():
            with gr.Column():
                pass

            with gr.Column():
                gr.Markdown(
                    """
                    Leaderboard of all the pre-trained Birder models across all datasets.

                    * GPU: A5000 ADA Generation
                    * CPU: AMD Ryzen Threadripper PRO 7975WX
                    """
                )
            with gr.Column():
                pass

        with gr.Row():
            with gr.Column():
                pass

            with gr.Column():
                dataset_dropdown = gr.Dropdown(
                    choices=DATASETS,
                    label="Select Dataset",
                    value=DATASETS[0] if DATASETS else None,
                )
                benchmark_dropdown = gr.Dropdown(
                    choices=list(BENCHMARKS.keys()),
                    label="Select Benchmark",
                    value=next(iter(BENCHMARKS.keys())) if BENCHMARKS else None,
                    filterable=False,
                )
            with gr.Column():
                intermediate = gr.Checkbox(
                    label="Intermediate",
                    value=True,
                    info="Show models that underwent intermediate training (extra data)",
                )
                mim = gr.Checkbox(label="MIM", value=True, info="Show models with Masked Image Modeling pre-training")
                log_x = gr.Checkbox(label="Log scale X-axis", value=False)
            with gr.Column():
                pass

        plot = gr.Plot(container=False)
        table = gr.Dataframe(show_search=True)

        inputs = [dataset_dropdown, benchmark_dropdown, intermediate, mim, log_x]
        outputs = [plot, table]

        # Refresh the plot and table on load and whenever any control changes
        leaderboard.load(update_data, inputs=inputs, outputs=outputs)
        dataset_dropdown.change(update_data, inputs=inputs, outputs=outputs)
        benchmark_dropdown.change(update_data, inputs=inputs, outputs=outputs)
        intermediate.change(update_data, inputs=inputs, outputs=outputs)
        mim.change(update_data, inputs=inputs, outputs=outputs)
        log_x.change(update_data, inputs=inputs, outputs=outputs)

    leaderboard.launch()


# Launch the app
if __name__ == "__main__":
    app()