import os

import gradio as gr
import pandas as pd
import plotly.express as px
from apscheduler.schedulers.background import BackgroundScheduler

from src.assets.css_html_js import custom_css, custom_js
from src.assets.text_content import (
    TITLE,
    INTRODUCTION_TEXT,
    ABOUT_TEXT,
    EXAMPLE_CONFIG_TEXT,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
)
from src.utils import (
    change_tab,
    restart_space,
    load_dataset_repo,
    process_model_name,
    process_model_type,
)
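
# llm-perf-leaderboard: a Gradio Space that merges benchmark reports from the
# llm-perf dataset with Open LLM Leaderboard scores into a table and a plot.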

LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)

# weight classes remapped to their true class before display
TRUE_WEIGHT_CLASSES = {
    "6B": "7B",
}

ALL_COLUMNS_MAPPING = {
    "model_type": "Type 🤗",
    "weight_class": "Class 🏋️",
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "Dtype 📥",
    "optimizations": "Optimizations 🛠️",
    "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
    "best_scored_model": "Best Scored Model 🏆",
    "best_score": "Best Score (%) ⬆️",
}
# one datatype per displayed column, in the same order as ALL_COLUMNS_MAPPING
ALL_COLUMNS_DATATYPES = [
    "str",
    "str",
    "str",
    "str",
    "str",
    "number",
    "markdown",
    "number",
]
SORTING_COLUMN = ["tradeoff"]

# local clone of the dataset repo, refreshed before every read
llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)


def get_benchmark_df(benchmark="1xA100-80GB"):
    if llm_perf_dataset_repo:
        llm_perf_dataset_repo.git_pull()

    # load the benchmark report and the Open LLM Leaderboard scores
    bench_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
    scores_df = pd.read_csv(
        "./llm-perf-dataset/reports/Weighted+Classed-Open-LLM-Leaderboard.csv"
    )

    # merge the two reports on a weight_class + model_type key
    bench_df["merge_id"] = bench_df.experiment_name.str.split("_1_1000_").str[-1]
    scores_df["merge_id"] = scores_df.weight_class + "_" + scores_df.model_type
    merged_df = bench_df.merge(scores_df, on="merge_id")

    # correct mislabeled weight classes
    merged_df["weight_class"] = merged_df["weight_class"].apply(
        lambda x: TRUE_WEIGHT_CLASSES.get(x, x)
    )

    # summarize the enabled backend optimizations in one readable column;
    # "== True" is intentional, it keeps missing values falsy
    merged_df["optimizations"] = merged_df[
        ["backend.bettertransformer", "backend.load_in_8bit", "backend.load_in_4bit"]
    ].apply(
        lambda row: ", ".join(
            filter(
                None,
                [
                    "BetterTransformer" if row.iloc[0] == True else "",
                    "LLM.int8" if row.iloc[1] == True else "",
                    "LLM.fp4" if row.iloc[2] == True else "",
                ],
            ),
        )
        if (row == True).any()
        else "None",
        axis=1,
    )
    merged_df["quantized"] = merged_df["optimizations"].str.contains("LLM.int8|LLM.fp4")

    # tradeoff = euclidean distance to the ideal point (100% score, 0s latency)
    score_distance = 100 - merged_df["best_score"]
    latency_distance = merged_df["generate.latency(s)"]
    merged_df["tradeoff"] = (score_distance**2 + latency_distance**2) ** 0.5
    merged_df["tradeoff"] = merged_df["tradeoff"].round(2)

    return merged_df
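

# For instance (hypothetical numbers): best_score=60 with an 80s per-1000-token
# latency gives tradeoff = ((100 - 60) ** 2 + 80 ** 2) ** 0.5 ≈ 89.44;
# lower is better, so the table below sorts ascending on this column.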


def get_benchmark_table(bench_df):
    copy_df = bench_df.copy()
    # mark the scores of quantized models with a double asterisk
    copy_df["best_score"] = copy_df.apply(
        lambda x: f"{x['best_score']}**" if x["quantized"] else x["best_score"],
        axis=1,
    )
    # sort by the score/latency tradeoff, best first
    copy_df.sort_values(by=SORTING_COLUMN, ascending=True, inplace=True)
    # keep only the displayed columns, renamed to their pretty names
    copy_df = copy_df[list(ALL_COLUMNS_MAPPING.keys())]
    copy_df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)
    # prettify model types and names for display
    copy_df["Type 🤗"] = copy_df["Type 🤗"].apply(process_model_type)
    copy_df["Best Scored Model 🏆"] = copy_df["Best Scored Model 🏆"].apply(
        process_model_name
    )

    return copy_df


def get_benchmark_plot(bench_df):
    fig = px.scatter(
        bench_df,
        x="generate.latency(s)",
        y="best_score",
        color="model_type",
        custom_data=[
            "best_scored_model",
            "backend.name",
            "backend.torch_dtype",
            "optimizations",
            "generate.throughput(tokens/s)",
        ],
        color_discrete_sequence=px.colors.qualitative.Light24,
    )

    fig.update_layout(
        title={
            "text": "Model Score vs. Latency",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Per 1000 Tokens Latency (s)",
        yaxis_title="Open LLM Score (%)",
        legend_title="Model Type",
        width=1200,
        height=600,
    )

    fig.update_traces(
        hovertemplate="<br>".join(
            [
                "Model: %{customdata[0]}",
                "Backend: %{customdata[1]}",
                "Load Datatype: %{customdata[2]}",
                "Optimizations: %{customdata[3]}",
                "Throughput (tokens/s): %{customdata[4]}",
                "Per 1000 Tokens Latency (s): %{x}",
                "Open LLM Score (%): %{y}",
            ]
        )
    )

    return fig
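

# e.g. building the full, unfiltered view for the default benchmark:
# df = get_benchmark_df()
# table, plot = get_benchmark_table(df), get_benchmark_plot(df)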


def filter_query(
    text,
    backends,
    datatypes,
    optimizations,
    score,
    benchmark="1xA100-80GB",
):
    raw_df = get_benchmark_df(benchmark=benchmark)

    filtered_df = raw_df[
        raw_df["best_scored_model"].str.lower().str.contains(text.lower())
        & raw_df["backend.name"].isin(backends)
        & raw_df["backend.torch_dtype"].isin(datatypes)
        & (
            # a row matches if it has any of the selected optimizations
            pd.concat(
                [
                    raw_df["optimizations"].str.contains(optimization)
                    for optimization in optimizations
                ],
                axis=1,
            ).any(axis="columns")
            if len(optimizations) > 0
            else True
        )
        & (raw_df["best_score"] >= score)
    ]

    filtered_table = get_benchmark_table(filtered_df)
    filtered_plot = get_benchmark_plot(filtered_df)

    return filtered_table, filtered_plot
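

# A minimal usage sketch; the argument values here are illustrative, not the
# app's actual widget defaults:
# table, plot = filter_query(
#     text="llama",
#     backends=["pytorch"],
#     datatypes=["float16"],
#     optimizations=["BetterTransformer"],
#     score=0,
# )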


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)

    gr.Markdown(INTRODUCTION_TEXT, elem_classes="descriptive-text")

    gr.HTML(
        "🚧 This leaderboard is currently under maintenance. 🚧",
        elem_classes="descriptive-text",
    )
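
    # The interactive view is disabled during maintenance. A minimal sketch of
    # how the helpers above could be wired back in (widget names and choices
    # here are assumptions, not the removed originals):
    #
    # search_bar = gr.Textbox(label="Model 🤗", placeholder="Search...")
    # backend_checkboxes = gr.CheckboxGroup(["pytorch"], label="Backend 🏭")
    # datatype_checkboxes = gr.CheckboxGroup(["float16", "float32"], label="Dtype 📥")
    # optimization_checkboxes = gr.CheckboxGroup(
    #     ["BetterTransformer", "LLM.int8", "LLM.fp4"], label="Optimizations 🛠️"
    # )
    # score_slider = gr.Slider(0, 100, value=0, label="Best Score (%) ⬆️")
    # submit_button = gr.Button("Filter 🚀")
    # benchmark_table = gr.Dataframe(datatype=ALL_COLUMNS_DATATYPES)
    # benchmark_plot = gr.Plot()
    # submit_button.click(
    #     filter_query,
    #     inputs=[search_bar, backend_checkboxes, datatype_checkboxes,
    #             optimization_checkboxes, score_slider],
    #     outputs=[benchmark_table, benchmark_plot],
    # )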


# restart the Space every hour so it redeploys with freshly pulled data
scheduler = BackgroundScheduler()
scheduler.add_job(
    restart_space,
    "interval",
    seconds=3600,
    args=[LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN],
)
scheduler.start()

demo.queue(concurrency_count=40).launch()