Spaces:
Sleeping
Sleeping
Commit
Β·
ab5f5f1
1
Parent(s):
988dbd8
update
Browse files- app.py +71 -344
- huggy_bench.png β logo.png +0 -0
- pyproject.toml +21 -0
- script.py +0 -14
- src/{assets/css_html_js.py β assets.py} +3 -3
- src/bettertransformer.py +148 -0
- src/control_panel.py +168 -0
- src/flashattentionv2.py +148 -0
- src/latency_score_memory.py +67 -0
- src/leaderboard.py +60 -0
- src/llm_perf.py +127 -0
- src/{assets/text_content.py β text.py} +32 -18
- src/utils.py +21 -28
app.py
CHANGED
|
@@ -1,371 +1,98 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
import gradio as gr
|
| 4 |
-
import pandas as pd
|
| 5 |
-
import plotly.express as px
|
| 6 |
-
from huggingface_hub.file_download import hf_hub_download
|
| 7 |
|
| 8 |
-
|
| 9 |
-
from src.
|
| 10 |
-
from src.
|
| 11 |
-
from src.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
TITLE,
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
| 16 |
CITATION_BUTTON_LABEL,
|
| 17 |
-
CITATION_BUTTON_TEXT,
|
| 18 |
)
|
| 19 |
|
| 20 |
-
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
| 21 |
-
LOGO_URL = "https://huggingface.co/spaces/optimum/llm-perf-leaderboard/resolve/main/huggy_bench.png"
|
| 22 |
-
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
|
| 23 |
-
ALL_COLUMNS_MAPPING = {
|
| 24 |
-
"Model": "Model π€",
|
| 25 |
-
"Arch": "Arch ποΈ",
|
| 26 |
-
"Size": "Params (B) π",
|
| 27 |
-
# deployment settings
|
| 28 |
-
"backend.name": "Backend π",
|
| 29 |
-
"backend.torch_dtype": "Dtype π₯",
|
| 30 |
-
"optimization": "Optimization π οΈ",
|
| 31 |
-
"quantization": "Quantization ποΈ",
|
| 32 |
-
# measurements
|
| 33 |
-
"Score": "Open LLM Score (%) β¬οΈ",
|
| 34 |
-
"decode.throughput(tokens/s)": "Decode Throughput (tokens/s) β¬οΈ",
|
| 35 |
-
"generate.throughput(tokens/s)": "E2E Throughput (tokens/s) β¬οΈ",
|
| 36 |
-
"forward.latency(s)": "Prefill Latency (s) β¬οΈ",
|
| 37 |
-
"generate.latency(s)": "E2E Latency (s) β¬οΈ",
|
| 38 |
-
"generate.max_memory_allocated(MB)": "Allocated Memory (MB) β¬οΈ",
|
| 39 |
-
"generate.max_memory_reserved(MB)": "Reserved Memory (MB) β¬οΈ",
|
| 40 |
-
"generate.max_memory_used(MB)": "Used Memory (MB) β¬οΈ",
|
| 41 |
-
"generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) β¬οΈ",
|
| 42 |
-
}
|
| 43 |
-
SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
|
| 44 |
-
SORTING_ASCENDING = [False, False]
|
| 45 |
-
ALL_COLUMNS_DATATYPES = [
|
| 46 |
-
# open llm
|
| 47 |
-
"markdown",
|
| 48 |
-
"markdown",
|
| 49 |
-
"number",
|
| 50 |
-
# deployment settings
|
| 51 |
-
"str",
|
| 52 |
-
"str",
|
| 53 |
-
"str",
|
| 54 |
-
"str",
|
| 55 |
-
# measurements
|
| 56 |
-
"number",
|
| 57 |
-
"number",
|
| 58 |
-
"number",
|
| 59 |
-
"number",
|
| 60 |
-
"number",
|
| 61 |
-
"number",
|
| 62 |
-
"number",
|
| 63 |
-
"number",
|
| 64 |
-
"number",
|
| 65 |
-
"number",
|
| 66 |
-
]
|
| 67 |
-
# download data
|
| 68 |
-
hf_hub_download(
|
| 69 |
-
repo_id="optimum/llm-perf-dataset",
|
| 70 |
-
filename="open-llm.csv",
|
| 71 |
-
local_dir="dataset",
|
| 72 |
-
repo_type="dataset",
|
| 73 |
-
token=HF_TOKEN,
|
| 74 |
-
)
|
| 75 |
-
OPEN_LLM_DF = pd.read_csv("dataset/open-llm.csv")
|
| 76 |
|
|
|
|
| 77 |
MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB π₯οΈ"}
|
| 78 |
-
|
| 79 |
-
for machine in MACHINE_TO_HARDWARE:
|
| 80 |
-
hf_hub_download(
|
| 81 |
-
repo_id="optimum/llm-perf-dataset",
|
| 82 |
-
filename=f"{machine}/perf-report.csv",
|
| 83 |
-
local_dir="dataset",
|
| 84 |
-
repo_type="dataset",
|
| 85 |
-
token=HF_TOKEN,
|
| 86 |
-
)
|
| 87 |
-
MACHINE_TO_PERF[machine] = pd.read_csv(f"dataset/{machine}/perf-report.csv")
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
def get_benchmark_df(machine="hf-dgx-01"):
|
| 91 |
-
# merge on model
|
| 92 |
-
machine_perf_df = MACHINE_TO_PERF[machine].copy()
|
| 93 |
-
merged_df = OPEN_LLM_DF.merge(machine_perf_df, left_on="Model", right_on="model")
|
| 94 |
-
# transpose energy consumption
|
| 95 |
-
merged_df["generate.energy_consumption(tokens/kWh)"] = (
|
| 96 |
-
1 / merged_df["generate.energy_consumption(kWh/token)"].fillna(1)
|
| 97 |
-
).astype(int)
|
| 98 |
-
# fix nan values
|
| 99 |
-
merged_df.loc[
|
| 100 |
-
merged_df["generate.energy_consumption(tokens/kWh)"] == 1,
|
| 101 |
-
"generate.energy_consumption(tokens/kWh)",
|
| 102 |
-
] = pd.NA
|
| 103 |
-
# add optimization column
|
| 104 |
-
merged_df["optimization"] = merged_df[
|
| 105 |
-
["backend.to_bettertransformer", "backend.use_flash_attention_2"]
|
| 106 |
-
].apply(
|
| 107 |
-
lambda x: "BetterTransformer"
|
| 108 |
-
if x["backend.to_bettertransformer"]
|
| 109 |
-
else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None"),
|
| 110 |
-
axis=1,
|
| 111 |
-
)
|
| 112 |
-
# add quantization scheme
|
| 113 |
-
merged_df["quantization"] = merged_df[
|
| 114 |
-
["backend.quantization_scheme", "backend.quantization_config.exllama_config.version"]
|
| 115 |
-
].apply(
|
| 116 |
-
lambda x: "BnB.4bit"
|
| 117 |
-
if x["backend.quantization_scheme"] == "bnb"
|
| 118 |
-
else (
|
| 119 |
-
"GPTQ.4bit+ExllamaV1"
|
| 120 |
-
if (x["backend.quantization_scheme"] == "gptq")
|
| 121 |
-
and (x["backend.quantization_config.exllama_config.version"] == 1)
|
| 122 |
-
else (
|
| 123 |
-
"GPTQ.4bit+ExllamaV2"
|
| 124 |
-
if (x["backend.quantization_scheme"] == "gptq")
|
| 125 |
-
and (x["backend.quantization_config.exllama_config.version"] == 2)
|
| 126 |
-
else "None"
|
| 127 |
-
)
|
| 128 |
-
),
|
| 129 |
-
axis=1,
|
| 130 |
-
)
|
| 131 |
-
# add decode throughput
|
| 132 |
-
merged_df["decode.throughput(tokens/s)"] = (
|
| 133 |
-
1000 / (merged_df["generate.latency(s)"] - merged_df["forward.latency(s)"])
|
| 134 |
-
).round(2)
|
| 135 |
-
# sort by metric
|
| 136 |
-
merged_df.sort_values(by=SORTING_COLUMN, ascending=SORTING_ASCENDING, inplace=True)
|
| 137 |
-
# filter columns
|
| 138 |
-
merged_df = merged_df[list(ALL_COLUMNS_MAPPING.keys())]
|
| 139 |
-
# rename columns
|
| 140 |
-
merged_df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)
|
| 141 |
-
|
| 142 |
-
return merged_df
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
def get_benchmark_table(bench_df):
|
| 146 |
-
copy_df = bench_df.copy()
|
| 147 |
-
# transform
|
| 148 |
-
copy_df["Model π€"] = copy_df["Model π€"].apply(process_model_name)
|
| 149 |
-
copy_df["Arch ποΈ"] = copy_df["Arch ποΈ"].apply(process_model_arch)
|
| 150 |
-
# process quantization
|
| 151 |
-
copy_df["Open LLM Score (%) β¬οΈ"] = copy_df.apply(
|
| 152 |
-
lambda x: f"{x['Open LLM Score (%) β¬οΈ']}**"
|
| 153 |
-
if x["Quantization ποΈ"] in ["BnB.4bit", "GPTQ.4bit"]
|
| 154 |
-
else x["Open LLM Score (%) β¬οΈ"],
|
| 155 |
-
axis=1,
|
| 156 |
-
)
|
| 157 |
-
return copy_df
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
def get_benchmark_chart(bench_df):
|
| 161 |
-
copy_df = bench_df.copy()
|
| 162 |
-
# transform
|
| 163 |
-
copy_df["Arch ποΈ"] = copy_df["Arch ποΈ"].apply(process_model_arch)
|
| 164 |
-
# plot
|
| 165 |
-
fig = px.scatter(
|
| 166 |
-
copy_df,
|
| 167 |
-
y="Open LLM Score (%) β¬οΈ",
|
| 168 |
-
x="E2E Latency (s) β¬οΈ",
|
| 169 |
-
size="Allocated Memory (MB) β¬οΈ",
|
| 170 |
-
color="Arch ποΈ",
|
| 171 |
-
custom_data=list(ALL_COLUMNS_MAPPING.values()),
|
| 172 |
-
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 173 |
-
)
|
| 174 |
-
fig.update_layout(
|
| 175 |
-
title={
|
| 176 |
-
"text": "Latency vs. Score vs. Memory",
|
| 177 |
-
"y": 0.95,
|
| 178 |
-
"x": 0.5,
|
| 179 |
-
"xanchor": "center",
|
| 180 |
-
"yanchor": "top",
|
| 181 |
-
},
|
| 182 |
-
xaxis_title="Per 1000 Tokens Latency (s)",
|
| 183 |
-
yaxis_title="Open LLM Score (%)",
|
| 184 |
-
legend_title="LLM Architecture",
|
| 185 |
-
width=1200,
|
| 186 |
-
height=600,
|
| 187 |
-
)
|
| 188 |
-
fig.update_traces(
|
| 189 |
-
hovertemplate="<br>".join(
|
| 190 |
-
[
|
| 191 |
-
f"<b>{column}:</b> %{{customdata[{i}]}}"
|
| 192 |
-
for i, column in enumerate(ALL_COLUMNS_MAPPING.values())
|
| 193 |
-
]
|
| 194 |
-
)
|
| 195 |
-
)
|
| 196 |
-
return fig
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
def filter_query(
|
| 200 |
-
text,
|
| 201 |
-
backends,
|
| 202 |
-
datatypes,
|
| 203 |
-
optimizations,
|
| 204 |
-
quantizations,
|
| 205 |
-
score,
|
| 206 |
-
memory,
|
| 207 |
-
machine,
|
| 208 |
-
):
|
| 209 |
-
raw_df = get_benchmark_df(machine=machine)
|
| 210 |
-
filtered_df = raw_df[
|
| 211 |
-
raw_df["Model π€"].str.contains(text, case=False)
|
| 212 |
-
& raw_df["Backend π"].isin(backends)
|
| 213 |
-
& raw_df["Dtype π₯"].isin(datatypes)
|
| 214 |
-
& raw_df["Optimization π οΈ"].isin(optimizations)
|
| 215 |
-
& raw_df["Quantization ποΈ"].isin(quantizations)
|
| 216 |
-
& (raw_df["Open LLM Score (%) β¬οΈ"] >= score)
|
| 217 |
-
& (raw_df["Allocated Memory (MB) β¬οΈ"] <= memory)
|
| 218 |
-
]
|
| 219 |
-
filtered_table = get_benchmark_table(filtered_df)
|
| 220 |
-
filtered_chart = get_benchmark_chart(filtered_df)
|
| 221 |
-
return filtered_table, filtered_chart
|
| 222 |
|
| 223 |
|
| 224 |
-
# Demo interface
|
| 225 |
demo = gr.Blocks(css=custom_css)
|
| 226 |
with demo:
|
| 227 |
-
|
| 228 |
gr.HTML(f'<img src="{LOGO_URL}">', elem_classes="logo")
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
| 247 |
with gr.TabItem("Leaderboard π
", id=0):
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
)
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
)
|
| 277 |
-
with gr.Row():
|
| 278 |
-
with gr.Column():
|
| 279 |
-
search_bar = gr.Textbox(
|
| 280 |
-
label="Model π€",
|
| 281 |
-
info="π Search for a model name",
|
| 282 |
-
elem_id="search-bar",
|
| 283 |
-
)
|
| 284 |
-
with gr.Row():
|
| 285 |
-
with gr.Column(scale=1):
|
| 286 |
-
score_slider = gr.Slider(
|
| 287 |
-
label="Open LLM Score (%) π",
|
| 288 |
-
info="ποΈ Slide to minimum Open LLM score",
|
| 289 |
-
value=0,
|
| 290 |
-
elem_id="threshold-slider",
|
| 291 |
-
)
|
| 292 |
-
with gr.Column(scale=1):
|
| 293 |
-
memory_slider = gr.Slider(
|
| 294 |
-
label="Peak Memory (MB) π",
|
| 295 |
-
info="ποΈ Slide to maximum Peak Memory",
|
| 296 |
-
minimum=0,
|
| 297 |
-
maximum=80 * 1024,
|
| 298 |
-
value=80 * 1024,
|
| 299 |
-
elem_id="memory-slider",
|
| 300 |
-
)
|
| 301 |
-
with gr.Column(scale=1):
|
| 302 |
-
backend_checkboxes = gr.CheckboxGroup(
|
| 303 |
-
label="Backends π",
|
| 304 |
-
choices=["pytorch", "onnxruntime"],
|
| 305 |
-
value=["pytorch", "onnxruntime"],
|
| 306 |
-
info="βοΈ Select the backends",
|
| 307 |
-
elem_id="backend-checkboxes",
|
| 308 |
-
)
|
| 309 |
-
with gr.Row():
|
| 310 |
-
with gr.Column(scale=1):
|
| 311 |
-
datatype_checkboxes = gr.CheckboxGroup(
|
| 312 |
-
label="Load Dtypes π₯",
|
| 313 |
-
choices=["float32", "float16"],
|
| 314 |
-
value=["float32", "float16"],
|
| 315 |
-
info="βοΈ Select the load dtypes",
|
| 316 |
-
elem_id="dtype-checkboxes",
|
| 317 |
-
)
|
| 318 |
-
with gr.Column(scale=1):
|
| 319 |
-
optimization_checkboxes = gr.CheckboxGroup(
|
| 320 |
-
label="Optimizations π οΈ",
|
| 321 |
-
choices=["None", "BetterTransformer", "FlashAttentionV2"],
|
| 322 |
-
value=["None", "BetterTransformer", "FlashAttentionV2"],
|
| 323 |
-
info="βοΈ Select the optimization",
|
| 324 |
-
elem_id="optimization-checkboxes",
|
| 325 |
-
)
|
| 326 |
-
with gr.Column(scale=1):
|
| 327 |
-
quantization_checkboxes = gr.CheckboxGroup(
|
| 328 |
-
label="Quantizations ποΈ",
|
| 329 |
-
choices=["None", "BnB.4bit", "GPTQ.4bit"],
|
| 330 |
-
value=["None", "BnB.4bit", "GPTQ.4bit"],
|
| 331 |
-
info="βοΈ Select the quantization schemes",
|
| 332 |
-
elem_id="quantization-checkboxes",
|
| 333 |
-
)
|
| 334 |
-
with gr.Row():
|
| 335 |
-
filter_button = gr.Button(
|
| 336 |
-
value="Filter π",
|
| 337 |
-
elem_id="filter-button",
|
| 338 |
-
)
|
| 339 |
-
for machine in MACHINE_TO_HARDWARE:
|
| 340 |
-
filter_button.click(
|
| 341 |
-
filter_query,
|
| 342 |
-
[
|
| 343 |
-
search_bar,
|
| 344 |
-
backend_checkboxes,
|
| 345 |
-
datatype_checkboxes,
|
| 346 |
-
optimization_checkboxes,
|
| 347 |
-
quantization_checkboxes,
|
| 348 |
-
score_slider,
|
| 349 |
-
memory_slider,
|
| 350 |
-
machine_placeholders[machine],
|
| 351 |
-
],
|
| 352 |
-
[machine_tables[machine], machine_plots[machine]],
|
| 353 |
)
|
| 354 |
-
|
| 355 |
####################### ABOUT TAB #######################
|
| 356 |
with gr.TabItem("About π", id=3):
|
| 357 |
-
gr.HTML(
|
| 358 |
-
gr.Markdown(
|
| 359 |
-
|
| 360 |
-
####################### CITATION #######################
|
| 361 |
with gr.Row():
|
| 362 |
with gr.Accordion("π Citation", open=False):
|
| 363 |
citation_button = gr.Textbox(
|
| 364 |
-
value=
|
| 365 |
label=CITATION_BUTTON_LABEL,
|
| 366 |
elem_id="citation-button",
|
| 367 |
show_copy_button=True,
|
| 368 |
)
|
| 369 |
|
| 370 |
-
|
| 371 |
-
demo
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
+
from src.control_panel import create_control_panel, create_control_callback
|
| 6 |
+
from src.latency_score_memory import create_lat_score_mem_plot
|
| 7 |
+
from src.leaderboard import create_leaderboard_table
|
| 8 |
+
from src.flashattentionv2 import create_fa2_plots
|
| 9 |
+
from src.bettertransformer import create_bt_plots
|
| 10 |
+
from src.llm_perf import get_llm_perf_df
|
| 11 |
+
from src.assets import custom_css
|
| 12 |
+
from src.text import (
|
| 13 |
TITLE,
|
| 14 |
+
ABOUT,
|
| 15 |
+
INTRODUCTION,
|
| 16 |
+
EXAMPLE_CONFIG,
|
| 17 |
+
CITATION_BUTTON,
|
| 18 |
CITATION_BUTTON_LABEL,
|
|
|
|
| 19 |
)
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
LOGO_URL = "https://huggingface.co/spaces/optimum/llm-perf-leaderboard/resolve/main/logo.png"
|
| 23 |
MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB π₯οΈ"}
|
| 24 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
|
|
|
|
| 27 |
demo = gr.Blocks(css=custom_css)
|
| 28 |
with demo:
|
| 29 |
+
gr.HTML(TITLE, elem_classes="title")
|
| 30 |
gr.HTML(f'<img src="{LOGO_URL}">', elem_classes="logo")
|
| 31 |
+
gr.Markdown(INTRODUCTION, elem_classes="descriptive-text")
|
| 32 |
+
####################### HARDWARE TABS #######################
|
| 33 |
+
with gr.Tabs(elem_classes="tabs"):
|
| 34 |
+
for id, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
|
| 35 |
+
with gr.TabItem(hardware, id=id):
|
| 36 |
+
####################### CONTROL PANEL #######################
|
| 37 |
+
(
|
| 38 |
+
filter_button,
|
| 39 |
+
machine_textbox,
|
| 40 |
+
search_bar,
|
| 41 |
+
score_slider,
|
| 42 |
+
memory_slider,
|
| 43 |
+
backend_checkboxes,
|
| 44 |
+
datatype_checkboxes,
|
| 45 |
+
optimization_checkboxes,
|
| 46 |
+
quantization_checkboxes,
|
| 47 |
+
) = create_control_panel()
|
| 48 |
+
####################### HARDWARE SUBTABS #######################
|
| 49 |
+
with gr.Tabs(elem_classes="subtabs"):
|
| 50 |
+
llm_perf_df = get_llm_perf_df(machine=machine)
|
| 51 |
+
####################### LEADERBOARD TAB #######################
|
| 52 |
with gr.TabItem("Leaderboard π
", id=0):
|
| 53 |
+
leaderboard_table = create_leaderboard_table(llm_perf_df)
|
| 54 |
+
####################### LAT. vs. SCORE vs. MEM. TAB #######################
|
| 55 |
+
with gr.TabItem("Latency vs. Score vs. Memory π", id=1):
|
| 56 |
+
lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
|
| 57 |
+
####################### BETTERTRANSFORMER SPEEDUP TAB #######################
|
| 58 |
+
with gr.TabItem("BetterTransformer Speedup π", id=2):
|
| 59 |
+
bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
|
| 60 |
+
with gr.TabItem("FlashAttentionV2 Speedup π", id=3):
|
| 61 |
+
fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
|
| 62 |
+
####################### CONTROL CALLBACK #######################
|
| 63 |
+
create_control_callback(
|
| 64 |
+
filter_button,
|
| 65 |
+
# inputs
|
| 66 |
+
machine_textbox,
|
| 67 |
+
search_bar,
|
| 68 |
+
score_slider,
|
| 69 |
+
memory_slider,
|
| 70 |
+
backend_checkboxes,
|
| 71 |
+
datatype_checkboxes,
|
| 72 |
+
optimization_checkboxes,
|
| 73 |
+
quantization_checkboxes,
|
| 74 |
+
# outputs
|
| 75 |
+
leaderboard_table,
|
| 76 |
+
lat_score_mem_plot,
|
| 77 |
+
bt_prefill_plot,
|
| 78 |
+
bt_decode_plot,
|
| 79 |
+
fa2_prefill_plot,
|
| 80 |
+
fa2_decode_plot,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
)
|
|
|
|
| 82 |
####################### ABOUT TAB #######################
|
| 83 |
with gr.TabItem("About π", id=3):
|
| 84 |
+
gr.HTML(ABOUT, elem_classes="descriptive-text")
|
| 85 |
+
gr.Markdown(EXAMPLE_CONFIG, elem_classes="descriptive-text")
|
| 86 |
+
####################### CITATION
|
|
|
|
| 87 |
with gr.Row():
|
| 88 |
with gr.Accordion("π Citation", open=False):
|
| 89 |
citation_button = gr.Textbox(
|
| 90 |
+
value=CITATION_BUTTON,
|
| 91 |
label=CITATION_BUTTON_LABEL,
|
| 92 |
elem_id="citation-button",
|
| 93 |
show_copy_button=True,
|
| 94 |
)
|
| 95 |
|
| 96 |
+
if __name__ == "__main__":
|
| 97 |
+
# Launch demo
|
| 98 |
+
demo.queue().launch()
|
huggy_bench.png β logo.png
RENAMED
|
File without changes
|
pyproject.toml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
[tool.black]
|
| 16 |
+
line-length = 119
|
| 17 |
+
target-version = ['py37']
|
| 18 |
+
|
| 19 |
+
[tool.ruff]
|
| 20 |
+
ignore = ["E501", "C901"]
|
| 21 |
+
select = ["C", "E", "F", "I", "W"]
|
script.py
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
from huggingface_hub import hf_hub_download
|
| 2 |
-
import pandas as pd
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
hf_hub_download(
|
| 6 |
-
repo_id="optimum/llm-perf-dataset",
|
| 7 |
-
filename="open-llm.csv",
|
| 8 |
-
local_dir="dataset",
|
| 9 |
-
repo_type="dataset",
|
| 10 |
-
)
|
| 11 |
-
|
| 12 |
-
open_llm = pd.read_csv("dataset/open-llm.csv")
|
| 13 |
-
print(open_llm["Arch"].unique())
|
| 14 |
-
print(open_llm[open_llm["Arch"] == "rwkv"]["Model"].unique())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/{assets/css_html_js.py β assets.py}
RENAMED
|
@@ -6,14 +6,14 @@ custom_css = """
|
|
| 6 |
max-width: 100%
|
| 7 |
object-fit: contain;
|
| 8 |
}
|
| 9 |
-
.
|
| 10 |
font-size: 16px !important;
|
| 11 |
}
|
| 12 |
|
| 13 |
-
.
|
| 14 |
font-size: 20px;
|
| 15 |
}
|
| 16 |
-
.
|
| 17 |
font-size: 20px;
|
| 18 |
}
|
| 19 |
|
|
|
|
| 6 |
max-width: 100%
|
| 7 |
object-fit: contain;
|
| 8 |
}
|
| 9 |
+
.text {
|
| 10 |
font-size: 16px !important;
|
| 11 |
}
|
| 12 |
|
| 13 |
+
.tabs button {
|
| 14 |
font-size: 20px;
|
| 15 |
}
|
| 16 |
+
.subtabs button {
|
| 17 |
font-size: 20px;
|
| 18 |
}
|
| 19 |
|
src/bettertransformer.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import plotly.express as px
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
from src.utils import process_arch
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
BETTERTRANSFORMER_DATA = [
|
| 10 |
+
# open llm
|
| 11 |
+
"Model π€",
|
| 12 |
+
"Arch ποΈ",
|
| 13 |
+
"DType π₯",
|
| 14 |
+
"Backend π",
|
| 15 |
+
"Params (B)",
|
| 16 |
+
"Open LLM Score (%)",
|
| 17 |
+
# deployment settings
|
| 18 |
+
"DType π₯",
|
| 19 |
+
"Backend π",
|
| 20 |
+
"Quantization ποΈ",
|
| 21 |
+
# primary measurements
|
| 22 |
+
"Prefill Latency (s)",
|
| 23 |
+
"Prefill Latency (s) BetterTransformer",
|
| 24 |
+
"Decode Throughput (tokens/s)",
|
| 25 |
+
"Decode Throughput (tokens/s) BetterTransformer",
|
| 26 |
+
"E2E Throughput (tokens/s)",
|
| 27 |
+
"E2E Throughput (tokens/s) BetterTransformer",
|
| 28 |
+
# speedups
|
| 29 |
+
"Prefill Latency Speedup (%)",
|
| 30 |
+
"Decode Throughput Speedup (%)",
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def get_bt_df(llm_perf_df):
|
| 35 |
+
bt_df = llm_perf_df.copy()
|
| 36 |
+
# process
|
| 37 |
+
bt_df["Arch ποΈ"] = bt_df["Arch ποΈ"].apply(process_arch)
|
| 38 |
+
# seperate original model experiments from BetterTransformer experiments
|
| 39 |
+
original_df = bt_df[bt_df["Optimization π οΈ"] == "None"]
|
| 40 |
+
bt_df = bt_df[bt_df["Optimization π οΈ"] == "BetterTransformer"]
|
| 41 |
+
# merge the two dataframes
|
| 42 |
+
bt_df = pd.merge(
|
| 43 |
+
original_df,
|
| 44 |
+
bt_df,
|
| 45 |
+
on=["Model π€", "Quantization ποΈ"],
|
| 46 |
+
suffixes=["", " BetterTransformer"],
|
| 47 |
+
)
|
| 48 |
+
# compute speedups
|
| 49 |
+
bt_df["Prefill Latency Speedup (%)"] = (
|
| 50 |
+
(bt_df["Prefill Latency (s)"] / bt_df["Prefill Latency (s) BetterTransformer"]) * 100
|
| 51 |
+
).round(2)
|
| 52 |
+
bt_df["Decode Throughput Speedup (%)"] = (
|
| 53 |
+
(bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
|
| 54 |
+
).round(2)
|
| 55 |
+
|
| 56 |
+
# filter speedups > 1000%
|
| 57 |
+
bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
|
| 58 |
+
bt_df = bt_df[bt_df["Decode Throughput Speedup (%)"] < 1000]
|
| 59 |
+
|
| 60 |
+
return bt_df
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def get_bt_decode_fig(llm_perf_df):
|
| 64 |
+
bt_df = get_bt_df(llm_perf_df)
|
| 65 |
+
# plot
|
| 66 |
+
decode_fig = px.box(
|
| 67 |
+
bt_df,
|
| 68 |
+
x="Arch ποΈ",
|
| 69 |
+
y="Decode Throughput Speedup (%)",
|
| 70 |
+
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 71 |
+
custom_data=BETTERTRANSFORMER_DATA,
|
| 72 |
+
color="Quantization ποΈ",
|
| 73 |
+
points="all",
|
| 74 |
+
)
|
| 75 |
+
# add hover data
|
| 76 |
+
decode_fig.update_traces(
|
| 77 |
+
hovertemplate="<br>".join(
|
| 78 |
+
[f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
|
| 79 |
+
)
|
| 80 |
+
)
|
| 81 |
+
# add layout
|
| 82 |
+
decode_fig.update_layout(
|
| 83 |
+
title={
|
| 84 |
+
"text": "Decode Throughput Speedup per Architecture",
|
| 85 |
+
"y": 0.95,
|
| 86 |
+
"x": 0.5,
|
| 87 |
+
"xanchor": "center",
|
| 88 |
+
"yanchor": "top",
|
| 89 |
+
},
|
| 90 |
+
xaxis_title="LLM Architecture",
|
| 91 |
+
yaxis_title="Decode Speedup (%)",
|
| 92 |
+
legend_title="Quantization Scheme",
|
| 93 |
+
width=1200,
|
| 94 |
+
height=600,
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
return decode_fig
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def get_bt_prefill_fig(llm_perf_df):
|
| 101 |
+
bt_df = get_bt_df(llm_perf_df)
|
| 102 |
+
# plot
|
| 103 |
+
prefill_fig = px.box(
|
| 104 |
+
bt_df,
|
| 105 |
+
x="Arch ποΈ",
|
| 106 |
+
y="Prefill Latency Speedup (%)",
|
| 107 |
+
color_discrete_sequence=px.colors.qualitative.Light24,
|
| 108 |
+
custom_data=BETTERTRANSFORMER_DATA,
|
| 109 |
+
color="Quantization ποΈ",
|
| 110 |
+
points="all",
|
| 111 |
+
)
|
| 112 |
+
# add hover data
|
| 113 |
+
prefill_fig.update_traces(
|
| 114 |
+
hovertemplate="<br>".join(
|
| 115 |
+
[f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
|
| 116 |
+
)
|
| 117 |
+
)
|
| 118 |
+
# add layout
|
| 119 |
+
prefill_fig.update_layout(
|
| 120 |
+
title={
|
| 121 |
+
"text": "Prefill Latency Speedup per Architecture",
|
| 122 |
+
"y": 0.95,
|
| 123 |
+
"x": 0.5,
|
| 124 |
+
"xanchor": "center",
|
| 125 |
+
"yanchor": "top",
|
| 126 |
+
},
|
| 127 |
+
xaxis_title="LLM Architecture",
|
| 128 |
+
yaxis_title="Prefill Speedup (%)",
|
| 129 |
+
legend_title="Quantization Scheme",
|
| 130 |
+
width=1200,
|
| 131 |
+
height=600,
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
return prefill_fig
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def create_bt_plots(llm_perf_df):
|
| 138 |
+
# descriptive text
|
| 139 |
+
gr.HTML("π Hover over the points π for additional information.", elem_id="text")
|
| 140 |
+
# get figures
|
| 141 |
+
prefill_fig = get_bt_prefill_fig(llm_perf_df)
|
| 142 |
+
decode_fig = get_bt_decode_fig(llm_perf_df)
|
| 143 |
+
|
| 144 |
+
# create plots
|
| 145 |
+
prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
|
| 146 |
+
decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
|
| 147 |
+
|
| 148 |
+
return prefill_plot, decode_plot
|
src/control_panel.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
from src.llm_perf import get_llm_perf_df
|
| 4 |
+
from src.leaderboard import get_leaderboard_df
|
| 5 |
+
from src.latency_score_memory import get_lat_score_mem_fig
|
| 6 |
+
from src.bettertransformer import get_bt_prefill_fig, get_bt_decode_fig
|
| 7 |
+
from src.flashattentionv2 import get_fa2_prefill_fig, get_fa2_decode_fig
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def create_control_panel(machine: str = "hf-dgx-01"):
|
| 11 |
+
# descriptive text
|
| 12 |
+
gr.HTML("Use this control panel to filter this leaderboard.", elem_id="text")
|
| 13 |
+
# controls
|
| 14 |
+
machine_textbox = gr.Textbox(value=machine, visible=False)
|
| 15 |
+
with gr.Row():
|
| 16 |
+
with gr.Column():
|
| 17 |
+
search_bar = gr.Textbox(
|
| 18 |
+
label="Model π€",
|
| 19 |
+
info="π Search for a model name",
|
| 20 |
+
elem_id="search-bar",
|
| 21 |
+
)
|
| 22 |
+
with gr.Row():
|
| 23 |
+
with gr.Column(scale=1):
|
| 24 |
+
score_slider = gr.Slider(
|
| 25 |
+
label="Open LLM Score (%) π",
|
| 26 |
+
info="ποΈ Slide to minimum Open LLM score",
|
| 27 |
+
value=0,
|
| 28 |
+
elem_id="threshold-slider",
|
| 29 |
+
)
|
| 30 |
+
with gr.Column(scale=1):
|
| 31 |
+
memory_slider = gr.Slider(
|
| 32 |
+
label="Peak Memory (MB) π",
|
| 33 |
+
info="ποΈ Slide to maximum Peak Memory",
|
| 34 |
+
minimum=0,
|
| 35 |
+
maximum=80 * 1024,
|
| 36 |
+
value=80 * 1024,
|
| 37 |
+
elem_id="memory-slider",
|
| 38 |
+
)
|
| 39 |
+
with gr.Column(scale=1):
|
| 40 |
+
backend_checkboxes = gr.CheckboxGroup(
|
| 41 |
+
label="Backends π",
|
| 42 |
+
choices=["pytorch", "onnxruntime"],
|
| 43 |
+
value=["pytorch", "onnxruntime"],
|
| 44 |
+
info="βοΈ Select the backends",
|
| 45 |
+
elem_id="backend-checkboxes",
|
| 46 |
+
)
|
| 47 |
+
with gr.Row():
|
| 48 |
+
with gr.Column(scale=1):
|
| 49 |
+
datatype_checkboxes = gr.CheckboxGroup(
|
| 50 |
+
label="DTypes π₯",
|
| 51 |
+
choices=["float32", "float16"],
|
| 52 |
+
value=["float32", "float16"],
|
| 53 |
+
info="βοΈ Select the load data types",
|
| 54 |
+
elem_id="dtype-checkboxes",
|
| 55 |
+
)
|
| 56 |
+
with gr.Column(scale=1):
|
| 57 |
+
optimization_checkboxes = gr.CheckboxGroup(
|
| 58 |
+
label="Optimizations π οΈ",
|
| 59 |
+
choices=["None", "BetterTransformer", "FlashAttentionV2"],
|
| 60 |
+
value=["None", "BetterTransformer", "FlashAttentionV2"],
|
| 61 |
+
info="βοΈ Select the optimization",
|
| 62 |
+
elem_id="optimization-checkboxes",
|
| 63 |
+
)
|
| 64 |
+
with gr.Column(scale=1):
|
| 65 |
+
quantization_checkboxes = gr.CheckboxGroup(
|
| 66 |
+
label="Quantizations ποΈ",
|
| 67 |
+
choices=["None", "BnB.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
|
| 68 |
+
value=["None", "BnB.4bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
|
| 69 |
+
info="βοΈ Select the quantization schemes",
|
| 70 |
+
elem_id="quantization-checkboxes",
|
| 71 |
+
)
|
| 72 |
+
with gr.Row():
|
| 73 |
+
filter_button = gr.Button(
|
| 74 |
+
value="Filter π",
|
| 75 |
+
elem_id="filter-button",
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
return (
|
| 79 |
+
filter_button,
|
| 80 |
+
machine_textbox,
|
| 81 |
+
search_bar,
|
| 82 |
+
score_slider,
|
| 83 |
+
memory_slider,
|
| 84 |
+
backend_checkboxes,
|
| 85 |
+
datatype_checkboxes,
|
| 86 |
+
optimization_checkboxes,
|
| 87 |
+
quantization_checkboxes,
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def filter_fn(
    machine,
    model,
    backends,
    datatypes,
    optimizations,
    quantizations,
    score,
    memory,
):
    """Re-filter the benchmark data and rebuild every output view.

    Invoked by the control panel's Filter button. Reloads the per-machine
    dataframe, applies all widget filters at once, and returns the refreshed
    leaderboard table plus the five plots, in the exact order expected by the
    ``outputs`` list wired in ``create_control_callback``.
    """
    # reload the full dataframe for the selected machine
    raw_df = get_llm_perf_df(machine=machine)
    # `model` is a case-insensitive substring match; the rest are exact filters
    filtered_df = raw_df[
        raw_df["Model π€"].str.contains(model, case=False)
        & raw_df["Backend π"].isin(backends)
        & raw_df["DType π₯"].isin(datatypes)
        & raw_df["Optimization π οΈ"].isin(optimizations)
        & raw_df["Quantization ποΈ"].isin(quantizations)
        & (raw_df["Open LLM Score (%)"] >= score)
        & (raw_df["Allocated Memory (MB)"] <= memory)
    ]
    # rebuild each output view from the filtered rows
    filtered_leaderboard_df = get_leaderboard_df(filtered_df)
    filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
    filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
    filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
    filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
    filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)

    # order must match the callback's `outputs` list
    return [
        filtered_leaderboard_df,
        filtered_lat_score_mem_fig,
        filtered_bt_prefill_fig,
        filtered_bt_decode_fig,
        filtered_fa2_prefill_fig,
        filtered_fa2_decode_fig,
    ]
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def create_control_callback(
    # button
    filter_button,
    # inputs
    machine_textbox,
    search_bar,
    score_slider,
    memory_slider,
    backend_checkboxes,
    datatype_checkboxes,
    optimization_checkboxes,
    quantization_checkboxes,
    # outputs
    leaderboard_table,
    lat_score_mem_plot,
    bt_prefill_plot,
    bt_decode_plot,
    fa2_prefill_plot,
    fa2_decode_plot,
):
    """Wire the Filter button's click event to :func:`filter_fn`.

    NOTE: the ``inputs`` order below (checkboxes before sliders) intentionally
    differs from this function's own argument order — it must match
    ``filter_fn``'s parameter order exactly.
    """
    filter_button.click(
        fn=filter_fn,
        inputs=[
            machine_textbox,
            search_bar,
            backend_checkboxes,
            datatype_checkboxes,
            optimization_checkboxes,
            quantization_checkboxes,
            score_slider,
            memory_slider,
        ],
        outputs=[
            leaderboard_table,
            lat_score_mem_plot,
            bt_prefill_plot,
            bt_decode_plot,
            fa2_prefill_plot,
            fa2_decode_plot,
        ],
    )
|
src/flashattentionv2.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import plotly.express as px
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
from src.utils import process_arch
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Columns exposed as hover data on the FlashAttentionV2 comparison plots.
# Order matters: the hovertemplates index into customdata by position.
FLASHATTENTIONV2_DATA = [
    # open llm
    "Model π€",
    "Arch ποΈ",
    "DType π₯",
    "Backend π",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    # NOTE(review): "DType π₯" and "Backend π" are listed twice (also above)
    # — presumably accidental duplication; confirm before changing, since the
    # hovertemplates rely on positional indices into this list.
    "DType π₯",
    "Backend π",
    "Quantization ποΈ",
    # primary measurements
    "Prefill Latency (s)",
    "Prefill Latency (s) FlashAttentionV2",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) FlashAttentionV2",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) FlashAttentionV2",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def get_fa2_df(llm_perf_df):
    """Pair each vanilla run with its FlashAttentionV2 run and compute speedups.

    Rows are matched on model and quantization scheme; the FlashAttentionV2
    side's columns get a " FlashAttentionV2" suffix. Speedups are expressed
    in percent (100 == parity) and implausible outliers (>= 1000%) dropped.
    """
    working_df = llm_perf_df.copy()
    # prettify architecture names
    working_df["Arch ποΈ"] = working_df["Arch ποΈ"].apply(process_arch)

    # separate the baseline experiments from the FlashAttentionV2 ones
    baseline_df = working_df[working_df["Optimization π οΈ"] == "None"]
    flash_df = working_df[working_df["Optimization π οΈ"] == "FlashAttentionV2"]

    # align baseline and FlashAttentionV2 rows side by side
    paired_df = baseline_df.merge(
        flash_df,
        on=["Model π€", "Quantization ποΈ"],
        suffixes=["", " FlashAttentionV2"],
    )

    # compute speedups (in percent; lower latency / higher throughput => > 100)
    paired_df["Prefill Latency Speedup (%)"] = (
        (paired_df["Prefill Latency (s)"] / paired_df["Prefill Latency (s) FlashAttentionV2"]) * 100
    ).round(2)
    paired_df["Decode Throughput Speedup (%)"] = (
        (paired_df["Decode Throughput (tokens/s) FlashAttentionV2"] / paired_df["Decode Throughput (tokens/s)"]) * 100
    ).round(2)

    # discard implausible speedups (>= 1000%)
    paired_df = paired_df[paired_df["Prefill Latency Speedup (%)"] < 1000]
    paired_df = paired_df[paired_df["Decode Throughput Speedup (%)"] < 1000]

    return paired_df
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def get_fa2_decode_fig(llm_perf_df):
    """Box plot of decode-throughput speedup from FlashAttentionV2, per architecture."""
    fa2_df = get_fa2_df(llm_perf_df)
    # one box per architecture, colored by quantization scheme, with all points shown
    decode_fig = px.box(
        fa2_df,
        x="Arch ποΈ",
        y="Decode Throughput Speedup (%)",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=FLASHATTENTIONV2_DATA,
        color="Quantization ποΈ",
        points="all",
    )
    # hover shows every column of FLASHATTENTIONV2_DATA, indexed positionally
    decode_fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
        )
    )
    # title / axis labels / fixed canvas size
    decode_fig.update_layout(
        title={
            "text": "Decode Throughput Speedup per Architecture",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="LLM Architecture",
        yaxis_title="Decode Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return decode_fig
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def get_fa2_prefill_fig(llm_perf_df):
    """Box plot of prefill-latency speedup from FlashAttentionV2, per architecture."""
    fa2_df = get_fa2_df(llm_perf_df)
    # one box per architecture, colored by quantization scheme, with all points shown
    prefill_fig = px.box(
        fa2_df,
        x="Arch ποΈ",
        y="Prefill Latency Speedup (%)",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=FLASHATTENTIONV2_DATA,
        color="Quantization ποΈ",
        points="all",
    )
    # hover shows every column of FLASHATTENTIONV2_DATA, indexed positionally
    prefill_fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
        )
    )
    # title / axis labels / fixed canvas size
    prefill_fig.update_layout(
        title={
            "text": "Prefill Latency Speedup per Architecture",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="LLM Architecture",
        yaxis_title="Prefill Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return prefill_fig
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def create_fa2_plots(llm_perf_df):
    """Instantiate both FlashAttentionV2 plots inside the current gr.Blocks context."""
    # descriptive text
    gr.HTML("π Hover over the points π for additional information.", elem_id="text")
    # get figures
    prefill_fig = get_fa2_prefill_fig(llm_perf_df)
    decode_fig = get_fa2_decode_fig(llm_perf_df)

    # create plots (returned so the filter callback can update them)
    prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
    decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)

    return prefill_plot, decode_plot
|
src/latency_score_memory.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import plotly.express as px
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# Columns exposed as hover data on the latency/score/memory scatter plot.
# Order matters: the hovertemplate indexes customdata by position.
SCORE_MEMORY_LATENCY_DATA = [
    "Model π€",
    "Arch ποΈ",
    "Params (B)",
    "DType π₯",
    "Backend π",
    "Open LLM Score (%)",
    "Prefill Latency (s)",
    "Decode Throughput (tokens/s)",
    "Allocated Memory (MB)",
    "E2E Latency (s)",
    "E2E Throughput (tokens/s)",
]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_lat_score_mem_fig(llm_perf_df):
    """Scatter plot: E2E latency (x) vs Open LLM score (y); marker size = allocated memory."""
    copy_df = llm_perf_df.copy()
    # one marker per benchmark run, colored by architecture
    fig = px.scatter(
        copy_df,
        x="E2E Latency (s)",
        y="Open LLM Score (%)",
        size="Allocated Memory (MB)",
        color="Arch ποΈ",
        custom_data=SCORE_MEMORY_LATENCY_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    # hover shows every column of SCORE_MEMORY_LATENCY_DATA, indexed positionally
    fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)]
        )
    )
    fig.update_layout(
        title={
            "text": "Latency vs. Score vs. Memory",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        # NOTE(review): label says "Per 1000 Tokens" while the plotted column is
        # "E2E Latency (s)" — presumably E2E latency is measured over 1000 new
        # tokens; confirm against the benchmark config.
        xaxis_title="Per 1000 Tokens Latency (s)",
        yaxis_title="Open LLM Score (%)",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
    )

    return fig
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def create_lat_score_mem_plot(llm_perf_df):
    """Instantiate the latency/score/memory plot inside the current gr.Blocks context."""
    # descriptive text
    gr.HTML("π Hover over the points π for additional information. ", elem_id="text")
    # get figure
    fig = get_lat_score_mem_fig(llm_perf_df)
    # create plot (returned so the filter callback can update it)
    plot = gr.components.Plot(
        value=fig,
        elem_id="plot",
        show_label=False,
    )

    return plot
|
src/leaderboard.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
from src.utils import model_hyperlink, process_score
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Maps each leaderboard column header to the gr.Dataframe datatype used to
# render it; the insertion order defines the column order of the table.
LEADERBOARD_COLUMN_TO_DATATYPE = {
    # open llm
    "Model π€": "markdown",
    "Arch ποΈ": "markdown",
    "Params (B)": "number",
    "Open LLM Score (%)": "number",
    # deployment settings
    "DType π₯": "str",
    "Backend π": "str",
    "Optimization π οΈ": "str",
    "Quantization ποΈ": "str",
    # primary measurements
    "Prefill Latency (s)": "number",
    "Decode Throughput (tokens/s)": "number",
    "Allocated Memory (MB)": "number",
    "Energy (tokens/kWh)": "number",
    # additional measurements
    "E2E Latency (s)": "number",
    "E2E Throughput (tokens/s)": "number",
    "Reserved Memory (MB)": "number",
    "Used Memory (MB)": "number",
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def process_model(model_name):
    """Render *model_name* as a markdown hyperlink to its Hugging Face model page."""
    return model_hyperlink(f"https://huggingface.co/{model_name}", model_name)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def get_leaderboard_df(llm_perf_df):
    """Return a copy of *llm_perf_df* formatted for the leaderboard table."""
    df = llm_perf_df.copy()
    # turn model names into markdown hyperlinks
    df["Model π€"] = df["Model π€"].apply(process_model)
    # flag quantized scores with an asterisk (see process_score)
    df["Open LLM Score (%)"] = df.apply(
        lambda x: process_score(x["Open LLM Score (%)"], x["Quantization ποΈ"]),
        axis=1,
    )
    return df
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def create_leaderboard_table(llm_perf_df):
    """Instantiate the leaderboard gr.Dataframe inside the current gr.Blocks context."""
    # descriptive text
    gr.HTML("π Scroll to the right π for additional columns.", elem_id="text")
    # get dataframe
    leaderboard_df = get_leaderboard_df(llm_perf_df)
    # create table; headers/datatypes both come from LEADERBOARD_COLUMN_TO_DATATYPE,
    # so the dict's order defines the displayed column order
    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df,
        datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
        headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
        elem_id="table",
    )

    return leaderboard_table
|
src/llm_perf.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from huggingface_hub import hf_hub_download
|
| 5 |
+
|
| 6 |
+
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
|
| 7 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
| 8 |
+
|
| 9 |
+
# Maps raw benchmark/report column names to the display names used across the
# app; its keys also define exactly which columns are kept (see get_llm_perf_df).
COLUMNS_MAPPING = {
    "Model": "Model π€",
    "Arch": "Arch ποΈ",
    "Size": "Params (B)",
    "Score": "Open LLM Score (%)",
    # deployment settings
    "backend.name": "Backend π",
    "backend.torch_dtype": "DType π₯",
    "optimization": "Optimization π οΈ",
    "quantization": "Quantization ποΈ",
    # primary measurements
    "forward.latency(s)": "Prefill Latency (s)",
    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
    "generate.max_memory_allocated(MB)": "Allocated Memory (MB)",
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
    # additional measurements
    "generate.latency(s)": "E2E Latency (s)",
    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
    "generate.max_memory_used(MB)": "Used Memory (MB)",
}
# Default sort: best score first, then fastest prefill, then fastest decode.
SORTING_COLUMNS = [
    "Open LLM Score (%)",
    "Prefill Latency (s)",
    "Decode Throughput (tokens/s)",
]
SORTING_ASCENDING = [False, True, False]
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def get_llm_df():
    """Download the Open LLM scores CSV from the dataset repo and load it."""
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename="open-llm.csv",
        local_dir="dataset",  # cached locally under ./dataset
        repo_type="dataset",
        token=HF_TOKEN,
    )
    llm_df = pd.read_csv("dataset/open-llm.csv")

    return llm_df
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def get_perf_df(machine: str = "hf-dgx-01"):
    """Download the per-machine benchmark report CSV from the dataset repo and load it."""
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=f"{machine}/perf-report.csv",
        local_dir="dataset",  # cached locally under ./dataset/<machine>
        repo_type="dataset",
        token=HF_TOKEN,
    )
    perf_df = pd.read_csv(f"dataset/{machine}/perf-report.csv")

    return perf_df
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def get_llm_perf_df(machine: str = "hf-dgx-01"):
    """Build the merged Open LLM score + benchmark performance dataframe.

    Downloads both CSVs from the dataset repo, joins them on model name,
    derives the display-oriented optimization / quantization / decode
    throughput / energy columns, then keeps only the renamed columns of
    COLUMNS_MAPPING, sorted by score, prefill latency and decode throughput.
    """
    # get dataframes
    llm_df = get_llm_df()
    perf_df = get_perf_df(machine=machine)
    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")
    # all rows must share the same benchmark settings, otherwise the
    # latency/throughput columns would not be comparable
    assert llm_perf_df["benchmark.input_shapes.batch_size"].nunique() == 1
    assert llm_perf_df["benchmark.input_shapes.sequence_length"].nunique() == 1
    assert llm_perf_df["benchmark.new_tokens"].nunique() == 1

    # invert energy consumption (kWh/token -> tokens/kWh); `// 1` truncates
    # like the previous astype(int) did (values are positive), and the nullable
    # Int64 dtype keeps missing readings as <NA> directly — the old fillna(1)
    # sentinel would also have clobbered a genuine reading of exactly 1.
    llm_perf_df["generate.energy_consumption(tokens/kWh)"] = (
        (1 / llm_perf_df["generate.energy_consumption(kWh/token)"]) // 1
    ).astype("Int64")

    # add optimization column from the backend flags
    llm_perf_df["optimization"] = llm_perf_df[
        ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
    ].apply(_optimization_name, axis=1)
    # add quantization scheme from the backend config
    llm_perf_df["quantization"] = llm_perf_df[
        [
            "backend.quantization_scheme",
            "backend.quantization_config.exllama_config.version",
        ]
    ].apply(_quantization_name, axis=1)
    # add decode throughput: generation minus prefill latency, over the
    # benchmark's 1000 new tokens (see the new_tokens assertion above)
    llm_perf_df["decode.throughput(tokens/s)"] = (
        1000 / (llm_perf_df["generate.latency(s)"] - llm_perf_df["forward.latency(s)"])
    ).round(2)
    # keep only the display columns and rename them
    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
    llm_perf_df.rename(columns=COLUMNS_MAPPING, inplace=True)
    # sort by metric
    llm_perf_df.sort_values(
        by=SORTING_COLUMNS,
        ascending=SORTING_ASCENDING,
        inplace=True,
    )

    return llm_perf_df


def _optimization_name(row):
    """Map the two backend optimization flags to a display label."""
    if row["backend.to_bettertransformer"]:
        return "BetterTransformer"
    if row["backend.use_flash_attention_2"]:
        return "FlashAttentionV2"
    return "None"


def _quantization_name(row):
    """Map the backend quantization config to a display label."""
    scheme = row["backend.quantization_scheme"]
    exllama_version = row["backend.quantization_config.exllama_config.version"]
    if scheme == "bnb":
        return "BnB.4bit"
    if scheme == "gptq" and exllama_version == 1:
        return "GPTQ.4bit+ExllamaV1"
    if scheme == "gptq" and exllama_version == 2:
        return "GPTQ.4bit+ExllamaV2"
    return "None"
|
src/{assets/text_content.py β text.py}
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
TITLE = """<h1 align="center" id="space-title">π€ LLM-Perf Leaderboard ποΈ</h1>"""
|
| 2 |
|
| 3 |
-
|
| 4 |
The π€ LLM-Perf Leaderboard ποΈ aims to benchmark the performance (latency, throughput, memory & energy) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
|
| 5 |
|
| 6 |
Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
|
|
@@ -8,7 +8,7 @@ Anyone from the community can request a model or a hardware/backend/optimization
|
|
| 8 |
- Hardware/Backend/Optimization performance requests should be made in the [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) to assess their relevance and feasibility.
|
| 9 |
"""
|
| 10 |
|
| 11 |
-
|
| 12 |
<ul>
|
| 13 |
<li>To avoid communication-dependent results, only one GPU is used.</li>
|
| 14 |
<li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">π€ Open LLM Leaderboard</a>.</li>
|
|
@@ -18,11 +18,26 @@ ABOUT_TEXT = """<h3>About the π€ LLM-Perf Leaderboard ποΈ</h3>
|
|
| 18 |
</ul>
|
| 19 |
"""
|
| 20 |
|
| 21 |
-
|
| 22 |
Here's an example of the configuration file used to benchmark the models with Optimum-Benchmark:
|
| 23 |
```yaml
|
| 24 |
defaults:
|
| 25 |
-
- backend: pytorch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
- benchmark: inference # default benchmark
|
| 27 |
- experiment # inheriting from experiment config
|
| 28 |
- _self_ # for hydra 1.1 compatibility
|
|
@@ -31,39 +46,38 @@ defaults:
|
|
| 31 |
|
| 32 |
hydra:
|
| 33 |
run:
|
| 34 |
-
dir:
|
| 35 |
job:
|
| 36 |
chdir: true
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
model: {model}
|
| 41 |
-
|
| 42 |
-
device: cuda
|
| 43 |
|
| 44 |
backend:
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
bettertransformer: true
|
| 48 |
-
quantization_scheme: gptq
|
| 49 |
-
|
| 50 |
|
| 51 |
benchmark:
|
|
|
|
| 52 |
memory: true
|
| 53 |
energy: true
|
| 54 |
-
|
| 55 |
new_tokens: 1000
|
| 56 |
input_shapes:
|
| 57 |
batch_size: 1
|
| 58 |
sequence_length: 256
|
| 59 |
|
| 60 |
-
|
|
|
|
| 61 |
```
|
| 62 |
"""
|
| 63 |
|
| 64 |
|
| 65 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
|
| 66 |
-
|
| 67 |
author = {Ilyas Moutawwakil, RΓ©gis Pierrard},
|
| 68 |
title = {LLM-Perf Leaderboard},
|
| 69 |
year = {2023},
|
|
|
|
| 1 |
TITLE = """<h1 align="center" id="space-title">π€ LLM-Perf Leaderboard ποΈ</h1>"""
|
| 2 |
|
| 3 |
+
INTRODUCTION = """
|
| 4 |
The π€ LLM-Perf Leaderboard ποΈ aims to benchmark the performance (latency, throughput, memory & energy) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
|
| 5 |
|
| 6 |
Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
|
|
|
|
| 8 |
- Hardware/Backend/Optimization performance requests should be made in the [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) to assess their relevance and feasibility.
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
ABOUT = """<h3>About the π€ LLM-Perf Leaderboard ποΈ</h3>
|
| 12 |
<ul>
|
| 13 |
<li>To avoid communication-dependent results, only one GPU is used.</li>
|
| 14 |
<li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">π€ Open LLM Leaderboard</a>.</li>
|
|
|
|
| 18 |
</ul>
|
| 19 |
"""
|
| 20 |
|
| 21 |
+
EXAMPLE_CONFIG = """
|
| 22 |
Here's an example of the configuration file used to benchmark the models with Optimum-Benchmark:
|
| 23 |
```yaml
|
| 24 |
defaults:
|
| 25 |
+
- backend: pytorch
|
| 26 |
+
- _base_ # inheriting from base config
|
| 27 |
+
- _self_ # for hydra 1.1 compatibility
|
| 28 |
+
|
| 29 |
+
experiment_name: pytorch+cuda+float16+bettertransformer
|
| 30 |
+
device: cuda
|
| 31 |
+
|
| 32 |
+
backend:
|
| 33 |
+
no_weights: true
|
| 34 |
+
torch_dtype: float16
|
| 35 |
+
to_bettertransformer: true
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
Where the base config is:
|
| 39 |
+
```yaml
|
| 40 |
+
defaults:
|
| 41 |
- benchmark: inference # default benchmark
|
| 42 |
- experiment # inheriting from experiment config
|
| 43 |
- _self_ # for hydra 1.1 compatibility
|
|
|
|
| 46 |
|
| 47 |
hydra:
|
| 48 |
run:
|
| 49 |
+
dir: ???
|
| 50 |
job:
|
| 51 |
chdir: true
|
| 52 |
+
env_set:
|
| 53 |
+
CUDA_VISIBLE_DEVICES: 0
|
| 54 |
+
CUDA_DEVICE_ORDER: PCI_BUS_ID
|
| 55 |
|
| 56 |
+
model: ???
|
| 57 |
+
experiment_name: ???
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
backend:
|
| 60 |
+
initial_isolation_check: true
|
| 61 |
+
continous_isolation_check: true
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
benchmark:
|
| 64 |
+
duration: 10
|
| 65 |
memory: true
|
| 66 |
energy: true
|
| 67 |
+
|
| 68 |
new_tokens: 1000
|
| 69 |
input_shapes:
|
| 70 |
batch_size: 1
|
| 71 |
sequence_length: 256
|
| 72 |
|
| 73 |
+
hub_kwargs:
|
| 74 |
+
trust_remote_code: true
|
| 75 |
```
|
| 76 |
"""
|
| 77 |
|
| 78 |
|
| 79 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
|
| 80 |
+
CITATION_BUTTON = r"""@misc{llm-perf-leaderboard,
|
| 81 |
author = {Ilyas Moutawwakil, RΓ©gis Pierrard},
|
| 82 |
title = {LLM-Perf Leaderboard},
|
| 83 |
year = {2023},
|
src/utils.py
CHANGED
|
@@ -1,22 +1,3 @@
|
|
| 1 |
-
from huggingface_hub import HfApi, Repository
|
| 2 |
-
import gradio as gr
|
| 3 |
-
import json
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def change_tab(query_param):
|
| 7 |
-
query_param = query_param.replace("'", '"')
|
| 8 |
-
query_param = json.loads(query_param)
|
| 9 |
-
|
| 10 |
-
if (
|
| 11 |
-
isinstance(query_param, dict)
|
| 12 |
-
and "tab" in query_param
|
| 13 |
-
and query_param["tab"] == "plot"
|
| 14 |
-
):
|
| 15 |
-
return gr.Tabs.update(selected=1)
|
| 16 |
-
else:
|
| 17 |
-
return gr.Tabs.update(selected=0)
|
| 18 |
-
|
| 19 |
-
|
| 20 |
LLM_MODEL_ARCHS = {
|
| 21 |
"stablelm_epoch": "π΄ StableLM-Epoch",
|
| 22 |
"stablelm_alpha": "π΄ StableLM-Alpha",
|
|
@@ -24,8 +5,8 @@ LLM_MODEL_ARCHS = {
|
|
| 24 |
"RefinedWebModel": "π¦
Falcon",
|
| 25 |
"gpt_bigcode": "β StarCoder",
|
| 26 |
"RefinedWeb": "π¦
Falcon",
|
| 27 |
-
"baichuan": "π Baichuan ηΎε·",
|
| 28 |
-
"internlm": "π§βπ InternLM δΉ¦η",
|
| 29 |
"mistral": "βοΈ Mistral",
|
| 30 |
"codegen": "βΎοΈ CodeGen",
|
| 31 |
"chatglm": "π¬ ChatGLM",
|
|
@@ -34,7 +15,7 @@ LLM_MODEL_ARCHS = {
|
|
| 34 |
"llama": "π¦ LLaMA",
|
| 35 |
"rwkv": "π¦ββ¬ RWKV",
|
| 36 |
"mpt": "π§± MPT",
|
| 37 |
-
"Yi": "π« Yi δΊΊ", # people
|
| 38 |
# suggest something
|
| 39 |
"gpt_neox": "GPT-NeoX",
|
| 40 |
"gpt_neo": "GPT-Neo",
|
|
@@ -50,13 +31,25 @@ def model_hyperlink(link, model_name):
|
|
| 50 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 51 |
|
| 52 |
|
| 53 |
-
def
|
| 54 |
-
link = f"https://huggingface.co/{model_name}"
|
| 55 |
-
return model_hyperlink(link, model_name)
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
def process_model_arch(model_arch):
|
| 59 |
if model_arch in LLM_MODEL_ARCHS:
|
| 60 |
return LLM_MODEL_ARCHS[model_arch]
|
| 61 |
else:
|
| 62 |
return model_arch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
LLM_MODEL_ARCHS = {
|
| 2 |
"stablelm_epoch": "π΄ StableLM-Epoch",
|
| 3 |
"stablelm_alpha": "π΄ StableLM-Alpha",
|
|
|
|
| 5 |
"RefinedWebModel": "π¦
Falcon",
|
| 6 |
"gpt_bigcode": "β StarCoder",
|
| 7 |
"RefinedWeb": "π¦
Falcon",
|
| 8 |
+
"baichuan": "π Baichuan ηΎε·", # river
|
| 9 |
+
"internlm": "π§βπ InternLM δΉ¦η", # scholar
|
| 10 |
"mistral": "βοΈ Mistral",
|
| 11 |
"codegen": "βΎοΈ CodeGen",
|
| 12 |
"chatglm": "π¬ ChatGLM",
|
|
|
|
| 15 |
"llama": "π¦ LLaMA",
|
| 16 |
"rwkv": "π¦ββ¬ RWKV",
|
| 17 |
"mpt": "π§± MPT",
|
| 18 |
+
"Yi": "π« Yi δΊΊ" , # people
|
| 19 |
# suggest something
|
| 20 |
"gpt_neox": "GPT-NeoX",
|
| 21 |
"gpt_neo": "GPT-Neo",
|
|
|
|
| 31 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 32 |
|
| 33 |
|
| 34 |
+
def process_arch(model_arch):
    """Return the pretty display name for *model_arch*, or the raw name if unknown."""
    # dict.get with a default replaces the previous double-lookup if/in/else
    return LLM_MODEL_ARCHS.get(model_arch, model_arch)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def process_score(score, quantization):
    """Format *score* to two decimals, flagging quantized results with ``*``.

    The unquantized branch appends a trailing space so values line up in a
    column with the asterisked ones.
    """
    marker = "*" if quantization != "None" else " "
    return f"{score:.2f}{marker}"
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# def change_tab(query_param):
|
| 49 |
+
# query_param = query_param.replace("'", '"')
|
| 50 |
+
# query_param = json.loads(query_param)
|
| 51 |
+
|
| 52 |
+
# if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "plot":
|
| 53 |
+
# return gr.Tabs.update(selected=1)
|
| 54 |
+
# else:
|
| 55 |
+
# return gr.Tabs.update(selected=0)
|