Update app.py
app.py CHANGED
@@ -1,8 +1,36 @@
 import gradio as gr
+from gradio_leaderboard import Leaderboard
 import plotly.express as px
 from pathlib import Path
 import pandas as pd
 import numpy as np
+from langchain_openai import ChatOpenAI
+from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
+from langchain.agents.agent_types import AgentType
+from langchain_google_genai import ChatGoogleGenerativeAI
+
+def explain_df(query, df):
+    agent = create_pandas_dataframe_agent(
+        # ChatOpenAI(
+        #     base_url="https://fmapi.swissai.cscs.ch",
+        #     temperature=0.01,
+        #     model="meta-llama/Llama-3.3-70B-Instruct"
+        # ),
+        ChatGoogleGenerativeAI(
+            model="gemini-1.5-pro",
+            temperature=0,
+            max_tokens=None,
+            timeout=None,
+            max_retries=2,
+        ),
+        df,
+        verbose=True,
+        allow_dangerous_code=True,
+    )
+    response = agent.invoke(query)
+    return response['output']
+
+
 abs_path = Path(__file__).parent
 
 def parse_model_args(model_args):
@@ -34,47 +62,75 @@ def parse_model_precision(model_args):
 
 # Any pandas-compatible data
 df = pd.read_csv(str(abs_path / "eval_results.csv"))
+perf_df = pd.read_csv(str(abs_path / "perfbench_results.csv"))
 # take acc only
 df = df[df['metric'] == 'acc']
 # dedup
 df = df.drop_duplicates(subset=['model', 'task'])
 # pivot df, such that the column names are model,task,efficiency
 # but keep precision in its original place
-df
+df['model_physical_size'] = df['model_physical_size'].apply(lambda x: x/1024/1024/1024)  # bytes -> GiB
+
+df = df.pivot(index=['model','hf_name','model_physical_size'], columns='task', values='value').reset_index()
 
 df['precision'] = df['model'].apply(lambda x: x.split(":")[-1])
 df['model'] = df['model'].apply(lambda x: x.split(":")[0])
-
-# average over all columns starting with 'task_'
 df['avg_acc'] = df.filter(like='task_').mean(axis=1)
-
-# rename columns starting with 'task_' by removing 'task_'
+
 df = df.rename(columns=lambda x: x.replace('task_', ''))
 numeric_columns = df.select_dtypes(include=[np.number]).columns
+# remove physical size from numeric columns
+numeric_columns = numeric_columns.drop('model_physical_size')
 df[numeric_columns] = (df[numeric_columns]*100).round(2)
+df['model_physical_size'] = df['model_physical_size'].round(2)
+full_df = df.merge(perf_df, left_on='hf_name', right_on='hf_name', how='left')
 
 with gr.Blocks() as demo:
     gr.Markdown("""
     # 🥇 Efficient LLM Leaderboard
     """)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with gr.Tabs():
+        with gr.TabItem("Leaderboard"):
+            # ...existing code...
+            task_options = [col for col in df.columns if col not in ['model','hf_name','model_physical_size', 'precision']]
+            with gr.Row():
+                selected_tasks = gr.CheckboxGroup(choices=task_options, label="Select Tasks")
+            with gr.Row():
+                accuracy_plot = gr.Plot(label="Accuracy Plot")
+                line_plot = gr.Plot(label="Average Accuracy vs Model Size")
+            with gr.Row():
+                throughput_line_plot = gr.Plot(label="Throughput vs Average Accuracy")
+                latency_line_plot = gr.Plot(label="Latency vs Average Accuracy")
+            with gr.Row():
+                data_table = gr.Dataframe(value=df, label="Result Table")
+            def update_outputs(selected_tasks):
+                if not selected_tasks:
+                    return df[['model', 'precision']], None, None, None, None  # must match the five outputs wired below
+                filtered_df = df[['model', 'precision', 'model_physical_size', 'hf_name'] + selected_tasks].copy()
+                # average accuracy of selected tasks
+                filtered_df['avg_accuracy'] = filtered_df[selected_tasks].mean(axis=1)
+                bar_fig = px.bar(filtered_df, x='model', y='avg_accuracy', color='precision', barmode='group')
+                line_fig = px.line(filtered_df, x='model_physical_size', y='avg_accuracy', color='model', symbol='precision')
+                # set title of bar_fig
+                bar_fig.update_layout(title=f'tasks: {", ".join(selected_tasks)}')
+                line_fig.update_layout(title=f'tasks: {", ".join(selected_tasks)}')
+                with_perf_df = filtered_df.merge(perf_df, left_on='hf_name', right_on='hf_name', how='left')
+                throughput_line_fig = px.line(with_perf_df, x='output_throughput', y='avg_accuracy', color='model', symbol='precision')
+                latency_line_fig = px.line(with_perf_df, x="avg_e2e_latency", y='avg_accuracy', color='model', symbol='precision')
+                return with_perf_df, bar_fig, line_fig, throughput_line_fig, latency_line_fig
+            selected_tasks.change(
+                fn=update_outputs,
+                inputs=selected_tasks,
+                outputs=[data_table, accuracy_plot, line_plot, throughput_line_plot, latency_line_plot]
+            )
+        with gr.TabItem("Find Model"):
+            query_input = gr.Textbox(label="Enter your query", placeholder="Enter your query here")
+            response_output = gr.Textbox(label="Response", interactive=False)
+            query_input.submit(
+                fn=lambda query: explain_df(query, df),
+                inputs=query_input,
+                outputs=response_output
+            )
+
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=True)
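For context, the new "Find Model" tab routes free-text queries through a LangChain pandas-dataframe agent (the explain_df helper added above). Below is a minimal standalone sketch of that flow, assuming a valid GOOGLE_API_KEY in the environment; the frame and its columns are illustrative stand-ins, not the app's real eval_results schema:

import pandas as pd
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_google_genai import ChatGoogleGenerativeAI

# Illustrative stand-in for the pivoted leaderboard frame.
df = pd.DataFrame({
    "model": ["model-a", "model-b"],
    "precision": ["bf16", "fp8"],
    "mmlu": [65.2, 79.8],
})

# Mirrors explain_df: allow_dangerous_code lets the agent execute the pandas
# code it generates, so it should only run against trusted queries.
agent = create_pandas_dataframe_agent(
    ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0),
    df,
    verbose=True,
    allow_dangerous_code=True,
)
print(agent.invoke("Which model has the highest mmlu score?")["output"])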