xzyao committed on
Commit
1438b6d
·
verified ·
1 Parent(s): 0b5cd64

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -25
app.py CHANGED
@@ -1,8 +1,36 @@
1
  import gradio as gr
 
2
  import plotly.express as px
3
  from pathlib import Path
4
  import pandas as pd
5
  import numpy as np
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  abs_path = Path(__file__).parent
7
 
8
  def parse_model_args(model_args):
@@ -34,47 +62,75 @@ def parse_model_precision(model_args):
34
 
35
  # Any pandas-compatible data
36
  df = pd.read_csv(str(abs_path / "eval_results.csv"))
 
37
  # take acc only
38
  df = df[df['metric'] == 'acc']
39
  # dedup
40
  df = df.drop_duplicates(subset=['model', 'task'])
41
  # pivot df, such that the column names are model,task,efficiency
42
  # but keep precision in its original place
43
- df = df.pivot(index='model', columns='task', values='value').reset_index()
 
 
44
 
45
  df['precision'] = df['model'].apply(lambda x: x.split(":")[-1])
46
  df['model'] = df['model'].apply(lambda x: x.split(":")[0])
47
-
48
- # average over all columns starting with 'task_'
49
  df['avg_acc'] = df.filter(like='task_').mean(axis=1)
50
- # keep 2 decimal points for avg_acc, and all tasks_
51
- # rename columns starting with 'task_' by removing 'task_'
52
  df = df.rename(columns=lambda x: x.replace('task_', ''))
53
  numeric_columns = df.select_dtypes(include=[np.number]).columns
 
 
54
  df[numeric_columns] = (df[numeric_columns]*100).round(2)
 
 
55
 
56
  with gr.Blocks() as demo:
57
  gr.Markdown("""
58
  # 🥇 Efficient LLM Leaderboard
59
  """)
60
- task_options = [col for col in df.columns if col not in ['model', 'precision']]
61
-
62
- with gr.Row():
63
- selected_tasks = gr.CheckboxGroup(choices=task_options, label="Select Tasks")
64
- with gr.Row():
65
- accuracy_plot = gr.Plot(label="Accuracy Plot")
66
- data_table = gr.Dataframe(value=df, label="Result Table")
67
-
68
- def update_outputs(selected_tasks):
69
- if not selected_tasks:
70
- return df[['model', 'precision']], None
71
- filtered_df = df[['model', 'precision'] + selected_tasks]
72
- melted_df = filtered_df.melt(id_vars=['model', 'precision'], var_name='task', value_name='accuracy')
73
- fig = px.bar(melted_df, x='model', y='accuracy', color='precision', barmode='group', facet_col='task')
74
- return filtered_df, fig
75
-
76
- selected_tasks.change(fn=update_outputs, inputs=selected_tasks, outputs=[data_table, accuracy_plot])
77
-
78
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  if __name__ == "__main__":
80
- demo.launch()
 
1
  import gradio as gr
2
+ from gradio_leaderboard import Leaderboard
3
  import plotly.express as px
4
  from pathlib import Path
5
  import pandas as pd
6
  import numpy as np
7
+ from langchain_openai import ChatOpenAI
8
+ from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
9
+ from langchain.agents.agent_types import AgentType
10
+ from langchain_google_genai import ChatGoogleGenerativeAI
11
+
12
def explain_df(query, df):
    """Answer a free-text question about ``df`` with an LLM-backed pandas agent.

    A LangChain pandas-DataFrame agent driven by ``gemini-1.5-pro`` is created
    on every call and invoked with ``query``.

    Args:
        query: The question to ask, as free text.
        df: The pandas DataFrame handed to the agent.

    Returns:
        The ``'output'`` field of the agent response, or a short prompt asking
        for input when ``query`` is empty or contains only whitespace.
    """
    # Nothing to ask: skip building the agent and calling the model at all.
    if not query or not query.strip():
        return "Please enter a query."

    # Alternative backend, kept for reference:
    # ChatOpenAI(
    #     base_url="https://fmapi.swissai.cscs.ch",
    #     temperature=0.01,
    #     model="meta-llama/Llama-3.3-70B-Instruct",
    # )
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )

    # SECURITY NOTE(review): allow_dangerous_code=True lets the agent execute
    # model-generated Python in this process. ``query`` is user-typed text, so
    # a crafted prompt can lead to arbitrary code execution. Run this only in
    # a sandboxed environment, or restrict who can reach the endpoint.
    agent = create_pandas_dataframe_agent(
        llm,
        df,
        verbose=True,
        allow_dangerous_code=True,
    )
    response = agent.invoke(query)
    return response['output']
32
+
33
+
34
  abs_path = Path(__file__).parent
35
 
36
  def parse_model_args(model_args):
 
62
 
63
  # Any pandas-compatible data
64
  df = pd.read_csv(str(abs_path / "eval_results.csv"))
65
# Second input table; it is left-joined onto the accuracy table on 'hf_name'
# at the end of this section.
perf_df = pd.read_csv(str(abs_path.joinpath("perfbench_results.csv")))

# Keep only the rows whose metric is 'acc', one row per (model, task) pair.
df = df[df['metric'].eq('acc')]
df = df.drop_duplicates(subset=['model', 'task'])

# Scale the physical size down by 1024**3 (presumably bytes -> GiB; confirm
# against the CSV producer).
df['model_physical_size'] = df['model_physical_size'] / 1024 / 1024 / 1024

# Go from long to wide: one row per model, one column per task.
df = df.pivot(
    index=['model', 'hf_name', 'model_physical_size'],
    columns='task',
    values='value',
).reset_index()

# The model identifier has the form "<name>:...:<precision>"; the last
# ':'-separated field becomes the precision, the first one the model name.
# Precision must be extracted before 'model' is overwritten.
df['precision'] = df['model'].map(lambda name: name.split(":")[-1])
df['model'] = df['model'].map(lambda name: name.split(":")[0])

# Row-wise mean over every column whose name contains 'task_'.
df['avg_acc'] = df.filter(like='task_').mean(axis=1)

# Strip the 'task_' marker from the column names.
df = df.rename(columns=lambda col: col.replace('task_', ''))

# Scale every numeric column except the physical size by 100 and keep two
# decimals; the size is only rounded.
numeric_columns = (
    df.select_dtypes(include=[np.number])
    .columns
    .drop('model_physical_size')
)
df[numeric_columns] = (df[numeric_columns] * 100).round(2)
df['model_physical_size'] = df['model_physical_size'].round(2)

# Accuracy table with the performance columns attached.
full_df = df.merge(perf_df, left_on='hf_name', right_on='hf_name', how='left')
87
 
88
  with gr.Blocks() as demo:
89
  gr.Markdown("""
90
  # 🥇 Efficient LLM Leaderboard
91
  """)
92
+ with gr.Tabs():
93
+ with gr.TabItem("Leaderboard"):
94
+ # ...existing code...
95
+ task_options = [col for col in df.columns if col not in ['model','hf_name','model_physical_size', 'precision']]
96
+ with gr.Row():
97
+ selected_tasks = gr.CheckboxGroup(choices=task_options, label="Select Tasks")
98
+ with gr.Row():
99
+ accuracy_plot = gr.Plot(label="Accuracy Plot")
100
+ line_plot = gr.Plot(label="Average Accuracy vs Model Size")
101
+ with gr.Row():
102
+ throughput_line_plot = gr.Plot(label="Throughput vs Average Accuracy")
103
+ latency_line_plot = gr.Plot(label="Latency vs Average Accuracy")
104
+ with gr.Row():
105
+ data_table = gr.Dataframe(value=df, label="Result Table")
106
def update_outputs(selected_tasks):
    """Rebuild the result table and the four plots for the selected tasks.

    Args:
        selected_tasks: Names of the task columns currently ticked in the
            checkbox group.

    Returns:
        A 5-tuple ``(table, bar_fig, line_fig, throughput_fig, latency_fig)``,
        one value per output component wired to this callback. When no task
        is selected the table holds only the ``model`` and ``precision``
        columns and all four figures are ``None``.
    """
    if not selected_tasks:
        # The callback is wired to five outputs, so five values must be
        # returned here as well; returning fewer makes the update fail.
        return df[['model', 'precision']], None, None, None, None

    selected_tasks = list(selected_tasks)
    id_columns = ['model', 'precision', 'model_physical_size', 'hf_name']
    # Work on an explicit copy so adding a column never touches the shared
    # module-level frame (and never triggers a chained-assignment warning).
    filtered_df = df[id_columns + selected_tasks].copy()
    # Average accuracy over the selected tasks only.
    filtered_df['avg_accuracy'] = filtered_df[selected_tasks].mean(axis=1)

    title = f'tasks: {", ".join(selected_tasks)}'
    bar_fig = px.bar(
        filtered_df, x='model', y='avg_accuracy',
        color='precision', barmode='group',
    )
    line_fig = px.line(
        filtered_df, x='model_physical_size', y='avg_accuracy',
        color='model', symbol='precision',
    )
    bar_fig.update_layout(title=title)
    line_fig.update_layout(title=title)

    # Attach the performance columns for the throughput / latency plots.
    with_perf_df = filtered_df.merge(
        perf_df, left_on='hf_name', right_on='hf_name', how='left',
    )
    throughput_line_fig = px.line(
        with_perf_df, x='output_throughput', y='avg_accuracy',
        color='model', symbol='precision',
    )
    latency_line_fig = px.line(
        with_perf_df, x="avg_e2e_latency", y='avg_accuracy',
        color='model', symbol='precision',
    )
    return with_perf_df, bar_fig, line_fig, throughput_line_fig, latency_line_fig
121
+ selected_tasks.change(
122
+ fn=update_outputs,
123
+ inputs=selected_tasks,
124
+ outputs=[data_table, accuracy_plot, line_plot, throughput_line_plot, latency_line_plot]
125
+ )
126
+ with gr.TabItem("Find Model"):
127
+ query_input = gr.Textbox(label="Enter your query", placeholder="Enter your query here")
128
+ response_output = gr.Textbox(label="Response", interactive=False)
129
+ query_input.submit(
130
+ fn=lambda query: explain_df(query, df),
131
+ inputs=query_input,
132
+ outputs=response_output
133
+ )
134
+
135
  if __name__ == "__main__":
136
+ demo.launch(share=True)