CultriX committed on
Commit 982fdda · verified · 1 Parent(s): 94d5ed8

Update app.py

Files changed (1)
  1. app.py +171 -204
app.py CHANGED
@@ -1,53 +1,53 @@
- import streamlit as st
  import pandas as pd
- from huggingface_hub import HfApi, ModelCard
- from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
- import re
- from io import StringIO
- from yall import create_yall
  import plotly.graph_objs as go
-
- def calculate_pages(df, items_per_page):
-     """Calculate the number of pages needed for pagination."""
-     return -(-len(df) // items_per_page)  # Equivalent to math.ceil(len(df) / items_per_page)
-
- @st.cache_data
- def cached_model_info(_api, model):
-     """Fetch model information from the Hugging Face API and cache the result."""
-     try:
-         return _api.model_info(repo_id=str(model))
-     except (RepositoryNotFoundError, RevisionNotFoundError):
-         return None
-
- @st.cache_data
- def get_model_info(df):
-     """Get model information and update the DataFrame with likes and tags."""
-     api = HfApi()
-     with st.spinner("Fetching model information..."):
-         for index, row in df.iterrows():
-             model_info = cached_model_info(api, row['Model'].strip())
-             if model_info:
-                 df.loc[index, 'Likes'] = model_info.likes
-                 df.loc[index, 'Tags'] = ', '.join(model_info.tags)
-             else:
-                 df.loc[index, 'Likes'] = -1
-                 df.loc[index, 'Tags'] = ''
-     return df
-
- def convert_markdown_table_to_dataframe(md_content):
-     """Convert a markdown table to a pandas DataFrame."""
-     cleaned_content = re.sub(r'\|\s*$', '', re.sub(r'^\|\s*', '', md_content, flags=re.MULTILINE), flags=re.MULTILINE)
-     df = pd.read_csv(StringIO(cleaned_content), sep="\|", engine='python')
-     df = df.drop(0, axis=0)
-     df.columns = df.columns.str.strip()
-     model_link_pattern = r'\[(.*?)\]\((.*?)\)\s*\[.*?\]\(.*?\)'
-     df['URL'] = df['Model'].apply(lambda x: re.search(model_link_pattern, x).group(2) if re.search(model_link_pattern, x) else None)
-     df['Model'] = df['Model'].apply(lambda x: re.sub(model_link_pattern, r'\1', x))
-     return df
 
  def create_bar_chart(df, category):
      """Create a horizontal bar chart for the specified category."""
-     st.write(f"### {category} Scores")
      sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
      fig = go.Figure(go.Bar(
          x=sorted_df[category],
@@ -62,164 +62,131 @@ def create_bar_chart(df, category):
          margin=dict(l=20, r=20, t=20, b=20),
          title=f"Leaderboard for {category} Scores"
      )
-     st.plotly_chart(fig, use_container_width=True, height=len(df) * 35)
-
- def fetch_merge_configs(df):
-     """Fetch and save merge configurations for the top models."""
-     df_sorted = df.sort_values(by='Average', ascending=False)
-     try:
-         with open('/tmp/configurations.txt', 'a') as file:
-             for index, row in df_sorted.head(20).iterrows():
-                 model_name = row['Model'].rstrip()
-                 try:
-                     card = ModelCard.load(model_name)
-                     file.write(f'Model Name: {model_name}\n')
-                     file.write(f'Scores: {row["Average"]}\n')
-                     file.write(f'AGIEval: {row["AGIEval"]}\n')
-                     file.write(f'GPT4All: {row["GPT4All"]}\n')
-                     file.write(f'TruthfulQA: {row["TruthfulQA"]}\n')
-                     file.write(f'Bigbench: {row["Bigbench"]}\n')
-                     file.write(f'Model Card: {card}\n')
-                 except Exception as e:
-                     st.error(f"Error loading model card for {model_name}: {str(e)}")
-         with open('/tmp/configurations.txt', 'r') as file:
-             content = file.read()
-         matches = re.findall(r'yaml(.*?)```', content, re.DOTALL)
-         with open('/tmp/configurations2.txt', 'w') as file:
-             for row, match in zip(df_sorted[['Model', 'Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']].head(20).values, matches):
-                 file.write(f'Model Name: {row[0]}\n')
-                 file.write(f'Scores: {row[1]}\n')
-                 file.write(f'AGIEval: {row[2]}\n')
-                 file.write(f'GPT4All: {row[3]}\n')
-                 file.write(f'TruthfulQA: {row[4]}\n')
-                 file.write(f'Bigbench: {row[5]}\n')
-                 file.write('yaml' + match + '```\n')
-     except Exception as e:
-         st.error(f"Error while fetching merge configs: {str(e)}")
-
- def main():
-     """Main function to set up the Streamlit app and display the leaderboard."""
-     st.set_page_config(page_title="YALL - Yet Another LLM Leaderboard", layout="wide")
-     st.title("🏆 YALL - Yet Another LLM Leaderboard")
-     st.markdown("Leaderboard made with 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval) using the [Nous](https://huggingface.co/NousResearch) benchmark suite.")
-     content = create_yall()
-     tab1, tab2 = st.tabs(["🏆 Leaderboard", "📝 About"])
-
-     with tab1:
-         if content:
-             try:
-                 score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
-                 full_df = convert_markdown_table_to_dataframe(content)
-
-                 for col in score_columns:
-                     full_df[col] = pd.to_numeric(full_df[col].str.strip(), errors='coerce')
-
-                 full_df = get_model_info(full_df)
-                 full_df['Tags'] = full_df['Tags'].fillna('')
-                 df = pd.DataFrame(columns=full_df.columns)
-
-                 show_phi = st.checkbox("Phi (2.8B)", value=True)
-                 show_mistral = st.checkbox("Mistral (7B)", value=True)
-                 show_other = st.checkbox("Other", value=True)
-
-                 dfs_to_concat = []
-                 if show_phi:
-                     dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
-                 if show_mistral:
-                     dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('mistral,')])
-                 if show_other:
-                     other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
-                     dfs_to_concat.append(other_df)
-
-                 if dfs_to_concat:
-                     df = pd.concat(dfs_to_concat, ignore_index=True)
-
-                 search_query = st.text_input("Search models", "")
-                 if search_query:
-                     df = df[df['Model'].str.contains(search_query, case=False)]
-
-                 items_per_page = 50
-                 pages = calculate_pages(df, items_per_page)
-                 page = st.selectbox("Page", list(range(1, pages + 1)))
-
-                 df = df.sort_values(by='Average', ascending=False)
-                 start = (page - 1) * items_per_page
-                 end = start + items_per_page
-                 df = df[start:end]
-
-                 selected_benchmarks = st.multiselect('Select benchmarks to include in the average', score_columns, default=score_columns)
-
-                 if selected_benchmarks:
-                     df['Filtered Average'] = df[selected_benchmarks].mean(axis=1)
-                     df = df.sort_values(by='Filtered Average', ascending=False)
-                     st.dataframe(
-                         df[['Model'] + selected_benchmarks + ['Filtered Average', 'Likes', 'URL']],
-                         use_container_width=True,
-                         column_config={
-                             "Likes": st.column_config.NumberColumn(
-                                 "Likes",
-                                 help="Number of likes on Hugging Face",
-                                 format="%d ❤️",
-                             ),
-                             "URL": st.column_config.LinkColumn("URL"),
-                         },
-                         hide_index=True,
-                         height=len(df) * 37,
-                     )
-
-                 selected_models = st.multiselect('Select models to compare', df['Model'].unique())
-                 comparison_df = df[df['Model'].isin(selected_models)]
-                 st.dataframe(comparison_df)
-
-                 if st.button("Export to CSV"):
-                     csv_data = df.to_csv(index=False)
-                     st.download_button(
-                         label="Download CSV",
-                         data=csv_data,
-                         file_name="leaderboard.csv",
-                         key="download-csv",
-                         help="Click to download the CSV file",
-                     )
-                 if st.button("Fetch Merge-Configs"):
-                     fetch_merge_configs(full_df)
-                     st.success("Merge configurations have been fetched and saved.")
-
-                 create_bar_chart(df, 'Filtered Average')
-
-                 col1, col2 = st.columns(2)
-                 with col1:
-                     create_bar_chart(df, score_columns[1])
-                 with col2:
-                     create_bar_chart(df, score_columns[2])
-
-                 col3, col4 = st.columns(2)
-                 with col3:
-                     create_bar_chart(df, score_columns[3])
-                 with col4:
-                     create_bar_chart(df, score_columns[4])
-
-             except Exception as e:
-                 st.error("An error occurred while processing the markdown table.")
-                 st.error(str(e))
-         else:
-             st.error("Failed to download the content from the URL provided.")
-
-     with tab2:
-         st.markdown('''
-         ### Nous benchmark suite
-         Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:
-         * [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
-         * **GPT4ALL** (0-shot): `hellaswag,openbookqa,winogrande,arc_easy,arc_challenge,boolq,piqa`
-         * [**TruthfulQA**](https://arxiv.org/abs/2109.07958) (0-shot): `truthfulqa_mc`
-         * [**Bigbench**](https://arxiv.org/abs/2206.04615) (0-shot): `bigbench_causal_judgement,bigbench_date_understanding,bigbench_disambiguation_qa,bigbench_geometric_shapes,bigbench_logical_deduction_five_objects,bigbench_logical_deduction_seven_objects,bigbench_logical_deduction_three_objects,bigbench_movie_recommendation,bigbench_navigate,bigbench_reasoning_about_colored_objects,bigbench_ruin_names,bigbench_salient_translation_error_detection,bigbench_snarks,bigbench_sports_understanding,bigbench_temporal_sequences,bigbench_tracking_shuffled_objects_five_objects,bigbench_tracking_shuffled_objects_seven_objects,bigbench_tracking_shuffled_objects_three_objects`
-         ### Reproducibility
-         You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a Colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
-         ### Clone this space
-         You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
-         * Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
-         * Create a "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens)).
-         A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations.
-         ''')
 
  if __name__ == "__main__":
-     main()
+ import gradio as gr
  import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
  import plotly.graph_objs as go
+ import plotly.offline as py
+ from io import BytesIO
+ import base64
+
+ py.init_notebook_mode(connected=True)
+
+ # Read the data from the file
+ def parse_data(file_content):
+     lines = file_content.splitlines()
+
+     model_data = []
+     current_model = None
+
+     for line in lines:
+         line = line.strip()
+         if line.startswith('hf (pretrained='):
+             current_model = line.split('pretrained=')[1].split(',')[0]
+         elif line and current_model:
+             if not line.startswith('-') and '|' in line:
+                 # Parse table row
+                 parts = [p.strip() for p in line.split('|')]
+                 if len(parts) >= 2:  # Ensure the correct number of columns
+                     try:
+                         task_name = parts[0]
+                         value = float(parts[1])  # Extract the numeric value
+                         model_data.append([
+                             current_model,
+                             task_name,  # Task name
+                             value
+                         ])
+                     except ValueError:
+                         print(f"Skipping row due to invalid value: {parts}")
+     if not model_data:
+         print("No valid data found in the file.")
+     return pd.DataFrame(model_data, columns=['Model', 'Task', 'Value'])
+
+ # Calculate average performance
+ def calculate_averages(data):
+     if data.empty:
+         print("No data available to calculate averages.")
+         return pd.DataFrame(columns=['Model', 'Average Performance'])
+     return data.groupby('Model')['Value'].mean().reset_index().rename(columns={'Value': 'Average Performance'})
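# Illustrative sketch only, not part of this commit: the parser above accepts a plain-text
# benchmark dump of the shape sketched below. The model id "org/example-model" and the two
# score values are made-up placeholders; the task names match those used later in the UI.
# sample = (
#     "hf (pretrained=org/example-model,dtype=auto), limit: None\n"
#     "tinyArc|0.4123|\n"
#     "tinyHellaswag|0.6789|\n"
# )
# df = parse_data(sample)        # -> DataFrame with columns ['Model', 'Task', 'Value']
# calculate_averages(df)         # -> one 'Average Performance' row per model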
 
  def create_bar_chart(df, category):
      """Create a horizontal bar chart for the specified category."""

      sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
      fig = go.Figure(go.Bar(
          x=sorted_df[category],

          margin=dict(l=20, r=20, t=20, b=20),
          title=f"Leaderboard for {category} Scores"
      )
+     return fig
+
+ def generate_visualizations(data, averages):
+     sns.set(style='whitegrid')
+
+     if averages.empty:
+         print("No averages to visualize.")
+         return None, None, None, None, None, None
+
+     averages = averages.sort_values(by='Average Performance')
+
+     # Matplotlib average performance plot
+     plt.figure(figsize=(12, 8))
+     sns.barplot(data=averages, x='Average Performance', y='Model', palette='viridis')
+     plt.title('Average Performance of Models', fontsize=16)
+     plt.xlabel('Average Performance', fontsize=12)
+     plt.ylabel('Model', fontsize=12)
+     plt.tight_layout()
+
+     # Save the plot to a buffer
+     buffer_avg = BytesIO()
+     plt.savefig(buffer_avg, format='png')
+     buffer_avg.seek(0)
+     image_avg = base64.b64encode(buffer_avg.read()).decode('utf-8')
+     plt.close()
+
+     # Line plot for task performance by model
+     sorted_models = averages['Model'].tolist()
+     data['Model'] = pd.Categorical(data['Model'], categories=sorted_models, ordered=True)
+     data = data.sort_values(by=['Model', 'Task'])
+
+     if data.empty:
+         print("No data available for line plot.")
+         return image_avg, None, None, None, None, None
+
+     plt.figure(figsize=(14, 10))
+     sns.lineplot(data=data, x='Task', y='Value', hue='Model', marker='o')
+     plt.title('Task Performance by Model', fontsize=16)
+     plt.xlabel('Task', fontsize=12)
+     plt.ylabel('Performance', fontsize=12)
+     plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Model')
+     plt.xticks(rotation=45)
+     plt.tight_layout()
+
+     # Save the line plot to a buffer
+     buffer_line = BytesIO()
+     plt.savefig(buffer_line, format='png')
+     buffer_line.seek(0)
+     image_line = base64.b64encode(buffer_line.read()).decode('utf-8')
+     plt.close()
+
+     # Heatmap of task performance
+     pivot_table = data.pivot_table(index='Task', columns='Model', values='Value')
+     plt.figure(figsize=(12, 10))
+     sns.heatmap(pivot_table, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
+     plt.title('Task Performance Heatmap', fontsize=16)
+     plt.xlabel('Model', fontsize=12)
+     plt.ylabel('Task', fontsize=12)
+     plt.tight_layout()
+
+     # Save the heatmap to a buffer
+     buffer_heatmap = BytesIO()
+     plt.savefig(buffer_heatmap, format='png')
+     buffer_heatmap.seek(0)
+     image_heatmap = base64.b64encode(buffer_heatmap.read()).decode('utf-8')
+     plt.close()
+
+     # Boxplot of performance distribution per model
+     plt.figure(figsize=(12, 8))
+     sns.boxplot(data=data, x='Model', y='Value', palette='Set2')
+     plt.title('Performance Distribution per Model', fontsize=16)
+     plt.xlabel('Model', fontsize=12)
+     plt.ylabel('Performance', fontsize=12)
+     plt.xticks(rotation=45)
+     plt.tight_layout()
+
+     # Save the boxplot to a buffer
+     buffer_boxplot = BytesIO()
+     plt.savefig(buffer_boxplot, format='png')
+     buffer_boxplot.seek(0)
+     image_boxplot = base64.b64encode(buffer_boxplot.read()).decode('utf-8')
+     plt.close()
+
+     # Create plotly bar charts
+     fig1 = create_bar_chart(averages, 'Average Performance')
+     plotly_avg = fig1.to_html(full_html=False)
+
+     plotly_tasks = {}
+     # Assuming there are tasks in the dataframe to display
+     tasks = data['Task'].unique()
+     for task in tasks:
+         task_data = data[data['Task'] == task]
+         fig2 = create_bar_chart(task_data, 'Value')
+         fig2.update_layout(title=f"Leaderboard for {task} Scores")
+         plotly_tasks[task] = fig2.to_html(full_html=False)
+
+     return image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks
+
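# Illustrative sketch only, not part of this commit: gr.Image typically renders a file path,
# a numpy array, or a PIL image rather than a raw base64 string, so a small decoder such as
# the hypothetical helper below could be applied to image_avg / image_line / image_heatmap /
# image_boxplot before they are handed to the interface outputs.
# from PIL import Image
# def b64_to_pil(b64_png):
#     """Decode a base64-encoded PNG (as produced above) back into a PIL image."""
#     return Image.open(BytesIO(base64.b64decode(b64_png)))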
+ def process_and_visualize(file_content):
+     data = parse_data(file_content)
+     averages = calculate_averages(data)
+
+     image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks = generate_visualizations(data, averages)
+
+     output_text = f"Average Performance per Model:\n{averages.sort_values(by='Average Performance').to_string()}"
+
+     return output_text, image_avg, image_line, image_heatmap, image_boxplot, plotly_avg, plotly_tasks
 
  if __name__ == "__main__":
+
+     iface = gr.Interface(
+         fn=process_and_visualize,
+         inputs=gr.Textbox(lines=10, label="Paste your data here"),
+         outputs=[
+             gr.Textbox(label="Average Performance per Model"),
+             gr.Image(label="Matplotlib Average Performance Chart"),
+             gr.Image(label="Matplotlib Task Performance Line Chart"),
+             gr.Image(label="Matplotlib Task Performance Heatmap"),
+             gr.Image(label="Matplotlib Performance Distribution Boxplot"),
+             gr.HTML(label="Plotly Average Performance Chart"),
+             gr.Accordion(
+                 [gr.HTML(label=f"Plotly {task} Chart") for task in ['tinyArc', 'tinyHellaswag', 'tinyMMLU', 'tinyTruthfulQA', 'tinyTruthfulQA_mc1', 'tinyWinogrande']],
+                 label="Task Charts"),
+         ],
+         title="LLM Benchmark Visualizer",
+         description="Upload your LLM benchmark data and visualize the results."
+     )
+
+     iface.launch(share=True)
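# Illustrative sketch only, not part of this commit: mapping the per-task dict returned by
# process_and_visualize onto the fixed set of outputs above can be awkward. One simpler
# alternative, under that assumption, is to collapse the dict into a single HTML string for
# one gr.HTML output, e.g.
#     task_html = "\n".join(plotly_tasks.values())
# and return task_html from process_and_visualize in place of the dict.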