djstrong committed
Commit 7a9f32a · 1 Parent(s): 235501b
Files changed (4)
  1. app.py +14 -3
  2. metadata.json +29 -1
  3. plot_results.py +153 -0
  4. requirements.txt +1 -0
app.py CHANGED
@@ -12,6 +12,7 @@ from src.about import (
     AUTHORS,
 )
 from src.display.formatting import make_clickable_model
+from plot_results import create_performance_plot
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -96,9 +97,16 @@ with demo:
     # rename columns
     leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model"})
     leaderboard_df = leaderboard_df.rename(columns={"Num Questions Parseable": "Percentage Questions Parseable"})
+    # Set midpoint for gradient coloring based on data ranges
 
-    leaderboard_df_styled = leaderboard_df.style.background_gradient(cmap="RdYlGn")
-    leaderboard_df_styled = leaderboard_df_styled.background_gradient(cmap="RdYlGn_r", subset=['Params'])
+    leaderboard_df_styled = leaderboard_df.style.background_gradient(
+        cmap="RdYlGn"
+    )
+    leaderboard_df_styled = leaderboard_df_styled.background_gradient(
+        cmap="RdYlGn_r",
+        subset=['Params'],
+        vmax=150
+    )
 
     rounding = {}
     # for col in ["Benchmark Score", "Num Questions Parseable"]:
@@ -110,13 +118,16 @@ with demo:
 
     leaderboard_table = gr.components.Dataframe(
         value=leaderboard_df_styled,
-        # headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
         datatype=['markdown', 'number', 'number', 'number', 'str'],
         elem_id="leaderboard-table",
         interactive=False,
         visible=True,
     )
 
+    # Create and show the performance plot below the table
+    fig = create_performance_plot()
+    plot = gr.Plot(value=fig, elem_id="performance-plot")
+
     gr.Markdown(AUTHORS, elem_classes="markdown-text")
 
 demo.queue(default_concurrency_limit=40).launch()
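The `vmax=150` argument in the hunk above caps the top of the reversed `RdYlGn_r` gradient, so the colours in the `Params` column are not compressed by the very largest models. A minimal sketch of the same pandas `Styler.background_gradient` calls on made-up data (model names and numbers are illustrative only; the first gradient is restricted to the score column here just to keep the toy frame simple):

```python
import pandas as pd

# Toy leaderboard frame -- values are invented for illustration.
df = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    "Benchmark Score": [62.1, 48.3, 55.0],
    "Params": [7, 70, 12],
})

# Green-is-better map for the score, reversed map for parameter count;
# vmax=150 pins the top of the Params scale so one huge model does not
# flatten the gradient for everything else.
styled = df.style.background_gradient(cmap="RdYlGn", subset=["Benchmark Score"])
styled = styled.background_gradient(cmap="RdYlGn_r", subset=["Params"], vmax=150)
print(styled.to_html())  # render to inspect the colours
```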
metadata.json CHANGED
@@ -319,5 +319,33 @@
     "speakleash/Bielik-11B-v2.0-Instruct": 11,
     "speakleash/Bielik-11B-v2.2-Instruct": 11,
     "speakleash/Bielik-11B-v2.1-Instruct": 11,
-    "speakleash/Bielik-11B-v2.3-Instruct": 11
+    "speakleash/Bielik-11B-v2.3-Instruct": 11,
+    "CYFRAGOVPL/PLLuM-12B-nc-chat": 12,
+    "CYFRAGOVPL/PLLuM-12B-chat": 12,
+    "CYFRAGOVPL/PLLuM-12B-instruct": 12,
+    "CYFRAGOVPL/Llama-PLLuM-8B-instruct": 8,
+    "CYFRAGOVPL/PLLuM-12B-nc-instruct": 12,
+    "CYFRAGOVPL/Llama-PLLuM-8B-chat": 8,
+    "CYFRAGOVPL/PLLuM-8x7B-nc-chat": 46.7,
+    "CYFRAGOVPL/PLLuM-8x7B-nc-instruct": 46.7,
+    "CYFRAGOVPL/PLLuM-8x7B-chat": 46.7,
+    "CYFRAGOVPL/PLLuM-8x7B-instruct": 46.7,
+    "CYFRAGOVPL/Llama-PLLuM-70B-chat": 70,
+    "CYFRAGOVPL/Llama-PLLuM-70B-instruct": 70,
+    "Qwen/Qwen2.5-7B-Instruct": 7,
+    "Qwen/Qwen2.5-14B-Instruct": 14,
+    "Qwen/Qwen2.5-1.5B-Instruct": 1.5,
+    "microsoft/phi-4": 14.7,
+    "Qwen/Qwen2.5-32B-Instruct": 32,
+    "Qwen/Qwen2.5-72B-Instruct": 72,
+    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": 70,
+    "meta-llama/Llama-3.2-1B-Instruct": 1,
+    "utter-project/EuroLLM-9B-Instruct": 9,
+    "mistralai/Mistral-Small-Instruct-2409": 22.2,
+    "mistralai/Mistral-Small-24B-Instruct-2501": 24,
+    "meta-llama/Llama-3.3-70B-Instruct": 70,
+    "meta-llama/Llama-3.2-3B-Instruct": 3,
+    "Qwen/Qwen2.5-3B-Instruct": 3,
+    "mistralai/Mistral-Nemo-Instruct-2407": 12,
+    "microsoft/Phi-4-mini-instruct": 4
 }
plot_results.py ADDED
@@ -0,0 +1,153 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+import json
+import csv
+
+def create_performance_plot(csv_path='benchmark_results.csv', metadata_path='metadata.json'):
+    # Define whitelist of interesting models (partial matches)
+    WHITELIST = [
+        'Meta-Llama-3.1-70B-Instruct'
+    ]
+
+    # Read the benchmark results with error handling for inconsistent rows
+    valid_rows = []
+    expected_fields = 14  # Number of expected fields in each row
+
+    with open(csv_path, 'r') as f:
+        reader = csv.reader(f)
+        header = next(reader)  # Get header row
+        # Strip whitespace from header names
+        header = [h.strip() for h in header]
+        for row in reader:
+            if len(row) == expected_fields:  # Only keep rows with correct number of fields
+                # Strip whitespace from values
+                valid_rows.append([val.strip() for val in row])
+
+    # Create DataFrame from valid rows
+    df = pd.DataFrame(valid_rows, columns=header)
+
+    # Read model sizes from metadata
+    with open(metadata_path, 'r') as f:
+        metadata = json.load(f)
+
+    # Process the data
+    # Keep only successful runs (where Benchmark Score is not FAILED)
+    df = df[df['Benchmark Score'] != 'FAILED']
+    df = df[df['Benchmark Score'].notna()]
+    # Convert score to numeric, handling invalid values
+    df['Benchmark Score'] = pd.to_numeric(df['Benchmark Score'], errors='coerce')
+    df = df[df['Benchmark Score'].notna()]  # Remove rows where conversion failed
+
+    # Convert Num Questions Parseable to numeric and calculate adjusted score
+    df['Num Questions Parseable'] = pd.to_numeric(df['Num Questions Parseable'], errors='coerce')
+    df['Benchmark Score'] = df['Benchmark Score'] * (df['Num Questions Parseable'] / 171)
+
+    # For each model, keep only the latest run
+    df['Run ID'] = df['Run ID'].fillna('')
+    df['timestamp'] = pd.to_datetime(df['Benchmark Completed'])
+    df = df.sort_values('timestamp')
+    df = df.drop_duplicates(subset=['Model Path'], keep='last')
+
+    # Get model sizes
+    def get_model_size(model_path):
+        # Try exact match first
+        if model_path in metadata:
+            return metadata[model_path]
+        # Try with max_length suffix
+        if f"{model_path},max_length=4096" in metadata:
+            return metadata[f"{model_path},max_length=4096"]
+        return None
+
+    # Print models without size before filtering
+    print("\nModels without size assigned:")
+    models_without_size = df[df['Model Path'].apply(get_model_size).isna()]
+    for model in models_without_size['Model Path']:
+        print(f"- {model}")
+
+    df['Model Size'] = df['Model Path'].apply(get_model_size)
+    df = df[df['Model Size'].notna()]
+
+    # Remove extreme outliers (scores that are clearly errors)
+    q1 = df['Benchmark Score'].quantile(0.25)
+    q3 = df['Benchmark Score'].quantile(0.75)
+    iqr = q3 - q1
+    df = df[
+        (df['Benchmark Score'] >= q1 - 1.5 * iqr) &
+        (df['Benchmark Score'] <= q3 + 1.5 * iqr)
+    ]
+
+    # Find models on Pareto frontier
+    sizes = sorted(df['Model Size'].unique())
+    frontier_points = []
+    max_score = float('-inf')
+    frontier_models = set()
+
+    for size in sizes:
+        # Get scores for models of this size or smaller
+        subset = df[df['Model Size'] <= size]
+        if len(subset) > 0:
+            max_score_idx = subset['Benchmark Score'].idxmax()
+            current_max = subset.loc[max_score_idx, 'Benchmark Score']
+            if current_max > max_score:
+                max_score = current_max
+                frontier_points.append((size, max_score))
+                frontier_models.add(subset.loc[max_score_idx, 'Model Path'])
+
+    # Filter models - keep those on Pareto frontier or matching whitelist
+    df['Keep'] = False
+    for idx, row in df.iterrows():
+        if row['Model Path'] in frontier_models:
+            df.loc[idx, 'Keep'] = True
+        else:
+            for pattern in WHITELIST:
+                if pattern in row['Model Path']:
+                    df.loc[idx, 'Keep'] = True
+                    break
+
+    df = df[df['Keep']]
+
+    # Create the plot
+    fig = plt.figure(figsize=(12, 8))
+
+    # Create scatter plot
+    plt.scatter(df['Model Size'],
+                df['Benchmark Score'],
+                alpha=0.6)
+
+    # Add labels for points
+    for idx, row in df.iterrows():
+        # Get model name - either last part of path or full name for special cases
+        model_name = row['Model Path'].split('/')[-1]
+        if any(pattern in row['Model Path'] for pattern in ['gpt-3', 'gpt-4']):
+            model_name = row['Model Path']
+
+        plt.annotate(model_name,
+                     (row['Model Size'], row['Benchmark Score']),
+                     xytext=(5, 5), textcoords='offset points',
+                     fontsize=8,
+                     bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))
+
+    # Plot the Pareto frontier line
+    if frontier_points:
+        frontier_x, frontier_y = zip(*frontier_points)
+        plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier')
+
+    # Customize the plot
+    plt.grid(True, linestyle='--', alpha=0.7)
+    plt.xlabel('Model Size (billions of parameters)')
+    plt.ylabel('Benchmark Score')
+    plt.title('Model Performance vs Size (Pareto Frontier)')
+
+    # Add legend
+    plt.legend()
+
+    # Adjust layout to prevent label cutoff
+    plt.tight_layout()
+
+    return fig
+
+if __name__ == "__main__":
+    # When run as a script, save the plot to a file
+    fig = create_performance_plot()
+    fig.savefig('model_performance.png', dpi=300, bbox_inches='tight')
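The frontier loop in the file above amounts to a running maximum over models sorted by size: a model joins the frontier only if it scores higher than every model at or below its size. A standalone sketch of that logic on hypothetical (size, score) pairs, not real benchmark results:

```python
# Hypothetical (model size in B params, benchmark score) pairs -- invented data.
points = [(1, 28.0), (3, 35.0), (7, 34.0), (12, 41.0), (70, 55.0)]

frontier = []
best = float("-inf")
for size, score in sorted(points):
    if score > best:  # strictly better than every model at or below this size
        best = score
        frontier.append((size, best))

print(frontier)  # [(1, 28.0), (3, 35.0), (12, 41.0), (70, 55.0)]
```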
requirements.txt CHANGED
@@ -2,3 +2,4 @@ tqdm
 gradio
 gradio_client
 pandas
+matplotlib