djstrong committed on
Commit
b314a79
·
1 Parent(s): 288816d

new models

Browse files
Files changed (2) hide show
  1. app.py +11 -3
  2. benchmark_results.csv +31 -0
app.py CHANGED
@@ -75,8 +75,6 @@ with demo:
75
  # move column order
76
  leaderboard_df = leaderboard_df[["Model Path", "Params", "Benchmark Score", "Num Questions Parseable", 'Error']]
77
 
78
- leaderboard_df["Model Path"] = leaderboard_df["Model Path"].apply(lambda x: make_clickable_model(x))
79
-
80
  # change value of column to nan
81
  leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)
82
 
@@ -93,10 +91,20 @@ with demo:
93
  # sort by 2 columns
94
  leaderboard_df = leaderboard_df.sort_values(by=["Benchmark Score", "Num Questions Parseable"],
95
  ascending=[False, False])
96
-
 
 
 
 
 
 
 
 
 
97
  # rename columns
98
  leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model"})
99
  leaderboard_df = leaderboard_df.rename(columns={"Num Questions Parseable": "Percentage Questions Parseable"})
 
100
  # Set midpoint for gradient coloring based on data ranges
101
 
102
  leaderboard_df_styled = leaderboard_df.style.background_gradient(
 
75
  # move column order
76
  leaderboard_df = leaderboard_df[["Model Path", "Params", "Benchmark Score", "Num Questions Parseable", 'Error']]
77
 
 
 
78
  # change value of column to nan
79
  leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)
80
 
 
91
  # sort by 2 columns
92
  leaderboard_df = leaderboard_df.sort_values(by=["Benchmark Score", "Num Questions Parseable"],
93
  ascending=[False, False])
94
+
95
+ # Print model names and scores to console before HTML formatting
96
+ print("\n===== MODEL RESULTS =====")
97
+ for index, row in leaderboard_df.iterrows():
98
+ print(f"{row['Model Path']}: {row['Benchmark Score']:.2f}")
99
+ print("========================\n")
100
+
101
+ # Apply HTML formatting for display
102
+ leaderboard_df["Model Path"] = leaderboard_df["Model Path"].apply(lambda x: make_clickable_model(x))
103
+
104
  # rename columns
105
  leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model"})
106
  leaderboard_df = leaderboard_df.rename(columns={"Num Questions Parseable": "Percentage Questions Parseable"})
107
+
108
  # Set midpoint for gradient coloring based on data ranges
109
 
110
  leaderboard_df_styled = leaderboard_df.style.background_gradient(
benchmark_results.csv CHANGED
@@ -153,3 +153,34 @@ gpt-4o-mini-2024-07-18,2024-08-25 21:17:34,openai_api,gpt-4o-mini-2024-07-18,,,7
153
  gpt-4o-2024-08-06,2024-08-25 21:24:35,openai_api,gpt-4o-2024-08-06,,,75.15,eq-bench_v2_pl,171.0,1,openai,,,
154
  gpt-4-turbo-2024-04-09,2024-08-25 21:31:42,openai_api,gpt-4-turbo-2024-04-09,,,77.77,eq-bench_v2_pl,164.0,1,openai,,,
155
  Bielik_v2.3,2024-09-14 10:40:57,,speakleash/Bielik-11B-v2.3-Instruct,,,70.86,eq-bench_v2_pl,171.0,1,transformers, ,,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  gpt-4o-2024-08-06,2024-08-25 21:24:35,openai_api,gpt-4o-2024-08-06,,,75.15,eq-bench_v2_pl,171.0,1,openai,,,
154
  gpt-4-turbo-2024-04-09,2024-08-25 21:31:42,openai_api,gpt-4-turbo-2024-04-09,,,77.77,eq-bench_v2_pl,164.0,1,openai,,,
155
  Bielik_v2.3,2024-09-14 10:40:57,,speakleash/Bielik-11B-v2.3-Instruct,,,70.86,eq-bench_v2_pl,171.0,1,transformers, ,,
156
+ PLLuM-12B-nc-chat,2025-02-24 15:02:07,,CYFRAGOVPL/PLLuM-12B-nc-chat,,,49.23,eq-bench_pl,123.0,1,transformers, ,,123.0 questions were parseable (min is 83%)
157
+ Llama-PLLuM-8B-instruct,2025-02-24 16:55:16,,CYFRAGOVPL/Llama-PLLuM-8B-instruct,,,43.56,eq-bench_pl,124.0,1,transformers, ,,124.0 questions were parseable (min is 83%)
158
+ PLLuM-12B-nc-instruct,2025-02-24 17:38:48,,CYFRAGOVPL/PLLuM-12B-nc-instruct,,,29.50,eq-bench_pl,76.0,1,transformers, ,,76.0 questions were parseable (min is 83%)
159
+ PLLuM-12B-chat,2025-02-24 17:56:34,,CYFRAGOVPL/PLLuM-12B-chat,,,57.29,eq-bench_v2_pl,156.0,1,transformers, ,,
160
+ PLLuM-12B-instruct,2025-02-24 18:03:06,,CYFRAGOVPL/PLLuM-12B-instruct,,,40.21,eq-bench_v2_pl,154.0,1,transformers, ,,
161
+ Llama-PLLuM-8B-chat,2025-02-24 18:40:04,,CYFRAGOVPL/Llama-PLLuM-8B-chat,,,50.97,eq-bench_v2_pl,155.0,1,transformers, ,,
162
+ Llama-PLLuM-70B-instruct,2025-02-23 22:45:37,,CYFRAGOVPL/Llama-PLLuM-70B-instruct,,,69.99,eq-bench_v2_pl,171.0,1,transformers, ,,
163
+ Llama-PLLuM-70B-chat,2025-02-24 22:32:57,,CYFRAGOVPL/Llama-PLLuM-70B-chat,,,72.99,eq-bench_v2_pl,170.0,1,transformers, ,,
164
+ PLLuM-8x7B-nc-chat,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-nc-chat,,,47.29,eq-bench_v2_pl,171.0,1,openai,,,
165
+ PLLuM-8x7B-nc-instruct,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-nc-instruct,,,41.75,eq-bench_v2_pl,171.0,1,openai,,,
166
+ PLLuM-8x7B-chat,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-chat,,,45.22,eq-bench_v2_pl,171.0,1,openai,,,
167
+ PLLuM-8x7B-instruct,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-instruct,,,39.55,eq-bench_v2_pl,171.0,1,openai,,,
168
+ Qwen2.5-7B-Instruct,2025-03-01 11:49:28,,Qwen/Qwen2.5-7B-Instruct,,,58.58,eq-bench_v2_pl,171.0,1,transformers,,,
169
+ Qwen2.5-14B-Instruct,2025-03-01 12:01:56,,Qwen/Qwen2.5-14B-Instruct,,,69.58,eq-bench_v2_pl,170.0,1,transformers,,,
170
+ Qwen2.5-1.5B-Instruct,2025-03-01 12:09:18,,Qwen/Qwen2.5-1.5B-Instruct,,,27.79,eq-bench_v2_pl,170.0,1,transformers,,,
171
+ phi-4,2025-03-01 12:19:38,,microsoft/phi-4,,,64.37,eq-bench_v2_pl,157.0,1,transformers,,,
172
+ glm-4-9b-chat,2025-03-01 12:23:46,,THUDM/glm-4-9b-chat,,,61.79,eq-bench_v2_pl,171.0,1,transformers,,,
173
+ openchat-3.6-8b-20240522,2025-03-01 12:29:29,,openchat/openchat-3.6-8b-20240522,,,-2.0090659464796536e+18,eq-bench_v2_pl,170.0,1,transformers,,,
174
+ Qwen2.5-32B-Instruct,2025-03-02 14:08:52,,Qwen/Qwen2.5-32B-Instruct,,,71.15,eq-bench_v2_pl,171.0,1,transformers,,,
175
+ Qwen2.5-72B-Instruct,2025-03-02 14:25:32,,Qwen/Qwen2.5-72B-Instruct,,,68.89,eq-bench_v2_pl,170.0,1,transformers,,,
176
+ Llama-3.1-Nemotron-70B-Instruct-HF,2025-03-02 15:04:25,,nvidia/Llama-3.1-Nemotron-70B-Instruct-HF,,,74.75,eq-bench_pl,133.0,1,transformers,,,133.0 questions were parseable (min is 83%)
177
+ Llama-3.2-1B-Instruct,2025-03-02 16:35:24,,meta-llama/Llama-3.2-1B-Instruct,,,20.59,eq-bench_v2_pl,148.0,1,transformers,,,
178
+ EuroLLM-9B-Instruct,2025-03-02 16:41:02,,utter-project/EuroLLM-9B-Instruct,,,54.75,eq-bench_v2_pl,169.0,1,transformers,,,
179
+ Llama-3.3-70B-Instruct,2025-03-02 16:59:31,,meta-llama/Llama-3.3-70B-Instruct,,,72.86,eq-bench_v2_pl,166.0,1,transformers,,,
180
+ Llama-3.2-3B-Instruct,2025-03-02 17:14:17,,meta-llama/Llama-3.2-3B-Instruct,,,46.46,eq-bench_v2_pl,170.0,1,transformers,,,
181
+ Qwen2.5-3B-Instruct,2025-03-02 17:26:57,,Qwen/Qwen2.5-3B-Instruct,,,36.08,eq-bench_v2_pl,170.0,1,transformers,,,
182
+ Mistral-Small-24B-Instruct-2501,2025-03-02 17:33:14,,mistralai/Mistral-Small-24B-Instruct-2501,,,70.52,eq-bench_v2_pl,171.0,1,transformers,,,
183
+ Mistral-Small-Instruct-2409,2025-03-02 17:43:01,,mistralai/Mistral-Small-Instruct-2409,,,72.85,eq-bench_v2_pl,171.0,1,transformers,,,
184
+ Mistral-Nemo-Instruct-2407,2025-03-03 10:29:42,,mistralai/Mistral-Nemo-Instruct-2407,,,61.76,eq-bench_v2_pl,171.0,1,transformers,,,
185
+ Phi-4-mini-instruct,2025-03-03 13:20:03,,microsoft/Phi-4-mini-instruct,,,50.82,eq-bench_v2_pl,170.0,1,transformers,,,
186
+ Mistral-Large-Instruct-2411,2025-03-07 12:17:17,,mistralai/Mistral-Large-Instruct-2411,,,77.29,eq-bench_v2_pl,171.0,1,transformers,,,