Spaces:

speakleash
/

polish_eq-bench

Running

App Files Files Community

djstrong commited on 12 days ago

Commit

b314a79

1 Parent(s): 288816d

new models

Browse files

Files changed (2) hide show

app.py +11 -3
benchmark_results.csv +31 -0

app.py CHANGED Viewed

@@ -75,8 +75,6 @@ with demo:
     # move column order
     leaderboard_df = leaderboard_df[["Model Path", "Params", "Benchmark Score", "Num Questions Parseable", 'Error']]
-    leaderboard_df["Model Path"] = leaderboard_df["Model Path"].apply(lambda x: make_clickable_model(x))
     # change value of column to nan
     leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)
@@ -93,10 +91,20 @@ with demo:
     # sort by 2 columns
     leaderboard_df = leaderboard_df.sort_values(by=["Benchmark Score", "Num Questions Parseable"],
                                                 ascending=[False, False])
     # rename columns
     leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model"})
     leaderboard_df = leaderboard_df.rename(columns={"Num Questions Parseable": "Percentage Questions Parseable"})
     # Set midpoint for gradient coloring based on data ranges
     leaderboard_df_styled = leaderboard_df.style.background_gradient(

     # move column order
     leaderboard_df = leaderboard_df[["Model Path", "Params", "Benchmark Score", "Num Questions Parseable", 'Error']]
     # change value of column to nan
     leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)
     # sort by 2 columns
     leaderboard_df = leaderboard_df.sort_values(by=["Benchmark Score", "Num Questions Parseable"],
                                                 ascending=[False, False])
+    # Print model names and scores to console before HTML formatting
+    print("\n===== MODEL RESULTS =====")
+    for index, row in leaderboard_df.iterrows():
+        print(f"{row['Model Path']}: {row['Benchmark Score']:.2f}")
+    print("========================\n")
+    # Apply HTML formatting for display
+    leaderboard_df["Model Path"] = leaderboard_df["Model Path"].apply(lambda x: make_clickable_model(x))
     # rename columns
     leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model"})
     leaderboard_df = leaderboard_df.rename(columns={"Num Questions Parseable": "Percentage Questions Parseable"})
     # Set midpoint for gradient coloring based on data ranges
     leaderboard_df_styled = leaderboard_df.style.background_gradient(

benchmark_results.csv CHANGED Viewed

@@ -153,3 +153,34 @@ gpt-4o-mini-2024-07-18,2024-08-25 21:17:34,openai_api,gpt-4o-mini-2024-07-18,,,7
 gpt-4o-2024-08-06,2024-08-25 21:24:35,openai_api,gpt-4o-2024-08-06,,,75.15,eq-bench_v2_pl,171.0,1,openai,,,
 gpt-4-turbo-2024-04-09,2024-08-25 21:31:42,openai_api,gpt-4-turbo-2024-04-09,,,77.77,eq-bench_v2_pl,164.0,1,openai,,,
 Bielik_v2.3,2024-09-14 10:40:57,,speakleash/Bielik-11B-v2.3-Instruct,,,70.86,eq-bench_v2_pl,171.0,1,transformers, ,,

 gpt-4o-2024-08-06,2024-08-25 21:24:35,openai_api,gpt-4o-2024-08-06,,,75.15,eq-bench_v2_pl,171.0,1,openai,,,
 gpt-4-turbo-2024-04-09,2024-08-25 21:31:42,openai_api,gpt-4-turbo-2024-04-09,,,77.77,eq-bench_v2_pl,164.0,1,openai,,,
 Bielik_v2.3,2024-09-14 10:40:57,,speakleash/Bielik-11B-v2.3-Instruct,,,70.86,eq-bench_v2_pl,171.0,1,transformers, ,,
+PLLuM-12B-nc-chat,2025-02-24 15:02:07,,CYFRAGOVPL/PLLuM-12B-nc-chat,,,49.23,eq-bench_pl,123.0,1,transformers, ,,123.0 questions were parseable (min is 83%)
+Llama-PLLuM-8B-instruct,2025-02-24 16:55:16,,CYFRAGOVPL/Llama-PLLuM-8B-instruct,,,43.56,eq-bench_pl,124.0,1,transformers, ,,124.0 questions were parseable (min is 83%)
+PLLuM-12B-nc-instruct,2025-02-24 17:38:48,,CYFRAGOVPL/PLLuM-12B-nc-instruct,,,29.50,eq-bench_pl,76.0,1,transformers, ,,76.0 questions were parseable (min is 83%)
+PLLuM-12B-chat,2025-02-24 17:56:34,,CYFRAGOVPL/PLLuM-12B-chat,,,57.29,eq-bench_v2_pl,156.0,1,transformers, ,,
+PLLuM-12B-instruct,2025-02-24 18:03:06,,CYFRAGOVPL/PLLuM-12B-instruct,,,40.21,eq-bench_v2_pl,154.0,1,transformers, ,,
+Llama-PLLuM-8B-chat,2025-02-24 18:40:04,,CYFRAGOVPL/Llama-PLLuM-8B-chat,,,50.97,eq-bench_v2_pl,155.0,1,transformers, ,,
+Llama-PLLuM-70B-instruct,2025-02-23 22:45:37,,CYFRAGOVPL/Llama-PLLuM-70B-instruct,,,69.99,eq-bench_v2_pl,171.0,1,transformers, ,,
+Llama-PLLuM-70B-chat,2025-02-24 22:32:57,,CYFRAGOVPL/Llama-PLLuM-70B-chat,,,72.99,eq-bench_v2_pl,170.0,1,transformers, ,,
+PLLuM-8x7B-nc-chat,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-nc-chat,,,47.29,eq-bench_v2_pl,171.0,1,openai,,,
+PLLuM-8x7B-nc-instruct,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-nc-instruct,,,41.75,eq-bench_v2_pl,171.0,1,openai,,,
+PLLuM-8x7B-chat,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-chat,,,45.22,eq-bench_v2_pl,171.0,1,openai,,,
+PLLuM-8x7B-instruct,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-instruct,,,39.55,eq-bench_v2_pl,171.0,1,openai,,,
+Qwen2.5-7B-Instruct,2025-03-01 11:49:28,,Qwen/Qwen2.5-7B-Instruct,,,58.58,eq-bench_v2_pl,171.0,1,transformers,,,
+Qwen2.5-14B-Instruct,2025-03-01 12:01:56,,Qwen/Qwen2.5-14B-Instruct,,,69.58,eq-bench_v2_pl,170.0,1,transformers,,,
+Qwen2.5-1.5B-Instruct,2025-03-01 12:09:18,,Qwen/Qwen2.5-1.5B-Instruct,,,27.79,eq-bench_v2_pl,170.0,1,transformers,,,
+phi-4,2025-03-01 12:19:38,,microsoft/phi-4,,,64.37,eq-bench_v2_pl,157.0,1,transformers,,,
+glm-4-9b-chat,2025-03-01 12:23:46,,THUDM/glm-4-9b-chat,,,61.79,eq-bench_v2_pl,171.0,1,transformers,,,
+openchat-3.6-8b-20240522,2025-03-01 12:29:29,,openchat/openchat-3.6-8b-20240522,,,-2.0090659464796536e+18,eq-bench_v2_pl,170.0,1,transformers,,,
+Qwen2.5-32B-Instruct,2025-03-02 14:08:52,,Qwen/Qwen2.5-32B-Instruct,,,71.15,eq-bench_v2_pl,171.0,1,transformers,,,
+Qwen2.5-72B-Instruct,2025-03-02 14:25:32,,Qwen/Qwen2.5-72B-Instruct,,,68.89,eq-bench_v2_pl,170.0,1,transformers,,,
+Llama-3.1-Nemotron-70B-Instruct-HF,2025-03-02 15:04:25,,nvidia/Llama-3.1-Nemotron-70B-Instruct-HF,,,74.75,eq-bench_pl,133.0,1,transformers,,,133.0 questions were parseable (min is 83%)
+Llama-3.2-1B-Instruct,2025-03-02 16:35:24,,meta-llama/Llama-3.2-1B-Instruct,,,20.59,eq-bench_v2_pl,148.0,1,transformers,,,
+EuroLLM-9B-Instruct,2025-03-02 16:41:02,,utter-project/EuroLLM-9B-Instruct,,,54.75,eq-bench_v2_pl,169.0,1,transformers,,,
+Llama-3.3-70B-Instruct,2025-03-02 16:59:31,,meta-llama/Llama-3.3-70B-Instruct,,,72.86,eq-bench_v2_pl,166.0,1,transformers,,,
+Llama-3.2-3B-Instruct,2025-03-02 17:14:17,,meta-llama/Llama-3.2-3B-Instruct,,,46.46,eq-bench_v2_pl,170.0,1,transformers,,,
+Qwen2.5-3B-Instruct,2025-03-02 17:26:57,,Qwen/Qwen2.5-3B-Instruct,,,36.08,eq-bench_v2_pl,170.0,1,transformers,,,
+Mistral-Small-24B-Instruct-2501,2025-03-02 17:33:14,,mistralai/Mistral-Small-24B-Instruct-2501,,,70.52,eq-bench_v2_pl,171.0,1,transformers,,,
+Mistral-Small-Instruct-2409,2025-03-02 17:43:01,,mistralai/Mistral-Small-Instruct-2409,,,72.85,eq-bench_v2_pl,171.0,1,transformers,,,
+Mistral-Nemo-Instruct-2407,2025-03-03 10:29:42,,mistralai/Mistral-Nemo-Instruct-2407,,,61.76,eq-bench_v2_pl,171.0,1,transformers,,,
+Phi-4-mini-instruct,2025-03-03 13:20:03,,microsoft/Phi-4-mini-instruct,,,50.82,eq-bench_v2_pl,170.0,1,transformers,,,
+Mistral-Large-Instruct-2411,2025-03-07 12:17:17,,mistralai/Mistral-Large-Instruct-2411,,,77.29,eq-bench_v2_pl,171.0,1,transformers,,,