Upload arena_hard_leaderboard_20240703.csv
#41
by
connorchenn
- opened
arena_hard_leaderboard_20240703.csv
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model,score,rating_q025,rating_q975,CI,avg_tokens
|
2 |
+
gpt-4-turbo-2024-04-09,82.63,80.82,85.05,"(-1.81, +2.42)",662.0
|
3 |
+
claude-3-5-sonnet-20240620,79.35,77.59,81.07,"(-1.76, +1.72)",567.0
|
4 |
+
gpt-4o-2024-05-13,79.21,77.24,80.94,"(-1.97, +1.73)",696.0
|
5 |
+
gpt-4-0125-preview,77.96,76.15,79.98,"(-1.81, +2.02)",619.0
|
6 |
+
gemini-1.5-pro-api-0514,71.96,69.51,74.19,"(-2.45, +2.23)",676.0
|
7 |
+
yi-large-preview,71.48,69.51,73.92,"(-1.97, +2.44)",720.0
|
8 |
+
glm-4-0520,63.84,61.33,65.92,"(-2.51, +2.08)",636.0
|
9 |
+
yi-large,63.7,61.21,65.69,"(-2.49, +1.99)",626.0
|
10 |
+
deepseek-coder-v2,62.3,60.21,64.96,"(-2.09, +2.66)",578.0
|
11 |
+
claude-3-opus-20240229,60.36,58.07,62.92,"(-2.29, +2.56)",541.0
|
12 |
+
gemma-2-27b-it,57.51,55.77,60.15,"(-1.74, +2.64)",577.0
|
13 |
+
glm-4-0116,55.72,53.16,58.03,"(-2.56, +2.31)",622.0
|
14 |
+
gemini-1.5-pro-api-0409-preview,53.37,51.1,56.13,"(-2.27, +2.76)",478.0
|
15 |
+
glm-4-air,50.88,48.17,53.71,"(-2.71, +2.83)",619.0
|
16 |
+
gpt-4-0314,50.0,50.0,50.0,"(-0.00, +0.00)",423.0
|
17 |
+
gemini-1.5-flash-api-0514,49.61,47.48,52.38,"(-2.13, +2.77)",642.0
|
18 |
+
qwen2-72b-instruct,46.86,43.54,49.13,"(-3.32, +2.27)",515.0
|
19 |
+
claude-3-sonnet-20240229,46.8,44.39,49.4,"(-2.41, +2.60)",552.0
|
20 |
+
claude-3-haiku-20240307,41.47,39.33,43.99,"(-2.14, +2.52)",505.0
|
21 |
+
llama-3-70b-instruct,41.06,38.97,43.85,"(-2.09, +2.79)",583.0
|
22 |
+
gpt-4-0613,37.9,35.28,40.0,"(-2.62, +2.10)",354.0
|
23 |
+
mistral-large-2402,37.71,35.41,40.1,"(-2.30, +2.39)",400.0
|
24 |
+
mixtral-8x22b-instruct-v0.1,36.36,33.98,38.7,"(-2.38, +2.34)",430.0
|
25 |
+
qwen1.5-72b-chat,36.12,33.65,38.44,"(-2.47, +2.32)",474.0
|
26 |
+
phi-3-medium-4k-instruct,33.37,31.29,35.97,"(-2.08, +2.60)",517.0
|
27 |
+
command-r-plus,33.07,31.11,34.75,"(-1.96, +1.68)",541.0
|
28 |
+
mistral-medium,31.9,29.98,34.44,"(-1.92, +2.54)",485.0
|
29 |
+
phi-3-small-8k-instruct,29.77,27.8,31.83,"(-1.97, +2.06)",568.0
|
30 |
+
mistral-next,27.37,25.34,29.65,"(-2.03, +2.28)",297.0
|
31 |
+
gpt-3.5-turbo-0613,24.82,22.95,27.05,"(-1.87, +2.23)",401.0
|
32 |
+
dbrx-instruct-preview,24.63,22.99,26.74,"(-1.64, +2.11)",415.0
|
33 |
+
claude-2.0,23.99,22.17,25.79,"(-1.82, +1.80)",295.0
|
34 |
+
mixtral-8x7b-instruct-v0.1,23.4,21.54,25.54,"(-1.86, +2.14)",457.0
|
35 |
+
gpt-3.5-turbo-0125,23.34,21.58,25.38,"(-1.76, +2.04)",329.0
|
36 |
+
yi-34b-chat,23.15,21.26,25.15,"(-1.89, +2.00)",611.0
|
37 |
+
starling-lm-7b-beta,23.01,20.7,24.97,"(-2.31, +1.96)",530.0
|
38 |
+
claude-2.1,22.77,21.26,24.68,"(-1.51, +1.91)",290.0
|
39 |
+
snorkel-mistral-pairrm-dpo,20.73,18.76,22.76,"(-1.97, +2.03)",564.0
|
40 |
+
llama-3-8b-instruct,20.56,18.7,22.27,"(-1.86, +1.71)",585.0
|
41 |
+
gpt-3.5-turbo-1106,18.87,17.18,20.41,"(-1.69, +1.54)",285.0
|
42 |
+
gpt-3.5-turbo-0314,18.05,16.42,19.36,"(-1.63, +1.31)",334.0
|
43 |
+
gemini-pro,17.8,16.25,19.75,"(-1.55, +1.95)",322.0
|
44 |
+
snowflake-arctic-instruct,17.61,16.07,19.34,"(-1.54, +1.73)",365.0
|
45 |
+
command-r,17.02,15.4,18.41,"(-1.62, +1.39)",432.0
|
46 |
+
phi-3-mini-128k-instruct,15.43,13.94,17.05,"(-1.49, +1.62)",609.0
|
47 |
+
tulu-2-dpo-70b,14.99,13.35,16.92,"(-1.64, +1.93)",550.0
|
48 |
+
starling-lm-7b-alpha,12.8,11.47,14.34,"(-1.33, +1.54)",483.0
|
49 |
+
mistral-7b-instruct,12.57,11.06,14.35,"(-1.51, +1.78)",541.0
|
50 |
+
gemma-1.1-7b-it,12.09,10.65,13.64,"(-1.44, +1.55)",341.0
|
51 |
+
llama-2-70b-chat,11.55,10.28,12.72,"(-1.27, +1.17)",595.0
|
52 |
+
vicuna-33b,8.63,7.48,9.79,"(-1.15, +1.16)",451.0
|
53 |
+
gemma-7b-it,7.47,6.38,8.84,"(-1.09, +1.37)",378.0
|
54 |
+
gemma-1.1-2b-it,3.37,2.78,4.19,"(-0.59, +0.82)",316.0
|
55 |
+
gemma-2b-it,3.0,2.33,3.69,"(-0.67, +0.69)",369.0
|