Upload arena_hard_leaderboard_20240703.csv

#41
by connorchenn - opened
Files changed (1) hide show
  1. arena_hard_leaderboard_20240703.csv +55 -0
arena_hard_leaderboard_20240703.csv ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,score,rating_q025,rating_q975,CI,avg_tokens
2
+ gpt-4-turbo-2024-04-09,82.63,80.82,85.05,"(-1.81, +2.42)",662.0
3
+ claude-3-5-sonnet-20240620,79.35,77.59,81.07,"(-1.76, +1.72)",567.0
4
+ gpt-4o-2024-05-13,79.21,77.24,80.94,"(-1.97, +1.73)",696.0
5
+ gpt-4-0125-preview,77.96,76.15,79.98,"(-1.81, +2.02)",619.0
6
+ gemini-1.5-pro-api-0514,71.96,69.51,74.19,"(-2.45, +2.23)",676.0
7
+ yi-large-preview,71.48,69.51,73.92,"(-1.97, +2.44)",720.0
8
+ glm-4-0520,63.84,61.33,65.92,"(-2.51, +2.08)",636.0
9
+ yi-large,63.7,61.21,65.69,"(-2.49, +1.99)",626.0
10
+ deepseek-coder-v2,62.3,60.21,64.96,"(-2.09, +2.66)",578.0
11
+ claude-3-opus-20240229,60.36,58.07,62.92,"(-2.29, +2.56)",541.0
12
+ gemma-2-27b-it,57.51,55.77,60.15,"(-1.74, +2.64)",577.0
13
+ glm-4-0116,55.72,53.16,58.03,"(-2.56, +2.31)",622.0
14
+ gemini-1.5-pro-api-0409-preview,53.37,51.1,56.13,"(-2.27, +2.76)",478.0
15
+ glm-4-air,50.88,48.17,53.71,"(-2.71, +2.83)",619.0
16
+ gpt-4-0314,50.0,50.0,50.0,"(-0.00, +0.00)",423.0
17
+ gemini-1.5-flash-api-0514,49.61,47.48,52.38,"(-2.13, +2.77)",642.0
18
+ qwen2-72b-instruct,46.86,43.54,49.13,"(-3.32, +2.27)",515.0
19
+ claude-3-sonnet-20240229,46.8,44.39,49.4,"(-2.41, +2.60)",552.0
20
+ claude-3-haiku-20240307,41.47,39.33,43.99,"(-2.14, +2.52)",505.0
21
+ llama-3-70b-instruct,41.06,38.97,43.85,"(-2.09, +2.79)",583.0
22
+ gpt-4-0613,37.9,35.28,40.0,"(-2.62, +2.10)",354.0
23
+ mistral-large-2402,37.71,35.41,40.1,"(-2.30, +2.39)",400.0
24
+ mixtral-8x22b-instruct-v0.1,36.36,33.98,38.7,"(-2.38, +2.34)",430.0
25
+ qwen1.5-72b-chat,36.12,33.65,38.44,"(-2.47, +2.32)",474.0
26
+ phi-3-medium-4k-instruct,33.37,31.29,35.97,"(-2.08, +2.60)",517.0
27
+ command-r-plus,33.07,31.11,34.75,"(-1.96, +1.68)",541.0
28
+ mistral-medium,31.9,29.98,34.44,"(-1.92, +2.54)",485.0
29
+ phi-3-small-8k-instruct,29.77,27.8,31.83,"(-1.97, +2.06)",568.0
30
+ mistral-next,27.37,25.34,29.65,"(-2.03, +2.28)",297.0
31
+ gpt-3.5-turbo-0613,24.82,22.95,27.05,"(-1.87, +2.23)",401.0
32
+ dbrx-instruct-preview,24.63,22.99,26.74,"(-1.64, +2.11)",415.0
33
+ claude-2.0,23.99,22.17,25.79,"(-1.82, +1.80)",295.0
34
+ mixtral-8x7b-instruct-v0.1,23.4,21.54,25.54,"(-1.86, +2.14)",457.0
35
+ gpt-3.5-turbo-0125,23.34,21.58,25.38,"(-1.76, +2.04)",329.0
36
+ yi-34b-chat,23.15,21.26,25.15,"(-1.89, +2.00)",611.0
37
+ starling-lm-7b-beta,23.01,20.7,24.97,"(-2.31, +1.96)",530.0
38
+ claude-2.1,22.77,21.26,24.68,"(-1.51, +1.91)",290.0
39
+ snorkel-mistral-pairrm-dpo,20.73,18.76,22.76,"(-1.97, +2.03)",564.0
40
+ llama-3-8b-instruct,20.56,18.7,22.27,"(-1.86, +1.71)",585.0
41
+ gpt-3.5-turbo-1106,18.87,17.18,20.41,"(-1.69, +1.54)",285.0
42
+ gpt-3.5-turbo-0314,18.05,16.42,19.36,"(-1.63, +1.31)",334.0
43
+ gemini-pro,17.8,16.25,19.75,"(-1.55, +1.95)",322.0
44
+ snowflake-arctic-instruct,17.61,16.07,19.34,"(-1.54, +1.73)",365.0
45
+ command-r,17.02,15.4,18.41,"(-1.62, +1.39)",432.0
46
+ phi-3-mini-128k-instruct,15.43,13.94,17.05,"(-1.49, +1.62)",609.0
47
+ tulu-2-dpo-70b,14.99,13.35,16.92,"(-1.64, +1.93)",550.0
48
+ starling-lm-7b-alpha,12.8,11.47,14.34,"(-1.33, +1.54)",483.0
49
+ mistral-7b-instruct,12.57,11.06,14.35,"(-1.51, +1.78)",541.0
50
+ gemma-1.1-7b-it,12.09,10.65,13.64,"(-1.44, +1.55)",341.0
51
+ llama-2-70b-chat,11.55,10.28,12.72,"(-1.27, +1.17)",595.0
52
+ vicuna-33b,8.63,7.48,9.79,"(-1.15, +1.16)",451.0
53
+ gemma-7b-it,7.47,6.38,8.84,"(-1.09, +1.37)",378.0
54
+ gemma-1.1-2b-it,3.37,2.78,4.19,"(-0.59, +0.82)",316.0
55
+ gemma-2b-it,3.0,2.33,3.69,"(-0.67, +0.69)",369.0