cyberosa committed on
Commit
a531010
·
1 Parent(s): 0016913

updated leaderboard with new accuracy values

Browse files
Files changed (1) hide show
  1. formatted_data.csv +6 -4
formatted_data.csv CHANGED
@@ -1,9 +1,9 @@
1
  Tool,Model,Accuracy,Correct,Total,Mean Tokens Used,Mean Cost ($)
2
  prediction-request-rag,gpt-4o-2024-08-06,0.7909967845659164,246,311,2437.675241157556,0.0314014469453376
3
- prediction-request-reasoning,claude-3-5-sonnet-20240620,0.7830508474576271,231,295,2815.3050847457625,0.0209063491525423
4
  prediction-request-reasoning,gpt-4o-2024-08-06,0.7642585551330798,201,263,2408.7148288973385,0.0412335361216729
5
  superforcaster,gpt-4o-2024-08-06,0.7638036809815951,249,326,2131.929447852761,0.0221592944785276
6
- prediction-request-rag,claude-3-5-sonnet-20240620,0.7582089552238805,254,335,2893.865671641791,0.0156288805970149
7
  prediction-offline,claude-3-sonnet-20240229,0.756838905775076,249,329,920.5987841945288,0.0033739787234042
8
  prediction-offline,claude-3-opus-20240229,0.7558823529411764,257,340,920.45,0.016866044117647
9
  prediction-request-reasoning,claude-3-sonnet-20240229,0.753125,241,320,2645.509375,0.0192545156249999
@@ -12,12 +12,14 @@ prediction-online,gpt-4o-2024-08-06,0.7507418397626113,253,337,2484.083086053412
12
  prediction-offline-sme,gpt-4o-2024-08-06,0.7485207100591716,253,338,1381.3727810650887,0.0173823668639053
13
  prediction-offline-sme,gpt-4-0125-preview,0.7484848484848485,247,330,1416.8484848484848,0.0181692121212121
14
  prediction-request-reasoning,gpt-4-0125-preview,0.7483221476510067,223,298,1980.7281879194632,0.0256767449664429
15
- prediction-offline,claude-3-5-sonnet-20240620,0.7433628318584071,252,339,815.4306784660766,0.0030337964601769
 
16
  prediction-online,claude-3-sonnet-20240229,0.7411764705882353,252,340,2832.7617647058823,0.0095903911764705
 
17
  prediction-url-cot,claude-3-sonnet-20240229,0.7355623100303952,242,329,14789.27963525836,0.0510609574468085
18
- prediction-online,claude-3-5-sonnet-20240620,0.7337278106508875,248,338,2773.6745562130177,0.0099932130177514
19
  prediction-request-reasoning,claude-3-opus-20240229,0.7337278106508875,248,338,2773.284023668639,0.1062446449704141
20
  prediction-request-rag,claude-3-sonnet-20240229,0.7331288343558282,239,326,2850.1196319018404,0.0146586533742331
 
21
  prediction-offline,claude-2,0.7201834862385321,157,218,779.4770642201835,0.0068916697247706
22
  prediction-online,databricks/dbrx-instruct:nitro,0.7173252279635258,236,329,2696.0607902735564,0.0024264547112461
23
  superforcaster,gpt-4-0125-preview,0.7169230769230769,233,325,2143.230769230769,0.0222704615384615
 
1
  Tool,Model,Accuracy,Correct,Total,Mean Tokens Used,Mean Cost ($)
2
  prediction-request-rag,gpt-4o-2024-08-06,0.7909967845659164,246,311,2437.675241157556,0.0314014469453376
3
+ prediction-request-reasoning,claude-3-5-sonnet-20240620,0.7893175074183977,266,337,2825.774480712166,0.020917299703264
4
  prediction-request-reasoning,gpt-4o-2024-08-06,0.7642585551330798,201,263,2408.7148288973385,0.0412335361216729
5
  superforcaster,gpt-4o-2024-08-06,0.7638036809815951,249,326,2131.929447852761,0.0221592944785276
6
+ prediction-request-rag,claude-3-5-sonnet-20240620,0.7603550295857988,257,338,2900.044378698225,0.0157145591715976
7
  prediction-offline,claude-3-sonnet-20240229,0.756838905775076,249,329,920.5987841945288,0.0033739787234042
8
  prediction-offline,claude-3-opus-20240229,0.7558823529411764,257,340,920.45,0.016866044117647
9
  prediction-request-reasoning,claude-3-sonnet-20240229,0.753125,241,320,2645.509375,0.0192545156249999
 
12
  prediction-offline-sme,gpt-4o-2024-08-06,0.7485207100591716,253,338,1381.3727810650887,0.0173823668639053
13
  prediction-offline-sme,gpt-4-0125-preview,0.7484848484848485,247,330,1416.8484848484848,0.0181692121212121
14
  prediction-request-reasoning,gpt-4-0125-preview,0.7483221476510067,223,298,1980.7281879194632,0.0256767449664429
15
+ prediction-online,claude-3-5-sonnet-20240620,0.7441176470588236,253,340,2774.755882352941,0.0100005970588235
16
+ prediction-online-sme,gpt-4o-2024-08-06,0.7430340557275542,240,323,3147.260061919505,0.0363173684210526
17
  prediction-online,claude-3-sonnet-20240229,0.7411764705882353,252,340,2832.7617647058823,0.0095903911764705
18
+ prediction-offline,claude-3-5-sonnet-20240620,0.7374631268436578,250,339,815.4542772861357,0.0030341504424778
19
  prediction-url-cot,claude-3-sonnet-20240229,0.7355623100303952,242,329,14789.27963525836,0.0510609574468085
 
20
  prediction-request-reasoning,claude-3-opus-20240229,0.7337278106508875,248,338,2773.284023668639,0.1062446449704141
21
  prediction-request-rag,claude-3-sonnet-20240229,0.7331288343558282,239,326,2850.1196319018404,0.0146586533742331
22
+ prediction-url-cot,claude-3-5-sonnet-20240620,0.7286135693215339,247,339,14794.787610619467,0.0514230530973451
23
  prediction-offline,claude-2,0.7201834862385321,157,218,779.4770642201835,0.0068916697247706
24
  prediction-online,databricks/dbrx-instruct:nitro,0.7173252279635258,236,329,2696.0607902735564,0.0024264547112461
25
  superforcaster,gpt-4-0125-preview,0.7169230769230769,233,325,2143.230769230769,0.0222704615384615