cyberosa
committed on
Commit
·
a531010
1
Parent(s):
0016913
updated leaderboard with new accuracy values
Browse files
- formatted_data.csv +6 -4
formatted_data.csv
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
Tool,Model,Accuracy,Correct,Total,Mean Tokens Used,Mean Cost ($)
|
2 |
prediction-request-rag,gpt-4o-2024-08-06,0.7909967845659164,246,311,2437.675241157556,0.0314014469453376
|
3 |
-
prediction-request-reasoning,claude-3-5-sonnet-20240620,0.
|
4 |
prediction-request-reasoning,gpt-4o-2024-08-06,0.7642585551330798,201,263,2408.7148288973385,0.0412335361216729
|
5 |
superforcaster,gpt-4o-2024-08-06,0.7638036809815951,249,326,2131.929447852761,0.0221592944785276
|
6 |
-
prediction-request-rag,claude-3-5-sonnet-20240620,0.
|
7 |
prediction-offline,claude-3-sonnet-20240229,0.756838905775076,249,329,920.5987841945288,0.0033739787234042
|
8 |
prediction-offline,claude-3-opus-20240229,0.7558823529411764,257,340,920.45,0.016866044117647
|
9 |
prediction-request-reasoning,claude-3-sonnet-20240229,0.753125,241,320,2645.509375,0.0192545156249999
|
@@ -12,12 +12,14 @@ prediction-online,gpt-4o-2024-08-06,0.7507418397626113,253,337,2484.083086053412
|
|
12 |
prediction-offline-sme,gpt-4o-2024-08-06,0.7485207100591716,253,338,1381.3727810650887,0.0173823668639053
|
13 |
prediction-offline-sme,gpt-4-0125-preview,0.7484848484848485,247,330,1416.8484848484848,0.0181692121212121
|
14 |
prediction-request-reasoning,gpt-4-0125-preview,0.7483221476510067,223,298,1980.7281879194632,0.0256767449664429
|
15 |
-
prediction-
|
|
|
16 |
prediction-online,claude-3-sonnet-20240229,0.7411764705882353,252,340,2832.7617647058823,0.0095903911764705
|
|
|
17 |
prediction-url-cot,claude-3-sonnet-20240229,0.7355623100303952,242,329,14789.27963525836,0.0510609574468085
|
18 |
-
prediction-online,claude-3-5-sonnet-20240620,0.7337278106508875,248,338,2773.6745562130177,0.0099932130177514
|
19 |
prediction-request-reasoning,claude-3-opus-20240229,0.7337278106508875,248,338,2773.284023668639,0.1062446449704141
|
20 |
prediction-request-rag,claude-3-sonnet-20240229,0.7331288343558282,239,326,2850.1196319018404,0.0146586533742331
|
|
|
21 |
prediction-offline,claude-2,0.7201834862385321,157,218,779.4770642201835,0.0068916697247706
|
22 |
prediction-online,databricks/dbrx-instruct:nitro,0.7173252279635258,236,329,2696.0607902735564,0.0024264547112461
|
23 |
superforcaster,gpt-4-0125-preview,0.7169230769230769,233,325,2143.230769230769,0.0222704615384615
|
|
|
1 |
Tool,Model,Accuracy,Correct,Total,Mean Tokens Used,Mean Cost ($)
|
2 |
prediction-request-rag,gpt-4o-2024-08-06,0.7909967845659164,246,311,2437.675241157556,0.0314014469453376
|
3 |
+
prediction-request-reasoning,claude-3-5-sonnet-20240620,0.7893175074183977,266,337,2825.774480712166,0.020917299703264
|
4 |
prediction-request-reasoning,gpt-4o-2024-08-06,0.7642585551330798,201,263,2408.7148288973385,0.0412335361216729
|
5 |
superforcaster,gpt-4o-2024-08-06,0.7638036809815951,249,326,2131.929447852761,0.0221592944785276
|
6 |
+
prediction-request-rag,claude-3-5-sonnet-20240620,0.7603550295857988,257,338,2900.044378698225,0.0157145591715976
|
7 |
prediction-offline,claude-3-sonnet-20240229,0.756838905775076,249,329,920.5987841945288,0.0033739787234042
|
8 |
prediction-offline,claude-3-opus-20240229,0.7558823529411764,257,340,920.45,0.016866044117647
|
9 |
prediction-request-reasoning,claude-3-sonnet-20240229,0.753125,241,320,2645.509375,0.0192545156249999
|
|
|
12 |
prediction-offline-sme,gpt-4o-2024-08-06,0.7485207100591716,253,338,1381.3727810650887,0.0173823668639053
|
13 |
prediction-offline-sme,gpt-4-0125-preview,0.7484848484848485,247,330,1416.8484848484848,0.0181692121212121
|
14 |
prediction-request-reasoning,gpt-4-0125-preview,0.7483221476510067,223,298,1980.7281879194632,0.0256767449664429
|
15 |
+
prediction-online,claude-3-5-sonnet-20240620,0.7441176470588236,253,340,2774.755882352941,0.0100005970588235
|
16 |
+
prediction-online-sme,gpt-4o-2024-08-06,0.7430340557275542,240,323,3147.260061919505,0.0363173684210526
|
17 |
prediction-online,claude-3-sonnet-20240229,0.7411764705882353,252,340,2832.7617647058823,0.0095903911764705
|
18 |
+
prediction-offline,claude-3-5-sonnet-20240620,0.7374631268436578,250,339,815.4542772861357,0.0030341504424778
|
19 |
prediction-url-cot,claude-3-sonnet-20240229,0.7355623100303952,242,329,14789.27963525836,0.0510609574468085
|
|
|
20 |
prediction-request-reasoning,claude-3-opus-20240229,0.7337278106508875,248,338,2773.284023668639,0.1062446449704141
|
21 |
prediction-request-rag,claude-3-sonnet-20240229,0.7331288343558282,239,326,2850.1196319018404,0.0146586533742331
|
22 |
+
prediction-url-cot,claude-3-5-sonnet-20240620,0.7286135693215339,247,339,14794.787610619467,0.0514230530973451
|
23 |
prediction-offline,claude-2,0.7201834862385321,157,218,779.4770642201835,0.0068916697247706
|
24 |
prediction-online,databricks/dbrx-instruct:nitro,0.7173252279635258,236,329,2696.0607902735564,0.0024264547112461
|
25 |
superforcaster,gpt-4-0125-preview,0.7169230769230769,233,325,2143.230769230769,0.0222704615384615
|