open-agent-leaderboard / src /detail_results.csv
liaojiajia
updated scores
6add05a
Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
1,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0,0.0808,,254,143289,25143,99,118146,465
2,SC-CoT,AQuA,gpt-4o,2025/1/22,85.83,0.9921,0,5.2456,,254,545431,27829,110,517602,2038
3,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.43,0.9685,0,0.4186,,254,742552,137990,543,604562,2380
4,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,84.65,0.9961,0,0.4438,,254,787312,175050,689,612262,2410
5,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0,0.0742,,254,131604,25397,100,106207,418
6,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0,0.0927,,254,164389,32555,128,131834,519
7,ToT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.07,1.0,0,2.9404,,254,5215848,4735188,18642,480660,1892
8,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0,0.0798,,254,141567,32809,129,108758,428
9,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0,0.0066,,254,94577,27978,110,66599,262
10,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0,1.0417,,254,123017,25123,99,97894,385
11,ToT,AQuA,gpt-4o,2025/1/22,81.5,0.9921,0,8.5295,,254,2613607,2347538,9242,266069,1048
12,ToT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,81.1,0.9921,0,3.7389,,254,6632255,6371642,25085,260613,1026
13,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0,0.0,,254,149736,33017,130,116719,460
14,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0,0.1746,,254,309799,240735,948,69064,272
15,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,79.53,1.0,0,0.0,,254,745410,177972,701,567438,2234
16,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0,0.0058,,254,87742,33058,130,54684,215
17,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0,0.768,,254,1362379,1119143,4406,243236,958
18,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9843,0,0.0,,254,137771,33271,131,104500,411
19,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.56,0.9606,0,0.0445,,254,1032841,977890,3850,54951,216
20,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,76.37,0.9173,0,0.0279,,254,356839,31703,125,325136,1280
21,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0,1.1453,,254,133752,25631,101,108121,426
22,PoT,AQuA,gpt-4o,2025/1/22,75.2,1.0,0,1.6087,,254,327908,222717,877,105191,414
23,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.2,1.0,0,0.1645,,254,291764,249215,981,42549,168
24,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.41,0.9921,0,0.0,,254,695844,564165,2221,131679,518
25,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.23,1.0,0,0.3177,,254,563603,441765,1739,121838,480
26,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0,0.0147,,254,309436,259863,1023,49573,195
27,CoT,AQuA,deepseek-r1:1.5b,2025/1/23,71.65,0.9685,0,0.0,,254,333072,26413,104,306659,1207
28,IO,AQuA,deepseek-r1:1.5b,2025/1/22,68.9,0.9488,0,0.0,,254,351767,26667,105,325100,1280
29,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0,0.0,,254,313728,264517,1041,49211,194
30,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0,0.4928,,254,903587,862614,3396,40973,161
31,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0,0.0957,,254,80793,25447,100,55346,218
32,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0,0.0,,254,144435,32555,128,111880,440
33,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0,0.1748,,254,266654,225162,886,41492,163
34,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.45,0.9567,0,0.0,,254,690077,145108,571,544969,2146
35,ToT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.06,1.0,0,0.0,,254,5739684,4896222,19276,843462,3321
36,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,58.66,0.9252,0,0.3277,,254,237066,27906,110,209160,823
37,SC-CoT,AQuA,deepseek-r1:1.5b,2025/2/10,57.87,0.7402,0,0.0,,254,2132111,144710,570,1987401,7824
38,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0,2.304,,254,692096,615589,2424,76507,301
39,ToT,AQuA,gpt-3.5-turbo,2025/1/7,57.09,0.9961,0,1.1513,,254,2001396,1850767,7286,150629,593
40,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0,0.0,,254,4340821,3764723,14822,576098,2268
41,PoT,AQuA,deepseek-r1:1.5b,2025/2/10,54.72,0.9724,0,0.0,,254,1016647,250690,987,765957,3016
42,ReAct-Pro*,AQuA,deepseek-r1:1.5b,2025/2/10,54.33,0.9646,0,0.0,,254,14445041,10578715,41648,3866326,15222
43,ToT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,53.94,1.0,0,0.0,,254,8602682,8224468,32380,378214,1489
44,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.76,0.8937,0,0.0,,254,127520,26610,105,100910,397
45,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9882,0,0.0,,254,133106,26459,104,106647,420
46,IO,AQuA,Internllm2_5-7B,2025/1/22,47.64,0.9094,0,0.0,,254,185041,50232,198,134809,531
47,ToT,AQuA,Doubao-lite-32k,2025/1/7,45.28,0.7402,0,0.0881,,254,2000550,1850249,7284,150301,592
48,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0,0.0,,254,4428801,3592039,14142,836762,3294
49,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0,0.0,,254,110040,30477,120,79563,313
50,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0,0.038,,254,42471,25701,101,16770,66
51,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,38.58,0.9724,0,0.0,,254,879671,264557,1042,615114,2422
52,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0,0.0,,254,290914,240613,947,50301,198
53,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0,0.0,,254,301962,233505,919,68457,270
54,ToT,AQuA,Internllm2_5-7B,2025/1/22,35.83,0.9961,0,0.0,,254,4734560,4263136,16784,471424,1856
55,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9882,0,0.0,,254,117339,30477,120,86862,342
56,ToT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,31.5,0.9882,0,0.0,,254,6250702,6058022,23850,192680,759
57,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.71,0.9646,0,0.0,,254,298475,246560,971,51915,204
58,ToT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,29.92,1.0,0,0.0,,254,8700281,8100085,31890,600196,2363
59,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9764,0,0.0,,254,71047,27937,110,43110,170
60,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.17,0.9882,0,0.0,,254,110415,27937,110,82478,325
61,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0,0.0,,254,5072004,4555858,17936,516146,2032
62,ToT,AQuA,deepseek-r1:1.5b,2025/2/10,24.8,0.5551,0,0.0,,254,794512,605028,2382,189484,746
63,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0,0.0,,254,7170087,6344167,24977,825920,3252
64,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0,0.0,,254,322281,258867,1019,63414,250
65,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.8228,0,0.0,,254,753913,150787,594,603126,2375
66,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,10.63,0.5157,0,0.0,,254,701980,151410,596,550570,2168
1,CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,80.2,1.0,4,0.349,,500,619015,338549,677,280466,561
2,SC-CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,79.8,1.0,4,1.8504,,500,3282349,1775395,3551,1506954,3014
3,SC-CoT,MATH-500,gpt-4o,2025/1/22,74.6,1.0,4,12.3611,,500,1495125,345347,691,1149778,2300
4,SC-CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,72.4,1.0,4,1.7845,,500,3165511,1797045,3594,1368466,2737
5,CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,71.2,1.0,4,0.3463,,500,614221,342879,686,271342,543
6,SC-CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,71.2,1.0,4,0.0,,500,3155475,1855922,3712,1299553,2599
7,IO,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,70.2,1.0,4,0.2506,,500,444591,169549,339,275042,550
8,CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,69.8,1.0,4,0.0,,500,617204,354049,708,263155,526
9,IO,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,69.4,1.0,4,0.2386,,500,423216,155879,312,267337,535
10,CoT,MATH-500,gpt-4o,2025/1/22,68.0,1.0,4,3.0569,,500,552688,329332,659,223356,447
11,SC-CoT,MATH-500,Doubao-lite-32k,2025/1/7,65.8,0.998,4,0.0734,,500,1078003,362390,725,715613,1431
12,ReAct-Pro*,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,64.6,1.0,4,3.1806,,500,5641879,5223611,10447,418268,837
13,ReAct-Pro*,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,62.8,1.0,4,3.4541,,500,6127117,5747268,11495,379849,760
14,IO,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,59.4,1.0,4,0.0,,500,411362,169549,339,241813,484
15,CoT,MATH-500,Doubao-lite-32k,2025/1/7,59.0,1.0,4,0.0255,,500,479941,336370,673,143571,287
16,ReAct-Pro*,MATH-500,gpt-4o,2025/1/22,54.0,1.0,4,17.7735,,500,6153255,5834537,11669,318718,637
17,CoT,MATH-500,deepseek-r1:1.5b,2025/1/23,49.4,1.0,4,0.0,,500,1199129,341549,683,857580,1715
18,ReAct-Pro*,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,48.8,1.0,4,0.0,,500,4990240,4646708,9293,343532,687
19,ReAct-Pro*,MATH-500,Doubao-lite-32k,2025/1/7,47.2,1.0,4,0.186,,500,4388666,4234620,8469,154046,308
20,PoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,47.2,0.822,4,0.233,,500,413372,242549,485,170823,342
21,SC-CoT,MATH-500,deepseek-r1:1.5b,2025/2/10,46.8,0.992,4,0.0,,500,13968168,1858874,3718,12109294,24219
22,CoT,MATH-500,Internllm2_5-7B,2025/1/22,46.6,1.0,4,0.0,,500,546774,332883,666,213891,428
23,PoT,MATH-500,gpt-4o,2025/1/22,46.2,0.864,4,1.5994,,500,340960,241357,483,99603,199
24,IO,MATH-500,deepseek-r1:1.5b,2025/1/22,43.8,1.0,4,0.0,,500,1022548,157049,314,865499,1731
25,PoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,42.6,0.802,4,0.2839,,500,503596,253879,508,249717,499
26,IO,MATH-500,gpt-4o,2025/1/22,41.8,1.0,4,2.7907,,500,394447,153832,308,240615,481
27,SC-CoT,MATH-500,gpt-3.5-turbo,2025/1/7,40.8,1.0,4,1.2308,,500,1050819,345411,691,705408,1411
28,CoT,MATH-500,gpt-3.5-turbo,2025/1/7,39.8,1.0,4,0.3189,,500,432196,329381,659,102815,206
29,PoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,39.6,0.744,4,0.0,,500,408812,258549,517,150263,301
30,IO,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,38.6,1.0,4,0.0,,500,503934,155563,311,348371,697
31,IO,MATH-500,Doubao-lite-32k,2025/1/7,37.4,1.0,4,0.0187,,500,311730,166870,334,144860,290
32,PoT,MATH-500,Doubao-lite-32k,2025/1/7,32.6,0.68,4,0.0144,,500,303148,254377,509,48771,98
33,ReAct-Pro*,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,28.8,1.0,4,0.0,,500,8763629,7486706,14973,1276923,2554
34,PoT,MATH-500,gpt-3.5-turbo,2025/1/7,28.8,0.838,4,0.168,,500,271916,239902,480,32014,64
35,CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.8,1.0,4,0.0,,500,625568,342879,686,282689,565
36,PoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.4,0.684,4,0.0,,500,462271,253879,508,208392,417
37,ReAct-Pro*,MATH-500,deepseek-r1:1.5b,2025/2/10,24.4,1.0,4,0.0,,500,30177348,20729970,41460,9447378,18895
38,ReAct-Pro*,MATH-500,gpt-3.5-turbo,2025/1/7,23.8,1.0,4,2.0406,,500,3832714,3708461,7417,124253,249
39,IO,MATH-500,Internllm2_5-7B,2025/1/22,22.8,1.0,4,0.0,,500,467888,201883,404,266005,532
40,SC-CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,19.8,0.998,4,0.0,,500,3490834,1734545,3469,1756289,3513
41,IO,MATH-500,gpt-3.5-turbo,2025/1/7,17.2,1.0,4,0.2436,,500,265625,154881,310,110744,221
42,CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,15.2,1.0,4,0.0,,500,536377,349049,698,187328,375
43,PoT,MATH-500,Internllm2_5-7B,2025/1/22,15.0,0.324,4,0.0,,500,368709,247883,496,120826,242
44,ReAct-Pro*,MATH-500,Internllm2_5-7B,2025/1/22,14.8,1.0,4,0.0,,500,14186105,11831496,23663,2354609,4709
45,ToT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,10.8,1.0,4,9.0421,,500,16039361,15657730,31315,381631,763
46,ToT,MATH-500,gpt-3.5-turbo,2025/1/7,9.8,1.0,4,5.2914,,500,10001767,9711244,19422,290523,581
47,SC-CoT,MATH-500,Internllm2_5-7B,2025/1/22,9.2,0.974,4,0.0,,500,3249876,1994983,3990,1254893,2510
48,ReAct-Pro*,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,8.2,1.0,4,0.0,,500,8987061,8430774,16862,556287,1113
49,IO,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,7.0,1.0,4,0.0,,500,413878,158777,318,255101,510
50,CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,6.2,1.0,4,0.0,,500,549188,349049,698,200139,400
51,ToT,MATH-500,gpt-4o,2025/1/22,3.2,1.0,4,40.8094,,500,15242432,14881985,29764,360447,721
52,IO,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,2.6,1.0,4,0.0,,500,429330,159049,318,270281,541
53,SC-CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,2.2,0.988,4,0.0,,500,2797682,1808691,3617,988991,1978
54,SC-CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,2.0,0.894,4,0.0,,500,3139024,1805170,3610,1333854,2668
55,ToT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,1.8,0.908,4,0.0,,500,9035000,7729000,15458,1306000,2612
56,ToT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,1.4,0.698,4,8.2699,,500,14669500,14099500,28199,570000,1140
57,ToT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,1.4,0.916,4,0.0,,500,10167500,9749000,19498,418500,837
58,ToT,MATH-500,Doubao-lite-32k,2025/1/7,1.2,0.942,4,0.2371,,500,5564500,5338500,10677,226000,452
59,PoT,MATH-500,deepseek-r1:1.5b,2025/2/10,1.0,0.016,4,0.0,,500,1031067,245549,491,785518,1571
60,PoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.022,4,0.0,,500,786870,248509,497,538361,1077
61,ToT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.972,4,0.0,,500,4535000,4408000,8816,127000,254
62,ReAct-Pro*,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.6,1.0,4,0.0,,500,19442440,18137392,36275,1305048,2610
63,ToT,MATH-500,deepseek-r1:1.5b,2025/2/10,0.4,0.716,4,0.0,,500,1941500,1831000,3662,110500,221
64,ToT,MATH-500,Internllm2_5-7B,2025/1/22,0.2,0.99,4,0.0,,500,8350500,7515000,15030,835500,1671
65,PoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,4,0.0,,500,437202,253549,507,183653,367
66,ToT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.962,4,0.0,,500,5996500,5590500,11181,406000,812
1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.22,1.0,8,3.7895,,1319,6722014,5295585,4015,1426429,1081
2,SC-CoT,gsm8k,gpt-4o,2025/1/22,94.77,1.0,8,18.2044,,1319,2491605,894889,678,1596716,1211
3,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,94.77,1.0,8,4.045,,1319,7175258,5370360,4072,1804898,1368
4,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8,4.5367,,1319,1165166,948668,719,216498,164
5,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8,0.687,,1319,1218665,990168,751,228497,173
6,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8,4.2166,,1319,1247912,1101672,835,146240,111
7,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8,0.7195,,1319,1276252,1005119,762,271133,206
8,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8,0.7054,,1319,1251210,1106682,839,144528,110
9,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8,0.4709,,1319,835275,583916,443,251359,191
10,ToT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,91.89,1.0,8,20.8753,,1319,37029687,35096810,26609,1932877,1465
11,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,91.58,0.9992,8,0.1118,,1319,1835891,942182,714,893709,678
12,ToT,gsm8k,gpt-4o,2025/1/22,91.13,1.0,8,86.8581,,1319,30769735,29445237,22324,1324498,1004
13,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,90.98,1.0,8,0.0,,1319,7259943,5580524,4231,1679419,1273
14,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8,0.0558,,1319,1201820,1042095,790,159725,121
15,ToT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,88.88,1.0,8,23.5911,,1319,41847148,40435361,30656,1411787,1070
16,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8,3.3463,,1319,741446,542416,411,199030,151
17,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8,10.1124,,1319,17937864,17038928,12918,898936,682
18,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8,10.5479,,1319,18710437,18160983,13769,549454,417
19,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8,0.4899,,1319,869060,555340,421,313720,238
20,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8,0.0,,1319,1290805,1046008,793,244797,186
21,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8,0.2512,,1319,5998639,5862016,4444,136623,104
22,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8,0.0,,1319,14850914,14355752,10884,495162,375
23,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8,0.0576,,1319,1288055,1170038,887,118017,89
24,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8,0.6788,,1319,1088041,953242,723,134799,102
25,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8,0.0,,1319,1202163,968163,734,234000,177
26,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8,0.6902,,1319,1187080,1090418,827,96662,73
27,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8,0.0,,1319,1248329,990168,751,258161,196
28,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8,3.4633,,1319,6646286,6506164,4933,140122,106
29,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8,0.9736,,1319,1727044,1126025,854,601019,456
30,ToT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,72.21,0.9901,8,0.0,,1319,31657319,20196528,15312,11460791,8689
31,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8,0.0354,,1319,740483,617377,468,123106,93
32,CoT,gsm8k,deepseek-r1:1.5b,2025/1/23,70.66,0.9977,8,0.0,,1319,2090625,1011714,767,1078911,818
33,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,69.29,0.9879,8,2.5203,,1319,2277249,895571,679,1381678,1048
34,SC-CoT,gsm8k,deepseek-r1:1.5b,2025/2/10,69.07,0.9879,8,0.0,,1319,10029684,5407357,4100,4622327,3504
35,ToT,gsm8k,gpt-3.5-turbo,2025/1/7,67.93,0.997,8,9.1707,,1319,16727175,15920037,12070,807138,612
36,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8,0.0,,1319,22835767,21044978,15955,1790789,1358
37,ToT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,65.05,0.9196,8,0.0,,1319,16432102,15554967,11793,877135,665
38,IO,gsm8k,deepseek-r1:1.5b,2025/1/22,64.14,0.9962,8,0.0,,1319,1483051,561935,426,921116,698
39,ReAct-Pro*,gsm8k,gpt-4o,2025/1/22,63.31,0.9955,8,39.0751,,1319,14715887,14411173,10926,304714,231
40,PoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,58.83,0.7051,8,0.0,,1319,1362822,1145390,868,217432,165
41,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8,0.0,,1319,887913,596229,452,291684,221
42,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8,0.0,,1319,1745429,550941,418,1194488,906
43,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8,0.0,,1319,1218525,1032818,783,185707,141
44,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,54.36,0.9985,8,0.0,,1319,10956434,5136762,3894,5819672,4412
45,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,44.66,0.9181,8,0.0,,1319,8162499,5847761,4433,2314738,1755
46,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8,0.0,,1319,1391111,1147538,870,243573,185
47,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8,0.0,,1319,1324949,1136843,862,188106,143
48,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8,0.3328,,1319,586553,546990,415,39563,30
49,ToT,gsm8k,Doubao-lite-32k,2025/1/7,37.83,0.8734,8,0.8739,,1319,20274349,19208597,14563,1065752,808
50,ReAct-Pro*,gsm8k,deepseek-r1:1.5b,2025/2/10,35.94,0.9962,8,0.0,,1319,24219077,19299381,14632,4919696,3730
51,CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,35.94,0.9992,8,0.0,,1319,1223459,1032818,783,190641,145
52,ReAct-Pro*,gsm8k,Internllm2_5-7B,2025/1/22,33.51,0.9795,8,0.0,,1319,35669989,30120070,22836,5549919,4208
53,ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,24.87,0.8021,8,0.0,,1319,9828001,9133603,6925,694398,526
54,ToT,gsm8k,deepseek-r1:1.5b,2025/2/10,23.12,0.7248,8,0.0,,1319,3421486,2738244,2076,683242,518
55,ToT,gsm8k,Internllm2_5-7B,2025/1/22,20.85,0.7013,8,0.0,,1319,13178129,11768118,8922,1410011,1069
56,ToT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,19.64,0.7726,8,0.0,,1319,12758687,12124248,9192,634439,481
57,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.5,0.3101,8,0.0,,1319,1327522,1151528,873,175994,133
58,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8,0.0,,1319,736996,568530,431,168466,128
59,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8,0.0,,1319,834897,568116,431,266781,202
60,PoT,gsm8k,deepseek-r1:1.5b,2025/2/10,11.9,0.1744,8,0.0,,1319,1954509,1138872,863,815637,618
61,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8,0.0,,1319,1113728,679302,515,434426,329
62,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.63,0.1691,8,0.0,,1319,1389135,1151528,873,237607,180
63,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,8.19,0.6876,8,0.0,,1319,7386453,5439568,4124,1946885,1476
64,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8,0.0,,1319,55392611,52431343,39751,2961268,2245
65,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,4.17,0.9447,8,0.0,,1319,7478767,5441962,4126,2036805,1544
66,ToT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,8,0.0,,1319,0,0,0,0,0