File size: 19,497 Bytes
c9a97c2
6add05a
 
 
 
 
 
 
 
 
 
cd01d35
6add05a
 
 
 
 
 
 
 
 
cd01d35
 
 
 
 
 
 
 
 
6add05a
 
 
 
 
 
 
 
cd01d35
 
 
 
 
 
 
 
 
 
 
 
6add05a
 
cd01d35
 
 
 
 
 
 
 
 
 
 
 
6add05a
 
 
cd01d35
6add05a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd01d35
 
6add05a
 
 
 
 
 
cd01d35
 
 
 
 
6add05a
 
cd01d35
 
 
6add05a
 
 
 
 
 
 
 
 
 
cd01d35
 
 
 
 
 
6add05a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd01d35
 
 
 
6add05a
 
 
 
 
 
 
 
 
 
 
 
cd01d35
 
 
 
 
 
 
 
 
6add05a
 
cd01d35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6add05a
 
 
cd01d35
6add05a
cd01d35
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
1,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0,0.0808,,254,143289,25143,99,118146,465
2,SC-CoT,AQuA,gpt-4o,2025/1/22,85.83,0.9921,0,5.2456,,254,545431,27829,110,517602,2038
3,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.43,0.9685,0,0.4186,,254,742552,137990,543,604562,2380
4,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,84.65,0.9961,0,0.4438,,254,787312,175050,689,612262,2410
5,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0,0.0742,,254,131604,25397,100,106207,418
6,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0,0.0927,,254,164389,32555,128,131834,519
7,ToT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.07,1.0,0,2.9404,,254,5215848,4735188,18642,480660,1892
8,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0,0.0798,,254,141567,32809,129,108758,428
9,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0,0.0066,,254,94577,27978,110,66599,262
10,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0,1.0417,,254,123017,25123,99,97894,385
11,ToT,AQuA,gpt-4o,2025/1/22,81.5,0.9921,0,8.5295,,254,2613607,2347538,9242,266069,1048
12,ToT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,81.1,0.9921,0,3.7389,,254,6632255,6371642,25085,260613,1026
13,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0,0.0,,254,149736,33017,130,116719,460
14,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0,0.1746,,254,309799,240735,948,69064,272
15,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,79.53,1.0,0,0.0,,254,745410,177972,701,567438,2234
16,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0,0.0058,,254,87742,33058,130,54684,215
17,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0,0.768,,254,1362379,1119143,4406,243236,958
18,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9843,0,0.0,,254,137771,33271,131,104500,411
19,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.56,0.9606,0,0.0445,,254,1032841,977890,3850,54951,216
20,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,76.37,0.9173,0,0.0279,,254,356839,31703,125,325136,1280
21,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0,1.1453,,254,133752,25631,101,108121,426
22,PoT,AQuA,gpt-4o,2025/1/22,75.2,1.0,0,1.6087,,254,327908,222717,877,105191,414
23,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.2,1.0,0,0.1645,,254,291764,249215,981,42549,168
24,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.41,0.9921,0,0.0,,254,695844,564165,2221,131679,518
25,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.23,1.0,0,0.3177,,254,563603,441765,1739,121838,480
26,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0,0.0147,,254,309436,259863,1023,49573,195
27,CoT,AQuA,deepseek-r1:1.5b,2025/1/23,71.65,0.9685,0,0.0,,254,333072,26413,104,306659,1207
28,IO,AQuA,deepseek-r1:1.5b,2025/1/22,68.9,0.9488,0,0.0,,254,351767,26667,105,325100,1280
29,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0,0.0,,254,313728,264517,1041,49211,194
30,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0,0.4928,,254,903587,862614,3396,40973,161
31,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0,0.0957,,254,80793,25447,100,55346,218
32,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0,0.0,,254,144435,32555,128,111880,440
33,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0,0.1748,,254,266654,225162,886,41492,163
34,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.45,0.9567,0,0.0,,254,690077,145108,571,544969,2146
35,ToT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.06,1.0,0,0.0,,254,5739684,4896222,19276,843462,3321
36,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,58.66,0.9252,0,0.3277,,254,237066,27906,110,209160,823
37,SC-CoT,AQuA,deepseek-r1:1.5b,2025/2/10,57.87,0.7402,0,0.0,,254,2132111,144710,570,1987401,7824
38,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0,2.304,,254,692096,615589,2424,76507,301
39,ToT,AQuA,gpt-3.5-turbo,2025/1/7,57.09,0.9961,0,1.1513,,254,2001396,1850767,7286,150629,593
40,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0,0.0,,254,4340821,3764723,14822,576098,2268
41,PoT,AQuA,deepseek-r1:1.5b,2025/2/10,54.72,0.9724,0,0.0,,254,1016647,250690,987,765957,3016
42,ReAct-Pro*,AQuA,deepseek-r1:1.5b,2025/2/10,54.33,0.9646,0,0.0,,254,14445041,10578715,41648,3866326,15222
43,ToT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,53.94,1.0,0,0.0,,254,8602682,8224468,32380,378214,1489
44,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.76,0.8937,0,0.0,,254,127520,26610,105,100910,397
45,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9882,0,0.0,,254,133106,26459,104,106647,420
46,IO,AQuA,Internllm2_5-7B,2025/1/22,47.64,0.9094,0,0.0,,254,185041,50232,198,134809,531
47,ToT,AQuA,Doubao-lite-32k,2025/1/7,45.28,0.7402,0,0.0881,,254,2000550,1850249,7284,150301,592
48,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0,0.0,,254,4428801,3592039,14142,836762,3294
49,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0,0.0,,254,110040,30477,120,79563,313
50,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0,0.038,,254,42471,25701,101,16770,66
51,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,38.58,0.9724,0,0.0,,254,879671,264557,1042,615114,2422
52,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0,0.0,,254,290914,240613,947,50301,198
53,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0,0.0,,254,301962,233505,919,68457,270
54,ToT,AQuA,Internllm2_5-7B,2025/1/22,35.83,0.9961,0,0.0,,254,4734560,4263136,16784,471424,1856
55,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9882,0,0.0,,254,117339,30477,120,86862,342
56,ToT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,31.5,0.9882,0,0.0,,254,6250702,6058022,23850,192680,759
57,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.71,0.9646,0,0.0,,254,298475,246560,971,51915,204
58,ToT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,29.92,1.0,0,0.0,,254,8700281,8100085,31890,600196,2363
59,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9764,0,0.0,,254,71047,27937,110,43110,170
60,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.17,0.9882,0,0.0,,254,110415,27937,110,82478,325
61,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0,0.0,,254,5072004,4555858,17936,516146,2032
62,ToT,AQuA,deepseek-r1:1.5b,2025/2/10,24.8,0.5551,0,0.0,,254,794512,605028,2382,189484,746
63,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0,0.0,,254,7170087,6344167,24977,825920,3252
64,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0,0.0,,254,322281,258867,1019,63414,250
65,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.8228,0,0.0,,254,753913,150787,594,603126,2375
66,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,10.63,0.5157,0,0.0,,254,701980,151410,596,550570,2168
1,CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,80.2,1.0,4,0.349,,500,619015,338549,677,280466,561
2,SC-CoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,79.8,1.0,4,1.8504,,500,3282349,1775395,3551,1506954,3014
3,SC-CoT,MATH-500,gpt-4o,2025/1/22,74.6,1.0,4,12.3611,,500,1495125,345347,691,1149778,2300
4,SC-CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,72.4,1.0,4,1.7845,,500,3165511,1797045,3594,1368466,2737
5,CoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,71.2,1.0,4,0.3463,,500,614221,342879,686,271342,543
6,SC-CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,71.2,1.0,4,0.0,,500,3155475,1855922,3712,1299553,2599
7,IO,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,70.2,1.0,4,0.2506,,500,444591,169549,339,275042,550
8,CoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,69.8,1.0,4,0.0,,500,617204,354049,708,263155,526
9,IO,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,69.4,1.0,4,0.2386,,500,423216,155879,312,267337,535
10,CoT,MATH-500,gpt-4o,2025/1/22,68.0,1.0,4,3.0569,,500,552688,329332,659,223356,447
11,SC-CoT,MATH-500,Doubao-lite-32k,2025/1/7,65.8,0.998,4,0.0734,,500,1078003,362390,725,715613,1431
12,ReAct-Pro*,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,64.6,1.0,4,3.1806,,500,5641879,5223611,10447,418268,837
13,ReAct-Pro*,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,62.8,1.0,4,3.4541,,500,6127117,5747268,11495,379849,760
14,IO,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,59.4,1.0,4,0.0,,500,411362,169549,339,241813,484
15,CoT,MATH-500,Doubao-lite-32k,2025/1/7,59.0,1.0,4,0.0255,,500,479941,336370,673,143571,287
16,ReAct-Pro*,MATH-500,gpt-4o,2025/1/22,54.0,1.0,4,17.7735,,500,6153255,5834537,11669,318718,637
17,CoT,MATH-500,deepseek-r1:1.5b,2025/1/23,49.4,1.0,4,0.0,,500,1199129,341549,683,857580,1715
18,ReAct-Pro*,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,48.8,1.0,4,0.0,,500,4990240,4646708,9293,343532,687
19,ReAct-Pro*,MATH-500,Doubao-lite-32k,2025/1/7,47.2,1.0,4,0.186,,500,4388666,4234620,8469,154046,308
20,PoT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,47.2,0.822,4,0.233,,500,413372,242549,485,170823,342
21,SC-CoT,MATH-500,deepseek-r1:1.5b,2025/2/10,46.8,0.992,4,0.0,,500,13968168,1858874,3718,12109294,24219
22,CoT,MATH-500,Internllm2_5-7B,2025/1/22,46.6,1.0,4,0.0,,500,546774,332883,666,213891,428
23,PoT,MATH-500,gpt-4o,2025/1/22,46.2,0.864,4,1.5994,,500,340960,241357,483,99603,199
24,IO,MATH-500,deepseek-r1:1.5b,2025/1/22,43.8,1.0,4,0.0,,500,1022548,157049,314,865499,1731
25,PoT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,42.6,0.802,4,0.2839,,500,503596,253879,508,249717,499
26,IO,MATH-500,gpt-4o,2025/1/22,41.8,1.0,4,2.7907,,500,394447,153832,308,240615,481
27,SC-CoT,MATH-500,gpt-3.5-turbo,2025/1/7,40.8,1.0,4,1.2308,,500,1050819,345411,691,705408,1411
28,CoT,MATH-500,gpt-3.5-turbo,2025/1/7,39.8,1.0,4,0.3189,,500,432196,329381,659,102815,206
29,PoT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,39.6,0.744,4,0.0,,500,408812,258549,517,150263,301
30,IO,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,38.6,1.0,4,0.0,,500,503934,155563,311,348371,697
31,IO,MATH-500,Doubao-lite-32k,2025/1/7,37.4,1.0,4,0.0187,,500,311730,166870,334,144860,290
32,PoT,MATH-500,Doubao-lite-32k,2025/1/7,32.6,0.68,4,0.0144,,500,303148,254377,509,48771,98
33,ReAct-Pro*,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,28.8,1.0,4,0.0,,500,8763629,7486706,14973,1276923,2554
34,PoT,MATH-500,gpt-3.5-turbo,2025/1/7,28.8,0.838,4,0.168,,500,271916,239902,480,32014,64
35,CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.8,1.0,4,0.0,,500,625568,342879,686,282689,565
36,PoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,25.4,0.684,4,0.0,,500,462271,253879,508,208392,417
37,ReAct-Pro*,MATH-500,deepseek-r1:1.5b,2025/2/10,24.4,1.0,4,0.0,,500,30177348,20729970,41460,9447378,18895
38,ReAct-Pro*,MATH-500,gpt-3.5-turbo,2025/1/7,23.8,1.0,4,2.0406,,500,3832714,3708461,7417,124253,249
39,IO,MATH-500,Internllm2_5-7B,2025/1/22,22.8,1.0,4,0.0,,500,467888,201883,404,266005,532
40,SC-CoT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,19.8,0.998,4,0.0,,500,3490834,1734545,3469,1756289,3513
41,IO,MATH-500,gpt-3.5-turbo,2025/1/7,17.2,1.0,4,0.2436,,500,265625,154881,310,110744,221
42,CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,15.2,1.0,4,0.0,,500,536377,349049,698,187328,375
43,PoT,MATH-500,Internllm2_5-7B,2025/1/22,15.0,0.324,4,0.0,,500,368709,247883,496,120826,242
44,ReAct-Pro*,MATH-500,Internllm2_5-7B,2025/1/22,14.8,1.0,4,0.0,,500,14186105,11831496,23663,2354609,4709
45,ToT,MATH-500,Qwen2.5-72B-Instruct,2025/1/22,10.8,1.0,4,9.0421,,500,16039361,15657730,31315,381631,763
46,ToT,MATH-500,gpt-3.5-turbo,2025/1/7,9.8,1.0,4,5.2914,,500,10001767,9711244,19422,290523,581
47,SC-CoT,MATH-500,Internllm2_5-7B,2025/1/22,9.2,0.974,4,0.0,,500,3249876,1994983,3990,1254893,2510
48,ReAct-Pro*,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,8.2,1.0,4,0.0,,500,8987061,8430774,16862,556287,1113
49,IO,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,7.0,1.0,4,0.0,,500,413878,158777,318,255101,510
50,CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,6.2,1.0,4,0.0,,500,549188,349049,698,200139,400
51,ToT,MATH-500,gpt-4o,2025/1/22,3.2,1.0,4,40.8094,,500,15242432,14881985,29764,360447,721
52,IO,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,2.6,1.0,4,0.0,,500,429330,159049,318,270281,541
53,SC-CoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,2.2,0.988,4,0.0,,500,2797682,1808691,3617,988991,1978
54,SC-CoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,2.0,0.894,4,0.0,,500,3139024,1805170,3610,1333854,2668
55,ToT,MATH-500,Llama-3.1-8B-Instruct,2025/1/22,1.8,0.908,4,0.0,,500,9035000,7729000,15458,1306000,2612
56,ToT,MATH-500,Llama-3.3-70B-Instruct,2025/1/22,1.4,0.698,4,8.2699,,500,14669500,14099500,28199,570000,1140
57,ToT,MATH-500,Qwen2.5-7B-Instruct,2025/1/22,1.4,0.916,4,0.0,,500,10167500,9749000,19498,418500,837
58,ToT,MATH-500,Doubao-lite-32k,2025/1/7,1.2,0.942,4,0.2371,,500,5564500,5338500,10677,226000,452
59,PoT,MATH-500,deepseek-r1:1.5b,2025/2/10,1.0,0.016,4,0.0,,500,1031067,245549,491,785518,1571
60,PoT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.022,4,0.0,,500,786870,248509,497,538361,1077
61,ToT,MATH-500,Qwen2-1.5B-Instruct,2025/1/22,0.8,0.972,4,0.0,,500,4535000,4408000,8816,127000,254
62,ReAct-Pro*,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.6,1.0,4,0.0,,500,19442440,18137392,36275,1305048,2610
63,ToT,MATH-500,deepseek-r1:1.5b,2025/2/10,0.4,0.716,4,0.0,,500,1941500,1831000,3662,110500,221
64,ToT,MATH-500,Internllm2_5-7B,2025/1/22,0.2,0.99,4,0.0,,500,8350500,7515000,15030,835500,1671
65,PoT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,4,0.0,,500,437202,253549,507,183653,367
66,ToT,MATH-500,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.962,4,0.0,,500,5996500,5590500,11181,406000,812
1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.22,1.0,8,3.7895,,1319,6722014,5295585,4015,1426429,1081
2,SC-CoT,gsm8k,gpt-4o,2025/1/22,94.77,1.0,8,18.2044,,1319,2491605,894889,678,1596716,1211
3,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,94.77,1.0,8,4.045,,1319,7175258,5370360,4072,1804898,1368
4,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8,4.5367,,1319,1165166,948668,719,216498,164
5,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8,0.687,,1319,1218665,990168,751,228497,173
6,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8,4.2166,,1319,1247912,1101672,835,146240,111
7,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8,0.7195,,1319,1276252,1005119,762,271133,206
8,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8,0.7054,,1319,1251210,1106682,839,144528,110
9,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8,0.4709,,1319,835275,583916,443,251359,191
10,ToT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,91.89,1.0,8,20.8753,,1319,37029687,35096810,26609,1932877,1465
11,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,91.58,0.9992,8,0.1118,,1319,1835891,942182,714,893709,678
12,ToT,gsm8k,gpt-4o,2025/1/22,91.13,1.0,8,86.8581,,1319,30769735,29445237,22324,1324498,1004
13,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,90.98,1.0,8,0.0,,1319,7259943,5580524,4231,1679419,1273
14,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8,0.0558,,1319,1201820,1042095,790,159725,121
15,ToT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,88.88,1.0,8,23.5911,,1319,41847148,40435361,30656,1411787,1070
16,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8,3.3463,,1319,741446,542416,411,199030,151
17,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8,10.1124,,1319,17937864,17038928,12918,898936,682
18,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8,10.5479,,1319,18710437,18160983,13769,549454,417
19,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8,0.4899,,1319,869060,555340,421,313720,238
20,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8,0.0,,1319,1290805,1046008,793,244797,186
21,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8,0.2512,,1319,5998639,5862016,4444,136623,104
22,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8,0.0,,1319,14850914,14355752,10884,495162,375
23,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8,0.0576,,1319,1288055,1170038,887,118017,89
24,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8,0.6788,,1319,1088041,953242,723,134799,102
25,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8,0.0,,1319,1202163,968163,734,234000,177
26,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8,0.6902,,1319,1187080,1090418,827,96662,73
27,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8,0.0,,1319,1248329,990168,751,258161,196
28,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8,3.4633,,1319,6646286,6506164,4933,140122,106
29,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8,0.9736,,1319,1727044,1126025,854,601019,456
30,ToT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,72.21,0.9901,8,0.0,,1319,31657319,20196528,15312,11460791,8689
31,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8,0.0354,,1319,740483,617377,468,123106,93
32,CoT,gsm8k,deepseek-r1:1.5b,2025/1/23,70.66,0.9977,8,0.0,,1319,2090625,1011714,767,1078911,818
33,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,69.29,0.9879,8,2.5203,,1319,2277249,895571,679,1381678,1048
34,SC-CoT,gsm8k,deepseek-r1:1.5b,2025/2/10,69.07,0.9879,8,0.0,,1319,10029684,5407357,4100,4622327,3504
35,ToT,gsm8k,gpt-3.5-turbo,2025/1/7,67.93,0.997,8,9.1707,,1319,16727175,15920037,12070,807138,612
36,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8,0.0,,1319,22835767,21044978,15955,1790789,1358
37,ToT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,65.05,0.9196,8,0.0,,1319,16432102,15554967,11793,877135,665
38,IO,gsm8k,deepseek-r1:1.5b,2025/1/22,64.14,0.9962,8,0.0,,1319,1483051,561935,426,921116,698
39,ReAct-Pro*,gsm8k,gpt-4o,2025/1/22,63.31,0.9955,8,39.0751,,1319,14715887,14411173,10926,304714,231
40,PoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,58.83,0.7051,8,0.0,,1319,1362822,1145390,868,217432,165
41,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8,0.0,,1319,887913,596229,452,291684,221
42,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8,0.0,,1319,1745429,550941,418,1194488,906
43,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8,0.0,,1319,1218525,1032818,783,185707,141
44,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,54.36,0.9985,8,0.0,,1319,10956434,5136762,3894,5819672,4412
45,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,44.66,0.9181,8,0.0,,1319,8162499,5847761,4433,2314738,1755
46,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8,0.0,,1319,1391111,1147538,870,243573,185
47,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8,0.0,,1319,1324949,1136843,862,188106,143
48,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8,0.3328,,1319,586553,546990,415,39563,30
49,ToT,gsm8k,Doubao-lite-32k,2025/1/7,37.83,0.8734,8,0.8739,,1319,20274349,19208597,14563,1065752,808
50,ReAct-Pro*,gsm8k,deepseek-r1:1.5b,2025/2/10,35.94,0.9962,8,0.0,,1319,24219077,19299381,14632,4919696,3730
51,CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,35.94,0.9992,8,0.0,,1319,1223459,1032818,783,190641,145
52,ReAct-Pro*,gsm8k,Internllm2_5-7B,2025/1/22,33.51,0.9795,8,0.0,,1319,35669989,30120070,22836,5549919,4208
53,ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,24.87,0.8021,8,0.0,,1319,9828001,9133603,6925,694398,526
54,ToT,gsm8k,deepseek-r1:1.5b,2025/2/10,23.12,0.7248,8,0.0,,1319,3421486,2738244,2076,683242,518
55,ToT,gsm8k,Internllm2_5-7B,2025/1/22,20.85,0.7013,8,0.0,,1319,13178129,11768118,8922,1410011,1069
56,ToT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,19.64,0.7726,8,0.0,,1319,12758687,12124248,9192,634439,481
57,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.5,0.3101,8,0.0,,1319,1327522,1151528,873,175994,133
58,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8,0.0,,1319,736996,568530,431,168466,128
59,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8,0.0,,1319,834897,568116,431,266781,202
60,PoT,gsm8k,deepseek-r1:1.5b,2025/2/10,11.9,0.1744,8,0.0,,1319,1954509,1138872,863,815637,618
61,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8,0.0,,1319,1113728,679302,515,434426,329
62,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.63,0.1691,8,0.0,,1319,1389135,1151528,873,237607,180
63,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,8.19,0.6876,8,0.0,,1319,7386453,5439568,4124,1946885,1476
64,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8,0.0,,1319,55392611,52431343,39751,2961268,2245
65,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,4.17,0.9447,8,0.0,,1319,7478767,5441962,4126,2036805,1544
66,ToT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,0.0,0.0,8,0.0,,1319,0,0,0,0,0