nm-research committed (verified)
Commit a5a5355 · Parent: 38b2975

Update README.md

Files changed (1):
  1. README.md  +26 -26
README.md CHANGED
@@ -167,57 +167,57 @@ lm_eval \
    <td rowspan="7"><b>OpenLLM V1</b></td>
    <td>ARC-Challenge (Acc-Norm, 25-shot)</td>
    <td>37.20</td>
-   <td></td>
-   <td>%</td>
+   <td>35.84</td>
+   <td>96.3%</td>
  </tr>
  <tr>
    <td>GSM8K (Strict-Match, 5-shot)</td>
    <td>69.98</td>
-   <td></td>
-   <td>%</td>
+   <td>68.01</td>
+   <td>97.2%</td>
  </tr>
  <tr>
    <td>HellaSwag (Acc-Norm, 10-shot)</td>
    <td>43.86</td>
-   <td></td>
-   <td>%</td>
+   <td>42.38</td>
+   <td>96.6%</td>
  </tr>
  <tr>
    <td>MMLU (Acc, 5-shot)</td>
    <td>37.38</td>
-   <td></td>
-   <td>%</td>
+   <td>36.98</td>
+   <td>98.9%</td>
  </tr>
  <tr>
    <td>TruthfulQA (MC2, 0-shot)</td>
    <td>45.21</td>
-   <td></td>
-   <td>%</td>
+   <td>46.68</td>
+   <td>103.3%</td>
  </tr>
  <tr>
    <td>Winogrande (Acc, 5-shot)</td>
    <td>54.30</td>
-   <td></td>
-   <td>%</td>
+   <td>55.49</td>
+   <td>102.2%</td>
  </tr>
  <tr>
    <td><b>Average Score</b></td>
    <td><b>47.99</b></td>
-   <td><b></b></td>
-   <td><b>%</b></td>
+   <td><b>47.56</b></td>
+   <td><b>99.1%</b></td>
  </tr>
  <tr>
    <td rowspan="7"><b>OpenLLM V2</b></td>
    <td>IFEval (Inst Level Strict Acc, 0-shot)</td>
    <td>34.37</td>
-   <td></td>
-   <td>%</td>
+   <td>34.42</td>
+   <td>100.2%</td>
  </tr>
  <tr>
    <td>BBH (Acc-Norm, 3-shot)</td>
    <td>34.44</td>
-   <td></td>
-   <td>%</td>
+   <td>36.48</td>
+   <td>105.9%</td>
  </tr>
  <tr>
    <td>Math-Hard (Exact-Match, 4-shot)</td>
@@ -228,26 +228,26 @@ lm_eval \
  <tr>
    <td>GPQA (Acc-Norm, 0-shot)</td>
    <td>24.67</td>
-   <td></td>
-   <td>%</td>
+   <td>24.78</td>
+   <td>100.5%</td>
  </tr>
  <tr>
    <td>MUSR (Acc-Norm, 0-shot)</td>
    <td>35.82</td>
-   <td></td>
-   <td>%</td>
+   <td>35.55</td>
+   <td>99.3%</td>
  </tr>
  <tr>
    <td>MMLU-Pro (Acc, 5-shot)</td>
    <td>11.80</td>
-   <td></td>
-   <td>%</td>
+   <td>11.40</td>
+   <td>96.6%</td>
  </tr>
  <tr>
    <td><b>Average Score</b></td>
    <td><b>23.52</b></td>
-   <td><b></b></td>
-   <td><b>%</b></td>
+   <td><b>23.77</b></td>
+   <td><b>101.1%</b></td>
  </tr>
  <tr>
    <td rowspan="4"><b>Coding</b></td>
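The cells added by this commit fill the previously empty score and percentage columns, and the percentages are consistent with a simple recovery ratio: the new score divided by the baseline score already in the table. A minimal sketch of that arithmetic, using three rows from the diff above (the `recovery` helper is illustrative and not defined anywhere in the repository):

```python
# Sketch: reproduce the "recovery" percentages added in this commit.
# Assumption: recovery = new score / baseline score * 100, which matches
# the values in the rows above.

def recovery(baseline: float, new_score: float) -> float:
    """Return new_score as a percentage of baseline, rounded to one decimal."""
    return round(100.0 * new_score / baseline, 1)

# (benchmark, baseline score, newly added score) taken from the diff above.
rows = [
    ("ARC-Challenge", 37.20, 35.84),  # expected 96.3%
    ("GSM8K",         69.98, 68.01),  # expected 97.2%
    ("TruthfulQA",    45.21, 46.68),  # expected 103.3% (new score is higher)
]

for name, baseline, new_score in rows:
    print(f"{name}: {recovery(baseline, new_score)}%")
```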