nm-research committed on
Commit
eb5a917
·
verified ·
1 Parent(s): 0af8874

Add reasoning evals

Browse files
Files changed (1) hide show
  1. README.md +25 -0
README.md CHANGED
@@ -168,6 +168,31 @@ lm_eval \
168
  </thead>
169
  <tbody>
170
  <tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  <td rowspan="7"><b>OpenLLM V1</b></td>
172
  <td>ARC-Challenge (Acc-Norm, 25-shot)</td>
173
  <td>37.20</td>
 
168
  </thead>
169
  <tbody>
170
  <tr>
171
+ <td rowspan="4"><b>Reasoning</b></td>
172
+ <td>AIME 2024 (pass@1)</td>
173
+ <td>30.05</td>
174
+ <td>26.67</td>
175
+ <td>88.75%</td>
176
+ </tr>
177
+ <tr>
178
+ <td>MATH-500 (pass@1)</td>
179
+ <td>84.66</td>
180
+ <td>84.39</td>
181
+ <td>99.68%</td>
182
+ </tr>
183
+ <tr>
184
+ <td>GPQA Diamond (pass@1)</td>
185
+ <td>35.37</td>
186
+ <td>34.43</td>
187
+ <td>97.34%</td>
188
+ </tr>
189
+ <tr>
190
+ <td><b>Average Score</b></td>
191
+ <td><b>50.03</b></td>
192
+ <td><b>48.50</b></td>
193
+ <td><b>96.94%</b></td>
194
+ </tr>
195
+ <tr>
196
  <td rowspan="7"><b>OpenLLM V1</b></td>
197
  <td>ARC-Challenge (Acc-Norm, 25-shot)</td>
198
  <td>37.20</td>