nm-research committed on
Commit
5d548d9
·
verified ·
1 Parent(s): b20d128

Add reasoning evals

Browse files
Files changed (1) hide show
  1. README.md +25 -0
README.md CHANGED
@@ -147,6 +147,31 @@ lm_eval \
147
  </tr>
148
  </thead>
149
  <tbody>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  <tr>
151
  <td rowspan="7"><b>OpenLLM V1</b></td>
152
  <td>ARC-Challenge (Acc-Norm, 25-shot)</td>
 
147
  </tr>
148
  </thead>
149
  <tbody>
150
+ <tr>
151
+ <td rowspan="4"><b>Reasoning</b></td>
152
+ <td>AIME 2024 (pass@1)</td>
153
+ <td>49.25</td>
154
+ <td>50.83</td>
155
+ <td>103.21%</td>
156
+ </tr>
157
+ <tr>
158
+ <td>MATH-500 (pass@1)</td>
159
+ <td>90.18</td>
160
+ <td>90.24</td>
161
+ <td>100.07%</td>
162
+ </tr>
163
+ <tr>
164
+ <td>GPQA Diamond (pass@1)</td>
165
+ <td>49.27</td>
166
+ <td>48.71</td>
167
+ <td>98.86%</td>
168
+ </tr>
169
+ <tr>
170
+ <td><b>Average Score</b></td>
171
+ <td><b>62.90</b></td>
172
+ <td><b>63.26</b></td>
173
+ <td><b>100.57%</b></td>
174
+ </tr>
175
  <tr>
176
  <td rowspan="7"><b>OpenLLM V1</b></td>
177
  <td>ARC-Challenge (Acc-Norm, 25-shot)</td>