Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -268,14 +268,18 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
|
|
| 268 |
score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."
|
| 269 |
|
| 270 |
# Format detailed results for display in the text box
|
|
|
|
|
|
|
| 271 |
formatted_details = "\n\n".join([
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
|
|
|
|
|
|
| 279 |
for item in all_evaluation_results
|
| 280 |
])
|
| 281 |
|
|
|
|
| 268 |
score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."
|
| 269 |
|
| 270 |
# Format detailed results for display in the text box
|
| 271 |
+
# The key change here is to wrap the entire multi-line string construction for each item
|
| 272 |
+
# within parentheses to ensure it's treated as a single element in the list comprehension.
|
| 273 |
formatted_details = "\n\n".join([
|
| 274 |
+
(
|
| 275 |
+
f"### Question:\n{item['question']}\n\n"
|
| 276 |
+
+ f"**Choices:**\n" + "\n".join([f"{get_choice_letter(i)}. {c}" for i, c in enumerate(item['choices'])]) + "\n\n"
|
| 277 |
+
+ (f"**Note:** Reasoning models are currently not fully supported for single-letter extraction. The original model output followed:\n" if item.get('is_reasoning_model_output') else "")
|
| 278 |
+
+ f"**Model Raw Output:** {item['model_raw_output']}\n"
|
| 279 |
+
+ f"**Expected Answer:** {item['expected_answer_letter']}\n"
|
| 280 |
+
+ f"**Predicted Answer:** {item['predicted_answer_letter']}\n"
|
| 281 |
+
+ f"**Correct:** {'Yes' if item['is_correct'] else 'No'}"
|
| 282 |
+
)
|
| 283 |
for item in all_evaluation_results
|
| 284 |
])
|
| 285 |
|