Spaces:

Enderchef
/

SuperBench-Eval

Sleeping

Enderchef committed on Jun 25

Commit

2d01a29

verified ·

1 Parent(s): 02583ad

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -268,14 +268,18 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
             score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."
         # Format detailed results for display in the text box
         formatted_details = "\n\n".join([
-            f"### Question:\n{item['question']}\n\n"
-            f"**Choices:**\n" + "\n".join([f"{get_choice_letter(i)}. {c}" for i, c in enumerate(item['choices'])]) + "\n\n"
-            + (f"**Note:** Reasoning models are currently not fully supported for single-letter extraction. The original model output followed:\n" if item.get('is_reasoning_model_output') else "")
-            f"**Model Raw Output:** {item['model_raw_output']}\n"
-            f"**Expected Answer:** {item['expected_answer_letter']}\n"
-            f"**Predicted Answer:** {item['predicted_answer_letter']}\n"
-            f"**Correct:** {'Yes' if item['is_correct'] else 'No'}"
             for item in all_evaluation_results
         ])

             score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."
         # Format detailed results for display in the text box
+        # The key change here is to wrap the entire multi-line string construction for each item
+        # within parentheses to ensure it's treated as a single element in the list comprehension.
         formatted_details = "\n\n".join([
+            (
+                f"### Question:\n{item['question']}\n\n"
+                + f"**Choices:**\n" + "\n".join([f"{get_choice_letter(i)}. {c}" for i, c in enumerate(item['choices'])]) + "\n\n"
+                + (f"**Note:** Reasoning models are currently not fully supported for single-letter extraction. The original model output followed:\n" if item.get('is_reasoning_model_output') else "")
+                + f"**Model Raw Output:** {item['model_raw_output']}\n"
+                + f"**Expected Answer:** {item['expected_answer_letter']}\n"
+                + f"**Predicted Answer:** {item['predicted_answer_letter']}\n"
+                + f"**Correct:** {'Yes' if item['is_correct'] else 'No'}"
+            )
             for item in all_evaluation_results
         ])