Spaces:

Enderchef
/

SuperBench-Eval

Running on Zero

App Files Files Community

Enderchef committed on Jun 24

Commit

63c5f6c

verified ·

1 Parent(s): 4976904

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -28

app.py CHANGED Viewed

@@ -37,6 +37,45 @@ def extract_choice_letter(output):
     return match.group(1) if match else None
 def evaluate(model_id, sample_count, config_name):
     gen = load_model(model_id)
     dataset = load_dataset("cais/mmlu", config_name, token=HF_TOKEN)["test"]
     dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
@@ -92,24 +131,11 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
     with gr.Row():
         model_id = gr.Textbox(label="Your Hugging Face Model ID", placeholder="e.g., your-org/your-model")
         config_name = gr.Dropdown(
-            label="Choose MMLU Subject",
-            choices=[
-                "abstract_algebra", "anatomy", "astronomy", "business_ethics", "college_biology",
-                "college_chemistry", "college_computer_science", "college_mathematics", "college_medicine",
-                "college_physics", "computer_security", "econometrics", "electrical_engineering",
-                "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology",
-                "high_school_chemistry", "high_school_computer_science", "high_school_european_history",
-                "high_school_geography", "high_school_government_and_politics", "high_school_macroeconomics",
-                "high_school_microeconomics", "high_school_physics", "high_school_psychology",
-                "high_school_statistics", "high_school_us_history", "high_school_world_history", "human_aging",
-                "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", "machine_learning",
-                "management", "marketing", "medical_genetics", "miscellaneous", "moral_disputes",
-                "moral_scenarios", "nutrition", "philosophy", "prehistory", "professional_accounting",
-                "professional_law", "professional_medicine", "professional_psychology", "public_relations",
-                "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions"
-            ],
-            value="college_mathematics"
-        )
         sample_count = gr.Slider(label="Number of Samples", minimum=1, maximum=100, value=10, step=1)
     run_button = gr.Button("🚀 Run Evaluation")
@@ -122,18 +148,25 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
     with gr.Row():
         leaderboard_plot = gr.Plot(label="Leaderboard Chart")
-        leaderboard_table = gr.Dataframe(headers=["Model ID", "Subject", "Accuracy"], interactive=False)
     def load_leaderboard():
-        try:
-            df = pd.read_json("eval.jsonl", lines=True)
-            df_sorted = df.sort_values(by="accuracy", ascending=False).head(10)
-            fig, ax = plt.subplots()
-            ax.barh(df_sorted['model_id'], df_sorted['accuracy'])
-            ax.set_xlabel("Accuracy")
-            ax.set_ylabel("Model")
-            ax.set_title("Top 10 Models")
-            return fig, df_sorted
         except Exception as e:
             return plt.figure(), pd.DataFrame(columns=["model_id", "subject", "accuracy"])

     return match.group(1) if match else None
 def evaluate(model_id, sample_count, config_name):
+    if config_name == "ALL":
+        subjects = [
+            "abstract_algebra", "anatomy", "astronomy", "business_ethics", "college_biology",
+            "college_chemistry", "college_computer_science", "college_mathematics", "college_medicine",
+            "college_physics", "computer_security", "econometrics", "electrical_engineering",
+            "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology",
+            "high_school_chemistry", "high_school_computer_science", "high_school_european_history",
+            "high_school_geography", "high_school_government_and_politics", "high_school_macroeconomics",
+            "high_school_microeconomics", "high_school_physics", "high_school_psychology",
+            "high_school_statistics", "high_school_us_history", "high_school_world_history", "human_aging",
+            "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", "machine_learning",
+            "management", "marketing", "medical_genetics", "miscellaneous", "moral_disputes",
+            "moral_scenarios", "nutrition", "philosophy", "prehistory", "professional_accounting",
+            "professional_law", "professional_medicine", "professional_psychology", "public_relations",
+            "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions"
+        ]
+        gen = load_model(model_id)
+        total_correct = 0
+        total_samples = 0
+        all_results = []
+        for subject in subjects:
+            dataset = load_dataset("cais/mmlu", subject, token=HF_TOKEN)["test"]
+            dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
+            correct = 0
+            for item in dataset:
+                prompt, answer = format_prompt(item)
+                output = gen(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
+                output_letter = extract_choice_letter(output)
+                correct += output_letter == answer
+                all_results.append((prompt, output.strip(), answer, output_letter, output_letter == answer))
+            accuracy = correct / len(dataset) * 100
+            record = {"model_id": model_id, "subject": subject, "accuracy": accuracy}
+            with open("eval.jsonl", "a") as f:
+                f.write(json.dumps(record) + "\n")
+            total_correct += correct
+            total_samples += len(dataset)
+        avg_accuracy = total_correct / total_samples * 100
+        return f"Average Accuracy: {avg_accuracy:.2f}% across all subjects", all_results
     gen = load_model(model_id)
     dataset = load_dataset("cais/mmlu", config_name, token=HF_TOKEN)["test"]
     dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
     with gr.Row():
         model_id = gr.Textbox(label="Your Hugging Face Model ID", placeholder="e.g., your-org/your-model")
         config_name = gr.Dropdown(
+    label="Choose MMLU Subject",
+    choices=["ALL"],
+    value="ALL",
+    interactive=False
+)
         sample_count = gr.Slider(label="Number of Samples", minimum=1, maximum=100, value=10, step=1)
     run_button = gr.Button("🚀 Run Evaluation")
     with gr.Row():
         leaderboard_plot = gr.Plot(label="Leaderboard Chart")
+        leaderboard_table = gr.Dataframe(headers=["Model ID", "Average Accuracy"], interactive=False, datatype=["str", "number"], row_count=20, col_count=2)
     def load_leaderboard():
+    try:
+        df = pd.read_json("eval.jsonl", lines=True)
+        df_avg = df.groupby("model_id")["accuracy"].mean().reset_index()
+        df_avg.columns = ["model_id", "average_accuracy"]
+        df_sorted = df_avg.sort_values(by="average_accuracy", ascending=False)
+        top10 = df_sorted.head(10)
+        fig, ax = plt.subplots()
+        ax.barh(top10['model_id'], top10['average_accuracy'])
+        ax.set_xlabel("Average Accuracy")
+        ax.set_ylabel("Model")
+        ax.set_title("Top 10 Models by Average Accuracy")
+        return fig, df_sorted
+    except Exception as e:
+        return plt.figure(), pd.DataFrame(columns=["model_id", "average_accuracy"])
         except Exception as e:
             return plt.figure(), pd.DataFrame(columns=["model_id", "subject", "accuracy"])