Enderchef committed
Commit 63c5f6c · verified · 1 Parent(s): 4976904

Update app.py

Files changed (1)
  1. app.py +61 -28
app.py CHANGED
@@ -37,6 +37,45 @@ def extract_choice_letter(output):
     return match.group(1) if match else None
 
 def evaluate(model_id, sample_count, config_name):
+    if config_name == "ALL":
+        subjects = [
+            "abstract_algebra", "anatomy", "astronomy", "business_ethics", "college_biology",
+            "college_chemistry", "college_computer_science", "college_mathematics", "college_medicine",
+            "college_physics", "computer_security", "econometrics", "electrical_engineering",
+            "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology",
+            "high_school_chemistry", "high_school_computer_science", "high_school_european_history",
+            "high_school_geography", "high_school_government_and_politics", "high_school_macroeconomics",
+            "high_school_microeconomics", "high_school_physics", "high_school_psychology",
+            "high_school_statistics", "high_school_us_history", "high_school_world_history", "human_aging",
+            "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", "machine_learning",
+            "management", "marketing", "medical_genetics", "miscellaneous", "moral_disputes",
+            "moral_scenarios", "nutrition", "philosophy", "prehistory", "professional_accounting",
+            "professional_law", "professional_medicine", "professional_psychology", "public_relations",
+            "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions"
+        ]
+        gen = load_model(model_id)
+        total_correct = 0
+        total_samples = 0
+        all_results = []
+        for subject in subjects:
+            dataset = load_dataset("cais/mmlu", subject, token=HF_TOKEN)["test"]
+            dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
+            correct = 0
+            for item in dataset:
+                prompt, answer = format_prompt(item)
+                output = gen(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
+                output_letter = extract_choice_letter(output)
+                correct += output_letter == answer
+                all_results.append((prompt, output.strip(), answer, output_letter, output_letter == answer))
+            accuracy = correct / len(dataset) * 100
+            record = {"model_id": model_id, "subject": subject, "accuracy": accuracy}
+            with open("eval.jsonl", "a") as f:
+                f.write(json.dumps(record) + "\n")
+            total_correct += correct
+            total_samples += len(dataset)
+        avg_accuracy = total_correct / total_samples * 100
+        return f"Average Accuracy: {avg_accuracy:.2f}% across all subjects", all_results
     gen = load_model(model_id)
     dataset = load_dataset("cais/mmlu", config_name, token=HF_TOKEN)["test"]
     dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
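The "ALL" path above appends one JSON record per subject to eval.jsonl and returns a micro-average over all sampled questions. A minimal standalone sketch of that record format and how the file is read back (the file name and field names come from the diff; the model name and accuracy values are made up):

    import json
    import pandas as pd

    # Hypothetical per-subject records in the same shape evaluate() writes.
    records = [
        {"model_id": "your-org/your-model", "subject": "college_mathematics", "accuracy": 40.0},
        {"model_id": "your-org/your-model", "subject": "anatomy", "accuracy": 60.0},
    ]

    # Append them the way the new code does: one JSON object per line.
    with open("eval.jsonl", "a") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")

    # Read the file back the way load_leaderboard() does further down.
    df = pd.read_json("eval.jsonl", lines=True)
    print(df)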
@@ -92,24 +131,11 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
     with gr.Row():
         model_id = gr.Textbox(label="Your Hugging Face Model ID", placeholder="e.g., your-org/your-model")
         config_name = gr.Dropdown(
-            label="Choose MMLU Subject",
-            choices=[
-                "abstract_algebra", "anatomy", "astronomy", "business_ethics", "college_biology",
-                "college_chemistry", "college_computer_science", "college_mathematics", "college_medicine",
-                "college_physics", "computer_security", "econometrics", "electrical_engineering",
-                "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology",
-                "high_school_chemistry", "high_school_computer_science", "high_school_european_history",
-                "high_school_geography", "high_school_government_and_politics", "high_school_macroeconomics",
-                "high_school_microeconomics", "high_school_physics", "high_school_psychology",
-                "high_school_statistics", "high_school_us_history", "high_school_world_history", "human_aging",
-                "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", "machine_learning",
-                "management", "marketing", "medical_genetics", "miscellaneous", "moral_disputes",
-                "moral_scenarios", "nutrition", "philosophy", "prehistory", "professional_accounting",
-                "professional_law", "professional_medicine", "professional_psychology", "public_relations",
-                "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions"
-            ],
-            value="college_mathematics"
-        )
+            label="Choose MMLU Subject",
+            choices=["ALL"],
+            value="ALL",
+            interactive=False
+        )
         sample_count = gr.Slider(label="Number of Samples", minimum=1, maximum=100, value=10, step=1)
 
     run_button = gr.Button("🚀 Run Evaluation")
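The hunks shown here only change the input widgets; the click wiring that feeds them into evaluate() is not part of this diff. A minimal sketch of how such a Blocks app is typically wired, with a stub standing in for the real evaluate() and an assumed output textbox:

    import gradio as gr

    def evaluate(model_id, sample_count, config_name):
        # Stub standing in for the real evaluate() in app.py.
        return f"Average Accuracy: 0.00% across all subjects ({model_id}, n={sample_count})", []

    with gr.Blocks() as demo:
        with gr.Row():
            model_id = gr.Textbox(label="Your Hugging Face Model ID", placeholder="e.g., your-org/your-model")
            # Locked to "ALL" as in the diff: shown to the user but not editable.
            config_name = gr.Dropdown(label="Choose MMLU Subject", choices=["ALL"], value="ALL", interactive=False)
            sample_count = gr.Slider(label="Number of Samples", minimum=1, maximum=100, value=10, step=1)
        run_button = gr.Button("🚀 Run Evaluation")
        result_box = gr.Textbox(label="Result")  # assumed output component

        # Only the summary string is displayed; the per-question results are dropped here.
        run_button.click(fn=lambda m, n, c: evaluate(m, n, c)[0],
                         inputs=[model_id, sample_count, config_name],
                         outputs=result_box)

    demo.launch()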
@@ -122,18 +148,25 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
 
     with gr.Row():
         leaderboard_plot = gr.Plot(label="Leaderboard Chart")
-        leaderboard_table = gr.Dataframe(headers=["Model ID", "Subject", "Accuracy"], interactive=False)
+        leaderboard_table = gr.Dataframe(headers=["Model ID", "Average Accuracy"], interactive=False, datatype=["str", "number"], row_count=20, col_count=2)
 
     def load_leaderboard():
-        try:
-            df = pd.read_json("eval.jsonl", lines=True)
-            df_sorted = df.sort_values(by="accuracy", ascending=False).head(10)
-            fig, ax = plt.subplots()
-            ax.barh(df_sorted['model_id'], df_sorted['accuracy'])
-            ax.set_xlabel("Accuracy")
-            ax.set_ylabel("Model")
-            ax.set_title("Top 10 Models")
-            return fig, df_sorted
+        try:
+            df = pd.read_json("eval.jsonl", lines=True)
+            df_avg = df.groupby("model_id")["accuracy"].mean().reset_index()
+            df_avg.columns = ["model_id", "average_accuracy"]
+            df_sorted = df_avg.sort_values(by="average_accuracy", ascending=False)
+            top10 = df_sorted.head(10)
+
+            fig, ax = plt.subplots()
+            ax.barh(top10['model_id'], top10['average_accuracy'])
+            ax.set_xlabel("Average Accuracy")
+            ax.set_ylabel("Model")
+            ax.set_title("Top 10 Models by Average Accuracy")
+
+            return fig, df_sorted
+        except Exception as e:
+            return plt.figure(), pd.DataFrame(columns=["model_id", "average_accuracy"])
         except Exception as e:
             return plt.figure(), pd.DataFrame(columns=["model_id", "subject", "accuracy"])
 
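For reference, the new aggregation in load_leaderboard() can be exercised on its own; a minimal sketch using invented records in the eval.jsonl shape (model names and accuracy values are made up):

    import matplotlib.pyplot as plt
    import pandas as pd

    # Invented per-subject records in the shape eval.jsonl stores.
    df = pd.DataFrame([
        {"model_id": "org/model-a", "subject": "anatomy", "accuracy": 55.0},
        {"model_id": "org/model-a", "subject": "virology", "accuracy": 45.0},
        {"model_id": "org/model-b", "subject": "anatomy", "accuracy": 70.0},
    ])

    # Same aggregation as the diff: mean accuracy per model across its subject records.
    df_avg = df.groupby("model_id")["accuracy"].mean().reset_index()
    df_avg.columns = ["model_id", "average_accuracy"]
    df_sorted = df_avg.sort_values(by="average_accuracy", ascending=False)
    print(df_sorted)  # org/model-b averages 70.0, org/model-a averages 50.0

    # Same chart as the diff, drawn from the toy data.
    fig, ax = plt.subplots()
    ax.barh(df_sorted["model_id"], df_sorted["average_accuracy"])
    ax.set_xlabel("Average Accuracy")
    ax.set_title("Top Models by Average Accuracy")
    plt.show()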
172