Enderchef committed (verified)
Commit be06efe · 1 Parent(s): 3d20418

Update app.py

Files changed (1):
  1. app.py +16 -10

app.py CHANGED
@@ -36,7 +36,8 @@ def extract_choice_letter(output):
     match = re.search(r"\b([ABCD])\b", output.strip())
     return match.group(1) if match else None
 
-def evaluate(model_id, sample_count, config_name):
+# Added progress parameter to the evaluate function
+def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
     if config_name == "ALL":
         subjects = [
             "abstract_algebra", "anatomy", "astronomy", "business_ethics", "college_biology",
@@ -57,11 +58,13 @@ def evaluate(model_id, sample_count, config_name):
         total_correct = 0
         total_samples = 0
         all_results = []
-        for subject in subjects:
+        # Use progress for subject iteration
+        for i, subject in enumerate(progress.tqdm(subjects, desc="Evaluating subjects")):
             dataset = load_dataset("cais/mmlu", subject, token=HF_TOKEN)["test"]
             dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
             correct = 0
-            for item in dataset:
+            # Use progress for sample iteration within each subject
+            for j, item in enumerate(progress.tqdm(dataset, desc=f"Processing {subject} samples")):
                 prompt, answer = format_prompt(item)
                 output = gen(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
                 output_letter = extract_choice_letter(output)
@@ -70,7 +73,7 @@ def evaluate(model_id, sample_count, config_name):
             accuracy = correct / len(dataset) * 100
             record = {"model_id": model_id, "subject": subject, "accuracy": accuracy}
             with open("eval.jsonl", "a") as f:
-                f.write(json.dumps(record) + "\n")  # Fixed: added closing double quote and newline
+                f.write(json.dumps(record) + "\n")
             total_correct += correct
             total_samples += len(dataset)
         avg_accuracy = total_correct / total_samples * 100
@@ -82,7 +85,8 @@ def evaluate(model_id, sample_count, config_name):
         correct = 0
         results = []
 
-        for item in dataset:
+        # Use progress for sample iteration
+        for i, item in enumerate(progress.tqdm(dataset, desc=f"Processing {config_name} samples")):
             prompt, answer = format_prompt(item)
             output = gen(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
             output_letter = extract_choice_letter(output)
@@ -93,8 +97,9 @@ def evaluate(model_id, sample_count, config_name):
         accuracy = correct / len(dataset) * 100
         return f"Accuracy: {accuracy:.2f}%, out of {len(dataset)} samples", results
 
-def run(model_id, sample_count, config_name):
-    score, details = evaluate(model_id, sample_count, config_name)
+# Pass progress to evaluate function
+def run(model_id, sample_count, config_name, progress=gr.Progress()):
+    score, details = evaluate(model_id, sample_count, config_name, progress)
     formatted = "\n\n".join([
         f"### Question:\n{q}\n\n**Model Answer:** {o}\n**Expected:** {a}\n**Predicted:** {g}\n**Correct:** {c}"
         for q, o, a, g, c in details
@@ -102,7 +107,7 @@ def run(model_id, sample_count, config_name):
     accuracy_value = float(score.split()[1][:-1])
     record = {"model_id": model_id, "subject": config_name, "accuracy": accuracy_value}
     with open("eval.jsonl", "a") as f:
-        f.write(json.dumps(record) + "\n")  # Fixed: added closing double quote and newline
+        f.write(json.dumps(record) + "\n")
     return score, formatted
 
 def save_text(text):
@@ -133,6 +138,7 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
     detail_output = gr.Textbox(label="Evaluation Details", lines=20, interactive=False)
     download_button = gr.Button("📥 Download Full Evaluation")
 
+    # Pass progress to the run function
    run_button.click(run, inputs=[model_id, sample_count, config_name], outputs=[acc_output, detail_output])
    download_button.click(save_text, inputs=detail_output, outputs=gr.File())
 
@@ -157,8 +163,8 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
             return fig, df_sorted
         except Exception as e:
             # Handle the case where eval.jsonl might not exist yet
-            return plt.figure(), pd.DataFrame(columns=["model_id", "average_accuracy"])  # Corrected columns
+            return plt.figure(), pd.DataFrame(columns=["model_id", "average_accuracy"])
 
    demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot, leaderboard_table])
 
-demo.launch()
+demo.launch()
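
The change threads a Gradio progress tracker through the evaluation: evaluate and run now accept progress=gr.Progress(), and each loop wraps its iterable in progress.tqdm(...) so the Space shows per-subject and per-sample progress while the MMLU run is in flight. Below is a minimal, self-contained sketch of that pattern under stated assumptions; the score_items function, the dummy work loop, and the component names are illustrative stand-ins, not code from app.py.

import time
import gradio as gr

# Illustrative stand-in for the evaluation loop; not the app.py implementation.
def score_items(n_items, progress=gr.Progress()):
    done = 0
    # progress.tqdm wraps any iterable and drives the progress bar in the UI
    for _ in progress.tqdm(range(int(n_items)), desc="Scoring samples"):
        time.sleep(0.05)  # stand-in for model inference
        done += 1
    return f"Scored {done} items"

with gr.Blocks() as demo:
    n = gr.Number(value=20, label="Samples")
    out = gr.Textbox(label="Result")
    btn = gr.Button("Run")
    # Gradio injects the Progress object; it is not listed in inputs
    btn.click(score_items, inputs=n, outputs=out)

demo.launch()

Because the default progress=gr.Progress() is injected by Gradio when the function runs as an event handler, the run_button.click(...) call in the diff needs no extra entry in inputs; run then forwards the same progress object to evaluate, so both the subject loop and the sample loop report to one progress bar.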