Enderchef committed (verified)
Commit: f6dce38
Parent(s): be06efe

Update app.py

Files changed (1):
  1. app.py: +19 -14
app.py CHANGED

@@ -36,7 +36,7 @@ def extract_choice_letter(output):
     match = re.search(r"\b([ABCD])\b", output.strip())
     return match.group(1) if match else None
 
-# Added progress parameter to the evaluate function
+# Modified evaluate function to return accuracy as a float directly
 def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
     if config_name == "ALL":
         subjects = [
@@ -58,26 +58,22 @@ def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
         total_correct = 0
         total_samples = 0
         all_results = []
-        # Use progress for subject iteration
         for i, subject in enumerate(progress.tqdm(subjects, desc="Evaluating subjects")):
             dataset = load_dataset("cais/mmlu", subject, token=HF_TOKEN)["test"]
             dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
             correct = 0
-            # Use progress for sample iteration within each subject
             for j, item in enumerate(progress.tqdm(dataset, desc=f"Processing {subject} samples")):
                 prompt, answer = format_prompt(item)
                 output = gen(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
                 output_letter = extract_choice_letter(output)
                 correct += output_letter == answer
                 all_results.append((prompt, output.strip(), answer, output_letter, output_letter == answer))
-            accuracy = correct / len(dataset) * 100
-            record = {"model_id": model_id, "subject": subject, "accuracy": accuracy}
-            with open("eval.jsonl", "a") as f:
-                f.write(json.dumps(record) + "\n")
+            # No need to write subject-level record here, only aggregate
             total_correct += correct
             total_samples += len(dataset)
         avg_accuracy = total_correct / total_samples * 100
-        return f"Average Accuracy: {avg_accuracy:.2f}% across all subjects", all_results
+        # Return the float accuracy value
+        return avg_accuracy, all_results
     gen = load_model(model_id)
     dataset = load_dataset("cais/mmlu", config_name, token=HF_TOKEN)["test"]
     dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
@@ -85,7 +81,6 @@ def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
     correct = 0
     results = []
 
-    # Use progress for sample iteration
     for i, item in enumerate(progress.tqdm(dataset, desc=f"Processing {config_name} samples")):
         prompt, answer = format_prompt(item)
         output = gen(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
@@ -95,20 +90,30 @@ def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
         results.append((prompt, output.strip(), answer, output_letter, is_correct))
 
     accuracy = correct / len(dataset) * 100
-    return f"Accuracy: {accuracy:.2f}%, out of {len(dataset)} samples", results
+    # Return the float accuracy value
+    return accuracy, results
 
 # Pass progress to evaluate function
 def run(model_id, sample_count, config_name, progress=gr.Progress()):
-    score, details = evaluate(model_id, sample_count, config_name, progress)
+    # Receive accuracy_value directly as a float
+    accuracy_value, details = evaluate(model_id, sample_count, config_name, progress)
+
     formatted = "\n\n".join([
         f"### Question:\n{q}\n\n**Model Answer:** {o}\n**Expected:** {a}\n**Predicted:** {g}\n**Correct:** {c}"
         for q, o, a, g, c in details
     ])
-    accuracy_value = float(score.split()[1][:-1])
+
+    # Format the score string based on config_name
+    if config_name == "ALL":
+        score_string = f"Average Accuracy: {accuracy_value:.2f}% across all subjects"
+    else:
+        # Assuming len(details) corresponds to the number of samples processed for a single subject
+        score_string = f"Accuracy: {accuracy_value:.2f}%, out of {len(details)} samples"
+
     record = {"model_id": model_id, "subject": config_name, "accuracy": accuracy_value}
     with open("eval.jsonl", "a") as f:
         f.write(json.dumps(record) + "\n")
-    return score, formatted
+    return score_string, formatted  # Return the formatted string and details
 
 def save_text(text):
     return "evaluation_results.txt", text
@@ -167,4 +172,4 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
 
     demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot, leaderboard_table])
 
-demo.launch()
+demo.launch()
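For reference, the answer-scoring path that these hunks leave untouched can be exercised on its own. Below is a minimal sketch using only the regex and the bool-as-int counting visible in the diff context, with made-up generations standing in for the text-generation pipeline:

import re

def extract_choice_letter(output):
    # First standalone A/B/C/D token in the model's continuation (regex from the diff context)
    match = re.search(r"\b([ABCD])\b", output.strip())
    return match.group(1) if match else None

# Made-up (generation, gold answer) pairs, not real model output
samples = [("Answer: B", "B"), ("The correct choice is C.", "C"), ("no letter here", "A")]

correct = 0
for generated, answer in samples:
    correct += extract_choice_letter(generated) == answer   # bool counts as 0/1, as in evaluate()
print(f"Accuracy: {correct / len(samples) * 100:.2f}%")     # Accuracy: 66.67%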
 
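The substantive change is the contract between evaluate() and run(): instead of returning a pre-formatted string that run() re-parsed with float(score.split()[1][:-1]), evaluate() now returns the accuracy as a float and run() builds the display string itself. A minimal sketch of that calling convention follows; evaluate_stub, run_stub, and the fake details are hypothetical stand-ins for the real MMLU loop:

# Stand-ins for the real functions; names and data here are illustrative only.
def evaluate_stub(model_id, sample_count, config_name):
    # (prompt, model_output, expected, predicted, is_correct) tuples, as in results/all_results
    details = [("Q1", "B", "B", "B", True), ("Q2", "A", "C", "A", False)]
    accuracy = sum(1 for *_, ok in details if ok) / len(details) * 100
    return accuracy, details  # float accuracy, not a pre-formatted string

def run_stub(model_id, sample_count, config_name):
    accuracy_value, details = evaluate_stub(model_id, sample_count, config_name)
    # Formatting now lives in run(), keyed on config_name
    if config_name == "ALL":
        score_string = f"Average Accuracy: {accuracy_value:.2f}% across all subjects"
    else:
        score_string = f"Accuracy: {accuracy_value:.2f}%, out of {len(details)} samples"
    return score_string

print(run_stub("some-model", 2, "abstract_algebra"))  # Accuracy: 50.00%, out of 2 samples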
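Both before and after this commit, results are appended to eval.jsonl as one JSON object per line, which the leaderboard presumably reads back. A small sketch of that record format, under the assumption that read-back is a plain line-by-line json.loads (append_record and read_records are hypothetical helpers, not the Space's actual load_leaderboard code):

import json

def append_record(path, model_id, subject, accuracy):
    # Same record shape as the one written in run(): one JSON object per line
    record = {"model_id": model_id, "subject": subject, "accuracy": accuracy}
    with open(path, "a") as f:
        f.write(json.dumps(record) + "\n")

def read_records(path):
    # JSONL read-back: one json.loads per non-empty line
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

append_record("eval.jsonl", "some-model", "ALL", 52.5)  # "some-model" is a placeholder id
for rec in read_records("eval.jsonl"):
    print(rec["model_id"], rec["subject"], f"{rec['accuracy']:.2f}%")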