Omartificial-Intelligence-Space committed
Commit 7cbbcca · verified · 1 Parent(s): 9535ee7

update submit

Files changed (1)
  1. src/submission/submit.py (+21 -45)
src/submission/submit.py CHANGED
@@ -51,10 +51,8 @@ def get_top_prediction(text, tokenizer, model):
     options = [' A', ' B', ' C', ' D']
     option_logits = []
 
-    # Iterate through each option
     for option in options:
         option_ids = tokenizer(option).input_ids
-        # Ensure option_ids are within range and not empty
         if option_ids and option_ids[-1] < logits.size(0):
             option_id = option_ids[-1]
             option_logit = logits[option_id]
@@ -65,14 +63,12 @@ def get_top_prediction(text, tokenizer, model):
     if not option_logits:
         return "No valid options"
 
-    # Get the option with the highest logit
     top_option = max(option_logits, key=lambda x: x[0])[1]
     return top_option
 
 @spaces.GPU(duration=120)
 def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=100):
     try:
-        # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         tokenizer.pad_token = tokenizer.eos_token
 
@@ -85,14 +81,11 @@ def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=100
         else:
             model = model.cpu()
 
-        # Load your custom MMMLU dataset from HuggingFace
         dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
-        dataset = dataset['test'[:100]]
+        dataset = dataset['test']
 
-        # Filter out excluded subjects
         dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
 
-        # Define prompt template
         template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
 Question: {Question}
 A) {A}
@@ -103,16 +96,18 @@ Answer:"""
 
         prompt_template = PromptTemplate(template=template, input_variables=['Question', 'A', 'B', 'C', 'D'])
 
-        # Initialize results storage
         subject_results = {}
         overall_correct_predictions = 0
         overall_total_questions = 0
 
         subjects = dataset.unique('Subject')
+
+        # To track best performance per subject
+        best_in_class = {subject: {"model_name": None, "accuracy": 0} for subject in subjects}
+
        for subject in subjects:
             subject_data = dataset.filter(lambda x: x['Subject'] == subject)
 
-            # Sample num_questions_per_subject from each subject
             if num_questions_per_subject > 0:
                 if len(subject_data) < num_questions_per_subject:
                     print(f"Warning: Not enough questions for subject '{subject}'. Using all available questions.")
@@ -126,7 +121,6 @@ Answer:"""
             results = []
 
             for data in subject_data:
-                # Prepare text input
                 text = prompt_template.format(
                     Question=data['Question'],
                     A=data['A'],
@@ -135,7 +129,6 @@ Answer:"""
                     D=data['D']
                 )
 
-                # Get the top prediction
                 top_prediction = get_top_prediction(text, tokenizer, model)
                 is_correct = (top_prediction == data['Answer'])
                 correct_predictions += int(is_correct)
@@ -152,7 +145,11 @@ Answer:"""
 
             accuracy = correct_predictions / total_questions if total_questions > 0 else 0
 
-            # Store results for this subject
+            # Check if this model is the best for the current subject
+            if accuracy > best_in_class[subject]['accuracy']:
+                best_in_class[subject]['model_name'] = model_name
+                best_in_class[subject]['accuracy'] = accuracy
+
             subject_results[subject] = {
                 'Correct Predictions': correct_predictions,
                 'Total Questions': total_questions,
@@ -162,13 +159,18 @@ Answer:"""
 
         overall_accuracy = (overall_correct_predictions / overall_total_questions) * 100 if overall_total_questions > 0 else 0
 
-        return overall_accuracy, subject_results
+        return overall_accuracy, subject_results, best_in_class
 
     except Exception as e:
         import traceback
         tb = traceback.format_exc()
         print(f"Error in evaluate_model_accuracy_by_subject: {e}\n{tb}")
-        return f"Error: {str(e)}", {}
+        return f"Error: {str(e)}", {}, {}
+
+def display_best_in_class(best_in_class):
+    print("\nBest Model in Each Subject:\n")
+    for subject, info in best_in_class.items():
+        print(f"{subject}: {info['model_name']} with accuracy: {info['accuracy'] * 100:.2f}%")
 
 def add_new_eval(
     model: str,
@@ -199,7 +201,6 @@ def add_new_eval(
     if revision == "":
         revision = "main"
 
-    # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
         base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
         if not base_model_on_hub:
@@ -210,36 +211,16 @@ def add_new_eval(
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
 
-    # Is the model info correctly filled?
-    try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-
-    model_size = get_model_size(model_info=model_info, precision=precision)
-
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model")
-
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
-
-    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
-
-    # Now, perform the evaluation
     try:
-        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, num_questions_per_subject=100)
+        overall_accuracy, subject_results, best_in_class = evaluate_model_accuracy_by_subject(model, num_questions_per_subject=100)
         if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
             return styled_error(overall_accuracy)
     except Exception as e:
         return styled_error(f"An error occurred during evaluation: {str(e)}")
 
+    # Display the best in class results
+    display_best_in_class(best_in_class)
+
     # Prepare results for storage
     results_dict = {
         "config": {
@@ -250,10 +231,6 @@ def add_new_eval(
             "weight_type": weight_type,
             "model_type": model_type,
             "submitted_time": current_time,
-            "license": license,
-            "likes": model_info.likes,
-            "params": model_size,
-            "still_on_hub": True,
         },
         "results": {
             "average": overall_accuracy,
@@ -279,7 +256,6 @@ def add_new_eval(
         commit_message=f"Add results for {model}"
     )
 
-    # Remove the local results file
     os.remove(results_file_path)
 
     return styled_message("Your model has been evaluated and the results are now on the leaderboard!")
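For context on the scoring that get_top_prediction performs (only partially visible in this diff), here is a hedged, self-contained sketch of the idea: take the model's next-token logits for the prompt, look up the logit of the last token of each candidate option ' A' through ' D', and return the highest-scoring letter. The model and tokenizer loading and the exact surrounding code in submit.py are assumptions for illustration, not the file's verbatim contents.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def top_option_sketch(text, tokenizer, model):
    # Next-token logits for the prompt (sketch of what submit.py computes upstream).
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1, :]

    option_logits = []
    for option in [' A', ' B', ' C', ' D']:
        option_ids = tokenizer(option).input_ids
        # Guard against empty encodings or ids outside the logit vector.
        if option_ids and option_ids[-1] < logits.size(0):
            option_logits.append((logits[option_ids[-1]].item(), option.strip()))

    if not option_logits:
        return "No valid options"
    # Highest-logit option wins.
    return max(option_logits, key=lambda x: x[0])[1]

# Hypothetical usage with a small public model:
# tokenizer = AutoTokenizer.from_pretrained("gpt2")
# model = AutoModelForCausalLM.from_pretrained("gpt2")
# print(top_option_sketch("Question: 1 + 1 = ?\nA) 2\nB) 3\nC) 4\nD) 5\nAnswer:", tokenizer, model))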
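A small note on the dataset change in the third hunk: in Python, 'test'[:100] slices the string key itself and still evaluates to 'test', so the removed line already loaded the full test split; the replacement simply states that intent directly. If the original goal had been to keep only the first 100 rows, a sketch of that (an assumption about intent, not something this commit does) would look like:

from datasets import load_dataset

dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")

# Equivalent spellings of "take the whole test split":
assert 'test'[:100] == 'test'
test_split = dataset['test']          # what the commit now does

# Hypothetical: only if a 100-row subsample were actually wanted.
test_subset = test_split.select(range(min(100, len(test_split))))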
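The other substantive addition is the best_in_class bookkeeping: a dict keyed by subject that stores the highest accuracy seen and the model that achieved it, printed afterwards by display_best_in_class. A minimal toy sketch of that pattern, with invented subject names and scores:

# Toy data; subjects and accuracies are invented for illustration.
subjects = ["abstract_algebra", "world_history"]
best_in_class = {subject: {"model_name": None, "accuracy": 0} for subject in subjects}

def record(subject, model_name, accuracy):
    # Keep only the best-scoring model seen so far for each subject.
    if accuracy > best_in_class[subject]["accuracy"]:
        best_in_class[subject]["model_name"] = model_name
        best_in_class[subject]["accuracy"] = accuracy

record("abstract_algebra", "model-a", 0.58)
record("abstract_algebra", "model-b", 0.71)   # replaces model-a
record("world_history", "model-a", 0.64)

for subject, info in best_in_class.items():
    print(f"{subject}: {info['model_name']} with accuracy: {info['accuracy'] * 100:.2f}%")

Note that, as written in the commit, the dict is local to one evaluate_model_accuracy_by_subject call, so it records a single model's per-subject accuracies for that submission.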