Omartificial-Intelligence-Space committed on
Commit 402ebfa · verified · 1 Parent(s): 21cb2aa

update submit.py

Files changed (1):
  1. src/submission/submit.py +21 -27
src/submission/submit.py CHANGED

@@ -1,3 +1,5 @@
+# src/submission/submit.py
+
 import json
 import os
 from datetime import datetime, timezone
@@ -10,7 +12,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from langchain.prompts import PromptTemplate
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO, FIXED_QUESTIONS_FILE
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -69,7 +71,7 @@ def get_top_prediction(text, tokenizer, model):
     return top_option
 
 @spaces.GPU(duration=120)
-def evaluate_model_accuracy_by_subject(model_name, num_examples):
+def evaluate_model_accuracy_by_subject(model_name):
     try:
         # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -84,12 +86,13 @@ def evaluate_model_accuracy_by_subject(model_name, num_examples):
         else:
             model = model.cpu()
 
-        # Load your custom MMMLU dataset
-        dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
-        dataset = dataset['test']
+        # Load fixed questions from JSON file
+        fixed_questions_path = os.path.join(EVAL_RESULTS_PATH, FIXED_QUESTIONS_FILE)
+        if not os.path.exists(fixed_questions_path):
+            return "Fixed questions file not found. Please run the preselection step.", {}
 
-        # Filter out excluded subjects
-        dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
+        with open(fixed_questions_path, 'r') as f:
+            fixed_questions = json.load(f)
 
         # Define prompt template
         template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
@@ -104,23 +107,15 @@ Answer:"""
 
         # Initialize results storage
         subject_results = {}
-
-        subjects = dataset.unique('Subject')
        overall_correct_predictions = 0
        overall_total_questions = 0
 
-        for subject in subjects:
-            subject_data = dataset.filter(lambda x: x['Subject'] == subject)
-
-            # Sample num_examples from each subject
-            if num_examples > 0:
-                subject_data = subject_data.shuffle().select(range(min(num_examples, len(subject_data))))
-
+        for subject, questions in fixed_questions.items():
             correct_predictions = 0
             total_questions = 0
             results = []
 
-            for data in subject_data:
+            for data in questions:
                 # Prepare text input
                 text = prompt_template.format(
                     Question=data['Question'],
@@ -171,8 +166,7 @@ def add_new_eval(
     revision: str,
     precision: str,
     weight_type: str,
-    model_type: str,
-    num_examples: int
+    model_type: str
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -230,7 +224,7 @@
 
     # Now, perform the evaluation
     try:
-        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, int(num_examples))
+        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model)
         if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
             return styled_error(overall_accuracy)
     except Exception as e:
@@ -239,17 +233,17 @@
     # Prepare results for storage
     results_dict = {
         "config": {
-            "model_name": model,
-            "model_sha": revision,
-            "model_dtype": precision,
-            "submitted_time": current_time,
-            "model_type": model_type,
+            "model": model,
+            "base_model": base_model,
+            "revision": revision,
+            "precision": precision,
             "weight_type": weight_type,
+            "model_type": model_type,
+            "submitted_time": current_time,
             "license": license,
             "likes": model_info.likes,
             "params": model_size,
             "still_on_hub": True,
-            "precision": precision,
         },
         "results": {
             "average": overall_accuracy,
@@ -264,7 +258,7 @@
     # Save results to a JSON file
     results_file_path = f"{EVAL_RESULTS_PATH}/{model.replace('/', '_')}_results.json"
     with open(results_file_path, "w") as f:
-        json.dump(results_dict, f)
+        json.dump(results_dict, f, indent=4)
 
     # Upload the results file
     API.upload_file(
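This commit replaces on-the-fly sampling from the Arabic_Openai_MMMLU test split with a pre-selected question set read from a JSON file at EVAL_RESULTS_PATH/FIXED_QUESTIONS_FILE, so every submission is scored on the same questions. The commit does not show how that file is produced; the sketch below is a hypothetical preselection helper, not part of this change. It assumes the file maps each subject name to a list of question records stored as-is from the dataset, and the values of FIXED_QUESTIONS_FILE, the per-subject sample size, and the excluded-subjects list are placeholders.

# preselect_fixed_questions.py -- hypothetical helper, not part of this commit.
# Writes the fixed-questions JSON that evaluate_model_accuracy_by_subject()
# now reads from os.path.join(EVAL_RESULTS_PATH, FIXED_QUESTIONS_FILE).
import json
import os
from collections import defaultdict

from datasets import load_dataset

EVAL_RESULTS_PATH = "eval-results"             # assumed value of src.envs.EVAL_RESULTS_PATH
FIXED_QUESTIONS_FILE = "fixed_questions.json"  # assumed value of src.envs.FIXED_QUESTIONS_FILE
NUM_EXAMPLES_PER_SUBJECT = 10                  # assumed fixed sample size per subject
EXCLUDED_SUBJECTS: set[str] = set()            # submit.py filters excluded_subjects; its contents are not shown here

dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")["test"]

# Group a fixed, shuffled sample of rows by subject.
by_subject: dict[str, list[dict]] = defaultdict(list)
for row in dataset.shuffle(seed=42):
    subject = row["Subject"]
    if subject in EXCLUDED_SUBJECTS:
        continue
    if len(by_subject[subject]) < NUM_EXAMPLES_PER_SUBJECT:
        by_subject[subject].append(row)

os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
with open(os.path.join(EVAL_RESULTS_PATH, FIXED_QUESTIONS_FILE), "w") as f:
    json.dump(by_subject, f, ensure_ascii=False, indent=4)

With this layout, fixed_questions.items() yields (subject, list-of-question-dicts) pairs, which is exactly what the rewritten loop consumes.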
 
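For reference, a minimal caller sketch mirroring the updated call site in add_new_eval(): the num_examples argument is gone, and both evaluation errors and the new missing-file message come back as plain strings in place of the accuracy value. The model id below is only an example, not from the commit.

# Hypothetical usage sketch of the updated evaluation entry point.
overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(
    "Qwen/Qwen2.5-0.5B-Instruct"  # example model id, not from the commit
)

if isinstance(overall_accuracy, str):
    # Covers both "Error ..." strings and the
    # "Fixed questions file not found ..." message.
    print(overall_accuracy)
else:
    print(f"average accuracy: {overall_accuracy:.3f}")
    for subject, stats in subject_results.items():
        print(subject, stats)

On success, add_new_eval() writes these values into results_dict, dumps it to {EVAL_RESULTS_PATH}/{model}_results.json with indent=4, and uploads the file via API.upload_file, as shown in the diff above.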