update submit.py
src/submission/submit.py  CHANGED  (+21 -27)
@@ -1,3 +1,5 @@
+# src/submission/submit.py
+
 import json
 import os
 from datetime import datetime, timezone
@@ -10,7 +12,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from langchain.prompts import PromptTemplate
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO, FIXED_QUESTIONS_FILE
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -69,7 +71,7 @@ def get_top_prediction(text, tokenizer, model):
     return top_option
 
 @spaces.GPU(duration=120)
-def evaluate_model_accuracy_by_subject(model_name, num_examples):
+def evaluate_model_accuracy_by_subject(model_name):
     try:
         # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -84,12 +86,13 @@ def evaluate_model_accuracy_by_subject(model_name, num_examples):
         else:
             model = model.cpu()
 
-        # Load
-
-
+        # Load fixed questions from JSON file
+        fixed_questions_path = os.path.join(EVAL_RESULTS_PATH, FIXED_QUESTIONS_FILE)
+        if not os.path.exists(fixed_questions_path):
+            return "Fixed questions file not found. Please run the preselection step.", {}
 
-
-
+        with open(fixed_questions_path, 'r') as f:
+            fixed_questions = json.load(f)
 
         # Define prompt template
         template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
@@ -104,23 +107,15 @@ Answer:"""
 
         # Initialize results storage
         subject_results = {}
-
-        subjects = dataset.unique('Subject')
         overall_correct_predictions = 0
         overall_total_questions = 0
 
-        for subject in subjects:
-            subject_data = dataset.filter(lambda x: x['Subject'] == subject)
-
-            # Sample num_examples from each subject
-            if num_examples > 0:
-                subject_data = subject_data.shuffle().select(range(min(num_examples, len(subject_data))))
-
+        for subject, questions in fixed_questions.items():
             correct_predictions = 0
             total_questions = 0
             results = []
 
-            for data in subject_data:
+            for data in questions:
                 # Prepare text input
                 text = prompt_template.format(
                     Question=data['Question'],
@@ -171,8 +166,7 @@ def add_new_eval(
     revision: str,
     precision: str,
     weight_type: str,
-    model_type: str,
-    num_examples: int
+    model_type: str
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -230,7 +224,7 @@ def add_new_eval(
 
     # Now, perform the evaluation
    try:
-        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, num_examples)
+        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model)
         if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
             return styled_error(overall_accuracy)
     except Exception as e:
@@ -239,17 +233,17 @@
     # Prepare results for storage
     results_dict = {
         "config": {
-            "
-            "
-            "
-            "
-            "model_type": model_type,
+            "model": model,
+            "base_model": base_model,
+            "revision": revision,
+            "precision": precision,
             "weight_type": weight_type,
+            "model_type": model_type,
+            "submitted_time": current_time,
             "license": license,
             "likes": model_info.likes,
             "params": model_size,
             "still_on_hub": True,
-            "precision": precision,
         },
         "results": {
             "average": overall_accuracy,
@@ -264,7 +258,7 @@
    # Save results to a JSON file
    results_file_path = f"{EVAL_RESULTS_PATH}/{model.replace('/', '_')}_results.json"
    with open(results_file_path, "w") as f:
-        json.dump(results_dict, f)
+        json.dump(results_dict, f, indent=4)
 
    # Upload the results file
    API.upload_file(
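
Note on the new import: this commit only changes src/submission/submit.py, so the FIXED_QUESTIONS_FILE it now imports has to be defined in src/envs.py by a companion change that is not shown here. A minimal sketch of the assumed constant (the filename is a guess, not taken from this diff):

# src/envs.py -- assumed companion definition, not part of this diff
FIXED_QUESTIONS_FILE = "fixed_questions.json"  # hypothetical filename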
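
The sampling that used to happen inside evaluate_model_accuracy_by_subject (dataset.unique('Subject'), filter, shuffle().select(...)) is gone, and the function now expects a pre-built JSON file mapping each subject to a fixed list of question records. A rough sketch of what the referenced preselection step might look like, assuming the old dataset layout (a 'Subject' column plus per-question fields such as 'Question') and hypothetical values for the dataset id, sample size, and paths:

# preselect_questions.py -- hypothetical helper, not part of this commit.
# Writes the JSON object that fixed_questions.items() iterates over:
# {subject: [question record, ...], ...}
import json
import os

from datasets import load_dataset

EVAL_RESULTS_PATH = "./eval-results"           # assumed to mirror src.envs
FIXED_QUESTIONS_FILE = "fixed_questions.json"  # assumed to mirror src.envs
NUM_EXAMPLES_PER_SUBJECT = 25                  # assumed sample size

dataset = load_dataset("org/mcq-benchmark", split="test")  # placeholder dataset id

fixed_questions = {}
for subject in dataset.unique("Subject"):
    subject_data = dataset.filter(lambda x: x["Subject"] == subject)
    subject_data = subject_data.shuffle(seed=42).select(
        range(min(NUM_EXAMPLES_PER_SUBJECT, len(subject_data)))
    )
    # Keep the full records so data['Question'] and the answer options survive as-is
    fixed_questions[subject] = [dict(row) for row in subject_data]

os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
with open(os.path.join(EVAL_RESULTS_PATH, FIXED_QUESTIONS_FILE), "w") as f:
    json.dump(fixed_questions, f, indent=4)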