update submit
src/submission/submit.py  (+24, -13)  CHANGED
@@ -12,7 +12,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from langchain.prompts import PromptTemplate
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -71,7 +71,7 @@ def get_top_prediction(text, tokenizer, model):
     return top_option
 
 @spaces.GPU(duration=120)
-def evaluate_model_accuracy_by_subject(model_name):
+def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=30):
     try:
         # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -86,13 +86,12 @@ def evaluate_model_accuracy_by_subject(model_name):
         else:
             model = model.cpu()
 
-        # Load
-
-
-            return "Fixed questions file not found. Please run the preselection step.", {}
+        # Load your custom MMMLU dataset from HuggingFace
+        dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
+        dataset = dataset['test']
 
-
-
+        # Filter out excluded subjects
+        dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
 
         # Define prompt template
         template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
@@ -110,12 +109,24 @@ Answer:"""
         overall_correct_predictions = 0
         overall_total_questions = 0
 
-
+        subjects = dataset.unique('Subject')
+        for subject in subjects:
+            subject_data = dataset.filter(lambda x: x['Subject'] == subject)
+
+            # Sample num_questions_per_subject from each subject
+            if num_questions_per_subject > 0:
+                if len(subject_data) < num_questions_per_subject:
+                    print(f"Warning: Not enough questions for subject '{subject}'. Using all available questions.")
+                    selected_indices = range(len(subject_data))
+                else:
+                    selected_indices = random.sample(range(len(subject_data)), num_questions_per_subject)
+                subject_data = subject_data.select(selected_indices)
+
             correct_predictions = 0
             total_questions = 0
             results = []
 
-            for data in
+            for data in subject_data:
                 # Prepare text input
                 text = prompt_template.format(
                     Question=data['Question'],
@@ -224,7 +235,7 @@ def add_new_eval(
 
     # Now, perform the evaluation
     try:
-        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model)
+        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, num_questions_per_subject=30)
         if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
             return styled_error(overall_accuracy)
     except Exception as e:
@@ -233,7 +244,7 @@ def add_new_eval(
     # Prepare results for storage
     results_dict = {
        "config": {
-            "
+            "model_name": model,
             "base_model": base_model,
             "revision": revision,
             "precision": precision,
@@ -272,4 +283,4 @@ def add_new_eval(
     # Remove the local results file
     os.remove(results_file_path)
 
-    return styled_message("Your model has been evaluated and the results are now on the leaderboard!")
+    return styled_message("Your model has been evaluated and the results are now on the leaderboard!")
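
The new evaluation path drops the pre-selected fixed-questions file and samples questions on the fly: the Arabic_Openai_MMMLU test split is loaded, excluded subjects are filtered out, and up to 30 questions are drawn at random per subject. Below is a minimal standalone sketch of that sampling flow. It assumes the `datasets` library is installed and that `excluded_subjects` is the module-level list defined elsewhere in submit.py (an empty placeholder stands in for it here); the helper name `sample_per_subject` is hypothetical and not part of the commit.

import random

from datasets import load_dataset

# Placeholder for the module-level list defined elsewhere in submit.py.
excluded_subjects = []


def sample_per_subject(num_questions_per_subject=30, seed=None):
    """Return a {subject: Dataset} map with up to N randomly sampled rows each."""
    if seed is not None:
        random.seed(seed)

    # Load the Arabic MMMLU test split and drop excluded subjects.
    dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")["test"]
    dataset = dataset.filter(lambda x: x["Subject"] not in excluded_subjects)

    samples = {}
    for subject in dataset.unique("Subject"):
        subject_data = dataset.filter(lambda x: x["Subject"] == subject)

        if num_questions_per_subject > 0:
            if len(subject_data) < num_questions_per_subject:
                # Not enough questions for this subject: keep everything
                # (the diff prints a warning at this point).
                selected_indices = range(len(subject_data))
            else:
                selected_indices = random.sample(range(len(subject_data)), num_questions_per_subject)
            subject_data = subject_data.select(selected_indices)

        samples[subject] = subject_data
    return samples

Capping the sample at 30 questions per subject bounds the total number of forward passes, which presumably keeps each submission inside the `@spaces.GPU(duration=120)` budget while still covering every subject.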
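
For reference, the prompt itself is built with langchain's `PromptTemplate` (imported at the top of the file) and filled per question via `prompt_template.format(...)`. The hunk above cuts off after `Question=data['Question']`, so the remaining format arguments and the middle of the template in the sketch below are assumptions based on the MMMLU column names (A-D), not part of this commit.

from langchain.prompts import PromptTemplate

# The opening line and the trailing "Answer:" come from the diff; the
# {Question}/{A}..{D} placeholder block in between is an assumed layout.
template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].

Question: {Question}
A) {A}
B) {B}
C) {C}
D) {D}

Answer:"""

prompt_template = PromptTemplate(
    template=template,
    input_variables=["Question", "A", "B", "C", "D"],
)

# `data` would be one row of the sampled subject data.
data = {"Question": "...", "A": "...", "B": "...", "C": "...", "D": "..."}
text = prompt_template.format(
    Question=data["Question"],
    A=data["A"],
    B=data["B"],
    C=data["C"],
    D=data["D"],
)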