Spaces:

Omartificial-Intelligence-Space
/

Arabic-MMMLU-Leaderborad

Running

Omartificial-Intelligence-Space committed on Sep 26, 2024

Commit

9d42a0b

verified ·

1 Parent(s): 22c3520

update submit

Files changed (1) hide show

src/submission/submit.py CHANGED Viewed

@@ -70,7 +70,7 @@ def get_top_prediction(text, tokenizer, model):
     return top_option
 @spaces.GPU(duration=120)
-def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=30):
     try:
         # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -87,7 +87,7 @@ def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=30)
         # Load your custom MMMLU dataset from HuggingFace
         dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
-        dataset = dataset['test']
         # Filter out excluded subjects
         dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
@@ -234,7 +234,7 @@ def add_new_eval(
     # Now, perform the evaluation
     try:
-        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, num_questions_per_subject=30)
         if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
             return styled_error(overall_accuracy)
     except Exception as e:

     return top_option
 @spaces.GPU(duration=120)
+def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=100):
     try:
         # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         # Load your custom MMMLU dataset from HuggingFace
         dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
+        dataset = dataset['test'[:100]]
         # Filter out excluded subjects
         dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
     # Now, perform the evaluation
     try:
+        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, num_questions_per_subject=100)
         if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
             return styled_error(overall_accuracy)
     except Exception as e: