Spaces:

Omartificial-Intelligence-Space
/

Arabic-MMMLU-Leaderborad

Running

App Files Files Community

Omartificial-Intelligence-Space committed on Oct 2, 2024

Commit

7cbbcca

verified ·

1 Parent(s): 9535ee7

update submit

Browse files

Files changed (1) hide show

src/submission/submit.py +21 -45

src/submission/submit.py CHANGED Viewed

@@ -51,10 +51,8 @@ def get_top_prediction(text, tokenizer, model):
     options = [' A', ' B', ' C', ' D']
     option_logits = []
-    # Iterate through each option
     for option in options:
         option_ids = tokenizer(option).input_ids
-        # Ensure option_ids are within range and not empty
         if option_ids and option_ids[-1] < logits.size(0):
             option_id = option_ids[-1]
             option_logit = logits[option_id]
@@ -65,14 +63,12 @@ def get_top_prediction(text, tokenizer, model):
     if not option_logits:
         return "No valid options"
-    # Get the option with the highest logit
     top_option = max(option_logits, key=lambda x: x[0])[1]
     return top_option
 @spaces.GPU(duration=120)
 def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=100):
     try:
-        # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         tokenizer.pad_token = tokenizer.eos_token
@@ -85,14 +81,11 @@ def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=100
         else:
             model = model.cpu()
-        # Load your custom MMMLU dataset from HuggingFace
         dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
-        dataset = dataset['test'[:100]]
-        # Filter out excluded subjects
         dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
-        # Define prompt template
         template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
 Question: {Question}
 A) {A}
@@ -103,16 +96,18 @@ Answer:"""
         prompt_template = PromptTemplate(template=template, input_variables=['Question', 'A', 'B', 'C', 'D'])
-        # Initialize results storage
         subject_results = {}
         overall_correct_predictions = 0
         overall_total_questions = 0
         subjects = dataset.unique('Subject')
         for subject in subjects:
             subject_data = dataset.filter(lambda x: x['Subject'] == subject)
-            # Sample num_questions_per_subject from each subject
             if num_questions_per_subject > 0:
                 if len(subject_data) < num_questions_per_subject:
                     print(f"Warning: Not enough questions for subject '{subject}'. Using all available questions.")
@@ -126,7 +121,6 @@ Answer:"""
             results = []
             for data in subject_data:
-                # Prepare text input
                 text = prompt_template.format(
                     Question=data['Question'],
                     A=data['A'],
@@ -135,7 +129,6 @@ Answer:"""
                     D=data['D']
                 )
-                # Get the top prediction
                 top_prediction = get_top_prediction(text, tokenizer, model)
                 is_correct = (top_prediction == data['Answer'])
                 correct_predictions += int(is_correct)
@@ -152,7 +145,11 @@ Answer:"""
             accuracy = correct_predictions / total_questions if total_questions > 0 else 0
-            # Store results for this subject
             subject_results[subject] = {
                 'Correct Predictions': correct_predictions,
                 'Total Questions': total_questions,
@@ -162,13 +159,18 @@ Answer:"""
         overall_accuracy = (overall_correct_predictions / overall_total_questions) * 100 if overall_total_questions > 0 else 0
-        return overall_accuracy, subject_results
     except Exception as e:
         import traceback
         tb = traceback.format_exc()
         print(f"Error in evaluate_model_accuracy_by_subject: {e}\n{tb}")
-        return f"Error: {str(e)}", {}
 def add_new_eval(
     model: str,
@@ -199,7 +201,6 @@ def add_new_eval(
     if revision == "":
         revision = "main"
-    # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
         base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
         if not base_model_on_hub:
@@ -210,36 +211,16 @@ def add_new_eval(
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
-    # Is the model info correctly filled?
-    try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-    model_size = get_model_size(model_info=model_info, precision=precision)
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model")
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
-    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
-    # Now, perform the evaluation
     try:
-        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, num_questions_per_subject=100)
         if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
             return styled_error(overall_accuracy)
     except Exception as e:
         return styled_error(f"An error occurred during evaluation: {str(e)}")
     # Prepare results for storage
     results_dict = {
         "config": {
@@ -250,10 +231,6 @@ def add_new_eval(
             "weight_type": weight_type,
             "model_type": model_type,
             "submitted_time": current_time,
-            "license": license,
-            "likes": model_info.likes,
-            "params": model_size,
-            "still_on_hub": True,
         },
         "results": {
             "average": overall_accuracy,
@@ -279,7 +256,6 @@ def add_new_eval(
         commit_message=f"Add results for {model}"
     )
-    # Remove the local results file
     os.remove(results_file_path)
     return styled_message("Your model has been evaluated and the results are now on the leaderboard!")

     options = [' A', ' B', ' C', ' D']
     option_logits = []
     for option in options:
         option_ids = tokenizer(option).input_ids
         if option_ids and option_ids[-1] < logits.size(0):
             option_id = option_ids[-1]
             option_logit = logits[option_id]
     if not option_logits:
         return "No valid options"
     top_option = max(option_logits, key=lambda x: x[0])[1]
     return top_option
 @spaces.GPU(duration=120)
 def evaluate_model_accuracy_by_subject(model_name, num_questions_per_subject=100):
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         tokenizer.pad_token = tokenizer.eos_token
         else:
             model = model.cpu()
         dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
+        dataset = dataset['test']
         dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
         template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
 Question: {Question}
 A) {A}
         prompt_template = PromptTemplate(template=template, input_variables=['Question', 'A', 'B', 'C', 'D'])
         subject_results = {}
         overall_correct_predictions = 0
         overall_total_questions = 0
         subjects = dataset.unique('Subject')
+        # To track best performance per subject
+        best_in_class = {subject: {"model_name": None, "accuracy": 0} for subject in subjects}
         for subject in subjects:
             subject_data = dataset.filter(lambda x: x['Subject'] == subject)
             if num_questions_per_subject > 0:
                 if len(subject_data) < num_questions_per_subject:
                     print(f"Warning: Not enough questions for subject '{subject}'. Using all available questions.")
             results = []
             for data in subject_data:
                 text = prompt_template.format(
                     Question=data['Question'],
                     A=data['A'],
                     D=data['D']
                 )
                 top_prediction = get_top_prediction(text, tokenizer, model)
                 is_correct = (top_prediction == data['Answer'])
                 correct_predictions += int(is_correct)
             accuracy = correct_predictions / total_questions if total_questions > 0 else 0
+            # Check if this model is the best for the current subject
+            if accuracy > best_in_class[subject]['accuracy']:
+                best_in_class[subject]['model_name'] = model_name
+                best_in_class[subject]['accuracy'] = accuracy
             subject_results[subject] = {
                 'Correct Predictions': correct_predictions,
                 'Total Questions': total_questions,
         overall_accuracy = (overall_correct_predictions / overall_total_questions) * 100 if overall_total_questions > 0 else 0
+        return overall_accuracy, subject_results, best_in_class
     except Exception as e:
         import traceback
         tb = traceback.format_exc()
         print(f"Error in evaluate_model_accuracy_by_subject: {e}\n{tb}")
+        return f"Error: {str(e)}", {}, {}
+def display_best_in_class(best_in_class):
+    print("\nBest Model in Each Subject:\n")
+    for subject, info in best_in_class.items():
+        print(f"{subject}: {info['model_name']} with accuracy: {info['accuracy'] * 100:.2f}%")
 def add_new_eval(
     model: str,
     if revision == "":
         revision = "main"
     if weight_type in ["Delta", "Adapter"]:
         base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
         if not base_model_on_hub:
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
     try:
+        overall_accuracy, subject_results, best_in_class = evaluate_model_accuracy_by_subject(model, num_questions_per_subject=100)
         if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
             return styled_error(overall_accuracy)
     except Exception as e:
         return styled_error(f"An error occurred during evaluation: {str(e)}")
+    # Display the best in class results
+    display_best_in_class(best_in_class)
     # Prepare results for storage
     results_dict = {
         "config": {
             "weight_type": weight_type,
             "model_type": model_type,
             "submitted_time": current_time,
         },
         "results": {
             "average": overall_accuracy,
         commit_message=f"Add results for {model}"
     )
     os.remove(results_file_path)
     return styled_message("Your model has been evaluated and the results are now on the leaderboard!")