Clémentine committed on
Commit 98b4261 · 1 Parent(s): 8fb9957

added a display of errors for users
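
Concretely, the whole body of `add_new_eval` is wrapped in a `try`/`except` so that an unexpected failure is shown to the submitter as a formatted error message instead of an unhandled traceback. A minimal sketch of the shape of that change (the `format_error` body and the `safe_submit` wrapper are illustrative stand-ins, not code from this repo):

```python
# Sketch of the error-display pattern this commit applies to add_new_eval.
# format_error below is a hypothetical stand-in for the leaderboard's own
# helper; only the try/except shape mirrors the actual diff.

def format_error(msg: str) -> str:
    # The real helper presumably styles the message for the Gradio UI;
    # returning a plain string keeps this sketch self-contained.
    return f"ERROR: {msg}"

def safe_submit(handler, *args, **kwargs):
    try:
        return handler(*args, **kwargs)
    except Exception as e:
        # Log server-side for debugging, but show users a stable message
        # rather than a raw traceback.
        print(e)
        return format_error(
            "An error occurred, please open a discussion "
            "and indicate at what time you encountered the error."
        )
```

In the actual commit the function body is simply re-indented one level under `try:` rather than factored into a wrapper.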

Files changed (1): app.py +153 -149
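
Aside from the `try`/`except`, the only other change is the commented-out `LOCAL_DEBUG` expression. The rewritten form is equivalent but defaults the env var to an empty string, so the comparison never involves `None`. For reference, a standalone snippet (hypothetical, not part of app.py) checking the equivalence:

```python
import os

# Old commented form: negate an equality check against the env var.
old_debug = not (os.environ.get("system") == "spaces")
# New commented form: default to "" so the comparison never sees None.
new_debug = os.environ.get("system", "") != "spaces"
assert old_debug == new_debug  # equivalent whether the var is set or not
```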
app.py CHANGED
@@ -35,7 +35,7 @@ ref_level_len = {"validation": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3:
 os.makedirs("scored", exist_ok=True)
 
 # Should be False on spaces and True outside
-LOCAL_DEBUG = False #not (os.environ.get("system") == "spaces")
+LOCAL_DEBUG = False #os.environ.get("system", "") != "spaces"
 
 # Display the results
 eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
@@ -84,164 +84,168 @@ def add_new_eval(
     mail: str,
     profile: gr.OAuthProfile,
 ):
-    # Was the profile created less than 2 month ago?
-    user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
-    creation_date = json.loads(user_data.content)["createdAt"]
-    if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
-        return format_error("This account is not authorized to submit on GAIA.")
-
-
-    contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
-    user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
-    if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
-        return format_error("You already submitted once today, please try again tomorrow.")
-
-
-    is_validation = val_or_test == "validation"
-    # Very basic email parsing
-    _, parsed_mail = parseaddr(mail)
-    if not "@" in parsed_mail:
-        return format_warning("Please provide a valid email adress.")
-
-    print("Adding new eval")
-
-    # Check if the combination model/org already exists and prints a warning message if yes
-    if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
-        return format_warning("This model has been already submitted.")
-
-    if path_to_file is None:
-        return format_warning("Please attach a file.")
-
-    # SAVE UNSCORED SUBMISSION
-    if LOCAL_DEBUG:
-        print("mock uploaded submission")
-    else:
-        api.upload_file(
-            repo_id=SUBMISSION_DATASET,
-            path_or_fileobj=path_to_file.name,
-            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
-            repo_type="dataset",
-            token=TOKEN
-        )
-
-    # SAVE CONTACT
-    contact_info = {
-        "model": model,
-        "model_family": model_family,
-        "url": url,
-        "organisation": organisation,
-        "username": profile.username,
-        "mail": mail,
-        "date": datetime.datetime.today().strftime('%Y-%m-%d')
-    }
-    contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
-    if LOCAL_DEBUG:
-        print("mock uploaded contact info")
-    else:
-        contact_infos.push_to_hub(CONTACT_DATASET, config_name = YEAR_VERSION, token=TOKEN)
-
-    # SCORE SUBMISSION
-    file_path = path_to_file.name
-    scores = {"all": 0, 1: 0, 2: 0, 3: 0}
-    num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
-    task_ids = []
-    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
-        with open(file_path, 'r') as f:
-            for ix, line in enumerate(f):
-                try:
-                    task = json.loads(line)
-                except Exception:
-                    return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
-
-                if "model_answer" not in task:
-                    return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
-                answer = task["model_answer"]
-                task_id = task["task_id"]
-                try:
-                    level = int(gold_results[val_or_test][task_id]["Level"])
-                except KeyError:
-                    return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")
-
-                score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])
-
-                scored_file.write(
-                    json.dumps({
-                        "id": task_id,
-                        "model_answer": answer,
-                        "score": score,
-                        "level": level
-                    }) + "\n"
-                )
-                task_ids.append(task_id)
-
-                scores["all"] += score
-                scores[level] += score
-                num_questions["all"] += 1
-                num_questions[level] += 1
-
-    # Check if there's any duplicate in the submission
-    if len(task_ids) != len(set(task_ids)):
-        return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
-
-    if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
-        return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
-
-    # SAVE SCORED SUBMISSION
-    if LOCAL_DEBUG:
-        print("mock uploaded scored submission")
-    else:
-        api.upload_file(
-            repo_id=SUBMISSION_DATASET,
-            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
-            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
-            repo_type="dataset",
-            token=TOKEN
-        )
-
-    # Save scored file
-    if is_validation:
-        api.upload_file(
-            repo_id=SUBMISSION_DATASET_PUBLIC,
-            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
-            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
-            repo_type="dataset",
-            token=TOKEN
-        )
-
-    # SAVE TO LEADERBOARD DATA
-    eval_entry = {
-        "model": model,
-        "model_family": model_family,
-        "system_prompt": system_prompt,
-        "url": url,
-        "organisation": organisation,
-        "score": scores["all"]/ref_scores_len[val_or_test],
-        "score_level1": scores[1]/num_questions[1],
-        "score_level2": scores[2]/num_questions[2],
-        "score_level3": scores[3]/num_questions[3],
-        "date": datetime.datetime.today().strftime('%Y-%m-%d')
-    }
-    if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
-        return format_error(f"Your submission has {len(scores['all'])} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
-    # Catching spam submissions of 100%
-    if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
-        return format_error(f"There was a problem with your submission. Please open a discussion.")
-
-    # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
-    #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
-    #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
-    #if eval_entry_no_date in eval_results[val_or_test].select_columns(columns_no_date):
-    #    return format_error(f"Your submission is an exact duplicate from an existing submission.")
-
-    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
-    print(eval_results)
-    if LOCAL_DEBUG:
-        print("mock uploaded results to lb")
-    else:
-        eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
-
-
-    return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
+    try:
+        # Was the profile created less than 2 month ago?
+        user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
+        creation_date = json.loads(user_data.content)["createdAt"]
+        if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
+            return format_error("This account is not authorized to submit on GAIA.")
+
+
+        contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
+        user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
+        if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
+            return format_error("You already submitted once today, please try again tomorrow.")
+
+
+        is_validation = val_or_test == "validation"
+        # Very basic email parsing
+        _, parsed_mail = parseaddr(mail)
+        if not "@" in parsed_mail:
+            return format_warning("Please provide a valid email adress.")
+
+        print("Adding new eval")
+
+        # Check if the combination model/org already exists and prints a warning message if yes
+        if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
+            return format_warning("This model has been already submitted.")
+
+        if path_to_file is None:
+            return format_warning("Please attach a file.")
+
+        # SAVE UNSCORED SUBMISSION
+        if LOCAL_DEBUG:
+            print("mock uploaded submission")
+        else:
+            api.upload_file(
+                repo_id=SUBMISSION_DATASET,
+                path_or_fileobj=path_to_file.name,
+                path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
+                repo_type="dataset",
+                token=TOKEN
+            )
+
+        # SAVE CONTACT
+        contact_info = {
+            "model": model,
+            "model_family": model_family,
+            "url": url,
+            "organisation": organisation,
+            "username": profile.username,
+            "mail": mail,
+            "date": datetime.datetime.today().strftime('%Y-%m-%d')
+        }
+        contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
+        if LOCAL_DEBUG:
+            print("mock uploaded contact info")
+        else:
+            contact_infos.push_to_hub(CONTACT_DATASET, config_name = YEAR_VERSION, token=TOKEN)
+
+        # SCORE SUBMISSION
+        file_path = path_to_file.name
+        scores = {"all": 0, 1: 0, 2: 0, 3: 0}
+        num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
+        task_ids = []
+        with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
+            with open(file_path, 'r') as f:
+                for ix, line in enumerate(f):
+                    try:
+                        task = json.loads(line)
+                    except Exception:
+                        return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
+
+                    if "model_answer" not in task:
+                        return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
+                    answer = task["model_answer"]
+                    task_id = task["task_id"]
+                    try:
+                        level = int(gold_results[val_or_test][task_id]["Level"])
+                    except KeyError:
+                        return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")
+
+                    score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])
+
+                    scored_file.write(
+                        json.dumps({
+                            "id": task_id,
+                            "model_answer": answer,
+                            "score": score,
+                            "level": level
+                        }) + "\n"
+                    )
+                    task_ids.append(task_id)
+
+                    scores["all"] += score
+                    scores[level] += score
+                    num_questions["all"] += 1
+                    num_questions[level] += 1
+
+        # Check if there's any duplicate in the submission
+        if len(task_ids) != len(set(task_ids)):
+            return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
+
+        if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
+            return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
+
+        # SAVE SCORED SUBMISSION
+        if LOCAL_DEBUG:
+            print("mock uploaded scored submission")
+        else:
+            api.upload_file(
+                repo_id=SUBMISSION_DATASET,
+                path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+                path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
+                repo_type="dataset",
+                token=TOKEN
+            )
+
+        # Save scored file
+        if is_validation:
+            api.upload_file(
+                repo_id=SUBMISSION_DATASET_PUBLIC,
+                path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+                path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
+                repo_type="dataset",
+                token=TOKEN
+            )
+
+        # SAVE TO LEADERBOARD DATA
+        eval_entry = {
+            "model": model,
+            "model_family": model_family,
+            "system_prompt": system_prompt,
+            "url": url,
+            "organisation": organisation,
+            "score": scores["all"]/ref_scores_len[val_or_test],
+            "score_level1": scores[1]/num_questions[1],
+            "score_level2": scores[2]/num_questions[2],
+            "score_level3": scores[3]/num_questions[3],
+            "date": datetime.datetime.today().strftime('%Y-%m-%d')
+        }
+        if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
+            return format_error(f"Your submission has {len(scores['all'])} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
+        # Catching spam submissions of 100%
+        if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
+            return format_error(f"There was a problem with your submission. Please open a discussion.")
+
+        # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
+        #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
+        #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
+        #if eval_entry_no_date in eval_results[val_or_test].select_columns(columns_no_date):
+        #    return format_error(f"Your submission is an exact duplicate from an existing submission.")
+
+        eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+        print(eval_results)
+        if LOCAL_DEBUG:
+            print("mock uploaded results to lb")
+        else:
+            eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
+
+
+        return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
+    except Exception as e:
+        print(e)
+        return format_error(f"An error occurred, please open a discussion and indicate at what time you encountered the error.\n")
 
 
 def refresh():
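
Since most of the error returns above fire on malformed submission files, a submitter can mirror the same checks locally before uploading. A rough pre-check sketch under the same assumptions as app.py (JSONL with one `task_id` and `model_answer` per line; validation split sizes 53/86/26 taken from `ref_level_len`); the `precheck` helper and its messages are hypothetical, not part of the repo:

```python
import json

# Expected validation-split size, from ref_level_len in app.py: 53 + 86 + 26.
EXPECTED_VALIDATION_TOTAL = 53 + 86 + 26

def precheck(path: str) -> str:
    """Mirror the server-side checks: valid JSONL, required keys, no duplicate task_ids."""
    task_ids = []
    with open(path) as f:
        for ix, line in enumerate(f):
            try:
                task = json.loads(line)
            except json.JSONDecodeError:
                return f"Line {ix} is incorrectly formatted."
            if "model_answer" not in task:
                return f"Line {ix} contains no model_answer key."
            if "task_id" not in task:
                return f"Line {ix} contains no task_id key."
            task_ids.append(task["task_id"])
    if len(task_ids) != len(set(task_ids)):
        return "There are duplicate task_ids in your submission."
    if len(task_ids) != EXPECTED_VALIDATION_TOTAL:
        return f"Expected {EXPECTED_VALIDATION_TOTAL} answers for validation, found {len(task_ids)}."
    return "File looks structurally valid."
```

One side note on the diff itself: the count-mismatch message calls `len(scores['all'])` on an integer, which would raise a TypeError whenever that branch is reached; with this commit, that failure is now caught by the new `except` and surfaced as the generic error message rather than crashing the Space.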