Clémentine committed on
Commit 98b4261 · 1 Parent(s): 8fb9957

added a display of errors for users
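
Concretely, the whole body of `add_new_eval` is wrapped in a `try`/`except` so that an unexpected failure is shown to the submitter as a formatted error message instead of an unhandled traceback. A minimal sketch of the shape of that change (the `format_error` body and the `safe_submit` wrapper are illustrative stand-ins, not code from this repo):

```python
# Sketch of the error-display pattern this commit applies to add_new_eval.
# format_error below is a hypothetical stand-in for the leaderboard's own
# helper; only the try/except shape mirrors the actual diff.

def format_error(msg: str) -> str:
    # The real helper presumably styles the message for the Gradio UI;
    # returning a plain string keeps this sketch self-contained.
    return f"ERROR: {msg}"

def safe_submit(handler, *args, **kwargs):
    try:
        return handler(*args, **kwargs)
    except Exception as e:
        # Log server-side for debugging, but show users a stable message
        # rather than a raw traceback.
        print(e)
        return format_error(
            "An error occurred, please open a discussion "
            "and indicate at what time you encountered the error."
        )
```

In the actual commit the function body is simply re-indented one level under `try:` rather than factored into a wrapper.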

Files changed (1): app.py +153 -149
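
Aside from the `try`/`except`, the only other change is the commented-out `LOCAL_DEBUG` expression. The rewritten form is equivalent but defaults the env var to an empty string, so the comparison never involves `None`. For reference, a standalone snippet (hypothetical, not part of app.py) checking the equivalence:

```python
import os

# Old commented form: negate an equality check against the env var.
old_debug = not (os.environ.get("system") == "spaces")
# New commented form: default to "" so the comparison never sees None.
new_debug = os.environ.get("system", "") != "spaces"
assert old_debug == new_debug  # equivalent whether the var is set or not
```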
app.py CHANGED
@@ -35,7 +35,7 @@ ref_level_len = {"validation": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3:
 os.makedirs("scored", exist_ok=True)
 
 # Should be False on spaces and True outside
-LOCAL_DEBUG = False #not (os.environ.get("system") == "spaces")
+LOCAL_DEBUG = False #os.environ.get("system", "") != "spaces"
 
 # Display the results
 eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
@@ -84,164 +84,168 @@ def add_new_eval(
     mail: str,
     profile: gr.OAuthProfile,
 ):
-    # Was the profile created less than 2 month ago?
-    user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
-    creation_date = json.loads(user_data.content)["createdAt"]
-    if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
-        return format_error("This account is not authorized to submit on GAIA.")
-
-
-    contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
-    user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
-    if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
-        return format_error("You already submitted once today, please try again tomorrow.")
-
-
-    is_validation = val_or_test == "validation"
-    # Very basic email parsing
-    _, parsed_mail = parseaddr(mail)
-    if not "@" in parsed_mail:
-        return format_warning("Please provide a valid email adress.")
-
-    print("Adding new eval")
-
-    # Check if the combination model/org already exists and prints a warning message if yes
-    if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
-        return format_warning("This model has been already submitted.")
-
-    if path_to_file is None:
-        return format_warning("Please attach a file.")
-
-    # SAVE UNSCORED SUBMISSION
-    if LOCAL_DEBUG:
-        print("mock uploaded submission")
-    else:
-        api.upload_file(
-            repo_id=SUBMISSION_DATASET,
-            path_or_fileobj=path_to_file.name,
-            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
-            repo_type="dataset",
-            token=TOKEN
-        )
-
-    # SAVE CONTACT
-    contact_info = {
-        "model": model,
-        "model_family": model_family,
-        "url": url,
-        "organisation": organisation,
-        "username": profile.username,
-        "mail": mail,
-        "date": datetime.datetime.today().strftime('%Y-%m-%d')
-    }
-    contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
-    if LOCAL_DEBUG:
-        print("mock uploaded contact info")
-    else:
-        contact_infos.push_to_hub(CONTACT_DATASET, config_name = YEAR_VERSION, token=TOKEN)
-
-    # SCORE SUBMISSION
-    file_path = path_to_file.name
-    scores = {"all": 0, 1: 0, 2: 0, 3: 0}
-    num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
-    task_ids = []
-    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
-        with open(file_path, 'r') as f:
-            for ix, line in enumerate(f):
-                try:
-                    task = json.loads(line)
-                except Exception:
-                    return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
-
-                if "model_answer" not in task:
-                    return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
-                answer = task["model_answer"]
-                task_id = task["task_id"]
-                try:
-                    level = int(gold_results[val_or_test][task_id]["Level"])
-                except KeyError:
-                    return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")
-
-                score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])
-
-                scored_file.write(
-                    json.dumps({
-                        "id": task_id,
-                        "model_answer": answer,
-                        "score": score,
-                        "level": level
-                    }) + "\n"
-                )
-                task_ids.append(task_id)
-
-                scores["all"] += score
-                scores[level] += score
-                num_questions["all"] += 1
-                num_questions[level] += 1
-
-    # Check if there's any duplicate in the submission
-    if len(task_ids) != len(set(task_ids)):
-        return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
-
-    if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
-        return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
-
-    # SAVE SCORED SUBMISSION
-    if LOCAL_DEBUG:
-        print("mock uploaded scored submission")
-    else:
-        api.upload_file(
-            repo_id=SUBMISSION_DATASET,
-            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
-            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
-            repo_type="dataset",
-            token=TOKEN
-        )
-
-    # Save scored file
-    if is_validation:
-        api.upload_file(
-            repo_id=SUBMISSION_DATASET_PUBLIC,
-            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
-            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
-            repo_type="dataset",
-            token=TOKEN
-        )
-
-    # SAVE TO LEADERBOARD DATA
-    eval_entry = {
-        "model": model,
-        "model_family": model_family,
-        "system_prompt": system_prompt,
-        "url": url,
-        "organisation": organisation,
-        "score": scores["all"]/ref_scores_len[val_or_test],
-        "score_level1": scores[1]/num_questions[1],
-        "score_level2": scores[2]/num_questions[2],
-        "score_level3": scores[3]/num_questions[3],
-        "date": datetime.datetime.today().strftime('%Y-%m-%d')
-    }
-    if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
-        return format_error(f"Your submission has {len(scores['all'])} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
-    # Catching spam submissions of 100%
-    if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
-        return format_error(f"There was a problem with your submission. Please open a discussion.")
-
-    # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
-    #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
-    #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
-    #if eval_entry_no_date in eval_results[val_or_test].select_columns(columns_no_date):
-    #    return format_error(f"Your submission is an exact duplicate from an existing submission.")
-
-    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
-    print(eval_results)
-    if LOCAL_DEBUG:
-        print("mock uploaded results to lb")
-    else:
-        eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
-
-
-    return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
+    try:
+        # Was the profile created less than 2 month ago?
+        user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
+        creation_date = json.loads(user_data.content)["createdAt"]
+        if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
+            return format_error("This account is not authorized to submit on GAIA.")
+
+
+        contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
+        user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
+        if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
+            return format_error("You already submitted once today, please try again tomorrow.")
+
+
+        is_validation = val_or_test == "validation"
+        # Very basic email parsing
+        _, parsed_mail = parseaddr(mail)
+        if not "@" in parsed_mail:
+            return format_warning("Please provide a valid email adress.")
+
+        print("Adding new eval")
+
+        # Check if the combination model/org already exists and prints a warning message if yes
+        if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
+            return format_warning("This model has been already submitted.")
+
+        if path_to_file is None:
+            return format_warning("Please attach a file.")
+
+        # SAVE UNSCORED SUBMISSION
+        if LOCAL_DEBUG:
+            print("mock uploaded submission")
+        else:
+            api.upload_file(
+                repo_id=SUBMISSION_DATASET,
+                path_or_fileobj=path_to_file.name,
+                path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
+                repo_type="dataset",
+                token=TOKEN
+            )
+
+        # SAVE CONTACT
+        contact_info = {
+            "model": model,
+            "model_family": model_family,
+            "url": url,
+            "organisation": organisation,
+            "username": profile.username,
+            "mail": mail,
+            "date": datetime.datetime.today().strftime('%Y-%m-%d')
+        }
+        contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
+        if LOCAL_DEBUG:
+            print("mock uploaded contact info")
+        else:
+            contact_infos.push_to_hub(CONTACT_DATASET, config_name = YEAR_VERSION, token=TOKEN)
+
+        # SCORE SUBMISSION
+        file_path = path_to_file.name
+        scores = {"all": 0, 1: 0, 2: 0, 3: 0}
+        num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
+        task_ids = []
+        with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
+            with open(file_path, 'r') as f:
+                for ix, line in enumerate(f):
+                    try:
+                        task = json.loads(line)
+                    except Exception:
+                        return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
+
+                    if "model_answer" not in task:
+                        return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
+                    answer = task["model_answer"]
+                    task_id = task["task_id"]
+                    try:
+                        level = int(gold_results[val_or_test][task_id]["Level"])
+                    except KeyError:
+                        return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")
+
+                    score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])
+
+                    scored_file.write(
+                        json.dumps({
+                            "id": task_id,
+                            "model_answer": answer,
+                            "score": score,
+                            "level": level
+                        }) + "\n"
+                    )
+                    task_ids.append(task_id)
+
+                    scores["all"] += score
+                    scores[level] += score
+                    num_questions["all"] += 1
+                    num_questions[level] += 1
+
+        # Check if there's any duplicate in the submission
+        if len(task_ids) != len(set(task_ids)):
+            return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
+
+        if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
+            return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
+
+        # SAVE SCORED SUBMISSION
+        if LOCAL_DEBUG:
+            print("mock uploaded scored submission")
+        else:
+            api.upload_file(
+                repo_id=SUBMISSION_DATASET,
+                path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+                path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
+                repo_type="dataset",
+                token=TOKEN
+            )
+
+        # Save scored file
+        if is_validation:
+            api.upload_file(
+                repo_id=SUBMISSION_DATASET_PUBLIC,
+                path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+                path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
+                repo_type="dataset",
+                token=TOKEN
+            )
+
+        # SAVE TO LEADERBOARD DATA
+        eval_entry = {
+            "model": model,
+            "model_family": model_family,
+            "system_prompt": system_prompt,
+            "url": url,
+            "organisation": organisation,
+            "score": scores["all"]/ref_scores_len[val_or_test],
+            "score_level1": scores[1]/num_questions[1],
+            "score_level2": scores[2]/num_questions[2],
+            "score_level3": scores[3]/num_questions[3],
+            "date": datetime.datetime.today().strftime('%Y-%m-%d')
+        }
+        if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
+            return format_error(f"Your submission has {len(scores['all'])} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
+        # Catching spam submissions of 100%
+        if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
+            return format_error(f"There was a problem with your submission. Please open a discussion.")
+
+        # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
+        #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
+        #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
+        #if eval_entry_no_date in eval_results[val_or_test].select_columns(columns_no_date):
+        #    return format_error(f"Your submission is an exact duplicate from an existing submission.")
+
+        eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+        print(eval_results)
+        if LOCAL_DEBUG:
+            print("mock uploaded results to lb")
+        else:
+            eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
+
+
+        return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
+    except Exception as e:
+        print(e)
+        return format_error(f"An error occurred, please open a discussion and indicate at what time you encountered the error.\n")
 
 
 def refresh():
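
Since most of the error returns above fire on malformed submission files, a submitter can mirror the same checks locally before uploading. A rough pre-check sketch under the same assumptions as app.py (JSONL with one `task_id` and `model_answer` per line; validation split sizes 53/86/26 taken from `ref_level_len`); the `precheck` helper and its messages are hypothetical, not part of the repo:

```python
import json

# Expected validation-split size, from ref_level_len in app.py: 53 + 86 + 26.
EXPECTED_VALIDATION_TOTAL = 53 + 86 + 26

def precheck(path: str) -> str:
    """Mirror the server-side checks: valid JSONL, required keys, no duplicate task_ids."""
    task_ids = []
    with open(path) as f:
        for ix, line in enumerate(f):
            try:
                task = json.loads(line)
            except json.JSONDecodeError:
                return f"Line {ix} is incorrectly formatted."
            if "model_answer" not in task:
                return f"Line {ix} contains no model_answer key."
            if "task_id" not in task:
                return f"Line {ix} contains no task_id key."
            task_ids.append(task["task_id"])
    if len(task_ids) != len(set(task_ids)):
        return "There are duplicate task_ids in your submission."
    if len(task_ids) != EXPECTED_VALIDATION_TOTAL:
        return f"Expected {EXPECTED_VALIDATION_TOTAL} answers for validation, found {len(task_ids)}."
    return "File looks structurally valid."
```

One side note on the diff itself: the count-mismatch message calls `len(scores['all'])` on an integer, which would raise a TypeError whenever that branch is reached; with this commit, that failure is now caught by the new `except` and surfaced as the generic error message rather than crashing the Space.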