not-lain committed · Commit 06b78c4 · verified · 1 parent: 50e0fff

update dataset upload method to include users with dashes in their usernames


Currently we can't create dataset splits for users with dashes in their usernames; usernames that do not work with the previous implementation include `not-lain`, `julien-c`, etc.
By switching to this implementation we instead upload one file called `[username].parquet` per user. I also made it compatible with the dataset viewer (see the example [dataset](https://huggingface.co/datasets/not-lain/testing-my-upload) and example [space](https://huggingface.co/spaces/not-lain/unit_1_quiz)).
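For context, the failure in the old approach comes from `datasets` validating split names (roughly word characters only), so pushing a split named after a dashed username raises a `ValueError`. A minimal sketch of the difference, with illustrative repo names:

```python
from datasets import Dataset

answers = Dataset.from_list([{"question": "q1", "answer": "a", "grade": 0.9}])

# Old approach: one split per user. Split names may only contain word
# characters, so usernames like "not-lain" are rejected with a ValueError:
# answers.push_to_hub("agents-course/unit_1_quiz_student_responses", split="not-lain")

# New approach: no splits at all; each user gets their own parquet file
# (see data_to_parquet.to_parquet in this commit), e.g. "not-lain.parquet".
```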

cc @burtenshaw for review

Files changed (2)
  1. app.py +21 -52
  2. data_to_parquet.py +52 -0
app.py CHANGED
@@ -1,14 +1,12 @@
 import os
-from datetime import datetime
 import random
 
-import pandas as pd
-from huggingface_hub import HfApi, hf_hub_download, Repository
-from huggingface_hub.repocard import metadata_load
+from huggingface_hub import HfApi, whoami
 
 import gradio as gr
-from datasets import load_dataset, Dataset
-from huggingface_hub import whoami
+from datasets import load_dataset
+
+from data_to_parquet import to_parquet
 
 EXAM_DATASET_ID = os.getenv("EXAM_DATASET_ID") or "agents-course/unit_1_quiz"
 EXAM_MAX_QUESTIONS = os.getenv("EXAM_MAX_QUESTIONS") or 10
@@ -16,13 +14,7 @@ EXAM_PASSING_SCORE = os.getenv("EXAM_PASSING_SCORE") or 0.7
 
 ds = load_dataset(EXAM_DATASET_ID, split="train")
 
-DATASET_REPO_URL = "https://huggingface.co/datasets/agents-course/certificates"
-CERTIFIED_USERS_FILENAME = "certified_students.csv"
-CERTIFIED_USERS_DIR = "certificates"
-repo = Repository(
-    local_dir=CERTIFIED_USERS_DIR, clone_from=DATASET_REPO_URL, use_auth_token=os.getenv("HF_TOKEN")
-)
-
+upload_api = HfApi(token=os.getenv("HF_TOKEN"))
 # Convert dataset to a list of dicts and randomly sort
 quiz_data = ds.to_pandas().to_dict("records")
 random.shuffle(quiz_data)
@@ -66,24 +58,6 @@ def on_user_logged_in(token: gr.OAuthToken | None):
         None,  # no token
     ]
 
-def add_certified_user(hf_username, pass_percentage, submission_time):
-    """
-    Add the certified user to the database
-    """
-    print("ADD CERTIFIED USER")
-    repo.git_pull()
-    history = pd.read_csv(os.path.join(CERTIFIED_USERS_DIR, CERTIFIED_USERS_FILENAME))
-
-    # Check if this hf_username is already in our dataset:
-    check = history.loc[history['hf_username'] == hf_username]
-    if not check.empty:
-        history = history.drop(labels=check.index[0], axis=0)
-
-    new_row = pd.DataFrame({'hf_username': hf_username, 'pass_percentage': pass_percentage, 'datetime': submission_time}, index=[0])
-    history = pd.concat([new_row, history[:]]).reset_index(drop=True)
-
-    history.to_csv(os.path.join(CERTIFIED_USERS_DIR, CERTIFIED_USERS_FILENAME), index=False)
-    repo.push_to_hub(commit_message="Update certified users list")
 
 def push_results_to_hub(user_answers, token: gr.OAuthToken | None):
     """
@@ -103,33 +77,28 @@ def push_results_to_hub(user_answers, token: gr.OAuthToken | None):
         gr.Warning(
             f"Score {grade:.1%} below passing threshold of {float(EXAM_PASSING_SCORE):.1%}"
         )
-        return f"You scored {grade:.1%}. Please try again to achieve at least {float(EXAM_PASSING_SCORE):.1%}"
+        return  # do not continue
 
     gr.Info("Submitting answers to the Hub. Please wait...", duration=2)
 
     user_info = whoami(token=token.token)
-    repo_id = f"{EXAM_DATASET_ID}_student_responses"
-    submission_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-    new_ds = Dataset.from_list(user_answers)
-    new_ds = new_ds.map(
-        lambda x: {
-            "username": user_info["name"],
-            "datetime": submission_time,
-            "grade": grade,
-        }
+    # TODO:
+    # check if username already has "username.parquet" in the dataset and download that (or read values directly from dataset viewer if possible)
+    # instead of replacing the values check if the new score is better than the old one
+    to_parquet(
+        upload_api,  # api
+        "agents-course/students-data",  # repo_id
+        user_info["name"],  # username
+        grade,  # unit1 score
+        0.0,  # unit2 score
+        0.0,  # unit3 score
+        0.0,  # unit4 score
+        0,  # already certified or not
     )
-    new_ds.push_to_hub(repo_id=repo_id, split=user_info["name"])
-
-    # I'm adding a csv version
-    # The idea, if the user passed, we create a simple row in a csv
-    print("ADD CERTIFIED USER")
-    # Add this user to our database
-    add_certified_user(user_info["name"], grade, submission_time)
-
-    return f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"
-
 
+    gr.Success(
+        f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"
+    )
 
 
 def handle_quiz(question_idx, user_answers, selected_answer, is_start):
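A possible follow-up for the TODO above (not part of this commit): download the user's existing `username.parquet`, if any, and only keep the better unit 1 score. A rough sketch, with a hypothetical helper name and the repo id used above:

```python
import pandas as pd
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError


def best_unit1_score(username: str, new_grade: float, repo_id: str = "agents-course/students-data") -> float:
    """Return the better of the stored and freshly earned unit 1 score (as a fraction)."""
    try:
        # Each user has their own parquet file at the repo root.
        path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=f"{username}.parquet")
    except EntryNotFoundError:
        return new_grade  # first submission for this user
    previous = pd.read_parquet(path)["unit1"].iloc[0] / 100  # to_parquet stores percentages
    return max(previous, new_grade)
```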
data_to_parquet.py ADDED
@@ -0,0 +1,52 @@
+import pyarrow as pa
+import pyarrow.parquet as pq
+import json
+import tempfile
+
+
+# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
+schema = {
+    "username": {"_type": "Value", "dtype": "string"},
+    "unit1": {"_type": "Value", "dtype": "float64"},
+    "unit2": {"_type": "Value", "dtype": "float64"},
+    "unit3": {"_type": "Value", "dtype": "float64"},
+    "unit4": {"_type": "Value", "dtype": "float64"},
+    "certified": {"_type": "Value", "dtype": "int64"},
+}
+
+
+def to_parquet(
+    api,
+    repo: str,
+    username: str = "",
+    unit1: float = 0.0,
+    unit2: float = 0.0,
+    unit3: float = 0.0,
+    unit4: float = 0.0,
+    certified: int = 0,
+):
+    data = {
+        "username": username,
+        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
+        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
+        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
+        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
+        "certified": certified,
+    }
+    # Export data to Arrow format
+    table = pa.Table.from_pylist([data])
+    # Add metadata (used by datasets library)
+    table = table.replace_schema_metadata(
+        {"huggingface": json.dumps({"info": {"features": schema}})}
+    )
+    # Write to parquet file
+    archive_file = tempfile.NamedTemporaryFile(delete=False)
+    pq.write_table(table, archive_file.name)
+    archive_file.close()
+
+    api.upload_file(
+        repo_id=repo,  # manually created repo
+        repo_type="dataset",
+        path_in_repo=f"{username}.parquet",  # each user will have their own parquet
+        path_or_fileobj=archive_file.name,
+    )
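Since every `username.parquet` carries the `huggingface` schema metadata and sits at the repo root, the files are picked up by the dataset viewer and can be loaded back as a single dataset. A quick sanity check, assuming read access to the repo:

```python
from datasets import load_dataset

# All per-user parquet files are combined into one "train" split.
students = load_dataset("agents-course/students-data", split="train")
print(students.features)  # username, unit1..unit4, certified
print(students.filter(lambda row: row["username"] == "not-lain")[0])
```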