update dataset upload method to include users with dashes in their usernames
Currently you can't create splits for users with dashes in their usernames, because dataset split names only allow word characters; usernames such as `not-lain`, `julien-c`, etc. do not work with the previous implementation.
By switching to this implementation we instead upload a file called `[username].parquet` for each user, so the username never has to be a valid split name. I also made it compatible with the dataset viewer (see the example [dataset](https://huggingface.co/datasets/not-lain/testing-my-upload) and example [space](https://huggingface.co/spaces/not-lain/unit_1_quiz)); a short loading sketch follows below.
cc @burtenshaw for review
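As a quick check of the dataset-viewer claim, here is a minimal loading sketch (assuming the demo dataset linked above): because every user gets their own parquet file rather than their own split, `datasets` reads all the files into the default `train` split, and usernames never have to satisfy split-name rules.

```python
from datasets import load_dataset

# All per-user parquet files in the repo are merged into one "train" split,
# so dashed usernames like "not-lain" no longer cause errors.
ds = load_dataset("not-lain/testing-my-upload", split="train")
print(ds)
```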
- app.py +21 -52
- data_to_parquet.py +52 -0
app.py CHANGED
```diff
@@ -1,14 +1,12 @@
 import os
-from datetime import datetime
 import random
 
-import pandas as pd
-from huggingface_hub import HfApi, hf_hub_download, Repository
-from huggingface_hub.repocard import metadata_load
+from huggingface_hub import HfApi, whoami
 
 import gradio as gr
-from datasets import load_dataset
-
+from datasets import load_dataset
+
+from data_to_parquet import to_parquet
 
 EXAM_DATASET_ID = os.getenv("EXAM_DATASET_ID") or "agents-course/unit_1_quiz"
 EXAM_MAX_QUESTIONS = os.getenv("EXAM_MAX_QUESTIONS") or 10
@@ -16,13 +14,7 @@ EXAM_PASSING_SCORE = os.getenv("EXAM_PASSING_SCORE") or 0.7
 
 ds = load_dataset(EXAM_DATASET_ID, split="train")
 
-
-CERTIFIED_USERS_FILENAME = "certified_students.csv"
-CERTIFIED_USERS_DIR = "certificates"
-repo = Repository(
-    local_dir=CERTIFIED_USERS_DIR, clone_from=DATASET_REPO_URL, use_auth_token=os.getenv("HF_TOKEN")
-)
-
+upload_api = HfApi(token=os.getenv("HF_TOKEN"))
 # Convert dataset to a list of dicts and randomly sort
 quiz_data = ds.to_pandas().to_dict("records")
 random.shuffle(quiz_data)
@@ -66,24 +58,6 @@ def on_user_logged_in(token: gr.OAuthToken | None):
         None,  # no token
     ]
 
-def add_certified_user(hf_username, pass_percentage, submission_time):
-    """
-    Add the certified user to the database
-    """
-    print("ADD CERTIFIED USER")
-    repo.git_pull()
-    history = pd.read_csv(os.path.join(CERTIFIED_USERS_DIR, CERTIFIED_USERS_FILENAME))
-
-    # Check if this hf_username is already in our dataset:
-    check = history.loc[history['hf_username'] == hf_username]
-    if not check.empty:
-        history = history.drop(labels=check.index[0], axis=0)
-
-    new_row = pd.DataFrame({'hf_username': hf_username, 'pass_percentage': pass_percentage, 'datetime': submission_time}, index=[0])
-    history = pd.concat([new_row, history[:]]).reset_index(drop=True)
-
-    history.to_csv(os.path.join(CERTIFIED_USERS_DIR, CERTIFIED_USERS_FILENAME), index=False)
-    repo.push_to_hub(commit_message="Update certified users list")
 
 def push_results_to_hub(user_answers, token: gr.OAuthToken | None):
     """
@@ -103,33 +77,28 @@ def push_results_to_hub(user_answers, token: gr.OAuthToken | None):
         gr.Warning(
             f"Score {grade:.1%} below passing threshold of {float(EXAM_PASSING_SCORE):.1%}"
         )
-        return
+        return  # do not continue
 
     gr.Info("Submitting answers to the Hub. Please wait...", duration=2)
 
     user_info = whoami(token=token.token)
-    [… ten removed lines that built the old per-user `new_ds` results dataset; not recoverable from the rendered diff …]
+    # TODO:
+    # check if username already has "username.parquet" in the dataset and download that (or read values directly from dataset viewer if possible)
+    # instead of replacing the values check if the new score is better than the old one
+    to_parquet(
+        upload_api,  # api
+        "agents-course/students-data",  # repo_id
+        user_info["name"],  # username
+        grade,  # unit1 score
+        0.0,  # unit2 score
+        0.0,  # unit3 score
+        0.0,  # unit4 score
+        0,  # already certified or not
     )
-    new_ds.push_to_hub(repo_id=repo_id, split=user_info["name"])
-
-    # I'm adding a csv version
-    # The idea, if the user passed, we create a simple row in a csv
-    print("ADD CERTIFIED USER")
-    # Add this user to our database
-    add_certified_user(user_info["name"], grade, submission_time)
-
-    return f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"
-
 
+    gr.Success(
+        f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"
+    )
 
 
 def handle_quiz(question_idx, user_answers, selected_answer, is_start):
```
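The TODO in the new `push_results_to_hub` could be handled along the lines of the sketch below. This is only an illustration, not part of the PR: the helper name `best_unit1_score` is hypothetical, and it assumes the 0-100 scaling that `to_parquet` applies when writing scores.

```python
import pyarrow.parquet as pq
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError

def best_unit1_score(username: str, new_grade: float) -> float:
    """Hypothetical helper: keep the better of the stored and new unit1 scores."""
    try:
        path = hf_hub_download(
            repo_id="agents-course/students-data",
            filename=f"{username}.parquet",
            repo_type="dataset",
        )
    except EntryNotFoundError:
        return new_grade  # first submission, nothing stored yet
    old = pq.read_table(path).to_pylist()[0]
    # to_parquet stores unit scores scaled to 0-100; grades here are 0-1 fractions
    return max(old["unit1"] / 100, new_grade)
```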
data_to_parquet.py ADDED
```diff
@@ -0,0 +1,52 @@
+import pyarrow as pa
+import pyarrow.parquet as pq
+import json
+import tempfile
+
+
+# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
+schema = {
+    "username": {"_type": "Value", "dtype": "string"},
+    "unit1": {"_type": "Value", "dtype": "float64"},
+    "unit2": {"_type": "Value", "dtype": "float64"},
+    "unit3": {"_type": "Value", "dtype": "float64"},
+    "unit4": {"_type": "Value", "dtype": "float64"},
+    "certified": {"_type": "Value", "dtype": "int64"},
+}
+
+
+def to_parquet(
+    api,
+    repo: str,
+    username: str = "",
+    unit1: float = 0.0,
+    unit2: float = 0.0,
+    unit3: float = 0.0,
+    unit4: float = 0.0,
+    certified: int = 0,
+):
+    data = {
+        "username": username,
+        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
+        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
+        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
+        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
+        "certified": certified,
+    }
+    # Export data to Arrow format
+    table = pa.Table.from_pylist([data])
+    # Add metadata (used by datasets library)
+    table = table.replace_schema_metadata(
+        {"huggingface": json.dumps({"info": {"features": schema}})}
+    )
+    # Write to parquet file
+    archive_file = tempfile.NamedTemporaryFile(delete=False)
+    pq.write_table(table, archive_file.name)
+    archive_file.close()
+
+    api.upload_file(
+        repo_id=repo,  # manually created repo
+        repo_type="dataset",
+        path_in_repo=f"{username}.parquet",  # each user will have their own parquet
+        path_or_fileobj=archive_file.name,
+    )
```
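For reference, a usage sketch of `to_parquet` mirroring the call in `app.py`. It assumes `HF_TOKEN` is set and that the target dataset repo already exists, since `to_parquet` does not create it; the embedded `huggingface` schema metadata is what lets `datasets` and the dataset viewer type the columns consistently across all the per-user files.

```python
import os
from huggingface_hub import HfApi
from data_to_parquet import to_parquet

api = HfApi(token=os.getenv("HF_TOKEN"))
# Upload one user's unit 1 grade (a 0-1 fraction; stored in the file as 85.0)
to_parquet(api, "agents-course/students-data", "not-lain", unit1=0.85)
```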