update dataset upload method to include users with dashes in their usernames

#3
by not-lain - opened
Files changed (2)
  1. app.py +21 -52
  2. data_to_parquet.py +52 -0
app.py CHANGED
@@ -1,14 +1,12 @@
 import os
-from datetime import datetime
 import random
 
-import pandas as pd
-from huggingface_hub import HfApi, hf_hub_download, Repository
-from huggingface_hub.repocard import metadata_load
+from huggingface_hub import HfApi, whoami
 
 import gradio as gr
-from datasets import load_dataset, Dataset
-from huggingface_hub import whoami
+from datasets import load_dataset
+
+from data_to_parquet import to_parquet
 
 EXAM_DATASET_ID = os.getenv("EXAM_DATASET_ID") or "agents-course/unit_1_quiz"
 EXAM_MAX_QUESTIONS = os.getenv("EXAM_MAX_QUESTIONS") or 10
@@ -16,13 +14,7 @@ EXAM_PASSING_SCORE = os.getenv("EXAM_PASSING_SCORE") or 0.7
 
 ds = load_dataset(EXAM_DATASET_ID, split="train")
 
-DATASET_REPO_URL = "https://huggingface.co/datasets/agents-course/certificates"
-CERTIFIED_USERS_FILENAME = "certified_students.csv"
-CERTIFIED_USERS_DIR = "certificates"
-repo = Repository(
-    local_dir=CERTIFIED_USERS_DIR, clone_from=DATASET_REPO_URL, use_auth_token=os.getenv("HF_TOKEN")
-)
-
+upload_api = HfApi(token=os.getenv("HF_TOKEN"))
 # Convert dataset to a list of dicts and randomly sort
 quiz_data = ds.to_pandas().to_dict("records")
 random.shuffle(quiz_data)
@@ -66,24 +58,6 @@ def on_user_logged_in(token: gr.OAuthToken | None):
         None, # no token
     ]
 
-def add_certified_user(hf_username, pass_percentage, submission_time):
-    """
-    Add the certified user to the database
-    """
-    print("ADD CERTIFIED USER")
-    repo.git_pull()
-    history = pd.read_csv(os.path.join(CERTIFIED_USERS_DIR, CERTIFIED_USERS_FILENAME))
-
-    # Check if this hf_username is already in our dataset:
-    check = history.loc[history['hf_username'] == hf_username]
-    if not check.empty:
-        history = history.drop(labels=check.index[0], axis=0)
-
-    new_row = pd.DataFrame({'hf_username': hf_username, 'pass_percentage': pass_percentage, 'datetime': submission_time}, index=[0])
-    history = pd.concat([new_row, history[:]]).reset_index(drop=True)
-
-    history.to_csv(os.path.join(CERTIFIED_USERS_DIR, CERTIFIED_USERS_FILENAME), index=False)
-    repo.push_to_hub(commit_message="Update certified users list")
 
 def push_results_to_hub(user_answers, token: gr.OAuthToken | None):
     """
@@ -103,33 +77,28 @@ def push_results_to_hub(user_answers, token: gr.OAuthToken | None):
         gr.Warning(
             f"Score {grade:.1%} below passing threshold of {float(EXAM_PASSING_SCORE):.1%}"
         )
-        return f"You scored {grade:.1%}. Please try again to achieve at least {float(EXAM_PASSING_SCORE):.1%}"
+        return # do not continue
 
     gr.Info("Submitting answers to the Hub. Please wait...", duration=2)
 
     user_info = whoami(token=token.token)
-    repo_id = f"{EXAM_DATASET_ID}_student_responses"
-    submission_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-    new_ds = Dataset.from_list(user_answers)
-    new_ds = new_ds.map(
-        lambda x: {
-            "username": user_info["name"],
-            "datetime": submission_time,
-            "grade": grade,
-        }
+    # TODO:
+    # check if username already has "username.parquet" in the dataset and download that (or read values directly from dataset viewer if possible)
+    # instead of replacing the values check if the new score is better than the old one
+    to_parquet(
+        upload_api, # api
+        "agents-course/students-data", # repo_id
+        user_info["name"], # username
+        grade, # unit1 score
+        0.0, # unit2 score
+        0.0, # unit3 score
+        0.0, # unit4 score
+        0, # already certified or not
     )
-    new_ds.push_to_hub(repo_id=repo_id, split=user_info["name"])
-
-    # I'm adding a csv version
-    # The idea, if the user passed, we create a simple row in a csv
-    print("ADD CERTIFIED USER")
-    # Add this user to our database
-    add_certified_user(user_info["name"], grade, submission_time)
-
-    return f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"
-
 
+    gr.Success(
+        f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"
+    )
 
 
 def handle_quiz(question_idx, user_answers, selected_answer, is_start):
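The TODO in push_results_to_hub is left open in this PR. Below is a minimal sketch of what the check could look like, assuming each student's scores live in `<username>.parquet` at the root of agents-course/students-data (as written by to_parquet) and that stored unit scores are on a 0-100 scale; the helper names `fetch_existing_scores` and `best_unit1_fraction` are illustrative, not part of this change:

```python
# Illustrative sketch only -- not part of this PR.
import pyarrow.parquet as pq
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError


def fetch_existing_scores(username: str, repo_id: str = "agents-course/students-data"):
    """Return the user's stored row as a dict, or None if no parquet file exists yet."""
    try:
        path = hf_hub_download(
            repo_id=repo_id,
            repo_type="dataset",
            filename=f"{username}.parquet",
        )
    except EntryNotFoundError:
        return None
    # Each file holds a single row, so take the first (and only) record.
    return pq.read_table(path).to_pylist()[0]


def best_unit1_fraction(username: str, new_grade: float) -> float:
    """Keep the better unit1 score, returned as a 0-1 fraction for to_parquet()."""
    existing = fetch_existing_scores(username)
    if existing is None:
        return new_grade
    return max(existing["unit1"] / 100, new_grade)
```

With something along these lines, the `grade` argument in the `to_parquet(...)` call above could become `best_unit1_fraction(user_info["name"], grade)`, so a resubmission never lowers a stored score.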
data_to_parquet.py ADDED
@@ -0,0 +1,52 @@
+import pyarrow as pa
+import pyarrow.parquet as pq
+import json
+import tempfile
+
+
+# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
+schema = {
+    "username": {"_type": "Value", "dtype": "string"},
+    "unit1": {"_type": "Value", "dtype": "float64"},
+    "unit2": {"_type": "Value", "dtype": "float64"},
+    "unit3": {"_type": "Value", "dtype": "float64"},
+    "unit4": {"_type": "Value", "dtype": "float64"},
+    "certified": {"_type": "Value", "dtype": "int64"},
+}
+
+
+def to_parquet(
+    api,
+    repo: str,
+    username: str = "",
+    unit1: float = 0.0,
+    unit2: float = 0.0,
+    unit3: float = 0.0,
+    unit4: float = 0.0,
+    certified: int = 0,
+):
+    data = {
+        "username": username,
+        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
+        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
+        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
+        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
+        "certified": certified,
+    }
+    # Export data to Arrow format
+    table = pa.Table.from_pylist([data])
+    # Add metadata (used by datasets library)
+    table = table.replace_schema_metadata(
+        {"huggingface": json.dumps({"info": {"features": schema}})}
+    )
+    # Write to parquet file
+    archive_file = tempfile.NamedTemporaryFile(delete=False)
+    pq.write_table(table, archive_file.name)
+    archive_file.close()
+
+    api.upload_file(
+        repo_id=repo, # manually created repo
+        repo_type="dataset",
+        path_in_repo=f"{username}.parquet", # each user will have their own parquet
+        path_or_fileobj=archive_file.name,
+    )
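For context on how the per-user files are meant to be consumed, here is a hedged usage sketch: since to_parquet embeds the `datasets` features in the parquet metadata and writes one `<username>.parquet` per student, the whole repo should load back as a single dataset. The username and score values below are made up, and loading every root-level parquet file into one train split is an assumption about how agents-course/students-data is organized:

```python
# Usage sketch with made-up values -- assumes the dataset repo contains only the
# per-user parquet files written by to_parquet(), one row per student.
import os

from datasets import load_dataset
from huggingface_hub import HfApi

from data_to_parquet import to_parquet

api = HfApi(token=os.getenv("HF_TOKEN"))  # token needs write access to the dataset repo

# Write (or overwrite) one student's row; unit scores are 0-1 fractions here,
# to_parquet() rescales them to 0-100 before writing.
to_parquet(api, "agents-course/students-data", username="some-user", unit1=0.85)

# Read every student's row back as one dataset split.
ds = load_dataset("agents-course/students-data", split="train")
print(ds.features)  # username, unit1..unit4, certified (from the embedded schema)
```

Because the file name is just `f"{username}.parquet"`, usernames containing dashes (like the made-up `some-user` above) work the same as any other, which is the point of this PR.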