not-lain committed · Commit 06b78c4 · verified · 1 parent: 50e0fff

update dataset upload method to include users with dashes in their usernames


Currently we can't create dataset splits for users with dashes in their usernames; usernames that do not work with the previous implementation include `not-lain`, `julien-c`, etc.
By switching to this implementation we instead upload one file called `[username].parquet` per user. I also made it compatible with the dataset viewer (see the example [dataset](https://huggingface.co/datasets/not-lain/testing-my-upload) and example [space](https://huggingface.co/spaces/not-lain/unit_1_quiz)).
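For context, the failure in the old approach comes from `datasets` validating split names (roughly word characters only), so pushing a split named after a dashed username raises a `ValueError`. A minimal sketch of the difference, with illustrative repo names:

```python
from datasets import Dataset

answers = Dataset.from_list([{"question": "q1", "answer": "a", "grade": 0.9}])

# Old approach: one split per user. Split names may only contain word
# characters, so usernames like "not-lain" are rejected with a ValueError:
# answers.push_to_hub("agents-course/unit_1_quiz_student_responses", split="not-lain")

# New approach: no splits at all; each user gets their own parquet file
# (see data_to_parquet.to_parquet in this commit), e.g. "not-lain.parquet".
```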

cc @burtenshaw for review

Files changed (2)
  1. app.py +21 -52
  2. data_to_parquet.py +52 -0
app.py CHANGED
@@ -1,14 +1,12 @@
 import os
-from datetime import datetime
 import random
 
-import pandas as pd
-from huggingface_hub import HfApi, hf_hub_download, Repository
-from huggingface_hub.repocard import metadata_load
+from huggingface_hub import HfApi, whoami
 
 import gradio as gr
-from datasets import load_dataset, Dataset
-from huggingface_hub import whoami
+from datasets import load_dataset
+
+from data_to_parquet import to_parquet
 
 EXAM_DATASET_ID = os.getenv("EXAM_DATASET_ID") or "agents-course/unit_1_quiz"
 EXAM_MAX_QUESTIONS = os.getenv("EXAM_MAX_QUESTIONS") or 10
@@ -16,13 +14,7 @@ EXAM_PASSING_SCORE = os.getenv("EXAM_PASSING_SCORE") or 0.7
 
 ds = load_dataset(EXAM_DATASET_ID, split="train")
 
-DATASET_REPO_URL = "https://huggingface.co/datasets/agents-course/certificates"
-CERTIFIED_USERS_FILENAME = "certified_students.csv"
-CERTIFIED_USERS_DIR = "certificates"
-repo = Repository(
-    local_dir=CERTIFIED_USERS_DIR, clone_from=DATASET_REPO_URL, use_auth_token=os.getenv("HF_TOKEN")
-)
-
+upload_api = HfApi(token=os.getenv("HF_TOKEN"))
 # Convert dataset to a list of dicts and randomly sort
 quiz_data = ds.to_pandas().to_dict("records")
 random.shuffle(quiz_data)
@@ -66,24 +58,6 @@ def on_user_logged_in(token: gr.OAuthToken | None):
         None,  # no token
     ]
 
-def add_certified_user(hf_username, pass_percentage, submission_time):
-    """
-    Add the certified user to the database
-    """
-    print("ADD CERTIFIED USER")
-    repo.git_pull()
-    history = pd.read_csv(os.path.join(CERTIFIED_USERS_DIR, CERTIFIED_USERS_FILENAME))
-
-    # Check if this hf_username is already in our dataset:
-    check = history.loc[history['hf_username'] == hf_username]
-    if not check.empty:
-        history = history.drop(labels=check.index[0], axis=0)
-
-    new_row = pd.DataFrame({'hf_username': hf_username, 'pass_percentage': pass_percentage, 'datetime': submission_time}, index=[0])
-    history = pd.concat([new_row, history[:]]).reset_index(drop=True)
-
-    history.to_csv(os.path.join(CERTIFIED_USERS_DIR, CERTIFIED_USERS_FILENAME), index=False)
-    repo.push_to_hub(commit_message="Update certified users list")
 
 def push_results_to_hub(user_answers, token: gr.OAuthToken | None):
     """
@@ -103,33 +77,28 @@ def push_results_to_hub(user_answers, token: gr.OAuthToken | None):
         gr.Warning(
             f"Score {grade:.1%} below passing threshold of {float(EXAM_PASSING_SCORE):.1%}"
         )
-        return f"You scored {grade:.1%}. Please try again to achieve at least {float(EXAM_PASSING_SCORE):.1%}"
+        return  # do not continue
 
     gr.Info("Submitting answers to the Hub. Please wait...", duration=2)
 
     user_info = whoami(token=token.token)
-    repo_id = f"{EXAM_DATASET_ID}_student_responses"
-    submission_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-    new_ds = Dataset.from_list(user_answers)
-    new_ds = new_ds.map(
-        lambda x: {
-            "username": user_info["name"],
-            "datetime": submission_time,
-            "grade": grade,
-        }
+    # TODO:
+    # check if username already has "username.parquet" in the dataset and download that (or read values directly from dataset viewer if possible)
+    # instead of replacing the values check if the new score is better than the old one
+    to_parquet(
+        upload_api,  # api
+        "agents-course/students-data",  # repo_id
+        user_info["name"],  # username
+        grade,  # unit1 score
+        0.0,  # unit2 score
+        0.0,  # unit3 score
+        0.0,  # unit4 score
+        0,  # already certified or not
     )
-    new_ds.push_to_hub(repo_id=repo_id, split=user_info["name"])
-
-    # I'm adding a csv version
-    # The idea, if the user passed, we create a simple row in a csv
-    print("ADD CERTIFIED USER")
-    # Add this user to our database
-    add_certified_user(user_info["name"], grade, submission_time)
-
-    return f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"
-
 
+    gr.Success(
+        f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"
+    )
 
 
 def handle_quiz(question_idx, user_answers, selected_answer, is_start):
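A possible follow-up for the TODO above (not part of this commit): download the user's existing `username.parquet`, if any, and only keep the better unit 1 score. A rough sketch, with a hypothetical helper name and the repo id used above:

```python
import pandas as pd
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError


def best_unit1_score(username: str, new_grade: float, repo_id: str = "agents-course/students-data") -> float:
    """Return the better of the stored and freshly earned unit 1 score (as a fraction)."""
    try:
        # Each user has their own parquet file at the repo root.
        path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=f"{username}.parquet")
    except EntryNotFoundError:
        return new_grade  # first submission for this user
    previous = pd.read_parquet(path)["unit1"].iloc[0] / 100  # to_parquet stores percentages
    return max(previous, new_grade)
```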
data_to_parquet.py ADDED
@@ -0,0 +1,52 @@
+import pyarrow as pa
+import pyarrow.parquet as pq
+import json
+import tempfile
+
+
+# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
+schema = {
+    "username": {"_type": "Value", "dtype": "string"},
+    "unit1": {"_type": "Value", "dtype": "float64"},
+    "unit2": {"_type": "Value", "dtype": "float64"},
+    "unit3": {"_type": "Value", "dtype": "float64"},
+    "unit4": {"_type": "Value", "dtype": "float64"},
+    "certified": {"_type": "Value", "dtype": "int64"},
+}
+
+
+def to_parquet(
+    api,
+    repo: str,
+    username: str = "",
+    unit1: float = 0.0,
+    unit2: float = 0.0,
+    unit3: float = 0.0,
+    unit4: float = 0.0,
+    certified: int = 0,
+):
+    data = {
+        "username": username,
+        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
+        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
+        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
+        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
+        "certified": certified,
+    }
+    # Export data to Arrow format
+    table = pa.Table.from_pylist([data])
+    # Add metadata (used by datasets library)
+    table = table.replace_schema_metadata(
+        {"huggingface": json.dumps({"info": {"features": schema}})}
+    )
+    # Write to parquet file
+    archive_file = tempfile.NamedTemporaryFile(delete=False)
+    pq.write_table(table, archive_file.name)
+    archive_file.close()
+
+    api.upload_file(
+        repo_id=repo,  # manually created repo
+        repo_type="dataset",
+        path_in_repo=f"{username}.parquet",  # each user will have their own parquet
+        path_or_fileobj=archive_file.name,
+    )
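Since every `username.parquet` carries the `huggingface` schema metadata and sits at the repo root, the files are picked up by the dataset viewer and can be loaded back as a single dataset. A quick sanity check, assuming read access to the repo:

```python
from datasets import load_dataset

# All per-user parquet files are combined into one "train" split.
students = load_dataset("agents-course/students-data", split="train")
print(students.features)  # username, unit1..unit4, certified
print(students.filter(lambda row: row["username"] == "not-lain")[0])
```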