update dataset upload method to include users with dashes in their usernames

#3
by not-lain - opened
Files changed (2)
  1. app.py +21 -52
  2. data_to_parquet.py +52 -0
app.py CHANGED
@@ -1,14 +1,12 @@
 import os
-from datetime import datetime
 import random
 
-import pandas as pd
-from huggingface_hub import HfApi, hf_hub_download, Repository
-from huggingface_hub.repocard import metadata_load
+from huggingface_hub import HfApi, whoami
 
 import gradio as gr
-from datasets import load_dataset, Dataset
-from huggingface_hub import whoami
+from datasets import load_dataset
+
+from data_to_parquet import to_parquet
 
 EXAM_DATASET_ID = os.getenv("EXAM_DATASET_ID") or "agents-course/unit_1_quiz"
 EXAM_MAX_QUESTIONS = os.getenv("EXAM_MAX_QUESTIONS") or 10
@@ -16,13 +14,7 @@ EXAM_PASSING_SCORE = os.getenv("EXAM_PASSING_SCORE") or 0.7
 
 ds = load_dataset(EXAM_DATASET_ID, split="train")
 
-DATASET_REPO_URL = "https://huggingface.co/datasets/agents-course/certificates"
-CERTIFIED_USERS_FILENAME = "certified_students.csv"
-CERTIFIED_USERS_DIR = "certificates"
-repo = Repository(
-    local_dir=CERTIFIED_USERS_DIR, clone_from=DATASET_REPO_URL, use_auth_token=os.getenv("HF_TOKEN")
-)
-
+upload_api = HfApi(token=os.getenv("HF_TOKEN"))
 # Convert dataset to a list of dicts and randomly sort
 quiz_data = ds.to_pandas().to_dict("records")
 random.shuffle(quiz_data)
@@ -66,24 +58,6 @@ def on_user_logged_in(token: gr.OAuthToken | None):
         None, # no token
     ]
 
-def add_certified_user(hf_username, pass_percentage, submission_time):
-    """
-    Add the certified user to the database
-    """
-    print("ADD CERTIFIED USER")
-    repo.git_pull()
-    history = pd.read_csv(os.path.join(CERTIFIED_USERS_DIR, CERTIFIED_USERS_FILENAME))
-
-    # Check if this hf_username is already in our dataset:
-    check = history.loc[history['hf_username'] == hf_username]
-    if not check.empty:
-        history = history.drop(labels=check.index[0], axis=0)
-
-    new_row = pd.DataFrame({'hf_username': hf_username, 'pass_percentage': pass_percentage, 'datetime': submission_time}, index=[0])
-    history = pd.concat([new_row, history[:]]).reset_index(drop=True)
-
-    history.to_csv(os.path.join(CERTIFIED_USERS_DIR, CERTIFIED_USERS_FILENAME), index=False)
-    repo.push_to_hub(commit_message="Update certified users list")
 
 def push_results_to_hub(user_answers, token: gr.OAuthToken | None):
     """
@@ -103,33 +77,28 @@ def push_results_to_hub(user_answers, token: gr.OAuthToken | None):
         gr.Warning(
             f"Score {grade:.1%} below passing threshold of {float(EXAM_PASSING_SCORE):.1%}"
         )
-        return f"You scored {grade:.1%}. Please try again to achieve at least {float(EXAM_PASSING_SCORE):.1%}"
+        return # do not continue
 
     gr.Info("Submitting answers to the Hub. Please wait...", duration=2)
 
     user_info = whoami(token=token.token)
-    repo_id = f"{EXAM_DATASET_ID}_student_responses"
-    submission_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-    new_ds = Dataset.from_list(user_answers)
-    new_ds = new_ds.map(
-        lambda x: {
-            "username": user_info["name"],
-            "datetime": submission_time,
-            "grade": grade,
-        }
+    # TODO:
+    # check if username already has "username.parquet" in the dataset and download that (or read values directly from dataset viewer if possible)
+    # instead of replacing the values check if the new score is better than the old one
+    to_parquet(
+        upload_api, # api
+        "agents-course/students-data", # repo_id
+        user_info["name"], # username
+        grade, # unit1 score
+        0.0, # unit2 score
+        0.0, # unit3 score
+        0.0, # unit4 score
+        0, # already certified or not
     )
-    new_ds.push_to_hub(repo_id=repo_id, split=user_info["name"])
-
-    # I'm adding a csv version
-    # The idea, if the user passed, we create a simple row in a csv
-    print("ADD CERTIFIED USER")
-    # Add this user to our database
-    add_certified_user(user_info["name"], grade, submission_time)
-
-    return f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"
-
 
+    gr.Success(
+        f"Your responses have been submitted to the Hub! Final grade: {grade:.1%}"
+    )
 
 
 def handle_quiz(question_idx, user_answers, selected_answer, is_start):
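The TODO in push_results_to_hub is left open in this PR. Below is a minimal sketch of what the check could look like, assuming each student's scores live in `<username>.parquet` at the root of agents-course/students-data (as written by to_parquet) and that stored unit scores are on a 0-100 scale; the helper names `fetch_existing_scores` and `best_unit1_fraction` are illustrative, not part of this change:

```python
# Illustrative sketch only -- not part of this PR.
import pyarrow.parquet as pq
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError


def fetch_existing_scores(username: str, repo_id: str = "agents-course/students-data"):
    """Return the user's stored row as a dict, or None if no parquet file exists yet."""
    try:
        path = hf_hub_download(
            repo_id=repo_id,
            repo_type="dataset",
            filename=f"{username}.parquet",
        )
    except EntryNotFoundError:
        return None
    # Each file holds a single row, so take the first (and only) record.
    return pq.read_table(path).to_pylist()[0]


def best_unit1_fraction(username: str, new_grade: float) -> float:
    """Keep the better unit1 score, returned as a 0-1 fraction for to_parquet()."""
    existing = fetch_existing_scores(username)
    if existing is None:
        return new_grade
    return max(existing["unit1"] / 100, new_grade)
```

With something along these lines, the `grade` argument in the `to_parquet(...)` call above could become `best_unit1_fraction(user_info["name"], grade)`, so a resubmission never lowers a stored score.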
data_to_parquet.py ADDED
@@ -0,0 +1,52 @@
+import pyarrow as pa
+import pyarrow.parquet as pq
+import json
+import tempfile
+
+
+# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
+schema = {
+    "username": {"_type": "Value", "dtype": "string"},
+    "unit1": {"_type": "Value", "dtype": "float64"},
+    "unit2": {"_type": "Value", "dtype": "float64"},
+    "unit3": {"_type": "Value", "dtype": "float64"},
+    "unit4": {"_type": "Value", "dtype": "float64"},
+    "certified": {"_type": "Value", "dtype": "int64"},
+}
+
+
+def to_parquet(
+    api,
+    repo: str,
+    username: str = "",
+    unit1: float = 0.0,
+    unit2: float = 0.0,
+    unit3: float = 0.0,
+    unit4: float = 0.0,
+    certified: int = 0,
+):
+    data = {
+        "username": username,
+        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
+        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
+        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
+        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
+        "certified": certified,
+    }
+    # Export data to Arrow format
+    table = pa.Table.from_pylist([data])
+    # Add metadata (used by datasets library)
+    table = table.replace_schema_metadata(
+        {"huggingface": json.dumps({"info": {"features": schema}})}
+    )
+    # Write to parquet file
+    archive_file = tempfile.NamedTemporaryFile(delete=False)
+    pq.write_table(table, archive_file.name)
+    archive_file.close()
+
+    api.upload_file(
+        repo_id=repo, # manually created repo
+        repo_type="dataset",
+        path_in_repo=f"{username}.parquet", # each user will have their own parquet
+        path_or_fileobj=archive_file.name,
+    )
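For context on how the per-user files are meant to be consumed, here is a hedged usage sketch: since to_parquet embeds the `datasets` features in the parquet metadata and writes one `<username>.parquet` per student, the whole repo should load back as a single dataset. The username and score values below are made up, and loading every root-level parquet file into one train split is an assumption about how agents-course/students-data is organized:

```python
# Usage sketch with made-up values -- assumes the dataset repo contains only the
# per-user parquet files written by to_parquet(), one row per student.
import os

from datasets import load_dataset
from huggingface_hub import HfApi

from data_to_parquet import to_parquet

api = HfApi(token=os.getenv("HF_TOKEN"))  # token needs write access to the dataset repo

# Write (or overwrite) one student's row; unit scores are 0-1 fractions here,
# to_parquet() rescales them to 0-100 before writing.
to_parquet(api, "agents-course/students-data", username="some-user", unit1=0.85)

# Read every student's row back as one dataset split.
ds = load_dataset("agents-course/students-data", split="train")
print(ds.features)  # username, unit1..unit4, certified (from the embedded schema)
```

Because the file name is just `f"{username}.parquet"`, usernames containing dashes (like the made-up `some-user` above) work the same as any other, which is the point of this PR.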