# unit_1_quiz / data_to_parquet.py
import json
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
schema = {
    "username": {"_type": "Value", "dtype": "string"},
    "unit1": {"_type": "Value", "dtype": "float64"},
    "unit2": {"_type": "Value", "dtype": "float64"},
    "unit3": {"_type": "Value", "dtype": "float64"},
    "unit4": {"_type": "Value", "dtype": "float64"},
    "certified": {"_type": "Value", "dtype": "int64"},
}


def to_parquet(
    api,
    repo: str,
    username: str = "",
    unit1: float = 0.0,
    unit2: float = 0.0,
    unit3: float = 0.0,
    unit4: float = 0.0,
    certified: int = 0,
):
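    """Build a single-row Parquet file with one user's scores and upload it to the dataset repo."""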
    data = {
        "username": username,
        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
        "certified": certified,
    }
    # Export data to Arrow format
    table = pa.Table.from_pylist([data])
    # Add metadata (used by datasets library)
    table = table.replace_schema_metadata(
        {"huggingface": json.dumps({"info": {"features": schema}})}
    )
    # Write to parquet file
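    # delete=False keeps the file on disk after close() so upload_file can still read it by path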
    archive_file = tempfile.NamedTemporaryFile(delete=False)
    pq.write_table(table, archive_file.name)
    archive_file.close()

    api.upload_file(
        repo_id=repo,  # manually created repo
        repo_type="dataset",
        path_in_repo=f"{username}.parquet",  # each user will have their own parquet
        path_or_fileobj=archive_file.name,
    )