# unit_1_quiz / data_to_parquet.py
import json
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
schema = {
    "username": {"_type": "Value", "dtype": "string"},
    "unit1": {"_type": "Value", "dtype": "float64"},
    "unit2": {"_type": "Value", "dtype": "float64"},
    "unit3": {"_type": "Value", "dtype": "float64"},
    "unit4": {"_type": "Value", "dtype": "float64"},
    "certified": {"_type": "Value", "dtype": "int64"},
}


def to_parquet(
    api,
    repo: str,
    username: str = "",
    unit1: float = 0.0,
    unit2: float = 0.0,
    unit3: float = 0.0,
    unit4: float = 0.0,
    certified: int = 0,
):
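    """Build a single-row Parquet file with one user's scores and upload it to the dataset repo."""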
    data = {
        "username": username,
        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
        "certified": certified,
    }
    # Export data to Arrow format
    table = pa.Table.from_pylist([data])
    # Add metadata (used by datasets library)
    table = table.replace_schema_metadata(
        {"huggingface": json.dumps({"info": {"features": schema}})}
    )
    # Write to parquet file
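    # delete=False keeps the file on disk after close() so upload_file can still read it by path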
    archive_file = tempfile.NamedTemporaryFile(delete=False)
    pq.write_table(table, archive_file.name)
    archive_file.close()

    api.upload_file(
        repo_id=repo,  # manually created repo
        repo_type="dataset",
        path_in_repo=f"{username}.parquet",  # each user will have their own parquet
        path_or_fileobj=archive_file.name,
    )