import pyarrow as pa
import pyarrow.parquet as pq
import json
import tempfile

# current schema (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
schema = {
    "username": {"_type": "Value", "dtype": "string"},
    "unit1": {"_type": "Value", "dtype": "float64"},
    "unit2": {"_type": "Value", "dtype": "float64"},
    "unit3": {"_type": "Value", "dtype": "float64"},
    "unit4": {"_type": "Value", "dtype": "float64"},
    "certified": {"_type": "Value", "dtype": "int64"},
}


def to_parquet(
    api,
    repo: str,
    username: str = "",
    unit1: float = 0.0,
    unit2: float = 0.0,
    unit3: float = 0.0,
    unit4: float = 0.0,
    certified: int = 0,
):
    # Scores arrive in [0, 1]; store them as percentages
    data = {
        "username": username,
        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
        "certified": certified,
    }

    # Export data to Arrow format
    table = pa.Table.from_pylist([data])

    # Add metadata (used by datasets library)
    table = table.replace_schema_metadata(
        {"huggingface": json.dumps({"info": {"features": schema}})}
    )

    # Write to parquet file
    archive_file = tempfile.NamedTemporaryFile(delete=False)
    pq.write_table(table, archive_file.name)
    archive_file.close()

    api.upload_file(
        repo_id=repo,  # manually created repo
        repo_type="dataset",
        path_in_repo=f"{username}.parquet",  # each user will have their own parquet
        path_or_fileobj=archive_file.name,
    )
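

# --- Usage sketch (an assumption, not part of the original snippet) ---
# The repo id and score values below are hypothetical placeholders; HfApi and
# upload_file are the huggingface_hub APIs the function above relies on.
if __name__ == "__main__":
    from huggingface_hub import HfApi

    api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
    to_parquet(
        api,
        repo="your-org/your-results-dataset",  # hypothetical, manually created dataset repo
        username="alice",
        unit1=0.95,  # raw score in [0, 1]; stored as 95.0 in the parquet file
        certified=1,
    )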