Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import json
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq
# Feature schema in the `datasets` metadata format
# (refer to https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for more info)
_FLOAT_FEATURE = {"_type": "Value", "dtype": "float64"}

schema = {
    "username": {"_type": "Value", "dtype": "string"},
    # unit1..unit4 all share the same float64 feature description.
    **{f"unit{i}": dict(_FLOAT_FEATURE) for i in range(1, 5)},
    "certified": {"_type": "Value", "dtype": "int64"},
}
def to_parquet(
    api,
    repo: str,
    username: str = "",
    unit1: float = 0.0,
    unit2: float = 0.0,
    unit3: float = 0.0,
    unit4: float = 0.0,
    certified: int = 0,
):
    """Serialize one user's unit scores to Parquet and upload to a dataset repo.

    Args:
        api: Hub client exposing ``upload_file`` (e.g. ``huggingface_hub.HfApi``).
        repo: Target dataset repo id (the repo must already exist).
        username: Row key; also used as the parquet filename inside the repo.
        unit1: Unit 1 score as a fraction; stored scaled by 100.
        unit2: Unit 2 score as a fraction; stored scaled by 100.
        unit3: Unit 3 score as a fraction; stored scaled by 100.
        unit4: Unit 4 score as a fraction; stored scaled by 100.
        certified: Certification flag, stored as int64.
    """
    data = {
        "username": username,
        # Scores are scaled to percentages; 0 stays exactly 0.0.
        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
        "certified": certified,
    }
    # Export the single row to Arrow format.
    table = pa.Table.from_pylist([data])
    # Add metadata (used by the `datasets` library to recover feature types).
    table = table.replace_schema_metadata(
        {"huggingface": json.dumps({"info": {"features": schema}})}
    )
    # Write to a named temp file, upload it, and always remove it afterwards
    # (the previous version leaked one temp file per call).
    archive_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        # Close the handle before writing by name so this also works on
        # platforms (Windows) that forbid a second open of an open temp file.
        archive_file.close()
        pq.write_table(table, archive_file.name)
        api.upload_file(
            repo_id=repo,  # manually created repo
            repo_type="dataset",
            path_in_repo=f"{username}.parquet",  # each user will have their own parquet
            path_or_fileobj=archive_file.name,
        )
    finally:
        os.unlink(archive_file.name)