import pyarrow as pa
import pyarrow.parquet as pq
import json
import tempfile


# Current dataset schema (see https://huggingface.co/spaces/phxia/dataset-builder/blob/main/dataset_uploader.py#L153 for details)
schema = {
    "username": {"_type": "Value", "dtype": "string"},
    "unit1": {"_type": "Value", "dtype": "float64"},
    "unit2": {"_type": "Value", "dtype": "float64"},
    "unit3": {"_type": "Value", "dtype": "float64"},
    "unit4": {"_type": "Value", "dtype": "float64"},
    "certified": {"_type": "Value", "dtype": "int64"},
}


def to_parquet(
    api,
    repo: str,
    username: str = "",
    unit1: float = 0.0,
    unit2: float = 0.0,
    unit3: float = 0.0,
    unit4: float = 0.0,
    certified: int = 0,
):
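    # Scale unit scores from fractions to percentages (score * 100); 0.0 stays 0.0.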
    data = {
        "username": username,
        "unit1": unit1 * 100 if unit1 != 0 else 0.0,
        "unit2": unit2 * 100 if unit2 != 0 else 0.0,
        "unit3": unit3 * 100 if unit3 != 0 else 0.0,
        "unit4": unit4 * 100 if unit4 != 0 else 0.0,
        "certified": certified,
    }
    # Export data to Arrow format
    table = pa.Table.from_pylist([data])
    # Add metadata (used by datasets library)
    table = table.replace_schema_metadata(
        {"huggingface": json.dumps({"info": {"features": schema}})}
    )
    # Write to parquet file
    archive_file = tempfile.NamedTemporaryFile(delete=False)
    pq.write_table(table, archive_file.name)
    archive_file.close()

    api.upload_file(
        repo_id=repo,  # manually created repo
        repo_type="dataset",
        path_in_repo=f"{username}.parquet",  # each user will have their own parquet
        path_or_fileobj=archive_file.name,
    )
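

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal, hedged example of how to_parquet might be called. It assumes a
# Hugging Face token is available (HF_TOKEN env var or a cached CLI login) and
# that the target dataset repo already exists; the repo id below is a placeholder.
if __name__ == "__main__":
    from huggingface_hub import HfApi

    api = HfApi()  # authenticates via HF_TOKEN or the cached login
    to_parquet(
        api,
        repo="your-org/your-results-dataset",  # placeholder dataset repo
        username="example-user",
        unit1=0.85,  # stored as 85.0 in the parquet file
        unit2=0.90,
        certified=1,
    )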