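"""Gradio app that checks a plain-text dataset against an OLDI reference
split (FLORES+ or OLDI-Seed) and converts it to a Parquet file in the
expected layout."""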
import gradio as gr
from huggingface_hub import hf_hub_download
import pandas as pd
import os
from unicodedata import normalize
import tempfile

# Hugging Face Hub token read from the environment; needed if the reference
# datasets are gated.
HF_TOKEN = os.getenv("HF_TOKEN")


def process_file(dataset_type, user_file):
    if user_file is None:
        # gr.Error must be raised, not returned, for Gradio to display it.
        raise gr.Error("Please upload your data.")

    # Each dataset type maps to its English reference file on the Hub.
    dataset_sources = {
        "FLORES+ dev": ("openlanguagedata/flores_plus", "dev/eng_Latn.parquet"),
        "FLORES+ devtest": ("openlanguagedata/flores_plus", "devtest/eng_Latn.parquet"),
        "OLDI-Seed": ("openlanguagedata/oldi_seed", "seed/eng_Latn.parquet"),
    }
    if dataset_type not in dataset_sources:
        raise gr.Error(f'Invalid dataset type "{dataset_type}".')

    repo_id, ref_filename = dataset_sources[dataset_type]
    reference_file = hf_hub_download(
        repo_id=repo_id,
        filename=ref_filename,
        repo_type="dataset",
        token=HF_TOKEN,  # `token` supersedes the deprecated `use_auth_token`
    )

    # The reference split determines the expected number of lines.
    reference_df = pd.read_parquet(reference_file)
    reference_size = len(reference_df)

    # The upload is a plain-text file with one sentence per line; Gradio
    # exposes its on-disk path as user_file.name.
    with open(user_file.name, "rt", encoding="utf-8") as f:
        user_lines = f.readlines()
    user_size = len(user_lines)

    if reference_size != user_size:
        raise gr.Error(
            f"Line count mismatch: the reference has {reference_size} rows, "
            f"but the uploaded file has {user_size} lines."
        )

    def normalise(raw):
        # Apply Unicode NFC normalisation and strip surrounding whitespace
        # (including the trailing newline left by readlines()).
        return normalize("NFC", raw).strip()

    # Build one row per input line, attaching placeholder language metadata.
    user_data = []
    for i, line in enumerate(user_lines):
        user_data.append(
            {
                "id": i,
                "iso_639_3": "xxx",
                "iso_15924": "Xxxx",
                "glottocode": "xxxx1234",
                "text": normalise(line),
                "last_updated": "2.1",
            }
        )
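    # For a real submission, the "xxx"/"Xxxx"/"xxxx1234" placeholders above
    # would be replaced by the language's ISO 639-3 code, ISO 15924 script
    # code, and Glottolog code (e.g. "eng" and "Latn" for English in Latin
    # script, matching the eng_Latn.parquet naming used by the references).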

    # Write the converted dataset to a temporary Parquet file whose name
    # follows the {iso_639_3}_{iso_15924}.parquet convention of the references.
    temp_dir = tempfile.mkdtemp()
    filename = "xxx_Xxxx.parquet"
    target_path = os.path.join(temp_dir, filename)
    pd.DataFrame(user_data).to_parquet(target_path, index=False)

    return target_path


# Minimal UI: choose a dataset type, upload the text file, get the Parquet.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset checker")
    dataset_type = gr.Dropdown(
        ["FLORES+ dev", "FLORES+ devtest", "OLDI-Seed"],
        label="Dataset type",
    )
    dataset_file = gr.File(label="Dataset file")
    parquet_file = gr.File(label="Download Parquet file")
    btn = gr.Button("Check")
    btn.click(
        fn=process_file,
        inputs=[dataset_type, dataset_file],
        outputs=parquet_file,
    )

demo.launch()
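
# A quick way to inspect a generated file locally (the path is illustrative):
#
#   df = pd.read_parquet("xxx_Xxxx.parquet")
#   print(len(df), "rows")
#   print(df.head())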