# Spaces: Running  (hosting-page status banner captured with the source; not program code)
import os
import tempfile
from unicodedata import normalize

import gradio as gr
import pandas as pd
from huggingface_hub import hf_hub_download

# Optional access token read from the environment; stays None when the
# HF_TOKEN variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Maps each supported dataset type to the (repo_id, filename) pair of the
# English reference split that an upload is compared against.
_REFERENCE_FILES = {
    "FLORES+ dev": ("openlanguagedata/flores_plus", "dev/eng_Latn.parquet"),
    "FLORES+ devtest": ("openlanguagedata/flores_plus", "devtest/eng_Latn.parquet"),
    "OLDI-Seed": ("openlanguagedata/oldi_seed", "seed/eng_Latn.parquet"),
}


def _build_rows(lines):
    """Return one row dict per input line, with placeholder language metadata."""
    return [
        {
            "id": index,
            "iso_639_3": "xxx",
            "iso_15924": "Xxxx",
            "glottocode": "xxxx1234",
            # NFC-normalise first, then drop surrounding whitespace (which
            # includes the trailing newline kept by readlines()).
            "text": normalize("NFC", line).strip(),
            "last_updated": "2.1",
        }
        for index, line in enumerate(lines)
    ]


def process_file(dataset_type, user_file):
    """Validate an uploaded text file and convert it to a Parquet file.

    The upload must contain exactly as many lines as the English reference
    split of the selected dataset has rows. Each line is NFC-normalised,
    stripped, and written out with placeholder language metadata.

    Args:
        dataset_type: One of the keys of ``_REFERENCE_FILES``.
        user_file: The uploaded file, either as an object exposing the path
            through a ``.name`` attribute or as a plain path string.

    Returns:
        Path of the generated ``xxx_Xxxx.parquet`` file inside a fresh
        temporary directory.

    Raises:
        gr.Error: If no file was uploaded, the dataset type is unknown, the
            file is not valid UTF-8, or the line count does not match the
            reference.
    """
    # gr.Error is an exception: it has to be raised (not returned) for the
    # message to be reported as an error instead of being treated as output.
    if user_file is None:
        raise gr.Error("Please upload your data.")

    try:
        repo_id, reference_name = _REFERENCE_FILES[dataset_type]
    except (KeyError, TypeError):
        raise gr.Error(f'Invalid dataset type "{dataset_type}".') from None

    reference_file = hf_hub_download(
        repo_id=repo_id,
        filename=reference_name,
        repo_type="dataset",
        token=HF_TOKEN,  # `use_auth_token` is the deprecated name of this argument
    )
    reference_size = len(pd.read_parquet(reference_file))

    # Accept both an object carrying the path in `.name` and a bare path string.
    user_path = getattr(user_file, "name", user_file)
    try:
        with open(user_path, "rt", encoding="utf-8") as f:
            user_lines = f.readlines()
    except UnicodeDecodeError as err:
        raise gr.Error("The file you uploaded is not valid UTF-8 text.") from err

    user_size = len(user_lines)
    if reference_size != user_size:
        raise gr.Error(
            f"Line count mismatch: reference has {reference_size} rows, "
            f"the file you uploaded has {user_size} lines."
        )

    # The directory is deliberately left in place: the returned path has to
    # stay readable after this function returns.
    temp_dir = tempfile.mkdtemp()
    target_path = os.path.join(temp_dir, "xxx_Xxxx.parquet")
    pd.DataFrame(_build_rows(user_lines)).to_parquet(target_path, index=False)
    return target_path
# UI layout: pick a dataset type, upload a text file, press the button, and
# receive the converted Parquet file in the second file component.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset checker")

    # These choices are the dataset types that process_file recognises.
    dataset_type = gr.Dropdown(
        choices=["FLORES+ dev", "FLORES+ devtest", "OLDI-Seed"],
        label="Dataset type",
    )
    dataset_file = gr.File(label="Dataset file")
    parquet_file = gr.File(label="Download Parquet file")

    btn = gr.Button(value="Check")
    # Run process_file on (dataset type, upload) and route its result to the
    # download component.
    btn.click(process_file, [dataset_type, dataset_file], parquet_file)

demo.launch()