"""Gradio app that checks a user-supplied translation file against the
English reference of a supported dataset and converts it to Parquet."""

import os
import tempfile
from unicodedata import normalize

import gradio as gr
import pandas as pd
from huggingface_hub import hf_hub_download

# Optional token for access to gated datasets on the Hugging Face Hub.
HF_TOKEN = os.getenv("HF_TOKEN", None)

# English reference file for each supported dataset type,
# keyed by the dropdown label: (repo_id, filename).
REFERENCE_FILES = {
    "FLORES+ dev": ("openlanguagedata/flores_plus", "dev/eng_Latn.parquet"),
    "FLORES+ devtest": ("openlanguagedata/flores_plus", "devtest/eng_Latn.parquet"),
    "OLDI-Seed": ("openlanguagedata/oldi_seed", "seed/eng_Latn.parquet"),
}


def normalise(raw):
    """Apply Unicode NFC normalisation and strip surrounding whitespace."""
    return normalize("NFC", raw).strip()


def process_file(dataset_type, user_file):
    if user_file is None:
        raise gr.Error("Please upload your data.")
    if dataset_type not in REFERENCE_FILES:
        raise gr.Error(f'Invalid dataset type "{dataset_type}".')

    # Download the English reference and use its row count to validate
    # the uploaded file.
    repo_id, filename = REFERENCE_FILES[dataset_type]
    reference_file = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    reference_df = pd.read_parquet(reference_file)
    reference_size = len(reference_df)

    with open(user_file.name, "rt", encoding="utf-8") as f:
        user_lines = f.readlines()
    user_size = len(user_lines)

    if reference_size != user_size:
        raise gr.Error(
            f"Line count mismatch: reference has {reference_size} rows, "
            f"the file you uploaded has {user_size} lines."
        )

    # Build one row per line. The "xxx"/"Xxxx"/"xxxx1234" values are
    # placeholders for the language's ISO 639-3 code, ISO 15924 script
    # code, and Glottocode.
    user_data = []
    for i, line in enumerate(user_lines):
        user_data.append(
            {
                "id": i,
                "iso_639_3": "xxx",
                "iso_15924": "Xxxx",
                "glottocode": "xxxx1234",
                "text": normalise(line),
                "last_updated": "2.1",
            }
        )

    # Write the converted data to a Parquet file in a temporary directory
    # and hand the path back to Gradio for download.
    temp_dir = tempfile.mkdtemp()
    target_path = os.path.join(temp_dir, "xxx_Xxxx.parquet")
    pd.DataFrame(user_data).to_parquet(target_path, index=False)
    return target_path


with gr.Blocks() as demo:
    gr.Markdown("# Dataset checker")
    dataset_type = gr.Dropdown(
        ["FLORES+ dev", "FLORES+ devtest", "OLDI-Seed"],
        label="Dataset type",
    )
    dataset_file = gr.File(label="Dataset file")
    parquet_file = gr.File(label="Download Parquet file")
    btn = gr.Button("Check")
    btn.click(
        fn=process_file,
        inputs=[dataset_type, dataset_file],
        outputs=parquet_file,
    )

demo.launch()
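
# Usage sketch (an assumption about deployment, not part of the app itself):
# save this script as app.py, optionally export an HF_TOKEN that has access
# to the gated datasets, and run it locally:
#
#     HF_TOKEN=hf_... python app.py
#
# Open the local URL that Gradio prints, choose a dataset type, and upload a
# plain-text file with one translation per line, in the same order as the
# English reference. If the line counts match, the app returns a file named
# xxx_Xxxx.parquet, where "xxx_Xxxx" is a placeholder for the language's
# ISO 639-3 and ISO 15924 codes (mirroring the eng_Latn naming of the
# reference files).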