submit / app.py
jeanma's picture
Add application file
3fabb88
import gradio as gr
from huggingface_hub import hf_hub_download
import pandas as pd
import os
from unicodedata import normalize
import tempfile
# Access token for the Hugging Face Hub, taken from the HF_TOKEN environment
# variable; stays None when that variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Maps each supported dataset type (as shown in the UI dropdown) to the
# (repo_id, filename) of the English reference Parquet file on the Hub.
_REFERENCE_FILES = {
    "FLORES+ dev": ("openlanguagedata/flores_plus", "dev/eng_Latn.parquet"),
    "FLORES+ devtest": (
        "openlanguagedata/flores_plus",
        "devtest/eng_Latn.parquet",
    ),
    "OLDI-Seed": ("openlanguagedata/oldi_seed", "seed/eng_Latn.parquet"),
}


def process_file(dataset_type, user_file):
    """Check an uploaded one-row-per-line text file and convert it to Parquet.

    The English reference file for ``dataset_type`` is downloaded from the
    Hugging Face Hub and its row count is compared with the number of lines
    in the upload. When they match, every line is NFC-normalised, stripped
    of surrounding whitespace and written as one row of a Parquet file.

    Args:
        dataset_type: One of the keys of ``_REFERENCE_FILES``.
        user_file: The uploaded file, either as an object exposing the path
            in its ``name`` attribute or as a plain path string. ``None``
            means nothing was uploaded.

    Returns:
        Path of the generated ``xxx_Xxxx.parquet`` file, located in a fresh
        temporary directory.

    Raises:
        gr.Error: If no file was uploaded, the dataset type is unknown, the
            upload is not valid UTF-8, or the line count differs from the
            reference row count.
    """
    # gr.Error is an exception: it has to be raised (not returned) for the
    # message to be shown to the user.
    if user_file is None:
        raise gr.Error("Please upload your data.")

    try:
        repo_id, reference_name = _REFERENCE_FILES[dataset_type]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which are just as invalid.
        raise gr.Error(f'Invalid dataset type "{dataset_type}".') from None

    reference_file = hf_hub_download(
        repo_id=repo_id,
        filename=reference_name,
        repo_type="dataset",
        # `token` is the documented replacement for the deprecated
        # `use_auth_token` argument.
        token=HF_TOKEN,
    )
    reference_size = len(pd.read_parquet(reference_file))

    # Accept both a tempfile-like object (path in `.name`) and a bare path.
    upload_path = getattr(user_file, "name", user_file)
    try:
        # "utf-8-sig" drops a leading byte-order mark, which would otherwise
        # survive str.strip() and end up in the first row's text.
        with open(upload_path, "rt", encoding="utf-8-sig") as f:
            user_lines = f.readlines()
    except UnicodeDecodeError as err:
        raise gr.Error("The uploaded file is not valid UTF-8 text.") from err

    user_size = len(user_lines)
    if reference_size != user_size:
        raise gr.Error(
            f"Line count mismatch: reference has {reference_size} rows, "
            f"the file you uploaded has {user_size} lines."
        )

    # The language/script/glottocode fields and the output filename are
    # fixed placeholder values; only "id" and "text" come from the upload.
    user_data = [
        {
            "id": i,
            "iso_639_3": "xxx",
            "iso_15924": "Xxxx",
            "glottocode": "xxxx1234",
            "text": normalize("NFC", line).strip(),
            "last_updated": "2.1",
        }
        for i, line in enumerate(user_lines)
    ]

    # The directory is intentionally not removed here: the returned path must
    # still exist after this function returns so it can be offered for
    # download.
    temp_dir = tempfile.mkdtemp()
    target_path = os.path.join(temp_dir, "xxx_Xxxx.parquet")
    pd.DataFrame(user_data).to_parquet(target_path, index=False)
    return target_path
# Build the web UI: a dataset-type selector and a file upload feed
# `process_file`, whose returned path is exposed through a second file
# component for download.
with gr.Blocks() as demo:
    gr.Markdown(value="# Dataset checker")

    # Input widgets.
    dataset_type = gr.Dropdown(
        choices=["FLORES+ dev", "FLORES+ devtest", "OLDI-Seed"],
        label="Dataset type",
    )
    dataset_file = gr.File(label="Dataset file")

    # Output widget holding the generated Parquet file.
    parquet_file = gr.File(label="Download Parquet file")

    # Clicking the button runs the check/conversion.
    btn = gr.Button(value="Check")
    btn.click(process_file, [dataset_type, dataset_file], parquet_file)

# Start the Gradio server.
demo.launch()