# Page chrome captured from the Hugging Face Spaces listing, commented out
# so the file parses as Python:
# Spaces: Sleeping / Sleeping
import itertools
from uuid import uuid4

import langcodes
import streamlit as st
# English display names for the 109 languages the LaBSE model card lists;
# used as the default example for the "Language names" input below.
# (The scraped " | |" artifacts that polluted every entry have been removed.)
example_languages_from_labse = """Afrikaans
Albanian
Amharic
Arabic
Armenian
Assamese
Azerbaijani
Basque
Belarusian
Bengali
Bosnian
Bulgarian
Burmese
Catalan
Cebuano
Chinese
Corsican
Croatian
Czech
Danish
Dutch
English
Esperanto
Estonian
Finnish
French
Western Frisian
Galician
Georgian
German
Greek
Gujarati
Haitian
Hausa
Hawaiian
Hebrew
Hindi
Hmong
Hungarian
Icelandic
Igbo
Indonesian
Irish
Italian
Japanese
Javanese
Kannada
Kazakh
Khmer
Kinyarwanda
Korean
Kurdish
Kyrgyz
Lao
Latin
Latvian
Lithuanian
Luxembourgish
Macedonian
Malagasy
Malay
Malayalam
Maltese
Māori
Marathi
Mongolian
Nepali
Norwegian
Chichewa
Oriya
Persian
Polish
Portuguese
Panjabi
Romanian
Russian
Samoan
Scottish Gaelic
Serbian
Southern Sotho
Shona
Sinhala
Slovak
Slovenian
Somali
Spanish
Sundanese
Swahili
Swedish
Tagalog
Tajik
Tamil
Tatar
Telugu
Thai
Tibetan
Turkish
Turkmen
Uyghur
Ukrainian
Urdu
Uzbek
Vietnamese
Welsh
Wolof
Xhosa
Yiddish
Yoruba
Zulu""".splitlines()
# Language tags for the same 109 LaBSE languages, in the YAML bullet style
# ("- xx") used on the model's Hugging Face card; parsed to bare tags below.
# NOTE(review): a 109-line commented-out duplicate of this list (bare tags,
# one per line, as `example_language_tag_string_from_labse`) previously sat
# here; it carried no extra information and was removed as dead code.
labse_huggingface_tags = """- af
- sq
- am
- ar
- hy
- as
- az
- eu
- be
- bn
- bs
- bg
- my
- ca
- ceb
- zh
- co
- hr
- cs
- da
- nl
- en
- eo
- et
- fi
- fr
- fy
- gl
- ka
- de
- el
- gu
- ht
- ha
- haw
- he
- hi
- hmn
- hu
- is
- ig
- id
- ga
- it
- ja
- jv
- kn
- kk
- km
- rw
- ko
- ku
- ky
- lo
- la
- lv
- lt
- lb
- mk
- mg
- ms
- ml
- mt
- mi
- mr
- mn
- ne
- no
- ny
- or
- fa
- pl
- pt
- pa
- ro
- ru
- sm
- gd
- sr
- st
- sn
- si
- sk
- sl
- so
- es
- su
- sw
- sv
- tl
- tg
- ta
- tt
- te
- th
- bo
- tr
- tk
- ug
- uk
- ur
- uz
- vi
- cy
- wo
- xh
- yi
- yo
- zu""".splitlines()
# Drop any blank lines and the leading "- " bullet, leaving the bare tag
# (e.g. "- af" -> "af"). Replaces the original two-pass strip/split cleanup.
labse_huggingface_tags = [line.split()[-1] for line in labse_huggingface_tags if line.strip()]
def match_based_on_tag_distance(model_languages, data_languages, model_name, data_name="eBible", dedupe=False, threshold=9):
    """Pair model languages with data languages by langcodes tag distance.

    Every (model, data) combination whose ``langcodes.tag_distance`` is at
    most ``threshold`` counts as a match.

    Args:
        model_languages: iterable of language tags/objects the model supports.
        data_languages: iterable of language tags/objects present in the data.
        model_name: label for the model (currently unused; kept for callers).
        data_name: label for the dataset (currently unused; kept for callers).
        dedupe: if True, drop duplicate entries from both lists first.
        threshold: maximum tag distance for a pair to count as a match.

    Returns:
        Tuple of (matches, model_unmatched, data_unmatched,
        model_languages, data_languages), where matches is a list of
        (model_lang, data_lang, distance) triples.
    """
    # The original referenced tqdm without importing it (NameError at call
    # time); use it if available, otherwise fall back to a no-op wrapper.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable
    print(f"Model language count: {len(model_languages)}")
    print(f"Data language count: {len(data_languages)}")
    if dedupe:
        print("Filtering for duplicates...")
        model_languages = list(set(model_languages))
        data_languages = list(set(data_languages))
        print(f"Model languages remaining: {len(model_languages)}")
        print(f"Data language remaining: {len(data_languages)}")
    # Match based on tag distances over the full cross product.
    tag_distance_matches = []
    product_of_lists = list(itertools.product(model_languages, data_languages))
    print(f"checking {len(model_languages)} model languages against {len(data_languages)} data languages, giving {len(product_of_lists)} combinations")
    for model_lang, data_lang in tqdm(product_of_lists):
        tag_distance = langcodes.tag_distance(model_lang, data_lang)
        if tag_distance <= threshold:
            tag_distance_matches.append((model_lang, data_lang, tag_distance))
    # Precompute matched sets so the unmatched scans are O(n + m) rather
    # than rescanning the match list for every language (O(n*m)).
    matched_model = {match[0] for match in tag_distance_matches}
    matched_data = {match[1] for match in tag_distance_matches}
    model_unmatched = [lang for lang in model_languages if lang not in matched_model]
    data_unmatched = [lang for lang in data_languages if lang not in matched_data]
    print(f"Found {len(tag_distance_matches)} matches, {len(model_unmatched)} model languages not matched")
    return tag_distance_matches, model_unmatched, data_unmatched, model_languages, data_languages
def parse_language_list(key_prefix="langlist"):
    """Render Streamlit inputs for a comma-separated language list and parse
    it into langcodes Language objects.

    Args:
        key_prefix: stable prefix for the Streamlit widget keys. The original
            used ``key=uuid4()``, which mints a new key on every script rerun
            and therefore resets the widgets' state on each interaction;
            stable keys preserve user input across reruns. Pass a distinct
            prefix when calling this function more than once on a page.

    Returns:
        List of langcodes objects parsed from the user's input; entries that
        fail to parse are skipped (collected in a local ``not_parsed`` list).
    """
    language_list_options = ["Language names", "Language Tags/Codes",
                             # "huggingface model/dataset name"
                             ]
    language_list_type = st.selectbox("What format is your language list?", language_list_options, key=f"{key_prefix}_format")
    language_list = []
    not_parsed = []
    if language_list_type == language_list_options[0]:
        languages_input = st.text_area("Language names, comma-separated", ",".join(example_languages_from_labse), key=f"{key_prefix}_names")
        for lang in languages_input.split(","):
            try:
                # langcodes.find resolves a language from its English name.
                language_list.append(langcodes.find(lang.strip()))
            except LookupError:
                not_parsed.append(lang)
    elif language_list_type == language_list_options[1]:
        languages_input = st.text_area("Language tags, comma-separated", ",".join(labse_huggingface_tags), key=f"{key_prefix}_tags")
        for lang in languages_input.split(","):
            try:
                # langcodes.get parses a BCP-47 tag string.
                language_list.append(langcodes.get(lang.strip()))
            except langcodes.tag_parser.LanguageTagError as e:
                print(e)
                not_parsed.append(lang)
    st.write(f"Langcodes list: {language_list}")
    # st.write(f"Langcodes could not parse {not_parsed}")
    return language_list
# Build the first language list from the UI (scrape artifacts removed).
first_lang_list = parse_language_list()
# second_lang_list = parse_language_list()