# Colin Leong
# CDL: typo
# c809510
import streamlit as st
from uuid import uuid4
import langcodes
import itertools
# Human-readable names of the 109 languages supported by LaBSE, used below as
# the default example input for the "Language names" text area.
example_languages_from_labse="""Afrikaans
Albanian
Amharic
Arabic
Armenian
Assamese
Azerbaijani
Basque
Belarusian
Bengali
Bosnian
Bulgarian
Burmese
Catalan
Cebuano
Chinese
Corsican
Croatian
Czech
Danish
Dutch
English
Esperanto
Estonian
Finnish
French
Western Frisian
Galician
Georgian
German
Greek
Gujarati
Haitian
Hausa
Hawaiian
Hebrew
Hindi
Hmong
Hungarian
Icelandic
Igbo
Indonesian
Irish
Italian
Japanese
Javanese
Kannada
Kazakh
Khmer
Kinyarwanda
Korean
Kurdish
Kyrgyz
Lao
Latin
Latvian
Lithuanian
Luxembourgish
Macedonian
Malagasy
Malay
Malayalam
Maltese
Māori
Marathi
Mongolian
Nepali
Norwegian
Chichewa
Oriya
Persian
Polish
Portuguese
Panjabi
Romanian
Russian
Samoan
Scottish Gaelic
Serbian
Southern Sotho
Shona
Sinhala
Slovak
Slovenian
Somali
Spanish
Sundanese
Swahili
Swedish
Tagalog
Tajik
Tamil
Tatar
Telugu
Thai
Tibetan
Turkish
Turkmen
Uyghur
Ukrainian
Urdu
Uzbek
Vietnamese
Welsh
Wolof
Xhosa
Yiddish
Yoruba
Zulu""".splitlines()
# example_language_tag_string_from_labse = """af
# sq
# am
# ar
# hy
# as
# az
# eu
# be
# bn
# bs
# bg
# my
# ca
# ceb
# zh
# co
# hr
# cs
# da
# nl
# en
# eo
# et
# fi
# fr
# fy
# gl
# ka
# de
# el
# gu
# ht
# ha
# haw
# he
# hi
# hmn
# hu
# is
# ig
# id
# ga
# it
# ja
# jv
# kn
# kk
# km
# rw
# ko
# ku
# ky
# lo
# la
# lv
# lt
# lb
# mk
# mg
# ms
# ml
# mt
# mi
# mr
# mn
# ne
# no
# ny
# or
# fa
# pl
# pt
# pa
# ro
# ru
# sm
# gd
# sr
# st
# sn
# si
# sk
# sl
# so
# es
# su
# sw
# sv
# tl
# tg
# ta
# tt
# te
# th
# bo
# tr
# tk
# ug
# uk
# ur
# uz
# vi
# cy
# wo
# xh
# yi
# yo
# zu"""
# Raw YAML-style "- tag" lines copied from the LaBSE Hugging Face model card;
# cleaned into bare tags immediately below.
labse_huggingface_tags = """- af
- sq
- am
- ar
- hy
- as
- az
- eu
- be
- bn
- bs
- bg
- my
- ca
- ceb
- zh
- co
- hr
- cs
- da
- nl
- en
- eo
- et
- fi
- fr
- fy
- gl
- ka
- de
- el
- gu
- ht
- ha
- haw
- he
- hi
- hmn
- hu
- is
- ig
- id
- ga
- it
- ja
- jv
- kn
- kk
- km
- rw
- ko
- ku
- ky
- lo
- la
- lv
- lt
- lb
- mk
- mg
- ms
- ml
- mt
- mi
- mr
- mn
- ne
- no
- ny
- or
- fa
- pl
- pt
- pa
- ro
- ru
- sm
- gd
- sr
- st
- sn
- si
- sk
- sl
- so
- es
- su
- sw
- sv
- tl
- tg
- ta
- tt
- te
- th
- bo
- tr
- tk
- ug
- uk
- ur
- uz
- vi
- cy
- wo
- xh
- yi
- yo
- zu""".splitlines()
# Clean the raw "- xx" lines: drop empty entries, strip surrounding whitespace,
# then keep only the final whitespace-separated token (the language tag itself).
stripped_entries = []
for raw_entry in labse_huggingface_tags:
    if raw_entry:
        stripped_entries.append(raw_entry.strip())
labse_huggingface_tags = [entry.split()[-1] for entry in stripped_entries]
def match_based_on_tag_distance(model_languages, data_languages, model_name, data_name="eBible", dedupe=False, threshold=9):
    """Pair model languages with data languages by langcodes tag distance.

    Every (model, data) combination whose ``langcodes.tag_distance`` is at or
    below ``threshold`` counts as a match.

    Args:
        model_languages: iterable of language tags / Language objects the model supports.
        data_languages: iterable of language tags / Language objects in the dataset.
        model_name: display name of the model (currently informational only).
        data_name: display name of the dataset (currently informational only).
        dedupe: if True, drop duplicate entries from both lists first.
            NOTE: this goes through set(), so the surviving order is arbitrary.
        threshold: maximum tag distance (inclusive) for a pair to count as a match.

    Returns:
        Tuple of (matches, model_unmatched, data_unmatched, model_languages,
        data_languages), where matches is a list of
        (model_lang, data_lang, distance) tuples and the last two entries are
        the (possibly deduplicated) input lists.
    """
    print(f"Model language count: {len(model_languages)}")
    print(f"Data language count: {len(data_languages)}")
    if dedupe:
        print("Filtering for duplicates...")
        model_languages = list(set(model_languages))
        data_languages = list(set(data_languages))
        print(f"Model languages remaining: {len(model_languages)}")
        print(f"Data language remaining: {len(data_languages)}")
    # Match based on tag distances
    tag_distance_matches = []
    product_of_lists = list(itertools.product(model_languages, data_languages))
    print(f"checking {len(model_languages)} model languages against {len(data_languages)} data languages, giving {len(product_of_lists)} combinations")
    # BUGFIX: the original wrapped this loop in tqdm(), but tqdm is never
    # imported anywhere in this file, so the first call raised NameError.
    for model_lang, data_lang in product_of_lists:
        tag_distance = langcodes.tag_distance(model_lang, data_lang)
        if tag_distance <= threshold:
            tag_distance_matches.append((model_lang, data_lang, tag_distance))
    # Precompute the matched sides as sets: the original rebuilt the matched
    # list once per language, making the unmatched scans O(len(langs) * matches).
    matched_model_langs = {match[0] for match in tag_distance_matches}
    matched_data_langs = {match[1] for match in tag_distance_matches}
    model_unmatched = [lang for lang in model_languages if lang not in matched_model_langs]
    data_unmatched = [lang for lang in data_languages if lang not in matched_data_langs]
    print(f"Found {len(tag_distance_matches)} matches, {len(model_unmatched)} model languages not matched")
    return tag_distance_matches, model_unmatched, data_unmatched, model_languages, data_languages
def parse_language_list():
    """Render widgets to collect a language list and parse it with langcodes.

    Offers a choice between parsing human-readable names (langcodes.find) and
    language tags (langcodes.get). Entries that fail to parse are collected in
    a local list but not currently displayed.

    Returns:
        list of langcodes Language objects successfully parsed from the input.
    """
    # NOTE(review): key=uuid4() gives every widget a brand-new key on each
    # Streamlit rerun, which resets widget state between reruns; presumably
    # intended so this function can be called more than once — confirm.
    format_options = [
        "Language names",
        "Language Tags/Codes",
        # "huggingface model/dataset name"
    ]
    chosen_format = st.selectbox("What format is your language list?", format_options, key=uuid4())
    parsed = []
    failures = []
    if chosen_format == "Language names":
        raw_text = st.text_area("Language names, comma-separated", ",".join(example_languages_from_labse), key=uuid4())
        for entry in raw_text.split(","):
            try:
                parsed.append(langcodes.find(entry.strip()))
            except LookupError:
                failures.append(entry)
    elif chosen_format == "Language Tags/Codes":
        raw_text = st.text_area("Language tags, comma-separated", ",".join(labse_huggingface_tags), key=uuid4())
        for entry in raw_text.split(","):
            try:
                parsed.append(langcodes.get(entry.strip()))
            except langcodes.tag_parser.LanguageTagError as parse_error:
                print(parse_error)
                failures.append(entry)
    st.write(f"Langcodes list: {parsed}")
    # st.write(f"Langcodes could not parse {failures}")
    return parsed
# Build the first language list from the UI. A second call (e.g. to compare a
# model's languages against a dataset's) is stubbed out below.
first_lang_list = parse_language_list()
# second_lang_list = parse_language_list()