| import re | |
| from datetime import date | |
| import pandas as pd | |
| from joblib.memory import Memory | |
| from langcodes import standardize_tag | |
| from requests import get | |
# Disk-backed memoization: joblib stores results in the local ".cache" directory.
_memory = Memory(location=".cache", verbose=0)
cache = _memory.cache
# load CommonVoice stats
@cache  # cache for 1 day: the `date` argument is the cache key
def get_commonvoice_stats(date: date):
    """Fetch the per-language statistics from the Common Voice API.

    Args:
        date: Used only as the memoization key. Passing ``date.today()``
            makes the cached result expire when the calendar day changes.

    Returns:
        The decoded JSON body of the ``/api/v1/stats/languages`` endpoint.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    response = get(
        "https://commonvoice.mozilla.org/api/v1/stats/languages",
        timeout=30,  # without a timeout, requests may block indefinitely
    )
    # Fail before returning so an error payload is never memoized for the day.
    response.raise_for_status()
    return response.json()
# Build the Common Voice table: one row per locale with its validated hours.
commonvoice = pd.DataFrame(get_commonvoice_stats(date.today()))
commonvoice = commonvoice.rename(
    columns={"locale": "commonvoice_locale", "validatedHours": "commonvoice_hours"}
)
commonvoice = commonvoice[["commonvoice_locale", "commonvoice_hours"]]

# Derive the language tag in two steps, applied per locale:
#   1. drop a trailing country code (language is language); in practice this
#      only matters for zh-CN/zh-TW/zh-HK
#   2. normalize the tag with langcodes, asking for macrolanguages
# NOTE: step 2 does not really seem to produce macrolanguages, e.g. not for Quechua.
commonvoice["bcp_47"] = commonvoice["commonvoice_locale"].apply(
    lambda locale: standardize_tag(re.sub(r"-[A-Z]{2}$", "", locale), macro=True)
)

# Collapse locales sharing a tag: add up their hours, keep the first locale seen.
commonvoice = commonvoice.groupby("bcp_47").agg(
    {"commonvoice_hours": "sum", "commonvoice_locale": "first"}
)
commonvoice = commonvoice.reset_index()