David Pomerenke
		
	committed on
		
		
					Commit 
							
							·
						
						d5fc8b3
	
1
								Parent(s):
							
							8beab26
								
Use langcodes for language matching
Browse files- evals.py +79 -80
 - pyproject.toml +1 -0
 - results.json +71 -461
 - uv.lock +81 -0
 
    	
        evals.py
    CHANGED
    
    | 
         @@ -1,6 +1,7 @@ 
     | 
|
| 1 | 
         
             
            import asyncio
         
     | 
| 2 | 
         
             
            import json
         
     | 
| 3 | 
         
             
            import os
         
     | 
| 
         | 
|
| 4 | 
         
             
            from os import getenv
         
     | 
| 5 | 
         | 
| 6 | 
         
             
            import evaluate
         
     | 
| 
         @@ -14,17 +15,19 @@ from tqdm.asyncio import tqdm_asyncio 
     | 
|
| 14 | 
         
             
            from transformers import NllbTokenizer
         
     | 
| 15 | 
         
             
            from datetime import date
         
     | 
| 16 | 
         
             
            from requests import get
         
     | 
| 
         | 
|
| 
         | 
|
| 17 | 
         | 
| 18 | 
         
             
            # config
         
     | 
| 19 | 
         
             
            models = [
         
     | 
| 20 | 
         
            -
                "openai/gpt-4o-mini", 
     | 
| 21 | 
         
             
                # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive
         
     | 
| 22 | 
         
            -
                "meta-llama/llama-3.3-70b-instruct", 
     | 
| 23 | 
         
            -
                "mistralai/mistral-small-24b-instruct-2501", 
     | 
| 24 | 
         
            -
                "google/gemini-2.0-flash-001", 
     | 
| 25 | 
         
             
                # "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
         
     | 
| 26 | 
         
            -
                "deepseek/deepseek-chat", 
     | 
| 27 | 
         
            -
                "microsoft/phi-4", 
     | 
| 28 | 
         
             
            ]
         
     | 
| 29 | 
         
             
            fast_model = "meta-llama/llama-3.3-70b-instruct"
         
     | 
| 30 | 
         
             
            n_sentences = 30
         
     | 
| 
         @@ -47,73 +50,79 @@ def reorder(language_name): 
     | 
|
| 47 | 
         
             
                    return language_name.split(",")[1] + " " + language_name.split(",")[0]
         
     | 
| 48 | 
         
             
                return language_name
         
     | 
| 49 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 50 | 
         
             
            # load benchmark languages and scripts
         
     | 
| 51 | 
         
             
            benchmark_dir = "data/floresp-v2.0-rc.3/dev"
         
     | 
| 52 | 
         
             
            benchmark_languages = pd.DataFrame(
         
     | 
| 53 | 
         
             
                [f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
         
     | 
| 54 | 
         
            -
                columns=[" 
     | 
| 55 | 
         
             
            )
         
     | 
| 56 | 
         
            -
             
     | 
| 57 | 
         
            -
             
     | 
| 58 | 
         
            -
             
     | 
| 59 | 
         
            -
             
     | 
| 60 | 
         
            -
            # load Ethnologue language names
         
     | 
| 61 | 
         
            -
            language_names = (
         
     | 
| 62 | 
         
            -
                pd.read_csv("data/LanguageCodes.tab", sep="\t")
         
     | 
| 63 | 
         
            -
                .rename(columns={"LangID": "language_code", "Name": "language_name"})[
         
     | 
| 64 | 
         
            -
                    ["language_code", "language_name"]
         
     | 
| 65 | 
         
            -
                ]
         
     | 
| 66 | 
         
            -
                .assign(language_name=lambda df: df["language_name"].apply(reorder).str.strip())
         
     | 
| 67 | 
         
             
            )
         
     | 
| 68 | 
         
            -
             
     | 
| 69 | 
         
            -
             
     | 
| 70 | 
         
            -
             
     | 
| 71 | 
         
            -
             
     | 
| 72 | 
         
            -
             
     | 
| 73 | 
         
            -
             
     | 
| 74 | 
         
            -
                 
     | 
| 75 | 
         
            -
                . 
     | 
| 76 | 
         
             
            )
         
     | 
| 77 | 
         
            -
            language_stats["speakers"] = pd.to_numeric(language_stats["speakers"], errors="coerce")
         
     | 
| 78 | 
         
            -
            ignored_languages = [
         
     | 
| 79 | 
         
            -
                "zho",  # Chinese -> use Mandarin (cmn) instead
         
     | 
| 80 | 
         
            -
                "ara",  # Arabic -> use Standard Arabic (arb) instead
         
     | 
| 81 | 
         
            -
                "pus",  # Pashto -> use Northern / Central / Southern Pashto instead (pbt / pst / pbu)
         
     | 
| 82 | 
         
            -
                "fas",  # Persian -> use Iranian Persian (pes) instead
         
     | 
| 83 | 
         
            -
                "msa",  # Malay -> use Indonesian (ind) instead
         
     | 
| 84 | 
         
            -
            ]
         
     | 
| 85 | 
         
            -
            language_stats = language_stats[
         
     | 
| 86 | 
         
            -
                ~language_stats["language_code"].isin(ignored_languages)
         
     | 
| 87 | 
         
            -
            ]
         
     | 
| 88 | 
         
            -
             
     | 
| 89 | 
         
            -
            # load unicode script names
         
     | 
| 90 | 
         
            -
            script_names = pd.read_csv("data/ScriptCodes.csv").rename(
         
     | 
| 91 | 
         
            -
                columns={"Code": "script_code", "English Name": "script_name"}
         
     | 
| 92 | 
         
            -
            )[["script_code", "script_name"]]
         
     | 
| 93 | 
         | 
| 94 | 
         
            -
            # merge data
         
     | 
| 95 | 
         
            -
            languages = pd.merge(language_stats, language_names, on="language_code", how="outer")
         
     | 
| 96 | 
         
            -
            languages = pd.merge(benchmark_languages, languages, on="language_code", how="outer")
         
     | 
| 97 | 
         
            -
            languages = pd.merge(languages, script_names, on="script_code", how="left")
         
     | 
| 98 | 
         
            -
            languages["in_benchmark"] = languages["in_benchmark"].fillna(False)
         
     | 
| 99 | 
         
            -
            languages = languages.sort_values(by="speakers", ascending=False)
         
     | 
| 100 | 
         
            -
            languages = languages.iloc[:30]
         
     | 
| 101 | 
         | 
| 102 | 
         
            -
            #  
     | 
| 103 | 
         
            -
            @cache 
     | 
| 104 | 
         
             
            def get_commonvoice_stats(date: date):
         
     | 
| 105 | 
         
             
                return get("https://commonvoice.mozilla.org/api/v1/stats/languages").json()
         
     | 
| 106 | 
         | 
| 107 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 108 | 
         | 
| 109 | 
         
             
            # sample languages to translate to
         
     | 
| 110 | 
         
             
            target_languages = languages[languages["in_benchmark"]].sample(
         
     | 
| 111 | 
         
             
                n=n_sentences, weights="speakers", replace=True, random_state=42
         
     | 
| 112 | 
         
             
            )
         
     | 
| 113 | 
         
             
            # sample languages to analyze with all models
         
     | 
| 114 | 
         
            -
            detailed_languages = languages[languages["in_benchmark"]].sample(
         
     | 
| 115 | 
         
            -
                n=10, random_state=42
         
     | 
| 116 | 
         
            -
            )
         
     | 
| 117 | 
         | 
| 118 | 
         | 
| 119 | 
         
             
            # utils
         
     | 
| 
         @@ -140,15 +149,14 @@ async def complete(**kwargs): 
     | 
|
| 140 | 
         
             
                    raise Exception(response)
         
     | 
| 141 | 
         
             
                return response
         
     | 
| 142 | 
         | 
| 143 | 
         
            -
             
     | 
| 144 | 
         
            -
             
     | 
| 145 | 
         
            -
            async def translate(model, target_language, target_script, sentence):
         
     | 
| 146 | 
         
             
                reply = await complete(
         
     | 
| 147 | 
         
             
                    model=model,
         
     | 
| 148 | 
         
             
                    messages=[
         
     | 
| 149 | 
         
             
                        {
         
     | 
| 150 | 
         
             
                            "role": "user",
         
     | 
| 151 | 
         
            -
                            "content": f"Translate the following text to the {target_language} language; use the { 
     | 
| 152 | 
         
             
                        }
         
     | 
| 153 | 
         
             
                    ],
         
     | 
| 154 | 
         
             
                    temperature=0,
         
     | 
| 
         @@ -162,40 +170,33 @@ def mean(l): 
     | 
|
| 162 | 
         | 
| 163 | 
         | 
| 164 | 
         
             
            def load_sentences(language):
         
     | 
| 165 | 
         
            -
                return open(
         
     | 
| 166 | 
         
            -
                    f"{benchmark_dir}/dev.{language.language_code}_{language.script_code}"
         
     | 
| 167 | 
         
            -
                ).readlines()
         
     | 
| 168 | 
         | 
| 169 | 
         | 
| 170 | 
         
             
            # evaluation!
         
     | 
| 171 | 
         
             
            async def main():
         
     | 
| 172 | 
         
             
                results = []
         
     | 
| 173 | 
         
             
                for language in list(languages.itertuples()):
         
     | 
| 174 | 
         
            -
                    name = (
         
     | 
| 175 | 
         
            -
                        language.language_name
         
     | 
| 176 | 
         
            -
                        if not pd.isna(language.language_name)
         
     | 
| 177 | 
         
            -
                        else language.language_code
         
     | 
| 178 | 
         
            -
                    )
         
     | 
| 179 | 
         
            -
                    print(name)
         
     | 
| 180 | 
         
             
                    scores = []
         
     | 
| 181 | 
         
             
                    if language.in_benchmark:
         
     | 
| 182 | 
         
             
                        original_sentences = load_sentences(language)[:n_sentences]
         
     | 
| 183 | 
         
             
                        for model in models:
         
     | 
| 184 | 
         
             
                            if (
         
     | 
| 185 | 
         
             
                                model != fast_model
         
     | 
| 186 | 
         
            -
                                and language. 
     | 
| 187 | 
         
            -
                                not in detailed_languages.language_code.values
         
     | 
| 188 | 
         
             
                            ):
         
     | 
| 189 | 
         
             
                                continue
         
     | 
| 190 | 
         
            -
                            
         
     | 
| 191 | 
         
            -
                            print(model)
         
     | 
| 192 | 
         
             
                            predictions = [
         
     | 
| 193 | 
         
             
                                translate(
         
     | 
| 194 | 
         
            -
                                    model, 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 195 | 
         
             
                                )
         
     | 
| 196 | 
         
            -
                                for sentence, language in zip(original_sentences, target_languages.itertuples())
         
     | 
| 197 | 
         
             
                            ]
         
     | 
| 198 | 
         
            -
                            predictions = await tqdm_asyncio.gather(*predictions, miniters=1)
         
     | 
| 199 | 
         
             
                            target_sentences = [
         
     | 
| 200 | 
         
             
                                load_sentences(lang)[i]
         
     | 
| 201 | 
         
             
                                for i, lang in enumerate(target_languages.itertuples())
         
     | 
| 
         @@ -217,17 +218,15 @@ async def main(): 
     | 
|
| 217 | 
         
             
                                    # "bert_score": mean(metrics_bert["f1"]),
         
     | 
| 218 | 
         
             
                                }
         
     | 
| 219 | 
         
             
                            )
         
     | 
| 220 | 
         
            -
                    commonvoice_hours = commonvoice_stats[commonvoice_stats["locale"] == language.iso639_1]["validatedHours"].values
         
     | 
| 221 | 
         
            -
                    commonvoice_hours = commonvoice_hours[0] if commonvoice_hours.size > 0 else "N/A"
         
     | 
| 222 | 
         
             
                    results.append(
         
     | 
| 223 | 
         
             
                        {
         
     | 
| 224 | 
         
            -
                            "language_name": name,
         
     | 
| 225 | 
         
            -
                            " 
     | 
| 226 | 
         
             
                            "speakers": language.speakers if not pd.isna(language.speakers) else 0,
         
     | 
| 227 | 
         
             
                            "scores": scores,
         
     | 
| 228 | 
         
             
                            "bleu": mean([s["bleu"] for s in scores]) if scores else None,
         
     | 
| 229 | 
         
             
                            # "bert_score": mean([s["bert_score"] for s in scores]),
         
     | 
| 230 | 
         
            -
                            "commonvoice_hours": commonvoice_hours,
         
     | 
| 231 | 
         
             
                        }
         
     | 
| 232 | 
         
             
                    )
         
     | 
| 233 | 
         
             
                with open("results.json", "w") as f:
         
     | 
| 
         | 
|
| 1 | 
         
             
            import asyncio
         
     | 
| 2 | 
         
             
            import json
         
     | 
| 3 | 
         
             
            import os
         
     | 
| 4 | 
         
            +
            import re
         
     | 
| 5 | 
         
             
            from os import getenv
         
     | 
| 6 | 
         | 
| 7 | 
         
             
            import evaluate
         
     | 
| 
         | 
|
| 15 | 
         
             
            from transformers import NllbTokenizer
         
     | 
| 16 | 
         
             
            from datetime import date
         
     | 
| 17 | 
         
             
            from requests import get
         
     | 
| 18 | 
         
            +
            from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
         
     | 
| 19 | 
         
            +
            from langcodes import standardize_tag, Language
         
     | 
| 20 | 
         | 
| 21 | 
         
             
            # config
         
     | 
| 22 | 
         
             
            models = [
         
     | 
| 23 | 
         
            +
                "openai/gpt-4o-mini",  # 0.6$/M tokens
         
     | 
| 24 | 
         
             
                # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive
         
     | 
| 25 | 
         
            +
                "meta-llama/llama-3.3-70b-instruct",  # 0.3$/M tokens
         
     | 
| 26 | 
         
            +
                "mistralai/mistral-small-24b-instruct-2501",  # 0.14$/M tokens
         
     | 
| 27 | 
         
            +
                "google/gemini-2.0-flash-001",  # 0.4$/M tokens
         
     | 
| 28 | 
         
             
                # "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
         
     | 
| 29 | 
         
            +
                "deepseek/deepseek-chat",  # 0.9$/M tokens
         
     | 
| 30 | 
         
            +
                "microsoft/phi-4",  # 0.07$/M tokens
         
     | 
| 31 | 
         
             
            ]
         
     | 
| 32 | 
         
             
            fast_model = "meta-llama/llama-3.3-70b-instruct"
         
     | 
| 33 | 
         
             
            n_sentences = 30
         
     | 
| 
         | 
|
| 50 | 
         
             
                    return language_name.split(",")[1] + " " + language_name.split(",")[0]
         
     | 
| 51 | 
         
             
                return language_name
         
     | 
| 52 | 
         | 
| 53 | 
         
            +
             
     | 
| 54 | 
         
            +
            # load general language data
         
     | 
| 55 | 
         
            +
            languages = {
         
     | 
| 56 | 
         
            +
                lang: pop
         
     | 
| 57 | 
         
            +
                for lang, pop in LANGUAGE_SPEAKING_POPULATION.items()
         
     | 
| 58 | 
         
            +
                if not re.match(r".*-[A-Z]{2}$", lang)
         
     | 
| 59 | 
         
            +
            }
         
     | 
| 60 | 
         
            +
            languages = pd.DataFrame(list(languages.items()), columns=["bcp_47", "speakers"])
         
     | 
| 61 | 
         
            +
            languages["name"] = languages["bcp_47"].apply(lambda x: Language.get(x).display_name())
         
     | 
| 62 | 
         
            +
             
     | 
| 63 | 
         
            +
            # load script codes and names
         
     | 
| 64 | 
         
            +
            scripts = pd.read_csv("data/ScriptCodes.csv").rename(columns={"Code": "iso15924", "English Name": "script_name"})
         
     | 
| 65 | 
         
            +
             
     | 
| 66 | 
         
            +
            def script_name(iso15924):
         
     | 
| 67 | 
         
            +
                return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
         
     | 
| 68 | 
         
            +
             
     | 
| 69 | 
         
             
            # load benchmark languages and scripts
         
     | 
| 70 | 
         
             
            benchmark_dir = "data/floresp-v2.0-rc.3/dev"
         
     | 
| 71 | 
         
             
            benchmark_languages = pd.DataFrame(
         
     | 
| 72 | 
         
             
                [f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
         
     | 
| 73 | 
         
            +
                columns=["iso639_3", "iso15924"],
         
     | 
| 74 | 
         
             
            )
         
     | 
| 75 | 
         
            +
            benchmark_languages["bcp_47"] = benchmark_languages.apply(
         
     | 
| 76 | 
         
            +
                lambda row: standardize_tag(row["iso639_3"] + "-" + row["iso15924"], macro=True),
         
     | 
| 77 | 
         
            +
                axis=1,
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 78 | 
         
             
            )
         
     | 
| 79 | 
         
            +
            # ignore script (language is language)
         
     | 
| 80 | 
         
            +
            benchmark_languages["bcp_47"] = benchmark_languages["bcp_47"].apply(
         
     | 
| 81 | 
         
            +
                lambda x: re.sub(r"-[A-Z][a-z]+$", "", x)
         
     | 
| 82 | 
         
            +
            )
         
     | 
| 83 | 
         
            +
            benchmark_languages = (
         
     | 
| 84 | 
         
            +
                benchmark_languages.groupby("bcp_47")
         
     | 
| 85 | 
         
            +
                .agg({"iso639_3": "first", "iso15924": "first"})
         
     | 
| 86 | 
         
            +
                .reset_index()
         
     | 
| 87 | 
         
             
            )
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 88 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 89 | 
         | 
| 90 | 
         
            +
            # load CommonVoice stats
         
     | 
| 91 | 
         
            +
            @cache  # cache for 1 day
         
     | 
| 92 | 
         
             
            def get_commonvoice_stats(date: date):
         
     | 
| 93 | 
         
             
                return get("https://commonvoice.mozilla.org/api/v1/stats/languages").json()
         
     | 
| 94 | 
         | 
| 95 | 
         
            +
             
     | 
| 96 | 
         
            +
            commonvoice_stats = pd.DataFrame(get_commonvoice_stats(date.today())).rename(
         
     | 
| 97 | 
         
            +
                columns={"locale": "bcp_47", "validatedHours": "commonvoice_hours"}
         
     | 
| 98 | 
         
            +
            )[["bcp_47", "commonvoice_hours"]]
         
     | 
| 99 | 
         
            +
            # ignore country (language is language) (in practice this is only relevant to zh-CN/zh-TW/zh-HK)
         
     | 
| 100 | 
         
            +
            commonvoice_stats["bcp_47"] = commonvoice_stats["bcp_47"].apply(
         
     | 
| 101 | 
         
            +
                lambda x: re.sub(r"-[A-Z]{2}$", "", x)
         
     | 
| 102 | 
         
            +
            )
         
     | 
| 103 | 
         
            +
            commonvoice_stats["bcp_47"] = commonvoice_stats["bcp_47"].apply(
         
     | 
| 104 | 
         
            +
                lambda x: standardize_tag(x, macro=True)
         
     | 
| 105 | 
         
            +
            )  # this does not really seem to get macrolanguages though, e.g. not for Quechua
         
     | 
| 106 | 
         
            +
            commonvoice_stats = commonvoice_stats.groupby("bcp_47").sum().reset_index()
         
     | 
| 107 | 
         
            +
             
     | 
| 108 | 
         
            +
            # merge data
         
     | 
| 109 | 
         
            +
            languages = pd.merge(
         
     | 
| 110 | 
         
            +
                languages, benchmark_languages, on="bcp_47", how="left"
         
     | 
| 111 | 
         
            +
            )  # "left" because keep it simple for now
         
     | 
| 112 | 
         
            +
            languages = pd.merge(
         
     | 
| 113 | 
         
            +
                languages, commonvoice_stats, on="bcp_47", how="left"
         
     | 
| 114 | 
         
            +
            )  # "left" because keep it simple for now
         
     | 
| 115 | 
         
            +
            languages["in_benchmark"] = languages["bcp_47"].isin(benchmark_languages["bcp_47"])
         
     | 
| 116 | 
         
            +
             
     | 
| 117 | 
         
            +
            languages = languages.sort_values(by="speakers", ascending=False)
         
     | 
| 118 | 
         
            +
            languages = languages.iloc[:10]
         
     | 
| 119 | 
         | 
| 120 | 
         
             
            # sample languages to translate to
         
     | 
| 121 | 
         
             
            target_languages = languages[languages["in_benchmark"]].sample(
         
     | 
| 122 | 
         
             
                n=n_sentences, weights="speakers", replace=True, random_state=42
         
     | 
| 123 | 
         
             
            )
         
     | 
| 124 | 
         
             
            # sample languages to analyze with all models
         
     | 
| 125 | 
         
            +
            detailed_languages = languages[languages["in_benchmark"]].sample(n=3, random_state=42)
         
     | 
| 
         | 
|
| 
         | 
|
| 126 | 
         | 
| 127 | 
         | 
| 128 | 
         
             
            # utils
         
     | 
| 
         | 
|
| 149 | 
         
             
                    raise Exception(response)
         
     | 
| 150 | 
         
             
                return response
         
     | 
| 151 | 
         | 
| 152 | 
         
            +
            async def translate(model, target_language, sentence):
         
     | 
| 153 | 
         
            +
                script = script_name(target_language.iso15924)
         
     | 
| 
         | 
|
| 154 | 
         
             
                reply = await complete(
         
     | 
| 155 | 
         
             
                    model=model,
         
     | 
| 156 | 
         
             
                    messages=[
         
     | 
| 157 | 
         
             
                        {
         
     | 
| 158 | 
         
             
                            "role": "user",
         
     | 
| 159 | 
         
            +
                            "content": f"Translate the following text to the {target_language.name} language; use the {script} script; reply only with the translation:\n\n{sentence}",
         
     | 
| 160 | 
         
             
                        }
         
     | 
| 161 | 
         
             
                    ],
         
     | 
| 162 | 
         
             
                    temperature=0,
         
     | 
| 
         | 
|
| 170 | 
         | 
| 171 | 
         | 
| 172 | 
         
             
            def load_sentences(language):
         
     | 
| 173 | 
         
            +
                return open(f"{benchmark_dir}/dev.{language.iso639_3}_{language.iso15924}").readlines()
         
     | 
| 
         | 
|
| 
         | 
|
| 174 | 
         | 
| 175 | 
         | 
| 176 | 
         
             
            # evaluation!
         
     | 
| 177 | 
         
             
            async def main():
         
     | 
| 178 | 
         
             
                results = []
         
     | 
| 179 | 
         
             
                for language in list(languages.itertuples()):
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 180 | 
         
             
                    scores = []
         
     | 
| 181 | 
         
             
                    if language.in_benchmark:
         
     | 
| 182 | 
         
             
                        original_sentences = load_sentences(language)[:n_sentences]
         
     | 
| 183 | 
         
             
                        for model in models:
         
     | 
| 184 | 
         
             
                            if (
         
     | 
| 185 | 
         
             
                                model != fast_model
         
     | 
| 186 | 
         
            +
                                and language.bcp_47 not in detailed_languages.bcp_47.values
         
     | 
| 
         | 
|
| 187 | 
         
             
                            ):
         
     | 
| 188 | 
         
             
                                continue
         
     | 
| 
         | 
|
| 
         | 
|
| 189 | 
         
             
                            predictions = [
         
     | 
| 190 | 
         
             
                                translate(
         
     | 
| 191 | 
         
            +
                                    model,
         
     | 
| 192 | 
         
            +
                                    language,
         
     | 
| 193 | 
         
            +
                                    sentence,
         
     | 
| 194 | 
         
            +
                                )
         
     | 
| 195 | 
         
            +
                                for sentence, language in zip(
         
     | 
| 196 | 
         
            +
                                    original_sentences, target_languages.itertuples()
         
     | 
| 197 | 
         
             
                                )
         
     | 
| 
         | 
|
| 198 | 
         
             
                            ]
         
     | 
| 199 | 
         
            +
                            predictions = await tqdm_asyncio.gather(*predictions, miniters=1, desc=f"{language.name} {model.split('/')[0]}")
         
     | 
| 200 | 
         
             
                            target_sentences = [
         
     | 
| 201 | 
         
             
                                load_sentences(lang)[i]
         
     | 
| 202 | 
         
             
                                for i, lang in enumerate(target_languages.itertuples())
         
     | 
| 
         | 
|
| 218 | 
         
             
                                    # "bert_score": mean(metrics_bert["f1"]),
         
     | 
| 219 | 
         
             
                                }
         
     | 
| 220 | 
         
             
                            )
         
     | 
| 
         | 
|
| 
         | 
|
| 221 | 
         
             
                    results.append(
         
     | 
| 222 | 
         
             
                        {
         
     | 
| 223 | 
         
            +
                            "language_name": language.name,
         
     | 
| 224 | 
         
            +
                            "bcp_47": language.bcp_47,
         
     | 
| 225 | 
         
             
                            "speakers": language.speakers if not pd.isna(language.speakers) else 0,
         
     | 
| 226 | 
         
             
                            "scores": scores,
         
     | 
| 227 | 
         
             
                            "bleu": mean([s["bleu"] for s in scores]) if scores else None,
         
     | 
| 228 | 
         
             
                            # "bert_score": mean([s["bert_score"] for s in scores]),
         
     | 
| 229 | 
         
            +
                            "commonvoice_hours": language.commonvoice_hours,
         
     | 
| 230 | 
         
             
                        }
         
     | 
| 231 | 
         
             
                    )
         
     | 
| 232 | 
         
             
                with open("results.json", "w") as f:
         
     | 
    	
        pyproject.toml
    CHANGED
    
    | 
         @@ -16,6 +16,7 @@ dev-dependencies = [ 
     | 
|
| 16 | 
         
             
                "bert-score>=0.3.13",
         
     | 
| 17 | 
         
             
                "evaluate==0.4.0",
         
     | 
| 18 | 
         
             
                "joblib>=1.4.2",
         
     | 
| 
         | 
|
| 19 | 
         
             
                "openai>=1.52.2",
         
     | 
| 20 | 
         
             
                "protobuf>=5.28.3",
         
     | 
| 21 | 
         
             
                "python-dotenv>=1.0.1",
         
     | 
| 
         | 
|
| 16 | 
         
             
                "bert-score>=0.3.13",
         
     | 
| 17 | 
         
             
                "evaluate==0.4.0",
         
     | 
| 18 | 
         
             
                "joblib>=1.4.2",
         
     | 
| 19 | 
         
            +
                "langcodes>=3.5.0",
         
     | 
| 20 | 
         
             
                "openai>=1.52.2",
         
     | 
| 21 | 
         
             
                "protobuf>=5.28.3",
         
     | 
| 22 | 
         
             
                "python-dotenv>=1.0.1",
         
     | 
    	
        results.json
    CHANGED
    
    | 
         @@ -1,582 +1,192 @@ 
     | 
|
| 1 | 
         
             
            [
         
     | 
| 2 | 
         
             
              {
         
     | 
| 3 | 
         
             
                "language_name": "English",
         
     | 
| 4 | 
         
            -
                " 
     | 
| 5 | 
         
            -
                "speakers":  
     | 
| 6 | 
         
             
                "scores": [
         
     | 
| 7 | 
         
            -
                  {
         
     | 
| 8 | 
         
            -
                    "model": "openai/gpt-4o-mini",
         
     | 
| 9 | 
         
            -
                    "bleu": 0.47104084248165595
         
     | 
| 10 | 
         
            -
                  },
         
     | 
| 11 | 
         
             
                  {
         
     | 
| 12 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 13 | 
         
            -
                    "bleu": 0. 
     | 
| 14 | 
         
            -
                  },
         
     | 
| 15 | 
         
            -
                  {
         
     | 
| 16 | 
         
            -
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 17 | 
         
            -
                    "bleu": 0.4642719176436136
         
     | 
| 18 | 
         
            -
                  },
         
     | 
| 19 | 
         
            -
                  {
         
     | 
| 20 | 
         
            -
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 21 | 
         
            -
                    "bleu": 0.5237470882988915
         
     | 
| 22 | 
         
            -
                  },
         
     | 
| 23 | 
         
            -
                  {
         
     | 
| 24 | 
         
            -
                    "model": "deepseek/deepseek-chat",
         
     | 
| 25 | 
         
            -
                    "bleu": 0.516570670982587
         
     | 
| 26 | 
         
            -
                  },
         
     | 
| 27 | 
         
            -
                  {
         
     | 
| 28 | 
         
            -
                    "model": "microsoft/phi-4",
         
     | 
| 29 | 
         
            -
                    "bleu": 0.44668905281921456
         
     | 
| 30 | 
         
             
                  }
         
     | 
| 31 | 
         
             
                ],
         
     | 
| 32 | 
         
            -
                "bleu": 0. 
     | 
| 33 | 
         
             
                "commonvoice_hours": 2649.0
         
     | 
| 34 | 
         
             
              },
         
     | 
| 35 | 
         
             
              {
         
     | 
| 36 | 
         
            -
                "language_name": " 
     | 
| 37 | 
         
            -
                " 
     | 
| 38 | 
         
            -
                "speakers":  
     | 
| 39 | 
         
            -
                "scores": [
         
     | 
| 40 | 
         
            -
                  {
         
     | 
| 41 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 42 | 
         
            -
                    "bleu": 0.48254866511762295
         
     | 
| 43 | 
         
            -
                  }
         
     | 
| 44 | 
         
            -
                ],
         
     | 
| 45 | 
         
            -
                "bleu": 0.48254866511762295,
         
     | 
| 46 | 
         
            -
                "commonvoice_hours": "N/A"
         
     | 
| 47 | 
         
            -
              },
         
     | 
| 48 | 
         
            -
              {
         
     | 
| 49 | 
         
            -
                "language_name": "Spanish",
         
     | 
| 50 | 
         
            -
                "language_code": "spa",
         
     | 
| 51 | 
         
            -
                "speakers": 485000000.0,
         
     | 
| 52 | 
         
            -
                "scores": [
         
     | 
| 53 | 
         
            -
                  {
         
     | 
| 54 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 55 | 
         
            -
                    "bleu": 0.31606621368361204
         
     | 
| 56 | 
         
            -
                  }
         
     | 
| 57 | 
         
            -
                ],
         
     | 
| 58 | 
         
            -
                "bleu": 0.31606621368361204,
         
     | 
| 59 | 
         
            -
                "commonvoice_hours": 446.0
         
     | 
| 60 | 
         
            -
              },
         
     | 
| 61 | 
         
            -
              {
         
     | 
| 62 | 
         
            -
                "language_name": "Hindi",
         
     | 
| 63 | 
         
            -
                "language_code": "hin",
         
     | 
| 64 | 
         
            -
                "speakers": 341000000.0,
         
     | 
| 65 | 
         
            -
                "scores": [
         
     | 
| 66 | 
         
            -
                  {
         
     | 
| 67 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 68 | 
         
            -
                    "bleu": 0.3273225856613046
         
     | 
| 69 | 
         
            -
                  }
         
     | 
| 70 | 
         
            -
                ],
         
     | 
| 71 | 
         
            -
                "bleu": 0.3273225856613046,
         
     | 
| 72 | 
         
            -
                "commonvoice_hours": 16.0
         
     | 
| 73 | 
         
            -
              },
         
     | 
| 74 | 
         
            -
              {
         
     | 
| 75 | 
         
            -
                "language_name": "Bengali",
         
     | 
| 76 | 
         
            -
                "language_code": "ben",
         
     | 
| 77 | 
         
            -
                "speakers": 300000000.0,
         
     | 
| 78 | 
         
            -
                "scores": [
         
     | 
| 79 | 
         
            -
                  {
         
     | 
| 80 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 81 | 
         
            -
                    "bleu": 0.23110496173302814
         
     | 
| 82 | 
         
            -
                  }
         
     | 
| 83 | 
         
            -
                ],
         
     | 
| 84 | 
         
            -
                "bleu": 0.23110496173302814,
         
     | 
| 85 | 
         
            -
                "commonvoice_hours": 49.0
         
     | 
| 86 | 
         
            -
              },
         
     | 
| 87 | 
         
            -
              {
         
     | 
| 88 | 
         
            -
                "language_name": "Portuguese",
         
     | 
| 89 | 
         
            -
                "language_code": "por",
         
     | 
| 90 | 
         
            -
                "speakers": 254300000.0,
         
     | 
| 91 | 
         
            -
                "scores": [
         
     | 
| 92 | 
         
            -
                  {
         
     | 
| 93 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 94 | 
         
            -
                    "bleu": 0.35032125995743685
         
     | 
| 95 | 
         
            -
                  }
         
     | 
| 96 | 
         
            -
                ],
         
     | 
| 97 | 
         
            -
                "bleu": 0.35032125995743685,
         
     | 
| 98 | 
         
            -
                "commonvoice_hours": 176.0
         
     | 
| 99 | 
         
            -
              },
         
     | 
| 100 | 
         
            -
              {
         
     | 
| 101 | 
         
            -
                "language_name": "French",
         
     | 
| 102 | 
         
            -
                "language_code": "fra",
         
     | 
| 103 | 
         
            -
                "speakers": 208157220.0,
         
     | 
| 104 | 
         
            -
                "scores": [
         
     | 
| 105 | 
         
            -
                  {
         
     | 
| 106 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 107 | 
         
            -
                    "bleu": 0.31625053573185663
         
     | 
| 108 | 
         
            -
                  }
         
     | 
| 109 | 
         
            -
                ],
         
     | 
| 110 | 
         
            -
                "bleu": 0.31625053573185663,
         
     | 
| 111 | 
         
            -
                "commonvoice_hours": 1051.0
         
     | 
| 112 | 
         
            -
              },
         
     | 
| 113 | 
         
            -
              {
         
     | 
| 114 | 
         
            -
                "language_name": "Indonesian",
         
     | 
| 115 | 
         
            -
                "language_code": "ind",
         
     | 
| 116 | 
         
            -
                "speakers": 198996550.0,
         
     | 
| 117 | 
         
            -
                "scores": [
         
     | 
| 118 | 
         
            -
                  {
         
     | 
| 119 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 120 | 
         
            -
                    "bleu": 0.3112185444311794
         
     | 
| 121 | 
         
            -
                  }
         
     | 
| 122 | 
         
            -
                ],
         
     | 
| 123 | 
         
            -
                "bleu": 0.3112185444311794,
         
     | 
| 124 | 
         
            -
                "commonvoice_hours": 33.0
         
     | 
| 125 | 
         
            -
              },
         
     | 
| 126 | 
         
            -
              {
         
     | 
| 127 | 
         
            -
                "language_name": "Russian",
         
     | 
| 128 | 
         
            -
                "language_code": "rus",
         
     | 
| 129 | 
         
            -
                "speakers": 171428900.0,
         
     | 
| 130 | 
         
             
                "scores": [
         
     | 
| 131 | 
         
             
                  {
         
     | 
| 132 | 
         
             
                    "model": "openai/gpt-4o-mini",
         
     | 
| 133 | 
         
            -
                    "bleu": 0. 
     | 
| 134 | 
         
             
                  },
         
     | 
| 135 | 
         
             
                  {
         
     | 
| 136 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 137 | 
         
            -
                    "bleu": 0. 
     | 
| 138 | 
         
             
                  },
         
     | 
| 139 | 
         
             
                  {
         
     | 
| 140 | 
         
             
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 141 | 
         
            -
                    "bleu": 0. 
     | 
| 142 | 
         
             
                  },
         
     | 
| 143 | 
         
             
                  {
         
     | 
| 144 | 
         
             
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 145 | 
         
            -
                    "bleu": 0. 
     | 
| 146 | 
         
             
                  },
         
     | 
| 147 | 
         
             
                  {
         
     | 
| 148 | 
         
             
                    "model": "deepseek/deepseek-chat",
         
     | 
| 149 | 
         
            -
                    "bleu": 0. 
     | 
| 150 | 
         
             
                  },
         
     | 
| 151 | 
         
             
                  {
         
     | 
| 152 | 
         
             
                    "model": "microsoft/phi-4",
         
     | 
| 153 | 
         
            -
                    "bleu": 0. 
     | 
| 154 | 
         
             
                  }
         
     | 
| 155 | 
         
             
                ],
         
     | 
| 156 | 
         
            -
                "bleu": 0. 
     | 
| 157 | 
         
            -
                "commonvoice_hours":  
     | 
| 158 | 
         
             
              },
         
     | 
| 159 | 
         
             
              {
         
     | 
| 160 | 
         
            -
                "language_name": " 
     | 
| 161 | 
         
            -
                " 
     | 
| 162 | 
         
            -
                "speakers":  
     | 
| 163 | 
         
             
                "scores": [
         
     | 
| 164 | 
         
            -
                  {
         
     | 
| 165 | 
         
            -
                    "model": "openai/gpt-4o-mini",
         
     | 
| 166 | 
         
            -
                    "bleu": 0.28991739992953497
         
     | 
| 167 | 
         
            -
                  },
         
     | 
| 168 | 
         
             
                  {
         
     | 
| 169 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 170 | 
         
            -
                    "bleu": 0. 
     | 
| 171 | 
         
            -
                  },
         
     | 
| 172 | 
         
            -
                  {
         
     | 
| 173 | 
         
            -
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 174 | 
         
            -
                    "bleu": 0.21348802780641032
         
     | 
| 175 | 
         
            -
                  },
         
     | 
| 176 | 
         
            -
                  {
         
     | 
| 177 | 
         
            -
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 178 | 
         
            -
                    "bleu": 0.3345265427223546
         
     | 
| 179 | 
         
            -
                  },
         
     | 
| 180 | 
         
            -
                  {
         
     | 
| 181 | 
         
            -
                    "model": "deepseek/deepseek-chat",
         
     | 
| 182 | 
         
            -
                    "bleu": 0.3101203037558905
         
     | 
| 183 | 
         
            -
                  },
         
     | 
| 184 | 
         
            -
                  {
         
     | 
| 185 | 
         
            -
                    "model": "microsoft/phi-4",
         
     | 
| 186 | 
         
            -
                    "bleu": 0.2585222780278109
         
     | 
| 187 | 
         
             
                  }
         
     | 
| 188 | 
         
             
                ],
         
     | 
| 189 | 
         
            -
                "bleu": 0. 
     | 
| 190 | 
         
            -
                "commonvoice_hours":  
     | 
| 191 | 
         
             
              },
         
     | 
| 192 | 
         
             
              {
         
     | 
| 193 | 
         
            -
                "language_name": " 
     | 
| 194 | 
         
            -
                " 
     | 
| 195 | 
         
            -
                "speakers":  
     | 
| 196 | 
         
             
                "scores": [
         
     | 
| 197 | 
         
             
                  {
         
     | 
| 198 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 199 | 
         
            -
                    "bleu": 0. 
     | 
| 200 | 
         
             
                  }
         
     | 
| 201 | 
         
             
                ],
         
     | 
| 202 | 
         
            -
                "bleu": 0. 
     | 
| 203 | 
         
            -
                "commonvoice_hours":  
     | 
| 204 | 
         
             
              },
         
     | 
| 205 | 
         
             
              {
         
     | 
| 206 | 
         
            -
                "language_name": " 
     | 
| 207 | 
         
            -
                " 
     | 
| 208 | 
         
            -
                "speakers":  
     | 
| 209 | 
         
             
                "scores": [
         
     | 
| 210 | 
         
            -
                  {
         
     | 
| 211 | 
         
            -
                    "model": "openai/gpt-4o-mini",
         
     | 
| 212 | 
         
            -
                    "bleu": 0.39019323183176663
         
     | 
| 213 | 
         
            -
                  },
         
     | 
| 214 | 
         
             
                  {
         
     | 
| 215 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 216 | 
         
            -
                    "bleu": 0. 
     | 
| 217 | 
         
            -
                  },
         
     | 
| 218 | 
         
            -
                  {
         
     | 
| 219 | 
         
            -
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 220 | 
         
            -
                    "bleu": 0.3647632576435612
         
     | 
| 221 | 
         
            -
                  },
         
     | 
| 222 | 
         
            -
                  {
         
     | 
| 223 | 
         
            -
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 224 | 
         
            -
                    "bleu": 0.4466723425292597
         
     | 
| 225 | 
         
            -
                  },
         
     | 
| 226 | 
         
            -
                  {
         
     | 
| 227 | 
         
            -
                    "model": "deepseek/deepseek-chat",
         
     | 
| 228 | 
         
            -
                    "bleu": 0.4045496243095387
         
     | 
| 229 | 
         
            -
                  },
         
     | 
| 230 | 
         
            -
                  {
         
     | 
| 231 | 
         
            -
                    "model": "microsoft/phi-4",
         
     | 
| 232 | 
         
            -
                    "bleu": 0.36047992103881465
         
     | 
| 233 | 
         
             
                  }
         
     | 
| 234 | 
         
             
                ],
         
     | 
| 235 | 
         
            -
                "bleu": 0. 
     | 
| 236 | 
         
            -
                "commonvoice_hours":  
     | 
| 237 | 
         
            -
              },
         
     | 
| 238 | 
         
            -
              {
         
     | 
| 239 | 
         
            -
                "language_name": "Egyptian Arabic",
         
     | 
| 240 | 
         
            -
                "language_code": "arz",
         
     | 
| 241 | 
         
            -
                "speakers": 100542400.0,
         
     | 
| 242 | 
         
            -
                "scores": [
         
     | 
| 243 | 
         
            -
                  {
         
     | 
| 244 | 
         
            -
                    "model": "openai/gpt-4o-mini",
         
     | 
| 245 | 
         
            -
                    "bleu": 0.2339779422333898
         
     | 
| 246 | 
         
            -
                  },
         
     | 
| 247 | 
         
            -
                  {
         
     | 
| 248 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 249 | 
         
            -
                    "bleu": 0.20475486619797384
         
     | 
| 250 | 
         
            -
                  },
         
     | 
| 251 | 
         
            -
                  {
         
     | 
| 252 | 
         
            -
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 253 | 
         
            -
                    "bleu": 0.20783660453505234
         
     | 
| 254 | 
         
            -
                  },
         
     | 
| 255 | 
         
            -
                  {
         
     | 
| 256 | 
         
            -
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 257 | 
         
            -
                    "bleu": 0.2840808045687292
         
     | 
| 258 | 
         
            -
                  },
         
     | 
| 259 | 
         
            -
                  {
         
     | 
| 260 | 
         
            -
                    "model": "deepseek/deepseek-chat",
         
     | 
| 261 | 
         
            -
                    "bleu": 0.2786287793608212
         
     | 
| 262 | 
         
            -
                  },
         
     | 
| 263 | 
         
            -
                  {
         
     | 
| 264 | 
         
            -
                    "model": "microsoft/phi-4",
         
     | 
| 265 | 
         
            -
                    "bleu": 0.19969813973959594
         
     | 
| 266 | 
         
            -
                  }
         
     | 
| 267 | 
         
            -
                ],
         
     | 
| 268 | 
         
            -
                "bleu": 0.23482952277259375,
         
     | 
| 269 | 
         
            -
                "commonvoice_hours": "N/A"
         
     | 
| 270 | 
         
             
              },
         
     | 
| 271 | 
         
             
              {
         
     | 
| 272 | 
         
             
                "language_name": "Urdu",
         
     | 
| 273 | 
         
            -
                " 
     | 
| 274 | 
         
            -
                "speakers":  
     | 
| 275 | 
         
             
                "scores": [
         
     | 
| 276 | 
         
             
                  {
         
     | 
| 277 | 
         
             
                    "model": "openai/gpt-4o-mini",
         
     | 
| 278 | 
         
            -
                    "bleu": 0. 
     | 
| 279 | 
         
             
                  },
         
     | 
| 280 | 
         
             
                  {
         
     | 
| 281 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 282 | 
         
            -
                    "bleu": 0. 
     | 
| 283 | 
         
             
                  },
         
     | 
| 284 | 
         
             
                  {
         
     | 
| 285 | 
         
             
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 286 | 
         
            -
                    "bleu": 0. 
     | 
| 287 | 
         
             
                  },
         
     | 
| 288 | 
         
             
                  {
         
     | 
| 289 | 
         
             
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 290 | 
         
            -
                    "bleu": 0. 
     | 
| 291 | 
         
             
                  },
         
     | 
| 292 | 
         
             
                  {
         
     | 
| 293 | 
         
             
                    "model": "deepseek/deepseek-chat",
         
     | 
| 294 | 
         
            -
                    "bleu": 0. 
     | 
| 295 | 
         
             
                  },
         
     | 
| 296 | 
         
             
                  {
         
     | 
| 297 | 
         
             
                    "model": "microsoft/phi-4",
         
     | 
| 298 | 
         
            -
                    "bleu": 0. 
     | 
| 299 | 
         
             
                  }
         
     | 
| 300 | 
         
             
                ],
         
     | 
| 301 | 
         
            -
                "bleu": 0. 
     | 
| 302 | 
         
             
                "commonvoice_hours": 76.0
         
     | 
| 303 | 
         
             
              },
         
     | 
| 304 | 
         
             
              {
         
     | 
| 305 | 
         
            -
                "language_name": " 
     | 
| 306 | 
         
            -
                " 
     | 
| 307 | 
         
            -
                "speakers":  
     | 
| 308 | 
         
            -
                "scores": [
         
     | 
| 309 | 
         
            -
                  {
         
     | 
| 310 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 311 | 
         
            -
                    "bleu": 0.33268969497468076
         
     | 
| 312 | 
         
            -
                  }
         
     | 
| 313 | 
         
            -
                ],
         
     | 
| 314 | 
         
            -
                "bleu": 0.33268969497468076,
         
     | 
| 315 | 
         
            -
                "commonvoice_hours": "N/A"
         
     | 
| 316 | 
         
            -
              },
         
     | 
| 317 | 
         
            -
              {
         
     | 
| 318 | 
         
            -
                "language_name": "Javanese",
         
     | 
| 319 | 
         
            -
                "language_code": "jav",
         
     | 
| 320 | 
         
            -
                "speakers": 84308740.0,
         
     | 
| 321 | 
         
            -
                "scores": [
         
     | 
| 322 | 
         
            -
                  {
         
     | 
| 323 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 324 | 
         
            -
                    "bleu": 0.2528746866064681
         
     | 
| 325 | 
         
            -
                  }
         
     | 
| 326 | 
         
            -
                ],
         
     | 
| 327 | 
         
            -
                "bleu": 0.2528746866064681,
         
     | 
| 328 | 
         
            -
                "commonvoice_hours": 0.0
         
     | 
| 329 | 
         
            -
              },
         
     | 
| 330 | 
         
            -
              {
         
     | 
| 331 | 
         
            -
                "language_name": "Marathi",
         
     | 
| 332 | 
         
            -
                "language_code": "mar",
         
     | 
| 333 | 
         
            -
                "speakers": 83100000.0,
         
     | 
| 334 | 
         
            -
                "scores": [
         
     | 
| 335 | 
         
            -
                  {
         
     | 
| 336 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 337 | 
         
            -
                    "bleu": 0.24876051941895777
         
     | 
| 338 | 
         
            -
                  }
         
     | 
| 339 | 
         
            -
                ],
         
     | 
| 340 | 
         
            -
                "bleu": 0.24876051941895777,
         
     | 
| 341 | 
         
            -
                "commonvoice_hours": 20.0
         
     | 
| 342 | 
         
            -
              },
         
     | 
| 343 | 
         
            -
              {
         
     | 
| 344 | 
         
            -
                "language_name": "Swahili",
         
     | 
| 345 | 
         
            -
                "language_code": "swh",
         
     | 
| 346 | 
         
            -
                "speakers": 82300000.0,
         
     | 
| 347 | 
         
            -
                "scores": [
         
     | 
| 348 | 
         
            -
                  {
         
     | 
| 349 | 
         
            -
                    "model": "openai/gpt-4o-mini",
         
     | 
| 350 | 
         
            -
                    "bleu": 0.34863560100932933
         
     | 
| 351 | 
         
            -
                  },
         
     | 
| 352 | 
         
            -
                  {
         
     | 
| 353 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 354 | 
         
            -
                    "bleu": 0.30524292832054034
         
     | 
| 355 | 
         
            -
                  },
         
     | 
| 356 | 
         
            -
                  {
         
     | 
| 357 | 
         
            -
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 358 | 
         
            -
                    "bleu": 0.23580256234118713
         
     | 
| 359 | 
         
            -
                  },
         
     | 
| 360 | 
         
            -
                  {
         
     | 
| 361 | 
         
            -
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 362 | 
         
            -
                    "bleu": 0.3871437234807849
         
     | 
| 363 | 
         
            -
                  },
         
     | 
| 364 | 
         
            -
                  {
         
     | 
| 365 | 
         
            -
                    "model": "deepseek/deepseek-chat",
         
     | 
| 366 | 
         
            -
                    "bleu": 0.3476225063617937
         
     | 
| 367 | 
         
            -
                  },
         
     | 
| 368 | 
         
            -
                  {
         
     | 
| 369 | 
         
            -
                    "model": "microsoft/phi-4",
         
     | 
| 370 | 
         
            -
                    "bleu": 0.21803176063271826
         
     | 
| 371 | 
         
            -
                  }
         
     | 
| 372 | 
         
            -
                ],
         
     | 
| 373 | 
         
            -
                "bleu": 0.3070798470243923,
         
     | 
| 374 | 
         
            -
                "commonvoice_hours": "N/A"
         
     | 
| 375 | 
         
            -
              },
         
     | 
| 376 | 
         
            -
              {
         
     | 
| 377 | 
         
            -
                "language_name": "Turkish",
         
     | 
| 378 | 
         
            -
                "language_code": "tur",
         
     | 
| 379 | 
         
            -
                "speakers": 82231620.0,
         
     | 
| 380 | 
         
            -
                "scores": [
         
     | 
| 381 | 
         
            -
                  {
         
     | 
| 382 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 383 | 
         
            -
                    "bleu": 0.29874140544434125
         
     | 
| 384 | 
         
            -
                  }
         
     | 
| 385 | 
         
            -
                ],
         
     | 
| 386 | 
         
            -
                "bleu": 0.29874140544434125,
         
     | 
| 387 | 
         
            -
                "commonvoice_hours": 127.0
         
     | 
| 388 | 
         
            -
              },
         
     | 
| 389 | 
         
            -
              {
         
     | 
| 390 | 
         
            -
                "language_name": "Telugu",
         
     | 
| 391 | 
         
            -
                "language_code": "tel",
         
     | 
| 392 | 
         
            -
                "speakers": 82000000.0,
         
     | 
| 393 | 
         
            -
                "scores": [
         
     | 
| 394 | 
         
            -
                  {
         
     | 
| 395 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 396 | 
         
            -
                    "bleu": 0.28869836899054496
         
     | 
| 397 | 
         
            -
                  }
         
     | 
| 398 | 
         
            -
                ],
         
     | 
| 399 | 
         
            -
                "bleu": 0.28869836899054496,
         
     | 
| 400 | 
         
            -
                "commonvoice_hours": 0.3
         
     | 
| 401 | 
         
            -
              },
         
     | 
| 402 | 
         
            -
              {
         
     | 
| 403 | 
         
            -
                "language_name": "Wu Chinese",
         
     | 
| 404 | 
         
            -
                "language_code": "wuu",
         
     | 
| 405 | 
         
            -
                "speakers": 81400000.0,
         
     | 
| 406 | 
         
            -
                "scores": [],
         
     | 
| 407 | 
         
            -
                "bleu": null,
         
     | 
| 408 | 
         
            -
                "commonvoice_hours": "N/A"
         
     | 
| 409 | 
         
            -
              },
         
     | 
| 410 | 
         
            -
              {
         
     | 
| 411 | 
         
            -
                "language_name": "Korean",
         
     | 
| 412 | 
         
            -
                "language_code": "kor",
         
     | 
| 413 | 
         
            -
                "speakers": 77300000.0,
         
     | 
| 414 | 
         
            -
                "scores": [
         
     | 
| 415 | 
         
            -
                  {
         
     | 
| 416 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 417 | 
         
            -
                    "bleu": 0.2566453806044083
         
     | 
| 418 | 
         
            -
                  }
         
     | 
| 419 | 
         
            -
                ],
         
     | 
| 420 | 
         
            -
                "bleu": 0.2566453806044083,
         
     | 
| 421 | 
         
            -
                "commonvoice_hours": 1.7
         
     | 
| 422 | 
         
            -
              },
         
     | 
| 423 | 
         
            -
              {
         
     | 
| 424 | 
         
            -
                "language_name": "Vietnamese",
         
     | 
| 425 | 
         
            -
                "language_code": "vie",
         
     | 
| 426 | 
         
            -
                "speakers": 76000000.0,
         
     | 
| 427 | 
         
            -
                "scores": [
         
     | 
| 428 | 
         
            -
                  {
         
     | 
| 429 | 
         
            -
                    "model": "openai/gpt-4o-mini",
         
     | 
| 430 | 
         
            -
                    "bleu": 0.3104431723374164
         
     | 
| 431 | 
         
            -
                  },
         
     | 
| 432 | 
         
            -
                  {
         
     | 
| 433 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 434 | 
         
            -
                    "bleu": 0.3098478561790782
         
     | 
| 435 | 
         
            -
                  },
         
     | 
| 436 | 
         
            -
                  {
         
     | 
| 437 | 
         
            -
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 438 | 
         
            -
                    "bleu": 0.28074941515909896
         
     | 
| 439 | 
         
            -
                  },
         
     | 
| 440 | 
         
            -
                  {
         
     | 
| 441 | 
         
            -
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 442 | 
         
            -
                    "bleu": 0.37327273228460267
         
     | 
| 443 | 
         
            -
                  },
         
     | 
| 444 | 
         
            -
                  {
         
     | 
| 445 | 
         
            -
                    "model": "deepseek/deepseek-chat",
         
     | 
| 446 | 
         
            -
                    "bleu": 0.3487726531917602
         
     | 
| 447 | 
         
            -
                  },
         
     | 
| 448 | 
         
            -
                  {
         
     | 
| 449 | 
         
            -
                    "model": "microsoft/phi-4",
         
     | 
| 450 | 
         
            -
                    "bleu": 0.18355331419148843
         
     | 
| 451 | 
         
            -
                  }
         
     | 
| 452 | 
         
            -
                ],
         
     | 
| 453 | 
         
            -
                "bleu": 0.3011065238905742,
         
     | 
| 454 | 
         
            -
                "commonvoice_hours": 5.9
         
     | 
| 455 | 
         
            -
              },
         
     | 
| 456 | 
         
            -
              {
         
     | 
| 457 | 
         
            -
                "language_name": "Tamil",
         
     | 
| 458 | 
         
            -
                "language_code": "tam",
         
     | 
| 459 | 
         
            -
                "speakers": 75000000.0,
         
     | 
| 460 | 
         
            -
                "scores": [
         
     | 
| 461 | 
         
            -
                  {
         
     | 
| 462 | 
         
            -
                    "model": "openai/gpt-4o-mini",
         
     | 
| 463 | 
         
            -
                    "bleu": 0.24593649157372188
         
     | 
| 464 | 
         
            -
                  },
         
     | 
| 465 | 
         
            -
                  {
         
     | 
| 466 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 467 | 
         
            -
                    "bleu": 0.24009996232522382
         
     | 
| 468 | 
         
            -
                  },
         
     | 
| 469 | 
         
            -
                  {
         
     | 
| 470 | 
         
            -
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 471 | 
         
            -
                    "bleu": 0.16785828803139252
         
     | 
| 472 | 
         
            -
                  },
         
     | 
| 473 | 
         
            -
                  {
         
     | 
| 474 | 
         
            -
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 475 | 
         
            -
                    "bleu": 0.3411457686951495
         
     | 
| 476 | 
         
            -
                  },
         
     | 
| 477 | 
         
            -
                  {
         
     | 
| 478 | 
         
            -
                    "model": "deepseek/deepseek-chat",
         
     | 
| 479 | 
         
            -
                    "bleu": 0.2875340171253509
         
     | 
| 480 | 
         
            -
                  },
         
     | 
| 481 | 
         
            -
                  {
         
     | 
| 482 | 
         
            -
                    "model": "microsoft/phi-4",
         
     | 
| 483 | 
         
            -
                    "bleu": 0.12646276530642359
         
     | 
| 484 | 
         
            -
                  }
         
     | 
| 485 | 
         
            -
                ],
         
     | 
| 486 | 
         
            -
                "bleu": 0.23483954884287706,
         
     | 
| 487 | 
         
            -
                "commonvoice_hours": 234.0
         
     | 
| 488 | 
         
            -
              },
         
     | 
| 489 | 
         
            -
              {
         
     | 
| 490 | 
         
            -
                "language_name": "Yue Chinese",
         
     | 
| 491 | 
         
            -
                "language_code": "yue",
         
     | 
| 492 | 
         
            -
                "speakers": 73100000.0,
         
     | 
| 493 | 
         
             
                "scores": [
         
     | 
| 494 | 
         
             
                  {
         
     | 
| 495 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 496 | 
         
            -
                    "bleu": 0. 
     | 
| 497 | 
         
             
                  }
         
     | 
| 498 | 
         
             
                ],
         
     | 
| 499 | 
         
            -
                "bleu": 0. 
     | 
| 500 | 
         
            -
                "commonvoice_hours":  
     | 
| 501 | 
         
             
              },
         
     | 
| 502 | 
         
             
              {
         
     | 
| 503 | 
         
            -
                "language_name": " 
     | 
| 504 | 
         
            -
                " 
     | 
| 505 | 
         
            -
                "speakers":  
     | 
| 506 | 
         
             
                "scores": [
         
     | 
| 507 | 
         
             
                  {
         
     | 
| 508 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 509 | 
         
            -
                    "bleu": 0. 
     | 
| 510 | 
         
             
                  }
         
     | 
| 511 | 
         
             
                ],
         
     | 
| 512 | 
         
            -
                "bleu": 0. 
     | 
| 513 | 
         
            -
                "commonvoice_hours":  
     | 
| 514 | 
         
             
              },
         
     | 
| 515 | 
         
             
              {
         
     | 
| 516 | 
         
            -
                "language_name": " 
     | 
| 517 | 
         
            -
                " 
     | 
| 518 | 
         
            -
                "speakers":  
     | 
| 519 | 
         
             
                "scores": [
         
     | 
| 520 | 
         
             
                  {
         
     | 
| 521 | 
         
             
                    "model": "openai/gpt-4o-mini",
         
     | 
| 522 | 
         
            -
                    "bleu": 0. 
     | 
| 523 | 
         
             
                  },
         
     | 
| 524 | 
         
             
                  {
         
     | 
| 525 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 526 | 
         
            -
                    "bleu": 0. 
     | 
| 527 | 
         
             
                  },
         
     | 
| 528 | 
         
             
                  {
         
     | 
| 529 | 
         
             
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 530 | 
         
            -
                    "bleu": 0. 
     | 
| 531 | 
         
             
                  },
         
     | 
| 532 | 
         
             
                  {
         
     | 
| 533 | 
         
             
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 534 | 
         
            -
                    "bleu": 0. 
     | 
| 535 | 
         
             
                  },
         
     | 
| 536 | 
         
             
                  {
         
     | 
| 537 | 
         
             
                    "model": "deepseek/deepseek-chat",
         
     | 
| 538 | 
         
            -
                    "bleu": 0. 
     | 
| 539 | 
         
             
                  },
         
     | 
| 540 | 
         
             
                  {
         
     | 
| 541 | 
         
             
                    "model": "microsoft/phi-4",
         
     | 
| 542 | 
         
            -
                    "bleu": 0. 
     | 
| 543 | 
         
             
                  }
         
     | 
| 544 | 
         
             
                ],
         
     | 
| 545 | 
         
            -
                "bleu": 0. 
     | 
| 546 | 
         
            -
                "commonvoice_hours":  
     | 
| 547 | 
         
            -
              },
         
     | 
| 548 | 
         
            -
              {
         
     | 
| 549 | 
         
            -
                "language_name": "Iranian Persian",
         
     | 
| 550 | 
         
            -
                "language_code": "pes",
         
     | 
| 551 | 
         
            -
                "speakers": 52800000.0,
         
     | 
| 552 | 
         
            -
                "scores": [
         
     | 
| 553 | 
         
            -
                  {
         
     | 
| 554 | 
         
            -
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 555 | 
         
            -
                    "bleu": 0.28359916806993934
         
     | 
| 556 | 
         
            -
                  }
         
     | 
| 557 | 
         
            -
                ],
         
     | 
| 558 | 
         
            -
                "bleu": 0.28359916806993934,
         
     | 
| 559 | 
         
            -
                "commonvoice_hours": "N/A"
         
     | 
| 560 | 
         
             
              },
         
     | 
| 561 | 
         
             
              {
         
     | 
| 562 | 
         
            -
                "language_name": " 
     | 
| 563 | 
         
            -
                " 
     | 
| 564 | 
         
            -
                "speakers":  
     | 
| 565 | 
         
             
                "scores": [
         
     | 
| 566 | 
         
             
                  {
         
     | 
| 567 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 568 | 
         
            -
                    "bleu": 0. 
     | 
| 569 | 
         
             
                  }
         
     | 
| 570 | 
         
             
                ],
         
     | 
| 571 | 
         
            -
                "bleu": 0. 
     | 
| 572 | 
         
            -
                "commonvoice_hours":  
     | 
| 573 | 
         
            -
              },
         
     | 
| 574 | 
         
            -
              {
         
     | 
| 575 | 
         
            -
                "language_name": "Hakka Chinese",
         
     | 
| 576 | 
         
            -
                "language_code": "hak",
         
     | 
| 577 | 
         
            -
                "speakers": 48200000.0,
         
     | 
| 578 | 
         
            -
                "scores": [],
         
     | 
| 579 | 
         
            -
                "bleu": null,
         
     | 
| 580 | 
         
            -
                "commonvoice_hours": "N/A"
         
     | 
| 581 | 
         
             
              }
         
     | 
| 582 | 
         
             
            ]
         
     | 
| 
         | 
|
| 1 | 
         
             
            [
         
     | 
| 2 | 
         
             
              {
         
     | 
| 3 | 
         
             
                "language_name": "English",
         
     | 
| 4 | 
         
            +
                "bcp_47": "en",
         
     | 
| 5 | 
         
            +
                "speakers": 1636485840,
         
     | 
| 6 | 
         
             
                "scores": [
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 7 | 
         
             
                  {
         
     | 
| 8 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 9 | 
         
            +
                    "bleu": 0.4931825583688982
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 10 | 
         
             
                  }
         
     | 
| 11 | 
         
             
                ],
         
     | 
| 12 | 
         
            +
                "bleu": 0.4931825583688982,
         
     | 
| 13 | 
         
             
                "commonvoice_hours": 2649.0
         
     | 
| 14 | 
         
             
              },
         
     | 
| 15 | 
         
             
              {
         
     | 
| 16 | 
         
            +
                "language_name": "Chinese",
         
     | 
| 17 | 
         
            +
                "bcp_47": "zh",
         
     | 
| 18 | 
         
            +
                "speakers": 1304678914,
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 19 | 
         
             
                "scores": [
         
     | 
| 20 | 
         
             
                  {
         
     | 
| 21 | 
         
             
                    "model": "openai/gpt-4o-mini",
         
     | 
| 22 | 
         
            +
                    "bleu": 0.4807599914028467
         
     | 
| 23 | 
         
             
                  },
         
     | 
| 24 | 
         
             
                  {
         
     | 
| 25 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 26 | 
         
            +
                    "bleu": 0.48224897154012053
         
     | 
| 27 | 
         
             
                  },
         
     | 
| 28 | 
         
             
                  {
         
     | 
| 29 | 
         
             
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 30 | 
         
            +
                    "bleu": 0.2688927547323512
         
     | 
| 31 | 
         
             
                  },
         
     | 
| 32 | 
         
             
                  {
         
     | 
| 33 | 
         
             
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 34 | 
         
            +
                    "bleu": 0.4876059353172742
         
     | 
| 35 | 
         
             
                  },
         
     | 
| 36 | 
         
             
                  {
         
     | 
| 37 | 
         
             
                    "model": "deepseek/deepseek-chat",
         
     | 
| 38 | 
         
            +
                    "bleu": 0.46126489333496423
         
     | 
| 39 | 
         
             
                  },
         
     | 
| 40 | 
         
             
                  {
         
     | 
| 41 | 
         
             
                    "model": "microsoft/phi-4",
         
     | 
| 42 | 
         
            +
                    "bleu": 0.43306718920654086
         
     | 
| 43 | 
         
             
                  }
         
     | 
| 44 | 
         
             
                ],
         
     | 
| 45 | 
         
            +
                "bleu": 0.4356399559223496,
         
     | 
| 46 | 
         
            +
                "commonvoice_hours": 422.0
         
     | 
| 47 | 
         
             
              },
         
     | 
| 48 | 
         
             
              {
         
     | 
| 49 | 
         
            +
                "language_name": "Hindi",
         
     | 
| 50 | 
         
            +
                "bcp_47": "hi",
         
     | 
| 51 | 
         
            +
                "speakers": 546882144,
         
     | 
| 52 | 
         
             
                "scores": [
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 53 | 
         
             
                  {
         
     | 
| 54 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 55 | 
         
            +
                    "bleu": 0.42910938007537924
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 56 | 
         
             
                  }
         
     | 
| 57 | 
         
             
                ],
         
     | 
| 58 | 
         
            +
                "bleu": 0.42910938007537924,
         
     | 
| 59 | 
         
            +
                "commonvoice_hours": 16.0
         
     | 
| 60 | 
         
             
              },
         
     | 
| 61 | 
         
             
              {
         
     | 
| 62 | 
         
            +
                "language_name": "Spanish",
         
     | 
| 63 | 
         
            +
                "bcp_47": "es",
         
     | 
| 64 | 
         
            +
                "speakers": 493528077,
         
     | 
| 65 | 
         
             
                "scores": [
         
     | 
| 66 | 
         
             
                  {
         
     | 
| 67 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 68 | 
         
            +
                    "bleu": 0.3335615012680206
         
     | 
| 69 | 
         
             
                  }
         
     | 
| 70 | 
         
             
                ],
         
     | 
| 71 | 
         
            +
                "bleu": 0.3335615012680206,
         
     | 
| 72 | 
         
            +
                "commonvoice_hours": 446.0
         
     | 
| 73 | 
         
             
              },
         
     | 
| 74 | 
         
             
              {
         
     | 
| 75 | 
         
            +
                "language_name": "Arabic",
         
     | 
| 76 | 
         
            +
                "bcp_47": "ar",
         
     | 
| 77 | 
         
            +
                "speakers": 351664197,
         
     | 
| 78 | 
         
             
                "scores": [
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 79 | 
         
             
                  {
         
     | 
| 80 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 81 | 
         
            +
                    "bleu": 0.19072998559991275
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 82 | 
         
             
                  }
         
     | 
| 83 | 
         
             
                ],
         
     | 
| 84 | 
         
            +
                "bleu": 0.19072998559991275,
         
     | 
| 85 | 
         
            +
                "commonvoice_hours": 91.0
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 86 | 
         
             
              },
         
     | 
| 87 | 
         
             
              {
         
     | 
| 88 | 
         
             
                "language_name": "Urdu",
         
     | 
| 89 | 
         
            +
                "bcp_47": "ur",
         
     | 
| 90 | 
         
            +
                "speakers": 290790290,
         
     | 
| 91 | 
         
             
                "scores": [
         
     | 
| 92 | 
         
             
                  {
         
     | 
| 93 | 
         
             
                    "model": "openai/gpt-4o-mini",
         
     | 
| 94 | 
         
            +
                    "bleu": 0.3223557428811336
         
     | 
| 95 | 
         
             
                  },
         
     | 
| 96 | 
         
             
                  {
         
     | 
| 97 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 98 | 
         
            +
                    "bleu": 0.3361392064611452
         
     | 
| 99 | 
         
             
                  },
         
     | 
| 100 | 
         
             
                  {
         
     | 
| 101 | 
         
             
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 102 | 
         
            +
                    "bleu": 0.30361668093990973
         
     | 
| 103 | 
         
             
                  },
         
     | 
| 104 | 
         
             
                  {
         
     | 
| 105 | 
         
             
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 106 | 
         
            +
                    "bleu": 0.38811035932918286
         
     | 
| 107 | 
         
             
                  },
         
     | 
| 108 | 
         
             
                  {
         
     | 
| 109 | 
         
             
                    "model": "deepseek/deepseek-chat",
         
     | 
| 110 | 
         
            +
                    "bleu": 0.33221997814253806
         
     | 
| 111 | 
         
             
                  },
         
     | 
| 112 | 
         
             
                  {
         
     | 
| 113 | 
         
             
                    "model": "microsoft/phi-4",
         
     | 
| 114 | 
         
            +
                    "bleu": 0.2541447606474814
         
     | 
| 115 | 
         
             
                  }
         
     | 
| 116 | 
         
             
                ],
         
     | 
| 117 | 
         
            +
                "bleu": 0.32276445473356513,
         
     | 
| 118 | 
         
             
                "commonvoice_hours": 76.0
         
     | 
| 119 | 
         
             
              },
         
     | 
| 120 | 
         
             
              {
         
     | 
| 121 | 
         
            +
                "language_name": "French",
         
     | 
| 122 | 
         
            +
                "bcp_47": "fr",
         
     | 
| 123 | 
         
            +
                "speakers": 278611507,
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 124 | 
         
             
                "scores": [
         
     | 
| 125 | 
         
             
                  {
         
     | 
| 126 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 127 | 
         
            +
                    "bleu": 0.40595466651226686
         
     | 
| 128 | 
         
             
                  }
         
     | 
| 129 | 
         
             
                ],
         
     | 
| 130 | 
         
            +
                "bleu": 0.40595466651226686,
         
     | 
| 131 | 
         
            +
                "commonvoice_hours": 1051.0
         
     | 
| 132 | 
         
             
              },
         
     | 
| 133 | 
         
             
              {
         
     | 
| 134 | 
         
            +
                "language_name": "Bangla",
         
     | 
| 135 | 
         
            +
                "bcp_47": "bn",
         
     | 
| 136 | 
         
            +
                "speakers": 267193288,
         
     | 
| 137 | 
         
             
                "scores": [
         
     | 
| 138 | 
         
             
                  {
         
     | 
| 139 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 140 | 
         
            +
                    "bleu": 0.30570858536443696
         
     | 
| 141 | 
         
             
                  }
         
     | 
| 142 | 
         
             
                ],
         
     | 
| 143 | 
         
            +
                "bleu": 0.30570858536443696,
         
     | 
| 144 | 
         
            +
                "commonvoice_hours": 49.0
         
     | 
| 145 | 
         
             
              },
         
     | 
| 146 | 
         
             
              {
         
     | 
| 147 | 
         
            +
                "language_name": "Portuguese",
         
     | 
| 148 | 
         
            +
                "bcp_47": "pt",
         
     | 
| 149 | 
         
            +
                "speakers": 237496885,
         
     | 
| 150 | 
         
             
                "scores": [
         
     | 
| 151 | 
         
             
                  {
         
     | 
| 152 | 
         
             
                    "model": "openai/gpt-4o-mini",
         
     | 
| 153 | 
         
            +
                    "bleu": 0.4122096638493346
         
     | 
| 154 | 
         
             
                  },
         
     | 
| 155 | 
         
             
                  {
         
     | 
| 156 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 157 | 
         
            +
                    "bleu": 0.39250552075952033
         
     | 
| 158 | 
         
             
                  },
         
     | 
| 159 | 
         
             
                  {
         
     | 
| 160 | 
         
             
                    "model": "mistralai/mistral-small-24b-instruct-2501",
         
     | 
| 161 | 
         
            +
                    "bleu": 0.22643923104785263
         
     | 
| 162 | 
         
             
                  },
         
     | 
| 163 | 
         
             
                  {
         
     | 
| 164 | 
         
             
                    "model": "google/gemini-2.0-flash-001",
         
     | 
| 165 | 
         
            +
                    "bleu": 0.42197093736929103
         
     | 
| 166 | 
         
             
                  },
         
     | 
| 167 | 
         
             
                  {
         
     | 
| 168 | 
         
             
                    "model": "deepseek/deepseek-chat",
         
     | 
| 169 | 
         
            +
                    "bleu": 0.42783260235353093
         
     | 
| 170 | 
         
             
                  },
         
     | 
| 171 | 
         
             
                  {
         
     | 
| 172 | 
         
             
                    "model": "microsoft/phi-4",
         
     | 
| 173 | 
         
            +
                    "bleu": 0.38611444119797594
         
     | 
| 174 | 
         
             
                  }
         
     | 
| 175 | 
         
             
                ],
         
     | 
| 176 | 
         
            +
                "bleu": 0.3778453994295843,
         
     | 
| 177 | 
         
            +
                "commonvoice_hours": 176.0
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 178 | 
         
             
              },
         
     | 
| 179 | 
         
             
              {
         
     | 
| 180 | 
         
            +
                "language_name": "Punjabi",
         
     | 
| 181 | 
         
            +
                "bcp_47": "pa",
         
     | 
| 182 | 
         
            +
                "speakers": 203571210,
         
     | 
| 183 | 
         
             
                "scores": [
         
     | 
| 184 | 
         
             
                  {
         
     | 
| 185 | 
         
             
                    "model": "meta-llama/llama-3.3-70b-instruct",
         
     | 
| 186 | 
         
            +
                    "bleu": 0.34311946995454473
         
     | 
| 187 | 
         
             
                  }
         
     | 
| 188 | 
         
             
                ],
         
     | 
| 189 | 
         
            +
                "bleu": 0.34311946995454473,
         
     | 
| 190 | 
         
            +
                "commonvoice_hours": 2.3
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 191 | 
         
             
              }
         
     | 
| 192 | 
         
             
            ]
         
     | 
    	
        uv.lock
    CHANGED
    
    | 
         @@ -898,6 +898,30 @@ wheels = [ 
     | 
|
| 898 | 
         
             
                { url = "https://files.pythonhosted.org/packages/ea/8b/d7497df4a1cae9367adf21665dd1f896c2a7aeb8769ad77b662c5e2bcce7/kiwisolver-1.4.7-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:eb542fe7933aa09d8d8f9d9097ef37532a7df6497819d16efe4359890a2f417a", size = 55715 },
         
     | 
| 899 | 
         
             
            ]
         
     | 
| 900 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 901 | 
         
             
            [[package]]
         
     | 
| 902 | 
         
             
            name = "languagebench"
         
     | 
| 903 | 
         
             
            version = "0.1.0"
         
     | 
| 
         @@ -914,6 +938,7 @@ dev = [ 
     | 
|
| 914 | 
         
             
                { name = "bert-score" },
         
     | 
| 915 | 
         
             
                { name = "evaluate" },
         
     | 
| 916 | 
         
             
                { name = "joblib" },
         
     | 
| 
         | 
|
| 917 | 
         
             
                { name = "openai" },
         
     | 
| 918 | 
         
             
                { name = "protobuf" },
         
     | 
| 919 | 
         
             
                { name = "python-dotenv" },
         
     | 
| 
         @@ -937,6 +962,7 @@ dev = [ 
     | 
|
| 937 | 
         
             
                { name = "bert-score", specifier = ">=0.3.13" },
         
     | 
| 938 | 
         
             
                { name = "evaluate", specifier = "==0.4.0" },
         
     | 
| 939 | 
         
             
                { name = "joblib", specifier = ">=1.4.2" },
         
     | 
| 
         | 
|
| 940 | 
         
             
                { name = "openai", specifier = ">=1.52.2" },
         
     | 
| 941 | 
         
             
                { name = "protobuf", specifier = ">=5.28.3" },
         
     | 
| 942 | 
         
             
                { name = "python-dotenv", specifier = ">=1.0.1" },
         
     | 
| 
         @@ -1029,6 +1055,61 @@ wheels = [ 
     | 
|
| 1029 | 
         
             
                { url = "https://files.pythonhosted.org/packages/ba/b2/6a22fb5c0885da3b00e116aee81f0b829ec9ac8f736cd414b4a09413fc7d/lxml-5.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba", size = 3487557 },
         
     | 
| 1030 | 
         
             
            ]
         
     | 
| 1031 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 1032 | 
         
             
            [[package]]
         
     | 
| 1033 | 
         
             
            name = "markdown-it-py"
         
     | 
| 1034 | 
         
             
            version = "3.0.0"
         
     | 
| 
         | 
|
| 898 | 
         
             
                { url = "https://files.pythonhosted.org/packages/ea/8b/d7497df4a1cae9367adf21665dd1f896c2a7aeb8769ad77b662c5e2bcce7/kiwisolver-1.4.7-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:eb542fe7933aa09d8d8f9d9097ef37532a7df6497819d16efe4359890a2f417a", size = 55715 },
         
     | 
| 899 | 
         
             
            ]
         
     | 
| 900 | 
         | 
| 901 | 
         
            +
            [[package]]
         
     | 
| 902 | 
         
            +
            name = "langcodes"
         
     | 
| 903 | 
         
            +
            version = "3.5.0"
         
     | 
| 904 | 
         
            +
            source = { registry = "https://pypi.org/simple" }
         
     | 
| 905 | 
         
            +
            dependencies = [
         
     | 
| 906 | 
         
            +
                { name = "language-data" },
         
     | 
| 907 | 
         
            +
            ]
         
     | 
| 908 | 
         
            +
            sdist = { url = "https://files.pythonhosted.org/packages/3a/7a/5a97e327063409a5caa21541e6d08ae4a0f2da328447e9f2c7b39e179226/langcodes-3.5.0.tar.gz", hash = "sha256:1eef8168d07e51e131a2497ffecad4b663f6208e7c3ae3b8dc15c51734a6f801", size = 191030 }
         
     | 
| 909 | 
         
            +
            wheels = [
         
     | 
| 910 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/c3/6b/068c2ea7a712bf805c62445bd9e9c06d7340358ef2824150eceac027444b/langcodes-3.5.0-py3-none-any.whl", hash = "sha256:853c69d1a35e0e13da2f427bb68fb2fa4a8f4fb899e0c62ad8df8d073dcfed33", size = 182974 },
         
     | 
| 911 | 
         
            +
            ]
         
     | 
| 912 | 
         
            +
             
     | 
| 913 | 
         
            +
            [[package]]
         
     | 
| 914 | 
         
            +
            name = "language-data"
         
     | 
| 915 | 
         
            +
            version = "1.3.0"
         
     | 
| 916 | 
         
            +
            source = { registry = "https://pypi.org/simple" }
         
     | 
| 917 | 
         
            +
            dependencies = [
         
     | 
| 918 | 
         
            +
                { name = "marisa-trie" },
         
     | 
| 919 | 
         
            +
            ]
         
     | 
| 920 | 
         
            +
            sdist = { url = "https://files.pythonhosted.org/packages/dd/ce/3f144716a9f2cbf42aa86ebc8b085a184be25c80aa453eea17c294d239c1/language_data-1.3.0.tar.gz", hash = "sha256:7600ef8aa39555145d06c89f0c324bf7dab834ea0b0a439d8243762e3ebad7ec", size = 5129310 }
         
     | 
| 921 | 
         
            +
            wheels = [
         
     | 
| 922 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/5d/e9/5a5ffd9b286db82be70d677d0a91e4d58f7912bb8dd026ddeeb4abe70679/language_data-1.3.0-py3-none-any.whl", hash = "sha256:e2ee943551b5ae5f89cd0e801d1fc3835bb0ef5b7e9c3a4e8e17b2b214548fbf", size = 5385760 },
         
     | 
| 923 | 
         
            +
            ]
         
     | 
| 924 | 
         
            +
             
     | 
| 925 | 
         
             
            [[package]]
         
     | 
| 926 | 
         
             
            name = "languagebench"
         
     | 
| 927 | 
         
             
            version = "0.1.0"
         
     | 
| 
         | 
|
| 938 | 
         
             
                { name = "bert-score" },
         
     | 
| 939 | 
         
             
                { name = "evaluate" },
         
     | 
| 940 | 
         
             
                { name = "joblib" },
         
     | 
| 941 | 
         
            +
                { name = "langcodes" },
         
     | 
| 942 | 
         
             
                { name = "openai" },
         
     | 
| 943 | 
         
             
                { name = "protobuf" },
         
     | 
| 944 | 
         
             
                { name = "python-dotenv" },
         
     | 
| 
         | 
|
| 962 | 
         
             
                { name = "bert-score", specifier = ">=0.3.13" },
         
     | 
| 963 | 
         
             
                { name = "evaluate", specifier = "==0.4.0" },
         
     | 
| 964 | 
         
             
                { name = "joblib", specifier = ">=1.4.2" },
         
     | 
| 965 | 
         
            +
                { name = "langcodes", specifier = ">=3.5.0" },
         
     | 
| 966 | 
         
             
                { name = "openai", specifier = ">=1.52.2" },
         
     | 
| 967 | 
         
             
                { name = "protobuf", specifier = ">=5.28.3" },
         
     | 
| 968 | 
         
             
                { name = "python-dotenv", specifier = ">=1.0.1" },
         
     | 
| 
         | 
|
| 1055 | 
         
             
                { url = "https://files.pythonhosted.org/packages/ba/b2/6a22fb5c0885da3b00e116aee81f0b829ec9ac8f736cd414b4a09413fc7d/lxml-5.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba", size = 3487557 },
         
     | 
| 1056 | 
         
             
            ]
         
     | 
| 1057 | 
         | 
| 1058 | 
         
            +
            [[package]]
         
     | 
| 1059 | 
         
            +
            name = "marisa-trie"
         
     | 
| 1060 | 
         
            +
            version = "1.2.1"
         
     | 
| 1061 | 
         
            +
            source = { registry = "https://pypi.org/simple" }
         
     | 
| 1062 | 
         
            +
            dependencies = [
         
     | 
| 1063 | 
         
            +
                { name = "setuptools" },
         
     | 
| 1064 | 
         
            +
            ]
         
     | 
| 1065 | 
         
            +
            sdist = { url = "https://files.pythonhosted.org/packages/31/15/9d9743897e4450b2de199ee673b50cb018980c4ced477d41cf91304a85e3/marisa_trie-1.2.1.tar.gz", hash = "sha256:3a27c408e2aefc03e0f1d25b2ff2afb85aac3568f6fa2ae2a53b57a2e87ce29d", size = 416124 }
         
     | 
| 1066 | 
         
            +
            wheels = [
         
     | 
| 1067 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/e4/83/ccf5b33f2123f3110705c608f8e0caa82002626511aafafc58f82e50d322/marisa_trie-1.2.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a2eb41d2f9114d8b7bd66772c237111e00d2bae2260824560eaa0a1e291ce9e8", size = 362200 },
         
     | 
| 1068 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/9d/74/f7ce1fc2ee480c7f8ceadd9b992caceaba442a97e5e99d6aea00d3635a0b/marisa_trie-1.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9e956e6a46f604b17d570901e66f5214fb6f658c21e5e7665deace236793cef6", size = 192309 },
         
     | 
| 1069 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/e4/52/5dbbc13e57ce54c2ef0d04962d7d8f66edc69ed34310c734a2913199a581/marisa_trie-1.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bd45142501300e7538b2e544905580918b67b1c82abed1275fe4c682c95635fa", size = 174713 },
         
     | 
| 1070 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/57/49/2580372f3f980aea95c23d05b2c1d3bbb9ee1ab8cfd441545153e44f1be7/marisa_trie-1.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8443d116c612cfd1961fbf76769faf0561a46d8e317315dd13f9d9639ad500c", size = 1314808 },
         
     | 
| 1071 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/5a/ba/e12a4d450f265414cc68df6a116a78beece72b95f774f04d29cd48e08d19/marisa_trie-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:875a6248e60fbb48d947b574ffa4170f34981f9e579bde960d0f9a49ea393ecc", size = 1346678 },
         
     | 
| 1072 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/b2/81/8e130cb1eea741fd17694d821096f7ec9841f0e3d3c69b740257f5eeafa8/marisa_trie-1.2.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:746a7c60a17fccd3cfcfd4326926f02ea4fcdfc25d513411a0c4fc8e4a1ca51f", size = 1307254 },
         
     | 
| 1073 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/d7/d0/3deb5ea2bf7e4d845339875dbb31f3c3f66c8d6568723db1d137fb08a91c/marisa_trie-1.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e70869737cc0e5bd903f620667da6c330d6737048d1f44db792a6af68a1d35be", size = 2194712 },
         
     | 
| 1074 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/9c/5f/b38d728dd30954816497b53425cfaddaf7b93ac0912db5911888f191b07a/marisa_trie-1.2.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06b099dd743676dbcd8abd8465ceac8f6d97d8bfaabe2c83b965495523b4cef2", size = 2355625 },
         
     | 
| 1075 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/7e/4f/61c0faa9ae9e53600a1b7a0c367bc9db1a4fdc625402ec232c755a05e094/marisa_trie-1.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d2a82eb21afdaf22b50d9b996472305c05ca67fc4ff5a026a220320c9c961db6", size = 2290290 },
         
     | 
| 1076 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/7c/7d/713b970fb3043248881ed776dbf4d54918398aa5dde843a38711d0d62c8f/marisa_trie-1.2.1-cp310-cp310-win32.whl", hash = "sha256:8951e7ce5d3167fbd085703b4cbb3f47948ed66826bef9a2173c379508776cf5", size = 130743 },
         
     | 
| 1077 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/cc/94/3d619cc82c30daeacd18a88674f4e6540ebfb7b4b7752ca0552793be80cf/marisa_trie-1.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:5685a14b3099b1422c4f59fa38b0bf4b5342ee6cc38ae57df9666a0b28eeaad3", size = 151891 },
         
     | 
| 1078 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/4a/93/ffb01dfa22b6eee918e798e0bc3487427036c608aa4c065725f31aaf4104/marisa_trie-1.2.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ed3fb4ed7f2084597e862bcd56c56c5529e773729a426c083238682dba540e98", size = 362823 },
         
     | 
| 1079 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/6d/1d/5c36500ac350c278c9bdfd88e17fa846fa4136d75597c167141ed973cdf2/marisa_trie-1.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0fe69fb9ffb2767746181f7b3b29bbd3454d1d24717b5958e030494f3d3cddf3", size = 192741 },
         
     | 
| 1080 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/e8/04/87dd0840f3f720e511eba56193c02bf64d7d96df1ca9f6d19994f55154be/marisa_trie-1.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4728ed3ae372d1ea2cdbd5eaa27b8f20a10e415d1f9d153314831e67d963f281", size = 174995 },
         
     | 
| 1081 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/c9/51/9e903a7e13b7593e2e675d0ec4c390ca076dc5df1c1a0d5e85a513b886a3/marisa_trie-1.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8cf4f25cf895692b232f49aa5397af6aba78bb679fb917a05fce8d3cb1ee446d", size = 1384728 },
         
     | 
| 1082 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/e8/3f/7362a5ac60c2b0aad0f52cd57e7bd0c708f20d2660d8df85360f3d8f1c4b/marisa_trie-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7cca7f96236ffdbf49be4b2e42c132e3df05968ac424544034767650913524de", size = 1412620 },
         
     | 
| 1083 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/1f/bc/aaa3eaf6875f78a204a8da9692d56e3a36f89997dad2c388628385614576/marisa_trie-1.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d7eb20bf0e8b55a58d2a9b518aabc4c18278787bdba476c551dd1c1ed109e509", size = 1361555 },
         
     | 
| 1084 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/18/98/e11b5a6206c5d110f32adab37fa84a85410d684e9c731acdd5c9250e2ce4/marisa_trie-1.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b1ec93f0d1ee6d7ab680a6d8ea1a08bf264636358e92692072170032dda652ba", size = 2257717 },
         
     | 
| 1085 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/d2/9d/6b4a40867875e738a67c5b29f83e2e490a66bd9067ace3dd9a5c497e2b7f/marisa_trie-1.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e2699255d7ac610dee26d4ae7bda5951d05c7d9123a22e1f7c6a6f1964e0a4e4", size = 2417044 },
         
     | 
| 1086 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/fe/61/e25613c72f2931757334b8bcf6b501569ef713f5ee9c6c7688ec460bd720/marisa_trie-1.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c484410911182457a8a1a0249d0c09c01e2071b78a0a8538cd5f7fa45589b13a", size = 2351960 },
         
     | 
| 1087 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/19/0a/a90ccaf3eb476d13ec261f80c6c52defaf10ebc7f35eb2bcd7dfb533aef7/marisa_trie-1.2.1-cp311-cp311-win32.whl", hash = "sha256:ad548117744b2bcf0e3d97374608be0a92d18c2af13d98b728d37cd06248e571", size = 130446 },
         
     | 
| 1088 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/fc/98/574b4e143e0a2f5f71af8716b6c4a8a46220f75a6e0847ce7d11ee0ba4aa/marisa_trie-1.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:436f62d27714970b9cdd3b3c41bdad046f260e62ebb0daa38125ef70536fc73b", size = 152037 },
         
     | 
| 1089 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/4e/bf/8bd4ac8436b33fd46c9e1ffe3c2a131cd9744cc1649dbbe13308f744ef2b/marisa_trie-1.2.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:638506eacf20ca503fff72221a7e66a6eadbf28d6a4a6f949fcf5b1701bb05ec", size = 360041 },
         
     | 
| 1090 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/ab/dd/4d3151e302e66ae387885f6ec265bd189e096b0c43c1379bfd9a3b9d2543/marisa_trie-1.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de1665eaafefa48a308e4753786519888021740501a15461c77bdfd57638e6b4", size = 190520 },
         
     | 
| 1091 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/00/28/ae5991c74fb90b173167a366a634c83445f948ad044d37287b478d6b457e/marisa_trie-1.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f713af9b8aa66a34cd3a78c7d150a560a75734713abe818a69021fd269e927fa", size = 174175 },
         
     | 
| 1092 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/5a/6a/fbfa89a8680eaabc6847a6c421e65427c43182db0c4bdb60e1516c81c822/marisa_trie-1.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2a7d00f53f4945320b551bccb826b3fb26948bde1a10d50bb9802fabb611b10", size = 1354995 },
         
     | 
| 1093 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/9e/4c/2ba0b385e5f64ca4ddb0c10ec52ddf881bc4521f135948786fc339d1d6c8/marisa_trie-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98042040d1d6085792e8d0f74004fc0f5f9ca6091c298f593dd81a22a4643854", size = 1390989 },
         
     | 
| 1094 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/6b/22/0791ed3045c91d0938345a86be472fc7c188b894f16c5dfad2ef31e7f882/marisa_trie-1.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6532615111eec2c79e711965ece0bc95adac1ff547a7fff5ffca525463116deb", size = 1328810 },
         
     | 
| 1095 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/9d/7d/3f566e563abae6efce7fc311c63282a447c611739b3cd66c0e36077c86f8/marisa_trie-1.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:20948e40ab2038e62b7000ca6b4a913bc16c91a2c2e6da501bd1f917eeb28d51", size = 2230222 },
         
     | 
| 1096 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/a5/0b/38fbb4611b5d1030242ddc2aa62e524438c8076e26f87395dbbf222dc62d/marisa_trie-1.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:66b23e5b35dd547f85bf98db7c749bc0ffc57916ade2534a6bbc32db9a4abc44", size = 2383620 },
         
     | 
| 1097 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/ae/17/4553c63de29904d5d2521a24cad817bc7883cfa90506ab702ec4dae59a7b/marisa_trie-1.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6704adf0247d2dda42e876b793be40775dff46624309ad99bc7537098bee106d", size = 2329202 },
         
     | 
| 1098 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/45/08/6307a630e63cd763fe77ac56516faa67fa9cd342060691e40fabc84be6b0/marisa_trie-1.2.1-cp312-cp312-win32.whl", hash = "sha256:3ad356442c2fea4c2a6f514738ddf213d23930f942299a2b2c05df464a00848a", size = 129652 },
         
     | 
| 1099 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/a1/fe/67c357bfd92710d95a16b86e1453c663d565415d7f7838781c79ff7e1a7e/marisa_trie-1.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:f2806f75817392cedcacb24ac5d80b0350dde8d3861d67d045c1d9b109764114", size = 150845 },
         
     | 
| 1100 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/2a/a4/a110cd9952f0e72da7bafea1f0084b18b9e03952110d9083bfda52279f5c/marisa_trie-1.2.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:b5ea16e69bfda0ac028c921b58de1a4aaf83d43934892977368579cd3c0a2554", size = 354439 },
         
     | 
| 1101 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/3c/a5/a6099eb1c3fd8d7e93408c45501e1d08536ac57dfef02ec331f78e1ace18/marisa_trie-1.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9f627f4e41be710b6cb6ed54b0128b229ac9d50e2054d9cde3af0fef277c23cf", size = 188187 },
         
     | 
| 1102 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/7c/cc/f637127e2beffa920d21f7fc45b4029575bcd1b28a90c0d90cb2b08c2205/marisa_trie-1.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5e649f3dc8ab5476732094f2828cc90cac3be7c79bc0c8318b6fda0c1d248db4", size = 171484 },
         
     | 
| 1103 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/6d/0f/29f2ad7260b956570f69f25a542efa51ba76eb76ecd53c63ee9d21987c3d/marisa_trie-1.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:46e528ee71808c961baf8c3ce1c46a8337ec7a96cc55389d11baafe5b632f8e9", size = 1319770 },
         
     | 
| 1104 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/f2/12/0b69ed61fba59551a5f3d569af367afae614db7214ce1da12946ba9a433a/marisa_trie-1.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36aa4401a1180615f74d575571a6550081d84fc6461e9aefc0bb7b2427af098e", size = 1356488 },
         
     | 
| 1105 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/33/23/483b110db7ffe8729d6ebea2bf74258aef51f10fef5775f99e4bac7aef69/marisa_trie-1.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce59bcd2cda9bb52b0e90cc7f36413cd86c3d0ce7224143447424aafb9f4aa48", size = 1302334 },
         
     | 
| 1106 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/1c/6f/46c2be99ce925985127fdf78900f1673bce8cb72debfebee6dccd11032c6/marisa_trie-1.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f4cd800704a5fc57e53c39c3a6b0c9b1519ebdbcb644ede3ee67a06eb542697d", size = 2202624 },
         
     | 
| 1107 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/fd/b6/ef642327dbd4ec35be55d5682520b8f70fca98a54024f441ef2732f6b305/marisa_trie-1.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2428b495003c189695fb91ceeb499f9fcced3a2dce853e17fa475519433c67ff", size = 2364206 },
         
     | 
| 1108 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/69/04/ef8197a79d0ab5043b781cc9b457bd11b81d4204fe78adf7625a67f48c21/marisa_trie-1.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:735c363d9aaac82eaf516a28f7c6b95084c2e176d8231c87328dc80e112a9afa", size = 2304801 },
         
     | 
| 1109 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/03/72/f87564d653daf31d8f33d9bf0121e99ccc21f18f5c485fb404ba06abc10e/marisa_trie-1.2.1-cp313-cp313-win32.whl", hash = "sha256:eba6ca45500ca1a042466a0684aacc9838e7f20fe2605521ee19f2853062798f", size = 128799 },
         
     | 
| 1110 | 
         
            +
                { url = "https://files.pythonhosted.org/packages/27/40/5f9eb8b73030cc4b0d6817176e66079a62a2ddd9d5530da54f8011473428/marisa_trie-1.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:aa7cd17e1c690ce96c538b2f4aae003d9a498e65067dd433c52dd069009951d4", size = 149035 },
         
     | 
| 1111 | 
         
            +
            ]
         
     | 
| 1112 | 
         
            +
             
     | 
| 1113 | 
         
             
            [[package]]
         
     | 
| 1114 | 
         
             
            name = "markdown-it-py"
         
     | 
| 1115 | 
         
             
            version = "3.0.0"
         
     |