import random
from functools import partial

import evaluate
import pandas as pd
import sentencepiece as spm
from datasets_.flores import flores_sentences
from datasets_.mmlu import load_mmlu
from joblib.memory import Memory
from languages import languages, script_name
from models import complete, transcribe

cache = Memory(location=".cache", verbose=0).cache
bleu = evaluate.load("bleu")
chrf = evaluate.load("chrf")
wer = evaluate.load("wer")
# FLORES-200 SentencePiece tokenizer; passing it to bleu.compute yields spBLEU
tokenizer = spm.SentencePieceProcessor(
    model_file="data/spbleu/flores200_sacrebleu_tokenizer_spm.model"
)
# sample target languages for translation, weighted by speaker count and with
# replacement, so widely spoken languages come up more often
target_languages = languages[languages["in_benchmark"]].sample(
    frac=1, weights="speakers", replace=True, random_state=42
)

async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
    original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
    target_language = target_languages.iloc[sentence_nr]
    match mode:
        case "from":
            pass
        case "to":
            original_language, target_language = target_language, original_language
    if not flores_sentences(original_language) or not flores_sentences(target_language):
        return []
    original_sentence = flores_sentences(original_language)[sentence_nr].strip()
    target_sentence = flores_sentences(target_language)[sentence_nr].strip()
    script = script_name(target_language.flores_path.split("_")[1])
    reply = await complete(
        model=model,
        messages=[
            {
                "role": "user",
                "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
            }
        ],
        temperature=0,
        max_tokens=1024,
    )
    prediction = reply.choices[0].message.content.strip()
    if prediction:
        bleu_score = bleu.compute(
            predictions=[prediction],
            references=[target_sentence],
            tokenizer=tokenizer.tokenize,
        )
    else:
        bleu_score = {"bleu": 0}
    chrf_score = chrf.compute(predictions=[prediction], references=[target_sentence])
    return [
        {
            "model": model,
            "bcp_47": bcp_47,
            "task": f"translation_{mode}",
            "metric": metric,
            "score": score,
            "sentence_nr": sentence_nr,
        }
        for metric, score in (
            ("bleu", bleu_score["bleu"]),
            ("chrf", chrf_score["score"] / 100),
        )
    ]
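
# Illustrative call and result shape (model id and scores are placeholders):
# rows = await translate_and_evaluate("org/model", "fr", 0, mode="from")
# -> [{"model": "org/model", "bcp_47": "fr", "task": "translation_from",
#      "metric": "bleu", "score": 0.42, "sentence_nr": 0},
#     {"model": "org/model", "bcp_47": "fr", "task": "translation_from",
#      "metric": "chrf", "score": 0.55, "sentence_nr": 0}]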

# per-sentence metadata (URL and topic) for the FLORES+ dev split; required by
# classify_and_evaluate below
metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")

async def classify_and_evaluate(model, bcp_47, nr):
    language = languages[languages["bcp_47"] == bcp_47].iloc[0]
    sentences = flores_sentences(language)
    if not sentences:
        return []
    sentences = pd.DataFrame(sentences, columns=["text"])
    sentences = pd.concat([metadata, sentences], axis=1)
    sentences = sentences.dropna(subset=["topic"])
    sentences["topic"] = sentences["topic"].str.lower()
    paragraphs = (
        sentences.groupby("URL").agg({"text": " ".join, "topic": "first"}).reset_index()
    )
    top_topics = paragraphs.value_counts("topic").head(5).index
    paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
    examples = pd.concat(
        [
            paragraphs[paragraphs["topic"] == t].sample(n=1, random_state=42)
            for t in top_topics
        ]
    ).sample(frac=1, random_state=nr)
    test_paragraphs = paragraphs[~paragraphs["URL"].isin(examples["URL"])].sample(
        frac=1, random_state=42
    )
    test_paragraph = test_paragraphs.iloc[nr]

    def format_prompt(text):
        return f"{text}\n\nTopic: {'|'.join(top_topics)}?"

    messages = []
    for example in examples.itertuples():
        messages += [
            {"role": "user", "content": format_prompt(example.text)},
            {"role": "assistant", "content": example.topic},
        ]
    # Some models tokenize some languages poorly, and the prompt for this task is
    # relatively long, so it can exceed the context window. Since this is mostly a
    # consequence of the model's tokenization rather than of the window size itself,
    # we assign an accuracy of 0 in that case.
    try:
        reply = await complete(
            model=model,
            messages=[
                *messages,
                {
                    "role": "user",
                    "content": format_prompt(test_paragraph.text),
                },
            ],
            temperature=0,
            max_tokens=30,
        )
        response = reply.choices[0].message.content.strip().lower()
        true = test_paragraph.topic
        others = [t for t in top_topics if t != true]
        acc = int(
            response.startswith(true)
            or (true in response and not any(o in response for o in others))
        )
    except Exception as e:
        if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
            print(f"Max tokens exceeded for {model} in {bcp_47}")
            acc = 0
        else:
            raise
    return [
        {
            "model": model,
            "bcp_47": bcp_47,
            "task": "classification",
            "metric": "accuracy",
            "score": acc,
            "sentence_nr": nr,
        }
    ]

def corrupt_sentence(sentence):
    # replace 5% of the sentence (a contiguous span at a random position) with <mask>
    mask_length = round(len(sentence) * 0.05)
    start = random.randint(0, len(sentence) - mask_length)
    end = start + mask_length
    return sentence[:start] + "<mask>" + sentence[end:]
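
# Illustrative example (the exact mask position varies with the random seed):
# corrupt_sentence("The quick brown fox jumps over the lazy dog.")
# -> "The quick brown fox jumps ov<mask> the lazy dog."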

async def mlm_and_evaluate(model, language_bcp_47, nr):
    language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
    sentences = flores_sentences(language)
    if not sentences:
        return []
    sentences = pd.DataFrame(sentences, columns=["text"])
    sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
    examples = sentences.sample(n=10, random_state=42)
    test_sentences = sentences[~sentences["text"].isin(examples["text"])].sample(
        frac=1, random_state=42
    )
    test_sentence = test_sentences.iloc[nr]
    messages = []
    for example in examples.itertuples():
        messages += [
            {"role": "user", "content": example.corrupt_text},
            {"role": "assistant", "content": example.text},
        ]
    reply = await complete(
        model=model,
        messages=[
            *messages,
            {
                "role": "user",
                "content": test_sentence.corrupt_text,
            },
        ],
        temperature=0,
        max_tokens=1024,
    )
    prediction = reply.choices[0].message.content.strip()
    chrf_score = chrf.compute(predictions=[prediction], references=[test_sentence.text])
    return [
        {
            "model": model,
            "bcp_47": language["bcp_47"],
            "task": "language_modeling",
            "metric": "chrf",
            "score": chrf_score["score"] / 100,
            "sentence_nr": nr,
        }
    ]

async def mmlu_and_evaluate(model, language_bcp_47, nr):
    ds_name, examples, task = load_mmlu(language_bcp_47, nr)
    if not task:
        return []

    def format_item(item):
        return f"""{item["question"]}
A: {item["choices"][0]}
B: {item["choices"][1]}
C: {item["choices"][2]}
D: {item["choices"][3]}
A|B|C|D?"""

    messages = []
    for example in examples:
        messages += [
            {"role": "user", "content": format_item(example)},
            {"role": "assistant", "content": example["answer"]},
        ]
    messages += [{"role": "user", "content": format_item(task)}]
    try:
        reply = await complete(
            model=model,
            messages=messages,
            temperature=0,
            max_tokens=1,
        )
        acc = int(reply.choices[0].message.content[:1].strip() == task["answer"])
    except Exception as e:
        if "ResponsibleAIPolicyViolation" in str(e):
            acc = 0
        else:
            raise
    return [
        {
            "model": model,
            "bcp_47": language_bcp_47,
            "task": "mmlu",
            "metric": "accuracy",
            "score": acc,
            "sentence_nr": nr,
        }
    ]
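
# Illustrative few-shot turn as rendered by format_item (question and choices
# are placeholders, not actual MMLU data):
#
#   What is the capital of France?
#   A: Berlin
#   B: Madrid
#   C: Paris
#   D: Rome
#   A|B|C|D?
#
# The assistant is expected to reply with a single letter, e.g. "C".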

async def transcribe_and_evaluate(model, language_bcp_47, nr):
    language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
    fleurs = pd.read_csv(
        f"data/fleurs/{language.fleurs_tag}/dev.tsv",
        sep="\t",
        names=[
            "id",
            "fname",
            "raw_transcription",
            "transcription",
            "words",
            "id2",
            "gender",
        ],
    )
    item = fleurs.iloc[nr]
    path = f"data/fleurs/{language.fleurs_tag}/audio/dev/{item.fname}"
    pred = await transcribe(path, model=model)
    wer_score = wer.compute(predictions=[pred], references=[item.transcription])
    return [
        {
            "model": model,
            "bcp_47": language["bcp_47"],
            "task": "asr",
            "metric": "wer",
            "score": wer_score,
            "sentence_nr": nr,
        }
    ]

tasks = {
    "translation_from": partial(translate_and_evaluate, mode="from"),
    "translation_to": partial(translate_and_evaluate, mode="to"),
    # "classification": classify_and_evaluate,
    # "mlm": mlm_and_evaluate,
    "mmlu": mmlu_and_evaluate,
    # "asr": transcribe_and_evaluate,
}
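
# Minimal driver sketch (hypothetical; the actual runner that iterates over
# models, languages, and sentence numbers is expected to live outside this
# module, and the model id and language code below are placeholders):
#
# import asyncio
#
# async def main():
#     rows = await tasks["translation_from"]("org/model", "fr", 0)
#     print(rows)
#
# asyncio.run(main())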