Spaces:

naacl-anonymous
/

selective_pre_translation

Runtime error

File size: 3,684 Bytes

208053f

import logging
from typing import List, Dict, Any

from easygoogletranslate import EasyGoogleTranslate
from langchain.prompts import PromptTemplate, FewShotPromptTemplate

# Lowercase language name -> short language code. Judging by the constant's
# name these are the codes meant for Google Translate (note "twi" -> "ak").
# Each language is listed exactly once; a repeated key in a dict literal is
# silently collapsed by Python and only hides copy/paste mistakes.
LANGUAGE_TO_GOOGLE_TRANSLATE_MARK = {
    "english": "en",
    "bambara": "bm",
    "ewe": "ee",
    "hausa": "ha",
    "igbo": "ig",
    "kinyarwanda": "rw",
    "chichewa": "ny",
    "twi": "ak",
    "yoruba": "yo",
    "slovak": "sk",
    "serbian": "sr",
    "swedish": "sv",
    "vietnamese": "vi",
    "italian": "it",
    "portuguese": "pt",
    "chinese": "zh",
    "french": "fr",
}

# Lowercase language name -> short "prefix" code.
# NOTE: the identifier is misspelled ("LANGAUGE"); it is kept as-is because
# code in this module looks the table up under exactly this name.
LANGAUGE_TO_PREFIX = {
    # Entries that use three-letter codes.
    "bambara": "bam",
    "ewe": "ewe",
    "fon": "fon",
    "hausa": "hau",
    "igbo": "ibo",
    "kinyarwanda": "kin",
    "chichewa": "nya",
    "twi": "twi",
    "yoruba": "yor",
    # Entries that use two-letter codes.
    "slovak": "sk",
    "serbian": "sr",
    "swedish": "sv",
    "vietnamese": "vi",
    "italian": "it",
    "portuguese": "pt",
    "chinese": "zh",
    "english": "en",
    "french": "fr",
}


def _translate_instruction(basic_instruction: str, target_language: str) -> str:
    """Translate an English instruction into ``target_language``.

    Args:
        basic_instruction: The instruction text, written in English.
        target_language: Lowercase language name (e.g. ``"hausa"``) used as a
            key into the module's language-code tables.

    Returns:
        Whatever ``EasyGoogleTranslate.translate`` returns for the text.

    Raises:
        KeyError: If ``target_language`` is present in neither
            ``LANGUAGE_TO_GOOGLE_TRANSLATE_MARK`` nor ``LANGAUGE_TO_PREFIX``.
    """
    # The translator needs a Google Translate code ("bm", "ha", "ak", ...),
    # not the prefix code ("bam", "hau", "twi", ...), so the dedicated table
    # is consulted first.
    google_code = LANGUAGE_TO_GOOGLE_TRANSLATE_MARK.get(target_language)
    if google_code is None:
        # Languages that only exist in the prefix table (e.g. "fon") keep
        # the previous behaviour; unknown names still raise KeyError here.
        google_code = LANGAUGE_TO_PREFIX[target_language]

    translator = EasyGoogleTranslate(
        source_language="en",
        target_language=google_code,
        timeout=10,
    )
    return translator.translate(basic_instruction)


def create_instruction(lang: str, expected_output: str):
    """Build the NER task instruction, translated into ``lang`` if needed.

    The English instruction is assembled first, with ``expected_output``
    interpolated as the language the entities should be written in. The
    literal's continuation lines keep their source indentation, so the
    returned English text contains that leading whitespace.

    Args:
        lang: Lowercase language name the instruction should be written in.
        expected_output: Language name inserted into the instruction's last
            sentence.

    Returns:
        The English instruction unchanged when ``lang`` is ``"english"``;
        otherwise the result of ``_translate_instruction`` applied to the
        already-interpolated English text.
    """
    english_prompt = f"""You are an NLP assistant whose
                            purpose is to perform Named Entity Recognition
                            (NER). You will need to give each entity a tag, from the following:
                            PER means a person, ORG means organization.
                            LOC means a location entity.
                            The output should be a list of tuples of the format:
                            ['Tag: Entity', 'Tag: Entity'] for each entity in the sentence. 
                            The entities should be in {expected_output} language"""

    # English needs no translation round-trip.
    if lang == "english":
        return english_prompt

    return _translate_instruction(english_prompt, target_language=lang)

def construct_prompt(
    instruction: str,
    test_example: dict,
    zero_shot: bool,
    dataset: str,
    num_examples: int,
    lang: str,
    config: Dict[str, str],
):
    """Build the final prompt string for one test example.

    Args:
        instruction: Instruction text placed at the start of the prompt. When
            falsy, one is generated with
            ``create_instruction(lang, config["prefix"])``.
        test_example: Example to be prompted on; its ``"text"`` entry is
            substituted into the prompt.
        zero_shot: If true, the prompt is the instruction followed by the
            input text. Otherwise ``num_examples`` in-context examples are
            inserted between the instruction and the input text.
        dataset: Currently unused; kept so existing callers keep working.
        num_examples: Number of in-context examples (few-shot mode only).
        lang: Lowercase language name of the test example.
        config: Settings read here: ``"prefix"`` (only when ``instruction``
            is falsy), ``"context"`` (few-shot mode only) and ``"input"``
            (language the test example should be presented in).

    Returns:
        The prompt template formatted with the test example's text.
    """
    logger = logging.getLogger(__name__)
    logger.debug(
        "construct_prompt: lang=%s zero_shot=%s num_examples=%s",
        lang,
        zero_shot,
        num_examples,
    )

    if not instruction:
        instruction = create_instruction(lang, config["prefix"])

    if zero_shot:
        # Zero-shot needs no example pool, so nothing is loaded here.
        prompt = PromptTemplate(
            input_variables=["text"],
            template=f"{instruction}" + "\n Input: {text} ",
        )
    else:
        # NOTE(review): in-context examples are drawn from the "test" split
        # (first 100 items) -- confirm this is intended and cannot overlap
        # with the example being evaluated.
        example_pool = load_xlsum_data(lang=lang, split="test", limit=100)
        ic_examples = choose_few_shot_examples(
            train_dataset=example_pool,
            few_shot_size=num_examples,
            context=[config["context"]] * num_examples,
            selection_criteria="random",
            lang=lang,
        )
        example_prompt = PromptTemplate(
            input_variables=["summary", "text"],
            template="Text: {text}\nSummary: {summary}",
        )
        prompt = FewShotPromptTemplate(
            examples=ic_examples,
            prefix=instruction,
            example_prompt=example_prompt,
            suffix="<Text>: {text}",
            input_variables=["text"],
        )

    # Present the test example in the configured input language.
    input_language = config["input"]
    if input_language != lang:
        logger.debug("translating test example: %s -> %s", lang, input_language)
        test_example = _translate_example(
            example=test_example, src_language=lang, target_language=input_language
        )

    logger.debug("prompt template: %s", prompt)
    return prompt.format(text=test_example["text"])