"""Helpers for building (optionally machine-translated) NER instruction prompts."""

from typing import List, Dict, Any

from easygoogletranslate import EasyGoogleTranslate
from langchain.prompts import PromptTemplate, FewShotPromptTemplate

# Language name -> language code passed to Google Translate.
LANGUAGE_TO_GOOGLE_TRANSLATE_MARK = {
    "english": "en",
    "bambara": "bm",
    "ewe": "ee",
    "hausa": "ha",
    "igbo": "ig",
    "kinyarwanda": "rw",
    "chichewa": "ny",
    "twi": "ak",
    "yoruba": "yo",
    "slovak": "sk",
    "serbian": "sr",
    "swedish": "sv",
    "vietnamese": "vi",
    "italian": "it",
    "portuguese": "pt",
    "chinese": "zh",
    "french": "fr",
}

# Language name -> prefix code. The identifier is misspelled ("LANGAUGE") but
# is kept as-is because other modules may import it under this name.
LANGAUGE_TO_PREFIX = {
    "bambara": "bam",
    "ewe": "ewe",
    "fon": "fon",
    "hausa": "hau",
    "igbo": "ibo",
    "kinyarwanda": "kin",
    "chichewa": "nya",
    "twi": "twi",
    "yoruba": "yor",
    "slovak": "sk",
    "serbian": "sr",
    "swedish": "sv",
    "vietnamese": "vi",
    "italian": "it",
    "portuguese": "pt",
    "chinese": "zh",
    "english": "en",
    "french": "fr",
}

# Correctly spelled alias for new call sites; refers to the same dict object.
LANGUAGE_TO_PREFIX = LANGAUGE_TO_PREFIX


def _translate_instruction(basic_instruction: str, target_language: str) -> str:
    """Translate an English instruction into ``target_language``.

    Args:
        basic_instruction: The English text to translate.
        target_language: Lower-case language name, e.g. ``"hausa"``.

    Returns:
        The translated text as returned by ``EasyGoogleTranslate.translate``.

    Raises:
        KeyError: If ``target_language`` is in neither language table.
    """
    # The translator needs Google Translate codes ("ha", "yo", ...), not the
    # three-letter prefix codes ("hau", "yor", ...). Fall back to the prefix
    # table only for languages missing from the Google table (e.g. "fon"), so
    # those keep their previous behaviour.
    google_code = LANGUAGE_TO_GOOGLE_TRANSLATE_MARK.get(target_language)
    if google_code is None:
        google_code = LANGAUGE_TO_PREFIX[target_language]

    translator = EasyGoogleTranslate(
        source_language="en",
        target_language=google_code,
        timeout=10,
    )
    return translator.translate(basic_instruction)


def create_instruction(lang: str, expected_output: str) -> str:
    """Build the NER task instruction, translated when ``lang`` is not English.

    Args:
        lang: Lower-case language name the instruction should be written in.
        expected_output: Language name interpolated into the final sentence
            ("The entities should be in ... language").

    Returns:
        The English instruction when ``lang == "english"``; otherwise the
        instruction translated via ``_translate_instruction``.
    """
    # Assembled from adjacent literals so the trailing space and newline before
    # the last sentence are explicit rather than dependent on editor whitespace.
    basic_instruction = (
        "You are an NLP assistant whose purpose is to perform Named Entity "
        "Recognition (NER). You will need to give each entity a tag, from the "
        "following: PER means a person, ORG means organization. LOC means a "
        "location entity. The output should be a list of tuples of the format: "
        "['Tag: Entity', 'Tag: Entity'] for each entity in the sentence. \n"
        f"The entities should be in {expected_output} language"
    )
    if lang == "english":
        return basic_instruction
    return _translate_instruction(basic_instruction, target_language=lang)


def construct_prompt(
    instruction: str,
    test_example: dict,
    zero_shot: bool,
    dataset: str,
    num_examples: int,
    lang: str,
    config: Dict[str, str],
) -> str:
    """Format a zero-shot or few-shot prompt for a single test example.

    Args:
        instruction: Instruction text; when empty/falsy one is generated with
            ``create_instruction(lang, config["prefix"])``.
        test_example: Mapping with at least a ``"text"`` entry.
        zero_shot: If true, build a plain ``PromptTemplate`` with no examples.
        dataset: Not used by this function.
        num_examples: Number of in-context examples requested in few-shot mode.
        lang: Lower-case language name of the data.
        config: Reads ``"prefix"`` (only when ``instruction`` is empty),
            ``"context"`` (few-shot only) and ``"input"``.

    Returns:
        The prompt formatted with ``test_example["text"]``.
    """
    if not instruction:
        print(lang)
        instruction = create_instruction(lang, config['prefix'])

    # NOTE(review): the example template is "Text/Summary" while the generated
    # instruction describes NER -- confirm this pairing is intended.
    example_prompt = PromptTemplate(
        input_variables=["summary", "text"],
        template="Text: {text}\nSummary: {summary}",
    )

    zero_shot_template = f"""{instruction}""" + "\n Input: {text} "

    # load_xlsum_data / choose_few_shot_examples / _translate_example are not
    # defined in the visible part of this file; they must be provided elsewhere.
    test_data = load_xlsum_data(lang=lang, split="test", limit=100)
    print(test_data)
    print(num_examples)
    print(lang)

    ic_examples = []
    if not zero_shot:
        # NOTE(review): few-shot examples are drawn from the *test* split
        # loaded above -- verify this is not unintended leakage.
        ic_examples = choose_few_shot_examples(
            train_dataset=test_data,
            few_shot_size=num_examples,
            context=[config["context"]] * num_examples,
            selection_criteria="random",
            lang=lang,
        )

    if zero_shot:
        prompt = PromptTemplate(
            input_variables=["text"], template=zero_shot_template
        )
    else:
        prompt = FewShotPromptTemplate(
            examples=ic_examples,
            prefix=instruction,
            example_prompt=example_prompt,
            suffix=": {text}",
            input_variables=["text"],
        )

    print("lang", lang)
    print(config["input"], lang)

    # Translate the test example when the configured input language differs
    # from the language of the data.
    if config["input"] != lang:
        test_example = _translate_example(
            example=test_example, src_language=lang, target_language=config["input"]
        )

    print("test_example", prompt)
    return prompt.format(text=test_example["text"])