# Anonymous
# add files
# 208053f
# raw
# history blame
# 3.68 kB
from typing import List, Dict, Any
from easygoogletranslate import EasyGoogleTranslate
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
# Lowercase language name -> short language code, intended (per the name)
# for Google Translate. Each language is listed exactly once.
LANGUAGE_TO_GOOGLE_TRANSLATE_MARK = {
    "english": "en",
    "bambara": "bm",
    "ewe": "ee",
    "hausa": "ha",
    "igbo": "ig",
    "kinyarwanda": "rw",
    "chichewa": "ny",
    "twi": "ak",
    "yoruba": "yo",
    "slovak": "sk",
    "serbian": "sr",
    "swedish": "sv",
    "vietnamese": "vi",
    "italian": "it",
    "portuguese": "pt",
    "chinese": "zh",
    "french": "fr",
}
# Lowercase language name -> short per-language prefix. The African
# languages use three-letter codes here; the remaining languages use
# two-letter codes.
LANGUAGE_TO_PREFIX = {
    "bambara": "bam",
    "ewe": "ewe",
    "fon": "fon",
    "hausa": "hau",
    "igbo": "ibo",
    "kinyarwanda": "kin",
    "chichewa": "nya",
    "twi": "twi",
    "yoruba": "yor",
    "slovak": "sk",
    "serbian": "sr",
    "swedish": "sv",
    "vietnamese": "vi",
    "italian": "it",
    "portuguese": "pt",
    "chinese": "zh",
    "english": "en",
    "french": "fr",
}

# Backward-compatible alias: the table was originally published under this
# misspelled name, so existing references must keep resolving to the same
# dict object.
LANGAUGE_TO_PREFIX = LANGUAGE_TO_PREFIX
def _translate_instruction(basic_instruction: str, target_language: str) -> str:
    """Translate an English instruction into ``target_language``.

    Args:
        basic_instruction: Instruction text written in English.
        target_language: Lowercase language name, e.g. ``"hausa"``.

    Returns:
        Whatever ``EasyGoogleTranslate.translate`` returns for the instruction.

    Raises:
        KeyError: If ``target_language`` is listed in neither language table.
    """
    # Prefer the Google Translate code table: LANGAUGE_TO_PREFIX holds
    # three-letter codes ("hau", "ibo", "yor", ...) for several languages,
    # whereas the Google table lists the two-letter codes for them.
    # Fall back to the prefix table for names only it contains (e.g. "fon")
    # so every previously accepted language still resolves.
    google_code = LANGUAGE_TO_GOOGLE_TRANSLATE_MARK.get(target_language)
    if google_code is None:
        google_code = LANGAUGE_TO_PREFIX[target_language]

    translator = EasyGoogleTranslate(
        source_language="en",
        target_language=google_code,
        timeout=10,
    )
    return translator.translate(basic_instruction)
def create_instruction(lang: str, expected_output: str):
    """Build the NER task instruction, translating it for non-English ``lang``.

    Args:
        lang: Lowercase name of the language the instruction is wanted in.
        expected_output: Language name interpolated into the last sentence
            of the instruction.

    Returns:
        The English instruction when ``lang == "english"``; otherwise the
        result of ``_translate_instruction`` for ``lang``.
    """
    english_instruction = f"""You are an NLP assistant whose
purpose is to perform Named Entity Recognition
(NER). You will need to give each entity a tag, from the following:
PER means a person, ORG means organization.
LOC means a location entity.
The output should be a list of tuples of the format:
['Tag: Entity', 'Tag: Entity'] for each entity in the sentence.
The entities should be in {expected_output} language"""

    # Only non-English requests go through the translation helper.
    if lang != "english":
        return _translate_instruction(english_instruction, target_language=lang)
    return english_instruction
def construct_prompt(
    instruction: str,
    test_example: dict,
    zero_shot: bool,
    dataset: str,
    num_examples: int,
    lang: str,
    config: Dict[str, str],
):
    """Assemble the final prompt text for one test example.

    Args:
        instruction: Instruction used as the prompt prefix. When falsy, one
            is generated with ``create_instruction(lang, config["prefix"])``.
        test_example: Mapping whose ``"text"`` entry is inserted into the
            prompt.
        zero_shot: When true, build a plain instruction + input prompt; when
            false, build a few-shot prompt with ``num_examples`` examples.
        dataset: Not used by this function; kept so existing callers keep
            working.
        num_examples: Number of in-context examples requested for the
            few-shot prompt.
        lang: Lowercase language name of the test example.
        config: Settings read here via the keys ``"prefix"`` (only when
            ``instruction`` is falsy), ``"context"`` (few-shot only) and
            ``"input"``.

    Returns:
        The result of ``prompt.format(text=test_example["text"])``.

    Note:
        ``load_xlsum_data``, ``choose_few_shot_examples`` and
        ``_translate_example`` are expected to be in scope at call time.
    """
    # Function-scope import so this block does not rely on a module-level
    # ``import logging`` being present.
    import logging

    log = logging.getLogger(__name__)

    if not instruction:
        log.debug("no instruction supplied; generating one for lang=%s", lang)
        instruction = create_instruction(lang, config["prefix"])

    if zero_shot:
        # Zero-shot needs no example pool, so nothing is loaded here.
        prompt = PromptTemplate(
            input_variables=["text"],
            template=f"{instruction}" + "\n Input: {text} ",
        )
    else:
        # NOTE(review): the few-shot pool is drawn from the "test" split and
        # passed as ``train_dataset`` -- confirm this is intended.
        example_pool = load_xlsum_data(lang=lang, split="test", limit=100)
        log.debug(
            "few-shot pool loaded: lang=%s num_examples=%s", lang, num_examples
        )
        ic_examples = choose_few_shot_examples(
            train_dataset=example_pool,
            few_shot_size=num_examples,
            context=[config["context"]] * num_examples,
            selection_criteria="random",
            lang=lang,
        )
        example_prompt = PromptTemplate(
            input_variables=["summary", "text"],
            template="Text: {text}\nSummary: {summary}",
        )
        prompt = FewShotPromptTemplate(
            examples=ic_examples,
            prefix=instruction,
            example_prompt=example_prompt,
            suffix="<Text>: {text}",
            input_variables=["text"],
        )

    # Translate the test example when the configured input language differs
    # from the language the example is written in.
    input_language = config["input"]
    log.debug("input language=%s, example language=%s", input_language, lang)
    if input_language != lang:
        test_example = _translate_example(
            example=test_example, src_language=lang, target_language=input_language
        )

    log.debug("prompt template: %s", prompt)
    return prompt.format(text=test_example["text"])