Spaces:

naacl-anonymous
/

selective_pre_translation

Runtime error

selective_pre_translation / tasks /ner.py

Anonymous

add files

208053f about 1 year ago

3.68 kB

	from typing import List, Dict, Any

	from easygoogletranslate import EasyGoogleTranslate
	from langchain.prompts import PromptTemplate, FewShotPromptTemplate

	# Maps a lowercase language name to the language code handed to Google
	# Translate. Each language is listed exactly once.
	LANGUAGE_TO_GOOGLE_TRANSLATE_MARK = {
	    "english": "en",
	    "bambara": "bm",
	    "ewe": "ee",
	    "hausa": "ha",
	    "igbo": "ig",
	    "kinyarwanda": "rw",
	    "chichewa": "ny",
	    "twi": "ak",
	    "yoruba": "yo",
	    "slovak": "sk",
	    "serbian": "sr",
	    "swedish": "sv",
	    "vietnamese": "vi",
	    "italian": "it",
	    "portuguese": "pt",
	    "chinese": "zh",
	    "french": "fr",
	}

	# Maps a lowercase language name to a short per-language prefix.
	# NOTE(review): the identifier is misspelled ("LANGAUGE"); it is kept as-is
	# because other code refers to it by this exact name.
	# NOTE(review): presumably these prefixes select per-language dataset
	# resources -- confirm against the callers.
	LANGAUGE_TO_PREFIX = {
	    # Languages identified by a three-letter prefix.
	    "bambara": "bam",
	    "ewe": "ewe",
	    "fon": "fon",
	    "hausa": "hau",
	    "igbo": "ibo",
	    "kinyarwanda": "kin",
	    "chichewa": "nya",
	    "twi": "twi",
	    "yoruba": "yor",
	    # Languages identified by a two-letter prefix.
	    "slovak": "sk",
	    "serbian": "sr",
	    "swedish": "sv",
	    "vietnamese": "vi",
	    "italian": "it",
	    "portuguese": "pt",
	    "chinese": "zh",
	    "english": "en",
	    "french": "fr",
	}


	def _translate_instruction(basic_instruction: str, target_language: str) -> str:
	    """Translate an English instruction into ``target_language``.

	    Args:
	        basic_instruction: Instruction text written in English.
	        target_language: Lowercase language name, e.g. ``"hausa"``.

	    Returns:
	        Whatever ``EasyGoogleTranslate.translate`` returns for the
	        instruction (annotated as ``str``).

	    Raises:
	        KeyError: If ``target_language`` is in neither language table.
	    """
	    # Google Translate needs its own language codes ("ha", "bm", ...), not the
	    # prefixes ("hau", "bam", ...). Languages that only appear in the prefix
	    # table (e.g. "fon") keep using their prefix, as before.
	    google_code = LANGUAGE_TO_GOOGLE_TRANSLATE_MARK.get(target_language)
	    if google_code is None:
	        google_code = LANGAUGE_TO_PREFIX[target_language]

	    translator = EasyGoogleTranslate(
	        source_language="en",
	        target_language=google_code,
	        timeout=10,
	    )
	    return translator.translate(basic_instruction)


	def create_instruction(lang: str, expected_output: str):
	    """Return the NER task instruction, translated when ``lang`` is not English.

	    Args:
	        lang: Lowercase name of the language the instruction should be in.
	            ``"english"`` returns the template untranslated.
	        expected_output: Language name interpolated into the final sentence
	            of the instruction.

	    Returns:
	        The English instruction, or the result of passing it through
	        ``_translate_instruction`` for any other ``lang``.
	    """
	    english_prompt = f"""You are an NLP assistant whose
	purpose is to perform Named Entity Recognition
	(NER). You will need to give each entity a tag, from the following:
	PER means a person, ORG means organization.
	LOC means a location entity.
	The output should be a list of tuples of the format:
	['Tag: Entity', 'Tag: Entity'] for each entity in the sentence.
	The entities should be in {expected_output} language"""

	    # Only non-English instructions need a round trip through the translator.
	    if lang != "english":
	        return _translate_instruction(english_prompt, target_language=lang)
	    return english_prompt

	def construct_prompt(
	    instruction: str,
	    test_example: dict,
	    zero_shot: bool,
	    dataset: str,
	    num_examples: int,
	    lang: str,
	    config: Dict[str, str],
	):
	    """Build the fully formatted prompt for a single test example.

	    Args:
	        instruction: Task instruction placed at the top of the prompt. When
	            falsy, one is generated with
	            ``create_instruction(lang, config['prefix'])``.
	        test_example: Mapping whose ``"text"`` entry is inserted into the
	            prompt.
	        zero_shot: If true, the prompt is the instruction followed by the
	            input text; otherwise in-context examples are added.
	        dataset: Not used by this function.
	        num_examples: Number of in-context examples requested in few-shot
	            mode.
	        lang: Lowercase language name of the example.
	        config: Settings read here: ``"prefix"`` (only when the instruction is
	            generated), ``"context"`` (few-shot only) and ``"input"``.

	    Returns:
	        The result of ``prompt.format(text=...)`` for the chosen template.
	    """
	    if not instruction:
	        print(lang)
	        instruction = create_instruction(lang, config['prefix'])

	    # Debug output.
	    print(num_examples)
	    print(lang)

	    if zero_shot:
	        # Zero-shot needs neither the example pool nor the example template,
	        # so the dataset is not loaded at all on this path.
	        zero_shot_template = f"""{instruction}""" + "\n Input: {text} " ""
	        prompt = PromptTemplate(input_variables=["text"], template=zero_shot_template)
	    else:
	        # NOTE(review): the labels are "Text"/"Summary" although this module
	        # builds NER prompts -- confirm this is the intended example format.
	        example_prompt = PromptTemplate(
	            input_variables=["summary", "text"],
	            template="Text: {text}\nSummary: {summary}",
	        )

	        # NOTE(review): load_xlsum_data and choose_few_shot_examples are
	        # neither defined nor imported in the visible part of this file.
	        # NOTE(review): the in-context examples are drawn from the "test"
	        # split -- confirm this is intended.
	        test_data = load_xlsum_data(lang=lang, split="test", limit=100)
	        print(test_data)

	        ic_examples = choose_few_shot_examples(
	            train_dataset=test_data,
	            few_shot_size=num_examples,
	            context=[config["context"]] * num_examples,
	            selection_criteria="random",
	            lang=lang,
	        )

	        prompt = FewShotPromptTemplate(
	            examples=ic_examples,
	            prefix=instruction,
	            example_prompt=example_prompt,
	            suffix="<Text>: {text}",
	            input_variables=["text"],
	        )

	    print("lang", lang)
	    print(config["input"] , lang)
	    if config["input"] != lang:
	        # The prompt expects the example in config["input"]'s language.
	        # NOTE(review): _translate_example is neither defined nor imported in
	        # the visible part of this file.
	        test_example = _translate_example(
	            example=test_example, src_language=lang, target_language=config["input"]
	        )

	    print("test_example", prompt)
	    return prompt.format(text=test_example["text"])