Spaces:
Runtime error
Runtime error
File size: 5,271 Bytes
15f5208 208053f 15f5208 208053f 707f578 15f5208 208053f 707f578 208053f 15f5208 208053f 15f5208 208053f f31b1b7 208053f f31b1b7 208053f 707f578 208053f 707f578 208053f f31b1b7 208053f 707f578 208053f 707f578 208053f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
from typing import List, Dict, Union
import numpy as np
from datasets import Dataset, load_dataset
from easygoogletranslate import EasyGoogleTranslate
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
LANGUAGE_TO_SUFFIX = {
"chinese_simplified": "zh-CN",
"french": "fr",
"portuguese": "pt",
"english": "en",
"arabic": "ar",
"hindi": "hi",
"indonesian": "id",
"amharic": "am",
"bengali": "bn",
"burmese": "my",
"uzbek": "uz",
"nepali": "ne",
"japanese": "ja",
"spanish": "es",
"turkish": "tr",
"persian": "fa",
"azerbaijani": "az",
"korean": "ko",
"hebrew": "he",
"telugu": "te",
"german": "de",
"greek": "el",
"tamil": "ta",
"assamese": "as",
"vietnamese": "vi",
"russian": "ru",
"romanian": "ro",
"malayalam": "ml",
"swahili": "sw",
"bulgarian": "bg",
"thai": "th",
"urdu": "ur",
"italian": "it",
"polish": "pl",
"dutch": "nl",
"swedish": "sv",
"danish": "da",
"norwegian": "no",
"finnish": "fi",
"hungarian": "hu",
"czech": "cs",
"slovak": "sk",
"ukrainian": "uk"
}
def choose_few_shot_examples(
train_dataset: Dataset, few_shot_size: int, context: List[str], selection_criteria: str, lang: str,
) -> List[Dict[str, Union[str, int]]]:
selected_examples = []
example_idxs = []
if selection_criteria == "first_k":
example_idxs = list(range(few_shot_size))
elif selection_criteria == "random":
example_idxs = (
np.random.choice(len(train_dataset), size=few_shot_size, replace=True)
.astype(int)
.tolist()
)
ic_examples = [{'text': train_dataset[idx]['text'], 'summary': train_dataset[idx]['summary']} for idx in
example_idxs]
for idx, ic_language in enumerate(context):
selected_examples.append(ic_examples[idx]) if ic_language == lang else (
selected_examples.append(
_translate_example(example=ic_examples[idx], src_language=lang, target_language=ic_language)))
return selected_examples
def _translate_instruction(basic_instruction: str, target_language: str) -> str:
translator = EasyGoogleTranslate(
source_language="en",
target_language=LANGUAGE_TO_SUFFIX[target_language],
timeout=50,
)
return translator.translate(basic_instruction)
def _translate_example(example: Dict[str, str], src_language: str, target_language: str):
translator = EasyGoogleTranslate(source_language=LANGUAGE_TO_SUFFIX[src_language],
target_language=LANGUAGE_TO_SUFFIX[target_language],
timeout=30)
try:
return {'text': translator.translate(example['text']), 'summary': ''}
except Exception as e:
print(e)
def create_instruction(lang: str, instruction_language: str, expected_output: str):
basic_instruction = (
f"Write a summary of the given <Text> \n The output should be in {expected_output} "
f"\n The output must be up to 2 sentences maximum!!!"
)
print(lang)
return (
basic_instruction
if instruction_language == "english"
else _translate_instruction(basic_instruction, target_language=lang)
)
def load_xlsum_data(lang, split, limit=5):
"""Loads the xlsum dataset"""
dataset = load_dataset("csebuetnlp/xlsum", lang)[split]
return dataset.select(range(limit))
def construct_prompt(
instruction: str,
test_example: dict,
zero_shot: bool,
dataset: str,
num_examples: int,
lang: str,
config: Dict[str, str],
):
if not instruction:
print(lang)
instruction = create_instruction(lang, config['prefix'], config['output'])
example_prompt = PromptTemplate(
input_variables=["summary", "text"], template="Text: {text}\nSummary: {summary}"
)
zero_shot_template = f"""{instruction}""" + "\n Input: {text} " ""
if not zero_shot:
try:
test_data = load_xlsum_data(lang=lang, split="test", limit=100)
except Exception as e:
raise KeyError(f"{lang} is not supported in XlSum dataset, choose supported language in few-shot")
ic_examples = []
if not zero_shot:
ic_examples = choose_few_shot_examples(
train_dataset=test_data,
few_shot_size=num_examples,
context=[config["context"]] * num_examples,
selection_criteria="random",
lang=lang,
)
prompt = (
FewShotPromptTemplate(
examples=ic_examples,
prefix=instruction,
example_prompt=example_prompt,
suffix="<Text>: {text}",
input_variables=["text"],
)
if not zero_shot
else PromptTemplate(input_variables=["text"], template=zero_shot_template)
)
print("lang", lang)
print(config["input"], lang)
if config["input"] != lang:
test_example = _translate_example(
example=test_example, src_language=lang, target_language=config["input"]
)
print("test_example", prompt)
return prompt.format(text=test_example["text"])
|