import gradio as gr
import spaces
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, pipeline
import languagecodes
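# languagecodes is assumed to be a helper module shipped with this Space; the code below
# relies on it exposing iso_languages, nllb_language_codes and mbart_large_languages mappings.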
favourite_langs = {"German": "de", "Romanian": "ro", "English": "en", "-----": "-----"}
all_langs = languagecodes.iso_languages
# Build the dropdown options list, with the favourite languages first
options = list(favourite_langs.keys())
options.extend(list(all_langs.keys()))
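# Models offered in the UI. "Helsinki-NLP" is resolved to a language-pair-specific
# opus-mt / opus-tatoeba checkpoint at translation time.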
models = ["Helsinki-NLP",
"t5-base", "t5-small", "t5-large",
"facebook/nllb-200-distilled-600M",
"facebook/nllb-200-distilled-1.3B",
"facebook/mbart-large-50-many-to-many-mmt",
"utter-project/EuroLLM-1.7B",
"Unbabel/TowerInstruct-7B-v0.2",
"Unbabel/TowerInstruct-Mistral-7B-v0.2"
]
def model_to_cuda(model):
    # Move the model to GPU if available
    if torch.cuda.is_available():
        model = model.to('cuda')
        print("CUDA is available! Using GPU.")
    else:
        print("CUDA not available! Using CPU.")
    return model
def eurollm(model_name, sl, tl, input_text):
    # EuroLLM is a plain causal LM: prompt it with "<source language>: <text> <target language>:"
    # and keep whatever it generates after the target-language tag.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
    prompt = f"{sl}: {input_text} {tl}:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512)
    output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text after the last "<target language>:" marker
    result = output.rsplit(f'{tl}:')[-1].strip()
    return result
def nllb(model_name, sl, tl, input_text):
    # NLLB expects FLORES-200 language codes (e.g. "deu_Latn"), passed as src_lang/tgt_lang.
    tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=sl)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
    translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=sl, tgt_lang=tl)
    translated_text = translator(input_text, max_length=512)
    return translated_text[0]['translation_text']
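# Main entry point: dispatches to the model-specific translation path.
# The @spaces.GPU decorator requests a GPU for this call on ZeroGPU Spaces.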
@spaces.GPU
def translate_text(input_text, sselected_language, tselected_language, model_name):
    sl = all_langs[sselected_language]
    tl = all_langs[tselected_language]
    message_text = f'Translated from {sselected_language} to {tselected_language} with {model_name}'
    print(message_text)

    if model_name == "Helsinki-NLP":
        # Try the opus-mt checkpoint for this language pair first, then fall back to opus-tatoeba.
        try:
            model_name = f"Helsinki-NLP/opus-mt-{sl}-{tl}"
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = model_to_cuda(AutoModelForSeq2SeqLM.from_pretrained(model_name))
        except EnvironmentError:
            try:
                model_name = f"Helsinki-NLP/opus-tatoeba-{sl}-{tl}"
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = model_to_cuda(AutoModelForSeq2SeqLM.from_pretrained(model_name))
            except EnvironmentError as error:
                return f"Error finding model: {model_name}! Try another available language combination.", error
    if 'eurollm' in model_name.lower():
        translated_text = eurollm(model_name, sselected_language, tselected_language, input_text)
        return translated_text, message_text

    if 'nllb' in model_name.lower():
        nllb_sl, nllb_tl = languagecodes.nllb_language_codes[sselected_language], languagecodes.nllb_language_codes[tselected_language]
        translated_text = nllb(model_name, nllb_sl, nllb_tl, input_text)
        return translated_text, message_text
    if model_name.startswith('facebook/mbart-large'):
        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
        model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        # Translate source to target: set the source language on the tokenizer and
        # force the first generated token to be the target language code.
        tokenizer.src_lang = languagecodes.mbart_large_languages[sselected_language]
        encoded = tokenizer(input_text, return_tensors="pt")
        generated_tokens = model.generate(
            **encoded,
            forced_bos_token_id=tokenizer.lang_code_to_id[languagecodes.mbart_large_languages[tselected_language]]
        )
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0], message_text
    if 'Unbabel' in model_name:
        # TowerInstruct is a chat model: wrap the request in its chat template and
        # keep only the text generated after the prompt.
        pipe = pipeline("text-generation", model=model_name, torch_dtype=torch.bfloat16, device_map="auto")
        messages = [{"role": "user",
                     "content": f"Translate the following text from {sselected_language} into {tselected_language}.\n{sselected_language}: {input_text}.\n{tselected_language}:"}]
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
        outputs = pipe(prompt, max_new_tokens=256, do_sample=False)
        translated_text = outputs[0]["generated_text"][len(prompt):].strip()
        return translated_text, message_text
    if model_name.startswith('t5'):
        tokenizer = T5Tokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")

    # T5 needs a task prefix; Helsinki-NLP models (loaded above) take the raw text.
    if model_name.startswith("Helsinki-NLP"):
        prompt = input_text
    else:
        prompt = f"translate {sselected_language} to {tselected_language}: {input_text}"

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(input_ids, max_length=512)
    translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    message_text = f'Translated from {sselected_language} to {tselected_language} with {model_name}'
    print(f'Translating from {sselected_language} to {tselected_language} with {model_name}:', f'{input_text} = {translated_text}', sep='\n')
    return translated_text, message_text
# Swap the source and target dropdown values
def swap_languages(src_lang, tgt_lang):
    return tgt_lang, src_lang
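# Build the Gradio Blocks UI: text input, language dropdowns with a swap button,
# model selector, and output fields for the translation and status messages.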
def create_interface():
    with gr.Blocks() as interface:
        gr.Markdown("### Machine Text Translation")
        with gr.Row():
            input_text = gr.Textbox(label="Enter text to translate:", placeholder="Type your text here, maximum 512 tokens")
        with gr.Row():
            sselected_language = gr.Dropdown(choices=options, value=options[0], label="Source language", interactive=True)
            tselected_language = gr.Dropdown(choices=options, value=options[1], label="Target language", interactive=True)
            swap_button = gr.Button("Swap Languages")
            swap_button.click(fn=swap_languages, inputs=[sselected_language, tselected_language], outputs=[sselected_language, tselected_language])
        model_name = gr.Dropdown(choices=models, label="Select a model", value=models[4], interactive=True)
        translate_button = gr.Button("Translate")
        translated_text = gr.Textbox(label="Translated text:", placeholder="Display field for translation", interactive=False, show_copy_button=True)
        message_text = gr.Textbox(label="Messages:", placeholder="Display field for status and error messages", interactive=False)

        translate_button.click(
            translate_text,
            inputs=[input_text, sselected_language, tselected_language, model_name],
            outputs=[translated_text, message_text]
        )
    return interface
interface = create_interface()
interface.launch()