|
import streamlit as st |
|
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline |
|
import torch |
|
import os |
|
|
|
|
|
# Page title and the free-text field holding the text to translate.
st.header("Text Machine Translation")
input_text = st.text_input("Enter text to translate:")

# Display name -> two-letter language code. The codes are what the
# Helsinki-NLP checkpoint names and the Google request parameters use.
# Insertion order is the order in which the languages appear in the menus.
langs = {
    "German": "de",
    "Romanian": "ro",
    "English": "en",
    "French": "fr",
    "Spanish": "es",
    "Italian": "it",
}

# Menu entries for the language pickers, in the dict's insertion order.
options = list(langs)

# Entries offered in the model picker. "Helsinki-NLP" is a family name that is
# expanded to a concrete checkpoint once the language pair is known; "Google"
# selects the HTTP translation endpoint instead of a local model.
models = [
    "Helsinki-NLP",
    "t5-base",
    "t5-small",
    "t5-large",
    "Unbabel/TowerInstruct-7B-v0.2",
    "Unbabel/TowerInstruct-Mistral-7B-v0.2",
    'Google',
]
|
|
|
|
|
# Language pickers share one row: source in the first column, target in the
# second. The model picker is rendered below them at full width.
scol, tcol = st.columns(2)
sselected_language = scol.selectbox(
    "Source language:", options, index=0, placeholder="Select source language"
)
tselected_language = tcol.selectbox(
    "Target language:", options, index=1, placeholder="Select target language"
)
model_name = st.selectbox(
    "Select a model:", models, index=0, placeholder="Select language model"
)

# Two-letter codes for the chosen languages, looked up by display name.
sl, tl = langs[sselected_language], langs[tselected_language]

# Mirror the current selections into session state under fixed keys.
for state_key, state_value in (
    ("sselected_language", sselected_language),
    ("tselected_language", tselected_language),
    ("model_name", model_name),
):
    st.session_state[state_key] = state_value
|
|
|
@st.cache_resource(max_entries=4)
def _load_auto_seq2seq(checkpoint):
    """Return ``(tokenizer, model)`` for ``checkpoint`` using the Auto classes.

    Streamlit re-executes the script on every widget interaction, so the
    loaded objects are cached instead of being re-read on each rerun. The
    cache is bounded because every language pair is a separate checkpoint.

    Raises:
        EnvironmentError: propagated from ``from_pretrained`` when the
            checkpoint cannot be loaded.
    """
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )


@st.cache_resource(max_entries=2)
def _load_t5(checkpoint):
    """Return ``(tokenizer, model)`` for a T5 checkpoint, cached across reruns."""
    return (
        T5Tokenizer.from_pretrained(checkpoint),
        T5ForConditionalGeneration.from_pretrained(checkpoint),
    )


if model_name == 'Helsinki-NLP':
    # The menu entry is only a family name: the real checkpoint depends on the
    # language pair. Try the opus-mt checkpoint first, then opus-tatoeba.
    # ``model_name`` is rebound to the checkpoint that was attempted last.
    for model_name in (
        f"Helsinki-NLP/opus-mt-{sl}-{tl}",
        f"Helsinki-NLP/opus-tatoeba-{sl}-{tl}",
    ):
        try:
            tokenizer, model = _load_auto_seq2seq(model_name)
        except EnvironmentError:
            continue
        break
    else:
        # Neither checkpoint could be loaded (e.g. source and target are the
        # same language): report it in the page instead of crashing.
        st.error(
            f"No Helsinki-NLP model could be loaded for {sselected_language} "
            f"to {tselected_language}. Choose a different language pair or model."
        )
        st.stop()
elif model_name.startswith('t5'):
    tokenizer, model = _load_t5(model_name)
|
|
|
# Echo the current choices so the user can see which model will be used.
st.write(
    "Selected language combination:",
    sselected_language,
    " - ",
    tselected_language,
    "Selected model:",
    model_name,
)

# True only on the script run triggered by clicking the button.
submit_button = st.button("Translate")

# Empty text element rendered below the button; the name is rebound when a
# translation is displayed.
translated_textarea = st.text("")
|
|
|
|
|
@st.cache_resource(max_entries=1)
def _load_chat_pipeline(checkpoint):
    """Build the text-generation pipeline for a chat-style checkpoint.

    Cached so the weights are not reloaded on every button click; only one
    pipeline is kept at a time to bound memory use.
    """
    return pipeline(
        "text-generation",
        model=checkpoint,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )


if submit_button and not input_text.strip():
    # Nothing to translate: do not run a model or call the endpoint.
    st.warning("Please enter some text to translate.")
elif submit_button:
    if model_name.startswith(('Helsinki-NLP', 't5')):
        # Both families are seq2seq models sharing the same
        # encode / generate / decode path; only the prompt differs.
        if model_name.startswith('t5'):
            # T5 expects the task spelled out as a text prefix.
            prompt = f'translate {sselected_language} to {tselected_language}: {input_text}'
        else:
            prompt = input_text
        print(prompt)
        input_ids = tokenizer.encode(prompt, return_tensors='pt')
        # Explicit budget: without it generation can fall back to the
        # library's short default length and truncate the translation.
        output_ids = model.generate(input_ids, max_new_tokens=512)
        translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    elif model_name.startswith('Google'):
        # Imported lazily so the dependency is only needed for this backend.
        import requests

        url = os.environ.get('GTRANSURL')
        if not url:
            st.error("The GTRANSURL environment variable is not set; the Google backend is unavailable.")
            st.stop()
        params = {'client': 'gtx', 'sl': sl, 'tl': tl, 'dt': 't', 'ie': 'UTF-8', 'oe': 'UTF-8', 'model': 'nmt', 'q': input_text}
        # Bound the wait and surface HTTP errors before parsing the body.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        # NOTE(review): the first element of the payload is assumed to be a
        # list of segments whose first item is the translated text; join all
        # of them so multi-sentence input is not cut to its first sentence.
        segments = response.json()[0]
        translated_text = ''.join(
            segment[0] for segment in segments if segment and segment[0]
        )
    else:
        # Instruction-tuned chat models: wrap the request in the chat template.
        pipe = _load_chat_pipeline(model_name)
        messages = [
            {"role": "user", "content": f"Translate the following text from {sselected_language} into {tselected_language}.\n{sselected_language}: {input_text}.\n{tselected_language}:"},
        ]
        # The generation prompt opens the assistant turn so the model answers
        # rather than continuing the user message.
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # return_full_text=False keeps only the completion; otherwise the
        # prompt itself is echoed back as part of the "translation".
        outputs = pipe(prompt, max_new_tokens=256, do_sample=False, return_full_text=False)
        translated_text = outputs[0]["generated_text"].strip()

    print(translated_text)
    st.write(f"Translated text from {sselected_language} to {tselected_language} using {model_name}:")
    translated_textarea = st.text(translated_text)