import streamlit as st
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
import os
# Create the app layout
st.header("Text Machine Translation")
input_text = st.text_input("Enter text to translate:")
# Create a list of options for the select box
options = ["German", "Romanian", "English", "French", "Spanish", "Italian"]
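# Map display names to the ISO 639-1 codes used in checkpoint names and API parameters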
langs = {"English":"en", "Romanian":"ro", "German":"de", "French":"fr", "Spanish":"es", "Italian":"it"}
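# Available backends: per-pair OPUS MT models, multilingual T5, instruction-tuned Tower LLMs, and a Google web endpoint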
models = ["Helsinki-NLP", "t5-base", "t5-small", "t5-large", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2", 'Google']
# Create two columns
scol, tcol = st.columns(2)
# Place select boxes in columns
with scol:
    sselected_language = st.selectbox("Source language:", options, index=0, placeholder="Select source language")
with tcol:
    tselected_language = st.selectbox("Target language:", options, index=1, placeholder="Select target language")
model_name = st.selectbox("Select a model:", models, index=0, placeholder="Select language model")
sl = langs[sselected_language]
tl = langs[tselected_language]
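# Persist the current selections across Streamlit reruns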
st.session_state["sselected_language"] = sselected_language
st.session_state["tselected_language"] = tselected_language
st.session_state["model_name"] = model_name
if model_name == 'Helsinki-NLP':
    try:
        model_name = f"Helsinki-NLP/opus-mt-{sl}-{tl}"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    except EnvironmentError:
        model_name = f"Helsinki-NLP/opus-tatoeba-{sl}-{tl}"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
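# T5 checkpoints are multilingual; the language pair is specified in the text prompt
# rather than in the checkpoint name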
if model_name.startswith('t5'):
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
st.write("Selected language combination:", sselected_language, " - ", tselected_language, "Selected model:", model_name)
submit_button = st.button("Translate")
translated_textarea = st.text("")
# Handle the submit button click
if submit_button:
    if model_name.startswith('Helsinki-NLP'):
        prompt = input_text
        print(prompt)
        input_ids = tokenizer.encode(prompt, return_tensors='pt')
        # Perform translation
        output_ids = model.generate(input_ids)
        # Decode the translated text
        translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    elif model_name.startswith('Google'):
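        # Presumably an unofficial Google-Translate-style endpoint; the base URL is kept
        # out of the source and supplied via the GTRANSURL environment variable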
        import requests
        url = os.environ['GTRANSURL']
        params = {'client': 'gtx', 'sl': sl, 'tl': tl, 'dt': 't', 'ie': 'UTF-8', 'oe': 'UTF-8', 'model': 'nmt', 'q': input_text}
        response = requests.get(url, params=params)
        result = response.json()
        # The endpoint returns one entry per sentence; join them so multi-sentence input is translated in full
        translated_text = ''.join(segment[0] for segment in result[0])
        print(result[0])
    elif model_name.startswith('t5'):
        prompt = f'translate {sselected_language} to {tselected_language}: {input_text}'
        print(prompt)
        input_ids = tokenizer.encode(prompt, return_tensors='pt')
        # Perform translation; without an explicit limit, generate() stops at the
        # default max_length (20 tokens) and truncates longer translations
        output_ids = model.generate(input_ids, max_new_tokens=256)
        # Decode the translated text
        translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    else:
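        # TowerInstruct models are decoder-only LLMs, so translation goes through the
        # text-generation pipeline with a chat-formatted prompt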
        pipe = pipeline("text-generation", model=model_name, torch_dtype=torch.bfloat16, device_map="auto")
        # We use the tokenizer’s chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
        messages = [
            {"role": "user", "content": f"Translate the following text from {sselected_language} into {tselected_language}.\n{sselected_language}: {input_text}\n{tselected_language}:"},
        ]
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
        # return_full_text=False keeps only the model's continuation, i.e. the translation without the echoed prompt
        outputs = pipe(prompt, max_new_tokens=256, do_sample=False, return_full_text=False)
        translated_text = outputs[0]["generated_text"]
    # Display the translated text
    print(translated_text)
    st.write(f"Translated text from {sselected_language} to {tselected_language} using {model_name}:")
    translated_textarea = st.text(translated_text)