import os
import requests
import streamlit as st
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
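
# Optional helper (an illustrative sketch, not part of the original app):
# st.cache_resource keeps the loaded tokenizer/model in memory across
# Streamlit reruns, so a model is not re-downloaded and re-initialized
# every time a widget changes.
@st.cache_resource
def load_seq2seq_model(name: str):
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSeq2SeqLM.from_pretrained(name)
    return tokenizer, model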

# Create the app layout
st.header("Text Machine Translation")
input_text = st.text_input("Enter text to translate:")
# Create a list of options for the select box
options = ["German", "Romanian", "English", "French", "Spanish"]
langs = {"English":"en", "Romanian":"ro", "German":"de", "French":"fr", "Spanish":"es", "Italian":"it"}
models = ["Helsinki-NLP", "t5-base", "t5-small", "t5-large", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2", "Google"]

# Create two columns
scol, tcol = st.columns(2)
# Place select boxes in columns
with scol:
    sselected_language = st.selectbox("Source language:", options, index=0, placeholder="Select source language")
with tcol:
    tselected_language = st.selectbox("Target language:", options, index=1, placeholder="Select target language")
model_name = st.selectbox("Select a model:", models, index=0, placeholder="Select language model")

sl = langs[sselected_language]
tl = langs[tselected_language]

st.session_state["sselected_language"] = sselected_language
st.session_state["tselected_language"] = tselected_language
st.session_state["model_name"] = model_name

if model_name == 'Helsinki-NLP':
    # Helsinki-NLP publishes one opus-mt model per language pair; fall back to
    # the opus-tatoeba variant when the opus-mt pair is not on the Hub.
    try:
        model_name = f"Helsinki-NLP/opus-mt-{sl}-{tl}"
        tokenizer, model = load_seq2seq_model(model_name)
    except EnvironmentError:
        model_name = f"Helsinki-NLP/opus-tatoeba-{sl}-{tl}"
        tokenizer, model = load_seq2seq_model(model_name)
if model_name.startswith('t5'):
    # T5 checkpoints handle translation as a plain text-to-text prompt.
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

st.write("Selected language combination:", sselected_language, " - ", tselected_language, "Selected model:", model_name)
submit_button = st.button("Translate")
translated_textarea = st.text("")

# Handle the submit button click
if submit_button:
    if model_name.startswith('Helsinki-NLP'):
        prompt = input_text
        print(prompt)
        input_ids = tokenizer.encode(prompt, return_tensors='pt')
        # Perform translation
        output_ids = model.generate(input_ids)
        # Decode the translated text
        translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    elif model_name.startswith('Google'):
        # The translation endpoint URL is read from the GTRANSURL environment
        # variable; sl/tl must be the two-letter codes, not the display names.
        url = os.environ['GTRANSURL']
        params = {'sl': sl, 'tl': tl, 'dt': 't', 'ie': 'UTF-8', 'oe': 'UTF-8', 'q': input_text}
        # Assumes a Google-Translate-style JSON payload in which the translated
        # string is the first element of the first returned segment.
        translated_text = requests.get(url, params=params).json()[0][0][0]
    elif model_name.startswith('t5'):
        prompt = f'translate {sselected_language} to {tselected_language}: {input_text}'
        print(prompt)
        input_ids = tokenizer.encode(prompt, return_tensors='pt')
        # Perform translation
        output_ids = model.generate(input_ids)
        # Decode the translated text
        translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    else:
        pipe = pipeline("text-generation", model=model_name, torch_dtype=torch.bfloat16, device_map="auto")
        # We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
        messages = [
            {"role": "user", "content": f"Translate the following text from {sselected_language} into {tselected_language}.\n{sselected_language}: {input_text}.\n{tselected_language}:"},
        ]
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # return_full_text=False keeps only the model's completion, without the prompt.
        outputs = pipe(prompt, max_new_tokens=256, do_sample=False, return_full_text=False)
        translated_text = outputs[0]["generated_text"].strip()

    # Display the translated text
    print(translated_text)
    st.write(f"Translated text from {sselected_language} to {tselected_language} using {model_name}:")
    translated_textarea = st.text(translated_text)
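
# To run this app locally (assuming the file is saved as app.py; the package
# list below is an assumption based on the imports used above):
#   pip install streamlit transformers torch sentencepiece requests
#   streamlit run app.py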