# Streamlit text-translation demo: Helsinki-NLP/OPUS-MT, T5, TowerInstruct,
# or an unofficial Google Translate endpoint. Run with: streamlit run app.py
import os
import requests
import streamlit as st
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Create the app layout
st.header("Text Machine Translation")
input_text = st.text_input("Enter text to translate:")
# Create a list of options for the select box
options = ["German", "Romanian", "English", "French", "Spanish", "Italian"]
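# Map each display name to the ISO-639-1 code used in model names and API calls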
langs = {"English":"en", "Romanian":"ro", "German":"de", "French":"fr", "Spanish":"es", "Italian":"it"}
models = ["Helsinki-NLP", "t5-base", "t5-small", "t5-large", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2", "Google"]

# Create two columns
scol, tcol = st.columns(2)
# Place select boxes in columns
with scol:
    sselected_language = st.selectbox("Source language:", options, index=0, placeholder="Select source language")
with tcol:
    tselected_language = st.selectbox("Target language:", options, index=1, placeholder="Select target language")
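# Model back ends: Marian seq2seq (Helsinki-NLP), T5, TowerInstruct chat models, or a web API (Google)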
model_name = st.selectbox("Select a model:", models, index=0, placeholder="Select language model")

sl = langs[sselected_language]
tl = langs[tselected_language]

# Persist the current selections across Streamlit reruns
st.session_state["sselected_language"] = sselected_language
st.session_state["tselected_language"] = tselected_language
st.session_state["model_name"] = model_name

if model_name == 'Helsinki-NLP':
    # Prefer the OPUS-MT checkpoint for this language pair; fall back to the
    # OPUS-Tatoeba checkpoint when no opus-mt model exists for it
    try:
        model_name = f"Helsinki-NLP/opus-mt-{sl}-{tl}"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    except EnvironmentError:
        model_name = f"Helsinki-NLP/opus-tatoeba-{sl}-{tl}"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
elif model_name.startswith('t5'):
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

st.write("Selected language combination:", sselected_language, " - ", tselected_language, "Selected model:", model_name)
submit_button = st.button("Translate")
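# Empty text element reserved for the translation output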
translated_textarea = st.text("")

# Handle the submit button click
if submit_button:
    if model_name.startswith('Helsinki-NLP'):
        # Encode, translate, then decode with the seq2seq model
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        output_ids = model.generate(input_ids)
        translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    elif model_name.startswith('Google'):
        # Unofficial Google Translate endpoint; the URL is supplied via the environment
        url = os.environ['GTRANSURL']
        params = {'client': 'gtx', 'sl': sl, 'tl': tl, 'dt': 't', 'ie': 'UTF-8', 'oe': 'UTF-8', 'model': 'nmt', 'q': input_text}
        response = requests.get(url, params=params)
        # The endpoint returns one segment per sentence; join them all rather
        # than keeping only the first one
        translated_text = ''.join(segment[0] for segment in response.json()[0])
    elif model_name.startswith('t5'):
        # T5 expects a natural-language task prefix naming the language pair
        prompt = f'translate {sselected_language} to {tselected_language}: {input_text}'
        input_ids = tokenizer.encode(prompt, return_tensors='pt')
        output_ids = model.generate(input_ids)
        translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    else:
        # TowerInstruct models are decoder-only chat models, so run them through
        # the text-generation pipeline using the tokenizer's chat template - see
        # https://huggingface.co/docs/transformers/main/en/chat_templating
        pipe = pipeline("text-generation", model=model_name, torch_dtype=torch.bfloat16, device_map="auto")
        messages = [
            {"role": "user", "content": f"Translate the following text from {sselected_language} into {tselected_language}.\n{sselected_language}: {input_text}.\n{tselected_language}:"},
        ]
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
        # return_full_text=False drops the echoed prompt so only the new tokens remain
        outputs = pipe(prompt, max_new_tokens=256, do_sample=False, return_full_text=False)
        translated_text = outputs[0]["generated_text"]

    # Display the translated text
    st.write(f"Translated text from {sselected_language} to {tselected_language} using {model_name}:")
    translated_textarea = st.text(translated_text)