File size: 5,232 Bytes
931b71f
215212f
a23a2a4
7bc13dc
215212f
 
 
3ddb276
931b71f
 
 
 
30f984e
1f648dc
0cacd2e
59764d5
931b71f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f648dc
931b71f
 
1f648dc
931b71f
 
 
 
 
 
 
cd2dcf6
1f648dc
 
 
 
 
 
215212f
86f6a5a
84325a1
e057a26
0f908c5
1f648dc
931b71f
1f648dc
 
 
 
 
 
e165141
a1af82c
1f648dc
e165141
 
059e62b
1f648dc
 
e165141
 
 
0cacd2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e165141
931b71f
 
059e62b
931b71f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import streamlit as st
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, logging
import torch
import os
import httpx

# Only let transformers report errors; `logging` here is transformers' own
# logging module (see the import above), not the standard library one.
logging.set_verbosity_error()

# ---------------------------------------------------------------------------
# Page layout: title, free-text input, language pickers and model picker.
# Streamlit renders widgets in call order, so the order below is the on-page
# order.
# ---------------------------------------------------------------------------
st.header("Text Machine Translation")
input_text = st.text_input("Enter text to translate:")

# Display names shown in both language pickers. Positions 0 and 1 are the
# default source and target selections respectively.
options = [
    "German",
    "Romanian",
    "English",
    "French",
    "Spanish",
    "Italian",
]

# Display name -> short language code used further down to build checkpoint
# names and translation requests.
langs = {
    "English": "en",
    "Romanian": "ro",
    "German": "de",
    "French": "fr",
    "Spanish": "es",
    "Italian": "it",
}

# Backends offered in the model picker.
models = [
    "Helsinki-NLP",
    "t5-base",
    "t5-small",
    "t5-large",
    "Unbabel/Tower-Plus-2B",
    "Unbabel/TowerInstruct-Mistral-7B-v0.2",
    'Google',
    'Argos',
]

# Source and target pickers sit side by side in two columns.
scol, tcol = st.columns(2)
sselected_language = scol.selectbox(
    "Source language:",
    options,
    index=0,
    placeholder="Select source language",
)
tselected_language = tcol.selectbox(
    "Target language:",
    options,
    index=1,
    placeholder="Select target language",
)
model_name = st.selectbox(
    "Select a model:",
    models,
    index=0,
    placeholder="Select language model",
)

# Mirror the current selections into the session state.
for state_key, state_value in (
    ("sselected_language", sselected_language),
    ("tselected_language", tselected_language),
    ("model_name", model_name),
):
    st.session_state[state_key] = state_value

# Short codes for the chosen languages.
sl = langs[sselected_language]
tl = langs[tselected_language]

# Streamlit re-executes this script on every widget interaction, so the
# loaders below are wrapped in st.cache_resource: each checkpoint is read from
# disk (or downloaded) once per server process instead of once per rerun.
@st.cache_resource
def _load_helsinki_checkpoint(checkpoint):
    """Return ``(tokenizer, model)`` for a Helsinki-NLP seq2seq checkpoint.

    Args:
        checkpoint: Full hub identifier, e.g. ``Helsinki-NLP/opus-mt-de-ro``.

    Raises:
        EnvironmentError: Propagated from ``from_pretrained`` when the
            checkpoint cannot be loaded. Failed calls are not cached.
    """
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )


@st.cache_resource
def _load_t5_checkpoint(checkpoint):
    """Return ``(tokenizer, model)`` for one of the ``t5-*`` checkpoints."""
    return (
        T5Tokenizer.from_pretrained(checkpoint),
        T5ForConditionalGeneration.from_pretrained(checkpoint),
    )


if model_name == 'Helsinki-NLP':
    # Try the opus-mt checkpoint for the language pair first and fall back to
    # the opus-tatoeba one. On success `model_name` becomes the full
    # checkpoint identifier, which the rest of the script relies on.
    for family in ('opus-mt', 'opus-tatoeba'):
        candidate = f"Helsinki-NLP/{family}-{sl}-{tl}"
        try:
            tokenizer, model = _load_helsinki_checkpoint(candidate)
        except EnvironmentError:
            continue
        model_name = candidate
        break
    else:
        # Neither checkpoint could be loaded (for example when source and
        # target language are the same). Report it in the page and end this
        # run instead of surfacing a raw traceback.
        st.error(
            f"No Helsinki-NLP model could be loaded for {sselected_language} "
            f"to {tselected_language}. Choose another language pair or model."
        )
        st.stop()
elif model_name.startswith('t5'):
    tokenizer, model = _load_t5_checkpoint(model_name)

st.write("Selected language combination:", sselected_language, " - ", tselected_language, "Selected model:", model_name)
submit_button = st.button("Translate")
translated_textarea = st.text("")


def _seq2seq_translate(prompt, seq2seq_tokenizer, seq2seq_model):
    """Encode *prompt*, run ``generate`` and decode the first output sequence.

    Shared by the Helsinki-NLP and T5 backends, which differ only in the
    prompt they build.
    """
    print(prompt)
    input_ids = seq2seq_tokenizer.encode(prompt, return_tensors='pt')
    output_ids = seq2seq_model.generate(input_ids)
    return seq2seq_tokenizer.decode(output_ids[0], skip_special_tokens=True)


def _google_translate(text, source_code, target_code):
    """Translate *text* through the HTTP endpoint configured in ``GCLIENT``.

    The ``GCLIENT`` environment variable holds the URL prefix to which the
    ``sl``/``tl``/``q`` query parameters are appended. The parameters are
    percent-encoded so that characters such as ``&``, ``#``, ``+`` or ``%`` in
    the user's text cannot truncate or corrupt the query string.

    Raises:
        KeyError: If ``GCLIENT`` is not set.
        httpx.HTTPStatusError: If the endpoint answers with an error status.
    """
    from urllib.parse import quote, urlencode

    query = urlencode({'sl': source_code, 'tl': target_code, 'q': text}, quote_via=quote)
    response = httpx.get(os.environ['GCLIENT'] + query)
    response.raise_for_status()
    payload = response.json()
    print(payload[0][0])
    # NOTE(review): only the first entry of payload[0] is returned, as before.
    # If the endpoint splits long input into several entries, the remainder is
    # dropped - confirm against the endpoint's response format.
    return payload[0][0][0]


def _tower_translate(text, source_language, target_language, checkpoint):
    """Translate *text* with an Unbabel chat model via a text-generation pipeline.

    The pipeline is built on each call. The generated text is post-processed
    by keeping what follows the first ``<end_of_turn>`` marker (if present)
    and removing a leading ``Answer:`` label.
    """
    pipe = pipeline("text-generation", model=checkpoint, torch_dtype=torch.bfloat16, device_map="auto")
    # The tokenizer's chat template formats the message - see
    # https://huggingface.co/docs/transformers/main/en/chat_templating
    messages = [{"role": "user",
                 "content": f"Translate the following text from {source_language} into {target_language}.\n{source_language}: {text}.\n{target_language}:"}]
    # NOTE(review): add_generation_prompt is kept at False because the marker
    # handling below was written against that setting; the chat-templating
    # guide normally recommends True for generation - confirm.
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    outputs = pipe(prompt, max_new_tokens=256, do_sample=False)
    generated = outputs[0]["generated_text"]
    start_marker = "<end_of_turn>"
    if start_marker in generated:
        generated = generated.split(start_marker)[1].strip()
    if generated.startswith('Answer:'):
        generated = generated.replace('Answer:', '').strip()
    return generated


def _argos_translate(text, from_code, to_code):
    """Translate *text* with Argos Translate, installing the language package.

    Returns:
        The translated text, or ``None`` when the package index offers no
        package for the ``from_code`` -> ``to_code`` pair.
    """
    import argostranslate.package
    import argostranslate.translate

    # Refresh the index and look for a package matching the language pair.
    argostranslate.package.update_package_index()
    package_to_install = next(
        (
            package
            for package in argostranslate.package.get_available_packages()
            if package.from_code == from_code and package.to_code == to_code
        ),
        None,  # a bare next() would raise StopIteration for unsupported pairs
    )
    if package_to_install is None:
        return None
    argostranslate.package.install_from_path(package_to_install.download())
    return argostranslate.translate.translate(text, from_code, to_code)


# Handle the submit button click
if submit_button:
    if not input_text.strip():
        # Nothing to translate; avoid invoking a model or a remote service.
        st.warning("Please enter some text to translate.")
    else:
        translated_text = None
        if model_name.startswith('Helsinki-NLP'):
            translated_text = _seq2seq_translate(input_text, tokenizer, model)
        elif model_name.startswith('Google'):
            translated_text = _google_translate(input_text, sl, tl)
        elif model_name.startswith('t5'):
            translated_text = _seq2seq_translate(
                f'translate {sselected_language} to {tselected_language}: {input_text}',
                tokenizer,
                model,
            )
        elif 'Unbabel' in model_name:
            translated_text = _tower_translate(input_text, sselected_language, tselected_language, model_name)
        elif 'Argos' in model_name:
            translated_text = _argos_translate(input_text, sl, tl)
            if translated_text is None:
                st.error(
                    f"Argos Translate has no package for {sselected_language} "
                    f"to {tselected_language}."
                )

        if translated_text is not None:
            # Display the translated text
            print(translated_text)
            st.write(f"Translated text from {sselected_language} to {tselected_language} using {model_name}:")
            translated_textarea = st.text(translated_text)