File size: 7,327 Bytes
56f497c
70ebea3
a103fac
9b5378d
cae0132
56f497c
cae0132
4596a5d
8b76c73
4894c8f
2da2e03
 
e647eeb
 
282f817
 
a770d4a
9b5378d
a770d4a
9b5378d
 
e13fba3
310e819
 
 
 
 
 
 
 
0b61062
8010198
 
 
 
9b5378d
4b5e076
9b5378d
e647eeb
8010198
 
 
 
 
 
 
 
9b5378d
70ebea3
56f497c
4596a5d
 
0b61062
ad4a802
56f497c
 
8b76c73
 
 
56f497c
8b76c73
 
 
 
e13fba3
8b76c73
ad4a802
395e0a2
ad4a802
9b5378d
8010198
 
 
 
310e819
6dc437d
d2894fa
 
 
 
cae0132
44aa6cb
d2894fa
 
cae0132
d2894fa
310e819
d2894fa
a770d4a
 
 
 
 
 
 
 
 
d2894fa
56f497c
bf6322b
56f497c
 
 
 
 
 
72e8644
2045817
72e8644
60b55c4
4894c8f
4fd4915
b98b60d
56f497c
2169b22
 
 
 
56f497c
 
e647eeb
56f497c
 
e647eeb
2169b22
56f497c
7b79ed9
 
2169b22
28c6232
56f497c
ba3b9f6
56f497c
 
8b76c73
 
56f497c
 
 
 
b98b60d
56f497c
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import gradio as gr
import spaces
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, pipeline
import languagecodes

# Languages pinned to the top of the dropdowns. The "-----" entry looks like a
# visual separator between the favourites and the full list (NOTE(review): confirm).
favourite_langs = {"German": "de", "Romanian": "ro", "English": "en", "-----": "-----"}
# Full language table; translate_text indexes it by language name to get the code.
all_langs = languagecodes.iso_languages

# Dropdown choices: favourite names first, followed by every name in the full table.
options = [*favourite_langs.keys(), *all_langs.keys()]

# Selectable models. "Helsinki-NLP" is a family name that translate_text expands
# into a concrete checkpoint for the chosen language pair.
models = [
    "Helsinki-NLP",
    "t5-base",
    "t5-small",
    "t5-large",
    "facebook/nllb-200-distilled-600M",
    "facebook/nllb-200-distilled-1.3B",
    "facebook/mbart-large-50-many-to-many-mmt",
    "utter-project/EuroLLM-1.7B",
    "Unbabel/TowerInstruct-7B-v0.2",
    "Unbabel/TowerInstruct-Mistral-7B-v0.2",
]

def model_to_cuda(model):
    """Return ``model`` placed on the GPU when CUDA is usable, else unchanged.

    Prints a one-line notice saying which device is being used.
    """
    if not torch.cuda.is_available():
        print("CUDA not available! Using CPU.")
        return model

    model = model.to('cuda')
    print("CUDA is available! Using GPU.")
    return model

def eurollm(model_name, sl, tl, input_text):
    """Translate ``input_text`` by prompting a causal language model.

    The prompt has the form ``"<sl>: <input_text> <tl>:"``; ``sl`` and ``tl``
    are inserted verbatim as labels. The model's continuation is decoded and
    everything after the last ``"<tl>:"`` marker is returned, stripped of
    surrounding whitespace.

    Args:
        model_name: Checkpoint passed to ``from_pretrained``.
        sl: Source-language label used in the prompt.
        tl: Target-language label used in the prompt.
        input_text: Text to translate.

    Returns:
        The text generated after the target-language marker.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    lm = AutoModelForCausalLM.from_pretrained(model_name)

    target_marker = f'{tl}:'
    encoded = tok(f"{sl}: {input_text} {target_marker}", return_tensors="pt")
    generated = lm.generate(**encoded, max_new_tokens=512)
    decoded = tok.decode(generated[0], skip_special_tokens=True)

    # The decoded sequence still contains the prompt; keep only what follows
    # the final target-language marker.
    return decoded.rpartition(target_marker)[-1].strip()

def nllb(model_name, sl, tl, input_text):
    """Translate ``input_text`` through a ``translation`` pipeline.

    Args:
        model_name: Seq2seq checkpoint passed to ``from_pretrained``.
        sl: Source-language code handed to the tokenizer and the pipeline.
        tl: Target-language code handed to the pipeline.
        input_text: Text to translate.

    Returns:
        The ``translation_text`` field of the first pipeline result.
    """
    nllb_tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=sl)
    nllb_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
    translate = pipeline(
        'translation',
        model=nllb_model,
        tokenizer=nllb_tokenizer,
        src_lang=sl,
        tgt_lang=tl,
    )
    results = translate(input_text, max_length=512)
    return results[0]['translation_text']

@spaces.GPU
def translate_text(input_text, sselected_language, tselected_language, model_name):
    """Translate ``input_text`` with the selected model.

    Args:
        input_text: Text to translate.
        sselected_language: Source language name; must be a key of ``all_langs``.
        tselected_language: Target language name; must be a key of ``all_langs``.
        model_name: One of the entries in ``models``. ``"Helsinki-NLP"`` is
            expanded to ``Helsinki-NLP/opus-mt-<sl>-<tl>`` and, if that
            checkpoint cannot be loaded, ``Helsinki-NLP/opus-tatoeba-<sl>-<tl>``.

    Returns:
        A ``(translated_text, message_text)`` pair of strings. On failure the
        first element holds an error description and the second the details;
        no exception is raised for invalid selections or missing checkpoints.
    """
    # Nothing to do for blank input; avoid loading a model for it.
    if not input_text or not input_text.strip():
        return "", "Nothing to translate: the input text is empty."

    # The dropdowns can hold values with no entry in the language table
    # (for example a separator row or a cleared selection); report that
    # instead of raising KeyError.
    invalid = [str(name) for name in (sselected_language, tselected_language) if name not in all_langs]
    if invalid:
        return ("Error: please select a valid source and target language.",
                f"Unknown language selection: {', '.join(invalid)}")

    sl = all_langs[sselected_language]
    tl = all_langs[tselected_language]
    message_text = f'Translated from {sselected_language} to {tselected_language} with {model_name}'
    print(message_text)

    tokenizer = None
    model = None
    if model_name == "Helsinki-NLP":
        # Try the regular OPUS-MT checkpoint first, then the Tatoeba variant.
        last_error = None
        for family in ("opus-mt", "opus-tatoeba"):
            model_name = f"Helsinki-NLP/{family}-{sl}-{tl}"
            try:
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = model_to_cuda(AutoModelForSeq2SeqLM.from_pretrained(model_name))
                break
            except EnvironmentError as error:
                last_error = error
        else:
            # Neither checkpoint could be loaded; return text, not the exception object.
            return f"Error finding model: {model_name}! Try other available language combination.", str(last_error)

    if 'eurollm' in model_name.lower():
        translated_text = eurollm(model_name, sselected_language, tselected_language, input_text)
        return translated_text, message_text

    if 'nllb' in model_name.lower():
        try:
            nllb_sl = languagecodes.nllb_language_codes[sselected_language]
            nllb_tl = languagecodes.nllb_language_codes[tselected_language]
        except KeyError as error:
            return (f"Error: language {error} is not available for {model_name}! Try other available language combination.",
                    f"Unsupported language for {model_name}: {error}")
        translated_text = nllb(model_name, nllb_sl, nllb_tl, input_text)
        return translated_text, message_text

    if model_name.startswith('facebook/mbart-large'):
        # Resolve the language codes before loading the model so an
        # unsupported language fails fast with a readable message.
        try:
            mbart_sl = languagecodes.mbart_large_languages[sselected_language]
            mbart_tl = languagecodes.mbart_large_languages[tselected_language]
        except KeyError as error:
            return (f"Error: language {error} is not available for {model_name}! Try other available language combination.",
                    f"Unsupported language for {model_name}: {error}")
        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
        model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        # translate source to target
        tokenizer.src_lang = mbart_sl
        encoded = tokenizer(input_text, return_tensors="pt")
        generated_tokens = model.generate(
            **encoded,
            forced_bos_token_id=tokenizer.lang_code_to_id[mbart_tl]
        )
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0], message_text

    if 'Unbabel' in model_name:
        pipe = pipeline("text-generation", model=model_name, torch_dtype=torch.bfloat16, device_map="auto")
        messages = [{"role": "user",
                     "content": f"Translate the following text from {sselected_language} into {tselected_language}.\n{sselected_language}: {input_text}.\n{tselected_language}:"}]
        # The generation prompt opens the assistant turn so the model answers
        # instead of continuing the user message.
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # return_full_text=False drops the echoed prompt from the result.
        outputs = pipe(prompt, max_new_tokens=256, do_sample=False, return_full_text=False)
        translated_text = outputs[0]["generated_text"].strip()
        return translated_text, message_text

    if model_name.startswith('t5'):
        tokenizer = T5Tokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")
        # T5 checkpoints are steered by a task prefix.
        prompt = f"translate {sselected_language} to {tselected_language}: {input_text}"
    elif model_name.startswith("Helsinki-NLP"):
        # The Helsinki checkpoint is specific to the language pair; no prefix needed.
        prompt = input_text
    else:
        # No branch loaded a model for this name.
        return f"Error: unsupported model {model_name}!", f"No translation backend is configured for {model_name}"

    # The model may sit on the GPU; the inputs must be on the same device.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(input_ids, max_length=512)
    translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Rebuild the message so it names the resolved checkpoint.
    message_text = f'Translated from {sselected_language} to {tselected_language} with {model_name}'
    print(f'Translating from {sselected_language} to {tselected_language} with {model_name}:', f'{input_text} =  {translated_text}', sep='\n')
    return translated_text, message_text

def swap_languages(src_lang, tgt_lang):
    """Return the two dropdown values in reversed order: ``(tgt_lang, src_lang)``."""
    swapped = (tgt_lang, src_lang)
    return swapped

def create_interface():
    """Assemble the translator's Gradio Blocks UI and return it without launching.

    Layout, top to bottom: a heading, the input textbox, a row with the source
    and target language dropdowns plus a swap button, the model dropdown, the
    translate button, and two read-only textboxes for the translation and for
    status messages.
    """
    with gr.Blocks() as demo:
        gr.Markdown("### Machine Text Translation")

        with gr.Row():
            text_in = gr.Textbox(
                label="Enter text to translate:",
                placeholder="Type your text here, maximum 512 tokens",
            )

        with gr.Row():
            source_dd = gr.Dropdown(choices=options, value=options[0], label="Source language", interactive=True)
            target_dd = gr.Dropdown(choices=options, value=options[1], label="Target language", interactive=True)
            swap_btn = gr.Button("Swap Languages")
            # Clicking swaps the two dropdown values in place.
            swap_btn.click(
                fn=swap_languages,
                inputs=[source_dd, target_dd],
                outputs=[source_dd, target_dd],
            )

        model_dd = gr.Dropdown(choices=models, label="Select a model", value=models[4], interactive=True)
        translate_btn = gr.Button("Translate")

        text_out = gr.Textbox(
            label="Translated text:",
            placeholder="Display field for translation",
            interactive=False,
            show_copy_button=True,
        )
        status_out = gr.Textbox(
            label="Messages:",
            placeholder="Display field for status and error messages",
            interactive=False,
        )

        # translate_text returns (translation, message), matching the two outputs.
        translate_btn.click(
            translate_text,
            inputs=[text_in, source_dd, target_dd, model_dd],
            outputs=[text_out, status_out],
        )

    return demo

# Build the UI at import time and start serving it; `interface` stays available
# as a module-level name.
interface = create_interface()
interface.launch()