Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
"""
|
2 |
-
File:
|
3 |
|
4 |
-
Description:
|
5 |
|
6 |
Author: Didier Guillevic
|
7 |
Date: 2024-09-07
|
@@ -16,8 +16,8 @@ logger = logging.getLogger(__name__)
|
|
16 |
logging.basicConfig(level=logging.INFO)
|
17 |
|
18 |
import model_translation as translation
|
19 |
-
from model_translation import tokenizer_multilingual
|
20 |
-
from model_translation import
|
21 |
|
22 |
from deep_translator import GoogleTranslator
|
23 |
|
@@ -116,7 +116,46 @@ def detect_language(text):
|
|
116 |
lang = langdetect.detect(text)
|
117 |
return lang
|
118 |
|
119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
def translate_with_multilingual_model(
|
121 |
text: str,
|
122 |
tgt_lang: str,
|
@@ -124,7 +163,7 @@ def translate_with_multilingual_model(
|
|
124 |
input_max_length: int=512,
|
125 |
output_max_length: int=512):
|
126 |
"""
|
127 |
-
Translate the
|
128 |
"""
|
129 |
chunks = build_text_chunks(text, None, sents_per_chunk)
|
130 |
translated_chunks = []
|
@@ -139,7 +178,8 @@ def translate_with_multilingual_model(
|
|
139 |
model_multilingual.device)
|
140 |
outputs = model_multilingual.generate(
|
141 |
input_ids=input_ids, max_length=output_max_length)
|
142 |
-
translated_chunk = tokenizer_multilingual.decode(
|
|
|
143 |
translated_chunks.append(translated_chunk)
|
144 |
|
145 |
return '\n'.join(translated_chunks)
|
@@ -153,25 +193,27 @@ def translate_text(
|
|
153 |
"""
|
154 |
Translate the given text into English (default "easy" language)
|
155 |
"""
|
|
|
|
|
|
|
156 |
#
|
157 |
# Bilingual (Helsinki model)
|
158 |
#
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
|
|
|
|
167 |
|
168 |
-
translated_text_bilingual_model = translate_with_model(
|
169 |
-
text, tokenizer, model, src_lang, sents_per_chunk)
|
170 |
-
|
171 |
#
|
172 |
# Multilingual model (Google MADLAD)
|
173 |
#
|
174 |
-
|
175 |
translated_text_multilingual_model = translate_with_multilingual_model(
|
176 |
text, tgt_lang, sents_per_chunk, input_max_length, output_max_length)
|
177 |
|
@@ -183,6 +225,7 @@ def translate_text(
|
|
183 |
|
184 |
return (
|
185 |
translated_text_bilingual_model,
|
|
|
186 |
translated_text_multilingual_model,
|
187 |
translated_text_google_translate
|
188 |
)
|
@@ -207,6 +250,11 @@ with gr.Blocks() as demo:
|
|
207 |
label="Bilingual translation model (Helsinki NLP)",
|
208 |
render=False
|
209 |
)
|
|
|
|
|
|
|
|
|
|
|
210 |
output_text_multilingual_model = gr.Textbox(
|
211 |
lines=6,
|
212 |
label="Multilingual translation model (**small** Google MADLAD)",
|
@@ -250,6 +298,7 @@ with gr.Blocks() as demo:
|
|
250 |
outputs=[
|
251 |
output_text_bilingual_model,
|
252 |
output_text_multilingual_model,
|
|
|
253 |
output_text_google_translate,
|
254 |
],
|
255 |
additional_inputs=[sentences_per_chunk,],
|
|
|
1 |
"""
|
2 |
+
File: app.py
|
3 |
|
4 |
+
Description: Translate text...
|
5 |
|
6 |
Author: Didier Guillevic
|
7 |
Date: 2024-09-07
|
|
|
16 |
logging.basicConfig(level=logging.INFO)
|
17 |
|
18 |
import model_translation as translation
|
19 |
+
from model_translation import tokenizer_multilingual, model_multilingual
|
20 |
+
from model_translation import tokenizer_m2m100, model_m2m100
|
21 |
|
22 |
from deep_translator import GoogleTranslator
|
23 |
|
|
|
116 |
lang = langdetect.detect(text)
|
117 |
return lang
|
118 |
|
119 |
+
def translate_with_bilingual_model(
        text, src_lang, tgt_lang, sents_per_chunk
    ):
    """
    Translate with Helsinki bilingual models

    Looks up the tokenizer / model pair registered for `src_lang` in the
    `translation` module and delegates the actual work to
    `translate_with_model`.

    Args:
        text: the text to translate
        src_lang: source language code; must be in `translation.src_langs`
        tgt_lang: accepted for signature parity with the other translate
            helpers; it is not used inside this function
        sents_per_chunk: forwarded unchanged to `translate_with_model`

    Returns:
        The value returned by `translate_with_model`, or an "ISSUE: ..."
        message string when no model is available for `src_lang`.
    """
    has_model = src_lang in translation.src_langs
    if has_model:
        logger.info(f"LANG: {src_lang}, TEXT: {text[:50]}...")
        tokenizer, model = translation.get_tokenizer_model_for_src_lang(src_lang)
        return translate_with_model(
            text, tokenizer, model, src_lang, sents_per_chunk)

    # No bilingual model for this language: report it as the "translation"
    # instead of raising, so the caller can still display something.
    return (
        f"ISSUE: currently no model for language '{src_lang}'. "
        "If wrong language, please specify language."
    )
|
135 |
+
|
136 |
+
|
137 |
+
#@spaces.GPU
|
138 |
+
def translate_with_m2m100_model(
        text: str,
        src_lang: str,
        tgt_lang: str,
        sents_per_chunk: int=5) -> str:
    """
    Translate with the m2m100 model

    Args:
        text: the text to translate; it is tokenized in one piece
        src_lang: language code assigned to `tokenizer_m2m100.src_lang`
        tgt_lang: language code whose id (`get_lang_id`) is forced as the
            first generated token
        sents_per_chunk: currently unused in this function
            NOTE(review): the text is not split into chunks here, unlike
            the multilingual helper - confirm whether chunking is wanted.

    Returns:
        The translated text as a single string.
    """
    tokenizer_m2m100.src_lang = src_lang
    input_ids = tokenizer_m2m100(text, return_tensors="pt").input_ids.to(
        model_m2m100.device)
    outputs = model_m2m100.generate(
        input_ids=input_ids,
        forced_bos_token_id=tokenizer_m2m100.get_lang_id(tgt_lang)
    )
    # Use decode() on the single generated sequence. batch_decode() on
    # outputs[0] would treat every token id as its own sequence and return
    # a list of token strings instead of the translated text.
    translated_text = tokenizer_m2m100.decode(
        outputs[0], skip_special_tokens=True)
    return translated_text
|
156 |
+
|
157 |
+
|
158 |
+
#@spaces.GPU
|
159 |
def translate_with_multilingual_model(
|
160 |
text: str,
|
161 |
tgt_lang: str,
|
|
|
163 |
input_max_length: int=512,
|
164 |
output_max_length: int=512):
|
165 |
"""
|
166 |
+
Translate the given text into English (default "easy" language)
|
167 |
"""
|
168 |
chunks = build_text_chunks(text, None, sents_per_chunk)
|
169 |
translated_chunks = []
|
|
|
178 |
model_multilingual.device)
|
179 |
outputs = model_multilingual.generate(
|
180 |
input_ids=input_ids, max_length=output_max_length)
|
181 |
+
translated_chunk = tokenizer_multilingual.decode(
|
182 |
+
outputs[0], skip_special_tokens=True)
|
183 |
translated_chunks.append(translated_chunk)
|
184 |
|
185 |
return '\n'.join(translated_chunks)
|
|
|
193 |
"""
|
194 |
Translate the given text into English (default "easy" language)
|
195 |
"""
|
196 |
+
src_lang = src_lang if (src_lang and src_lang != "auto") else detect_language(text)
|
197 |
+
tgt_lang = 'en' # Default "easy" language
|
198 |
+
|
199 |
#
|
200 |
# Bilingual (Helsinki model)
|
201 |
#
|
202 |
+
translated_text_bilingual_model = translate_with_bilingual_model(
|
203 |
+
text, src_lang, tgt_lang, sents_per_chunk
|
204 |
+
)
|
205 |
+
|
206 |
+
#
|
207 |
+
# m2m100 model
|
208 |
+
#
|
209 |
+
translated_text_m2m100_model = translate_with_m2m100_model(
|
210 |
+
text, src_lang, tgt_lang, sents_per_chunk
|
211 |
+
)
|
212 |
|
|
|
|
|
|
|
213 |
#
|
214 |
# Multilingual model (Google MADLAD)
|
215 |
#
|
216 |
+
|
217 |
translated_text_multilingual_model = translate_with_multilingual_model(
|
218 |
text, tgt_lang, sents_per_chunk, input_max_length, output_max_length)
|
219 |
|
|
|
225 |
|
226 |
return (
|
227 |
translated_text_bilingual_model,
|
228 |
+
|
229 |
translated_text_multilingual_model,
|
230 |
translated_text_google_translate
|
231 |
)
|
|
|
250 |
label="Bilingual translation model (Helsinki NLP)",
|
251 |
render=False
|
252 |
)
|
253 |
+
output_text_m2m100_model = gr.Textbox(
|
254 |
+
lines=6,
|
255 |
+
label="Facebook m2m100 translation model (**small**)",
|
256 |
+
render=False
|
257 |
+
)
|
258 |
output_text_multilingual_model = gr.Textbox(
|
259 |
lines=6,
|
260 |
label="Multilingual translation model (**small** Google MADLAD)",
|
|
|
298 |
outputs=[
|
299 |
output_text_bilingual_model,
|
300 |
output_text_multilingual_model,
|
301 |
+
output_text_m2m100_model,
|
302 |
output_text_google_translate,
|
303 |
],
|
304 |
additional_inputs=[sentences_per_chunk,],
|