Didier committed on
Commit
c153533
·
verified ·
1 Parent(s): fa896dc

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -19
app.py CHANGED
@@ -1,7 +1,7 @@
1
  """
2
- File: module_translation_MADLAD.py
3
 
4
- Description: Module to translate between 400 languages.
5
 
6
  Author: Didier Guillevic
7
  Date: 2024-09-07
@@ -16,8 +16,8 @@ logger = logging.getLogger(__name__)
16
  logging.basicConfig(level=logging.INFO)
17
 
18
  import model_translation as translation
19
- from model_translation import tokenizer_multilingual
20
- from model_translation import model_multilingual
21
 
22
  from deep_translator import GoogleTranslator
23
 
@@ -116,7 +116,46 @@ def detect_language(text):
116
  lang = langdetect.detect(text)
117
  return lang
118
 
119
- @spaces.GPU
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  def translate_with_multilingual_model(
121
  text: str,
122
  tgt_lang: str,
@@ -124,7 +163,7 @@ def translate_with_multilingual_model(
124
  input_max_length: int=512,
125
  output_max_length: int=512):
126
  """
127
- Translate the givent text into English (default "easy" language)
128
  """
129
  chunks = build_text_chunks(text, None, sents_per_chunk)
130
  translated_chunks = []
@@ -139,7 +178,8 @@ def translate_with_multilingual_model(
139
  model_multilingual.device)
140
  outputs = model_multilingual.generate(
141
  input_ids=input_ids, max_length=output_max_length)
142
- translated_chunk = tokenizer_multilingual.decode(outputs[0], skip_special_tokens=True)
 
143
  translated_chunks.append(translated_chunk)
144
 
145
  return '\n'.join(translated_chunks)
@@ -153,25 +193,27 @@ def translate_text(
153
  """
154
  Translate the given text into English (default "easy" language)
155
  """
 
 
 
156
  #
157
  # Bilingual (Helsinki model)
158
  #
159
- src_lang = src_lang if (src_lang and src_lang != "auto") else detect_language(text)
160
- if src_lang not in translation.src_langs:
161
- return (
162
- f"ISSUE: currently no model for language '{src_lang}'. "
163
- "If wrong language, please specify language."
164
- )
165
- logger.info(f"LANG: {src_lang}, TEXT: {text[:50]}...")
166
- tokenizer, model = translation.get_tokenizer_model_for_src_lang(src_lang)
 
 
167
 
168
- translated_text_bilingual_model = translate_with_model(
169
- text, tokenizer, model, src_lang, sents_per_chunk)
170
-
171
  #
172
  # Multilingual model (Google MADLAD)
173
  #
174
- tgt_lang = 'en' # Default "easy" language
175
  translated_text_multilingual_model = translate_with_multilingual_model(
176
  text, tgt_lang, sents_per_chunk, input_max_length, output_max_length)
177
 
@@ -183,6 +225,7 @@ def translate_text(
183
 
184
  return (
185
  translated_text_bilingual_model,
 
186
  translated_text_multilingual_model,
187
  translated_text_google_translate
188
  )
@@ -207,6 +250,11 @@ with gr.Blocks() as demo:
207
  label="Bilingual translation model (Helsinki NLP)",
208
  render=False
209
  )
 
 
 
 
 
210
  output_text_multilingual_model = gr.Textbox(
211
  lines=6,
212
  label="Multilingual translation model (**small** Google MADLAD)",
@@ -250,6 +298,7 @@ with gr.Blocks() as demo:
250
  outputs=[
251
  output_text_bilingual_model,
252
  output_text_multilingual_model,
 
253
  output_text_google_translate,
254
  ],
255
  additional_inputs=[sentences_per_chunk,],
 
1
  """
2
+ File: app.py
3
 
4
+ Description: Translate text...
5
 
6
  Author: Didier Guillevic
7
  Date: 2024-09-07
 
16
  logging.basicConfig(level=logging.INFO)
17
 
18
  import model_translation as translation
19
+ from model_translation import tokenizer_multilingual, model_multilingual
20
+ from model_translation import tokenizer_m2m100, model_m2m100
21
 
22
  from deep_translator import GoogleTranslator
23
 
 
116
  lang = langdetect.detect(text)
117
  return lang
118
 
119
+ def translate_with_bilingual_model(
120
+ text, src_lang, tgt_lang, sents_per_chunk
121
+ ):
122
+ """
123
+ Translate with Helsinki bilingual models
124
+ """
125
+ if src_lang not in translation.src_langs:
126
+ return (
127
+ f"ISSUE: currently no model for language '{src_lang}'. "
128
+ "If wrong language, please specify language."
129
+ )
130
+ logger.info(f"LANG: {src_lang}, TEXT: {text[:50]}...")
131
+ tokenizer, model = translation.get_tokenizer_model_for_src_lang(src_lang)
132
+ translated_text_bilingual_model = translate_with_model(
133
+ text, tokenizer, model, src_lang, sents_per_chunk)
134
+ return translated_text_bilingual_model
135
+
136
+
137
+ #@spaces.GPU
138
+ def translate_with_m2m100_model(
139
+ text: str,
140
+ src_lang: str,
141
+ tgt_lang: str,
142
+ sents_per_chunk: int=5):
143
+ """
144
+ Translate with the m2m100 model
145
+ """
146
+ tokenizer_m2m100.src_lang = src_lang
147
+ input_ids = tokenizer_m2m100(text, return_tensors="pt").input_ids.to(
148
+ model_m2m100.device)
149
+ outputs = model_m2m100.generate(
150
+ input_ids=input_ids,
151
+ forced_bos_token_id=tokenizer_m2m100.get_lang_id(tgt_lang)
152
+ )
153
+ translated_text = tokenizer_m2m100.batch_decode(
154
+ outputs[0], skip_special_tokens=True)
155
+ return translated_text
156
+
157
+
158
+ #@spaces.GPU
159
  def translate_with_multilingual_model(
160
  text: str,
161
  tgt_lang: str,
 
163
  input_max_length: int=512,
164
  output_max_length: int=512):
165
  """
166
+ Translate the given text into English (default "easy" language)
167
  """
168
  chunks = build_text_chunks(text, None, sents_per_chunk)
169
  translated_chunks = []
 
178
  model_multilingual.device)
179
  outputs = model_multilingual.generate(
180
  input_ids=input_ids, max_length=output_max_length)
181
+ translated_chunk = tokenizer_multilingual.decode(
182
+ outputs[0], skip_special_tokens=True)
183
  translated_chunks.append(translated_chunk)
184
 
185
  return '\n'.join(translated_chunks)
 
193
  """
194
  Translate the given text into English (default "easy" language)
195
  """
196
+ src_lang = src_lang if (src_lang and src_lang != "auto") else detect_language(text)
197
+ tgt_lang = 'en' # Default "easy" language
198
+
199
  #
200
  # Bilingual (Helsinki model)
201
  #
202
+ translated_text_bilingual_model = translate_with_bilingual_model(
203
+ text, src_lang, tgt_lang, sents_per_chunk
204
+ )
205
+
206
+ #
207
+ # m2m100 model
208
+ #
209
+ translated_text_m2m100_model = translate_with_m2m100_model(
210
+ text, src_lang, tgt_lang, sents_per_chunk
211
+ )
212
 
 
 
 
213
  #
214
  # Multilingual model (Google MADLAD)
215
  #
216
+
217
  translated_text_multilingual_model = translate_with_multilingual_model(
218
  text, tgt_lang, sents_per_chunk, input_max_length, output_max_length)
219
 
 
225
 
226
  return (
227
  translated_text_bilingual_model,
228
+
229
  translated_text_multilingual_model,
230
  translated_text_google_translate
231
  )
 
250
  label="Bilingual translation model (Helsinki NLP)",
251
  render=False
252
  )
253
+ output_text_m2m100_model = gr.Textbox(
254
+ lines=6,
255
+ label="Facebook m2m100 translation model (**small**)",
256
+ render=False
257
+ )
258
  output_text_multilingual_model = gr.Textbox(
259
  lines=6,
260
  label="Multilingual translation model (**small** Google MADLAD)",
 
298
  outputs=[
299
  output_text_bilingual_model,
300
  output_text_multilingual_model,
301
+ output_text_m2m100_model,
302
  output_text_google_translate,
303
  ],
304
  additional_inputs=[sentences_per_chunk,],