Didier committed
Commit b10cb1c · 1 Parent(s): faad283

Add spaces

Files changed (1)
  1. app.py +16 -17
app.py CHANGED
@@ -68,17 +68,6 @@ def build_text_chunks(text, src_lang, sents_per_chunk):
     # Append last chunk
     if chunk:
         chunks.append(chunk)
-
-    # !!! SKIP splitting of text into chunks for now !!!
-    # Might not be reliable for non-European languages.
-    #chunks = [text, ]
-
-    # NOTE: The 'fa' (Persian) model has multiple target languages to choose from.
-    # We need to specifiy the desired languages among: fra ita por ron spa
-    # https://huggingface.co/Helsinki-NLP/opus-mt-tc-big-fa-itc
-    # Prepend text with >>fra<< in order to translate in French.
-    if src_lang == 'fa':
-        chunks = [">>fra<< " + chunk for chunk in chunks]
 
     return chunks
 
@@ -93,6 +82,14 @@ def translate_with_model(
     # Translate chunks
     translated_chunks = []
     for chunk in chunks:
+
+        # NOTE: The 'fa' (Persian) model has multiple target languages to choose from.
+        # We need to specifiy the desired languages among: fra ita por ron spa
+        # https://huggingface.co/Helsinki-NLP/opus-mt-tc-big-fa-itc
+        # Prepend text with >>fra<< in order to translate in French.
+        if src_lang == 'fa':
+            chunk = ">>fra<< " + chunk
+
         inputs = tokenizer(
             chunk, return_tensors="pt",
             max_length=input_max_length,
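
Note on the hunk above: the Helsinki-NLP/opus-mt-tc-big-fa-itc checkpoint serves several Romance target languages, so the desired target is selected by prefixing each source string with a >>xxx<< token. Below is a minimal standalone sketch of that convention, independent of app.py (the MarianMT loading code and the sample Persian string are assumptions for illustration, not taken from this repo):

# Sketch only: illustrates the >>fra<< target-language prefix used in the hunk above.
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-tc-big-fa-itc"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# The checkpoint covers fra/ita/por/ron/spa; a >>xxx<< token on each input
# names the desired target language (here: French).
chunk = "این یک آزمایش است."  # hypothetical Persian input
inputs = tokenizer(">>fra<< " + chunk, return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

Prefixing each chunk inside the translation loop, as the commit now does, keeps build_text_chunks model-agnostic.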
@@ -112,10 +109,12 @@ def translate_with_model(
 
     return '\n'.join(translated_chunks)
 
+
 def detect_language(text):
     lang = langdetect.detect(text)
     return lang
 
+
 def translate_with_bilingual_model(
     text, src_lang, tgt_lang, sents_per_chunk
 ):
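
For reference, detect_language above is a thin wrapper over langdetect, which returns ISO 639-1 codes such as 'en', 'fr', or 'fa'; those are the codes the src_lang checks elsewhere in app.py compare against. A tiny usage sketch (the sample string is hypothetical):

import langdetect

# langdetect.detect returns an ISO 639-1 code, e.g. 'fa' for Persian text.
lang = langdetect.detect("این یک آزمایش است.")
print(lang)  # expected: 'fa'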
@@ -134,7 +133,7 @@ def translate_with_bilingual_model(
     return translated_text_bilingual_model
 
 
-#@spaces.GPU
+@spaces.GPU
 def translate_with_m2m100_model(
     text: str,
     src_lang: str,
@@ -144,18 +143,17 @@ def translate_with_m2m100_model(
     Translate with the m2m100 model
     """
     tokenizer_m2m100.src_lang = src_lang
-    input_ids = tokenizer_m2m100(text, return_tensors="pt").input_ids.to(
-        model_m2m100.device)
+    input_ids = tokenizer_m2m100(
+        text, return_tensors="pt").input_ids.to(model_m2m100.device)
     outputs = model_m2m100.generate(
         input_ids=input_ids,
-        forced_bos_token_id=tokenizer_m2m100.get_lang_id(tgt_lang)
-    )
+        forced_bos_token_id=tokenizer_m2m100.get_lang_id(tgt_lang))
     translated_text = tokenizer_m2m100.batch_decode(
         outputs[0], skip_special_tokens=True)
     return translated_text
 
 
-#@spaces.GPU
+@spaces.GPU
 def translate_with_multilingual_model(
     text: str,
     tgt_lang: str,
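
Context for the re-enabled @spaces.GPU decorators in the two hunks above: on ZeroGPU Spaces, the decorator from the spaces package attaches a GPU only for the duration of the decorated call. A hedged sketch of the end-to-end M2M100 pattern (the checkpoint name and module-level loading are assumptions for illustration; app.py's own setup is outside this diff):

import spaces
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Assumption: models are loaded once at import time, as app.py appears to do
# with tokenizer_m2m100 / model_m2m100; the checkpoint name is illustrative.
tokenizer_m2m100 = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
model_m2m100 = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

@spaces.GPU
def translate(text: str, src_lang: str, tgt_lang: str) -> str:
    # The GPU is attached only while this decorated function executes.
    tokenizer_m2m100.src_lang = src_lang
    input_ids = tokenizer_m2m100(
        text, return_tensors="pt").input_ids.to(model_m2m100.device)
    outputs = model_m2m100.generate(
        input_ids=input_ids,
        forced_bos_token_id=tokenizer_m2m100.get_lang_id(tgt_lang))
    # Decode the whole batch and return the first (only) sequence.
    return tokenizer_m2m100.batch_decode(outputs, skip_special_tokens=True)[0]

Decoding the full generate() output with batch_decode and taking the first sequence yields a single translated string per call.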
@@ -184,6 +182,7 @@ def translate_with_multilingual_model(
 
     return '\n'.join(translated_chunks)
 
+
 def translate_text(
     text: str,
     src_lang: str=None,