sflindrs committed on
Commit
4d90ea3
·
verified ·
1 Parent(s): 1d43c75

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -23
app.py CHANGED
@@ -5,8 +5,7 @@ import torch
5
  import spaces
6
  import json
7
  import re
8
- from langdetect import detect, LangDetectException
9
- from googletrans import Translator
10
 
11
  # Load the processor and model
12
  processor = AutoProcessor.from_pretrained(
@@ -96,52 +95,76 @@ def decode_unicode_sequences(unicode_seq):
96
 
97
  def is_mandarin(text):
98
  """
99
- Detects if the given text is in Mandarin.
100
 
101
  Args:
102
  text (str): The text to check.
103
 
104
  Returns:
105
- bool: True if the text is detected as Mandarin, False otherwise.
106
  """
107
- try:
108
- lang = detect(text)
109
- return lang == 'zh-cn' or lang == 'zh-tw' or lang == 'zh'
110
- except LangDetectException:
111
- return False
112
 
113
- def translate_to_english(text, translator):
114
  """
115
- Translates the given Mandarin text to English.
116
 
117
  Args:
118
  text (str): The Mandarin text to translate.
119
- translator (Translator): An instance of googletrans Translator.
120
 
121
  Returns:
122
  str: The translated English text.
123
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  try:
125
- translation = translator.translate(text, src='zh-cn', dest='en')
126
- return translation.text
127
- except Exception as e:
128
- print(f"Translation error: {e}")
 
 
 
129
  return text # Return the original text if translation fails
130
 
131
- def process_text_for_mandarin_unicode(input_string):
132
  """
133
  Processes the input string to find Unicode escape sequences representing Mandarin words,
134
- translates them to English, and replaces them accordingly.
135
 
136
  Args:
137
  input_string (str): The original string containing Unicode escape sequences.
 
138
 
139
  Returns:
140
  str: The processed string with translations where applicable.
141
  """
142
- # Initialize the translator
143
- translator = Translator()
144
-
145
  # Regular expression to find groups of consecutive \uXXXX sequences
146
  unicode_word_pattern = re.compile(r'(?:\\u[0-9a-fA-F]{4})+')
147
 
@@ -151,7 +174,7 @@ def process_text_for_mandarin_unicode(input_string):
151
  decoded_word = decode_unicode_sequences(unicode_seq)
152
 
153
  if is_mandarin(decoded_word):
154
- translated = translate_to_english(decoded_word, translator)
155
  return f"{translated} ({decoded_word})"
156
  else:
157
  # If not Mandarin, return the original sequence
@@ -183,7 +206,7 @@ def process_image_and_text(image, text):
183
  generated_tokens = output[0, inputs['input_ids'].size(1):]
184
  generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
185
  generated_text_w_json_wrapper = wrap_json_in_markdown(generated_text)
186
- generated_text_w_unicode_mdn = process_text_for_mandarin_unicode(generated_text_w_json_wrapper)
187
 
188
  return generated_text_w_unicode_mdn
189
 
 
5
  import spaces
6
  import json
7
  import re
8
+ import deepl
 
9
 
10
  # Load the processor and model
11
  processor = AutoProcessor.from_pretrained(
 
95
 
96
  def is_mandarin(text):
97
  """
98
+ Detects if the given text is in Mandarin using Unicode ranges.
99
 
100
  Args:
101
  text (str): The text to check.
102
 
103
  Returns:
104
+ bool: True if the text contains Chinese characters, False otherwise.
105
  """
106
+ # Chinese Unicode ranges
107
+ for char in text:
108
+ if '\u4e00' <= char <= '\u9fff':
109
+ return True
110
+ return False
111
 
112
def translate_to_english_deepl(text, api_key):
    """
    Translates Mandarin text to English using the DeepL client library.

    This is a best-effort helper: if the translation cannot be performed,
    the error is printed and the original text is returned unchanged.

    Args:
        text (str): The Mandarin text to translate.
        api_key (str): Your DeepL API authentication key.

    Returns:
        str: The translated English (EN-US) text, or the original text if
        the input is empty or the translation fails.
    """
    # Nothing to translate; skip the API call entirely.
    if not text:
        return text

    try:
        translator = deepl.Translator(api_key)
        result = translator.translate_text(text, source_lang="ZH", target_lang="EN-US")
        return result.text
    except (deepl.DeepLException, ValueError) as e:
        # The deepl client reports API/connection failures as DeepLException
        # and rejects invalid arguments (e.g. an empty key) with ValueError.
        print(f"DeepL Translation error: {e}")
        return text  # Return the original text if translation fails
155
 
156
+ def process_text_deepl(input_string, api_key):
157
  """
158
  Processes the input string to find Unicode escape sequences representing Mandarin words,
159
+ translates them to English using DeepL, and replaces them accordingly.
160
 
161
  Args:
162
  input_string (str): The original string containing Unicode escape sequences.
163
+ api_key (str): Your DeepL API authentication key.
164
 
165
  Returns:
166
  str: The processed string with translations where applicable.
167
  """
 
 
 
168
  # Regular expression to find groups of consecutive \uXXXX sequences
169
  unicode_word_pattern = re.compile(r'(?:\\u[0-9a-fA-F]{4})+')
170
 
 
174
  decoded_word = decode_unicode_sequences(unicode_seq)
175
 
176
  if is_mandarin(decoded_word):
177
+ translated = translate_to_english_deepl(decoded_word, api_key)
178
  return f"{translated} ({decoded_word})"
179
  else:
180
  # If not Mandarin, return the original sequence
 
206
  generated_tokens = output[0, inputs['input_ids'].size(1):]
207
  generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
208
  generated_text_w_json_wrapper = wrap_json_in_markdown(generated_text)
209
+ generated_text_w_unicode_mdn = process_text_deepl(generated_text_w_json_wrapper, "a5b1749b-7112-4c2d-81a3-33ea18478bb4:fx")
210
 
211
  return generated_text_w_unicode_mdn
212