Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -5,8 +5,7 @@ import torch
|
|
5 |
import spaces
|
6 |
import json
|
7 |
import re
|
8 |
-
|
9 |
-
from googletrans import Translator
|
10 |
|
11 |
# Load the processor and model
|
12 |
processor = AutoProcessor.from_pretrained(
|
@@ -96,52 +95,76 @@ def decode_unicode_sequences(unicode_seq):
|
|
96 |
|
97 |
def is_mandarin(text):
|
98 |
"""
|
99 |
-
Detects if the given text is in Mandarin.
|
100 |
|
101 |
Args:
|
102 |
text (str): The text to check.
|
103 |
|
104 |
Returns:
|
105 |
-
bool: True if the text
|
106 |
"""
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
|
113 |
-
def
|
114 |
"""
|
115 |
-
Translates
|
116 |
|
117 |
Args:
|
118 |
text (str): The Mandarin text to translate.
|
119 |
-
|
120 |
|
121 |
Returns:
|
122 |
str: The translated English text.
|
123 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
try:
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
print(
|
|
|
|
|
|
|
129 |
return text # Return the original text if translation fails
|
130 |
|
131 |
-
def
|
132 |
"""
|
133 |
Processes the input string to find Unicode escape sequences representing Mandarin words,
|
134 |
-
translates them to English, and replaces them accordingly.
|
135 |
|
136 |
Args:
|
137 |
input_string (str): The original string containing Unicode escape sequences.
|
|
|
138 |
|
139 |
Returns:
|
140 |
str: The processed string with translations where applicable.
|
141 |
"""
|
142 |
-
# Initialize the translator
|
143 |
-
translator = Translator()
|
144 |
-
|
145 |
# Regular expression to find groups of consecutive \uXXXX sequences
|
146 |
unicode_word_pattern = re.compile(r'(?:\\u[0-9a-fA-F]{4})+')
|
147 |
|
@@ -151,7 +174,7 @@ def process_text_for_mandarin_unicode(input_string):
|
|
151 |
decoded_word = decode_unicode_sequences(unicode_seq)
|
152 |
|
153 |
if is_mandarin(decoded_word):
|
154 |
-
translated =
|
155 |
return f"{translated} ({decoded_word})"
|
156 |
else:
|
157 |
# If not Mandarin, return the original sequence
|
@@ -183,7 +206,7 @@ def process_image_and_text(image, text):
|
|
183 |
generated_tokens = output[0, inputs['input_ids'].size(1):]
|
184 |
generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
|
185 |
generated_text_w_json_wrapper = wrap_json_in_markdown(generated_text)
|
186 |
-
generated_text_w_unicode_mdn =
|
187 |
|
188 |
return generated_text_w_unicode_mdn
|
189 |
|
|
|
5 |
import spaces
|
6 |
import json
|
7 |
import re
|
8 |
+
import deepl
|
|
|
9 |
|
10 |
# Load the processor and model
|
11 |
processor = AutoProcessor.from_pretrained(
|
|
|
95 |
|
96 |
def is_mandarin(text):
    """
    Detects if the given text contains Chinese (Han) characters using Unicode ranges.

    The check is purely code-point based: it reports whether any character
    falls inside a Han ideograph block. Han ideographs are shared with other
    writing systems, so a True result means "contains Han characters", not
    proof that the text is Mandarin.

    Args:
        text (str): The text to check.

    Returns:
        bool: True if the text contains Chinese characters, False otherwise
        (including for an empty string).
    """
    # Inclusive code-point ranges of the Han ideograph blocks.
    han_ranges = (
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x20000, 0x2FA1F),  # Supplementary ideographs (Extension B onward)
    )
    return any(
        low <= ord(char) <= high
        for char in text
        for low, high in han_ranges
    )
|
111 |
|
112 |
+
def translate_to_english_deepl(text, api_key):
    """
    Translates Mandarin text to English using the DeepL API.

    Translation is best-effort: if the DeepL client reports an error, the
    error is printed and the original text is returned unchanged so the
    caller can still produce output.

    Args:
        text (str): The Mandarin text to translate.
        api_key (str): Your DeepL API authentication key.

    Returns:
        str: The translated English text, or the original text if it is
        empty or the translation fails.
    """
    if not text:
        # Nothing to translate; skip the API call entirely.
        return text

    try:
        translator = deepl.Translator(api_key)
        result = translator.translate_text(text, source_lang="ZH", target_lang="EN-US")
        return result.text
    except deepl.DeepLException as e:
        # The deepl client raises DeepLException subclasses for its failures,
        # so this is the exception type that triggers the fallback.
        print(f"DeepL Translation error: {e}")
        return text  # Return the original text if translation fails
|
155 |
|
156 |
+
def process_text_deepl(input_string, api_key):
|
157 |
"""
|
158 |
Processes the input string to find Unicode escape sequences representing Mandarin words,
|
159 |
+
translates them to English using DeepL, and replaces them accordingly.
|
160 |
|
161 |
Args:
|
162 |
input_string (str): The original string containing Unicode escape sequences.
|
163 |
+
api_key (str): Your DeepL API authentication key.
|
164 |
|
165 |
Returns:
|
166 |
str: The processed string with translations where applicable.
|
167 |
"""
|
|
|
|
|
|
|
168 |
# Regular expression to find groups of consecutive \uXXXX sequences
|
169 |
unicode_word_pattern = re.compile(r'(?:\\u[0-9a-fA-F]{4})+')
|
170 |
|
|
|
174 |
decoded_word = decode_unicode_sequences(unicode_seq)
|
175 |
|
176 |
if is_mandarin(decoded_word):
|
177 |
+
translated = translate_to_english_deepl(decoded_word, api_key)
|
178 |
return f"{translated} ({decoded_word})"
|
179 |
else:
|
180 |
# If not Mandarin, return the original sequence
|
|
|
206 |
generated_tokens = output[0, inputs['input_ids'].size(1):]
|
207 |
generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
|
208 |
generated_text_w_json_wrapper = wrap_json_in_markdown(generated_text)
|
209 |
+
generated_text_w_unicode_mdn = process_text_deepl(generated_text_w_json_wrapper, "a5b1749b-7112-4c2d-81a3-33ea18478bb4:fx")
|
210 |
|
211 |
return generated_text_w_unicode_mdn
|
212 |
|