Update app.py
app.py CHANGED
@@ -5,8 +5,7 @@ import torch
 import spaces
 import json
 import re
-
-from googletrans import Translator
+import deepl

 # Load the processor and model
 processor = AutoProcessor.from_pretrained(
@@ -96,52 +95,76 @@ def decode_unicode_sequences(unicode_seq):

 def is_mandarin(text):
     """
-    Detects if the given text is in Mandarin.
+    Detects if the given text is in Mandarin using Unicode ranges.

     Args:
         text (str): The text to check.

     Returns:
-        bool: True if the text …
+        bool: True if the text contains Chinese characters, False otherwise.
     """
-    …
-    …
-    …
-    …
-    …
+    # Chinese Unicode ranges
+    for char in text:
+        if '\u4e00' <= char <= '\u9fff':
+            return True
+    return False

-def …
+def translate_to_english_deepl(text, api_key):
     """
-    Translates …
+    Translates Mandarin text to English using DeepL API.

     Args:
         text (str): The Mandarin text to translate.
-        …
+        api_key (str): Your DeepL API authentication key.

     Returns:
         str: The translated English text.
     """
+    url = "https://api.deepl.com/v2/translate"
+    params = {
+        "auth_key": api_key,
+        "text": text,
+        "source_lang": "ZH",
+        "target_lang": "EN"
+    }
+
+    # try:
+    #     response = requests.post(url, data=params)
+    #     response.raise_for_status()
+    #     result = response.json()
+    #     return result['translations'][0]['text']
+    # except requests.exceptions.RequestException as e:
+    #     print(f"DeepL Translation error: {e}")
+    #     return text  # Return the original text if translation fails
+
+    # auth_key = api_key  # Replace with your key
+    # translator = deepl.Translator(auth_key)
+
+    # result = translator.translate_text("Hello, world!", target_lang="FR")
+    # print(result.text)  # "Bonjour, le monde !"
+
     try:
-        …
-        …
-        …
-        print(…)
+        auth_key = api_key  # Replace with your key
+        translator = deepl.Translator(auth_key)
+        result = translator.translate_text(text, source_lang="ZH", target_lang="EN-US")
+        # print(result.text)
+        return result.text
+    except requests.exceptions.RequestException as e:
+        print(f"DeepL Translation error: {e}")
         return text  # Return the original text if translation fails

-def process_text_for_mandarin_unicode(input_string):
+def process_text_deepl(input_string, api_key):
     """
     Processes the input string to find Unicode escape sequences representing Mandarin words,
-    translates them to English, and replaces them accordingly.
+    translates them to English using DeepL, and replaces them accordingly.

     Args:
         input_string (str): The original string containing Unicode escape sequences.
+        api_key (str): Your DeepL API authentication key.

     Returns:
         str: The processed string with translations where applicable.
     """
-    # Initialize the translator
-    translator = Translator()
-
     # Regular expression to find groups of consecutive \uXXXX sequences
     unicode_word_pattern = re.compile(r'(?:\\u[0-9a-fA-F]{4})+')

@@ -151,7 +174,7 @@ def process_text_for_mandarin_unicode(input_string):
         decoded_word = decode_unicode_sequences(unicode_seq)

         if is_mandarin(decoded_word):
-            translated = …
+            translated = translate_to_english_deepl(decoded_word, api_key)
             return f"{translated} ({decoded_word})"
         else:
             # If not Mandarin, return the original sequence
@@ -183,7 +206,7 @@ def process_image_and_text(image, text):
     generated_tokens = output[0, inputs['input_ids'].size(1):]
     generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
     generated_text_w_json_wrapper = wrap_json_in_markdown(generated_text)
-    generated_text_w_unicode_mdn = process_text_for_mandarin_unicode(generated_text_w_json_wrapper)
+    generated_text_w_unicode_mdn = process_text_deepl(generated_text_w_json_wrapper, "a5b1749b-7112-4c2d-81a3-33ea18478bb4:fx")

     return generated_text_w_unicode_mdn

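The new is_mandarin scans only U+4E00 through U+9FFF, the base CJK Unified Ideographs block, so rarer hanzi from the extension blocks would slip through. A minimal sketch of a wider check; the names below are illustrative and not part of the commit:

# Hypothetical wider variant of the committed is_mandarin check,
# which tests only U+4E00..U+9FFF.
CJK_RANGES = [
    ('\u3400', '\u4dbf'),  # CJK Unified Ideographs Extension A
    ('\u4e00', '\u9fff'),  # CJK Unified Ideographs (the committed range)
    ('\uf900', '\ufaff'),  # CJK Compatibility Ideographs
]

def is_mandarin_extended(text):
    return any(lo <= ch <= hi for ch in text for lo, hi in CJK_RANGES)

Note that these ranges detect Chinese characters generally, so Japanese kanji in the model output would also match.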
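One caveat in the new try/except: this hunk does not import requests after the switch away from googletrans, so unless requests is imported earlier in app.py the except clause itself would raise NameError when a call fails. The official deepl client also raises its own exception types, so DeepL auth or quota errors would not be caught by requests.exceptions.RequestException in any case, and the url/params dict at the top of the function is dead code left over from the commented-out requests approach. A minimal sketch of the same helper catching the client's base exception:

import deepl

def translate_to_english_deepl(text, api_key):
    try:
        translator = deepl.Translator(api_key)
        result = translator.translate_text(text, source_lang="ZH", target_lang="EN-US")
        return result.text
    except deepl.DeepLException as e:
        # Base exception of the official deepl client; covers auth,
        # quota and connection failures
        print(f"DeepL Translation error: {e}")
        return text  # Return the original text if translation fails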
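For reference, the pattern (?:\\u[0-9a-fA-F]{4})+ matches runs of literal \uXXXX escapes as they appear in the model's JSON output. A small self-contained demo; the decode helper below is an assumption about what decode_unicode_sequences does, inferred from its name and how it is used:

import re

unicode_word_pattern = re.compile(r'(?:\\u[0-9a-fA-F]{4})+')

def decode_unicode_sequences(unicode_seq):
    # Assumed behaviour: turn literal \uXXXX escapes into the characters they name
    return unicode_seq.encode('ascii').decode('unicode_escape')

sample = '{"city": "\\u5317\\u4eac"}'  # contains the literal text \u5317\u4eac
for m in unicode_word_pattern.finditer(sample):
    print(decode_unicode_sequences(m.group()))  # -> 北京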
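Finally, the commit passes the DeepL auth key as a string literal in process_image_and_text. The :fx suffix marks it as a free-tier key, which the official deepl client routes to the free endpoint automatically, but since the Space's source is public the key is exposed and should be revoked and read from configuration instead. A minimal sketch, assuming a DEEPL_AUTH_KEY environment variable (an illustrative name; on Spaces it would be set as a repository secret):

import os

# Hypothetical: read the key from the environment instead of hardcoding it
deepl_key = os.environ.get("DEEPL_AUTH_KEY", "")
generated_text_w_unicode_mdn = process_text_deepl(generated_text_w_json_wrapper, deepl_key)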