Spaces:
Runtime error
Runtime error
Fix text clean.py
Browse files
- app.py +3 -0
- fish_speech/text/clean.py +9 -47
app.py
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
|
|
|
| 2 |
import queue
|
| 3 |
from huggingface_hub import snapshot_download
|
| 4 |
import hydra
|
|
|
|
| 1 |
import os
|
| 2 |
+
|
| 3 |
+
os.environ["TORCHAUDIO_USE_FFMPEG"] = "1"
|
| 4 |
+
|
| 5 |
import queue
|
| 6 |
from huggingface_hub import snapshot_download
|
| 7 |
import hydra
|
fish_speech/text/clean.py
CHANGED
|
@@ -1,61 +1,24 @@
|
|
| 1 |
-
import itertools
|
| 2 |
import re
|
| 3 |
|
| 4 |
-
LANGUAGE_UNICODE_RANGE_MAP = {
|
| 5 |
-
"ZH": [(0x4E00, 0x9FFF)],
|
| 6 |
-
"JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)],
|
| 7 |
-
"EN": [(0x0000, 0x007F)],
|
| 8 |
-
}
|
| 9 |
-
|
| 10 |
SYMBOLS_MAPPING = {
|
| 11 |
-
"：": ",",
|
| 12 |
-
"；": ",",
|
| 13 |
-
"，": ",",
|
| 14 |
-
"。": ".",
|
| 15 |
-
"！": "!",
|
| 16 |
-
"？": "?",
|
| 17 |
-
"\n": ".",
|
| 18 |
-
"·": ",",
|
| 19 |
-
"、": ",",
|
| 20 |
-
"...": "…",
|
| 21 |
"“": "'",
|
| 22 |
"”": "'",
|
| 23 |
"‘": "'",
|
| 24 |
"’": "'",
|
| 25 |
-
"
|
| 26 |
-
"
|
| 27 |
-
"
|
| 28 |
-
"
|
| 29 |
-
"
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"]": "'",
|
| 35 |
-
"—": "-",
|
| 36 |
-
"～": "-",
|
| 37 |
-
"~": "-",
|
| 38 |
-
"・": "-",
|
| 39 |
-
"「": "'",
|
| 40 |
-
"」": "'",
|
| 41 |
-
";": ",",
|
| 42 |
-
":": ",",
|
| 43 |
}
|
| 44 |
|
| 45 |
REPLACE_SYMBOL_REGEX = re.compile(
|
| 46 |
"|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
|
| 47 |
)
|
| 48 |
-
ALL_KNOWN_UTF8_RANGE = list(
|
| 49 |
-
itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values())
|
| 50 |
-
)
|
| 51 |
-
REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
|
| 52 |
-
"[^"
|
| 53 |
-
+ "".join(
|
| 54 |
-
f"{re.escape(chr(start))}-{re.escape(chr(end))}"
|
| 55 |
-
for start, end in ALL_KNOWN_UTF8_RANGE
|
| 56 |
-
)
|
| 57 |
-
+ "]"
|
| 58 |
-
)
|
| 59 |
|
| 60 |
|
| 61 |
def clean_text(text):
|
|
@@ -64,6 +27,5 @@ def clean_text(text):
|
|
| 64 |
|
| 65 |
# Replace all chinese symbols with their english counterparts
|
| 66 |
text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
|
| 67 |
-
# text = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)
|
| 68 |
|
| 69 |
return text
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
SYMBOLS_MAPPING = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"“": "'",
|
| 5 |
"”": "'",
|
| 6 |
"‘": "'",
|
| 7 |
"’": "'",
|
| 8 |
+
"【": "",
|
| 9 |
+
"】": "",
|
| 10 |
+
"[": "",
|
| 11 |
+
"]": "",
|
| 12 |
+
"（": "",
|
| 13 |
+
"）": "",
|
| 14 |
+
"(": "",
|
| 15 |
+
")": "",
|
| 16 |
+
"・": "·",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
}
|
| 18 |
|
| 19 |
REPLACE_SYMBOL_REGEX = re.compile(
|
| 20 |
"|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
|
| 21 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
def clean_text(text):
|
|
|
|
| 27 |
|
| 28 |
# Replace all chinese symbols with their english counterparts
|
| 29 |
text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
|
|
|
|
| 30 |
|
| 31 |
return text
|