|
import regex as re
|
|
|
|
# Resolve which language-identification backend to use. The project config is
# optional here: if it cannot be imported, or does not expose the expected
# attribute, fall back to "langid".
try:
    from config import config

    LANGUAGE_IDENTIFICATION_LIBRARY = (
        config.webui_config.language_identification_library
    )
except Exception:
    # Deliberately best-effort: any ordinary failure while loading the config
    # selects the default backend. Unlike a bare ``except:``, this still lets
    # KeyboardInterrupt and SystemExit propagate.
    LANGUAGE_IDENTIFICATION_LIBRARY = "langid"

# Lower-cased backend name that classify_language() dispatches on.
module = LANGUAGE_IDENTIFICATION_LIBRARY.lower()
|
|
|
|
# Two-letter language codes that classify_language() keeps when it filters
# ``target_languages`` for the langid backend (presumably the languages langid
# supports -- verify against the installed langid version).
langid_languages = """
    af am an ar as az be bg bn br
    bs ca cs cy da de dz el en eo
    es et eu fa fi fo fr ga gl gu
    he hi hr ht hu hy id is it ja
    jv ka kk km kn ko ku ky la lb
    lo lt lv mg mk ml mn mr ms mt
    nb ne nl nn no oc or pa pl ps
    pt qu ro ru rw se si sk sl sq
    sr sv sw ta te th tl tr ug uk
    ur vi vo wa xh zh zu
""".split()
|
|
|
|
|
|
def classify_language(text: str, target_languages: list = None) -> str:
    """Identify the language of *text* with the configured backend.

    The backend is selected by the module-level ``module`` name:
    ``"fastlid"`` / ``"fasttext"`` use the ``fastlid`` package and
    ``"langid"`` uses the ``langid`` package. The backend is imported lazily
    inside the function, so only the selected package has to be installed.

    Args:
        text: Text to classify.
        target_languages: Optional language codes to restrict the classifier
            to. Codes the selected backend does not list as supported are
            dropped before the restriction is applied. ``None`` leaves the
            backend's current language set untouched.

    Returns:
        The first element of the backend's result for *text* (presumably the
        detected language code).

    Raises:
        ValueError: If ``module`` does not name a known backend.
    """
    # NOTE(review): the restriction is stored on the backend itself, so it
    # presumably stays in effect for later calls that pass
    # ``target_languages=None`` -- confirm this is the intended behaviour.
    if module in ("fastlid", "fasttext"):
        from fastlid import fastlid, supported_langs

        classifier = fastlid
        if target_languages is not None:
            target_languages = [
                lang for lang in target_languages if lang in supported_langs
            ]
            # NOTE(review): fastlid is configured by assigning to this
            # attribute rather than calling it -- confirm against the
            # installed fastlid version.
            fastlid.set_languages = target_languages
    elif module == "langid":
        import langid

        classifier = langid.classify
        if target_languages is not None:
            target_languages = [
                lang for lang in target_languages if lang in langid_languages
            ]
            langid.set_languages(target_languages)
    else:
        raise ValueError(f"Wrong module {module}")

    return classifier(text)[0]
|
|
|
|
|
|
def classify_zh_ja(text: str) -> str:
    """Label *text* as Japanese or Chinese based on the presence of kana.

    Args:
        text: Text to inspect.

    Returns:
        ``"ja"`` if any character is hiragana (U+3040-U+309F) or katakana
        (U+30A0-U+30FF); otherwise ``"zh"``. Characters outside the kana
        blocks, including Han ideographs, never change the outcome, so empty
        text and text without any CJK characters also yield ``"zh"``.
    """
    # The hiragana and katakana blocks are adjacent, so a single range check
    # covers both.
    has_kana = any(0x3040 <= ord(char) <= 0x30FF for char in text)
    return "ja" if has_kana else "zh"
|
|
|
|
|
|
def split_alpha_nonalpha(text, mode=1):
    """Split *text* at the boundaries between Latin-script and CJK/kana runs.

    The split points are zero-width: no characters are removed, the text is
    only cut where a Latin-script character meets a character in
    U+4E00-U+9FFF or U+3040-U+30FF. Whitespace is accepted as the character
    immediately before a boundary on either side.

    Args:
        text: Text to split.
        mode: How digits are grouped.
            ``1`` keeps digits with the CJK/kana side, so ``"花费3days"`` is
            cut between ``3`` and ``days``.
            ``2`` keeps digits with the Latin side, so ``"花费3days"`` is cut
            between ``花费`` and ``3days``.

    Returns:
        The list of pieces produced by ``re.split``.

    Raises:
        ValueError: If *mode* is neither 1 nor 2.
    """
    if mode == 1:
        digits_with_cjk = r"(?<=[\u4e00-\u9fff\u3040-\u30FF\d\s])(?=[\p{Latin}])|(?<=[\p{Latin}\s])(?=[\u4e00-\u9fff\u3040-\u30FF\d])"
        return re.split(digits_with_cjk, text)

    if mode == 2:
        digits_with_latin = r"(?<=[\u4e00-\u9fff\u3040-\u30FF\s])(?=[\p{Latin}\d])|(?<=[\p{Latin}\d\s])(?=[\u4e00-\u9fff\u3040-\u30FF])"
        return re.split(digits_with_latin, text)

    raise ValueError("Invalid mode. Supported modes are 1 and 2.")
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke-test both classifiers on one Chinese and one Japanese sample.
    for sample in ("这是一个测试文本", "これはテストテキストです"):
        print(classify_language(sample))
        print(classify_zh_ja(sample))

    # Show how each split mode treats mixed Latin/CJK text, first without and
    # then with spaces around the Latin words.
    mixed_samples = (
        "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days",
        "vits 和 Bert-VITS2 是 tts 模型。花费3days.花费3天。Take 3 days",
    )
    for sample in mixed_samples:
        for split_mode in (1, 2):
            print(split_alpha_nonalpha(sample, mode=split_mode))
|
|
|
|
|