File size: 2,151 Bytes
3dd84f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re

from text.english import english_to_ipa2
from text.mandarin import chinese_to_cnm3
from text.japanese import japanese_to_ipa2

# Maps each language tag to an integer ID; "PAD" is 0.
language_module_map = {"PAD":0, "ZH": 1, "EN": 2, "JA": 3}

# Precompiled regular expressions
# Chinese: CJK Extension A, CJK Unified Ideographs, CJK Compatibility
# Ideographs, and CJK Symbols and Punctuation (U+3000-U+303F).
ZH_PATTERN = re.compile(r'[\u3400-\u4DBF\u4e00-\u9FFF\uF900-\uFAFF\u3000-\u303F]')
# English: ASCII letters and ASCII punctuation.
# NOTE(review): the unescaped `*-_` inside the class is a range
# (U+002A-U+005F), so the digits 0-9 also match this pattern -- confirm
# whether classifying digits as English is intended before changing it.
EN_PATTERN = re.compile(r'[a-zA-Z.,!?\'"(){}[\]<>:;@#$%^&*-_+=/\\|~`]+')
# Japanese: Hiragana, Katakana, ideographs U+4E00-U+9FAF, Katakana Phonetic
# Extensions, Halfwidth and Fullwidth Forms, and CJK Symbols and Punctuation.
# The ideograph and U+3000-U+303F ranges overlap with ZH_PATTERN.
JP_PATTERN = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF\u31F0-\u31FF\uFF00-\uFFEF\u3000-\u303F]')
# Matches the literal language tags [ZH], [EN] and [JA].
CLEANER_PATTERN = re.compile(r'\[(ZH|EN|JA)\]')

def detect_language(text: str, prev_lang=None):
    """
    Detect the language of the given text.

    The patterns are tried in the order Chinese, English, Japanese and the
    first one found anywhere in ``text`` wins, so a character covered by
    both the Chinese and the Japanese pattern is reported as 'ZH'.

    :param text: input text
    :param prev_lang: the previously detected language
    :return: 'ZH' for Chinese, 'EN' for English, 'JA' for Japanese;
             ``prev_lang`` when no pattern matches and the text is
             whitespace only; otherwise None
    """
    ordered_checks = (
        (ZH_PATTERN, 'ZH'),
        (EN_PATTERN, 'EN'),
        (JP_PATTERN, 'JA'),
    )
    for pattern, tag in ordered_checks:
        if pattern.search(text):
            return tag
    # Whitespace carries no language of its own, so it inherits the previous one.
    return prev_lang if text.isspace() else None

# auto detect language using re
def cjke_cleaners4(text: str):
    """
    Auto-detect the language of each part of the text and convert it.

    Any ``[ZH]``, ``[EN]`` or ``[JA]`` tags are removed first. The remaining
    text is split into consecutive runs of characters that
    ``detect_language`` assigns to the same language, and each run is passed
    to the matching converter (``chinese_to_cnm3``, ``japanese_to_ipa2`` or
    ``english_to_ipa2``). Runs whose language is not recognised are dropped.
    Finally, trailing whitespace is stripped and a ``.`` is appended unless
    the result is empty or already ends with one of ``. , ! ? - … ~``.

    :param text: input text
    :return: the converted text; an empty string when there is nothing to
             convert (empty input or input consisting only of language tags)
    """
    # Nothing to clean or convert.
    if text == '':
        return ''

    text = CLEANER_PATTERN.sub('', text)
    converters = {
        'ZH': chinese_to_cnm3,
        'JA': japanese_to_ipa2,
        'EN': english_to_ipa2,
    }

    pieces = []
    length = len(text)
    pointer = 0
    while pointer < length:
        # Detect the language at the start of every run *inside* the bounds
        # check, so text that is empty after tag removal never gets indexed.
        current_language = detect_language(text[pointer])
        start = pointer
        # The first character always belongs to its own run.
        pointer += 1
        while (pointer < length
               and detect_language(text[pointer], current_language) == current_language):
            pointer += 1

        convert = converters.get(current_language)
        if convert is not None:
            pieces.append(convert(text[start:pointer]))

    output = ''.join(pieces)
    output = re.sub(r'\s+$', '', output)
    # Make sure non-empty output ends with some punctuation mark.
    output = re.sub(r'([^\.,!\?\-…~])$', r'\1.', output)
    return output