Spaces:
Running
on
Zero
Running
on
Zero
Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +2 -0
- inference_webui.py +15 -0
- text/.gitignore +3 -0
- text/G2PWModel/MONOPHONIC_CHARS.txt +0 -0
- text/G2PWModel/POLYPHONIC_CHARS.txt +0 -0
- text/G2PWModel/bopomofo_to_pinyin_wo_tune_dict.json +1 -0
- text/G2PWModel/char_bopomofo_dict.json +0 -0
- text/G2PWModel/config.py +19 -0
- text/G2PWModel/g2pW.onnx +3 -0
- text/G2PWModel/record.log +1005 -0
- text/G2PWModel/version +1 -0
- text/LangSegmenter/__init__.py +1 -0
- text/LangSegmenter/langsegmenter.py +175 -0
- text/__init__.py +28 -0
- text/cantonese.py +222 -0
- text/chinese.py +208 -0
- text/chinese2.py +353 -0
- text/cleaner.py +94 -0
- text/cmudict-fast.rep +0 -0
- text/cmudict.rep +0 -0
- text/cmudict_cache.pickle +3 -0
- text/en_normalization/expend.py +283 -0
- text/engdict-hot.rep +3 -0
- text/engdict_cache.pickle +3 -0
- text/english.py +374 -0
- text/g2pw/__init__.py +1 -0
- text/g2pw/dataset.py +160 -0
- text/g2pw/g2pw.py +159 -0
- text/g2pw/onnx_api.py +243 -0
- text/g2pw/polyphonic-fix.rep +0 -0
- text/g2pw/polyphonic.pickle +3 -0
- text/g2pw/polyphonic.rep +53 -0
- text/g2pw/utils.py +143 -0
- text/ja_userdic/user.dict +3 -0
- text/ja_userdic/userdict.csv +3 -0
- text/ja_userdic/userdict.md5 +1 -0
- text/japanese.py +276 -0
- text/korean.py +337 -0
- text/namedict_cache.pickle +3 -0
- text/opencpop-strict.txt +429 -0
- text/symbols.py +399 -0
- text/symbols2.py +797 -0
- text/tone_sandhi.py +778 -0
- text/zh_normalization/README.md +16 -0
- text/zh_normalization/__init__.py +14 -0
- text/zh_normalization/char_convert.py +44 -0
- text/zh_normalization/chronology.py +139 -0
- text/zh_normalization/constants.py +62 -0
- text/zh_normalization/num.py +317 -0
- text/zh_normalization/phonecode.py +59 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
text/ja_userdic/user.dict filter=lfs diff=lfs merge=lfs -text
|
37 |
+
text/ja_userdic/userdict.csv filter=lfs diff=lfs merge=lfs -text
|
inference_webui.py
CHANGED
@@ -6,6 +6,21 @@
|
|
6 |
全部按英文识别
|
7 |
全部按日文识别
|
8 |
'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
import logging
|
10 |
import traceback
|
11 |
logging.getLogger("markdown_it").setLevel(logging.ERROR)
|
|
|
6 |
全部按英文识别
|
7 |
全部按日文识别
|
8 |
'''
|
9 |
+
import os
|
10 |
+
os.makedirs("pretrained_models",exist_ok=True)
|
11 |
+
from huggingface_hub import snapshot_download
|
12 |
+
snapshot_download(
|
13 |
+
repo_id="",
|
14 |
+
# repo_type="dataset",
|
15 |
+
# repo_type="model",
|
16 |
+
repo_type="space",
|
17 |
+
# allow_patterns="data/*", # 通配符匹配文件夹内容
|
18 |
+
# local_dir="/data/docker/liujing04/galgame/7zs", # 指定本地存储目录
|
19 |
+
# local_dir="/data/docker/accvideo_14B_i2v_480P", # 指定本地存储目录
|
20 |
+
# local_dir="/DATA/bvac/personal/gpt-vits-coeus/v2pp-hf-space", # 指定本地存储目录
|
21 |
+
# local_dir="/DATA/bvac/personal/modelscope-7z-packages/PluginsKers-Convbased-Studio-bak250621", # 指定本地存储目录
|
22 |
+
local_dir="/DATA/bvac/personal/modelscope-7z-packages/kemuriririn-Voice-Clone-Arena-bak250622", # 指定本地存储目录
|
23 |
+
)
|
24 |
import logging
|
25 |
import traceback
|
26 |
logging.getLogger("markdown_it").setLevel(logging.ERROR)
|
text/.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
G2PWModel
|
2 |
+
__pycache__
|
3 |
+
*.zip
|
text/G2PWModel/MONOPHONIC_CHARS.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
text/G2PWModel/POLYPHONIC_CHARS.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
text/G2PWModel/bopomofo_to_pinyin_wo_tune_dict.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"ㄌㄧㄥ": "ling", "ㄩㄢ": "yuan", "ㄒㄧㄥ": "xing", "ㄑㄧㄡ": "qiu", "ㄊㄧㄢ": "tian", "ㄎㄨㄚ": "kua", "ㄨ": "wu", "ㄧㄣ": "yin", "ㄧ": "yi", "ㄒㄧㄝ": "xie", "ㄔㄡ": "chou", "ㄋㄨㄛ": "nuo", "ㄉㄢ": "dan", "ㄒㄩ": "xu", "ㄒㄩㄥ": "xiong", "ㄌㄧㄡ": "liu", "ㄌㄧㄣ": "lin", "ㄒㄧㄤ": "xiang", "ㄩㄥ": "yong", "ㄒㄧㄣ": "xin", "ㄓㄣ": "zhen", "ㄉㄞ": "dai", "ㄆㄢ": "pan", "ㄖㄨ": "ru", "ㄇㄚ": "ma", "ㄑㄧㄢ": "qian", "ㄘ": "ci", "ㄓㄨㄥ": "zhong", "ㄋㄟ": "nei", "ㄔㄥ": "cheng", "ㄈㄥ": "feng", "ㄓㄨㄛ": "zhuo", "ㄈㄤ": "fang", "ㄠ": "ao", "ㄗㄨㄛ": "zuo", "ㄓㄡ": "zhou", "ㄉㄨㄥ": "dong", "ㄙㄨ": "su", "ㄑㄩㄥ": "qiong", "ㄎㄨㄤ": "kuang", "ㄨㄤ": "wang", "ㄌㄟ": "lei", "ㄋㄠ": "nao", "ㄓㄨ": "zhu", "ㄕㄨ": "shu", "ㄕㄣ": "shen", "ㄐㄧㄝ": "jie", "ㄉㄧㄝ": "die", "ㄔ": "chi", "ㄌㄨㄥ": "long", "ㄧㄥ": "ying", "ㄅㄥ": "beng", "ㄌㄢ": "lan", "ㄇㄧㄠ": "miao", "ㄌㄧ": "li", "ㄐㄧ": "ji", "ㄩ": "yu", "ㄌㄨㄛ": "luo", "ㄔㄞ": "chai", "ㄏㄨㄣ": "hun", "ㄏㄨㄟ": "hui", "ㄖㄠ": "rao", "ㄏㄢ": "han", "ㄒㄧ": "xi", "ㄊㄞ": "tai", "ㄧㄠ": "yao", "ㄐㄩㄣ": "jun", "ㄌㄩㄝ": "lve", "ㄊㄤ": "tang", "ㄓㄠ": "zhao", "ㄓㄞ": "zhai", "ㄓㄚ": "zha", "ㄦ": "er", "ㄖㄢ": "ran", "ㄑㄧ": "qi", "ㄙㄜ": "se", "ㄙ": "si", "ㄙㄚ": "sa", "ㄎㄨㄟ": "kui", "ㄆㄨ": "pu", "ㄊㄚ": "ta", "ㄉㄨ": "du", "ㄊㄨ": "tu", "ㄧㄤ": "yang", "ㄡ": "ou", "ㄇㄧㄢ": "mian", "ㄨㄣ": "wen", "ㄉㄧㄠ": "diao", "ㄇㄧㄝ": "mie", "ㄨㄚ": "wa", "ㄋㄧㄠ": "niao", "ㄧㄡ": "you", "ㄔㄜ": "che", "ㄑㄩㄢ": "quan", "ㄘㄞ": "cai", "ㄌㄧㄤ": "liang", "ㄍㄨ": "gu", "ㄇㄠ": "mao", "ㄍㄨㄚ": "gua", "ㄙㄨㄟ": "sui", "ㄇㄢ": "man", "ㄕ": "shi", "ㄎㄡ": "kou", "ㄊㄧㄥ": "ting", "ㄅㄧㄥ": "bing", "ㄏㄨㄛ": "huo", "ㄍㄨㄥ": "gong", "ㄑㄧㄣ": "qin", "ㄐㄩㄥ": "jiong", "ㄌㄨ": "lu", "ㄋㄢ": "nan", "ㄅㄧ": "bi", "ㄑㄧㄚ": "qia", "ㄆㄧ": "pi", "ㄉㄧㄢ": "dian", "ㄈㄨ": "fu", "ㄍㄜ": "ge", "ㄅㄞ": "bai", "ㄍㄢ": "gan", "ㄒㄩㄢ": "xuan", "ㄌㄤ": "lang", "ㄕㄜ": "she", "ㄏㄨㄚ": "hua", "ㄊㄡ": "tou", "ㄆㄧㄢ": "pian", "ㄉㄧ": "di", "ㄖㄨㄢ": "ruan", "ㄜ": "e", "ㄑㄧㄝ": "qie", "ㄉㄡ": "dou", "ㄖㄨㄟ": "rui", "ㄘㄨㄟ": "cui", "ㄐㄧㄢ": "jian", "ㄔㄨㄥ": "chong", "ㄉㄥ": "deng", "ㄐㄩㄝ": "jue", "ㄒㄩㄝ": "xue", "ㄒㄧㄠ": "xiao", "ㄗㄢ": "zan", "ㄓㄢ": "zhan", "ㄗㄡ": "zou", "ㄘㄡ": "cou", "ㄔㄨㄚ": "chua", "ㄈㄟ": "fei", "ㄅㄟ": "bei", "ㄔㄨ": "chu", "ㄅㄚ": "ba", "ㄎㄨㄞ": "kuai", 
"ㄒㄧㄚ": "xia", "ㄏㄜ": "he", "ㄅㄧㄝ": "bie", "ㄌㄩ": "lv", "ㄙㄨㄢ": "suan", "ㄏㄥ": "heng", "ㄍㄨㄟ": "gui", "ㄌㄡ": "lou", "ㄊㄧ": "ti", "ㄌㄜ": "le", "ㄙㄨㄣ": "sun", "ㄒㄧㄢ": "xian", "ㄑㄩㄝ": "que", "ㄓ": "zhi", "ㄐㄧㄚ": "jia", "ㄏㄨ": "hu", "ㄌㄚ": "la", "ㄎㄜ": "ke", "ㄞ": "ai", "ㄨㄟ": "wei", "ㄏㄨㄢ": "huan", "ㄕㄨㄚ": "shua", "ㄕㄨㄤ": "shuang", "ㄍㄞ": "gai", "ㄏㄞ": "hai", "ㄧㄢ": "yan", "ㄈㄢ": "fan", "ㄆㄤ": "pang", "ㄙㄨㄥ": "song", "ㄋㄜ": "ne", "ㄔㄣ": "chen", "ㄍㄨㄛ": "guo", "ㄣ": "en", "ㄋㄍ": "ng", "ㄆㄚ": "pa", "ㄈㄚ": "fa", "ㄆㄡ": "pou", "ㄏㄡ": "hou", "ㄑㄩ": "qu", "ㄒㄩㄣ": "xun", "ㄋㄧㄝ": "nie", "ㄏㄨㄥ": "hong", "ㄊㄨㄣ": "tun", "ㄨㄞ": "wai", "ㄕㄡ": "shou", "ㄧㄝ": "ye", "ㄐㄩ": "ju", "ㄙㄡ": "sou", "ㄌㄨㄣ": "lun", "ㄋㄧㄚ": "nia", "ㄆㄣ": "pen", "ㄈㄣ": "fen", "ㄔㄨㄣ": "chun", "ㄋㄧㄡ": "niu", "ㄖㄡ": "rou", "ㄉㄨㄛ": "duo", "ㄗㄜ": "ze", "ㄕㄥ": "sheng", "ㄎㄨ": "ku", "ㄧㄚ": "ya", "ㄓㄨㄟ": "zhui", "ㄍㄡ": "gou", "ㄅㄛ": "bo", "ㄋㄚ": "na", "ㄒㄧㄡ": "xiu", "ㄘㄨ": "cu", "ㄎㄨㄛ": "kuo", "ㄌㄠ": "lao", "ㄘㄨㄥ": "cong", "ㄉㄚ": "da", "ㄆㄛ": "po", "ㄙㄞ": "sai", "ㄌㄥ": "leng", "ㄖㄨㄥ": "rong", "ㄋㄧ": "ni", "ㄆㄠ": "pao", "ㄎㄢ": "kan", "ㄨㄥ": "weng", "ㄨㄢ": "wan", "ㄏㄠ": "hao", "ㄐㄧㄥ": "jing", "ㄊㄢ": "tan", "ㄅㄨ": "bu", "ㄗㄤ": "zang", "ㄐㄧㄡ": "jiu", "ㄇㄟ": "mei", "ㄇㄨ": "mu", "ㄉㄨㄟ": "dui", "ㄅㄤ": "bang", "ㄅㄠ": "bao", "ㄔㄤ": "chang", "ㄓㄤ": "zhang", "ㄗㄨㄥ": "zong", "ㄍㄨㄣ": "gun", "ㄌㄧㄠ": "liao", "ㄔㄢ": "chan", "ㄓㄜ": "zhe", "ㄇㄥ": "meng", "ㄑㄧㄠ": "qiao", "ㄋㄤ": "nang", "ㄩㄣ": "yun", "ㄎㄞ": "kai", "ㄍㄠ": "gao", "ㄊㄠ": "tao", "ㄕㄢ": "shan", "ㄌㄞ": "lai", "ㄅㄢ": "ban", "ㄎㄨㄥ": "kong", "ㄔㄨㄛ": "chuo", "ㄋㄨ": "nu", "ㄆㄟ": "pei", "ㄆㄥ": "peng", "ㄘㄢ": "can", "ㄙㄨㄛ": "suo", "ㄊㄨㄥ": "tong", "ㄑㄧㄤ": "qiang", "ㄙㄠ": "sao", "ㄓㄨㄢ": "zhuan", "ㄢ": "an", "ㄔㄚ": "cha", "ㄕㄚ": "sha", "ㄌㄧㄢ": "lian", "ㄇㄧ": "mi", "ㄋㄡ": "nou", "ㄘㄠ": "cao", "ㄙㄣ": "sen", "ㄋㄣ": "nen", "ㄋㄧㄢ": "nian", "ㄇㄞ": "mai", "ㄩㄝ": "yue", "ㄋㄞ": "nai", "ㄏㄨㄞ": "huai", "ㄗ": "zi", "ㄌㄨㄢ": "luan", "ㄉ��ㄥ": "ding", "ㄇㄤ": "mang", "ㄋㄧㄥ": "ning", "ㄇㄧㄥ": "ming", "ㄗㄨㄟ": "zui", "ㄎㄤ": "kang", "ㄉㄜ": "de", "ㄅㄧㄢ": "bian", "ㄐㄧㄣ": "jin", "ㄔㄨㄟ": "chui", "ㄊㄨㄟ": "tui", "ㄗㄚ": "za", "ㄘㄣ": "cen", "ㄇㄧㄣ": "min", 
"ㄏㄨㄤ": "huang", "ㄗㄨ": "zu", "ㄘㄨㄛ": "cuo", "ㄊㄨㄛ": "tuo", "ㄑㄩㄣ": "qun", "ㄅㄧㄣ": "bin", "ㄊㄧㄠ": "tiao", "ㄍㄤ": "gang", "ㄉㄨㄢ": "duan", "ㄅㄧㄠ": "biao", "ㄉㄠ": "dao", "ㄖㄨㄣ": "run", "ㄐㄧㄠ": "jiao", "ㄨㄛ": "wo", "ㄘㄨㄢ": "cuan", "ㄖㄣ": "ren", "ㄇㄣ": "men", "ㄓㄨㄣ": "zhun", "ㄎㄨㄣ": "kun", "ㄔㄨㄤ": "chuang", "ㄗㄠ": "zao", "ㄓㄥ": "zheng", "ㄆㄧㄣ": "pin", "ㄅㄣ": "ben", "ㄐㄧㄤ": "jiang", "ㄐㄩㄢ": "juan", "ㄘㄥ": "ceng", "ㄏㄤ": "hang", "ㄋㄧㄣ": "nin", "ㄌㄧㄝ": "lie", "ㄍㄨㄤ": "guang", "ㄙㄢ": "san", "ㄊㄜ": "te", "ㄕㄨㄣ": "shun", "ㄕㄨㄟ": "shui", "ㄔㄠ": "chao", "ㄘㄜ": "ce", "ㄍㄨㄞ": "guai", "ㄎㄥ": "keng", "ㄕㄞ": "shai", "ㄉㄣ": "den", "ㄊㄨㄢ": "tuan", "ㄆㄧㄠ": "piao", "ㄑㄧㄥ": "qing", "ㄍㄥ": "geng", "ㄔㄨㄞ": "chuai", "ㄕㄠ": "shao", "ㄍㄣ": "gen", "ㄋㄨㄢ": "nuan", "ㄖㄥ": "reng", "ㄇㄡ": "mou", "ㄆㄞ": "pai", "ㄤ": "ang", "ㄎㄚ": "ka", "ㄍㄨㄢ": "guan", "ㄕㄨㄛ": "shuo", "ㄏㄣ": "hen", "ㄔㄨㄢ": "chuan", "ㄎㄨㄢ": "kuan", "ㄏㄟ": "hei", "ㄇㄛ": "mo", "ㄗㄞ": "zai", "ㄋㄥ": "neng", "ㄕㄨㄞ": "shuai", "ㄖㄜ": "re", "ㄋㄩ": "nv", "ㄆㄧㄥ": "ping", "ㄘㄤ": "cang", "ㄋㄨㄥ": "nong", "ㄎㄠ": "kao", "ㄗㄨㄢ": "zuan", "ㄎㄣ": "ken", "ㄍㄚ": "ga", "ㄗㄣ": "zen", "ㄉㄤ": "dang", "ㄗㄥ": "zeng", "ㄉㄨㄣ": "dun", "ㄘㄚ": "ca", "ㄖㄤ": "rang", "ㄘㄨㄣ": "cun", "ㄖㄨㄛ": "ruo", "ㄊㄧㄝ": "tie", "ㄊㄥ": "teng", "ㄙㄥ": "seng", "ㄖ": "ri", "ㄗㄨㄣ": "zun", "ㄋㄧㄤ": "niang", "ㄋㄩㄝ": "nve", "ㄙㄤ": "sang", "ㄓㄨㄤ": "zhuang", "ㄕㄤ": "shang", "ㄆㄧㄝ": "pie", "ㄕㄨㄢ": "shuan", "ㄈㄡ": "fou", "ㄉㄧㄡ": "diu", "ㄇㄜ": "me", "ㄈㄛ": "fo", "ㄌㄧㄚ": "lia", "ㄎㄟ": "kei", "ㄏㄚ": "ha", "ㄚ": "a", "ㄌㄛ": "lo", "ㄧㄛ": "yo", "ㄛ": "o", "ㄏㄋㄍ": "hng", "ㄋ": "n", "ㄌㄣ": "len", "ㄉㄧㄚ": "dia", "ㄇㄧㄡ": "miu", "ㄉㄟ": "dei", "ㄏㄇ": "hm", "ㄋㄨㄣ": "nun", "ㄓㄨㄞ": "zhuai", "ㄊㄟ": "tei", "ㄗㄟ": "zei", "ㄓㄨㄚ": "zhua", "ㄖㄨㄚ": "rua", "ê": "ê", "ㄟ": "ei", "ㄍㄟ": "gei", "ㄈㄧㄠ": "fiao", "ㄕㄟ": "shei", "ㄓㄟ": "zhei", "ㄥ": "eng", "ㄘㄟ": "cei", "ㄉㄧㄣ": "din", "ㄅㄧㄤ": "biang", "ㄧㄞ": "yai"}
|
text/G2PWModel/char_bopomofo_dict.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
text/G2PWModel/config.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
manual_seed = 1313
|
2 |
+
model_source = 'bert-base-chinese'
|
3 |
+
window_size = 32
|
4 |
+
num_workers = 2
|
5 |
+
use_mask = True
|
6 |
+
use_conditional = True
|
7 |
+
param_conditional = {
|
8 |
+
'bias': True,
|
9 |
+
'char-linear': True,
|
10 |
+
'pos-linear': False,
|
11 |
+
'char+pos-second': True,
|
12 |
+
}
|
13 |
+
|
14 |
+
batch_size = 256
|
15 |
+
use_pos = True
|
16 |
+
param_pos = {
|
17 |
+
'weight': 0.1,
|
18 |
+
'pos_joint_training': True,
|
19 |
+
}
|
text/G2PWModel/g2pW.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2eb3c71fd95117b2e1abef8d2d0cd78aae894bbe7f0fac105ddc9c32ce63cbd0
|
3 |
+
size 635212732
|
text/G2PWModel/record.log
ADDED
@@ -0,0 +1,1005 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
device: cuda
|
2 |
+
now: 2022-04-01 22:13:18.349604
|
3 |
+
[200] train_loss=0.289519 valid_loss=0.102661 valid_pos_acc=0.924619 valid_acc=0.97596 / 0.703958 / 0.586078 best_acc=0.97596
|
4 |
+
now: 2022-04-01 22:25:27.330080
|
5 |
+
[400] train_loss=0.089245 valid_loss=0.0703849 valid_pos_acc=0.942315 valid_acc=0.984227 / 0.747566 / 0.616754 best_acc=0.984227
|
6 |
+
now: 2022-04-01 22:37:16.857336
|
7 |
+
[600] train_loss=0.0663516 valid_loss=0.0597114 valid_pos_acc=0.946489 valid_acc=0.98734 / 0.77479 / 0.638442 best_acc=0.98734
|
8 |
+
now: 2022-04-01 22:49:06.182095
|
9 |
+
[800] train_loss=0.0559394 valid_loss=0.0535268 valid_pos_acc=0.948245 valid_acc=0.988928 / 0.774415 / 0.643435 best_acc=0.988928
|
10 |
+
now: 2022-04-01 23:00:55.371920
|
11 |
+
[1000] train_loss=0.0497098 valid_loss=0.0490104 valid_pos_acc=0.954161 valid_acc=0.989486 / 0.796356 / 0.664386 best_acc=0.989486
|
12 |
+
now: 2022-04-01 23:12:49.781716
|
13 |
+
[1200] train_loss=0.0462926 valid_loss=0.0466889 valid_pos_acc=0.954634 valid_acc=0.989913 / 0.802885 / 0.673908 best_acc=0.989913
|
14 |
+
now: 2022-04-01 23:24:43.685062
|
15 |
+
[1400] train_loss=0.0433836 valid_loss=0.0451725 valid_pos_acc=0.956761 valid_acc=0.99049 / 0.805024 / 0.674369 best_acc=0.99049
|
16 |
+
now: 2022-04-01 23:36:46.100963
|
17 |
+
[1600] train_loss=0.0404561 valid_loss=0.0436914 valid_pos_acc=0.957201 valid_acc=0.991022 / 0.811412 / 0.679481 best_acc=0.991022
|
18 |
+
now: 2022-04-01 23:48:48.583240
|
19 |
+
[1800] train_loss=0.040905 valid_loss=0.0412648 valid_pos_acc=0.958418 valid_acc=0.991332 / 0.815194 / 0.681627 best_acc=0.991332
|
20 |
+
now: 2022-04-02 00:00:42.282365
|
21 |
+
[2000] train_loss=0.0384612 valid_loss=0.0402427 valid_pos_acc=0.959796 valid_acc=0.991534 / 0.819666 / 0.689516 best_acc=0.991534
|
22 |
+
now: 2022-04-02 00:12:52.902834
|
23 |
+
[2200] train_loss=0.0373539 valid_loss=0.0410455 valid_pos_acc=0.961692 valid_acc=0.991425 / 0.828402 / 0.696595 best_acc=0.991534
|
24 |
+
now: 2022-04-02 00:25:06.851427
|
25 |
+
[2400] train_loss=0.0367612 valid_loss=0.039694 valid_pos_acc=0.960926 valid_acc=0.991823 / 0.830391 / 0.700222 best_acc=0.991823
|
26 |
+
now: 2022-04-02 00:37:24.156808
|
27 |
+
[2600] train_loss=0.0386493 valid_loss=0.0377683 valid_pos_acc=0.962183 valid_acc=0.992202 / 0.832219 / 0.707156 best_acc=0.992202
|
28 |
+
now: 2022-04-02 00:49:37.943513
|
29 |
+
[2800] train_loss=0.0356553 valid_loss=0.0381159 valid_pos_acc=0.962729 valid_acc=0.992061 / 0.835112 / 0.707941 best_acc=0.992202
|
30 |
+
now: 2022-04-02 01:01:43.672504
|
31 |
+
[3000] train_loss=0.0338178 valid_loss=0.0386144 valid_pos_acc=0.962419 valid_acc=0.992322 / 0.835546 / 0.710556 best_acc=0.992322
|
32 |
+
now: 2022-04-02 01:13:56.991606
|
33 |
+
[3200] train_loss=0.0335683 valid_loss=0.0381786 valid_pos_acc=0.962755 valid_acc=0.992233 / 0.838008 / 0.713975 best_acc=0.992322
|
34 |
+
now: 2022-04-02 01:26:13.830261
|
35 |
+
[3400] train_loss=0.0316981 valid_loss=0.0373759 valid_pos_acc=0.963524 valid_acc=0.992309 / 0.843253 / 0.718974 best_acc=0.992322
|
36 |
+
now: 2022-04-02 01:38:08.308362
|
37 |
+
[3600] train_loss=0.0350782 valid_loss=0.0376615 valid_pos_acc=0.96404 valid_acc=0.992259 / 0.84979 / 0.725183 best_acc=0.992322
|
38 |
+
now: 2022-04-02 01:49:59.416353
|
39 |
+
[3800] train_loss=0.0321498 valid_loss=0.0367548 valid_pos_acc=0.964441 valid_acc=0.992801 / 0.850152 / 0.722988 best_acc=0.992801
|
40 |
+
now: 2022-04-02 02:02:09.238893
|
41 |
+
[4000] train_loss=0.0331685 valid_loss=0.0369892 valid_pos_acc=0.963339 valid_acc=0.992777 / 0.859395 / 0.730708 best_acc=0.992801
|
42 |
+
now: 2022-04-02 02:14:27.957159
|
43 |
+
[4200] train_loss=0.0317164 valid_loss=0.0350153 valid_pos_acc=0.965784 valid_acc=0.992656 / 0.853549 / 0.727562 best_acc=0.992801
|
44 |
+
now: 2022-04-02 02:26:43.092476
|
45 |
+
[4400] train_loss=0.0324034 valid_loss=0.0346509 valid_pos_acc=0.965843 valid_acc=0.992981 / 0.853694 / 0.73043 best_acc=0.992981
|
46 |
+
now: 2022-04-02 02:39:22.465030
|
47 |
+
[4600] train_loss=0.0298959 valid_loss=0.0356152 valid_pos_acc=0.965606 valid_acc=0.993022 / 0.855494 / 0.728954 best_acc=0.993022
|
48 |
+
now: 2022-04-02 02:51:53.210107
|
49 |
+
[4800] train_loss=0.0310447 valid_loss=0.0355586 valid_pos_acc=0.965446 valid_acc=0.992597 / 0.851145 / 0.728595 best_acc=0.993022
|
50 |
+
now: 2022-04-02 03:04:21.463931
|
51 |
+
[5000] train_loss=0.031017 valid_loss=0.034331 valid_pos_acc=0.965695 valid_acc=0.992866 / 0.852123 / 0.728928 best_acc=0.993022
|
52 |
+
now: 2022-04-02 03:16:32.777183
|
53 |
+
[5200] train_loss=0.0312034 valid_loss=0.0349778 valid_pos_acc=0.966472 valid_acc=0.993105 / 0.855114 / 0.733248 best_acc=0.993105
|
54 |
+
now: 2022-04-02 03:28:51.440974
|
55 |
+
[5400] train_loss=0.0294329 valid_loss=0.0339991 valid_pos_acc=0.966307 valid_acc=0.993109 / 0.852872 / 0.727198 best_acc=0.993109
|
56 |
+
now: 2022-04-02 03:41:07.884688
|
57 |
+
[5600] train_loss=0.0285982 valid_loss=0.0341394 valid_pos_acc=0.966307 valid_acc=0.993183 / 0.858873 / 0.736458 best_acc=0.993183
|
58 |
+
now: 2022-04-02 03:53:43.422479
|
59 |
+
[5800] train_loss=0.0283985 valid_loss=0.0325766 valid_pos_acc=0.96683 valid_acc=0.993376 / 0.856761 / 0.738166 best_acc=0.993376
|
60 |
+
now: 2022-04-02 04:06:06.964628
|
61 |
+
[6000] train_loss=0.0302441 valid_loss=0.0344224 valid_pos_acc=0.966774 valid_acc=0.992838 / 0.85689 / 0.733677 best_acc=0.993376
|
62 |
+
now: 2022-04-02 04:18:20.312766
|
63 |
+
[6200] train_loss=0.0289215 valid_loss=0.0348225 valid_pos_acc=0.966589 valid_acc=0.993367 / 0.858202 / 0.736723 best_acc=0.993376
|
64 |
+
now: 2022-04-02 04:30:36.722397
|
65 |
+
[6400] train_loss=0.0294263 valid_loss=0.0329629 valid_pos_acc=0.966854 valid_acc=0.993081 / 0.856632 / 0.7381 best_acc=0.993376
|
66 |
+
now: 2022-04-02 04:42:53.493232
|
67 |
+
[6600] train_loss=0.0285769 valid_loss=0.0333396 valid_pos_acc=0.967153 valid_acc=0.993547 / 0.865742 / 0.743425 best_acc=0.993547
|
68 |
+
now: 2022-04-02 04:55:17.818463
|
69 |
+
[6800] train_loss=0.0265485 valid_loss=0.0330653 valid_pos_acc=0.967776 valid_acc=0.993222 / 0.865918 / 0.743298 best_acc=0.993547
|
70 |
+
now: 2022-04-02 05:07:36.630349
|
71 |
+
[7000] train_loss=0.0284473 valid_loss=0.0320964 valid_pos_acc=0.968023 valid_acc=0.99355 / 0.868261 / 0.748849 best_acc=0.99355
|
72 |
+
now: 2022-04-02 05:20:01.434422
|
73 |
+
[7200] train_loss=0.0274993 valid_loss=0.0326511 valid_pos_acc=0.9669 valid_acc=0.993816 / 0.868294 / 0.746817 best_acc=0.993816
|
74 |
+
now: 2022-04-02 05:32:29.662142
|
75 |
+
[7400] train_loss=0.02851 valid_loss=0.0308467 valid_pos_acc=0.968453 valid_acc=0.993858 / 0.863909 / 0.746068 best_acc=0.993858
|
76 |
+
now: 2022-04-02 05:44:43.967440
|
77 |
+
[7600] train_loss=0.0282732 valid_loss=0.03368 valid_pos_acc=0.967292 valid_acc=0.993014 / 0.86581 / 0.745753 best_acc=0.993858
|
78 |
+
now: 2022-04-02 05:56:45.436298
|
79 |
+
[7800] train_loss=0.0252737 valid_loss=0.0315786 valid_pos_acc=0.967611 valid_acc=0.993799 / 0.869773 / 0.749114 best_acc=0.993858
|
80 |
+
now: 2022-04-02 06:08:51.140922
|
81 |
+
[8000] train_loss=0.0280509 valid_loss=0.0328118 valid_pos_acc=0.96732 valid_acc=0.99363 / 0.86537 / 0.74611 best_acc=0.993858
|
82 |
+
now: 2022-04-02 06:20:43.247091
|
83 |
+
[8200] train_loss=0.028321 valid_loss=0.0308812 valid_pos_acc=0.968106 valid_acc=0.993758 / 0.869684 / 0.751653 best_acc=0.993858
|
84 |
+
now: 2022-04-02 06:32:38.603877
|
85 |
+
[8400] train_loss=0.0271253 valid_loss=0.0326289 valid_pos_acc=0.968232 valid_acc=0.993426 / 0.869263 / 0.748637 best_acc=0.993858
|
86 |
+
now: 2022-04-02 06:44:45.010090
|
87 |
+
[8600] train_loss=0.02778 valid_loss=0.0308819 valid_pos_acc=0.968731 valid_acc=0.993693 / 0.87573 / 0.75794 best_acc=0.993858
|
88 |
+
now: 2022-04-02 06:56:45.886905
|
89 |
+
[8800] train_loss=0.0287492 valid_loss=0.0310371 valid_pos_acc=0.968256 valid_acc=0.993563 / 0.877011 / 0.759391 best_acc=0.993858
|
90 |
+
now: 2022-04-02 07:08:52.584840
|
91 |
+
[9000] train_loss=0.0281025 valid_loss=0.0297675 valid_pos_acc=0.968566 valid_acc=0.993979 / 0.866884 / 0.750877 best_acc=0.993979
|
92 |
+
now: 2022-04-02 07:21:04.827592
|
93 |
+
[9200] train_loss=0.026893 valid_loss=0.0310813 valid_pos_acc=0.968965 valid_acc=0.993758 / 0.869433 / 0.752492 best_acc=0.993979
|
94 |
+
now: 2022-04-02 07:33:11.165254
|
95 |
+
[9400] train_loss=0.0253738 valid_loss=0.0307835 valid_pos_acc=0.969295 valid_acc=0.994046 / 0.878856 / 0.754636 best_acc=0.994046
|
96 |
+
now: 2022-04-02 07:45:16.521889
|
97 |
+
[9600] train_loss=0.0263703 valid_loss=0.0308493 valid_pos_acc=0.969039 valid_acc=0.993986 / 0.873759 / 0.753114 best_acc=0.994046
|
98 |
+
now: 2022-04-02 07:57:19.055032
|
99 |
+
[9800] train_loss=0.0258709 valid_loss=0.0304116 valid_pos_acc=0.967514 valid_acc=0.993751 / 0.87442 / 0.760402 best_acc=0.994046
|
100 |
+
now: 2022-04-02 08:09:21.455984
|
101 |
+
[10000] train_loss=0.0261966 valid_loss=0.0310479 valid_pos_acc=0.968954 valid_acc=0.993786 / 0.879653 / 0.76128 best_acc=0.994046
|
102 |
+
now: 2022-04-02 08:21:30.441155
|
103 |
+
[10200] train_loss=0.0272568 valid_loss=0.0306756 valid_pos_acc=0.969087 valid_acc=0.993777 / 0.879809 / 0.760943 best_acc=0.994046
|
104 |
+
now: 2022-04-02 08:33:36.839764
|
105 |
+
[10400] train_loss=0.027559 valid_loss=0.0308756 valid_pos_acc=0.969366 valid_acc=0.993636 / 0.874443 / 0.755992 best_acc=0.994046
|
106 |
+
now: 2022-04-02 08:45:39.747008
|
107 |
+
[10600] train_loss=0.027269 valid_loss=0.0329513 valid_pos_acc=0.96888 valid_acc=0.992499 / 0.833803 / 0.727518 best_acc=0.994046
|
108 |
+
now: 2022-04-02 08:57:40.273311
|
109 |
+
[10800] train_loss=0.0255775 valid_loss=0.0318773 valid_pos_acc=0.969314 valid_acc=0.993576 / 0.865286 / 0.745435 best_acc=0.994046
|
110 |
+
now: 2022-04-02 09:09:28.232166
|
111 |
+
[11000] train_loss=0.027821 valid_loss=0.0324836 valid_pos_acc=0.969581 valid_acc=0.993517 / 0.858519 / 0.741976 best_acc=0.994046
|
112 |
+
now: 2022-04-02 09:21:18.956995
|
113 |
+
[11200] train_loss=0.0268467 valid_loss=0.0320919 valid_pos_acc=0.968768 valid_acc=0.993515 / 0.859293 / 0.743348 best_acc=0.994046
|
114 |
+
now: 2022-04-02 09:33:14.899728
|
115 |
+
[11400] train_loss=0.0277983 valid_loss=0.0304641 valid_pos_acc=0.969013 valid_acc=0.993803 / 0.857242 / 0.744822 best_acc=0.994046
|
116 |
+
now: 2022-04-02 09:45:20.431378
|
117 |
+
[11600] train_loss=0.0278141 valid_loss=0.0303669 valid_pos_acc=0.969312 valid_acc=0.993491 / 0.861739 / 0.751563 best_acc=0.994046
|
118 |
+
now: 2022-04-02 09:57:29.453034
|
119 |
+
[11800] train_loss=0.0272102 valid_loss=0.030325 valid_pos_acc=0.969045 valid_acc=0.993871 / 0.865131 / 0.753686 best_acc=0.994046
|
120 |
+
now: 2022-04-02 10:09:41.097392
|
121 |
+
[12000] train_loss=0.0271826 valid_loss=0.0302743 valid_pos_acc=0.969701 valid_acc=0.993645 / 0.865918 / 0.753973 best_acc=0.994046
|
122 |
+
now: 2022-04-02 10:21:38.263361
|
123 |
+
[12200] train_loss=0.0266099 valid_loss=0.0288773 valid_pos_acc=0.969902 valid_acc=0.994183 / 0.874035 / 0.76018 best_acc=0.994183
|
124 |
+
now: 2022-04-02 10:33:44.432773
|
125 |
+
[12400] train_loss=0.0252403 valid_loss=0.029718 valid_pos_acc=0.969434 valid_acc=0.994127 / 0.87762 / 0.756971 best_acc=0.994183
|
126 |
+
now: 2022-04-02 10:45:51.265489
|
127 |
+
[12600] train_loss=0.0247018 valid_loss=0.0312226 valid_pos_acc=0.969588 valid_acc=0.993641 / 0.881627 / 0.759701 best_acc=0.994183
|
128 |
+
now: 2022-04-02 10:57:42.967866
|
129 |
+
[12800] train_loss=0.0269899 valid_loss=0.0291686 valid_pos_acc=0.969384 valid_acc=0.994157 / 0.883137 / 0.766173 best_acc=0.994183
|
130 |
+
now: 2022-04-02 11:09:44.697867
|
131 |
+
[13000] train_loss=0.026225 valid_loss=0.0300868 valid_pos_acc=0.969607 valid_acc=0.993756 / 0.881478 / 0.764432 best_acc=0.994183
|
132 |
+
now: 2022-04-02 11:21:47.303966
|
133 |
+
[13200] train_loss=0.0251707 valid_loss=0.0292528 valid_pos_acc=0.969937 valid_acc=0.994205 / 0.882159 / 0.764326 best_acc=0.994205
|
134 |
+
now: 2022-04-02 11:34:07.084506
|
135 |
+
[13400] train_loss=0.0256715 valid_loss=0.0294879 valid_pos_acc=0.969972 valid_acc=0.994331 / 0.879444 / 0.763116 best_acc=0.994331
|
136 |
+
now: 2022-04-02 11:46:17.499315
|
137 |
+
[13600] train_loss=0.0266713 valid_loss=0.0307474 valid_pos_acc=0.968488 valid_acc=0.994092 / 0.88213 / 0.764678 best_acc=0.994331
|
138 |
+
now: 2022-04-02 11:58:29.053919
|
139 |
+
[13800] train_loss=0.0263307 valid_loss=0.0299171 valid_pos_acc=0.969182 valid_acc=0.994146 / 0.886646 / 0.766393 best_acc=0.994331
|
140 |
+
now: 2022-04-02 12:10:42.628035
|
141 |
+
[14000] train_loss=0.0254249 valid_loss=0.0291546 valid_pos_acc=0.970087 valid_acc=0.994083 / 0.884958 / 0.766853 best_acc=0.994331
|
142 |
+
now: 2022-04-02 12:22:47.459592
|
143 |
+
[14200] train_loss=0.0271003 valid_loss=0.0289376 valid_pos_acc=0.969803 valid_acc=0.994263 / 0.882713 / 0.765841 best_acc=0.994331
|
144 |
+
now: 2022-04-02 12:34:51.459159
|
145 |
+
[14400] train_loss=0.0253207 valid_loss=0.0284943 valid_pos_acc=0.97041 valid_acc=0.994483 / 0.883065 / 0.768558 best_acc=0.994483
|
146 |
+
now: 2022-04-02 12:47:03.082143
|
147 |
+
[14600] train_loss=0.0256933 valid_loss=0.0275894 valid_pos_acc=0.970781 valid_acc=0.994426 / 0.882073 / 0.768093 best_acc=0.994483
|
148 |
+
now: 2022-04-02 12:59:01.736374
|
149 |
+
[14800] train_loss=0.025288 valid_loss=0.0290729 valid_pos_acc=0.969898 valid_acc=0.994029 / 0.884465 / 0.764771 best_acc=0.994483
|
150 |
+
now: 2022-04-02 13:11:07.383148
|
151 |
+
[15000] train_loss=0.0254068 valid_loss=0.0292592 valid_pos_acc=0.970833 valid_acc=0.994096 / 0.888942 / 0.769842 best_acc=0.994483
|
152 |
+
now: 2022-04-02 13:23:22.842378
|
153 |
+
[15200] train_loss=0.0236412 valid_loss=0.0282784 valid_pos_acc=0.970644 valid_acc=0.994229 / 0.889311 / 0.768718 best_acc=0.994483
|
154 |
+
now: 2022-04-02 13:35:29.034906
|
155 |
+
[15400] train_loss=0.0243784 valid_loss=0.0292398 valid_pos_acc=0.970883 valid_acc=0.994187 / 0.889313 / 0.771062 best_acc=0.994483
|
156 |
+
now: 2022-04-02 13:47:31.205294
|
157 |
+
[15600] train_loss=0.0240879 valid_loss=0.0298062 valid_pos_acc=0.97036 valid_acc=0.994057 / 0.885163 / 0.767167 best_acc=0.994483
|
158 |
+
now: 2022-04-02 13:59:37.091807
|
159 |
+
[15800] train_loss=0.0241428 valid_loss=0.0298697 valid_pos_acc=0.97079 valid_acc=0.994118 / 0.886573 / 0.768139 best_acc=0.994483
|
160 |
+
now: 2022-04-02 14:11:45.813137
|
161 |
+
[16000] train_loss=0.0245795 valid_loss=0.028206 valid_pos_acc=0.970714 valid_acc=0.994365 / 0.895285 / 0.778247 best_acc=0.994483
|
162 |
+
now: 2022-04-02 14:23:31.259816
|
163 |
+
[16200] train_loss=0.0259529 valid_loss=0.0295037 valid_pos_acc=0.971532 valid_acc=0.994166 / 0.892761 / 0.773792 best_acc=0.994483
|
164 |
+
now: 2022-04-02 14:35:29.710419
|
165 |
+
[16400] train_loss=0.0245774 valid_loss=0.0282059 valid_pos_acc=0.970677 valid_acc=0.994159 / 0.892281 / 0.77366 best_acc=0.994483
|
166 |
+
now: 2022-04-02 14:47:28.883504
|
167 |
+
[16600] train_loss=0.0249353 valid_loss=0.0287864 valid_pos_acc=0.970805 valid_acc=0.994255 / 0.891979 / 0.776495 best_acc=0.994483
|
168 |
+
now: 2022-04-02 14:59:36.374751
|
169 |
+
[16800] train_loss=0.0266362 valid_loss=0.0283276 valid_pos_acc=0.970768 valid_acc=0.994281 / 0.898043 / 0.780269 best_acc=0.994483
|
170 |
+
now: 2022-04-02 15:11:40.341586
|
171 |
+
[17000] train_loss=0.0248526 valid_loss=0.0279962 valid_pos_acc=0.970482 valid_acc=0.994411 / 0.897109 / 0.780885 best_acc=0.994483
|
172 |
+
now: 2022-04-02 15:23:39.987145
|
173 |
+
[17200] train_loss=0.0237728 valid_loss=0.028023 valid_pos_acc=0.971417 valid_acc=0.994322 / 0.888697 / 0.776213 best_acc=0.994483
|
174 |
+
now: 2022-04-02 15:35:38.801398
|
175 |
+
[17400] train_loss=0.0249057 valid_loss=0.027339 valid_pos_acc=0.971389 valid_acc=0.994159 / 0.881219 / 0.768915 best_acc=0.994483
|
176 |
+
now: 2022-04-02 15:47:39.875724
|
177 |
+
[17600] train_loss=0.0246854 valid_loss=0.028386 valid_pos_acc=0.970968 valid_acc=0.994448 / 0.891969 / 0.775594 best_acc=0.994483
|
178 |
+
now: 2022-04-02 15:59:43.932068
|
179 |
+
[17800] train_loss=0.0264608 valid_loss=0.0281136 valid_pos_acc=0.971109 valid_acc=0.994465 / 0.896316 / 0.78033 best_acc=0.994483
|
180 |
+
now: 2022-04-02 16:11:42.780407
|
181 |
+
[18000] train_loss=0.0226492 valid_loss=0.0282867 valid_pos_acc=0.970959 valid_acc=0.994574 / 0.898661 / 0.782303 best_acc=0.994574
|
182 |
+
now: 2022-04-02 16:23:45.328393
|
183 |
+
[18200] train_loss=0.0253564 valid_loss=0.0272226 valid_pos_acc=0.971202 valid_acc=0.994485 / 0.894385 / 0.781905 best_acc=0.994574
|
184 |
+
now: 2022-04-02 16:35:43.743594
|
185 |
+
[18400] train_loss=0.0237427 valid_loss=0.0273525 valid_pos_acc=0.971284 valid_acc=0.994598 / 0.893183 / 0.778385 best_acc=0.994598
|
186 |
+
now: 2022-04-02 16:47:51.962569
|
187 |
+
[18600] train_loss=0.0226361 valid_loss=0.0275174 valid_pos_acc=0.971801 valid_acc=0.994608 / 0.897236 / 0.783469 best_acc=0.994608
|
188 |
+
now: 2022-04-02 17:00:05.072496
|
189 |
+
[18800] train_loss=0.0247811 valid_loss=0.0276029 valid_pos_acc=0.971766 valid_acc=0.994591 / 0.898223 / 0.780288 best_acc=0.994608
|
190 |
+
now: 2022-04-02 17:12:14.066971
|
191 |
+
[19000] train_loss=0.0249346 valid_loss=0.0269959 valid_pos_acc=0.9713 valid_acc=0.994433 / 0.889386 / 0.778011 best_acc=0.994608
|
192 |
+
now: 2022-04-02 17:24:25.443387
|
193 |
+
[19200] train_loss=0.024029 valid_loss=0.0273701 valid_pos_acc=0.971777 valid_acc=0.994565 / 0.89385 / 0.781098 best_acc=0.994608
|
194 |
+
now: 2022-04-02 17:36:21.119407
|
195 |
+
[19400] train_loss=0.0221598 valid_loss=0.028189 valid_pos_acc=0.971337 valid_acc=0.99447 / 0.892931 / 0.778729 best_acc=0.994608
|
196 |
+
now: 2022-04-02 17:48:26.051306
|
197 |
+
[19600] train_loss=0.0232854 valid_loss=0.027458 valid_pos_acc=0.97138 valid_acc=0.994535 / 0.892143 / 0.778963 best_acc=0.994608
|
198 |
+
now: 2022-04-02 18:00:41.153532
|
199 |
+
[19800] train_loss=0.0246367 valid_loss=0.0277884 valid_pos_acc=0.971415 valid_acc=0.994454 / 0.892544 / 0.777699 best_acc=0.994608
|
200 |
+
now: 2022-04-02 18:12:44.656831
|
201 |
+
[20000] train_loss=0.0193271 valid_loss=0.0288193 valid_pos_acc=0.97153 valid_acc=0.9945 / 0.89201 / 0.778198 best_acc=0.994608
|
202 |
+
now: 2022-04-02 18:24:47.237186
|
203 |
+
[20200] train_loss=0.0195292 valid_loss=0.0281468 valid_pos_acc=0.972115 valid_acc=0.99463 / 0.894395 / 0.782171 best_acc=0.99463
|
204 |
+
now: 2022-04-02 18:37:12.174319
|
205 |
+
[20400] train_loss=0.0194709 valid_loss=0.0272298 valid_pos_acc=0.971836 valid_acc=0.994686 / 0.901333 / 0.789041 best_acc=0.994686
|
206 |
+
now: 2022-04-02 18:49:37.572502
|
207 |
+
[20600] train_loss=0.0191372 valid_loss=0.0279598 valid_pos_acc=0.971139 valid_acc=0.994476 / 0.898648 / 0.786924 best_acc=0.994686
|
208 |
+
now: 2022-04-02 19:01:57.223764
|
209 |
+
[20800] train_loss=0.0191731 valid_loss=0.0283044 valid_pos_acc=0.971853 valid_acc=0.994665 / 0.906199 / 0.791619 best_acc=0.994686
|
210 |
+
now: 2022-04-02 19:14:03.413253
|
211 |
+
[21000] train_loss=0.0206041 valid_loss=0.0264156 valid_pos_acc=0.972094 valid_acc=0.994773 / 0.903901 / 0.792328 best_acc=0.994773
|
212 |
+
now: 2022-04-02 19:26:27.975422
|
213 |
+
[21200] train_loss=0.016823 valid_loss=0.0271615 valid_pos_acc=0.97184 valid_acc=0.99471 / 0.904068 / 0.790123 best_acc=0.994773
|
214 |
+
now: 2022-04-02 19:38:24.145598
|
215 |
+
[21400] train_loss=0.0205676 valid_loss=0.0285307 valid_pos_acc=0.971365 valid_acc=0.994502 / 0.899852 / 0.786174 best_acc=0.994773
|
216 |
+
now: 2022-04-02 19:50:32.570024
|
217 |
+
[21600] train_loss=0.0193456 valid_loss=0.0265744 valid_pos_acc=0.971851 valid_acc=0.994726 / 0.902354 / 0.791664 best_acc=0.994773
|
218 |
+
now: 2022-04-02 20:02:43.793933
|
219 |
+
[21800] train_loss=0.0202321 valid_loss=0.0266519 valid_pos_acc=0.971712 valid_acc=0.994788 / 0.900679 / 0.791454 best_acc=0.994788
|
220 |
+
now: 2022-04-02 20:14:55.874949
|
221 |
+
[22000] train_loss=0.0209497 valid_loss=0.0274821 valid_pos_acc=0.970824 valid_acc=0.99463 / 0.890966 / 0.780236 best_acc=0.994788
|
222 |
+
now: 2022-04-02 20:26:46.865536
|
223 |
+
[22200] train_loss=0.0212923 valid_loss=0.0270931 valid_pos_acc=0.971727 valid_acc=0.994695 / 0.893237 / 0.779557 best_acc=0.994788
|
224 |
+
now: 2022-04-02 20:38:52.796788
|
225 |
+
[22400] train_loss=0.021577 valid_loss=0.027899 valid_pos_acc=0.971914 valid_acc=0.994771 / 0.90721 / 0.790755 best_acc=0.994788
|
226 |
+
now: 2022-04-02 20:50:45.585864
|
227 |
+
[22600] train_loss=0.020184 valid_loss=0.0270084 valid_pos_acc=0.971606 valid_acc=0.994723 / 0.901765 / 0.793151 best_acc=0.994788
|
228 |
+
now: 2022-04-02 21:02:35.179860
|
229 |
+
[22800] train_loss=0.0210601 valid_loss=0.0255871 valid_pos_acc=0.972658 valid_acc=0.994784 / 0.89429 / 0.783486 best_acc=0.994788
|
230 |
+
now: 2022-04-02 21:14:32.385194
|
231 |
+
[23000] train_loss=0.0184889 valid_loss=0.0267728 valid_pos_acc=0.971968 valid_acc=0.994912 / 0.898744 / 0.787857 best_acc=0.994912
|
232 |
+
now: 2022-04-02 21:26:30.460311
|
233 |
+
[23200] train_loss=0.0196876 valid_loss=0.027992 valid_pos_acc=0.971883 valid_acc=0.99448 / 0.902701 / 0.789254 best_acc=0.994912
|
234 |
+
now: 2022-04-02 21:38:18.919183
|
235 |
+
[23400] train_loss=0.0192092 valid_loss=0.027083 valid_pos_acc=0.972339 valid_acc=0.994817 / 0.898885 / 0.789367 best_acc=0.994912
|
236 |
+
now: 2022-04-02 21:50:24.318095
|
237 |
+
[23600] train_loss=0.0205622 valid_loss=0.0268123 valid_pos_acc=0.972528 valid_acc=0.994867 / 0.898603 / 0.786304 best_acc=0.994912
|
238 |
+
now: 2022-04-02 22:02:15.239367
|
239 |
+
[23800] train_loss=0.0199384 valid_loss=0.0272734 valid_pos_acc=0.972411 valid_acc=0.99471 / 0.901925 / 0.78745 best_acc=0.994912
|
240 |
+
now: 2022-04-02 22:14:18.216068
|
241 |
+
[24000] train_loss=0.0178143 valid_loss=0.0279724 valid_pos_acc=0.971799 valid_acc=0.994817 / 0.902843 / 0.788122 best_acc=0.994912
|
242 |
+
now: 2022-04-02 22:26:08.869937
|
243 |
+
[24200] train_loss=0.0204505 valid_loss=0.0271799 valid_pos_acc=0.971603 valid_acc=0.994628 / 0.901783 / 0.790354 best_acc=0.994912
|
244 |
+
now: 2022-04-02 22:38:05.762235
|
245 |
+
[24400] train_loss=0.0196403 valid_loss=0.0279875 valid_pos_acc=0.971712 valid_acc=0.994673 / 0.90053 / 0.786413 best_acc=0.994912
|
246 |
+
now: 2022-04-02 22:49:58.498158
|
247 |
+
[24600] train_loss=0.0196382 valid_loss=0.0282003 valid_pos_acc=0.971235 valid_acc=0.994626 / 0.902809 / 0.787394 best_acc=0.994912
|
248 |
+
now: 2022-04-02 23:02:07.775949
|
249 |
+
[24800] train_loss=0.0193936 valid_loss=0.0267504 valid_pos_acc=0.972562 valid_acc=0.994665 / 0.902442 / 0.787082 best_acc=0.994912
|
250 |
+
now: 2022-04-02 23:14:14.202685
|
251 |
+
[25000] train_loss=0.0193612 valid_loss=0.0258062 valid_pos_acc=0.972654 valid_acc=0.994873 / 0.905545 / 0.794271 best_acc=0.994912
|
252 |
+
now: 2022-04-02 23:26:09.859008
|
253 |
+
[25200] train_loss=0.0190576 valid_loss=0.0271653 valid_pos_acc=0.972484 valid_acc=0.994906 / 0.909854 / 0.793673 best_acc=0.994912
|
254 |
+
now: 2022-04-02 23:38:05.448510
|
255 |
+
[25400] train_loss=0.0206686 valid_loss=0.0264603 valid_pos_acc=0.972343 valid_acc=0.994934 / 0.908615 / 0.793492 best_acc=0.994934
|
256 |
+
now: 2022-04-02 23:50:23.423237
|
257 |
+
[25600] train_loss=0.019207 valid_loss=0.0264518 valid_pos_acc=0.972285 valid_acc=0.994947 / 0.909035 / 0.793275 best_acc=0.994947
|
258 |
+
now: 2022-04-03 00:02:25.949756
|
259 |
+
[25800] train_loss=0.0204011 valid_loss=0.027376 valid_pos_acc=0.972029 valid_acc=0.994463 / 0.903614 / 0.787668 best_acc=0.994947
|
260 |
+
now: 2022-04-03 00:14:32.470694
|
261 |
+
[26000] train_loss=0.0198477 valid_loss=0.0271728 valid_pos_acc=0.972293 valid_acc=0.994845 / 0.909642 / 0.792003 best_acc=0.994947
|
262 |
+
now: 2022-04-03 00:26:30.440395
|
263 |
+
[26200] train_loss=0.0182446 valid_loss=0.0269998 valid_pos_acc=0.972797 valid_acc=0.994936 / 0.90766 / 0.791791 best_acc=0.994947
|
264 |
+
now: 2022-04-03 00:38:11.226201
|
265 |
+
[26400] train_loss=0.0188574 valid_loss=0.0277104 valid_pos_acc=0.971779 valid_acc=0.994897 / 0.91142 / 0.795515 best_acc=0.994947
|
266 |
+
now: 2022-04-03 00:50:19.930552
|
267 |
+
[26600] train_loss=0.0195086 valid_loss=0.0266913 valid_pos_acc=0.972246 valid_acc=0.994884 / 0.904172 / 0.791556 best_acc=0.994947
|
268 |
+
now: 2022-04-03 01:02:40.190107
|
269 |
+
[26800] train_loss=0.0204701 valid_loss=0.0262269 valid_pos_acc=0.972677 valid_acc=0.994895 / 0.901927 / 0.792913 best_acc=0.994947
|
270 |
+
now: 2022-04-03 01:14:37.242724
|
271 |
+
[27000] train_loss=0.0227353 valid_loss=0.0265531 valid_pos_acc=0.972556 valid_acc=0.994843 / 0.907322 / 0.797663 best_acc=0.994947
|
272 |
+
now: 2022-04-03 01:26:32.001134
|
273 |
+
[27200] train_loss=0.0199549 valid_loss=0.0256493 valid_pos_acc=0.972434 valid_acc=0.994821 / 0.907325 / 0.799249 best_acc=0.994947
|
274 |
+
now: 2022-04-03 01:38:35.194802
|
275 |
+
[27400] train_loss=0.0178435 valid_loss=0.0273656 valid_pos_acc=0.972109 valid_acc=0.994851 / 0.911329 / 0.795818 best_acc=0.994947
|
276 |
+
now: 2022-04-03 01:50:47.426096
|
277 |
+
[27600] train_loss=0.0196421 valid_loss=0.0268288 valid_pos_acc=0.972094 valid_acc=0.994532 / 0.90122 / 0.788515 best_acc=0.994947
|
278 |
+
now: 2022-04-03 02:02:43.219077
|
279 |
+
[27800] train_loss=0.0218845 valid_loss=0.0267693 valid_pos_acc=0.972343 valid_acc=0.994884 / 0.904811 / 0.790976 best_acc=0.994947
|
280 |
+
now: 2022-04-03 02:14:42.299297
|
281 |
+
[28000] train_loss=0.0201817 valid_loss=0.0273861 valid_pos_acc=0.972345 valid_acc=0.994552 / 0.904774 / 0.792625 best_acc=0.994947
|
282 |
+
now: 2022-04-03 02:26:51.221363
|
283 |
+
[28200] train_loss=0.0200979 valid_loss=0.026012 valid_pos_acc=0.972452 valid_acc=0.994995 / 0.908827 / 0.797558 best_acc=0.994995
|
284 |
+
now: 2022-04-03 02:39:03.091258
|
285 |
+
[28400] train_loss=0.0201783 valid_loss=0.0256883 valid_pos_acc=0.97261 valid_acc=0.994864 / 0.907562 / 0.798298 best_acc=0.994995
|
286 |
+
now: 2022-04-03 02:51:04.300548
|
287 |
+
[28600] train_loss=0.0194111 valid_loss=0.0273932 valid_pos_acc=0.971562 valid_acc=0.994906 / 0.911251 / 0.795228 best_acc=0.994995
|
288 |
+
now: 2022-04-03 03:03:10.729523
|
289 |
+
[28800] train_loss=0.0223215 valid_loss=0.0264798 valid_pos_acc=0.972521 valid_acc=0.994823 / 0.91054 / 0.798156 best_acc=0.994995
|
290 |
+
now: 2022-04-03 03:15:16.733585
|
291 |
+
[29000] train_loss=0.0203798 valid_loss=0.026267 valid_pos_acc=0.972539 valid_acc=0.994979 / 0.910358 / 0.797495 best_acc=0.994995
|
292 |
+
now: 2022-04-03 03:27:15.662423
|
293 |
+
[29200] train_loss=0.0198116 valid_loss=0.0271517 valid_pos_acc=0.972395 valid_acc=0.994804 / 0.90796 / 0.792402 best_acc=0.994995
|
294 |
+
now: 2022-04-03 03:39:25.302705
|
295 |
+
[29400] train_loss=0.0214404 valid_loss=0.0256727 valid_pos_acc=0.973261 valid_acc=0.994984 / 0.910775 / 0.799215 best_acc=0.994995
|
296 |
+
now: 2022-04-03 03:51:35.147315
|
297 |
+
[29600] train_loss=0.0187954 valid_loss=0.0264936 valid_pos_acc=0.97286 valid_acc=0.99499 / 0.91241 / 0.795951 best_acc=0.994995
|
298 |
+
now: 2022-04-03 04:03:37.804465
|
299 |
+
[29800] train_loss=0.0205343 valid_loss=0.0262582 valid_pos_acc=0.972979 valid_acc=0.994997 / 0.910193 / 0.797294 best_acc=0.994997
|
300 |
+
now: 2022-04-03 04:15:40.869840
|
301 |
+
[30000] train_loss=0.0212577 valid_loss=0.0263549 valid_pos_acc=0.972243 valid_acc=0.994684 / 0.909313 / 0.798057 best_acc=0.994997
|
302 |
+
now: 2022-04-03 04:27:42.822407
|
303 |
+
[30200] train_loss=0.0183862 valid_loss=0.0254776 valid_pos_acc=0.972784 valid_acc=0.994877 / 0.903545 / 0.791495 best_acc=0.994997
|
304 |
+
now: 2022-04-03 04:40:02.431413
|
305 |
+
[30400] train_loss=0.0197688 valid_loss=0.0264948 valid_pos_acc=0.973224 valid_acc=0.994997 / 0.911333 / 0.793087 best_acc=0.994997
|
306 |
+
now: 2022-04-03 04:52:02.140132
|
307 |
+
[30600] train_loss=0.0203299 valid_loss=0.0260468 valid_pos_acc=0.972966 valid_acc=0.994799 / 0.902308 / 0.791264 best_acc=0.994997
|
308 |
+
now: 2022-04-03 05:04:02.910278
|
309 |
+
[30800] train_loss=0.0216859 valid_loss=0.0270245 valid_pos_acc=0.972929 valid_acc=0.994676 / 0.910333 / 0.796835 best_acc=0.994997
|
310 |
+
now: 2022-04-03 05:16:05.153874
|
311 |
+
[31000] train_loss=0.0211773 valid_loss=0.0258814 valid_pos_acc=0.972768 valid_acc=0.994916 / 0.907256 / 0.793468 best_acc=0.994997
|
312 |
+
now: 2022-04-03 05:28:07.493337
|
313 |
+
[31200] train_loss=0.0202802 valid_loss=0.0254245 valid_pos_acc=0.972955 valid_acc=0.995092 / 0.914897 / 0.805234 best_acc=0.995092
|
314 |
+
now: 2022-04-03 05:40:12.780431
|
315 |
+
[31400] train_loss=0.0194152 valid_loss=0.0259566 valid_pos_acc=0.972664 valid_acc=0.99496 / 0.919699 / 0.808312 best_acc=0.995092
|
316 |
+
now: 2022-04-03 05:52:16.611374
|
317 |
+
[31600] train_loss=0.019009 valid_loss=0.0260382 valid_pos_acc=0.97263 valid_acc=0.995129 / 0.917311 / 0.804833 best_acc=0.995129
|
318 |
+
now: 2022-04-03 06:04:15.437060
|
319 |
+
[31800] train_loss=0.0205566 valid_loss=0.0260822 valid_pos_acc=0.973148 valid_acc=0.994979 / 0.919627 / 0.808927 best_acc=0.995129
|
320 |
+
now: 2022-04-03 06:16:10.198962
|
321 |
+
[32000] train_loss=0.0192955 valid_loss=0.0259151 valid_pos_acc=0.973142 valid_acc=0.995034 / 0.9158 / 0.804333 best_acc=0.995129
|
322 |
+
now: 2022-04-03 06:28:05.111366
|
323 |
+
[32200] train_loss=0.0204335 valid_loss=0.0255095 valid_pos_acc=0.972736 valid_acc=0.995073 / 0.916156 / 0.810316 best_acc=0.995129
|
324 |
+
now: 2022-04-03 06:40:09.460506
|
325 |
+
[32400] train_loss=0.0201043 valid_loss=0.0261998 valid_pos_acc=0.972545 valid_acc=0.994929 / 0.91379 / 0.802841 best_acc=0.995129
|
326 |
+
now: 2022-04-03 06:52:23.127760
|
327 |
+
[32600] train_loss=0.0180875 valid_loss=0.0249739 valid_pos_acc=0.973335 valid_acc=0.994997 / 0.912169 / 0.802927 best_acc=0.995129
|
328 |
+
now: 2022-04-03 07:04:31.009479
|
329 |
+
[32800] train_loss=0.0198901 valid_loss=0.0254487 valid_pos_acc=0.972621 valid_acc=0.995029 / 0.920603 / 0.80979 best_acc=0.995129
|
330 |
+
now: 2022-04-03 07:16:32.128110
|
331 |
+
[33000] train_loss=0.0208962 valid_loss=0.0254032 valid_pos_acc=0.972883 valid_acc=0.994979 / 0.909972 / 0.799621 best_acc=0.995129
|
332 |
+
now: 2022-04-03 07:28:44.400824
|
333 |
+
[33200] train_loss=0.0201999 valid_loss=0.0258948 valid_pos_acc=0.972847 valid_acc=0.994801 / 0.911759 / 0.799804 best_acc=0.995129
|
334 |
+
now: 2022-04-03 07:40:49.361680
|
335 |
+
[33400] train_loss=0.0217783 valid_loss=0.0256737 valid_pos_acc=0.973255 valid_acc=0.994951 / 0.914217 / 0.800689 best_acc=0.995129
|
336 |
+
now: 2022-04-03 07:52:50.822397
|
337 |
+
[33600] train_loss=0.0198491 valid_loss=0.0264241 valid_pos_acc=0.972823 valid_acc=0.994986 / 0.912473 / 0.801322 best_acc=0.995129
|
338 |
+
now: 2022-04-03 08:04:54.092732
|
339 |
+
[33800] train_loss=0.0221377 valid_loss=0.02493 valid_pos_acc=0.972877 valid_acc=0.994938 / 0.915101 / 0.804716 best_acc=0.995129
|
340 |
+
now: 2022-04-03 08:16:54.243602
|
341 |
+
[34000] train_loss=0.0213205 valid_loss=0.025545 valid_pos_acc=0.972677 valid_acc=0.994979 / 0.915158 / 0.805311 best_acc=0.995129
|
342 |
+
now: 2022-04-03 08:29:03.784710
|
343 |
+
[34200] train_loss=0.0192532 valid_loss=0.0251619 valid_pos_acc=0.97335 valid_acc=0.995099 / 0.916618 / 0.804815 best_acc=0.995129
|
344 |
+
now: 2022-04-03 08:41:15.345717
|
345 |
+
[34400] train_loss=0.0219833 valid_loss=0.0255126 valid_pos_acc=0.97335 valid_acc=0.995068 / 0.91454 / 0.801902 best_acc=0.995129
|
346 |
+
now: 2022-04-03 08:53:18.026172
|
347 |
+
[34600] train_loss=0.02057 valid_loss=0.0257689 valid_pos_acc=0.973476 valid_acc=0.995138 / 0.923171 / 0.810884 best_acc=0.995138
|
348 |
+
now: 2022-04-03 09:05:29.405654
|
349 |
+
[34800] train_loss=0.0212472 valid_loss=0.0260386 valid_pos_acc=0.973087 valid_acc=0.995238 / 0.919353 / 0.805834 best_acc=0.995238
|
350 |
+
now: 2022-04-03 09:17:41.688908
|
351 |
+
[35000] train_loss=0.0193925 valid_loss=0.02788 valid_pos_acc=0.972441 valid_acc=0.994587 / 0.915334 / 0.800469 best_acc=0.995238
|
352 |
+
now: 2022-04-03 09:29:44.816243
|
353 |
+
[35200] train_loss=0.0190577 valid_loss=0.0251073 valid_pos_acc=0.972968 valid_acc=0.995166 / 0.916023 / 0.808521 best_acc=0.995238
|
354 |
+
now: 2022-04-03 09:41:43.856892
|
355 |
+
[35400] train_loss=0.0225248 valid_loss=0.0244108 valid_pos_acc=0.973092 valid_acc=0.994988 / 0.911331 / 0.806775 best_acc=0.995238
|
356 |
+
now: 2022-04-03 09:53:53.841427
|
357 |
+
[35600] train_loss=0.0204164 valid_loss=0.0257028 valid_pos_acc=0.972951 valid_acc=0.994999 / 0.913099 / 0.803078 best_acc=0.995238
|
358 |
+
now: 2022-04-03 10:05:54.756481
|
359 |
+
[35800] train_loss=0.0207206 valid_loss=0.0250318 valid_pos_acc=0.973441 valid_acc=0.995112 / 0.915014 / 0.809688 best_acc=0.995238
|
360 |
+
now: 2022-04-03 10:17:52.373071
|
361 |
+
[36000] train_loss=0.0210285 valid_loss=0.0264345 valid_pos_acc=0.972836 valid_acc=0.994726 / 0.912637 / 0.802366 best_acc=0.995238
|
362 |
+
now: 2022-04-03 10:29:48.836136
|
363 |
+
[36200] train_loss=0.0210124 valid_loss=0.0253509 valid_pos_acc=0.972771 valid_acc=0.994962 / 0.908097 / 0.799097 best_acc=0.995238
|
364 |
+
now: 2022-04-03 10:41:53.650854
|
365 |
+
[36400] train_loss=0.0206329 valid_loss=0.0255921 valid_pos_acc=0.973576 valid_acc=0.995097 / 0.915445 / 0.807018 best_acc=0.995238
|
366 |
+
now: 2022-04-03 10:54:04.782468
|
367 |
+
[36600] train_loss=0.0190987 valid_loss=0.025047 valid_pos_acc=0.973278 valid_acc=0.99504 / 0.911809 / 0.804772 best_acc=0.995238
|
368 |
+
now: 2022-04-03 11:06:14.982105
|
369 |
+
[36800] train_loss=0.0193329 valid_loss=0.0255344 valid_pos_acc=0.973205 valid_acc=0.994995 / 0.914842 / 0.810892 best_acc=0.995238
|
370 |
+
now: 2022-04-03 11:18:21.542298
|
371 |
+
[37000] train_loss=0.019776 valid_loss=0.0257551 valid_pos_acc=0.973228 valid_acc=0.995025 / 0.911626 / 0.801857 best_acc=0.995238
|
372 |
+
now: 2022-04-03 11:30:14.909051
|
373 |
+
[37200] train_loss=0.0203762 valid_loss=0.0253398 valid_pos_acc=0.973005 valid_acc=0.995255 / 0.91017 / 0.804603 best_acc=0.995255
|
374 |
+
now: 2022-04-03 11:42:31.753627
|
375 |
+
[37400] train_loss=0.0188329 valid_loss=0.0251868 valid_pos_acc=0.973304 valid_acc=0.995248 / 0.915374 / 0.809239 best_acc=0.995255
|
376 |
+
now: 2022-04-03 11:54:33.700994
|
377 |
+
[37600] train_loss=0.0183661 valid_loss=0.0254057 valid_pos_acc=0.973443 valid_acc=0.995225 / 0.914564 / 0.805779 best_acc=0.995255
|
378 |
+
now: 2022-04-03 12:06:21.313158
|
379 |
+
[37800] train_loss=0.0211401 valid_loss=0.0246414 valid_pos_acc=0.973185 valid_acc=0.995164 / 0.914028 / 0.805807 best_acc=0.995255
|
380 |
+
now: 2022-04-03 12:18:25.200242
|
381 |
+
[38000] train_loss=0.021069 valid_loss=0.0244758 valid_pos_acc=0.973411 valid_acc=0.995285 / 0.914013 / 0.806843 best_acc=0.995285
|
382 |
+
now: 2022-04-03 12:30:27.765559
|
383 |
+
[38200] train_loss=0.0213957 valid_loss=0.0233822 valid_pos_acc=0.973997 valid_acc=0.995231 / 0.918159 / 0.809317 best_acc=0.995285
|
384 |
+
now: 2022-04-03 12:42:34.527640
|
385 |
+
[38400] train_loss=0.01998 valid_loss=0.0246681 valid_pos_acc=0.973157 valid_acc=0.995144 / 0.911465 / 0.802431 best_acc=0.995285
|
386 |
+
now: 2022-04-03 12:54:35.718367
|
387 |
+
[38600] train_loss=0.0219669 valid_loss=0.0265307 valid_pos_acc=0.973142 valid_acc=0.994856 / 0.913512 / 0.802637 best_acc=0.995285
|
388 |
+
now: 2022-04-03 13:06:31.081322
|
389 |
+
[38800] train_loss=0.0200181 valid_loss=0.0254716 valid_pos_acc=0.972988 valid_acc=0.995194 / 0.918644 / 0.809186 best_acc=0.995285
|
390 |
+
now: 2022-04-03 13:18:35.624914
|
391 |
+
[39000] train_loss=0.0210553 valid_loss=0.0242162 valid_pos_acc=0.973558 valid_acc=0.995114 / 0.914893 / 0.807304 best_acc=0.995285
|
392 |
+
now: 2022-04-03 13:30:40.043785
|
393 |
+
[39200] train_loss=0.0183049 valid_loss=0.0254598 valid_pos_acc=0.973745 valid_acc=0.995105 / 0.921543 / 0.812539 best_acc=0.995285
|
394 |
+
now: 2022-04-03 13:42:42.189633
|
395 |
+
[39400] train_loss=0.0204597 valid_loss=0.0242165 valid_pos_acc=0.973517 valid_acc=0.995151 / 0.916498 / 0.808433 best_acc=0.995285
|
396 |
+
now: 2022-04-03 13:54:41.379549
|
397 |
+
[39600] train_loss=0.0182456 valid_loss=0.0259895 valid_pos_acc=0.97315 valid_acc=0.995231 / 0.922271 / 0.812708 best_acc=0.995285
|
398 |
+
now: 2022-04-03 14:06:51.554192
|
399 |
+
[39800] train_loss=0.0163934 valid_loss=0.0258782 valid_pos_acc=0.973615 valid_acc=0.995144 / 0.916559 / 0.809627 best_acc=0.995285
|
400 |
+
now: 2022-04-03 14:18:44.881390
|
401 |
+
[40000] train_loss=0.015392 valid_loss=0.0261389 valid_pos_acc=0.974283 valid_acc=0.995225 / 0.920042 / 0.812848 best_acc=0.995285
|
402 |
+
now: 2022-04-03 14:30:42.900837
|
403 |
+
[40200] train_loss=0.0151938 valid_loss=0.0266225 valid_pos_acc=0.973879 valid_acc=0.995105 / 0.921502 / 0.815619 best_acc=0.995285
|
404 |
+
now: 2022-04-03 14:42:37.709297
|
405 |
+
[40400] train_loss=0.0156664 valid_loss=0.0252791 valid_pos_acc=0.973784 valid_acc=0.99524 / 0.920868 / 0.819319 best_acc=0.995285
|
406 |
+
now: 2022-04-03 14:54:43.815772
|
407 |
+
[40600] train_loss=0.0156026 valid_loss=0.0251239 valid_pos_acc=0.973782 valid_acc=0.995305 / 0.916191 / 0.813418 best_acc=0.995305
|
408 |
+
now: 2022-04-03 15:06:48.756040
|
409 |
+
[40800] train_loss=0.015617 valid_loss=0.0250889 valid_pos_acc=0.973797 valid_acc=0.995248 / 0.913875 / 0.809349 best_acc=0.995305
|
410 |
+
now: 2022-04-03 15:18:48.093498
|
411 |
+
[41000] train_loss=0.0160921 valid_loss=0.0254156 valid_pos_acc=0.974016 valid_acc=0.995324 / 0.918261 / 0.811825 best_acc=0.995324
|
412 |
+
now: 2022-04-03 15:30:49.330639
|
413 |
+
[41200] train_loss=0.0153382 valid_loss=0.0253904 valid_pos_acc=0.973777 valid_acc=0.995157 / 0.915275 / 0.813835 best_acc=0.995324
|
414 |
+
now: 2022-04-03 15:42:50.366939
|
415 |
+
[41400] train_loss=0.0149767 valid_loss=0.0262346 valid_pos_acc=0.97348 valid_acc=0.995216 / 0.916934 / 0.809716 best_acc=0.995324
|
416 |
+
now: 2022-04-03 15:54:50.211349
|
417 |
+
[41600] train_loss=0.0163188 valid_loss=0.0256865 valid_pos_acc=0.974062 valid_acc=0.995279 / 0.917293 / 0.810614 best_acc=0.995324
|
418 |
+
now: 2022-04-03 16:06:51.678378
|
419 |
+
[41800] train_loss=0.0152591 valid_loss=0.0257784 valid_pos_acc=0.973934 valid_acc=0.995374 / 0.918737 / 0.814463 best_acc=0.995374
|
420 |
+
now: 2022-04-03 16:19:01.215393
|
421 |
+
[42000] train_loss=0.0153742 valid_loss=0.0256425 valid_pos_acc=0.973743 valid_acc=0.995279 / 0.921392 / 0.819395 best_acc=0.995374
|
422 |
+
now: 2022-04-03 16:31:02.965955
|
423 |
+
[42200] train_loss=0.0170421 valid_loss=0.0256818 valid_pos_acc=0.973704 valid_acc=0.995233 / 0.921882 / 0.816565 best_acc=0.995374
|
424 |
+
now: 2022-04-03 16:42:58.346109
|
425 |
+
[42400] train_loss=0.0173119 valid_loss=0.0262359 valid_pos_acc=0.973567 valid_acc=0.995142 / 0.927474 / 0.824186 best_acc=0.995374
|
426 |
+
now: 2022-04-03 16:54:51.654450
|
427 |
+
[42600] train_loss=0.0151309 valid_loss=0.0263674 valid_pos_acc=0.974088 valid_acc=0.995246 / 0.928593 / 0.819351 best_acc=0.995374
|
428 |
+
now: 2022-04-03 17:06:53.863013
|
429 |
+
[42800] train_loss=0.0146644 valid_loss=0.0256878 valid_pos_acc=0.973406 valid_acc=0.995248 / 0.926729 / 0.822369 best_acc=0.995374
|
430 |
+
now: 2022-04-03 17:18:52.584371
|
431 |
+
[43000] train_loss=0.0165593 valid_loss=0.0256607 valid_pos_acc=0.973534 valid_acc=0.995077 / 0.92122 / 0.820859 best_acc=0.995374
|
432 |
+
now: 2022-04-03 17:30:49.691185
|
433 |
+
[43200] train_loss=0.0159887 valid_loss=0.0257545 valid_pos_acc=0.973704 valid_acc=0.995084 / 0.91855 / 0.813994 best_acc=0.995374
|
434 |
+
now: 2022-04-03 17:42:50.184875
|
435 |
+
[43400] train_loss=0.0176695 valid_loss=0.0257385 valid_pos_acc=0.973474 valid_acc=0.995123 / 0.92218 / 0.818823 best_acc=0.995374
|
436 |
+
now: 2022-04-03 17:54:44.709886
|
437 |
+
[43600] train_loss=0.017015 valid_loss=0.0253947 valid_pos_acc=0.973645 valid_acc=0.995281 / 0.918643 / 0.814191 best_acc=0.995374
|
438 |
+
now: 2022-04-03 18:06:47.227964
|
439 |
+
[43800] train_loss=0.0166192 valid_loss=0.0250654 valid_pos_acc=0.973309 valid_acc=0.995097 / 0.917002 / 0.81292 best_acc=0.995374
|
440 |
+
now: 2022-04-03 18:18:48.239180
|
441 |
+
[44000] train_loss=0.0179612 valid_loss=0.0247244 valid_pos_acc=0.974474 valid_acc=0.995259 / 0.922084 / 0.818199 best_acc=0.995374
|
442 |
+
now: 2022-04-03 18:30:58.780112
|
443 |
+
[44200] train_loss=0.0170823 valid_loss=0.0253463 valid_pos_acc=0.973673 valid_acc=0.995277 / 0.920828 / 0.819238 best_acc=0.995374
|
444 |
+
now: 2022-04-03 18:43:02.816823
|
445 |
+
[44400] train_loss=0.0180965 valid_loss=0.0255119 valid_pos_acc=0.973356 valid_acc=0.994997 / 0.920482 / 0.821315 best_acc=0.995374
|
446 |
+
now: 2022-04-03 18:55:04.839467
|
447 |
+
[44600] train_loss=0.0169638 valid_loss=0.0253875 valid_pos_acc=0.973916 valid_acc=0.995218 / 0.923398 / 0.819969 best_acc=0.995374
|
448 |
+
now: 2022-04-03 19:07:07.807908
|
449 |
+
[44800] train_loss=0.0175315 valid_loss=0.0250045 valid_pos_acc=0.973931 valid_acc=0.995225 / 0.919188 / 0.816001 best_acc=0.995374
|
450 |
+
now: 2022-04-03 19:18:53.383956
|
451 |
+
[45000] train_loss=0.0159776 valid_loss=0.0247916 valid_pos_acc=0.973947 valid_acc=0.99527 / 0.922533 / 0.819024 best_acc=0.995374
|
452 |
+
now: 2022-04-03 19:30:53.526194
|
453 |
+
[45200] train_loss=0.0171182 valid_loss=0.0256464 valid_pos_acc=0.97397 valid_acc=0.995225 / 0.923638 / 0.816987 best_acc=0.995374
|
454 |
+
now: 2022-04-03 19:42:50.236892
|
455 |
+
[45400] train_loss=0.0156089 valid_loss=0.0251274 valid_pos_acc=0.97379 valid_acc=0.995353 / 0.920763 / 0.815259 best_acc=0.995374
|
456 |
+
now: 2022-04-03 19:54:46.588740
|
457 |
+
[45600] train_loss=0.0163814 valid_loss=0.0255403 valid_pos_acc=0.973895 valid_acc=0.995298 / 0.923606 / 0.816856 best_acc=0.995374
|
458 |
+
now: 2022-04-03 20:06:53.199451
|
459 |
+
[45800] train_loss=0.017835 valid_loss=0.0246602 valid_pos_acc=0.973981 valid_acc=0.995372 / 0.923306 / 0.821397 best_acc=0.995374
|
460 |
+
now: 2022-04-03 20:19:00.767866
|
461 |
+
[46000] train_loss=0.0178728 valid_loss=0.025165 valid_pos_acc=0.973771 valid_acc=0.995149 / 0.922218 / 0.817612 best_acc=0.995374
|
462 |
+
now: 2022-04-03 20:30:58.988089
|
463 |
+
[46200] train_loss=0.0168901 valid_loss=0.0256853 valid_pos_acc=0.974276 valid_acc=0.995216 / 0.923973 / 0.820527 best_acc=0.995374
|
464 |
+
now: 2022-04-03 20:42:51.449300
|
465 |
+
[46400] train_loss=0.0167886 valid_loss=0.0253529 valid_pos_acc=0.973704 valid_acc=0.995159 / 0.92335 / 0.820036 best_acc=0.995374
|
466 |
+
now: 2022-04-03 20:55:00.160971
|
467 |
+
[46600] train_loss=0.0176656 valid_loss=0.0256036 valid_pos_acc=0.973929 valid_acc=0.995366 / 0.922949 / 0.818223 best_acc=0.995374
|
468 |
+
now: 2022-04-03 21:07:02.579327
|
469 |
+
[46800] train_loss=0.0168645 valid_loss=0.0251908 valid_pos_acc=0.974203 valid_acc=0.995385 / 0.92207 / 0.81494 best_acc=0.995385
|
470 |
+
now: 2022-04-03 21:18:57.696871
|
471 |
+
[47000] train_loss=0.0172549 valid_loss=0.0256528 valid_pos_acc=0.974391 valid_acc=0.995118 / 0.918959 / 0.811756 best_acc=0.995385
|
472 |
+
now: 2022-04-03 21:30:53.620916
|
473 |
+
[47200] train_loss=0.0177735 valid_loss=0.0247787 valid_pos_acc=0.97404 valid_acc=0.995071 / 0.922916 / 0.818151 best_acc=0.995385
|
474 |
+
now: 2022-04-03 21:42:47.443922
|
475 |
+
[47400] train_loss=0.0168849 valid_loss=0.0250654 valid_pos_acc=0.973947 valid_acc=0.995368 / 0.919669 / 0.817169 best_acc=0.995385
|
476 |
+
now: 2022-04-03 21:54:53.531320
|
477 |
+
[47600] train_loss=0.0162995 valid_loss=0.0245945 valid_pos_acc=0.973951 valid_acc=0.995285 / 0.922128 / 0.819555 best_acc=0.995385
|
478 |
+
now: 2022-04-03 22:06:54.960049
|
479 |
+
[47800] train_loss=0.0166094 valid_loss=0.0254666 valid_pos_acc=0.974647 valid_acc=0.995314 / 0.925199 / 0.818337 best_acc=0.995385
|
480 |
+
now: 2022-04-03 22:18:44.813398
|
481 |
+
[48000] train_loss=0.018357 valid_loss=0.0258162 valid_pos_acc=0.974018 valid_acc=0.99527 / 0.924671 / 0.820527 best_acc=0.995385
|
482 |
+
now: 2022-04-03 22:30:33.755723
|
483 |
+
[48200] train_loss=0.0168674 valid_loss=0.025839 valid_pos_acc=0.973747 valid_acc=0.995309 / 0.921703 / 0.813788 best_acc=0.995385
|
484 |
+
now: 2022-04-03 22:42:23.398005
|
485 |
+
[48400] train_loss=0.016813 valid_loss=0.0248057 valid_pos_acc=0.973541 valid_acc=0.995385 / 0.92499 / 0.824521 best_acc=0.995385
|
486 |
+
now: 2022-04-03 22:54:25.099880
|
487 |
+
[48600] train_loss=0.016574 valid_loss=0.0255942 valid_pos_acc=0.973608 valid_acc=0.995177 / 0.925379 / 0.821092 best_acc=0.995385
|
488 |
+
now: 2022-04-03 23:06:29.088401
|
489 |
+
[48800] train_loss=0.0164469 valid_loss=0.025258 valid_pos_acc=0.973758 valid_acc=0.995409 / 0.924966 / 0.819685 best_acc=0.995409
|
490 |
+
now: 2022-04-03 23:18:37.642881
|
491 |
+
[49000] train_loss=0.0179612 valid_loss=0.0246981 valid_pos_acc=0.974266 valid_acc=0.995318 / 0.920644 / 0.816254 best_acc=0.995409
|
492 |
+
now: 2022-04-03 23:30:33.455618
|
493 |
+
[49200] train_loss=0.0163115 valid_loss=0.025103 valid_pos_acc=0.973717 valid_acc=0.995405 / 0.917957 / 0.814416 best_acc=0.995409
|
494 |
+
now: 2022-04-03 23:42:28.866307
|
495 |
+
[49400] train_loss=0.0171099 valid_loss=0.0263086 valid_pos_acc=0.973556 valid_acc=0.995151 / 0.91743 / 0.810825 best_acc=0.995409
|
496 |
+
now: 2022-04-03 23:54:18.959772
|
497 |
+
[49600] train_loss=0.0189349 valid_loss=0.0246903 valid_pos_acc=0.974151 valid_acc=0.995272 / 0.92322 / 0.818726 best_acc=0.995409
|
498 |
+
now: 2022-04-04 00:06:15.909786
|
499 |
+
[49800] train_loss=0.0167492 valid_loss=0.025506 valid_pos_acc=0.974257 valid_acc=0.995439 / 0.925636 / 0.815986 best_acc=0.995439
|
500 |
+
now: 2022-04-04 00:18:24.239516
|
501 |
+
[50000] train_loss=0.0176046 valid_loss=0.024858 valid_pos_acc=0.974309 valid_acc=0.995346 / 0.930482 / 0.820187 best_acc=0.995439
|
502 |
+
now: 2022-04-04 00:30:15.506831
|
503 |
+
[50200] train_loss=0.0163029 valid_loss=0.0252902 valid_pos_acc=0.974383 valid_acc=0.995381 / 0.923226 / 0.818356 best_acc=0.995439
|
504 |
+
now: 2022-04-04 00:42:18.897896
|
505 |
+
[50400] train_loss=0.0171214 valid_loss=0.0246193 valid_pos_acc=0.974333 valid_acc=0.995368 / 0.921255 / 0.81669 best_acc=0.995439
|
506 |
+
now: 2022-04-04 00:54:13.841028
|
507 |
+
[50600] train_loss=0.0161805 valid_loss=0.0250435 valid_pos_acc=0.974437 valid_acc=0.995186 / 0.918271 / 0.813367 best_acc=0.995439
|
508 |
+
now: 2022-04-04 01:06:16.083462
|
509 |
+
[50800] train_loss=0.0179548 valid_loss=0.0245154 valid_pos_acc=0.974691 valid_acc=0.995259 / 0.922457 / 0.81699 best_acc=0.995439
|
510 |
+
now: 2022-04-04 01:18:13.790064
|
511 |
+
[51000] train_loss=0.0164793 valid_loss=0.0248721 valid_pos_acc=0.974378 valid_acc=0.995322 / 0.924732 / 0.817262 best_acc=0.995439
|
512 |
+
now: 2022-04-04 01:30:17.861722
|
513 |
+
[51200] train_loss=0.016939 valid_loss=0.0265039 valid_pos_acc=0.974007 valid_acc=0.995044 / 0.922191 / 0.811527 best_acc=0.995439
|
514 |
+
now: 2022-04-04 01:42:23.079103
|
515 |
+
[51400] train_loss=0.015836 valid_loss=0.0262405 valid_pos_acc=0.973289 valid_acc=0.995235 / 0.922246 / 0.817143 best_acc=0.995439
|
516 |
+
now: 2022-04-04 01:54:20.740833
|
517 |
+
[51600] train_loss=0.0175937 valid_loss=0.0250272 valid_pos_acc=0.973819 valid_acc=0.995214 / 0.925015 / 0.824207 best_acc=0.995439
|
518 |
+
now: 2022-04-04 02:06:12.246740
|
519 |
+
[51800] train_loss=0.0194151 valid_loss=0.0250101 valid_pos_acc=0.973567 valid_acc=0.995253 / 0.921771 / 0.822537 best_acc=0.995439
|
520 |
+
now: 2022-04-04 02:18:00.425728
|
521 |
+
[52000] train_loss=0.0175319 valid_loss=0.0252638 valid_pos_acc=0.973914 valid_acc=0.995235 / 0.92165 / 0.81825 best_acc=0.995439
|
522 |
+
now: 2022-04-04 02:30:00.527309
|
523 |
+
[52200] train_loss=0.0177649 valid_loss=0.0251296 valid_pos_acc=0.974302 valid_acc=0.995218 / 0.916528 / 0.820442 best_acc=0.995439
|
524 |
+
now: 2022-04-04 02:42:15.444828
|
525 |
+
[52400] train_loss=0.0142718 valid_loss=0.0261507 valid_pos_acc=0.974112 valid_acc=0.995218 / 0.926747 / 0.823267 best_acc=0.995439
|
526 |
+
now: 2022-04-04 02:54:22.534812
|
527 |
+
[52600] train_loss=0.0181186 valid_loss=0.024454 valid_pos_acc=0.974678 valid_acc=0.995266 / 0.920649 / 0.821322 best_acc=0.995439
|
528 |
+
now: 2022-04-04 03:06:28.190695
|
529 |
+
[52800] train_loss=0.0186914 valid_loss=0.0248696 valid_pos_acc=0.97445 valid_acc=0.995272 / 0.925439 / 0.824017 best_acc=0.995439
|
530 |
+
now: 2022-04-04 03:18:37.506758
|
531 |
+
[53000] train_loss=0.0180586 valid_loss=0.0249947 valid_pos_acc=0.973493 valid_acc=0.995246 / 0.922071 / 0.820278 best_acc=0.995439
|
532 |
+
now: 2022-04-04 03:30:42.193804
|
533 |
+
[53200] train_loss=0.0174878 valid_loss=0.0238657 valid_pos_acc=0.974348 valid_acc=0.995357 / 0.928082 / 0.822476 best_acc=0.995439
|
534 |
+
now: 2022-04-04 03:42:35.505948
|
535 |
+
[53400] train_loss=0.0175929 valid_loss=0.0238299 valid_pos_acc=0.974042 valid_acc=0.995331 / 0.921639 / 0.819943 best_acc=0.995439
|
536 |
+
now: 2022-04-04 03:54:28.949782
|
537 |
+
[53600] train_loss=0.0177671 valid_loss=0.0252258 valid_pos_acc=0.974318 valid_acc=0.995259 / 0.923376 / 0.819476 best_acc=0.995439
|
538 |
+
now: 2022-04-04 04:06:31.080946
|
539 |
+
[53800] train_loss=0.0181123 valid_loss=0.0245157 valid_pos_acc=0.974474 valid_acc=0.995344 / 0.926992 / 0.822282 best_acc=0.995439
|
540 |
+
now: 2022-04-04 04:18:31.824945
|
541 |
+
[54000] train_loss=0.0163909 valid_loss=0.0247618 valid_pos_acc=0.973955 valid_acc=0.995322 / 0.926681 / 0.819102 best_acc=0.995439
|
542 |
+
now: 2022-04-04 04:30:40.751754
|
543 |
+
[54200] train_loss=0.0182664 valid_loss=0.0249099 valid_pos_acc=0.973999 valid_acc=0.995418 / 0.928271 / 0.824931 best_acc=0.995439
|
544 |
+
now: 2022-04-04 04:42:35.399533
|
545 |
+
[54400] train_loss=0.0186873 valid_loss=0.0251008 valid_pos_acc=0.974205 valid_acc=0.995405 / 0.932078 / 0.826293 best_acc=0.995439
|
546 |
+
now: 2022-04-04 04:54:44.051651
|
547 |
+
[54600] train_loss=0.0176127 valid_loss=0.0242345 valid_pos_acc=0.97437 valid_acc=0.995394 / 0.926177 / 0.823775 best_acc=0.995439
|
548 |
+
now: 2022-04-04 05:06:49.804392
|
549 |
+
[54800] train_loss=0.0163823 valid_loss=0.0258135 valid_pos_acc=0.974289 valid_acc=0.995075 / 0.92924 / 0.822328 best_acc=0.995439
|
550 |
+
now: 2022-04-04 05:18:59.599641
|
551 |
+
[55000] train_loss=0.0179242 valid_loss=0.024379 valid_pos_acc=0.973957 valid_acc=0.995426 / 0.928179 / 0.828975 best_acc=0.995439
|
552 |
+
now: 2022-04-04 05:31:03.121724
|
553 |
+
[55200] train_loss=0.0184118 valid_loss=0.0241673 valid_pos_acc=0.974077 valid_acc=0.995316 / 0.929769 / 0.826971 best_acc=0.995439
|
554 |
+
now: 2022-04-04 05:43:02.794339
|
555 |
+
[55400] train_loss=0.0165821 valid_loss=0.0247912 valid_pos_acc=0.974302 valid_acc=0.995285 / 0.925138 / 0.817615 best_acc=0.995439
|
556 |
+
now: 2022-04-04 05:55:02.273212
|
557 |
+
[55600] train_loss=0.0186432 valid_loss=0.0251953 valid_pos_acc=0.974374 valid_acc=0.995231 / 0.926073 / 0.822135 best_acc=0.995439
|
558 |
+
now: 2022-04-04 06:07:03.296464
|
559 |
+
[55800] train_loss=0.0171283 valid_loss=0.0251252 valid_pos_acc=0.974962 valid_acc=0.995283 / 0.92335 / 0.820246 best_acc=0.995439
|
560 |
+
now: 2022-04-04 06:19:11.613771
|
561 |
+
[56000] train_loss=0.0186047 valid_loss=0.0247604 valid_pos_acc=0.97448 valid_acc=0.995277 / 0.927931 / 0.823571 best_acc=0.995439
|
562 |
+
now: 2022-04-04 06:31:02.274835
|
563 |
+
[56200] train_loss=0.0167043 valid_loss=0.0252192 valid_pos_acc=0.974166 valid_acc=0.995279 / 0.92564 / 0.822268 best_acc=0.995439
|
564 |
+
now: 2022-04-04 06:43:00.961416
|
565 |
+
[56400] train_loss=0.0193165 valid_loss=0.0252836 valid_pos_acc=0.974016 valid_acc=0.995268 / 0.9252 / 0.817183 best_acc=0.995439
|
566 |
+
now: 2022-04-04 06:55:01.506312
|
567 |
+
[56600] train_loss=0.0185221 valid_loss=0.0242256 valid_pos_acc=0.974535 valid_acc=0.995424 / 0.925529 / 0.821226 best_acc=0.995439
|
568 |
+
now: 2022-04-04 07:06:52.135175
|
569 |
+
[56800] train_loss=0.0171885 valid_loss=0.0252071 valid_pos_acc=0.974706 valid_acc=0.995227 / 0.924077 / 0.817878 best_acc=0.995439
|
570 |
+
now: 2022-04-04 07:18:50.316902
|
571 |
+
[57000] train_loss=0.0175959 valid_loss=0.0248842 valid_pos_acc=0.974235 valid_acc=0.995218 / 0.92808 / 0.823735 best_acc=0.995439
|
572 |
+
now: 2022-04-04 07:30:56.648830
|
573 |
+
[57200] train_loss=0.0171702 valid_loss=0.0248792 valid_pos_acc=0.974953 valid_acc=0.995392 / 0.927917 / 0.824835 best_acc=0.995439
|
574 |
+
now: 2022-04-04 07:43:00.234541
|
575 |
+
[57400] train_loss=0.0164997 valid_loss=0.0247215 valid_pos_acc=0.97473 valid_acc=0.995363 / 0.927725 / 0.822496 best_acc=0.995439
|
576 |
+
now: 2022-04-04 07:55:01.561685
|
577 |
+
[57600] train_loss=0.0175738 valid_loss=0.0246078 valid_pos_acc=0.974686 valid_acc=0.995413 / 0.9303 / 0.822759 best_acc=0.995439
|
578 |
+
now: 2022-04-04 08:07:03.437555
|
579 |
+
[57800] train_loss=0.0182974 valid_loss=0.0242409 valid_pos_acc=0.974461 valid_acc=0.995472 / 0.929586 / 0.827576 best_acc=0.995472
|
580 |
+
now: 2022-04-04 08:19:09.447609
|
581 |
+
[58000] train_loss=0.0175345 valid_loss=0.0243995 valid_pos_acc=0.974287 valid_acc=0.995515 / 0.93219 / 0.826722 best_acc=0.995515
|
582 |
+
now: 2022-04-04 08:31:08.680812
|
583 |
+
[58200] train_loss=0.0183634 valid_loss=0.0238503 valid_pos_acc=0.974051 valid_acc=0.995335 / 0.922815 / 0.821793 best_acc=0.995515
|
584 |
+
now: 2022-04-04 08:43:10.046244
|
585 |
+
[58400] train_loss=0.0163139 valid_loss=0.024504 valid_pos_acc=0.975008 valid_acc=0.995541 / 0.930931 / 0.823118 best_acc=0.995541
|
586 |
+
now: 2022-04-04 08:55:18.433955
|
587 |
+
[58600] train_loss=0.017843 valid_loss=0.0241767 valid_pos_acc=0.9746 valid_acc=0.995476 / 0.930497 / 0.832621 best_acc=0.995541
|
588 |
+
now: 2022-04-04 09:07:27.795974
|
589 |
+
[58800] train_loss=0.0176362 valid_loss=0.0243888 valid_pos_acc=0.974819 valid_acc=0.995457 / 0.929875 / 0.82877 best_acc=0.995541
|
590 |
+
now: 2022-04-04 09:19:29.918584
|
591 |
+
[59000] train_loss=0.017933 valid_loss=0.0242775 valid_pos_acc=0.974493 valid_acc=0.995507 / 0.930605 / 0.8321 best_acc=0.995541
|
592 |
+
now: 2022-04-04 09:31:33.387747
|
593 |
+
[59200] train_loss=0.0166545 valid_loss=0.0248671 valid_pos_acc=0.974669 valid_acc=0.995372 / 0.934492 / 0.832004 best_acc=0.995541
|
594 |
+
now: 2022-04-04 09:43:38.498786
|
595 |
+
[59400] train_loss=0.0144546 valid_loss=0.0250983 valid_pos_acc=0.974652 valid_acc=0.995494 / 0.93082 / 0.829281 best_acc=0.995541
|
596 |
+
now: 2022-04-04 09:55:40.006405
|
597 |
+
[59600] train_loss=0.0139316 valid_loss=0.02474 valid_pos_acc=0.974144 valid_acc=0.99545 / 0.932469 / 0.831644 best_acc=0.995541
|
598 |
+
now: 2022-04-04 10:07:46.456322
|
599 |
+
[59800] train_loss=0.0134991 valid_loss=0.0268217 valid_pos_acc=0.974183 valid_acc=0.995357 / 0.927956 / 0.825977 best_acc=0.995541
|
600 |
+
now: 2022-04-04 10:19:47.170813
|
601 |
+
[60000] train_loss=0.0152101 valid_loss=0.0253673 valid_pos_acc=0.974528 valid_acc=0.995463 / 0.932345 / 0.830733 best_acc=0.995541
|
602 |
+
now: 2022-04-04 10:31:54.967223
|
603 |
+
[60200] train_loss=0.0138335 valid_loss=0.0252645 valid_pos_acc=0.974877 valid_acc=0.995359 / 0.928942 / 0.827531 best_acc=0.995541
|
604 |
+
now: 2022-04-04 10:43:57.300159
|
605 |
+
[60400] train_loss=0.0136089 valid_loss=0.0252041 valid_pos_acc=0.974778 valid_acc=0.995353 / 0.92988 / 0.824825 best_acc=0.995541
|
606 |
+
now: 2022-04-04 10:55:50.855788
|
607 |
+
[60600] train_loss=0.013999 valid_loss=0.0255136 valid_pos_acc=0.975068 valid_acc=0.995389 / 0.93112 / 0.827874 best_acc=0.995541
|
608 |
+
now: 2022-04-04 11:07:45.181290
|
609 |
+
[60800] train_loss=0.0147645 valid_loss=0.024898 valid_pos_acc=0.974433 valid_acc=0.9954 / 0.932707 / 0.830214 best_acc=0.995541
|
610 |
+
now: 2022-04-04 11:19:31.882106
|
611 |
+
[61000] train_loss=0.0144584 valid_loss=0.0256239 valid_pos_acc=0.974613 valid_acc=0.995318 / 0.931253 / 0.826158 best_acc=0.995541
|
612 |
+
now: 2022-04-04 11:31:20.954761
|
613 |
+
[61200] train_loss=0.0148378 valid_loss=0.025611 valid_pos_acc=0.974391 valid_acc=0.995316 / 0.933647 / 0.831387 best_acc=0.995541
|
614 |
+
now: 2022-04-04 11:43:16.513341
|
615 |
+
[61400] train_loss=0.0164332 valid_loss=0.0245625 valid_pos_acc=0.974274 valid_acc=0.995533 / 0.933621 / 0.832726 best_acc=0.995541
|
616 |
+
now: 2022-04-04 11:55:19.484783
|
617 |
+
[61600] train_loss=0.0128714 valid_loss=0.0261392 valid_pos_acc=0.974322 valid_acc=0.995478 / 0.929096 / 0.826284 best_acc=0.995541
|
618 |
+
now: 2022-04-04 12:07:11.369487
|
619 |
+
[61800] train_loss=0.0133303 valid_loss=0.0250031 valid_pos_acc=0.974207 valid_acc=0.995429 / 0.929365 / 0.828611 best_acc=0.995541
|
620 |
+
now: 2022-04-04 12:19:18.413032
|
621 |
+
[62000] train_loss=0.015107 valid_loss=0.0254061 valid_pos_acc=0.974804 valid_acc=0.995526 / 0.9288 / 0.82914 best_acc=0.995541
|
622 |
+
now: 2022-04-04 12:31:20.055266
|
623 |
+
[62200] train_loss=0.0138232 valid_loss=0.0257889 valid_pos_acc=0.974927 valid_acc=0.99542 / 0.931311 / 0.830115 best_acc=0.995541
|
624 |
+
now: 2022-04-04 12:43:13.349393
|
625 |
+
[62400] train_loss=0.0141178 valid_loss=0.0256085 valid_pos_acc=0.97463 valid_acc=0.995348 / 0.926439 / 0.829561 best_acc=0.995541
|
626 |
+
now: 2022-04-04 12:55:00.906269
|
627 |
+
[62600] train_loss=0.0142263 valid_loss=0.0262747 valid_pos_acc=0.974962 valid_acc=0.995402 / 0.925902 / 0.82448 best_acc=0.995541
|
628 |
+
now: 2022-04-04 13:06:57.609555
|
629 |
+
[62800] train_loss=0.0147258 valid_loss=0.0259502 valid_pos_acc=0.974404 valid_acc=0.995504 / 0.928971 / 0.826749 best_acc=0.995541
|
630 |
+
now: 2022-04-04 13:18:51.020957
|
631 |
+
[63000] train_loss=0.0158923 valid_loss=0.0249411 valid_pos_acc=0.974795 valid_acc=0.995535 / 0.924258 / 0.825275 best_acc=0.995541
|
632 |
+
now: 2022-04-04 13:30:41.825005
|
633 |
+
[63200] train_loss=0.0129023 valid_loss=0.0256269 valid_pos_acc=0.974296 valid_acc=0.99553 / 0.924678 / 0.824117 best_acc=0.995541
|
634 |
+
now: 2022-04-04 13:42:34.233889
|
635 |
+
[63400] train_loss=0.0155354 valid_loss=0.0239082 valid_pos_acc=0.974923 valid_acc=0.995526 / 0.923666 / 0.830834 best_acc=0.995541
|
636 |
+
now: 2022-04-04 13:54:30.059726
|
637 |
+
[63600] train_loss=0.0151712 valid_loss=0.0252058 valid_pos_acc=0.97448 valid_acc=0.995481 / 0.926182 / 0.827018 best_acc=0.995541
|
638 |
+
now: 2022-04-04 14:06:25.923276
|
639 |
+
[63800] train_loss=0.0154875 valid_loss=0.0249389 valid_pos_acc=0.974504 valid_acc=0.995502 / 0.92361 / 0.826288 best_acc=0.995541
|
640 |
+
now: 2022-04-04 14:18:29.316170
|
641 |
+
[64000] train_loss=0.0151137 valid_loss=0.0256968 valid_pos_acc=0.974333 valid_acc=0.995459 / 0.924967 / 0.829635 best_acc=0.995541
|
642 |
+
now: 2022-04-04 14:30:34.246579
|
643 |
+
[64200] train_loss=0.0152567 valid_loss=0.0251615 valid_pos_acc=0.974465 valid_acc=0.995544 / 0.930084 / 0.832375 best_acc=0.995544
|
644 |
+
now: 2022-04-04 14:42:51.084382
|
645 |
+
[64400] train_loss=0.0145794 valid_loss=0.0253407 valid_pos_acc=0.975203 valid_acc=0.995587 / 0.929932 / 0.82946 best_acc=0.995587
|
646 |
+
now: 2022-04-04 14:54:58.723482
|
647 |
+
[64600] train_loss=0.0145396 valid_loss=0.0245939 valid_pos_acc=0.974552 valid_acc=0.995468 / 0.926718 / 0.83152 best_acc=0.995587
|
648 |
+
now: 2022-04-04 15:06:51.962741
|
649 |
+
[64800] train_loss=0.0141351 valid_loss=0.0257409 valid_pos_acc=0.975149 valid_acc=0.995465 / 0.92718 / 0.826698 best_acc=0.995587
|
650 |
+
now: 2022-04-04 15:18:55.351985
|
651 |
+
[65000] train_loss=0.0150484 valid_loss=0.0243046 valid_pos_acc=0.974875 valid_acc=0.995609 / 0.928307 / 0.831687 best_acc=0.995609
|
652 |
+
now: 2022-04-04 15:31:02.499770
|
653 |
+
[65200] train_loss=0.0137273 valid_loss=0.0247366 valid_pos_acc=0.974598 valid_acc=0.995478 / 0.927726 / 0.828495 best_acc=0.995609
|
654 |
+
now: 2022-04-04 15:43:12.293967
|
655 |
+
[65400] train_loss=0.0146922 valid_loss=0.0248483 valid_pos_acc=0.97481 valid_acc=0.995433 / 0.926791 / 0.822958 best_acc=0.995609
|
656 |
+
now: 2022-04-04 15:55:16.151208
|
657 |
+
[65600] train_loss=0.0163436 valid_loss=0.0252635 valid_pos_acc=0.974797 valid_acc=0.995318 / 0.927537 / 0.824153 best_acc=0.995609
|
658 |
+
now: 2022-04-04 16:07:11.212065
|
659 |
+
[65800] train_loss=0.0148312 valid_loss=0.0246498 valid_pos_acc=0.975016 valid_acc=0.995429 / 0.927171 / 0.82399 best_acc=0.995609
|
660 |
+
now: 2022-04-04 16:19:08.685980
|
661 |
+
[66000] train_loss=0.015976 valid_loss=0.0249127 valid_pos_acc=0.974988 valid_acc=0.995481 / 0.928756 / 0.828428 best_acc=0.995609
|
662 |
+
now: 2022-04-04 16:31:02.374897
|
663 |
+
[66200] train_loss=0.014783 valid_loss=0.0250196 valid_pos_acc=0.974611 valid_acc=0.995452 / 0.925158 / 0.827485 best_acc=0.995609
|
664 |
+
now: 2022-04-04 16:42:47.960220
|
665 |
+
[66400] train_loss=0.0157549 valid_loss=0.0247606 valid_pos_acc=0.974945 valid_acc=0.995422 / 0.927323 / 0.829169 best_acc=0.995609
|
666 |
+
now: 2022-04-04 16:54:48.551560
|
667 |
+
[66600] train_loss=0.0150551 valid_loss=0.0252349 valid_pos_acc=0.975318 valid_acc=0.995544 / 0.929646 / 0.829167 best_acc=0.995609
|
668 |
+
now: 2022-04-04 17:06:51.181160
|
669 |
+
[66800] train_loss=0.014767 valid_loss=0.0253356 valid_pos_acc=0.9747 valid_acc=0.995528 / 0.930906 / 0.830688 best_acc=0.995609
|
670 |
+
now: 2022-04-04 17:18:56.196395
|
671 |
+
[67000] train_loss=0.0143435 valid_loss=0.0247641 valid_pos_acc=0.974734 valid_acc=0.99553 / 0.930212 / 0.832562 best_acc=0.995609
|
672 |
+
now: 2022-04-04 17:30:49.412781
|
673 |
+
[67200] train_loss=0.0143362 valid_loss=0.0262739 valid_pos_acc=0.974728 valid_acc=0.995355 / 0.92964 / 0.824912 best_acc=0.995609
|
674 |
+
now: 2022-04-04 17:42:55.107881
|
675 |
+
[67400] train_loss=0.0162339 valid_loss=0.0246594 valid_pos_acc=0.974643 valid_acc=0.995465 / 0.927617 / 0.827191 best_acc=0.995609
|
676 |
+
now: 2022-04-04 17:54:54.064581
|
677 |
+
[67600] train_loss=0.0160092 valid_loss=0.0252428 valid_pos_acc=0.974259 valid_acc=0.995446 / 0.930018 / 0.833761 best_acc=0.995609
|
678 |
+
now: 2022-04-04 18:06:53.764486
|
679 |
+
[67800] train_loss=0.0152992 valid_loss=0.0253698 valid_pos_acc=0.97448 valid_acc=0.995392 / 0.926869 / 0.829631 best_acc=0.995609
|
680 |
+
now: 2022-04-04 18:18:51.081973
|
681 |
+
[68000] train_loss=0.0147154 valid_loss=0.0257934 valid_pos_acc=0.974166 valid_acc=0.995533 / 0.932229 / 0.833046 best_acc=0.995609
|
682 |
+
now: 2022-04-04 18:30:58.562494
|
683 |
+
[68200] train_loss=0.0133411 valid_loss=0.0261108 valid_pos_acc=0.974467 valid_acc=0.995472 / 0.928011 / 0.826824 best_acc=0.995609
|
684 |
+
now: 2022-04-04 18:42:51.828880
|
685 |
+
[68400] train_loss=0.0153007 valid_loss=0.0262793 valid_pos_acc=0.974007 valid_acc=0.995255 / 0.92542 / 0.825899 best_acc=0.995609
|
686 |
+
now: 2022-04-04 18:54:48.421492
|
687 |
+
[68600] train_loss=0.0156991 valid_loss=0.0250134 valid_pos_acc=0.974808 valid_acc=0.99552 / 0.925519 / 0.827053 best_acc=0.995609
|
688 |
+
now: 2022-04-04 19:06:46.853231
|
689 |
+
[68800] train_loss=0.0148552 valid_loss=0.0246115 valid_pos_acc=0.974407 valid_acc=0.995626 / 0.92897 / 0.829723 best_acc=0.995626
|
690 |
+
now: 2022-04-04 19:18:49.907609
|
691 |
+
[69000] train_loss=0.0148742 valid_loss=0.0240215 valid_pos_acc=0.974463 valid_acc=0.995648 / 0.932707 / 0.837616 best_acc=0.995648
|
692 |
+
now: 2022-04-04 19:30:58.657495
|
693 |
+
[69200] train_loss=0.0154729 valid_loss=0.0243906 valid_pos_acc=0.974762 valid_acc=0.995609 / 0.932612 / 0.832795 best_acc=0.995648
|
694 |
+
now: 2022-04-04 19:43:05.888719
|
695 |
+
[69400] train_loss=0.016396 valid_loss=0.0258633 valid_pos_acc=0.974834 valid_acc=0.995557 / 0.930491 / 0.82893 best_acc=0.995648
|
696 |
+
now: 2022-04-04 19:55:14.202470
|
697 |
+
[69600] train_loss=0.0151303 valid_loss=0.0253337 valid_pos_acc=0.975164 valid_acc=0.995507 / 0.926279 / 0.828464 best_acc=0.995648
|
698 |
+
now: 2022-04-04 20:07:14.789065
|
699 |
+
[69800] train_loss=0.0160168 valid_loss=0.0261905 valid_pos_acc=0.974936 valid_acc=0.995244 / 0.919724 / 0.822949 best_acc=0.995648
|
700 |
+
now: 2022-04-04 20:19:07.585227
|
701 |
+
[70000] train_loss=0.0145597 valid_loss=0.0257852 valid_pos_acc=0.974459 valid_acc=0.995411 / 0.922878 / 0.822821 best_acc=0.995648
|
702 |
+
now: 2022-04-04 20:31:00.634042
|
703 |
+
[70200] train_loss=0.0158036 valid_loss=0.0254432 valid_pos_acc=0.974986 valid_acc=0.995442 / 0.927001 / 0.827555 best_acc=0.995648
|
704 |
+
now: 2022-04-04 20:42:51.615293
|
705 |
+
[70400] train_loss=0.0152405 valid_loss=0.0246695 valid_pos_acc=0.974684 valid_acc=0.995517 / 0.926662 / 0.828915 best_acc=0.995648
|
706 |
+
now: 2022-04-04 20:54:54.958272
|
707 |
+
[70600] train_loss=0.0142744 valid_loss=0.0256758 valid_pos_acc=0.974656 valid_acc=0.995446 / 0.924052 / 0.827786 best_acc=0.995648
|
708 |
+
now: 2022-04-04 21:06:51.313734
|
709 |
+
[70800] train_loss=0.0156632 valid_loss=0.0254473 valid_pos_acc=0.97496 valid_acc=0.995385 / 0.92814 / 0.832267 best_acc=0.995648
|
710 |
+
now: 2022-04-04 21:18:43.761819
|
711 |
+
[71000] train_loss=0.0164862 valid_loss=0.0255363 valid_pos_acc=0.974172 valid_acc=0.995342 / 0.92511 / 0.826733 best_acc=0.995648
|
712 |
+
now: 2022-04-04 21:30:44.417684
|
713 |
+
[71200] train_loss=0.0138593 valid_loss=0.0255767 valid_pos_acc=0.974632 valid_acc=0.995394 / 0.930841 / 0.830682 best_acc=0.995648
|
714 |
+
now: 2022-04-04 21:42:42.850833
|
715 |
+
[71400] train_loss=0.0157498 valid_loss=0.025416 valid_pos_acc=0.975127 valid_acc=0.995617 / 0.928939 / 0.826405 best_acc=0.995648
|
716 |
+
now: 2022-04-04 21:54:46.086563
|
717 |
+
[71600] train_loss=0.0152817 valid_loss=0.0257021 valid_pos_acc=0.974856 valid_acc=0.995578 / 0.927289 / 0.824619 best_acc=0.995648
|
718 |
+
now: 2022-04-04 22:06:47.189552
|
719 |
+
[71800] train_loss=0.0149438 valid_loss=0.0264182 valid_pos_acc=0.974916 valid_acc=0.995507 / 0.931164 / 0.830416 best_acc=0.995648
|
720 |
+
now: 2022-04-04 22:18:39.870279
|
721 |
+
[72000] train_loss=0.0163576 valid_loss=0.0259418 valid_pos_acc=0.974795 valid_acc=0.995374 / 0.924427 / 0.824195 best_acc=0.995648
|
722 |
+
now: 2022-04-04 22:30:33.184078
|
723 |
+
[72200] train_loss=0.016734 valid_loss=0.0250246 valid_pos_acc=0.974524 valid_acc=0.995552 / 0.927199 / 0.829767 best_acc=0.995648
|
724 |
+
now: 2022-04-04 22:42:36.180178
|
725 |
+
[72400] train_loss=0.0154278 valid_loss=0.025262 valid_pos_acc=0.974578 valid_acc=0.9955 / 0.930839 / 0.835087 best_acc=0.995648
|
726 |
+
now: 2022-04-04 22:54:42.093287
|
727 |
+
[72600] train_loss=0.0154066 valid_loss=0.0257742 valid_pos_acc=0.973793 valid_acc=0.995394 / 0.933008 / 0.828687 best_acc=0.995648
|
728 |
+
now: 2022-04-04 23:06:54.703488
|
729 |
+
[72800] train_loss=0.0164941 valid_loss=0.0249958 valid_pos_acc=0.974504 valid_acc=0.995418 / 0.931091 / 0.83236 best_acc=0.995648
|
730 |
+
now: 2022-04-04 23:19:01.661367
|
731 |
+
[73000] train_loss=0.016436 valid_loss=0.0265112 valid_pos_acc=0.974443 valid_acc=0.995385 / 0.927996 / 0.829395 best_acc=0.995648
|
732 |
+
now: 2022-04-04 23:31:00.152399
|
733 |
+
[73200] train_loss=0.0162149 valid_loss=0.0255855 valid_pos_acc=0.974825 valid_acc=0.995448 / 0.928238 / 0.829202 best_acc=0.995648
|
734 |
+
now: 2022-04-04 23:43:00.392230
|
735 |
+
[73400] train_loss=0.0144402 valid_loss=0.0252108 valid_pos_acc=0.9747 valid_acc=0.995587 / 0.929875 / 0.834653 best_acc=0.995648
|
736 |
+
now: 2022-04-04 23:55:08.297531
|
737 |
+
[73600] train_loss=0.0156009 valid_loss=0.0243919 valid_pos_acc=0.974626 valid_acc=0.995526 / 0.930281 / 0.835324 best_acc=0.995648
|
738 |
+
now: 2022-04-05 00:07:21.557640
|
739 |
+
[73800] train_loss=0.0162462 valid_loss=0.0250821 valid_pos_acc=0.975079 valid_acc=0.995567 / 0.927734 / 0.826903 best_acc=0.995648
|
740 |
+
now: 2022-04-05 00:19:20.964981
|
741 |
+
[74000] train_loss=0.0149793 valid_loss=0.0246167 valid_pos_acc=0.974808 valid_acc=0.995598 / 0.930624 / 0.833348 best_acc=0.995648
|
742 |
+
now: 2022-04-05 00:31:27.707459
|
743 |
+
[74200] train_loss=0.016266 valid_loss=0.0240239 valid_pos_acc=0.975188 valid_acc=0.995574 / 0.931921 / 0.831013 best_acc=0.995648
|
744 |
+
now: 2022-04-05 00:43:16.494193
|
745 |
+
[74400] train_loss=0.0133132 valid_loss=0.0248758 valid_pos_acc=0.97506 valid_acc=0.99553 / 0.935143 / 0.835638 best_acc=0.995648
|
746 |
+
now: 2022-04-05 00:55:14.868213
|
747 |
+
[74600] train_loss=0.0161027 valid_loss=0.0249716 valid_pos_acc=0.97488 valid_acc=0.9955 / 0.933607 / 0.832621 best_acc=0.995648
|
748 |
+
now: 2022-04-05 01:07:25.462651
|
749 |
+
[74800] train_loss=0.014376 valid_loss=0.0250494 valid_pos_acc=0.975396 valid_acc=0.995504 / 0.933943 / 0.833992 best_acc=0.995648
|
750 |
+
now: 2022-04-05 01:19:25.187529
|
751 |
+
[75000] train_loss=0.0161117 valid_loss=0.0241017 valid_pos_acc=0.974871 valid_acc=0.995626 / 0.930057 / 0.833235 best_acc=0.995648
|
752 |
+
now: 2022-04-05 01:31:27.005032
|
753 |
+
[75200] train_loss=0.0158166 valid_loss=0.0236026 valid_pos_acc=0.974979 valid_acc=0.995587 / 0.929956 / 0.831575 best_acc=0.995648
|
754 |
+
now: 2022-04-05 01:43:35.408468
|
755 |
+
[75400] train_loss=0.0149878 valid_loss=0.0256847 valid_pos_acc=0.974949 valid_acc=0.995667 / 0.932654 / 0.832065 best_acc=0.995667
|
756 |
+
now: 2022-04-05 01:55:54.653418
|
757 |
+
[75600] train_loss=0.0155141 valid_loss=0.024818 valid_pos_acc=0.974934 valid_acc=0.995609 / 0.933709 / 0.831186 best_acc=0.995667
|
758 |
+
now: 2022-04-05 02:07:43.823418
|
759 |
+
[75800] train_loss=0.0144699 valid_loss=0.0258603 valid_pos_acc=0.975235 valid_acc=0.995617 / 0.933296 / 0.832935 best_acc=0.995667
|
760 |
+
now: 2022-04-05 02:19:44.987772
|
761 |
+
[76000] train_loss=0.0152171 valid_loss=0.0252886 valid_pos_acc=0.97524 valid_acc=0.995693 / 0.932408 / 0.835173 best_acc=0.995693
|
762 |
+
now: 2022-04-05 02:31:42.885767
|
763 |
+
[76200] train_loss=0.0155771 valid_loss=0.0255284 valid_pos_acc=0.97537 valid_acc=0.995643 / 0.935199 / 0.834838 best_acc=0.995693
|
764 |
+
now: 2022-04-05 02:43:44.509209
|
765 |
+
[76400] train_loss=0.0153983 valid_loss=0.0248721 valid_pos_acc=0.975023 valid_acc=0.995574 / 0.93275 / 0.835416 best_acc=0.995693
|
766 |
+
now: 2022-04-05 02:55:49.966478
|
767 |
+
[76600] train_loss=0.0154212 valid_loss=0.0257234 valid_pos_acc=0.975149 valid_acc=0.995465 / 0.930268 / 0.829088 best_acc=0.995693
|
768 |
+
now: 2022-04-05 03:08:00.620954
|
769 |
+
[76800] train_loss=0.0151146 valid_loss=0.0246848 valid_pos_acc=0.975071 valid_acc=0.995398 / 0.930615 / 0.832294 best_acc=0.995693
|
770 |
+
now: 2022-04-05 03:20:08.341056
|
771 |
+
[77000] train_loss=0.0160863 valid_loss=0.0271371 valid_pos_acc=0.974689 valid_acc=0.995244 / 0.93075 / 0.823407 best_acc=0.995693
|
772 |
+
now: 2022-04-05 03:32:17.172790
|
773 |
+
[77200] train_loss=0.0157593 valid_loss=0.0253599 valid_pos_acc=0.974697 valid_acc=0.995528 / 0.929039 / 0.8295 best_acc=0.995693
|
774 |
+
now: 2022-04-05 03:44:21.046580
|
775 |
+
[77400] train_loss=0.0161056 valid_loss=0.0245615 valid_pos_acc=0.974919 valid_acc=0.995472 / 0.93189 / 0.832852 best_acc=0.995693
|
776 |
+
now: 2022-04-05 03:56:20.405982
|
777 |
+
[77600] train_loss=0.0149618 valid_loss=0.0246419 valid_pos_acc=0.975073 valid_acc=0.995513 / 0.932703 / 0.831197 best_acc=0.995693
|
778 |
+
now: 2022-04-05 04:08:28.594227
|
779 |
+
[77800] train_loss=0.016911 valid_loss=0.0236979 valid_pos_acc=0.974723 valid_acc=0.995596 / 0.931261 / 0.832728 best_acc=0.995693
|
780 |
+
now: 2022-04-05 04:20:31.662452
|
781 |
+
[78000] train_loss=0.0165653 valid_loss=0.0239289 valid_pos_acc=0.974678 valid_acc=0.995502 / 0.929082 / 0.827912 best_acc=0.995693
|
782 |
+
now: 2022-04-05 04:32:28.019168
|
783 |
+
[78200] train_loss=0.0167179 valid_loss=0.0249402 valid_pos_acc=0.9747 valid_acc=0.995502 / 0.923062 / 0.82385 best_acc=0.995693
|
784 |
+
now: 2022-04-05 04:44:26.471615
|
785 |
+
[78400] train_loss=0.0167333 valid_loss=0.0240457 valid_pos_acc=0.9746 valid_acc=0.995606 / 0.92763 / 0.83109 best_acc=0.995693
|
786 |
+
now: 2022-04-05 04:56:24.357334
|
787 |
+
[78600] train_loss=0.0145129 valid_loss=0.0249626 valid_pos_acc=0.974871 valid_acc=0.995366 / 0.927649 / 0.828631 best_acc=0.995693
|
788 |
+
now: 2022-04-05 05:08:26.198865
|
789 |
+
[78800] train_loss=0.0172761 valid_loss=0.0238513 valid_pos_acc=0.975407 valid_acc=0.995611 / 0.925864 / 0.831329 best_acc=0.995693
|
790 |
+
now: 2022-04-05 05:20:20.936271
|
791 |
+
[79000] train_loss=0.0153685 valid_loss=0.0233658 valid_pos_acc=0.97557 valid_acc=0.995645 / 0.931677 / 0.832522 best_acc=0.995693
|
792 |
+
now: 2022-04-05 05:32:17.131240
|
793 |
+
[79200] train_loss=0.0127379 valid_loss=0.0258231 valid_pos_acc=0.974604 valid_acc=0.995533 / 0.933338 / 0.832997 best_acc=0.995693
|
794 |
+
now: 2022-04-05 05:44:21.126943
|
795 |
+
[79400] train_loss=0.0127164 valid_loss=0.025304 valid_pos_acc=0.975057 valid_acc=0.995698 / 0.930098 / 0.832254 best_acc=0.995698
|
796 |
+
now: 2022-04-05 05:56:18.071160
|
797 |
+
[79600] train_loss=0.0110389 valid_loss=0.0258263 valid_pos_acc=0.975101 valid_acc=0.995643 / 0.930643 / 0.835766 best_acc=0.995698
|
798 |
+
now: 2022-04-05 06:08:12.503631
|
799 |
+
[79800] train_loss=0.0132959 valid_loss=0.0263015 valid_pos_acc=0.974487 valid_acc=0.995446 / 0.932765 / 0.83365 best_acc=0.995698
|
800 |
+
now: 2022-04-05 06:20:20.191496
|
801 |
+
[80000] train_loss=0.0138857 valid_loss=0.025657 valid_pos_acc=0.974903 valid_acc=0.995635 / 0.931425 / 0.834477 best_acc=0.995698
|
802 |
+
now: 2022-04-05 06:32:30.331123
|
803 |
+
[80200] train_loss=0.0129796 valid_loss=0.0252719 valid_pos_acc=0.974886 valid_acc=0.995539 / 0.930875 / 0.836388 best_acc=0.995698
|
804 |
+
now: 2022-04-05 06:44:36.817366
|
805 |
+
[80400] train_loss=0.0126967 valid_loss=0.0255622 valid_pos_acc=0.97511 valid_acc=0.995624 / 0.930096 / 0.832861 best_acc=0.995698
|
806 |
+
now: 2022-04-05 06:56:45.374554
|
807 |
+
[80600] train_loss=0.0142307 valid_loss=0.0258526 valid_pos_acc=0.97511 valid_acc=0.995489 / 0.932336 / 0.832256 best_acc=0.995698
|
808 |
+
now: 2022-04-05 07:08:47.240323
|
809 |
+
[80800] train_loss=0.0138569 valid_loss=0.0268916 valid_pos_acc=0.974849 valid_acc=0.995609 / 0.936109 / 0.834517 best_acc=0.995698
|
810 |
+
now: 2022-04-05 07:21:07.181733
|
811 |
+
[81000] train_loss=0.0126373 valid_loss=0.0253882 valid_pos_acc=0.975253 valid_acc=0.995587 / 0.932359 / 0.834985 best_acc=0.995698
|
812 |
+
now: 2022-04-05 07:33:38.912179
|
813 |
+
[81200] train_loss=0.0130838 valid_loss=0.0268405 valid_pos_acc=0.974689 valid_acc=0.995491 / 0.933348 / 0.837004 best_acc=0.995698
|
814 |
+
now: 2022-04-05 07:45:58.486868
|
815 |
+
[81400] train_loss=0.0134847 valid_loss=0.0259947 valid_pos_acc=0.974574 valid_acc=0.99542 / 0.934234 / 0.837761 best_acc=0.995698
|
816 |
+
now: 2022-04-05 07:58:01.699833
|
817 |
+
[81600] train_loss=0.0127687 valid_loss=0.0267179 valid_pos_acc=0.975088 valid_acc=0.995461 / 0.932898 / 0.833357 best_acc=0.995698
|
818 |
+
now: 2022-04-05 08:10:22.110075
|
819 |
+
[81800] train_loss=0.0125759 valid_loss=0.026263 valid_pos_acc=0.974645 valid_acc=0.995409 / 0.934211 / 0.83588 best_acc=0.995698
|
820 |
+
now: 2022-04-05 08:22:30.965145
|
821 |
+
[82000] train_loss=0.0138023 valid_loss=0.0262454 valid_pos_acc=0.97488 valid_acc=0.995481 / 0.933091 / 0.835412 best_acc=0.995698
|
822 |
+
now: 2022-04-05 08:34:55.047735
|
823 |
+
[82200] train_loss=0.0127197 valid_loss=0.0260373 valid_pos_acc=0.975346 valid_acc=0.995517 / 0.934354 / 0.834983 best_acc=0.995698
|
824 |
+
now: 2022-04-05 08:47:21.849999
|
825 |
+
[82400] train_loss=0.0133346 valid_loss=0.0259693 valid_pos_acc=0.975274 valid_acc=0.995563 / 0.932429 / 0.834782 best_acc=0.995698
|
826 |
+
now: 2022-04-05 08:59:46.077640
|
827 |
+
[82600] train_loss=0.0119814 valid_loss=0.0268453 valid_pos_acc=0.97447 valid_acc=0.995485 / 0.935298 / 0.838585 best_acc=0.995698
|
828 |
+
now: 2022-04-05 09:12:10.429291
|
829 |
+
[82800] train_loss=0.0138231 valid_loss=0.0256561 valid_pos_acc=0.975485 valid_acc=0.995604 / 0.932017 / 0.837154 best_acc=0.995698
|
830 |
+
now: 2022-04-05 09:24:25.503501
|
831 |
+
[83000] train_loss=0.0138415 valid_loss=0.0260283 valid_pos_acc=0.975131 valid_acc=0.99547 / 0.93434 / 0.835245 best_acc=0.995698
|
832 |
+
now: 2022-04-05 09:36:38.417154
|
833 |
+
[83200] train_loss=0.0124775 valid_loss=0.0260958 valid_pos_acc=0.975194 valid_acc=0.995672 / 0.935893 / 0.837375 best_acc=0.995698
|
834 |
+
now: 2022-04-05 09:48:54.047384
|
835 |
+
[83400] train_loss=0.0132698 valid_loss=0.0259098 valid_pos_acc=0.974975 valid_acc=0.995465 / 0.934838 / 0.8402 best_acc=0.995698
|
836 |
+
now: 2022-04-05 10:00:55.953481
|
837 |
+
[83600] train_loss=0.0134172 valid_loss=0.0258395 valid_pos_acc=0.975396 valid_acc=0.995544 / 0.934034 / 0.836583 best_acc=0.995698
|
838 |
+
now: 2022-04-05 10:12:56.731723
|
839 |
+
[83800] train_loss=0.0129522 valid_loss=0.0251187 valid_pos_acc=0.975463 valid_acc=0.995596 / 0.933734 / 0.833624 best_acc=0.995698
|
840 |
+
now: 2022-04-05 10:24:54.745844
|
841 |
+
[84000] train_loss=0.0141191 valid_loss=0.0254002 valid_pos_acc=0.975463 valid_acc=0.995585 / 0.935384 / 0.836613 best_acc=0.995698
|
842 |
+
now: 2022-04-05 10:37:00.190609
|
843 |
+
[84200] train_loss=0.013092 valid_loss=0.0266141 valid_pos_acc=0.97488 valid_acc=0.995517 / 0.934064 / 0.836598 best_acc=0.995698
|
844 |
+
now: 2022-04-05 10:49:11.600372
|
845 |
+
[84400] train_loss=0.0146677 valid_loss=0.025732 valid_pos_acc=0.975146 valid_acc=0.995476 / 0.934118 / 0.834405 best_acc=0.995698
|
846 |
+
now: 2022-04-05 11:01:26.620881
|
847 |
+
[84600] train_loss=0.0147797 valid_loss=0.0257641 valid_pos_acc=0.974908 valid_acc=0.9955 / 0.931502 / 0.830953 best_acc=0.995698
|
848 |
+
now: 2022-04-05 11:13:35.462462
|
849 |
+
[84800] train_loss=0.0130389 valid_loss=0.0259705 valid_pos_acc=0.97511 valid_acc=0.995654 / 0.933228 / 0.838112 best_acc=0.995698
|
850 |
+
now: 2022-04-05 11:25:41.335912
|
851 |
+
[85000] train_loss=0.0130362 valid_loss=0.0270572 valid_pos_acc=0.975012 valid_acc=0.995524 / 0.935052 / 0.83898 best_acc=0.995698
|
852 |
+
now: 2022-04-05 11:37:45.634992
|
853 |
+
[85200] train_loss=0.0136579 valid_loss=0.0259226 valid_pos_acc=0.975407 valid_acc=0.995561 / 0.932682 / 0.833727 best_acc=0.995698
|
854 |
+
now: 2022-04-05 11:49:49.322682
|
855 |
+
[85400] train_loss=0.0138471 valid_loss=0.0278054 valid_pos_acc=0.974313 valid_acc=0.995253 / 0.926638 / 0.826328 best_acc=0.995698
|
856 |
+
now: 2022-04-05 12:02:06.577902
|
857 |
+
[85600] train_loss=0.0141456 valid_loss=0.0270836 valid_pos_acc=0.97522 valid_acc=0.995439 / 0.931749 / 0.83259 best_acc=0.995698
|
858 |
+
now: 2022-04-05 12:14:09.865872
|
859 |
+
[85800] train_loss=0.0136233 valid_loss=0.0261673 valid_pos_acc=0.975724 valid_acc=0.995485 / 0.932979 / 0.832328 best_acc=0.995698
|
860 |
+
now: 2022-04-05 12:26:24.557886
|
861 |
+
[86000] train_loss=0.0129815 valid_loss=0.026333 valid_pos_acc=0.975672 valid_acc=0.995426 / 0.933565 / 0.836186 best_acc=0.995698
|
862 |
+
now: 2022-04-05 12:38:22.054353
|
863 |
+
[86200] train_loss=0.0138994 valid_loss=0.025931 valid_pos_acc=0.975602 valid_acc=0.995533 / 0.927751 / 0.828184 best_acc=0.995698
|
864 |
+
now: 2022-04-05 12:50:25.067251
|
865 |
+
[86400] train_loss=0.0136555 valid_loss=0.0258883 valid_pos_acc=0.975031 valid_acc=0.995409 / 0.927127 / 0.829497 best_acc=0.995698
|
866 |
+
now: 2022-04-05 13:02:21.412803
|
867 |
+
[86600] train_loss=0.0142481 valid_loss=0.0267677 valid_pos_acc=0.975535 valid_acc=0.995535 / 0.929604 / 0.830529 best_acc=0.995698
|
868 |
+
now: 2022-04-05 13:14:29.577497
|
869 |
+
[86800] train_loss=0.0142199 valid_loss=0.026345 valid_pos_acc=0.974986 valid_acc=0.995331 / 0.93041 / 0.828971 best_acc=0.995698
|
870 |
+
now: 2022-04-05 13:26:39.137708
|
871 |
+
[87000] train_loss=0.0132453 valid_loss=0.0277596 valid_pos_acc=0.975698 valid_acc=0.995459 / 0.929227 / 0.825919 best_acc=0.995698
|
872 |
+
now: 2022-04-05 13:38:41.105816
|
873 |
+
[87200] train_loss=0.0140319 valid_loss=0.0262992 valid_pos_acc=0.975387 valid_acc=0.995372 / 0.925482 / 0.828088 best_acc=0.995698
|
874 |
+
now: 2022-04-05 13:50:47.452684
|
875 |
+
[87400] train_loss=0.0143099 valid_loss=0.0264765 valid_pos_acc=0.974565 valid_acc=0.995402 / 0.930457 / 0.829678 best_acc=0.995698
|
876 |
+
now: 2022-04-05 14:02:54.386306
|
877 |
+
[87600] train_loss=0.0139416 valid_loss=0.0260449 valid_pos_acc=0.975598 valid_acc=0.99555 / 0.930947 / 0.830807 best_acc=0.995698
|
878 |
+
now: 2022-04-05 14:14:49.214068
|
879 |
+
[87800] train_loss=0.013045 valid_loss=0.0260407 valid_pos_acc=0.974786 valid_acc=0.995637 / 0.934269 / 0.839114 best_acc=0.995698
|
880 |
+
now: 2022-04-05 14:26:42.833491
|
881 |
+
[88000] train_loss=0.0141046 valid_loss=0.0263996 valid_pos_acc=0.975235 valid_acc=0.995537 / 0.933095 / 0.831651 best_acc=0.995698
|
882 |
+
now: 2022-04-05 14:38:40.768985
|
883 |
+
[88200] train_loss=0.0146935 valid_loss=0.0263447 valid_pos_acc=0.975728 valid_acc=0.995494 / 0.932254 / 0.829169 best_acc=0.995698
|
884 |
+
now: 2022-04-05 14:50:39.132010
|
885 |
+
[88400] train_loss=0.013988 valid_loss=0.0252771 valid_pos_acc=0.975452 valid_acc=0.995494 / 0.928645 / 0.829601 best_acc=0.995698
|
886 |
+
now: 2022-04-05 15:02:36.068409
|
887 |
+
[88600] train_loss=0.0131047 valid_loss=0.0262127 valid_pos_acc=0.975194 valid_acc=0.995539 / 0.933965 / 0.833988 best_acc=0.995698
|
888 |
+
now: 2022-04-05 15:14:39.273565
|
889 |
+
[88800] train_loss=0.0134752 valid_loss=0.0262747 valid_pos_acc=0.975752 valid_acc=0.995544 / 0.93222 / 0.832838 best_acc=0.995698
|
890 |
+
now: 2022-04-05 15:26:47.762768
|
891 |
+
[89000] train_loss=0.0140879 valid_loss=0.026511 valid_pos_acc=0.975494 valid_acc=0.995559 / 0.930583 / 0.832717 best_acc=0.995698
|
892 |
+
now: 2022-04-05 15:38:34.298601
|
893 |
+
[89200] train_loss=0.0145324 valid_loss=0.0263811 valid_pos_acc=0.975348 valid_acc=0.995639 / 0.933692 / 0.831782 best_acc=0.995698
|
894 |
+
now: 2022-04-05 15:50:34.391187
|
895 |
+
[89400] train_loss=0.0143614 valid_loss=0.0266067 valid_pos_acc=0.975589 valid_acc=0.99558 / 0.93295 / 0.832933 best_acc=0.995698
|
896 |
+
now: 2022-04-05 16:02:28.811246
|
897 |
+
[89600] train_loss=0.0139432 valid_loss=0.025967 valid_pos_acc=0.975483 valid_acc=0.995678 / 0.930272 / 0.833364 best_acc=0.995698
|
898 |
+
now: 2022-04-05 16:14:16.963671
|
899 |
+
[89800] train_loss=0.015888 valid_loss=0.025742 valid_pos_acc=0.975517 valid_acc=0.995591 / 0.930457 / 0.833334 best_acc=0.995698
|
900 |
+
now: 2022-04-05 16:26:24.078615
|
901 |
+
[90000] train_loss=0.0144244 valid_loss=0.0256335 valid_pos_acc=0.975693 valid_acc=0.995632 / 0.93239 / 0.830571 best_acc=0.995698
|
902 |
+
now: 2022-04-05 16:38:39.760929
|
903 |
+
[90200] train_loss=0.0143907 valid_loss=0.0264985 valid_pos_acc=0.975496 valid_acc=0.995324 / 0.930264 / 0.822809 best_acc=0.995698
|
904 |
+
now: 2022-04-05 16:50:47.722632
|
905 |
+
[90400] train_loss=0.0137474 valid_loss=0.0254164 valid_pos_acc=0.975394 valid_acc=0.995574 / 0.93623 / 0.834253 best_acc=0.995698
|
906 |
+
now: 2022-04-05 17:02:52.333896
|
907 |
+
[90600] train_loss=0.0142556 valid_loss=0.0260749 valid_pos_acc=0.975272 valid_acc=0.995496 / 0.932219 / 0.832236 best_acc=0.995698
|
908 |
+
now: 2022-04-05 17:14:37.154862
|
909 |
+
[90800] train_loss=0.0138562 valid_loss=0.0259641 valid_pos_acc=0.975235 valid_acc=0.99537 / 0.92701 / 0.827433 best_acc=0.995698
|
910 |
+
now: 2022-04-05 17:26:38.096136
|
911 |
+
[91000] train_loss=0.0128578 valid_loss=0.0257195 valid_pos_acc=0.975418 valid_acc=0.995511 / 0.927246 / 0.829352 best_acc=0.995698
|
912 |
+
now: 2022-04-05 17:38:33.834741
|
913 |
+
[91200] train_loss=0.0139655 valid_loss=0.0263024 valid_pos_acc=0.975672 valid_acc=0.995513 / 0.931341 / 0.827522 best_acc=0.995698
|
914 |
+
now: 2022-04-05 17:50:27.641391
|
915 |
+
[91400] train_loss=0.0143926 valid_loss=0.0255845 valid_pos_acc=0.975272 valid_acc=0.995307 / 0.931624 / 0.830926 best_acc=0.995698
|
916 |
+
now: 2022-04-05 18:02:38.786522
|
917 |
+
[91600] train_loss=0.0145323 valid_loss=0.0253511 valid_pos_acc=0.975374 valid_acc=0.995478 / 0.93136 / 0.832302 best_acc=0.995698
|
918 |
+
now: 2022-04-05 18:14:33.259268
|
919 |
+
[91800] train_loss=0.0145714 valid_loss=0.0256718 valid_pos_acc=0.975031 valid_acc=0.995468 / 0.929101 / 0.830769 best_acc=0.995698
|
920 |
+
now: 2022-04-05 18:26:31.723280
|
921 |
+
[92000] train_loss=0.0150823 valid_loss=0.025818 valid_pos_acc=0.975392 valid_acc=0.995515 / 0.933814 / 0.836218 best_acc=0.995698
|
922 |
+
now: 2022-04-05 18:38:37.282828
|
923 |
+
[92200] train_loss=0.0144193 valid_loss=0.0249235 valid_pos_acc=0.975411 valid_acc=0.995576 / 0.930254 / 0.831995 best_acc=0.995698
|
924 |
+
now: 2022-04-05 18:50:39.861549
|
925 |
+
[92400] train_loss=0.0145149 valid_loss=0.0255686 valid_pos_acc=0.97557 valid_acc=0.995656 / 0.93331 / 0.837843 best_acc=0.995698
|
926 |
+
now: 2022-04-05 19:02:40.114394
|
927 |
+
[92600] train_loss=0.0136907 valid_loss=0.0251938 valid_pos_acc=0.975494 valid_acc=0.995682 / 0.934727 / 0.838754 best_acc=0.995698
|
928 |
+
now: 2022-04-05 19:14:38.262458
|
929 |
+
[92800] train_loss=0.0145302 valid_loss=0.0252417 valid_pos_acc=0.97527 valid_acc=0.995661 / 0.932038 / 0.837297 best_acc=0.995698
|
930 |
+
now: 2022-04-05 19:26:38.439473
|
931 |
+
[93000] train_loss=0.0156955 valid_loss=0.0263089 valid_pos_acc=0.975235 valid_acc=0.995504 / 0.933166 / 0.828832 best_acc=0.995698
|
932 |
+
now: 2022-04-05 19:38:54.778835
|
933 |
+
[93200] train_loss=0.0159822 valid_loss=0.0254963 valid_pos_acc=0.975159 valid_acc=0.995413 / 0.93672 / 0.840871 best_acc=0.995698
|
934 |
+
now: 2022-04-05 19:50:52.991181
|
935 |
+
[93400] train_loss=0.0137888 valid_loss=0.0254194 valid_pos_acc=0.975212 valid_acc=0.995617 / 0.933914 / 0.832067 best_acc=0.995698
|
936 |
+
now: 2022-04-05 20:03:09.308012
|
937 |
+
[93600] train_loss=0.0159022 valid_loss=0.0244989 valid_pos_acc=0.97522 valid_acc=0.995565 / 0.933225 / 0.836939 best_acc=0.995698
|
938 |
+
now: 2022-04-05 20:15:19.273222
|
939 |
+
[93800] train_loss=0.0142521 valid_loss=0.0248734 valid_pos_acc=0.975294 valid_acc=0.995643 / 0.93422 / 0.840297 best_acc=0.995698
|
940 |
+
now: 2022-04-05 20:27:27.195510
|
941 |
+
[94000] train_loss=0.0150861 valid_loss=0.0239483 valid_pos_acc=0.975533 valid_acc=0.995619 / 0.936434 / 0.84042 best_acc=0.995698
|
942 |
+
now: 2022-04-05 20:39:32.119195
|
943 |
+
[94200] train_loss=0.0152762 valid_loss=0.0248546 valid_pos_acc=0.975739 valid_acc=0.99565 / 0.932887 / 0.835955 best_acc=0.995698
|
944 |
+
now: 2022-04-05 20:51:35.782856
|
945 |
+
[94400] train_loss=0.0141213 valid_loss=0.0250847 valid_pos_acc=0.974971 valid_acc=0.995682 / 0.935789 / 0.840368 best_acc=0.995698
|
946 |
+
now: 2022-04-05 21:03:36.489728
|
947 |
+
[94600] train_loss=0.0144385 valid_loss=0.0249789 valid_pos_acc=0.975439 valid_acc=0.995544 / 0.938386 / 0.840405 best_acc=0.995698
|
948 |
+
now: 2022-04-05 21:15:30.678169
|
949 |
+
[94800] train_loss=0.013553 valid_loss=0.0256229 valid_pos_acc=0.975455 valid_acc=0.995611 / 0.93238 / 0.837687 best_acc=0.995698
|
950 |
+
now: 2022-04-05 21:27:32.536531
|
951 |
+
[95000] train_loss=0.0158608 valid_loss=0.0256513 valid_pos_acc=0.97537 valid_acc=0.995528 / 0.935879 / 0.844516 best_acc=0.995698
|
952 |
+
now: 2022-04-05 21:39:32.357321
|
953 |
+
[95200] train_loss=0.0151035 valid_loss=0.0254666 valid_pos_acc=0.97575 valid_acc=0.995669 / 0.934501 / 0.837649 best_acc=0.995698
|
954 |
+
now: 2022-04-05 21:51:43.227092
|
955 |
+
[95400] train_loss=0.0151553 valid_loss=0.0249784 valid_pos_acc=0.975474 valid_acc=0.995526 / 0.934746 / 0.835569 best_acc=0.995698
|
956 |
+
now: 2022-04-05 22:03:48.247576
|
957 |
+
[95600] train_loss=0.0155748 valid_loss=0.0260542 valid_pos_acc=0.975218 valid_acc=0.995552 / 0.933312 / 0.835163 best_acc=0.995698
|
958 |
+
now: 2022-04-05 22:15:38.086255
|
959 |
+
[95800] train_loss=0.0146772 valid_loss=0.0247799 valid_pos_acc=0.975637 valid_acc=0.995626 / 0.930346 / 0.833518 best_acc=0.995698
|
960 |
+
now: 2022-04-05 22:27:43.689507
|
961 |
+
[96000] train_loss=0.01602 valid_loss=0.0267345 valid_pos_acc=0.975233 valid_acc=0.995433 / 0.931918 / 0.829208 best_acc=0.995698
|
962 |
+
now: 2022-04-05 22:39:43.113855
|
963 |
+
[96200] train_loss=0.0151231 valid_loss=0.0255382 valid_pos_acc=0.9755 valid_acc=0.995478 / 0.926487 / 0.826398 best_acc=0.995698
|
964 |
+
now: 2022-04-05 22:51:31.072458
|
965 |
+
[96400] train_loss=0.0149376 valid_loss=0.0258116 valid_pos_acc=0.97491 valid_acc=0.995498 / 0.928117 / 0.831135 best_acc=0.995698
|
966 |
+
now: 2022-04-05 23:03:31.843003
|
967 |
+
[96600] train_loss=0.0144005 valid_loss=0.0254002 valid_pos_acc=0.975784 valid_acc=0.99558 / 0.930447 / 0.829763 best_acc=0.995698
|
968 |
+
now: 2022-04-05 23:15:34.625356
|
969 |
+
[96800] train_loss=0.0146142 valid_loss=0.0268592 valid_pos_acc=0.974348 valid_acc=0.995389 / 0.930559 / 0.830846 best_acc=0.995698
|
970 |
+
now: 2022-04-05 23:27:30.008875
|
971 |
+
[97000] train_loss=0.0142077 valid_loss=0.025247 valid_pos_acc=0.975439 valid_acc=0.995654 / 0.92956 / 0.835155 best_acc=0.995698
|
972 |
+
now: 2022-04-05 23:39:35.358671
|
973 |
+
[97200] train_loss=0.0136525 valid_loss=0.025149 valid_pos_acc=0.975429 valid_acc=0.995604 / 0.932177 / 0.838884 best_acc=0.995698
|
974 |
+
now: 2022-04-05 23:51:42.518252
|
975 |
+
[97400] train_loss=0.0142976 valid_loss=0.0252632 valid_pos_acc=0.975637 valid_acc=0.99558 / 0.928696 / 0.831515 best_acc=0.995698
|
976 |
+
now: 2022-04-06 00:03:39.610613
|
977 |
+
[97600] train_loss=0.01445 valid_loss=0.0247523 valid_pos_acc=0.975652 valid_acc=0.995626 / 0.928967 / 0.837404 best_acc=0.995698
|
978 |
+
now: 2022-04-06 00:15:39.948279
|
979 |
+
[97800] train_loss=0.0138557 valid_loss=0.0250991 valid_pos_acc=0.975672 valid_acc=0.99553 / 0.928176 / 0.832755 best_acc=0.995698
|
980 |
+
now: 2022-04-06 00:27:35.432314
|
981 |
+
[98000] train_loss=0.015877 valid_loss=0.0247035 valid_pos_acc=0.975576 valid_acc=0.995548 / 0.928882 / 0.835904 best_acc=0.995698
|
982 |
+
now: 2022-04-06 00:39:42.196877
|
983 |
+
[98200] train_loss=0.0141082 valid_loss=0.0244627 valid_pos_acc=0.975843 valid_acc=0.995676 / 0.928219 / 0.833121 best_acc=0.995698
|
984 |
+
now: 2022-04-06 00:51:45.295429
|
985 |
+
[98400] train_loss=0.0146779 valid_loss=0.0248106 valid_pos_acc=0.975962 valid_acc=0.995632 / 0.927196 / 0.83131 best_acc=0.995698
|
986 |
+
now: 2022-04-06 01:03:48.072286
|
987 |
+
[98600] train_loss=0.0150793 valid_loss=0.024931 valid_pos_acc=0.975741 valid_acc=0.995587 / 0.927742 / 0.832345 best_acc=0.995698
|
988 |
+
now: 2022-04-06 01:15:49.874914
|
989 |
+
[98800] train_loss=0.0138506 valid_loss=0.0256198 valid_pos_acc=0.976136 valid_acc=0.995611 / 0.929698 / 0.830346 best_acc=0.995698
|
990 |
+
now: 2022-04-06 01:27:46.508890
|
991 |
+
[99000] train_loss=0.0118288 valid_loss=0.026124 valid_pos_acc=0.976116 valid_acc=0.995596 / 0.930481 / 0.833991 best_acc=0.995698
|
992 |
+
now: 2022-04-06 01:39:52.754135
|
993 |
+
[99200] train_loss=0.0115159 valid_loss=0.0254321 valid_pos_acc=0.975875 valid_acc=0.995689 / 0.93085 / 0.835924 best_acc=0.995698
|
994 |
+
now: 2022-04-06 01:51:59.498058
|
995 |
+
[99400] train_loss=0.0118415 valid_loss=0.0261764 valid_pos_acc=0.976184 valid_acc=0.995604 / 0.930402 / 0.832654 best_acc=0.995698
|
996 |
+
now: 2022-04-06 02:04:00.516174
|
997 |
+
[99600] train_loss=0.0111765 valid_loss=0.0253784 valid_pos_acc=0.976212 valid_acc=0.995789 / 0.930556 / 0.834549 best_acc=0.995789
|
998 |
+
now: 2022-04-06 02:16:10.608263
|
999 |
+
[99800] train_loss=0.0122449 valid_loss=0.0257746 valid_pos_acc=0.976166 valid_acc=0.995726 / 0.931839 / 0.838174 best_acc=0.995789
|
1000 |
+
now: 2022-04-06 02:28:12.819393
|
1001 |
+
[100000] train_loss=0.0122801 valid_loss=0.0260771 valid_pos_acc=0.975372 valid_acc=0.99555 / 0.929529 / 0.838038 best_acc=0.995789
|
1002 |
+
now: 2022-04-06 02:40:11.627022
|
1003 |
+
testing ...
|
1004 |
+
reloading best accuracy model ...
|
1005 |
+
valid_best_acc=0.995789 test_loss=0.0266495 test_pos_acc=0.975942 test_acc=0.995605 / 0.918997 / 0.806213
|
text/G2PWModel/version
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
v2.0
|
text/LangSegmenter/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .langsegmenter import LangSegmenter
|
text/LangSegmenter/langsegmenter.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import re
|
3 |
+
|
4 |
+
# jieba静音
|
5 |
+
import jieba
|
6 |
+
jieba.setLogLevel(logging.CRITICAL)
|
7 |
+
|
8 |
+
# 更改fast_langdetect大模型位置
|
9 |
+
from pathlib import Path
|
10 |
+
import fast_langdetect
|
11 |
+
fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"))
|
12 |
+
|
13 |
+
|
14 |
+
from split_lang import LangSplitter
|
15 |
+
|
16 |
+
|
17 |
+
def full_en(text):
    """Tell whether ``text`` should be treated as an English-only fragment.

    The whole string must consist of printable ASCII, whitespace, general
    punctuation (U+2000-U+206F), CJK punctuation (U+3000-U+303F) or
    fullwidth forms (U+FF00-U+FFEF), and the lookahead additionally requires
    an ASCII letter reachable from the start without crossing a newline.

    Returns:
        bool: True when the pattern matches, False otherwise (including for
        the empty string).
    """
    english_only = r'^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$'
    found = re.match(english_only, text)
    return found is not None
|
20 |
+
|
21 |
+
|
22 |
+
def full_cjk(text):
    """Return ``text`` reduced to CJK ideographs plus a few neutral characters.

    A character is kept when its code point lies in one of the CJK unified
    ideograph blocks listed below, or when it is a digit / punctuation mark
    accepted by the ``keep`` pattern. Everything else is dropped.

    Args:
        text: string to filter.

    Returns:
        str: the kept characters in their original order; ``""`` when
        nothing qualifies.
    """
    # Block boundaries follow the Unicode code charts.
    cjk_ranges = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Extension A (full block, not just up to U+4DB5)
        (0x20000, 0x2A6DD),  # CJK Extension B
        (0x2A700, 0x2B73F),  # CJK Extension C
        (0x2B740, 0x2B81F),  # CJK Extension D
        (0x2B820, 0x2CEAF),  # CJK Extension E
        (0x2CEB0, 0x2EBEF),  # CJK Extension F
        (0x30000, 0x3134A),  # CJK Extension G
        (0x31350, 0x323AF),  # CJK Extension H
        (0x2EBF0, 0x2EE5D),  # CJK Extension I
    )

    # Compiled once per call instead of once per character.
    keep = re.compile(r'[0-9、-〜。!?.!?… /]+$')

    kept = [
        char
        for char in text
        if any(low <= ord(char) <= high for low, high in cjk_ranges) or keep.match(char)
    ]
    return "".join(kept)
|
46 |
+
|
47 |
+
|
48 |
+
def split_jako(tag_lang,item):
    """Cut ``item['text']`` into runs of kana (or Hangul) and everything else.

    Args:
        tag_lang: ``"ja"`` selects the kana pattern; any other value selects
            the Hangul pattern. Matched runs are tagged with this value.
        item: dict with ``'lang'`` and ``'text'`` keys.

    Returns:
        list[dict]: ``{'lang', 'text'}`` pieces in source order. Matched runs
        carry ``tag_lang``; the text between them keeps ``item['lang']``.
        Empty text yields an empty list.
    """
    if tag_lang == "ja":
        script_run = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)"
    else:
        script_run = r"([\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]+(?:[0-9、-〜。!?.!?… ]+[\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]*)*)"

    pieces: list[dict] = []
    cursor = 0  # end offset of the previous match
    for hit in re.finditer(script_run, item['text']):
        begin, end = hit.span()
        if begin > cursor:
            # Text between two matches stays in the segment's own language.
            pieces.append({'lang': item['lang'], 'text': item['text'][cursor:begin]})
        pieces.append({'lang': tag_lang, 'text': item['text'][begin:end]})
        cursor = end

    if cursor < len(item['text']):
        pieces.append({'lang': item['lang'], 'text': item['text'][cursor:]})

    return pieces
|
67 |
+
|
68 |
+
|
69 |
+
def merge_lang(lang_list, item):
    """Add ``item`` to ``lang_list``, fusing it with the tail when languages match.

    ``lang_list`` is modified in place: either ``item`` is appended, or its
    text is concatenated onto the last entry. The same list is returned.
    """
    if not lang_list or lang_list[-1]['lang'] != item['lang']:
        lang_list.append(item)
        return lang_list

    lang_list[-1]['text'] += item['text']
    return lang_list
|
75 |
+
|
76 |
+
|
77 |
+
class LangSegmenter():
    """Split mixed-language text into consecutive runs tagged with a language."""

    # Default filter passed to LangSplitter as ``lang_map``.
    DEFAULT_LANG_MAP = {
        "zh": "zh",
        "yue": "zh",  # Cantonese
        "wuu": "zh",  # Wu Chinese
        "zh-cn": "zh",
        "zh-tw": "x",  # Traditional Chinese is mapped to "x"
        "ko": "ko",
        "ja": "ja",
        "en": "en",
    }

    @staticmethod
    def getTexts(text):
        """Segment ``text`` by language.

        Args:
            text: the string to segment.

        Returns:
            list[dict]: ``{'lang', 'text'}`` entries in source order, with
            adjacent entries of the same language merged. Entries still
            tagged ``"x"`` after splitting are relabelled in a final pass.
        """
        lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
        substr = lang_splitter.split_by_lang(text=text)

        lang_list: list[dict] = []

        for item in substr:
            dict_item = {'lang': item.lang, 'text': item.text}

            # Short English fragments may have been detected as another language.
            if full_en(dict_item['text']):
                dict_item['lang'] = 'en'
                lang_list = merge_lang(lang_list, dict_item)
                continue

            # Pull kana runs out of segments not tagged Japanese
            # (CJK ideographs are not matched by the kana pattern).
            ja_list: list[dict] = []
            if dict_item['lang'] != 'ja':
                ja_list = split_jako('ja', dict_item)

            if not ja_list:
                ja_list.append(dict_item)

            # Pull Hangul runs out of every piece not tagged Korean.
            temp_list: list[dict] = []
            for ko_item in ja_list:
                # Reset for every piece: a stale result from the previous
                # piece must not replace a piece that is already "ko".
                ko_list: list[dict] = []
                if ko_item["lang"] != 'ko':
                    ko_list = split_jako('ko', ko_item)

                if ko_list:
                    temp_list.extend(ko_list)
                else:
                    temp_list.append(ko_item)

            # The segment was not split: keep it whole.
            if len(temp_list) == 1:
                # Unknown language: keep only the CJK part, if there is one.
                if dict_item['lang'] == 'x':
                    cjk_text = full_cjk(dict_item['text'])
                    if cjk_text:
                        dict_item = {'lang': 'zh', 'text': cjk_text}
                lang_list = merge_lang(lang_list, dict_item)
                continue

            # The segment was split into several pieces.
            for temp_item in temp_list:
                # Unknown language: filter this piece only, not the whole
                # segment, otherwise the segment text is emitted repeatedly.
                if temp_item['lang'] == 'x':
                    cjk_text = full_cjk(temp_item['text'])
                    if cjk_text:
                        temp_item = {'lang': 'zh', 'text': cjk_text}
                lang_list = merge_lang(lang_list, temp_item)

        # Final pass: give every remaining "x" entry a concrete language --
        # the previous entry's, else the second entry's, else "zh".
        temp_list = lang_list
        lang_list = []
        for temp_item in temp_list:
            if temp_item['lang'] == 'x':
                if lang_list:
                    temp_item['lang'] = lang_list[-1]['lang']
                elif len(temp_list) > 1:
                    temp_item['lang'] = temp_list[1]['lang']
                else:
                    temp_item['lang'] = 'zh'

            lang_list = merge_lang(lang_list, temp_item)

        return lang_list
|
168 |
+
|
169 |
+
|
170 |
+
if __name__ == "__main__":
    # Manual smoke test: print the segmentation of two mixed-script samples.
    for text in (
        "MyGO?,你也喜欢まいご吗?",
        "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。",
    ):
        print(LangSegmenter.getTexts(text))
|
text/__init__.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
# if os.environ.get("version","v1")=="v1":
|
3 |
+
# from text.symbols import symbols
|
4 |
+
# else:
|
5 |
+
# from text.symbols2 import symbols
|
6 |
+
|
7 |
+
from text import symbols as symbols_v1
|
8 |
+
from text import symbols2 as symbols_v2
|
9 |
+
|
10 |
+
_symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)}
|
11 |
+
_symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)}
|
12 |
+
|
13 |
+
|
14 |
+
def cleaned_text_to_sequence(cleaned_text, version=None):
    """Map each symbol of ``cleaned_text`` to its integer ID.

    Args:
      cleaned_text: iterable of symbols to look up
      version: "v1" selects the v1 symbol table, any other value the v2
        table; when None it is read from the "version" environment
        variable, defaulting to "v2"
    Returns:
      List of integers corresponding to the symbols in ``cleaned_text``
    Raises:
      KeyError: if a symbol is missing from the selected table
    """
    if version is None:
        version = os.environ.get("version", "v2")

    table = _symbol_to_id_v1 if version == "v1" else _symbol_to_id_v2
    return [table[symbol] for symbol in cleaned_text]
|
text/cantonese.py
ADDED
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# reference: https://huggingface.co/spaces/Naozumi0512/Bert-VITS2-Cantonese-Yue/blob/main/text/chinese.py
|
2 |
+
|
3 |
+
import re
|
4 |
+
import cn2an
|
5 |
+
import ToJyutping
|
6 |
+
|
7 |
+
from text.symbols import punctuation
|
8 |
+
from text.zh_normalization.text_normlization import TextNormalizer
|
9 |
+
|
10 |
+
normalizer = lambda x: cn2an.transform(x, "an2cn")
|
11 |
+
|
12 |
+
# Syllable prefixes, in the exact order jyuping_to_initials_finals_tones
# scans them (it stops at the first prefix the syllable starts with).
INITIALS = [
    "aa", "aai", "aak", "aap", "aat", "aau",
    "ai", "au", "ap", "at", "ak", "a",
    "p", "b", "e",
    "ts", "t", "dz", "d",
    "kw", "k", "gw", "g",
    "f", "h", "l", "m", "ng", "n", "s", "y", "w", "c", "z", "j",
    "ong", "on", "ou", "oi", "ok", "o",
    "uk", "ung",
]
INITIALS.extend(["sp", "spl", "spn", "sil"])
|
58 |
+
|
59 |
+
|
60 |
+
# Punctuation normalisation table used by replace_punctuation().
# Every ASCII mark that has a fullwidth twin is listed in both forms; the
# fullwidth keys are written as \uXXXX escapes so they cannot be folded back
# to ASCII (which would leave duplicate keys that silently collapse).
rep_map = {
    ":": ",",
    "\uff1a": ",",  # fullwidth colon
    ";": ",",
    "\uff1b": ",",  # fullwidth semicolon
    ",": ",",
    "\uff0c": ",",  # fullwidth comma
    "。": ".",
    "!": "!",
    "\uff01": "!",  # fullwidth exclamation mark
    "?": "?",
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "\uff08": "'",  # fullwidth left parenthesis
    "\uff09": "'",  # fullwidth right parenthesis
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "~": "-",
    "\uff5e": "-",  # fullwidth tilde
    "「": "'",
    "」": "'",
}
|
93 |
+
|
94 |
+
|
95 |
+
def replace_punctuation(text):
    """Normalise punctuation and strip every other unsupported character.

    Each occurrence of a ``rep_map`` key is replaced by its mapped value,
    then every run of characters that is neither in U+4E00-U+9FA5 nor in
    ``punctuation`` is deleted.

    Args:
        text: string to clean.

    Returns:
        str: the cleaned string.
    """
    pattern = re.compile("|".join(re.escape(p) for p in rep_map))
    replaced_text = pattern.sub(lambda found: rep_map[found.group()], text)

    # Escape each mark so that none of them can act as a character-class
    # metacharacter ("]", "\\", "^", "-") inside the brackets.
    allowed = "".join(re.escape(p) for p in punctuation)
    return re.sub(r"[^\u4e00-\u9fa5" + allowed + r"]+", "", replaced_text)
|
104 |
+
|
105 |
+
|
106 |
+
def text_normalize(text):
    """Run ``TextNormalizer`` over ``text`` and clean each resulting sentence.

    Every sentence returned by ``TextNormalizer.normalize`` goes through
    ``replace_punctuation``; the results are concatenated in order.
    """
    sentences = TextNormalizer().normalize(text)
    return "".join(replace_punctuation(sentence) for sentence in sentences)
|
113 |
+
|
114 |
+
|
115 |
+
punctuation_set = set(punctuation)
|
116 |
+
|
117 |
+
|
118 |
+
def jyuping_to_initials_finals_tones(jyuping_syllables):
    """Turn jyutping syllables into phone strings and per-item phone counts.

    Args:
        jyuping_syllables: iterable of strings; each is a punctuation mark,
            ``"_"``, or a syllable optionally ending in a tone digit.

    Returns:
        tuple: ``(phones, word2ph)``. ``phones`` lists the phone strings;
        ``word2ph`` holds 1 for every punctuation mark / ``"_"`` and 2 for
        every syllable that matched an entry of ``INITIALS``. A syllable
        matching no entry contributes nothing to either list.
    """
    initials_finals = []
    tones = []
    word2ph = []

    for syllable in jyuping_syllables:
        if syllable in punctuation or syllable == "_":
            # Punctuation and the underscore pass through as one phone.
            initials_finals.append(syllable)
            tones.append(0)
            word2ph.append(1)
            continue

        # A trailing digit is the tone; without one the tone is 0.
        try:
            tone = int(syllable[-1])
            stem = syllable[:-1]
        except ValueError:
            tone = 0
            stem = syllable

        for prefix in INITIALS:
            if not stem.startswith(prefix):
                continue
            if stem.startswith("nga"):
                # "nga..." is always cut after the two-letter "ng".
                head = stem[:2]
                tail = stem[2:] or stem[-1]
            else:
                head = prefix
                # If nothing follows the prefix, reuse its last letter.
                tail = stem[len(prefix) :] or prefix[-1]
            initials_finals.extend([head, tail])
            # Only the second phone carries the tone; -1 marks the first.
            tones.extend([-1, tone])
            word2ph.append(2)
            break
    assert len(initials_finals) == len(tones)

    # Reworked into: consonant + vowel-with-tone.
    phones = []
    for unit, tone_mark in zip(initials_finals, tones):
        label = unit if tone_mark in [-1, 0] else "%s%s" % (unit, tone_mark)
        # Prefix "Y" so Cantonese phones do not coincide with Mandarin
        # ones; punctuation is left without the prefix.
        if label not in punctuation_set:
            label = "Y%s" % label
        phones.append(label)

    return phones, word2ph
|
174 |
+
|
175 |
+
|
176 |
+
def get_jyutping(text):
    """Convert ``text`` to a flat list of jyutping readings and punctuation.

    Args:
        text: string handed to ``ToJyutping.get_jyutping_list``.

    Returns:
        list[str]: one entry per punctuation character, and one entry per
        reading returned for every other word.

    Raises:
        ValueError: when a non-punctuation word has no reading, or its
            reading is not one or more ``letters + tone digits`` groups.
    """
    # Patterns are loop-invariant, so build them once up front.
    punct_class = re.escape("".join(punctuation))
    punct_pattern = re.compile(r"^[{}]+$".format(punct_class))
    punct_splitter = re.compile(r"([{}])".format(punct_class))
    # Matches several readings (e.g. "liu4 ge3") or a single one ("liu4").
    reading_pattern = re.compile(r"^([a-z]+[1-6]+[ ]?)+$")

    jyutping_array = []
    for word, syllable in ToJyutping.get_jyutping_list(text):
        if punct_pattern.match(word):
            # Emit a run of punctuation one mark at a time.
            jyutping_array.extend(p for p in punct_splitter.split(word) if p)
            continue

        # A missing reading (not a str) is reported as the same ValueError
        # as a malformed one, instead of a TypeError from the regex engine.
        if not isinstance(syllable, str) or not reading_pattern.search(syllable):
            raise ValueError(f"Failed to convert {word} to jyutping: {syllable}")
        jyutping_array.append(syllable)

    return jyutping_array
|
195 |
+
|
196 |
+
|
197 |
+
def get_bert_feature(text, word2ph):
    """Forward to ``text.chinese_bert.get_bert_feature``.

    The import happens at call time, so ``text.chinese_bert`` is only loaded
    when this function is actually used.
    """
    from text import chinese_bert as bert_backend

    extract = bert_backend.get_bert_feature
    return extract(text, word2ph)
|
201 |
+
|
202 |
+
|
203 |
+
def g2p(text):
    """Convert ``text`` to ``(phones, word2ph)``.

    Chains ``get_jyutping`` and ``jyuping_to_initials_finals_tones``; see the
    latter for the meaning of the two returned lists.
    """
    syllables = get_jyutping(text)
    phones, word2ph = jyuping_to_initials_finals_tones(syllables)
    return phones, word2ph
|
213 |
+
|
214 |
+
|
215 |
+
if __name__ == "__main__":
    # Manual smoke test: normalise one sentence and print its phones.
    text = text_normalize("佢個鋤頭太短啦。")
    phones, word2ph = g2p(text)
    print(phones, word2ph)
|
text/chinese.py
ADDED
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
|
4 |
+
import cn2an
|
5 |
+
from pypinyin import lazy_pinyin, Style
|
6 |
+
|
7 |
+
from text.symbols import punctuation
|
8 |
+
from text.tone_sandhi import ToneSandhi
|
9 |
+
from text.zh_normalization.text_normlization import TextNormalizer
|
10 |
+
|
11 |
+
normalizer = lambda x: cn2an.transform(x, "an2cn")
|
12 |
+
|
13 |
+
current_file_path = os.path.dirname(__file__)

# Lookup table read from the tab-separated "opencpop-strict.txt" that sits
# next to this module: first column -> second column.
# The file is opened in a ``with`` block so the handle is closed again, with
# an explicit encoding so the result does not depend on the locale, and blank
# lines are skipped instead of raising IndexError.
with open(os.path.join(current_file_path, "opencpop-strict.txt"), encoding="utf-8") as _symbol_file:
    pinyin_to_symbol_map = {
        line.split("\t")[0]: line.strip().split("\t")[1]
        for line in _symbol_file
        if line.strip()
    }
|
18 |
+
|
19 |
+
import jieba_fast
|
20 |
+
import logging
|
21 |
+
|
22 |
+
jieba_fast.setLogLevel(logging.CRITICAL)
|
23 |
+
import jieba_fast.posseg as psg
|
24 |
+
|
25 |
+
|
26 |
+
# Punctuation normalisation table used by replace_punctuation() and
# replace_punctuation_with_en().
# Every ASCII mark that has a fullwidth twin is listed in both forms; the
# fullwidth keys are written as \uXXXX escapes so they cannot be folded back
# to ASCII (which would leave duplicate keys that silently collapse).
rep_map = {
    ":": ",",
    "\uff1a": ",",  # fullwidth colon
    ";": ",",
    "\uff1b": ",",  # fullwidth semicolon
    ",": ",",
    "\uff0c": ",",  # fullwidth comma
    "。": ".",
    "!": "!",
    "\uff01": "!",  # fullwidth exclamation mark
    "?": "?",
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "/": ",",
    "—": "-",
    "~": "…",
    "\uff5e": "…",  # fullwidth tilde
}
|
43 |
+
|
44 |
+
tone_modifier = ToneSandhi()
|
45 |
+
|
46 |
+
|
47 |
+
def _map_and_strip(text, extra_allowed=""):
    """Shared body of replace_punctuation / replace_punctuation_with_en.

    Args:
        text: string to clean.
        extra_allowed: regex character-class fragment (already valid inside
            ``[...]``) naming additional characters to keep.

    Returns:
        str: ``text`` after the two fixed character substitutions, the
        ``rep_map`` replacements, and removal of every run of characters
        outside U+4E00-U+9FA5, ``extra_allowed`` and ``punctuation``.
    """
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile("|".join(re.escape(p) for p in rep_map))
    replaced_text = pattern.sub(lambda found: rep_map[found.group()], text)

    # Escape each mark so that none of them can act as a character-class
    # metacharacter ("]", "\\", "^", "-") inside the brackets.
    allowed = extra_allowed + "".join(re.escape(p) for p in punctuation)
    return re.sub(r"[^\u4e00-\u9fa5" + allowed + r"]+", "", replaced_text)


def replace_punctuation(text):
    """Normalise punctuation; keep only U+4E00-U+9FA5 and ``punctuation``."""
    return _map_and_strip(text)


def replace_punctuation_with_en(text):
    """Same as ``replace_punctuation`` but ASCII letters are kept as well."""
    return _map_and_strip(text, extra_allowed="A-Za-z")
|
67 |
+
|
68 |
+
|
69 |
+
def replace_consecutive_punctuation(text):
    """Collapse every run of two or more punctuation marks to its first mark."""
    escaped = "".join(map(re.escape, punctuation))
    run_of_marks = "([" + escaped + "])([" + escaped + "])+"
    return re.sub(run_of_marks, r"\1", text)
|
74 |
+
|
75 |
+
|
76 |
+
def g2p(text):
    """Convert ``text`` to ``(phones, word2ph)``.

    The text is cut after every punctuation mark (swallowing any whitespace
    that follows it), pieces that are empty or whitespace-only are dropped,
    and the remaining pieces are handed to ``_g2p``.
    """
    boundary = r"(?<=[{0}])\s*".format("".join(punctuation))
    sentences = []
    for piece in re.split(boundary, text):
        if piece.strip() != "":
            sentences.append(piece)
    phones, word2ph = _g2p(sentences)
    return phones, word2ph
|
81 |
+
|
82 |
+
|
83 |
+
def _get_initials_finals(word):
    """Return ``(initials, finals)`` for ``word`` as two equally long lists.

    Both come from ``lazy_pinyin`` with ``neutral_tone_with_five=True``: the
    initials with ``Style.INITIALS`` and the finals with
    ``Style.FINALS_TONE3``. The lists are truncated to the shorter of the two
    results.
    """
    orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
    orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)

    paired = list(zip(orig_initials, orig_finals))
    initials = [c for c, _ in paired]
    finals = [v for _, v in paired]
    return initials, finals
|
92 |
+
|
93 |
+
|
94 |
+
def _g2p(segments):
    """Convert text segments to phones.

    Args:
        segments: iterable of strings to convert.

    Returns:
        tuple: ``(phones_list, word2ph)``. ``phones_list`` is the flat list
        of phones; ``word2ph`` has one entry per initial/final pair -- 1 for
        a punctuation mark, 2 for a syllable.

    Raises:
        AssertionError: if an initial equal to its final is not in
            ``punctuation``, if a final does not end in a tone digit 1-5, or
            if the rewritten pinyin is not a key of ``pinyin_to_symbol_map``.
    """
    # Rewrite tables are constant, so they are built once per call rather
    # than once per syllable.
    # Finals rewritten when the syllable has an initial.
    v_rep_map = {
        "uei": "ui",
        "iou": "iu",
        "uen": "un",
    }
    # Whole syllables rewritten when there is no initial.
    pinyin_rep_map = {
        "ing": "ying",
        "i": "yi",
        "in": "yin",
        "u": "wu",
    }
    # First-letter rewrite for the remaining syllables without an initial.
    single_rep_map = {
        "v": "yu",
        "e": "e",
        "i": "y",
        "u": "w",
    }

    phones_list = []
    word2ph = []
    for seg in segments:
        # Remove all English words from the segment.
        seg = re.sub("[a-zA-Z]+", "", seg)
        seg_cut = tone_modifier.pre_merge_for_modify(psg.lcut(seg))

        # Collected flat with extend(), avoiding a quadratic sum(list, []).
        initials = []
        finals = []
        for word, pos in seg_cut:
            if pos == "eng":
                continue
            sub_initials, sub_finals = _get_initials_finals(word)
            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
            initials.extend(sub_initials)
            finals.extend(sub_finals)

        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c == v:
                # Initial and final coincide: asserted to be punctuation.
                assert c in punctuation
                phone = [c]
                word2ph.append(1)
            else:
                v_without_tone = v[:-1]
                tone = v[-1]

                pinyin = c + v_without_tone
                assert tone in "12345"

                if c:
                    # Syllable with an initial.
                    if v_without_tone in v_rep_map:
                        pinyin = c + v_rep_map[v_without_tone]
                elif pinyin in pinyin_rep_map:
                    # Syllable without an initial, rewritten as a whole.
                    pinyin = pinyin_rep_map[pinyin]
                elif pinyin[0] in single_rep_map:
                    pinyin = single_rep_map[pinyin[0]] + pinyin[1:]

                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
                new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
                new_v = new_v + tone
                phone = [new_c, new_v]
                word2ph.append(len(phone))

            phones_list += phone
    return phones_list, word2ph
|
169 |
+
|
170 |
+
|
171 |
+
def text_normalize(text):
    """Normalize Chinese text for the G2P frontend.

    The text is split and normalized by ``TextNormalizer`` (ported from
    https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization),
    each resulting sentence is passed through ``replace_punctuation``, and the
    pieces are concatenated in order.
    """
    normalized_sentences = TextNormalizer().normalize(text)
    cleaned = "".join(replace_punctuation(sentence) for sentence in normalized_sentences)
    # Collapse runs of punctuation so repeated marks cannot leak the reference.
    return replace_consecutive_punctuation(cleaned)
|
182 |
+
|
183 |
+
|
184 |
+
# 不排除英文的文本格式化
|
185 |
+
def mix_text_normalize(text):
    """Normalize mixed Chinese/English text without discarding the English.

    Same pipeline as ``text_normalize`` but each sentence is cleaned with
    ``replace_punctuation_with_en``. Normalization rules come from
    https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    """
    normalized_sentences = TextNormalizer().normalize(text)
    cleaned = "".join(replace_punctuation_with_en(sentence) for sentence in normalized_sentences)
    # Collapse runs of punctuation so repeated marks cannot leak the reference.
    return replace_consecutive_punctuation(cleaned)
|
196 |
+
|
197 |
+
|
198 |
+
if __name__ == "__main__":
    # Manual smoke test. Earlier samples are kept for convenience; only the
    # last one is actually run, exactly as before.
    samples = [
        # The backslash is escaped explicitly: "\游" is an invalid escape
        # sequence and triggers a SyntaxWarning on recent Python versions.
        "啊——但是《原神》是由,米哈\\游自主,研发的一款全.新开放世界.冒险游戏",
        "呣呣呣~就是…大人的鼹鼠党吧?",
        "你好",
    ]
    text = text_normalize(samples[-1])
    print(g2p(text))
|
204 |
+
|
205 |
+
|
206 |
+
# # 示例用法
|
207 |
+
# text = "这是一个示例文本:,你好!这是一个测试..."
|
208 |
+
# print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
|
text/chinese2.py
ADDED
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
|
4 |
+
import cn2an
|
5 |
+
from pypinyin import lazy_pinyin, Style
|
6 |
+
from pypinyin.contrib.tone_convert import to_finals_tone3, to_initials
|
7 |
+
|
8 |
+
from text.symbols import punctuation
|
9 |
+
from text.tone_sandhi import ToneSandhi
|
10 |
+
from text.zh_normalization.text_normlization import TextNormalizer
|
11 |
+
|
12 |
+
def normalizer(x):
    """Convert the Arabic numerals in *x* to Chinese numerals with cn2an."""
    return cn2an.transform(x, "an2cn")


current_file_path = os.path.dirname(__file__)


def _load_pinyin_to_symbol_map(path):
    """Read a tab-separated ``pinyin<TAB>symbols`` table into a dict.

    The file is opened with an explicit encoding and closed deterministically.
    Lines without a second tab-separated field (for example a trailing blank
    line) are skipped instead of raising IndexError.
    """
    mapping = {}
    with open(path, encoding="utf-8") as table:
        for line in table:
            fields = line.strip().split("\t")
            if len(fields) < 2:
                continue
            # The key is taken from the unstripped line, as before.
            mapping[line.split("\t")[0]] = fields[1]
    return mapping


pinyin_to_symbol_map = _load_pinyin_to_symbol_map(os.path.join(current_file_path, "opencpop-strict.txt"))
|
19 |
+
|
20 |
+
import jieba_fast
|
21 |
+
import logging
|
22 |
+
|
23 |
+
jieba_fast.setLogLevel(logging.CRITICAL)
|
24 |
+
import jieba_fast.posseg as psg
|
25 |
+
|
26 |
+
# is_g2pw_str = os.environ.get("is_g2pw", "True")##默认开启
|
27 |
+
# is_g2pw = False#True if is_g2pw_str.lower() == 'true' else False
|
28 |
+
is_g2pw = True # True if is_g2pw_str.lower() == 'true' else False
|
29 |
+
if is_g2pw:
|
30 |
+
# print("当前使用g2pw进行拼音推理")
|
31 |
+
from text.g2pw import G2PWPinyin, correct_pronunciation
|
32 |
+
|
33 |
+
parent_directory = os.path.dirname(current_file_path)
|
34 |
+
g2pw = G2PWPinyin(
|
35 |
+
model_dir="text/G2PWModel",
|
36 |
+
model_source=os.environ.get("bert_path", "pretrained_models/chinese-roberta-wwm-ext-large"),
|
37 |
+
v_to_u=False,
|
38 |
+
neutral_tone_with_five=True,
|
39 |
+
)
|
40 |
+
|
41 |
+
# Punctuation folded onto the small set of marks the phone table knows.
# Full-width keys are spelled as \uXXXX escapes so they survive tools that
# width-fold source text; folding previously left identity entries
# ("!" -> "!") and a duplicated "~" key in this table.
rep_map = {
    ":": ",",
    "\uff1a": ",",  # fullwidth colon
    ";": ",",
    "\uff1b": ",",  # fullwidth semicolon
    "\uff0c": ",",  # fullwidth comma
    "。": ".",
    "\uff01": "!",  # fullwidth exclamation mark
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "/": ",",
    "—": "-",
    "~": "…",
    "\uff5e": "…",  # fullwidth tilde
}
|
58 |
+
|
59 |
+
tone_modifier = ToneSandhi()
|
60 |
+
|
61 |
+
|
62 |
+
def replace_punctuation(text):
    """Map punctuation through ``rep_map`` and drop every other non-Han character.

    Two interjection characters are first swapped for common homophones. Every
    ``rep_map`` key found in the text is then replaced by its value, and
    finally anything that is neither in the range U+4E00-U+9FA5 nor listed in
    ``punctuation`` is removed.
    """
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile("|".join(re.escape(p) for p in rep_map))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    # Escape each mark before placing it in the character class, as
    # replace_consecutive_punctuation does; a "-", "]" or "^" in the list
    # would otherwise change the meaning of the class.
    allowed_marks = "".join(re.escape(p) for p in punctuation)
    replaced_text = re.sub(r"[^\u4e00-\u9fa5" + allowed_marks + r"]+", "", replaced_text)

    return replaced_text
|
71 |
+
|
72 |
+
|
73 |
+
def g2p(text):
    """Split *text* after each punctuation mark and convert it to phones.

    Whitespace following a mark is consumed by the split and empty pieces are
    discarded before the segments are handed to ``_g2p``.

    Returns:
        The ``(phones, word2ph)`` pair produced by ``_g2p``.
    """
    # Escape each mark so the character class stays valid whatever the list
    # contains (same approach as replace_consecutive_punctuation).
    marks = "".join(re.escape(p) for p in punctuation)
    pattern = r"(?<=[{0}])\s*".format(marks)
    sentences = [piece for piece in re.split(pattern, text) if piece.strip() != ""]
    phones, word2ph = _g2p(sentences)
    return phones, word2ph
|
78 |
+
|
79 |
+
|
80 |
+
def _get_initials_finals(word):
    """Return parallel lists of pypinyin initials and tone-numbered finals for *word*.

    Both lists are cut to the length of the shorter pypinyin result.
    """
    pinyin_options = {"neutral_tone_with_five": True}
    paired = list(
        zip(
            lazy_pinyin(word, style=Style.INITIALS, **pinyin_options),
            lazy_pinyin(word, style=Style.FINALS_TONE3, **pinyin_options),
        )
    )
    initials = [initial for initial, _ in paired]
    finals = [final for _, final in paired]
    return initials, finals
|
91 |
+
|
92 |
+
|
93 |
+
# Words that always get erhua treatment, overriding the exclusions below.
must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"}
# Words ending in "儿" that are excluded from erhua treatment.
not_erhua = {
    "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿",
    "我儿", "俺儿", "妻儿", "拐儿", "聋儿", "乞儿", "患儿", "幼儿",
    "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿", "流浪儿", "体弱儿", "混血儿",
    "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿", "孙儿",
    "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿",
    "猪儿", "猫儿", "狗儿", "少儿",
}


def _merge_erhua(initials: list[str], finals: list[str], word: str, pos: str) -> tuple[list[str], list[str]]:
    """Apply erhua tone handling to a trailing "儿" in *word*.

    Args:
        initials: per-character initials of *word*.
        finals: per-character finals with a trailing tone digit. A trailing
            "er1" is rewritten to "er2" in place.
        word: the word the readings belong to.
        pos: part-of-speech tag of *word*.

    Returns:
        ``(initials, finals)``. The input lists are returned unchanged when the
        word is excluded (listed in ``not_erhua`` or tagged "a", "j" or "nr",
        unless listed in ``must_erhua``) or when the number of finals differs
        from the number of characters. Otherwise new lists are returned in
        which a trailing "er2"/"er5" takes the tone digit of the preceding
        final.
    """
    last = len(finals) - 1
    # Bounds-checked so that more finals than characters no longer raises.
    ends_with_er = 0 <= last < len(word) and word[last] == "儿"

    # Normalise a first-tone reading of the trailing "儿".
    if ends_with_er and finals[last] == "er1":
        finals[last] = "er2"

    if word not in must_erhua and (word in not_erhua or pos in {"a", "j", "nr"}):
        return initials, finals

    # Inputs such as "……" yield fewer readings than characters; leave them alone.
    if len(finals) != len(word):
        return initials, finals

    new_initials = list(initials[: len(finals)])
    new_finals = list(finals)
    if (
        last >= 1  # there must be a preceding syllable to borrow the tone from
        and ends_with_er
        and finals[last] in {"er2", "er5"}
        and word[-2:] not in not_erhua
    ):
        new_finals[last] = "er" + finals[last - 1][-1]

    return new_initials, new_finals
|
178 |
+
|
179 |
+
|
180 |
+
def _g2p(segments):
    """Turn text segments into a flat phone list and per-unit phone counts.

    Each segment has Latin letters stripped, is cut into ``(word, pos)`` pairs
    with jieba, and receives one pinyin reading per character: from g2pw on
    the whole segment when ``is_g2pw`` is true, otherwise from pypinyin word
    by word. Tone sandhi and erhua handling are applied per word, then every
    syllable is looked up in ``pinyin_to_symbol_map``.

    Args:
        segments: iterable of strings to convert.

    Returns:
        ``(phones_list, word2ph)``: all phones in order, and for every emitted
        unit the number of phones it contributed (1 for a punctuation mark,
        2 for a syllable).

    Raises:
        AssertionError: for a non-pinyin unit that is not in ``punctuation``,
            a tone outside 1-5, or a syllable missing from
            ``pinyin_to_symbol_map``.
    """
    # Finals that are spelled in full but abbreviated after an initial.
    v_rep_map = {"uei": "ui", "iou": "iu", "uen": "un"}
    # Zero-initial syllables rewritten as a whole.
    pinyin_rep_map = {"ing": "ying", "i": "yi", "in": "yin", "u": "wu"}
    # Zero-initial syllables rewritten by their first letter only.
    single_rep_map = {"v": "yu", "e": "e", "i": "y", "u": "w"}

    phones_list = []
    word2ph = []
    for seg in segments:
        # Remove all English words from the sentence.
        seg = re.sub("[a-zA-Z]+", "", seg)
        seg_cut = psg.lcut(seg)
        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
        initials = []
        finals = []

        if not is_g2pw:
            for word, pos in seg_cut:
                if pos == "eng":
                    continue
                sub_initials, sub_finals = _get_initials_finals(word)
                sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
                # erhua
                sub_initials, sub_finals = _merge_erhua(sub_initials, sub_finals, word, pos)
                # Extend directly instead of collecting sub-lists and
                # flattening them with sum(..., []), which is quadratic.
                initials.extend(sub_initials)
                finals.extend(sub_finals)
        else:
            # g2pw infers readings for the whole sentence at once.
            pinyins = g2pw.lazy_pinyin(seg, neutral_tone_with_five=True, style=Style.TONE3)

            pre_word_length = 0
            for word, pos in seg_cut:
                sub_initials = []
                sub_finals = []
                now_word_length = pre_word_length + len(word)

                if pos == "eng":
                    pre_word_length = now_word_length
                    continue

                # Readings are aligned to words by character offset.
                word_pinyins = pinyins[pre_word_length:now_word_length]

                # Polyphone disambiguation.
                word_pinyins = correct_pronunciation(word, word_pinyins)

                for pinyin in word_pinyins:
                    if pinyin[0].isalpha():
                        sub_initials.append(to_initials(pinyin))
                        sub_finals.append(to_finals_tone3(pinyin, neutral_tone_with_five=True))
                    else:
                        # Non-pinyin items go into both lists unchanged.
                        sub_initials.append(pinyin)
                        sub_finals.append(pinyin)

                pre_word_length = now_word_length
                sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
                # erhua
                sub_initials, sub_finals = _merge_erhua(sub_initials, sub_finals, word, pos)
                initials.extend(sub_initials)
                finals.extend(sub_finals)

        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c == v:
                assert c in punctuation
                phone = [c]
                word2ph.append(1)
            else:
                v_without_tone = v[:-1]
                tone = v[-1]

                pinyin = c + v_without_tone
                assert tone in "12345"

                if c:
                    # Syllable with an initial.
                    if v_without_tone in v_rep_map:
                        pinyin = c + v_rep_map[v_without_tone]
                elif pinyin in pinyin_rep_map:
                    # Zero-initial syllable with a whole-syllable rewrite.
                    pinyin = pinyin_rep_map[pinyin]
                elif pinyin[0] in single_rep_map:
                    pinyin = single_rep_map[pinyin[0]] + pinyin[1:]

                assert pinyin in pinyin_to_symbol_map, (pinyin, seg, raw_pinyin)
                new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
                phone = [new_c, new_v + tone]
                word2ph.append(len(phone))

            phones_list += phone
    return phones_list, word2ph
|
296 |
+
|
297 |
+
|
298 |
+
def replace_punctuation_with_en(text):
    """Map punctuation through ``rep_map`` while keeping ASCII letters.

    Behaves like ``replace_punctuation`` except that ``A-Z`` and ``a-z``
    also survive the final filter.
    """
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile("|".join(re.escape(p) for p in rep_map))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    # Escape each mark before placing it in the character class, as
    # replace_consecutive_punctuation does.
    allowed_marks = "".join(re.escape(p) for p in punctuation)
    replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + allowed_marks + r"]+", "", replaced_text)

    return replaced_text
|
307 |
+
|
308 |
+
|
309 |
+
def replace_consecutive_punctuation(text):
    """Collapse each run of two or more punctuation marks to its first mark."""
    mark_class = "[" + "".join(map(re.escape, punctuation)) + "]"
    run_of_marks = re.compile("(" + mark_class + ")(" + mark_class + ")+")
    return run_of_marks.sub(r"\1", text)
|
314 |
+
|
315 |
+
|
316 |
+
def text_normalize(text):
    """Normalize Chinese text down to Han characters and known punctuation.

    The text is split and normalized by ``TextNormalizer`` (ported from
    https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization),
    each sentence is cleaned with ``replace_punctuation``, and the pieces are
    concatenated in order.
    """
    normalized_sentences = TextNormalizer().normalize(text)
    cleaned = "".join(replace_punctuation(sentence) for sentence in normalized_sentences)
    # Collapse runs of punctuation so repeated marks cannot leak the reference.
    return replace_consecutive_punctuation(cleaned)
|
327 |
+
|
328 |
+
|
329 |
+
# 不排除英文的文本格式化
|
330 |
+
def mix_text_normalize(text):
    """Normalize mixed Chinese/English text, keeping ASCII letters.

    Same pipeline as ``text_normalize`` but each sentence is cleaned with
    ``replace_punctuation_with_en``. Normalization rules come from
    https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    """
    normalized_sentences = TextNormalizer().normalize(text)
    cleaned = "".join(replace_punctuation_with_en(sentence) for sentence in normalized_sentences)
    # Collapse runs of punctuation so repeated marks cannot leak the reference.
    return replace_consecutive_punctuation(cleaned)
|
341 |
+
|
342 |
+
|
343 |
+
if __name__ == "__main__":
    # Manual smoke test. Earlier samples are kept for convenience; only the
    # last one is actually run, exactly as before.
    samples = [
        # The backslash is escaped explicitly: "\游" is an invalid escape
        # sequence and triggers a SyntaxWarning on recent Python versions.
        "啊——但是《原神》是由,米哈\\游自主,研发的一款全.新开放世界.冒险游戏",
        "呣呣呣~就是…大人的鼹鼠党吧?",
        "你好",
    ]
    text = text_normalize(samples[-1])
    print(g2p(text))
|
349 |
+
|
350 |
+
|
351 |
+
# # 示例用法
|
352 |
+
# text = "这是一个示例文本:,你好!这是一个测试..."
|
353 |
+
# print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
|
text/cleaner.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from text import cleaned_text_to_sequence
|
2 |
+
import os
|
3 |
+
# if os.environ.get("version","v1")=="v1":
|
4 |
+
# from text import chinese
|
5 |
+
# from text.symbols import symbols
|
6 |
+
# else:
|
7 |
+
# from text import chinese2 as chinese
|
8 |
+
# from text.symbols2 import symbols
|
9 |
+
|
10 |
+
from text import symbols as symbols_v1
|
11 |
+
from text import symbols2 as symbols_v2
|
12 |
+
|
13 |
+
special = [
|
14 |
+
# ("%", "zh", "SP"),
|
15 |
+
("¥", "zh", "SP2"),
|
16 |
+
("^", "zh", "SP3"),
|
17 |
+
# ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧
|
18 |
+
]
|
19 |
+
|
20 |
+
|
21 |
+
def clean_text(text, language, version=None):
    """Normalize *text* and convert it to phones for *language*.

    Args:
        text: input text.
        language: language code. A code the selected version does not support
            is treated as English with the text replaced by a single space.
        version: "v1" selects the v1 symbol table and modules; anything else
            selects v2. Defaults to the ``version`` environment variable,
            then "v2".

    Returns:
        ``(phones, word2ph, norm_text)``. ``word2ph`` is ``None`` except for
        "zh" and "yue". Phones missing from the symbol table become "UNK".
        When the text contains one of the ``special`` markers for its
        language, the result of ``clean_special`` is returned instead.

    Raises:
        AssertionError: if the zh/yue phone counts do not match the
            normalized text.
    """
    import importlib

    if version is None:
        version = os.environ.get("version", "v2")
    if version == "v1":
        symbols = symbols_v1.symbols
        language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
    else:
        symbols = symbols_v2.symbols
        language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}

    if language not in language_module_map:
        language = "en"
        text = " "
    for special_s, special_l, target_symbol in special:
        if special_s in text and language == special_l:
            return clean_special(text, language, special_s, target_symbol, version)

    # import_module is the documented way to import by name and returns the
    # submodule directly, like __import__ with a fromlist did.
    language_module = importlib.import_module("text." + language_module_map[language])
    if hasattr(language_module, "text_normalize"):
        norm_text = language_module.text_normalize(text)
    else:
        norm_text = text

    if language in ("zh", "yue"):
        phones, word2ph = language_module.g2p(norm_text)
        assert len(phones) == sum(word2ph)
        assert len(norm_text) == len(word2ph)
    else:
        phones = language_module.g2p(norm_text)
        if language == "en" and len(phones) < 4:
            # Very short English outputs are padded with a leading comma.
            phones = [","] + phones
        word2ph = None

    # Build the lookup once instead of scanning the symbol list per phone.
    known_symbols = set(symbols)
    phones = [ph if ph in known_symbols else "UNK" for ph in phones]
    return phones, word2ph, norm_text
|
56 |
+
|
57 |
+
|
58 |
+
def clean_special(text, language, special_s, target_symbol, version=None):
    """Handle a special silence-marker symbol in *text*.

    *special_s* is replaced by "," before normalization and G2P; afterwards
    every "," phone is replaced by *target_symbol*, including those that come
    from commas already present in the text.

    Args:
        text: input text containing *special_s*.
        language: language code; must be supported by the selected version.
        special_s: the marker character to handle.
        target_symbol: phone symbol emitted in place of ",".
        version: "v1" or anything else for v2. Defaults to the ``version``
            environment variable, then "v2".

    Returns:
        ``(phones, word2ph, norm_text)``.

    Raises:
        AssertionError: if a produced phone is not in the symbol table.
    """
    import importlib

    if version is None:
        version = os.environ.get("version", "v2")
    if version == "v1":
        symbols = symbols_v1.symbols
        language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
    else:
        symbols = symbols_v2.symbols
        language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}

    text = text.replace(special_s, ",")
    language_module = importlib.import_module("text." + language_module_map[language])
    norm_text = language_module.text_normalize(text)
    phones = language_module.g2p(norm_text)
    new_ph = []
    for ph in phones[0]:
        assert ph in symbols
        new_ph.append(target_symbol if ph == "," else ph)
    return new_ph, phones[1], norm_text
|
83 |
+
|
84 |
+
|
85 |
+
def text_to_sequence(text, language, version=None):
    """Clean *text* and convert the resulting phones to a symbol-id sequence.

    Args:
        text: input text.
        language: language code forwarded to ``clean_text``.
        version: used only when the ``version`` environment variable is not
            set; "v2" when neither is given.

    Returns:
        Whatever ``cleaned_text_to_sequence`` returns for the phones.
    """
    version = os.environ.get("version", version)
    if version is None:
        version = "v2"
    # clean_text requires the language and returns (phones, word2ph,
    # norm_text); previously it was called without the language and its whole
    # result was passed on as if it were the phone list.
    phones, _word2ph, _norm_text = clean_text(text, language, version)
    return cleaned_text_to_sequence(phones, version)
|
91 |
+
|
92 |
+
|
93 |
+
if __name__ == "__main__":
|
94 |
+
print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh"))
|
text/cmudict-fast.rep
ADDED
The diff for this file is too large to render.
See raw diff
|
|
text/cmudict.rep
ADDED
The diff for this file is too large to render.
See raw diff
|
|
text/cmudict_cache.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
|
3 |
+
size 6212655
|
text/en_normalization/expend.py
ADDED
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# by https://github.com/Cosmo-klara
|
2 |
+
|
3 |
+
from __future__ import print_function
|
4 |
+
|
5 |
+
import re
|
6 |
+
import inflect
|
7 |
+
import unicodedata
|
8 |
+
|
9 |
+
# 后缀计量单位替换表
|
10 |
+
# Unit suffix -> [singular, plural] spoken form.
measurement_map = {
    "m": ["meter", "meters"],
    "km": ["kilometer", "kilometers"],
    "km/h": ["kilometer per hour", "kilometers per hour"],
    "ft": ["foot", "feet"],  # singular was "feet", so "1ft" read as "1 feet"
    "L": ["liter", "liters"],
    "tbsp": ["tablespoon", "tablespoons"],
    "tsp": ["teaspoon", "teaspoons"],
    "h": ["hour", "hours"],
    "min": ["minute", "minutes"],
    "s": ["second", "seconds"],
    "°C": ["degree celsius", "degrees celsius"],
    "°F": ["degree fahrenheit", "degrees fahrenheit"],
}
|
24 |
+
|
25 |
+
|
26 |
+
# 识别 12,000 类型
|
27 |
+
_inflect = inflect.engine()
|
28 |
+
|
29 |
+
# 转化数字序数词
|
30 |
+
_ordinal_number_re = re.compile(r"\b([0-9]+)\. ")

# Numbers with thousands separators such as "12,000".
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")

# Clock times such as "13:30".
_time_re = re.compile(r"\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b")

# A number followed by a unit suffix. Longer units are listed before the
# units they start with: with "km" tried first, "60km/h" matched only "60km"
# (a word boundary sits between "m" and "/") and "/h" was left behind.
_measurement_re = re.compile(r"\b([0-9]+(\.[0-9]+)?(km/h|km|min|m|ft|L|tbsp|tsp|h|s|°C|°F))\b")

# Pound sign before or after the amount. (A single pattern accepting either
# side was attempted by the original author but did not work.)
_pounds_re_start = re.compile(r"£([0-9\.\,]*[0-9]+)")
_pounds_re_end = re.compile(r"([0-9\.\,]*[0-9]+)£")

# Dollar sign before or after the amount. The trailing form used to carry a
# stray "(" inside its character class, so "(32$" captured "(32".
_dollars_re_start = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_dollars_re_end = re.compile(r"([0-9\.\,]*[0-9]+)\$")

# Decimal numbers.
_decimal_number_re = re.compile(r"([0-9]+\.\s*[0-9]+)")

# Fractions written as "3/4".
_fraction_re = re.compile(r"([0-9]+/[0-9]+)")

# Ordinals such as "1st" or "22nd".
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")

# Any remaining run of digits.
_number_re = re.compile(r"[0-9]+")
|
61 |
+
|
62 |
+
|
63 |
+
def _convert_ordinal(m):
    """Rewrite a numbered-list marker as an ordinal followed by a comma.

    Examples:
        input: "1. "
        output: "1st, "

    The result is later expanded to a word such as "first" by
    ``_expand_ordinal``.
    """
    return f"{_inflect.ordinal(m.group(1))}, "
|
73 |
+
|
74 |
+
|
75 |
+
def _remove_commas(m):
    """Return the first captured group of *m* with every comma removed."""
    captured = m.group(1)
    return "".join(captured.split(","))
|
77 |
+
|
78 |
+
|
79 |
+
def _expand_time(m):
    """Convert a matched 24-hour time to a spoken 12-hour form.

    Examples:
        input: "13:00 / 4:00 / 13:30 / 0:30 / 4:05"
        output: "one o'clock p.m. / four o'clock a.m. / one thirty p.m. /
                 twelve thirty a.m. / four oh five a.m."

    Args:
        m: match whose groups 1 and 2 hold the hour and minute digits.
    """
    hours, minutes = map(int, m.group(1, 2))
    period = "a.m." if hours < 12 else "p.m."
    if hours == 0:
        # The hour after midnight is spoken as "twelve", not "zero".
        hours = 12
    elif hours > 12:
        hours -= 12

    hour_word = _inflect.number_to_words(hours)
    if minutes == 0:
        return f"{hour_word} o'clock {period}"

    minute_word = _inflect.number_to_words(minutes)
    if minutes < 10:
        # "4:05" is read "four oh five"; without this it became "four five".
        minute_word = "oh " + minute_word
    return f"{hour_word} {minute_word} {period}"
|
99 |
+
|
100 |
+
|
101 |
+
def _expand_measurement(m):
    """Replace a unit suffix with its spoken name.

    Supported suffixes: m, km, km/h, ft, L, tbsp, tsp, h, min, s, °C, °F.
    To add one, extend both ``_measurement_re`` and ``measurement_map``.

    The singular name is used only for a bare integer 1. Any value with a
    decimal part, "0.1" included, takes the plural.
    """
    unit = m.group(3)
    matched = m.group(1)
    # The number is parsed with its dot removed; only "is it exactly 1"
    # matters, and the decimal check below covers values like "0.1".
    digits_only = int(matched.replace(unit, "").replace(".", ""))
    has_decimal_part = m.group(2) is not None
    form_index = 1 if has_decimal_part or digits_only != 1 else 0
    return matched.replace(unit, " " + measurement_map[unit][form_index])
|
115 |
+
|
116 |
+
|
117 |
+
def _expand_currency(raw, major, minor):
    """Spell out a captured money amount with the given unit names.

    Args:
        raw: captured amount text, e.g. "32.3" or "1,000".
        major: ``(singular, plural)`` name of the main unit.
        minor: ``(singular, plural)`` name of the 1/100 unit.

    Returns:
        A phrase such as "32 dollars and 30 cents". An amount with more than
        one dot is returned unchanged followed by the plural main unit.
    """
    parts = raw.split(".")
    if len(parts) > 2:
        return raw + " " + major[1]  # Unexpected format
    # Drop thousands separators; int() would otherwise raise on "1,5".
    whole_text = parts[0].replace(",", "")
    frac_text = parts[1].replace(",", "") if len(parts) > 1 else ""
    whole = int(whole_text) if whole_text else 0
    # One fractional digit means tenths, so pad to two digits ("3" -> 30).
    frac = int(frac_text.ljust(2, "0")) if frac_text else 0

    pieces = []
    if whole:
        pieces.append("%s %s" % (whole, major[0] if whole == 1 else major[1]))
    if frac:
        pieces.append("%s %s" % (frac, minor[0] if frac == 1 else minor[1]))
    if not pieces:
        return "zero " + major[1]
    return " and ".join(pieces)


def _expand_pounds(m):
    """Spell out a pound amount; handled the same way as dollars.

    Example:
        input: "1.1£ / £7.14"
        output: "1 pound and 10 pence" / "7 pounds and 14 pence"
    """
    return _expand_currency(m.group(1), ("pound", "pounds"), ("penny", "pence"))


def _expand_dollars(m):
    """Spell out a dollar amount, padding the cents to two digits.

    Example:
        input: "32.3$ / $6.24"
        output: "32 dollars and 30 cents" / "6 dollars and 24 cents"

    The digits are spelled out later by the number pass of ``normalize``.
    """
    return _expand_currency(m.group(1), ("dollar", "dollars"), ("cent", "cents"))
|
166 |
+
|
167 |
+
|
168 |
+
# 小数的处理
|
169 |
+
def _expand_decimal_number(m):
    """Read a decimal number digit by digit after the point.

    Example:
        input: "13.234"
        output: "13 point 2 3 4"

    The digits are spelled out later by the number pass of ``normalize``.
    Whitespace that the pattern allows after the dot is dropped instead of
    being emitted as extra separators.
    """
    parts = m.group(1).split(".")
    # The split already removed the dot, so only digits and possibly
    # whitespace remain in the fractional part.
    digits = [char for char in parts[1] if not char.isspace()]
    return parts[0] + " point " + " ".join(digits)
|
185 |
+
|
186 |
+
|
187 |
+
# 分数的处理
|
188 |
+
def _expend_fraction(m):
    """Spell out a fraction written as "numerator/denominator".

    Rule 1: the numerator is read as a cardinal, the denominator as an ordinal.
    Rule 2: when the numerator is greater than 1 the denominator is pluralised.
    Rule 3: a denominator of 2 is read "half", or "halves" when the numerator
            is not 1.
    A denominator of 1 yields just the numerator.

    Examples:

    | Written | Said |
    |:---:|:---:|
    | 1/3 | one third |
    | 3/4 | three fourths |
    | 5/6 | five sixths |
    | 1/2 | one half |
    | 3/2 | three halves |
    """
    numerator, denominator = (int(piece) for piece in m.group(0).split("/"))
    numerator_part = _inflect.number_to_words(numerator)

    if denominator == 1:
        return f"{numerator_part}"

    if denominator == 2:
        denominator_part = "half" if numerator == 1 else "halves"
    else:
        denominator_part = _inflect.ordinal(_inflect.number_to_words(denominator))
        if numerator > 1:
            denominator_part += "s"

    return f"{numerator_part} {denominator_part}"
|
220 |
+
|
221 |
+
|
222 |
+
def _expand_ordinal(m):
    """Pass the whole matched ordinal text (e.g. "22nd") to inflect's ``number_to_words``."""
    ordinal_text = m.group(0)
    return _inflect.number_to_words(ordinal_text)
|
224 |
+
|
225 |
+
|
226 |
+
def _expand_number(m):
    """Spell out a matched run of digits.

    Values strictly between 1000 and 3000 get special handling: 2000 and
    2001-2009 are read "two thousand [n]", multiples of 100 as "<n> hundred",
    and the rest via inflect in two-digit groups with "oh" for zero.
    Everything else is read as an ordinary cardinal without "and".
    """
    value = int(m.group(0))
    if not 1000 < value < 3000:
        return _inflect.number_to_words(value, andword="")
    if value == 2000:
        return "two thousand"
    if 2000 < value < 2010:
        return "two thousand " + _inflect.number_to_words(value % 100)
    if value % 100 == 0:
        return _inflect.number_to_words(value // 100) + " hundred"
    grouped = _inflect.number_to_words(value, andword="", zero="oh", group=2)
    return grouped.replace(", ", " ")
|
239 |
+
|
240 |
+
|
241 |
+
def normalize(text):
    """Expand numbers, units, money and times in English text into words.

    Every step assumes well-formed input. A new rule can be added by
    supplying a regular expression and a matching handler function; the
    substitutions run in the order written, and later ones rely on what
    earlier ones produced.

    After the expansions, accents are stripped, "%" becomes " percent", every
    character other than ASCII letters, space and ``' . , ? ! -`` is removed,
    "i.e." and "e.g." are spelled out, and a space is inserted before each
    capital letter that is neither at the start of the text nor preceded by
    whitespace.
    """

    text = re.sub(_ordinal_number_re, _convert_ordinal, text)
    # Any hyphen that is not between two digits is read as "minus".
    text = re.sub(r"(?<!\d)-|-(?!\d)", " minus ", text)
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_time_re, _expand_time, text)
    text = re.sub(_measurement_re, _expand_measurement, text)
    text = re.sub(_pounds_re_start, _expand_pounds, text)
    text = re.sub(_pounds_re_end, _expand_pounds, text)
    text = re.sub(_dollars_re_start, _expand_dollars, text)
    text = re.sub(_dollars_re_end, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_number, text)
    text = re.sub(_fraction_re, _expend_fraction, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)

    text = "".join(
        char for char in unicodedata.normalize("NFD", text) if unicodedata.category(char) != "Mn"
    )  # Strip accents

    text = re.sub("%", " percent", text)
    # Raw string: "\-" in a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    text = re.sub(r"[^ A-Za-z'.,?!\-]", "", text)
    text = re.sub(r"(?i)i\.e\.", "that is", text)
    text = re.sub(r"(?i)e\.g\.", "for example", text)
    # Split runs of capitals so they are read letter by letter.
    text = re.sub(r"(?<!^)(?<![\s])([A-Z])", r" \1", text)
    return text
|
272 |
+
|
273 |
+
|
274 |
+
if __name__ == "__main__":
    # Idea from the original author: show the normalized text to the user
    # (read-only, or editable without affecting what is sent to TTS) and ask
    # for confirmation first, so non-standard input can be spotted.
    demo_inputs = (
        "1. test ordinal number 1st",
        "32.3$, $6.24, 1.1£, £7.14.",
        "3/23, 1/2, 3/2, 1/3, 6/1",
        "1st, 22nd",
        "a test 20h, 1.2s, 1L, 0.1km",
        "a test of time 4:00, 13:00, 13:30",
        "a test of temperature 4°F, 23°C, -19°C",
    )
    for demo_input in demo_inputs:
        print(normalize(demo_input))
|
text/engdict-hot.rep
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
CHATGPT CH AE1 T JH IY1 P IY1 T IY1
|
2 |
+
JSON JH EY1 S AH0 N
|
3 |
+
CONDA K AA1 N D AH0
|
text/engdict_cache.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9bff9393f4b192d873a11335efc8f124771087b6dc847d34fd240c2846889d2b
|
3 |
+
size 5965909
|
text/english.py
ADDED
@@ -0,0 +1,374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
import wordsegment
|
5 |
+
from g2p_en import G2p
|
6 |
+
|
7 |
+
from text.symbols import punctuation
|
8 |
+
|
9 |
+
from text.symbols2 import symbols
|
10 |
+
|
11 |
+
from builtins import str as unicode
|
12 |
+
from text.en_normalization.expend import normalize
|
13 |
+
from nltk.tokenize import TweetTokenizer
|
14 |
+
|
15 |
+
word_tokenize = TweetTokenizer().tokenize
|
16 |
+
from nltk import pos_tag
|
17 |
+
|
18 |
+
current_file_path = os.path.dirname(__file__)
|
19 |
+
CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
|
20 |
+
CMU_DICT_FAST_PATH = os.path.join(current_file_path, "cmudict-fast.rep")
|
21 |
+
CMU_DICT_HOT_PATH = os.path.join(current_file_path, "engdict-hot.rep")
|
22 |
+
CACHE_PATH = os.path.join(current_file_path, "engdict_cache.pickle")
|
23 |
+
NAMECACHE_PATH = os.path.join(current_file_path, "namedict_cache.pickle")
|
24 |
+
|
25 |
+
|
26 |
+
# 适配中文及 g2p_en 标点
|
27 |
+
rep_map = {
|
28 |
+
"[;::,;]": ",",
|
29 |
+
'["’]': "'",
|
30 |
+
"。": ".",
|
31 |
+
"!": "!",
|
32 |
+
"?": "?",
|
33 |
+
}
|
34 |
+
|
35 |
+
|
36 |
+
arpa = {
|
37 |
+
"AH0",
|
38 |
+
"S",
|
39 |
+
"AH1",
|
40 |
+
"EY2",
|
41 |
+
"AE2",
|
42 |
+
"EH0",
|
43 |
+
"OW2",
|
44 |
+
"UH0",
|
45 |
+
"NG",
|
46 |
+
"B",
|
47 |
+
"G",
|
48 |
+
"AY0",
|
49 |
+
"M",
|
50 |
+
"AA0",
|
51 |
+
"F",
|
52 |
+
"AO0",
|
53 |
+
"ER2",
|
54 |
+
"UH1",
|
55 |
+
"IY1",
|
56 |
+
"AH2",
|
57 |
+
"DH",
|
58 |
+
"IY0",
|
59 |
+
"EY1",
|
60 |
+
"IH0",
|
61 |
+
"K",
|
62 |
+
"N",
|
63 |
+
"W",
|
64 |
+
"IY2",
|
65 |
+
"T",
|
66 |
+
"AA1",
|
67 |
+
"ER1",
|
68 |
+
"EH2",
|
69 |
+
"OY0",
|
70 |
+
"UH2",
|
71 |
+
"UW1",
|
72 |
+
"Z",
|
73 |
+
"AW2",
|
74 |
+
"AW1",
|
75 |
+
"V",
|
76 |
+
"UW2",
|
77 |
+
"AA2",
|
78 |
+
"ER",
|
79 |
+
"AW0",
|
80 |
+
"UW0",
|
81 |
+
"R",
|
82 |
+
"OW1",
|
83 |
+
"EH1",
|
84 |
+
"ZH",
|
85 |
+
"AE0",
|
86 |
+
"IH2",
|
87 |
+
"IH",
|
88 |
+
"Y",
|
89 |
+
"JH",
|
90 |
+
"P",
|
91 |
+
"AY1",
|
92 |
+
"EY0",
|
93 |
+
"OY2",
|
94 |
+
"TH",
|
95 |
+
"HH",
|
96 |
+
"D",
|
97 |
+
"ER0",
|
98 |
+
"CH",
|
99 |
+
"AO1",
|
100 |
+
"AE1",
|
101 |
+
"AO2",
|
102 |
+
"OY1",
|
103 |
+
"AY2",
|
104 |
+
"IH1",
|
105 |
+
"OW0",
|
106 |
+
"L",
|
107 |
+
"SH",
|
108 |
+
}
|
109 |
+
|
110 |
+
|
111 |
+
def replace_phs(phs):
    """Keep only the phonemes present in the imported ``symbols`` collection.

    A phoneme found in ``symbols`` is kept unchanged, an apostrophe is
    replaced by ``"-"``, and anything else is reported on stdout and dropped.

    Args:
        phs: iterable of phoneme strings.

    Returns:
        A new list holding the accepted (or substituted) phonemes in order.
    """
    substitutions = {"'": "-"}
    kept = []
    for phoneme in phs:
        if phoneme in symbols:
            kept.append(phoneme)
            continue
        if phoneme in substitutions:
            kept.append(substitutions[phoneme])
            continue
        # Unknown entries are logged and left out of the result.
        print("ph not in symbols: ", phoneme)
    return kept
|
122 |
+
|
123 |
+
|
124 |
+
def replace_consecutive_punctuation(text):
    """Collapse a run of punctuation down to its first character.

    Whenever a punctuation mark or whitespace character is immediately
    followed by one or more punctuation marks, the trailing marks are removed
    and only the leading character is kept.

    Args:
        text: input string.

    Returns:
        The string with the trailing punctuation of each run removed.
    """
    punctuations = "".join(re.escape(p) for p in punctuation)
    # Raw f-string: in a non-raw literal "\s" is an invalid escape sequence,
    # which emits a warning on current Python versions. The resulting pattern
    # text is unchanged.
    pattern = rf"([{punctuations}\s])([{punctuations}])+"
    return re.sub(pattern, r"\1", text)
|
129 |
+
|
130 |
+
|
131 |
+
def read_dict():
    """Parse the dictionary file at ``CMU_DICT_PATH`` into a pronunciation map.

    The first 48 lines are skipped. For every later line the first
    space-separated field is the (lower-cased) word; the second field is split
    on ``" - "`` into syllables, and each syllable is split on spaces into
    phones.

    Returns:
        dict mapping each word to a list of syllables, each a list of phones.
    """
    g2p_dict = {}
    start_line = 49
    with open(CMU_DICT_PATH) as f:
        for line_index, line in enumerate(f, start=1):
            if line_index < start_line:
                continue
            word_split = line.strip().split(" ")
            # Blank lines or lines without a pronunciation field used to
            # raise IndexError below; skip them instead.
            if len(word_split) < 2:
                continue
            word = word_split[0].lower()
            g2p_dict[word] = [syllable.split(" ") for syllable in word_split[1].split(" - ")]

    return g2p_dict
|
153 |
+
|
154 |
+
|
155 |
+
def read_dict_new():
    """Build the pronunciation map from the main and the "fast" dictionary files.

    The file at ``CMU_DICT_PATH`` is read from line 57 onwards; each entry maps
    the lower-cased first field to a single pronunciation taken from the
    second field. The file at ``CMU_DICT_FAST_PATH`` is then read in full and
    only contributes words that are not already present, using every field
    after the word as the phone list.

    Returns:
        dict mapping each word to a one-element list holding its phone list.
    """
    g2p_dict = {}
    with open(CMU_DICT_PATH) as f:
        for line_index, line in enumerate(f, start=1):
            if line_index < 57:
                continue
            word_split = line.strip().split(" ")
            # Blank or field-less lines used to raise IndexError; skip them.
            if len(word_split) < 2:
                continue
            g2p_dict[word_split[0].lower()] = [word_split[1].split(" ")]

    with open(CMU_DICT_FAST_PATH) as f:
        for line in f:
            word_split = line.strip().split(" ")
            word = word_split[0].lower()
            # A blank line would otherwise register the empty string as a word.
            if not word:
                continue
            if word not in g2p_dict:
                g2p_dict[word] = [word_split[1:]]

    return g2p_dict
|
185 |
+
|
186 |
+
|
187 |
+
def hot_reload_hot(g2p_dict):
    """Overlay the entries of the file at ``CMU_DICT_HOT_PATH`` onto ``g2p_dict``.

    Each non-blank line contributes one entry: the lower-cased first field is
    the word and the remaining fields are its phones. Entries from this file
    replace any existing pronunciation for the same word.

    Args:
        g2p_dict: pronunciation map to update in place.

    Returns:
        The same ``g2p_dict`` object, after the overrides were applied.
    """
    with open(CMU_DICT_HOT_PATH) as f:
        for line in f:
            word_split = line.strip().split(" ")
            word = word_split[0].lower()
            # A blank line would otherwise create or overwrite the "" entry.
            if not word:
                continue
            # Custom pronunciations override the dictionary directly.
            g2p_dict[word] = [word_split[1:]]

    return g2p_dict
|
203 |
+
|
204 |
+
|
205 |
+
def cache_dict(g2p_dict, file_path):
    """Serialize ``g2p_dict`` with pickle into the file at ``file_path``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(file_path, "wb") as sink:
        pickle.dump(g2p_dict, sink)
|
208 |
+
|
209 |
+
|
210 |
+
def get_dict():
    """Return the pronunciation dictionary, using the pickle cache when present.

    If the file at ``CACHE_PATH`` exists it is unpickled; otherwise the
    dictionary is rebuilt with ``read_dict_new`` and written to the cache.
    In both cases the hot-dictionary overrides are applied afterwards.
    """
    cache_available = os.path.exists(CACHE_PATH)
    if not cache_available:
        base_dict = read_dict_new()
        cache_dict(base_dict, CACHE_PATH)
    else:
        with open(CACHE_PATH, "rb") as cache_file:
            base_dict = pickle.load(cache_file)

    # Overrides are applied after the cache step, so they are never cached.
    return hot_reload_hot(base_dict)
|
221 |
+
|
222 |
+
|
223 |
+
def get_namedict():
    """Load the pickled name dictionary, or an empty dict when it is absent.

    Returns:
        The object unpickled from ``NAMECACHE_PATH`` if that file exists,
        otherwise ``{}``.
    """
    if not os.path.exists(NAMECACHE_PATH):
        return {}

    with open(NAMECACHE_PATH, "rb") as cache_file:
        return pickle.load(cache_file)
|
231 |
+
|
232 |
+
|
233 |
+
def text_normalize(text):
    """Normalize English text before grapheme-to-phoneme conversion.

    Steps, in order: replace every occurrence of a ``rep_map`` key with its
    value, run the imported ``normalize`` on the result, then collapse
    consecutive punctuation.
    """
    # TODO: English text normalization.

    # Same approach as chinese.py, kept for consistency.
    # NOTE(review): every key is passed through re.escape, so keys are matched
    # as literal strings only - confirm that is intended for keys that are
    # written like character classes.
    escaped_keys = [re.escape(key) for key in rep_map]
    matcher = re.compile("|".join(escaped_keys))

    def _lookup(match):
        return rep_map[match.group()]

    replaced = matcher.sub(_lookup, text)
    normalized = normalize(unicode(replaced))

    # Avoid reference leakage caused by repeated punctuation.
    return replace_consecutive_punctuation(normalized)
|
246 |
+
|
247 |
+
|
248 |
+
class en_G2p(G2p):
    """``G2p`` subclass that prefers dictionary lookups over model prediction.

    Words are resolved, in order, through the loaded pronunciation dictionary,
    the name dictionary (title-cased words only), letter-by-letter spelling
    for short unknown words, possessive splitting, compound-word segmentation,
    and finally the inherited ``predict``.
    """

    def __init__(self):
        super().__init__()
        # Initialise the word segmenter.
        wordsegment.load()

        # Replace the stock dictionary with the extended one and load the
        # name dictionary.
        self.cmu = get_dict()
        self.namedict = get_namedict()

        # Drop a few abbreviations whose dictionary pronunciation is wrong.
        # pop() with a default tolerates dictionaries that lack an entry;
        # `del` raised KeyError in that case.
        for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]:
            self.cmu.pop(word.lower(), None)

        # Homograph corrections.
        self.homograph2features["read"] = (["R", "IY1", "D"], ["R", "EH1", "D"], "VBP")
        self.homograph2features["complex"] = (
            ["K", "AH0", "M", "P", "L", "EH1", "K", "S"],
            ["K", "AA1", "M", "P", "L", "EH0", "K", "S"],
            "JJ",
        )

    def __call__(self, text):
        """Convert ``text`` into a flat phone list with ``" "`` between words."""
        # Tokenization and part-of-speech tagging.
        words = word_tokenize(text)
        tokens = pos_tag(words)  # tuples of (word, tag)

        prons = []
        for o_word, pos in tokens:
            # Mirror g2p_en's lower-casing.
            word = o_word.lower()

            if re.search("[a-z]", word) is None:
                # Tokens without any ASCII letter are passed through as-is.
                pron = [word]
            # Handle single letters first.
            elif len(word) == 1:
                # A stand-alone capital "A" is read as the letter name; the
                # original-case token is needed to detect it.
                if o_word == "A":
                    pron = ["EY1"]
                else:
                    pron = self.cmu[word][0]
            # Homograph handling as in upstream g2p_en.
            elif word in self.homograph2features:
                pron1, pron2, pos1 = self.homograph2features[word]
                if pos.startswith(pos1):
                    pron = pron1
                # pos1 being longer than pos only happens for "read".
                elif len(pos) < len(pos1) and pos == pos1[: len(pos)]:
                    pron = pron1
                else:
                    pron = pron2
            else:
                # Recursive lookup / prediction.
                pron = self.qryword(o_word)

            prons.extend(pron)
            prons.extend([" "])

        # Drop the trailing separator.
        return prons[:-1]

    def qryword(self, o_word):
        """Resolve one word to its phone list.

        Args:
            o_word: the word in its original casing (casing is used to decide
                whether the name dictionary applies).

        Returns:
            List of phone strings.
        """
        word = o_word.lower()

        # Dictionary lookup, except for single letters.
        if len(word) > 1 and word in self.cmu:
            return self.cmu[word][0]

        # Title-cased words are looked up in the name dictionary.
        if o_word.istitle() and word in self.namedict:
            return self.namedict[word][0]

        # Unknown words of length <= 3 are spelled letter by letter.
        if len(word) <= 3:
            phones = []
            for w in word:
                # Letter-name fix for "a"; `word` is lower-case here.
                if w == "a":
                    phones.extend(["EY1"])
                elif w.isalpha() and w in self.cmu:
                    phones.extend(self.cmu[w][0])
                else:
                    # Non-letters, and letters the dictionary does not know
                    # (e.g. accented characters, which used to raise
                    # KeyError), are passed through unchanged.
                    phones.extend([w])
            return phones

        # Try to split off a possessive "'s".
        if re.match(r"^([a-z]+)('s)$", word):
            # Copy so the dictionary's own list is never mutated.
            phones = self.qryword(word[:-2])[:]
            last = phones[-1] if phones else None
            # After voiceless P T K F TH HH the suffix is ['S'].
            if last in ["P", "T", "K", "F", "TH", "HH"]:
                phones.extend(["S"])
            # After sibilants S Z SH ZH CH JH the suffix is ['AH0', 'Z'].
            elif last in ["S", "Z", "SH", "ZH", "CH", "JH"]:
                phones.extend(["AH0", "Z"])
            # After voiced consonants and vowels (and for an empty stem,
            # which used to raise IndexError) the suffix is ['Z'].
            else:
                phones.extend(["Z"])
            return phones

        # Try word segmentation to handle compound words.
        comps = wordsegment.segment(word)

        # Words that cannot be segmented go to the model for prediction.
        if len(comps) == 1:
            return self.predict(word)

        # Otherwise resolve every component recursively.
        return [phone for comp in comps for phone in self.qryword(comp)]
|
358 |
+
|
359 |
+
|
360 |
+
_g2p = en_G2p()
|
361 |
+
|
362 |
+
|
363 |
+
def g2p(text):
    """Convert ``text`` to the phoneme list accepted by ``replace_phs``.

    The whole text is run through the module-level ``_g2p`` converter;
    separator and special tokens are dropped, ``"<unk>"`` becomes ``"UNK"``,
    and the remainder is filtered by ``replace_phs``.
    """
    dropped = (" ", "<pad>", "UW", "</s>", "<s>")
    phones = []
    for ph in _g2p(text):
        if ph in dropped:
            continue
        phones.append("UNK" if ph == "<unk>" else ph)

    return replace_phs(phones)
|
369 |
+
|
370 |
+
|
371 |
+
if __name__ == "__main__":
    # Manual smoke test: one raw word, then two normalized sentences.
    print(g2p("hello"))
    for sample in (
        "e.g. I used openai's AI tool to draw a picture.",
        "In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder.",
    ):
        print(g2p(text_normalize(sample)))
|
text/g2pw/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from text.g2pw.g2pw import *
|
text/g2pw/dataset.py
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
"""
|
15 |
+
Credits
|
16 |
+
This code is modified from https://github.com/GitYCC/g2pW
|
17 |
+
"""
|
18 |
+
|
19 |
+
from typing import Dict
|
20 |
+
from typing import List
|
21 |
+
from typing import Tuple
|
22 |
+
|
23 |
+
import numpy as np
|
24 |
+
|
25 |
+
from .utils import tokenize_and_map
|
26 |
+
|
27 |
+
ANCHOR_CHAR = "▁"
|
28 |
+
|
29 |
+
|
30 |
+
def prepare_onnx_input(
    tokenizer,
    labels: List[str],
    char2phonemes: Dict[str, List[int]],
    chars: List[str],
    texts: List[str],
    query_ids: List[int],
    use_mask: bool = False,
    window_size: int = None,
    max_len: int = 512,
) -> Dict[str, np.ndarray]:
    """Build the model input arrays for a batch of (text, query position) pairs.

    Args:
        tokenizer: object providing ``convert_tokens_to_ids``; also passed to
            ``tokenize_and_map``.
        labels: all phoneme labels; defines the phoneme-mask length.
        char2phonemes: for each character, the indices into ``labels`` that
            are valid for it (used only when ``use_mask`` is true).
        chars: list of query characters; a character's position in this list
            becomes its ``char_ids`` value.
        texts: input texts, lower-cased before tokenization.
        query_ids: for each text, the index of the character being queried.
        use_mask: restrict the phoneme mask to the query character's labels.
        window_size: when truthy, each text is first cut to a window around
            its query character.
        max_len: maximum token count including the ``[CLS]``/``[SEP]`` pair.

    Returns:
        Dict with ``input_ids``, ``token_type_ids``, ``attention_masks``,
        ``char_ids`` and ``position_ids`` as int64 arrays and
        ``phoneme_masks`` as a float32 array. An empty dict is returned as
        soon as one text fails to tokenize.
    """
    # The window was previously computed under `is not None` but only consumed
    # under truthiness; use the truthiness check for both so the effective
    # behaviour is unchanged and no unused work is done.
    if window_size:
        texts, query_ids = _truncate_texts(window_size=window_size, texts=texts, query_ids=query_ids)

    input_ids = []
    token_type_ids = []
    attention_masks = []
    phoneme_masks = []
    char_ids = []
    position_ids = []
    num_labels = len(labels)

    for idx, raw_text in enumerate(texts):
        text = raw_text.lower()
        query_id = query_ids[idx]

        try:
            tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text)
        except Exception:
            print(f'warning: text "{text}" is invalid')
            return {}

        text, query_id, tokens, text2token, token2text = _truncate(
            max_len=max_len, text=text, query_id=query_id, tokens=tokens, text2token=text2token, token2text=token2text
        )

        processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]

        input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
        token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
        attention_mask = list(np.ones((len(processed_tokens),), dtype=int))

        query_char = text[query_id]
        if use_mask:
            # Set lookup instead of scanning the index list once per label.
            allowed = set(char2phonemes[query_char])
            phoneme_mask = [1 if i in allowed else 0 for i in range(num_labels)]
        else:
            phoneme_mask = [1] * num_labels
        char_id = chars.index(query_char)
        position_id = text2token[query_id] + 1  # [CLS] token is located at the first place

        input_ids.append(input_id)
        token_type_ids.append(token_type_id)
        attention_masks.append(attention_mask)
        phoneme_masks.append(phoneme_mask)
        char_ids.append(char_id)
        position_ids.append(position_id)

    outputs = {
        "input_ids": np.array(input_ids).astype(np.int64),
        "token_type_ids": np.array(token_type_ids).astype(np.int64),
        "attention_masks": np.array(attention_masks).astype(np.int64),
        "phoneme_masks": np.array(phoneme_masks).astype(np.float32),
        "char_ids": np.array(char_ids).astype(np.int64),
        "position_ids": np.array(position_ids).astype(np.int64),
    }
    return outputs
|
95 |
+
|
96 |
+
|
97 |
+
def _truncate_texts(window_size: int, texts: List[str], query_ids: List[int]) -> Tuple[List[str], List[int]]:
    """Cut every text to a window of about ``window_size`` characters around its query index.

    The window spans ``window_size // 2`` characters on each side of the query
    position and is clipped to the text bounds.

    Returns:
        ``(windows, new_query_ids)`` where each new query id is the position
        of the queried character inside its window.
    """
    half = window_size // 2
    windows = []
    shifted_ids = []
    for text, query_id in zip(texts, query_ids):
        lo = max(0, query_id - half)
        hi = min(len(text), query_id + half)
        windows.append(text[lo:hi])
        # Re-express the query index relative to the window start.
        shifted_ids.append(query_id - lo)
    return windows, shifted_ids
|
109 |
+
|
110 |
+
|
111 |
+
def _truncate(
    max_len: int, text: str, query_id: int, tokens: List[str], text2token: List[int], token2text: List[Tuple[int]]
):
    """Trim a tokenized text to at most ``max_len - 2`` tokens around the query.

    When the token list already fits, all arguments are returned unchanged.
    Otherwise a token window centred on the query character's token is chosen,
    slid back inside the token list if it overhangs either end, and the text,
    query index and both alignment maps are re-based onto that window.

    Returns:
        Tuple ``(text, query_id, tokens, text2token, token2text)``.
    """
    budget = max_len - 2
    total = len(tokens)
    if total <= budget:
        return (text, query_id, tokens, text2token, token2text)

    first = text2token[query_id] - budget // 2
    last = first + budget
    if first < 0:
        # Window overhangs the front: slide it right.
        first, last = 0, last - first
    elif last > total:
        # Window overhangs the back: slide it left.
        first, last = first - (last - total), total

    # Character span covered by the selected tokens.
    char_lo = token2text[first][0]
    char_hi = token2text[last - 1][1]

    rebased_text2token = []
    for tok_idx in text2token[char_lo:char_hi]:
        rebased_text2token.append(None if tok_idx is None else tok_idx - first)

    rebased_token2text = []
    for span_start, span_end in token2text[first:last]:
        rebased_token2text.append((span_start - char_lo, span_end - char_lo))

    return (
        text[char_lo:char_hi],
        query_id - char_lo,
        tokens[first:last],
        rebased_text2token,
        rebased_token2text,
    )
|
141 |
+
|
142 |
+
|
143 |
+
def get_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[str], Dict[str, List[int]]]:
    """Derive the label list and per-character label indices from (char, phoneme) pairs.

    Args:
        polyphonic_chars: sequence of two-item ``[char, phoneme]`` entries.

    Returns:
        ``(labels, char2phonemes)`` where ``labels`` is the sorted list of
        distinct phonemes and ``char2phonemes`` maps each character to the
        indices of its phonemes in ``labels``, in input order.
    """
    labels = sorted({phoneme for _, phoneme in polyphonic_chars})
    # One dict lookup per pair instead of a linear labels.index() scan.
    label_index = {label: i for i, label in enumerate(labels)}
    char2phonemes = {}
    for char, phoneme in polyphonic_chars:
        char2phonemes.setdefault(char, []).append(label_index[phoneme])
    return labels, char2phonemes
|
151 |
+
|
152 |
+
|
153 |
+
def get_char_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[str], Dict[str, List[int]]]:
    """Like ``get_phoneme_labels`` but with labels of the form ``"<char> <phoneme>"``.

    Args:
        polyphonic_chars: sequence of two-item ``[char, phoneme]`` entries.

    Returns:
        ``(labels, char2phonemes)`` where ``labels`` is the sorted list of
        distinct ``"char phoneme"`` strings and ``char2phonemes`` maps each
        character to the indices of its own labels, in input order.
    """
    labels = sorted({f"{char} {phoneme}" for char, phoneme in polyphonic_chars})
    # One dict lookup per pair instead of a linear labels.index() scan.
    label_index = {label: i for i, label in enumerate(labels)}
    char2phonemes = {}
    for char, phoneme in polyphonic_chars:
        char2phonemes.setdefault(char, []).append(label_index[f"{char} {phoneme}"])
    return labels, char2phonemes
|
text/g2pw/g2pw.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This code is modified from https://github.com/mozillazg/pypinyin-g2pW
|
2 |
+
|
3 |
+
import pickle
|
4 |
+
import os
|
5 |
+
|
6 |
+
from pypinyin.constants import RE_HANS
|
7 |
+
from pypinyin.core import Pinyin, Style
|
8 |
+
from pypinyin.seg.simpleseg import simple_seg
|
9 |
+
from pypinyin.converter import UltimateConverter
|
10 |
+
from pypinyin.contrib.tone_convert import to_tone
|
11 |
+
from .onnx_api import G2PWOnnxConverter
|
12 |
+
|
13 |
+
current_file_path = os.path.dirname(__file__)
|
14 |
+
CACHE_PATH = os.path.join(current_file_path, "polyphonic.pickle")
|
15 |
+
PP_DICT_PATH = os.path.join(current_file_path, "polyphonic.rep")
|
16 |
+
PP_FIX_DICT_PATH = os.path.join(current_file_path, "polyphonic-fix.rep")
|
17 |
+
|
18 |
+
|
19 |
+
class G2PWPinyin(Pinyin):
    """``Pinyin`` front end whose converter is backed by a ``G2PWOnnxConverter``."""

    def __init__(
        self,
        model_dir="G2PWModel/",
        model_source=None,
        enable_non_tradional_chinese=True,
        v_to_u=False,
        neutral_tone_with_five=False,
        tone_sandhi=False,
        **kwargs,
    ):
        """Create the ONNX backend and wrap it in a ``Converter``.

        ``model_dir``, ``model_source`` and ``enable_non_tradional_chinese``
        are forwarded to ``G2PWOnnxConverter`` (always with style
        ``"pinyin"``); the remaining named options go to ``Converter``.
        Extra keyword arguments are accepted but not used.
        """
        backend = G2PWOnnxConverter(
            model_dir=model_dir,
            style="pinyin",
            model_source=model_source,
            enable_non_tradional_chinese=enable_non_tradional_chinese,
        )
        self._g2pw = backend
        self._converter = Converter(
            backend,
            v_to_u=v_to_u,
            neutral_tone_with_five=neutral_tone_with_five,
            tone_sandhi=tone_sandhi,
        )

    def get_seg(self, **kwargs):
        """Return the ``simple_seg`` segmenter regardless of the arguments."""
        return simple_seg
|
45 |
+
|
46 |
+
|
47 |
+
class Converter(UltimateConverter):
    """``UltimateConverter`` that asks a g2pW instance first and falls back to pypinyin."""

    def __init__(self, g2pw_instance, v_to_u=False, neutral_tone_with_five=False, tone_sandhi=False, **kwargs):
        """Store the g2pW callable and forward the remaining options to the base class."""
        super().__init__(
            v_to_u=v_to_u, neutral_tone_with_five=neutral_tone_with_five, tone_sandhi=tone_sandhi, **kwargs
        )

        self._g2pw = g2pw_instance

    def convert(self, words, style, heteronym, errors, strict, **kwargs):
        """Convert ``words`` to pinyin lists with duplicates and empty items removed.

        Text matching ``RE_HANS`` goes through ``_to_pinyin``, the
        ``post_pinyin`` hook and ``convert_styles``; anything else is handed
        to ``handle_nopinyin``.
        """
        if not RE_HANS.match(words):
            fallback = self.handle_nopinyin(words, style=style, errors=errors, heteronym=heteronym, strict=strict)
            return _remove_dup_and_empty(list(fallback) if fallback else [])

        pys = self._to_pinyin(words, style=style, heteronym=heteronym, errors=errors, strict=strict)
        adjusted = self.post_pinyin(words, heteronym, pys)
        if adjusted is not None:
            pys = adjusted

        pys = self.convert_styles(pys, words, style, heteronym, errors, strict)
        return _remove_dup_and_empty(pys)

    def _to_pinyin(self, han, style, heteronym, errors, strict, **kwargs):
        """Return one pinyin list per character of ``han``.

        Characters for which g2pW yields ``None`` (or the whole input, when
        g2pW returns nothing) are converted with the base-class logic using
        ``Style.TONE``.
        """
        base_convert = super().convert

        predicted = self._g2pw(han)
        # Input not supported by g2pW: use the original pypinyin logic.
        if not predicted:
            return base_convert(han, Style.TONE, heteronym, errors, strict, **kwargs)

        pinyins = []
        for position, reading in enumerate(predicted[0]):
            if reading is not None:
                pinyins.append([to_tone(reading)])
                continue
            # Character not supported by g2pW: use the original pypinyin logic.
            pinyins.extend(base_convert(han[position], Style.TONE, heteronym, errors, strict, **kwargs))

        return pinyins
|
88 |
+
|
89 |
+
|
90 |
+
def _remove_dup_items(lst, remove_empty=False):
    """Return the items of ``lst`` with later duplicates removed, order preserved.

    Args:
        lst: iterable of items.
        remove_empty: when true, falsy items are left out as well.
    """
    unique = []
    for entry in lst:
        if (not remove_empty or entry) and entry not in unique:
            unique.append(entry)
    return unique
|
98 |
+
|
99 |
+
|
100 |
+
def _remove_dup_and_empty(lst_list):
    """Deduplicate every inner list and drop its empty items.

    An inner list that ends up empty is replaced by ``[""]`` so the outer
    list keeps one entry per input list.
    """
    return [_remove_dup_items(inner, remove_empty=True) or [""] for inner in lst_list]
|
110 |
+
|
111 |
+
|
112 |
+
def cache_dict(polyphonic_dict, file_path):
    """Pickle ``polyphonic_dict`` into ``file_path``, replacing any existing file."""
    with open(file_path, "wb") as sink:
        pickle.dump(polyphonic_dict, sink)
|
115 |
+
|
116 |
+
|
117 |
+
def get_dict():
    """Return the polyphonic dictionary, preferring the pickle cache.

    When the file at ``CACHE_PATH`` exists it is unpickled and returned;
    otherwise the dictionary is built with ``read_dict`` and cached first.
    """
    if os.path.exists(CACHE_PATH):
        with open(CACHE_PATH, "rb") as cache_file:
            return pickle.load(cache_file)

    fresh = read_dict()
    cache_dict(fresh, CACHE_PATH)
    return fresh
|
126 |
+
|
127 |
+
|
128 |
+
def read_dict():
    """Load the polyphonic dictionary from the two ``.rep`` files.

    Each non-blank line has the form ``key: value``; the value text is
    evaluated as a Python expression. ``PP_DICT_PATH`` is read first and
    ``PP_FIX_DICT_PATH`` second, so entries from the fix file win.

    Returns:
        dict mapping each stripped key to its evaluated value.

    Raises:
        ValueError: if a non-blank line contains no ``":"`` separator.
    """
    polyphonic_dict = {}
    for path in (PP_DICT_PATH, PP_FIX_DICT_PATH):
        with open(path, encoding="utf-8") as f:
            for line in f:
                # Blank lines used to fail the two-value unpack below.
                if not line.strip():
                    continue
                # Split on the first colon only so values may contain ":".
                key, value_str = line.split(":", 1)
                # NOTE(review): eval() runs arbitrary expressions from the
                # file; only safe while these files are trusted. Consider
                # ast.literal_eval if the values are always plain literals.
                polyphonic_dict[key.strip()] = eval(value_str.strip())
    return polyphonic_dict
|
145 |
+
|
146 |
+
|
147 |
+
def correct_pronunciation(word, word_pinyins):
    """Apply the overrides stored in ``pp_dict`` to a word's pinyin list.

    If ``pp_dict`` has an entry for the whole word, that entry is returned.
    Otherwise each character with its own entry has the matching position of
    ``word_pinyins`` replaced, in place, by the first item of that entry.

    Returns:
        The whole-word override, or the (mutated) ``word_pinyins`` list.
    """
    whole_word = pp_dict.get(word, "")
    if whole_word != "":
        return whole_word

    for position, char in enumerate(word):
        char_override = pp_dict.get(char, "")
        if char_override != "":
            word_pinyins[position] = char_override[0]
    return word_pinyins
|
157 |
+
|
158 |
+
|
159 |
+
pp_dict = get_dict()
|
text/g2pw/onnx_api.py
ADDED
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This code is modified from https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw
|
2 |
+
# This code is modified from https://github.com/GitYCC/g2pW
|
3 |
+
|
4 |
+
import warnings
|
5 |
+
|
6 |
+
warnings.filterwarnings("ignore")
|
7 |
+
import json
|
8 |
+
import os
|
9 |
+
import zipfile, requests
|
10 |
+
from typing import Any
|
11 |
+
from typing import Dict
|
12 |
+
from typing import List
|
13 |
+
from typing import Tuple
|
14 |
+
|
15 |
+
import numpy as np
|
16 |
+
import onnxruntime
|
17 |
+
|
18 |
+
onnxruntime.set_default_logger_severity(3)
|
19 |
+
from opencc import OpenCC
|
20 |
+
from transformers import AutoTokenizer
|
21 |
+
from pypinyin import pinyin
|
22 |
+
from pypinyin import Style
|
23 |
+
|
24 |
+
from .dataset import get_char_phoneme_labels
|
25 |
+
from .dataset import get_phoneme_labels
|
26 |
+
from .dataset import prepare_onnx_input
|
27 |
+
from .utils import load_config
|
28 |
+
from ..zh_normalization.char_convert import tranditional_to_simplified
|
29 |
+
|
30 |
+
model_version = '1.1'
|
31 |
+
|
32 |
+
|
33 |
+
def predict(session, onnx_input: Dict[str, Any],
            labels: List[str]) -> Tuple[List[str], List[float]]:
    """Run the session on ``onnx_input`` and pick the top label per row.

    Args:
        session: object whose ``run(output_names, feed)`` returns a sequence;
            its first item is used as the per-row score array.
        onnx_input: dict as produced by ``prepare_onnx_input``.
        labels: label strings indexed by the score columns.

    Returns:
        ``(predictions, confidences)``: the label with the highest score in
        each row, and that score.
    """
    # Note the singular feed names for the two mask inputs.
    feed = {
        "input_ids": onnx_input['input_ids'],
        "token_type_ids": onnx_input['token_type_ids'],
        "attention_mask": onnx_input['attention_masks'],
        "phoneme_mask": onnx_input['phoneme_masks'],
        "char_ids": onnx_input['char_ids'],
        "position_ids": onnx_input['position_ids'],
    }
    probs = session.run([], feed)[0]

    best = np.argmax(probs, axis=1).tolist()
    confidences = [row[col] for col, row in zip(best, probs.tolist())]
    predictions = [labels[col] for col in best]

    return predictions, confidences
|
54 |
+
|
55 |
+
|
56 |
+
def download_and_decompress(model_dir: str = 'G2PWModel/'):
    """Make sure the g2pW model directory exists, downloading it if necessary.

    When ``model_dir`` is missing, the ``G2PWModel_1.1.zip`` archive is
    downloaded next to it, extracted, and the extracted ``G2PWModel_1.1``
    folder is renamed to ``model_dir``.

    Args:
        model_dir: target model directory; a trailing path separator is
            accepted.

    Returns:
        ``model_dir`` exactly as passed in.

    Raises:
        requests.HTTPError: if the download responds with an error status.
    """
    if os.path.exists(model_dir):
        return model_dir

    # Normalise first: os.path.dirname('G2PWModel/') is 'G2PWModel', which made
    # the archive land inside the (not yet existing) target directory.
    target_dir = os.path.normpath(model_dir)
    parent_directory = os.path.dirname(target_dir)
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)

    zip_dir = os.path.join(parent_directory, "G2PWModel_1.1.zip")
    extract_dir = os.path.join(parent_directory, "G2PWModel_1.1")

    print("Downloading g2pw model...")
    modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"
    with requests.get(modelscope_url, stream=True) as r:
        r.raise_for_status()
        with open(zip_dir, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

    print("Extracting g2pw model...")
    with zipfile.ZipFile(zip_dir, "r") as zip_ref:
        zip_ref.extractall(parent_directory or ".")

    # Rename to the requested directory instead of a hard-coded "G2PWModel",
    # so the returned path actually exists for any model_dir name.
    os.rename(extract_dir, target_dir)

    return model_dir
|
78 |
+
|
79 |
+
|
80 |
+
class G2PWOnnxConverter:
|
81 |
+
def __init__(self,
|
82 |
+
model_dir: str = 'G2PWModel/',
|
83 |
+
style: str = 'bopomofo',
|
84 |
+
model_source: str = None,
|
85 |
+
enable_non_tradional_chinese: bool = False):
|
86 |
+
uncompress_path = download_and_decompress(model_dir)
|
87 |
+
|
88 |
+
sess_options = onnxruntime.SessionOptions()
|
89 |
+
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
|
90 |
+
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
|
91 |
+
sess_options.intra_op_num_threads = 2
|
92 |
+
# self.session_g2pW = onnxruntime.InferenceSession(os.path.join(uncompress_path, 'g2pW.onnx'), sess_options=sess_options, providers=['CPUExecutionProvider'])
|
93 |
+
self.session_g2pW = onnxruntime.InferenceSession(os.path.join(uncompress_path, 'g2pW.onnx'), sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
|
94 |
+
|
95 |
+
self.config = load_config(
|
96 |
+
config_path=os.path.join(uncompress_path, 'config.py'),
|
97 |
+
use_default=True)
|
98 |
+
|
99 |
+
self.model_source = model_source if model_source else self.config.model_source
|
100 |
+
self.enable_opencc = enable_non_tradional_chinese
|
101 |
+
|
102 |
+
self.tokenizer = AutoTokenizer.from_pretrained(self.model_source)
|
103 |
+
|
104 |
+
polyphonic_chars_path = os.path.join(uncompress_path,
|
105 |
+
'POLYPHONIC_CHARS.txt')
|
106 |
+
monophonic_chars_path = os.path.join(uncompress_path,
|
107 |
+
'MONOPHONIC_CHARS.txt')
|
108 |
+
self.polyphonic_chars = [
|
109 |
+
line.split('\t')
|
110 |
+
for line in open(polyphonic_chars_path, encoding='utf-8').read()
|
111 |
+
.strip().split('\n')
|
112 |
+
]
|
113 |
+
self.non_polyphonic = {
|
114 |
+
'一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗',
|
115 |
+
'肖', '瘙', '誒', '泊', '听', '噢'
|
116 |
+
}
|
117 |
+
self.non_monophonic = {'似', '攢'}
|
118 |
+
self.monophonic_chars = [
|
119 |
+
line.split('\t')
|
120 |
+
for line in open(monophonic_chars_path, encoding='utf-8').read()
|
121 |
+
.strip().split('\n')
|
122 |
+
]
|
123 |
+
self.labels, self.char2phonemes = get_char_phoneme_labels(
|
124 |
+
polyphonic_chars=self.polyphonic_chars
|
125 |
+
) if self.config.use_char_phoneme else get_phoneme_labels(
|
126 |
+
polyphonic_chars=self.polyphonic_chars)
|
127 |
+
|
128 |
+
self.chars = sorted(list(self.char2phonemes.keys()))
|
129 |
+
|
130 |
+
self.polyphonic_chars_new = set(self.chars)
|
131 |
+
for char in self.non_polyphonic:
|
132 |
+
if char in self.polyphonic_chars_new:
|
133 |
+
self.polyphonic_chars_new.remove(char)
|
134 |
+
|
135 |
+
self.monophonic_chars_dict = {
|
136 |
+
char: phoneme
|
137 |
+
for char, phoneme in self.monophonic_chars
|
138 |
+
}
|
139 |
+
for char in self.non_monophonic:
|
140 |
+
if char in self.monophonic_chars_dict:
|
141 |
+
self.monophonic_chars_dict.pop(char)
|
142 |
+
|
143 |
+
self.pos_tags = [
|
144 |
+
'UNK', 'A', 'C', 'D', 'I', 'N', 'P', 'T', 'V', 'DE', 'SHI'
|
145 |
+
]
|
146 |
+
|
147 |
+
with open(
|
148 |
+
os.path.join(uncompress_path,
|
149 |
+
'bopomofo_to_pinyin_wo_tune_dict.json'),
|
150 |
+
'r',
|
151 |
+
encoding='utf-8') as fr:
|
152 |
+
self.bopomofo_convert_dict = json.load(fr)
|
153 |
+
self.style_convert_func = {
|
154 |
+
'bopomofo': lambda x: x,
|
155 |
+
'pinyin': self._convert_bopomofo_to_pinyin,
|
156 |
+
}[style]
|
157 |
+
|
158 |
+
with open(
|
159 |
+
os.path.join(uncompress_path, 'char_bopomofo_dict.json'),
|
160 |
+
'r',
|
161 |
+
encoding='utf-8') as fr:
|
162 |
+
self.char_bopomofo_dict = json.load(fr)
|
163 |
+
|
164 |
+
if self.enable_opencc:
|
165 |
+
self.cc = OpenCC('s2tw')
|
166 |
+
|
167 |
+
def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
|
168 |
+
tone = bopomofo[-1]
|
169 |
+
assert tone in '12345'
|
170 |
+
component = self.bopomofo_convert_dict.get(bopomofo[:-1])
|
171 |
+
if component:
|
172 |
+
return component + tone
|
173 |
+
else:
|
174 |
+
print(f'Warning: "{bopomofo}" cannot convert to pinyin')
|
175 |
+
return None
|
176 |
+
|
177 |
+
def __call__(self, sentences: List[str]) -> List[List[str]]:
|
178 |
+
if isinstance(sentences, str):
|
179 |
+
sentences = [sentences]
|
180 |
+
|
181 |
+
if self.enable_opencc:
|
182 |
+
translated_sentences = []
|
183 |
+
for sent in sentences:
|
184 |
+
translated_sent = self.cc.convert(sent)
|
185 |
+
assert len(translated_sent) == len(sent)
|
186 |
+
translated_sentences.append(translated_sent)
|
187 |
+
sentences = translated_sentences
|
188 |
+
|
189 |
+
texts, query_ids, sent_ids, partial_results = self._prepare_data(
|
190 |
+
sentences=sentences)
|
191 |
+
if len(texts) == 0:
|
192 |
+
# sentences no polyphonic words
|
193 |
+
return partial_results
|
194 |
+
|
195 |
+
onnx_input = prepare_onnx_input(
|
196 |
+
tokenizer=self.tokenizer,
|
197 |
+
labels=self.labels,
|
198 |
+
char2phonemes=self.char2phonemes,
|
199 |
+
chars=self.chars,
|
200 |
+
texts=texts,
|
201 |
+
query_ids=query_ids,
|
202 |
+
use_mask=self.config.use_mask,
|
203 |
+
window_size=None)
|
204 |
+
|
205 |
+
preds, confidences = predict(
|
206 |
+
session=self.session_g2pW,
|
207 |
+
onnx_input=onnx_input,
|
208 |
+
labels=self.labels)
|
209 |
+
if self.config.use_char_phoneme:
|
210 |
+
preds = [pred.split(' ')[1] for pred in preds]
|
211 |
+
|
212 |
+
results = partial_results
|
213 |
+
for sent_id, query_id, pred in zip(sent_ids, query_ids, preds):
|
214 |
+
results[sent_id][query_id] = self.style_convert_func(pred)
|
215 |
+
|
216 |
+
return results
|
217 |
+
|
218 |
+
def _prepare_data(
|
219 |
+
self, sentences: List[str]
|
220 |
+
) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
|
221 |
+
texts, query_ids, sent_ids, partial_results = [], [], [], []
|
222 |
+
for sent_id, sent in enumerate(sentences):
|
223 |
+
# pypinyin works well for Simplified Chinese than Traditional Chinese
|
224 |
+
sent_s = tranditional_to_simplified(sent)
|
225 |
+
pypinyin_result = pinyin(
|
226 |
+
sent_s, neutral_tone_with_five=True, style=Style.TONE3)
|
227 |
+
partial_result = [None] * len(sent)
|
228 |
+
for i, char in enumerate(sent):
|
229 |
+
if char in self.polyphonic_chars_new:
|
230 |
+
texts.append(sent)
|
231 |
+
query_ids.append(i)
|
232 |
+
sent_ids.append(sent_id)
|
233 |
+
elif char in self.monophonic_chars_dict:
|
234 |
+
partial_result[i] = self.style_convert_func(
|
235 |
+
self.monophonic_chars_dict[char])
|
236 |
+
elif char in self.char_bopomofo_dict:
|
237 |
+
partial_result[i] = pypinyin_result[i][0]
|
238 |
+
# partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0])
|
239 |
+
else:
|
240 |
+
partial_result[i] = pypinyin_result[i][0]
|
241 |
+
|
242 |
+
partial_results.append(partial_result)
|
243 |
+
return texts, query_ids, sent_ids, partial_results
|
text/g2pw/polyphonic-fix.rep
ADDED
The diff for this file is too large to render.
See raw diff
|
|
text/g2pw/polyphonic.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f425246160a32c578557cd3151cd0bb97f5f44c3aaf65e718dd2c3213c04fb4b
|
3 |
+
size 1322387
|
text/g2pw/polyphonic.rep
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
湖泊: ['hu2','po1']
|
2 |
+
地壳: ['di4','qiao4']
|
3 |
+
柏树: ['bai3','shu4']
|
4 |
+
曝光: ['bao4','guang1']
|
5 |
+
弹力: ['tan2','li4']
|
6 |
+
字帖: ['zi4','tie4']
|
7 |
+
口吃: ['kou3','chi1']
|
8 |
+
包扎: ['bao1','za1']
|
9 |
+
哪吒: ['ne2','zha1']
|
10 |
+
说服: ['shuo1','fu2']
|
11 |
+
识字: ['shi2','zi4']
|
12 |
+
骨头: ['gu3','tou5']
|
13 |
+
对称: ['dui4','chen4']
|
14 |
+
口供: ['kou3','gong4']
|
15 |
+
抹布: ['ma1','bu4']
|
16 |
+
露背: ['lu4','bei4']
|
17 |
+
圈养: ['juan4', 'yang3']
|
18 |
+
眼眶: ['yan3', 'kuang4']
|
19 |
+
品行: ['pin3','xing2']
|
20 |
+
颤抖: ['chan4','dou3']
|
21 |
+
差不多: ['cha4','bu5','duo1']
|
22 |
+
鸭绿江: ['ya1','lu4','jiang1']
|
23 |
+
撒切尔: ['sa4','qie4','er3']
|
24 |
+
比比皆是: ['bi3','bi3','jie1','shi4']
|
25 |
+
身无长物: ['shen1','wu2','chang2','wu4']
|
26 |
+
手里: ['shou2','li3']
|
27 |
+
关卡: ['guan1','qia3']
|
28 |
+
怀揣: ['huai2','chuai1']
|
29 |
+
挑剔: ['tiao1','ti4']
|
30 |
+
供称: ['gong4','cheng1']
|
31 |
+
作坊: ['zuo1', 'fang5']
|
32 |
+
中医: ['zhong1','yi1']
|
33 |
+
嚷嚷: ['rang1','rang5']
|
34 |
+
商厦: ['shang1','sha4']
|
35 |
+
大厦: ['da4','sha4']
|
36 |
+
刹车: ['sha1','che1']
|
37 |
+
嘚瑟: ['de4','se5']
|
38 |
+
朝鲜: ['chao2','xian3']
|
39 |
+
阿房宫: ['e1','pang2','gong1']
|
40 |
+
阿胶: ['e1','jiao1']
|
41 |
+
咖喱: ['ga1','li5']
|
42 |
+
时分: ['shi2','fen1']
|
43 |
+
蚌埠: ['beng4','bu4']
|
44 |
+
驯服: ['xun4','fu2']
|
45 |
+
幸免于难: ['xing4','mian3','yu2','nan4']
|
46 |
+
恶行: ['e4','xing2']
|
47 |
+
唉: ['ai4']
|
48 |
+
扎实: ['zha1','shi2']
|
49 |
+
干将: ['gan4','jiang4']
|
50 |
+
陈威行: ['chen2', 'wei1', 'hang2']
|
51 |
+
郭晟: ['guo1', 'sheng4']
|
52 |
+
中标: ['zhong4', 'biao1']
|
53 |
+
抗住: ['kang2', 'zhu4']
|
text/g2pw/utils.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
"""
|
15 |
+
Credits
|
16 |
+
This code is modified from https://github.com/GitYCC/g2pW
|
17 |
+
"""
|
18 |
+
|
19 |
+
import os
|
20 |
+
import re
|
21 |
+
|
22 |
+
|
23 |
+
def wordize_and_map(text: str):
    """Split *text* into words and map between character and word indices.

    A run of spaces produces no word, a run of ASCII letters/digits forms one
    word, and every other character is a word of its own.

    Args:
        text: The string to split.

    Returns:
        A tuple ``(words, index_map_from_text_to_word,
        index_map_from_word_to_text)``: the list of words, a list with one
        entry per character of *text* holding its word index (``None`` for
        spaces), and a list with one ``(start, end)`` character span per word.
    """
    # Matching at an explicit position avoids re-slicing the remaining text on
    # every step, which made the original loop quadratic in len(text).
    space_run = re.compile(r" +")
    alnum_run = re.compile(r"[a-zA-Z0-9]+")

    words = []
    index_map_from_text_to_word = []
    index_map_from_word_to_text = []

    pos = 0
    text_len = len(text)
    while pos < text_len:
        space_match = space_run.match(text, pos)
        if space_match:
            index_map_from_text_to_word += [None] * (space_match.end() - pos)
            pos = space_match.end()
            continue

        alnum_match = alnum_run.match(text, pos)
        # Anything that is not an ASCII alphanumeric run is a one-char word.
        word_end = alnum_match.end() if alnum_match else pos + 1

        index_map_from_word_to_text.append((pos, word_end))
        index_map_from_text_to_word += [len(words)] * (word_end - pos)
        words.append(text[pos:word_end])
        pos = word_end

    return words, index_map_from_text_to_word, index_map_from_word_to_text
|
57 |
+
|
58 |
+
|
59 |
+
def tokenize_and_map(tokenizer, text: str):
    """Tokenize *text* word by word and map between characters and tokens.

    Args:
        tokenizer: Object whose ``tokenize(word)`` returns a list of tokens.
        text: The string to tokenize.

    Returns:
        A tuple ``(tokens, index_map_from_text_to_token,
        index_map_from_token_to_text)``: the token list, a per-character list
        of token indices (positions not covered by any token keep the value
        produced by ``wordize_and_map``), and one ``(start, end)`` character
        span per token.
    """
    words, text2word, word2text = wordize_and_map(text=text)

    tokens = []
    token_spans = []
    for word, (word_start, word_end) in zip(words, word2text):
        pieces = tokenizer.tokenize(word)

        if len(pieces) == 0 or pieces == ["[UNK]"]:
            # An unknown word is one [UNK] token covering the whole word.
            token_spans.append((word_start, word_end))
            tokens.append("[UNK]")
            continue

        cursor = word_start
        for piece in pieces:
            # A leading "##" does not count towards the piece's length.
            piece_len = len(re.sub(r"^##", "", piece))
            token_spans.append((cursor, cursor + piece_len))
            cursor += piece_len
            tokens.append(piece)

    # The word-level map is overwritten in place with token indices.
    text2token = text2word
    for token_idx, (span_start, span_end) in enumerate(token_spans):
        for char_pos in range(span_start, span_end):
            text2token[char_pos] = token_idx

    return tokens, text2token, token_spans
|
84 |
+
|
85 |
+
|
86 |
+
def _load_config(config_path: os.PathLike):
|
87 |
+
import importlib.util
|
88 |
+
|
89 |
+
spec = importlib.util.spec_from_file_location("__init__", config_path)
|
90 |
+
config = importlib.util.module_from_spec(spec)
|
91 |
+
spec.loader.exec_module(config)
|
92 |
+
return config
|
93 |
+
|
94 |
+
|
95 |
+
# Fallback values applied by load_config(..., use_default=True) for settings
# a config file leaves out. Nested dicts are merged key by key.
default_config_dict = {
    "manual_seed": 1313,
    "model_source": "bert-base-chinese",
    "window_size": 32,
    "num_workers": 2,
    "use_mask": True,
    "use_char_phoneme": False,
    "use_conditional": True,
    "param_conditional": {
        "affect_location": "softmax",
        "bias": True,
        "char-linear": True,
        "pos-linear": False,
        "char+pos-second": True,
        "char+pos-second_lowrank": False,
        "lowrank_size": 0,
        "char+pos-second_fm": False,
        "fm_size": 0,
        "fix_mode": None,
        "count_json": "train.count.json",
    },
    "lr": 5e-5,
    "val_interval": 200,
    "num_iter": 10000,
    "use_focal": False,
    "param_focal": {"alpha": 0.0, "gamma": 0.7},
    "use_pos": True,
    # The key used to be "param_pos " (trailing space), which set an attribute
    # no config could define and never merged with a config's `param_pos`.
    "param_pos": {
        "weight": 0.1,
        "pos_joint_training": True,
        "train_pos_path": "train.pos",
        "valid_pos_path": "dev.pos",
        "test_pos_path": "test.pos",
    },
}
|
130 |
+
|
131 |
+
|
132 |
+
def load_config(config_path: os.PathLike, use_default: bool = False):
    """Load a config module, optionally filling gaps from ``default_config_dict``.

    Args:
        config_path: Path of the Python config file.
        use_default: When true, every attribute missing from the loaded
            module is set from ``default_config_dict``; for attributes that
            exist and whose default is a dict, only the missing keys are
            added to the module's dict.

    Returns:
        The loaded config module.
    """
    config = _load_config(config_path)
    if not use_default:
        return config

    for attr, default in default_config_dict.items():
        if not hasattr(config, attr):
            setattr(config, attr, default)
            continue
        if isinstance(default, dict):
            current = getattr(config, attr)
            for key, value in default.items():
                if key not in current:
                    current[key] = value
    return config
|
text/ja_userdic/user.dict
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b44817ce96e24be7bcfdd009d834b5237fe044dc9ed5f2f9709f71da9d506fed
|
3 |
+
size 21321666
|
text/ja_userdic/userdict.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d857e443ee48d9641096816a98996669602895411e4330d7d91d1dbe1103389f
|
3 |
+
size 17180971
|
text/ja_userdic/userdict.md5
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
d36bd5ffba62f195d22bf4f1a41cd08f
|
text/japanese.py
ADDED
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
|
2 |
+
import re
|
3 |
+
import os
|
4 |
+
import hashlib
|
5 |
+
|
6 |
+
try:
    # Imported up front: both Windows fallbacks below copy files, and the
    # second one can run without the first having executed. Previously the
    # import lived only in the first fallback, so the second could hit a
    # NameError that the broad `except` below silently swallowed.
    import shutil

    import pyopenjtalk

    current_file_path = os.path.dirname(__file__)

    # Windows workaround for model files that cannot be read: when a path
    # contains characters outside the ASCII set matched below, use a path
    # relative to the working directory, or a copy under ./TEMP.
    if os.name == "nt":
        python_dir = os.getcwd()
        OPEN_JTALK_DICT_DIR = pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8")
        if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", OPEN_JTALK_DICT_DIR)):
            if OPEN_JTALK_DICT_DIR[: len(python_dir)].upper() == python_dir.upper():
                OPEN_JTALK_DICT_DIR = os.path.join(os.path.relpath(OPEN_JTALK_DICT_DIR, python_dir))
            else:
                if not os.path.exists("TEMP"):
                    os.mkdir("TEMP")
                if not os.path.exists(os.path.join("TEMP", "ja")):
                    os.mkdir(os.path.join("TEMP", "ja"))
                # Always start from a fresh copy of the dictionary.
                if os.path.exists(os.path.join("TEMP", "ja", "open_jtalk_dic")):
                    shutil.rmtree(os.path.join("TEMP", "ja", "open_jtalk_dic"))
                shutil.copytree(
                    pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8"),
                    os.path.join("TEMP", "ja", "open_jtalk_dic"),
                )
                OPEN_JTALK_DICT_DIR = os.path.join("TEMP", "ja", "open_jtalk_dic")
            pyopenjtalk.OPEN_JTALK_DICT_DIR = OPEN_JTALK_DICT_DIR.encode("utf-8")

        if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", current_file_path)):
            if current_file_path[: len(python_dir)].upper() == python_dir.upper():
                current_file_path = os.path.join(os.path.relpath(current_file_path, python_dir))
            else:
                if not os.path.exists("TEMP"):
                    os.mkdir("TEMP")
                if not os.path.exists(os.path.join("TEMP", "ja")):
                    os.mkdir(os.path.join("TEMP", "ja"))
                # NOTE(review): the CSV is copied only when the directory is
                # first created - confirm this copy-once behaviour is intended.
                if not os.path.exists(os.path.join("TEMP", "ja", "ja_userdic")):
                    os.mkdir(os.path.join("TEMP", "ja", "ja_userdic"))
                    shutil.copyfile(
                        os.path.join(current_file_path, "ja_userdic", "userdict.csv"),
                        os.path.join("TEMP", "ja", "ja_userdic", "userdict.csv"),
                    )
                current_file_path = os.path.join("TEMP", "ja")

    def get_hash(fp: str) -> str:
        """Return the hex MD5 digest of the file at *fp*, read in 4 KiB chunks."""
        hash_md5 = hashlib.md5()
        with open(fp, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def _read_stored_hash(fp: str):
        """Return the digest saved at *fp*, or None when the file is missing."""
        try:
            with open(fp, "r", encoding="utf-8") as f:
                return f.read()
        except FileNotFoundError:
            return None

    USERDIC_CSV_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.csv")
    USERDIC_BIN_PATH = os.path.join(current_file_path, "ja_userdic", "user.dict")
    USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
    # Build the compiled user dictionary when it is missing; otherwise compare
    # the CSV's MD5 with the stored one and rebuild when they differ. A missing
    # hash file counts as a mismatch instead of aborting the whole setup.
    if os.path.exists(USERDIC_CSV_PATH):
        if (
            not os.path.exists(USERDIC_BIN_PATH)
            or get_hash(USERDIC_CSV_PATH) != _read_stored_hash(USERDIC_HASH_PATH)
        ):
            pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH)
            with open(USERDIC_HASH_PATH, "w", encoding="utf-8") as f:
                f.write(get_hash(USERDIC_CSV_PATH))

    if os.path.exists(USERDIC_BIN_PATH):
        pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
except Exception:
    # print(e)
    import pyopenjtalk

    # Failed to load the user dictionary; continue without it.
    pass
|
78 |
+
|
79 |
+
|
80 |
+
from text.symbols import punctuation
|
81 |
+
|
82 |
+
# Regular expression matching Japanese without punctuation marks:
# one character that is an ASCII letter, a digit, or falls in one of the
# listed Unicode ranges.
_japanese_characters = re.compile(
    r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)

# Regular expression matching non-Japanese characters or punctuation marks:
# the exact complement of the character class above.
_japanese_marks = re.compile(
    r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)

# List of (symbol, Japanese) pairs for marks:
# each entry is (compiled pattern, replacement text).
_symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]


# List of (consonant, sokuon) pairs:
# each entry is (compiled pattern, replacement template); the pattern captures
# optional ↑/↓ markers plus the consonant that follows "Q".
_real_sokuon = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        (r"Q([↑↓]*[kg])", r"k#\1"),
        (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
        (r"Q([↑↓]*[sʃ])", r"s\1"),
        (r"Q([↑↓]*[pb])", r"p#\1"),
    ]
]

# List of (consonant, hatsuon) pairs:
# same layout as above, for the consonant that follows "N".
_real_hatsuon = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        (r"N([↑↓]*[pbm])", r"m\1"),
        (r"N([↑↓]*[ʧʥj])", r"n^\1"),
        (r"N([↑↓]*[tdn])", r"n\1"),
        (r"N([↑↓]*[kg])", r"ŋ\1"),
    ]
]
|
117 |
+
|
118 |
+
|
119 |
+
def post_replace_ph(ph):
    """Return the replacement for a punctuation token, or *ph* unchanged.

    Only tokens that are exactly equal to a key of the table are replaced.
    """
    rep_map = {
        ":": ",",
        ";": ",",
        ",": ",",
        "。": ".",
        "!": "!",
        "?": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "...": "…",
    }
    return rep_map.get(ph, ph)
|
136 |
+
|
137 |
+
|
138 |
+
def replace_consecutive_punctuation(text):
    """Collapse every run of two or more punctuation marks to its first mark.

    The set of marks is taken from ``punctuation``.
    """
    escaped = "".join(map(re.escape, punctuation))
    return re.sub(f"([{escaped}])([{escaped}])+", r"\1", text)
|
143 |
+
|
144 |
+
|
145 |
+
def symbols_to_japanese(text):
    """Apply every (pattern, replacement) pair of ``_symbols_to_japanese`` to *text*."""
    for pattern, spoken in _symbols_to_japanese:
        text = pattern.sub(spoken, text)
    return text
|
149 |
+
|
150 |
+
|
151 |
+
def preprocess_jap(text, with_prosody=False):
    """Convert text to a flat list of phoneme and mark tokens.

    Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html

    Args:
        text: Input text.
        with_prosody: When true, segments go through
            ``pyopenjtalk_g2p_prosody`` (first and last symbol dropped);
            otherwise through ``pyopenjtalk.g2p``.

    Returns:
        The tokens of each segment followed by the mark that ended it; marks
        that are a single space are skipped.
    """
    # Lower-casing targets English words; it should not affect Japanese text.
    lowered = symbols_to_japanese(text).lower()
    segments = re.split(_japanese_marks, lowered)
    marks = re.findall(_japanese_marks, lowered)

    tokens = []
    for idx, segment in enumerate(segments):
        if re.match(_japanese_characters, segment):
            if with_prosody:
                tokens += pyopenjtalk_g2p_prosody(segment)[1:-1]
            else:
                tokens += pyopenjtalk.g2p(segment).split(" ")

        if idx >= len(marks):
            continue
        mark = marks[idx]
        if mark == " ":  # avoid an unexpected UNK token
            continue
        tokens += [mark.replace(" ", "")]
    return tokens
|
172 |
+
|
173 |
+
|
174 |
+
def text_normalize(text):
    """Normalize Japanese text; currently this only collapses repeated punctuation."""
    # todo: jap text normalize

    # Repeated punctuation is collapsed to avoid reference leakage.
    return replace_consecutive_punctuation(text)
|
180 |
+
|
181 |
+
|
182 |
+
# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
|
183 |
+
def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
    """Extract phoneme + prosody symbol sequence from input full-context labels.

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.

    Returns:
        List[str]: List of phoneme + prosody symbols.

    Examples:
        >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104

    """
    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
    N = len(labels)

    phones = []
    for n in range(N):
        lab_curr = labels[n]

        # current phoneme: the text between the first "-" and the next "+"
        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
        # deal unvoiced vowels as normal vowels (upper-case vowel -> lower-case)
        if drop_unvoiced_vowels and p3 in "AEIOU":
            p3 = p3.lower()

        # deal with sil at the beginning and the end of text
        if p3 == "sil":
            assert n == 0 or n == N - 1
            if n == 0:
                phones.append("^")
            elif n == N - 1:
                # check question form or not
                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
                if e3 == 0:
                    phones.append("$")
                elif e3 == 1:
                    phones.append("?")
            continue
        elif p3 == "pau":
            phones.append("_")
            continue
        else:
            phones.append(p3)

        # accent type and position info (forward or backward)
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)

        # number of mora in accent phrase
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)

        # Looks one label ahead; only reached for non-sil/non-pau labels, so
        # this presumably relies on the last label being "sil" - TODO confirm.
        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
        # accent phrase border
        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
            phones.append("#")
        # pitch falling
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            phones.append("]")
        # pitch rising
        elif a2 == 1 and a2_next == 2:
            phones.append("[")

    return phones
|
257 |
+
|
258 |
+
|
259 |
+
# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
|
260 |
+
def _numeric_feature_by_regex(regex, s):
|
261 |
+
match = re.search(regex, s)
|
262 |
+
if match is None:
|
263 |
+
return -50
|
264 |
+
return int(match.group(1))
|
265 |
+
|
266 |
+
|
267 |
+
def g2p(norm_text, with_prosody=True):
    """Convert normalized Japanese text to a list of phoneme tokens.

    Every token from ``preprocess_jap`` is passed through ``post_replace_ph``.
    """
    # todo: implement tones and word2ph
    return [post_replace_ph(token) for token in preprocess_jap(norm_text, with_prosody)]
|
272 |
+
|
273 |
+
|
274 |
+
if __name__ == "__main__":
    # Manual smoke test: print the phonemes for a mixed English/Japanese sample.
    phones = g2p("Hello.こんにちは!今日もNiCe天気ですね!tokyotowerに行きましょう!")
    print(phones)
|
text/korean.py
ADDED
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# reference: https://github.com/ORI-Muchim/MB-iSTFT-VITS-Korean/blob/main/text/korean.py
|
2 |
+
|
3 |
+
import re
|
4 |
+
from jamo import h2j, j2hcj
|
5 |
+
import ko_pron
|
6 |
+
from g2pk2 import G2p
|
7 |
+
|
8 |
+
import importlib
|
9 |
+
import os
|
10 |
+
|
11 |
+
# Windows workaround for model files that cannot be read: G2p is replaced by a
# subclass that patches eunjeon's Mecab when its install path contains
# characters outside the ASCII set matched below.
if os.name == "nt":

    class win_G2p(G2p):
        def check_mecab(self):
            """Run the base check, then patch ``eunjeon.Mecab`` if its path needs it."""
            super().check_mecab()
            # NOTE(review): uses importlib.util, but only `import importlib`
            # is visible at the top of this file - confirm the submodule is loaded.
            spam_spec = importlib.util.find_spec("eunjeon")
            non_found = spam_spec is None
            if non_found:
                print("you have to install eunjeon. install it...")
            else:
                installpath = spam_spec.submodule_search_locations[0]
                if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)):
                    import sys
                    from eunjeon import Mecab as _Mecab

                    class Mecab(_Mecab):
                        # Plain function (no self): it is called once, while
                        # the class body executes, to compute the default
                        # argument of __init__ below.
                        def get_dicpath(installpath):
                            if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)):
                                import shutil

                                python_dir = os.getcwd()
                                if installpath[: len(python_dir)].upper() == python_dir.upper():
                                    # Under the working directory: use a relative path.
                                    dicpath = os.path.join(os.path.relpath(installpath, python_dir), "data", "mecabrc")
                                else:
                                    # Elsewhere: work from a fresh copy under ./TEMP/ko.
                                    if not os.path.exists("TEMP"):
                                        os.mkdir("TEMP")
                                    if not os.path.exists(os.path.join("TEMP", "ko")):
                                        os.mkdir(os.path.join("TEMP", "ko"))
                                    if os.path.exists(os.path.join("TEMP", "ko", "ko_dict")):
                                        shutil.rmtree(os.path.join("TEMP", "ko", "ko_dict"))

                                    shutil.copytree(
                                        os.path.join(installpath, "data"), os.path.join("TEMP", "ko", "ko_dict")
                                    )
                                    dicpath = os.path.join("TEMP", "ko", "ko_dict", "mecabrc")
                            else:
                                dicpath = os.path.abspath(os.path.join(installpath, "data/mecabrc"))
                            return dicpath

                        def __init__(self, dicpath=get_dicpath(installpath)):
                            super().__init__(dicpath=dicpath)

                    # Later lookups of eunjeon.Mecab get the patched class.
                    sys.modules["eunjeon"].Mecab = Mecab

    G2p = win_G2p
|
57 |
+
|
58 |
+
|
59 |
+
from text.symbols2 import symbols
|
60 |
+
|
61 |
+
# This is a list of Korean classifiers preceded by pure Korean numerals.
# Stored as one space-separated string; number_to_hangul tests membership
# with `in`, i.e. as a substring check.
_korean_classifiers = (
    "군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통"
)

# List of (hangul, hangul divided) pairs:
# each entry is (compiled pattern, replacement); applied in order by divide_hangul.
_hangul_divided = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        # ('ㄳ', 'ㄱㅅ'),   # g2pk2, A Syllable-ending Rule
        # ('ㄵ', 'ㄴㅈ'),
        # ('ㄶ', 'ㄴㅎ'),
        # ('ㄺ', 'ㄹㄱ'),
        # ('ㄻ', 'ㄹㅁ'),
        # ('ㄼ', 'ㄹㅂ'),
        # ('ㄽ', 'ㄹㅅ'),
        # ('ㄾ', 'ㄹㅌ'),
        # ('ㄿ', 'ㄹㅍ'),
        # ('ㅀ', 'ㄹㅎ'),
        # ('ㅄ', 'ㅂㅅ'),
        ("ㅘ", "ㅗㅏ"),
        ("ㅙ", "ㅗㅐ"),
        ("ㅚ", "ㅗㅣ"),
        ("ㅝ", "ㅜㅓ"),
        ("ㅞ", "ㅜㅔ"),
        ("ㅟ", "ㅜㅣ"),
        ("ㅢ", "ㅡㅣ"),
        ("ㅑ", "ㅣㅏ"),
        ("ㅒ", "ㅣㅐ"),
        ("ㅕ", "ㅣㅓ"),
        ("ㅖ", "ㅣㅔ"),
        ("ㅛ", "ㅣㅗ"),
        ("ㅠ", "ㅣㅜ"),
    ]
]

# List of (Latin alphabet, hangul) pairs:
# patterns are case-insensitive; applied in order by latin_to_hangul.
_latin_to_hangul = [
    (re.compile("%s" % x[0], re.IGNORECASE), x[1])
    for x in [
        ("a", "에이"),
        ("b", "비"),
        ("c", "시"),
        ("d", "디"),
        ("e", "이"),
        ("f", "에프"),
        ("g", "지"),
        ("h", "에이치"),
        ("i", "아이"),
        ("j", "제이"),
        ("k", "케이"),
        ("l", "엘"),
        ("m", "엠"),
        ("n", "엔"),
        ("o", "오"),
        ("p", "피"),
        ("q", "큐"),
        ("r", "아르"),
        ("s", "에스"),
        ("t", "티"),
        ("u", "유"),
        ("v", "브이"),
        ("w", "더블유"),
        ("x", "엑스"),
        ("y", "와이"),
        ("z", "제트"),
    ]
]

# List of (ipa, lazy ipa) pairs:
# patterns are case-insensitive; applied in order by korean_to_lazy_ipa.
# The last five entries match single combining marks.
_ipa_to_lazy_ipa = [
    (re.compile("%s" % x[0], re.IGNORECASE), x[1])
    for x in [
        ("t͡ɕ", "ʧ"),
        ("d͡ʑ", "ʥ"),
        ("ɲ", "n^"),
        ("ɕ", "ʃ"),
        ("ʷ", "w"),
        ("ɭ", "l`"),
        ("ʎ", "ɾ"),
        ("ɣ", "ŋ"),
        ("ɰ", "ɯ"),
        ("ʝ", "j"),
        ("ʌ", "ə"),
        ("ɡ", "g"),
        ("\u031a", "#"),
        ("\u0348", "="),
        ("\u031e", ""),
        ("\u0320", ""),
        ("\u0339", ""),
    ]
]
|
153 |
+
|
154 |
+
|
155 |
+
def fix_g2pk2_error(text):
    """Rewrite "ㅇㅡㄹ ㄹ" / "ㄹㅡㄹ ㄹ" so the jamo after the space becomes "ㄴ".

    The text is scanned left to right; every other character is copied
    unchanged.
    """
    pieces = []
    pos = 0
    limit = len(text) - 4  # a match needs five characters starting at pos
    while pos < limit:
        head = text[pos : pos + 3]
        if head in ("ㅇㅡㄹ", "ㄹㅡㄹ") and text[pos + 3] == " " and text[pos + 4] == "ㄹ":
            pieces.append(head + " " + "ㄴ")
            pos += 5
        else:
            pieces.append(text[pos])
            pos += 1

    # Whatever is left is too short to contain a match.
    pieces.append(text[pos:])
    return "".join(pieces)
|
168 |
+
|
169 |
+
|
170 |
+
def latin_to_hangul(text):
    """Apply every (pattern, replacement) pair of ``_latin_to_hangul`` to *text*, in order."""
    for pattern, hangul in _latin_to_hangul:
        text = pattern.sub(hangul, text)
    return text
|
174 |
+
|
175 |
+
|
176 |
+
def divide_hangul(text):
    """Pass *text* through ``h2j`` and ``j2hcj``, then apply the ``_hangul_divided`` substitutions."""
    decomposed = j2hcj(h2j(text))
    for pattern, parts in _hangul_divided:
        decomposed = pattern.sub(parts, decomposed)
    return decomposed
|
181 |
+
|
182 |
+
|
183 |
+
def hangul_number(num, sino=True):
    """Spell out a digit string in hangul.

    Reference https://github.com/Kyubyong/g2pK

    Args:
        num: Digit string; commas are removed first.
        sino: When true, the ones and tens places use the ``names`` table
            ("일이삼..."); otherwise they use the ``modifiers`` / ``decimals``
            tables ("한 두 세..." / "열 스물 ..."). Places from the hundreds
            up always use ``names``.

    Returns:
        The concatenated spelling of all places.
    """
    num = re.sub(",", "", num)

    # Special cases that the per-digit loop below does not produce.
    if num == "0":
        return "영"
    if not sino and num == "20":
        return "스무"

    digits = "123456789"
    names = "일이삼사오육칠팔구"
    digit2name = {d: n for d, n in zip(digits, names)}

    modifiers = "한 두 세 네 다섯 여섯 일곱 여덟 아홉"
    decimals = "열 스물 서른 마흔 쉰 예순 일흔 여든 아흔"
    digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
    digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}

    spelledout = []
    for i, digit in enumerate(num):
        # From here on i is the place of the digit counted from the right
        # (0 = ones, 1 = tens, ...).
        i = len(num) - i - 1
        if sino:
            if i == 0:
                name = digit2name.get(digit, "")
            elif i == 1:
                name = digit2name.get(digit, "") + "십"
                name = name.replace("일십", "십")
        else:
            if i == 0:
                name = digit2mod.get(digit, "")
            elif i == 1:
                name = digit2dec.get(digit, "")
        if digit == "0":
            if i % 4 == 0:
                # A zero at a 4-place boundary is skipped only when the
                # previous (up to three) places were empty too; otherwise it
                # falls through so the unit marker below is still emitted.
                last_three = spelledout[-min(3, len(spelledout)) :]
                if "".join(last_three) == "":
                    spelledout.append("")
                    continue
            else:
                spelledout.append("")
                continue
        # Places 2-7 drop a leading "일" before the unit; places 8-15 keep it.
        # NOTE(review): places >= 16 have no branch, so `name` keeps its
        # previous value (or is unbound for the first digit) - confirm
        # whether inputs that long can occur.
        if i == 2:
            name = digit2name.get(digit, "") + "백"
            name = name.replace("일백", "백")
        elif i == 3:
            name = digit2name.get(digit, "") + "천"
            name = name.replace("일천", "천")
        elif i == 4:
            name = digit2name.get(digit, "") + "만"
            name = name.replace("일만", "만")
        elif i == 5:
            name = digit2name.get(digit, "") + "십"
            name = name.replace("일십", "십")
        elif i == 6:
            name = digit2name.get(digit, "") + "백"
            name = name.replace("일백", "백")
        elif i == 7:
            name = digit2name.get(digit, "") + "천"
            name = name.replace("일천", "천")
        elif i == 8:
            name = digit2name.get(digit, "") + "억"
        elif i == 9:
            name = digit2name.get(digit, "") + "십"
        elif i == 10:
            name = digit2name.get(digit, "") + "백"
        elif i == 11:
            name = digit2name.get(digit, "") + "천"
        elif i == 12:
            name = digit2name.get(digit, "") + "조"
        elif i == 13:
            name = digit2name.get(digit, "") + "십"
        elif i == 14:
            name = digit2name.get(digit, "") + "백"
        elif i == 15:
            name = digit2name.get(digit, "") + "천"
        spelledout.append(name)
    return "".join(elem for elem in spelledout)
|
260 |
+
|
261 |
+
|
262 |
+
def number_to_hangul(text):
    """Spell out the digits found in ``text`` as Hangul.

    Reference https://github.com/Kyubyong/g2pK

    A run of digits (commas allowed after the first digit) that is directly
    followed by Hangul is passed to ``hangul_number``: with ``sino=False``
    when the following Hangul starts with an entry of ``_korean_classifiers``
    (its first two characters are checked, then its first character), and
    with ``sino=True`` otherwise. Every digit still present afterwards is
    read out one character at a time.

    Each match is rewritten where it occurs, in a single pass. Collecting the
    tokens first and substituting them with ``str.replace`` would let a short
    token such as "1개" be rewritten inside a longer one such as "11개".

    Args:
        text: input string, possibly containing digits.

    Returns:
        ``text`` with every ASCII digit replaced by Hangul.
    """

    def _spell(match):
        num, classifier = match.groups()
        # The regex guarantees at least one classifier character.
        native = classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers
        return f"{hangul_number(num, sino=not native)}{classifier}"

    text = re.sub(r"(\d[\d,]*)([\uac00-\ud71f]+)", _spell, text)
    # digit by digit for remaining digits
    return text.translate(str.maketrans("0123456789", "영일이삼사오육칠팔구"))
|
278 |
+
|
279 |
+
|
280 |
+
def korean_to_lazy_ipa(text):
    """Romanise the Hangul in ``text`` and apply the ``_ipa_to_lazy_ipa`` rewrites.

    The text first goes through ``latin_to_hangul`` and ``number_to_hangul``.
    Each run of characters in U+AC00-U+D7AF is then replaced by the output of
    ``ko_pron.romanise(..., "ipa")``, and finally every (regex, replacement)
    pair of ``_ipa_to_lazy_ipa`` is applied in order.
    """

    def _romanise(match):
        # Keep only the part before the first "] ~ [" separator in ko_pron's output.
        return ko_pron.romanise(match.group(0), "ipa").split("] ~ [")[0]

    hangul = number_to_hangul(latin_to_hangul(text))
    lazy = re.sub("[\uac00-\ud7af]+", _romanise, hangul)
    for pattern, substitute in _ipa_to_lazy_ipa:
        lazy = re.sub(pattern, substitute, lazy)
    return lazy
|
287 |
+
|
288 |
+
|
289 |
+
# Module-level G2p instance, created once at import time and called by
# korean_to_ipa() and g2p().
_g2p = G2p()
|
290 |
+
|
291 |
+
|
292 |
+
def korean_to_ipa(text):
    """Convert ``text`` to an IPA string.

    Pipeline: ``latin_to_hangul`` -> ``number_to_hangul`` -> the module-level
    ``_g2p`` -> ``fix_g2pk2_error`` -> ``korean_to_lazy_ipa``. In the result,
    "ʧ" is rewritten as "tʃ" and "ʥ" as "dʑ".
    """
    hangul = number_to_hangul(latin_to_hangul(text))
    pronounced = fix_g2pk2_error(_g2p(hangul))
    ipa = korean_to_lazy_ipa(pronounced)
    for old, new in (("ʧ", "tʃ"), ("ʥ", "dʑ")):
        ipa = ipa.replace(old, new)
    return ipa
|
299 |
+
|
300 |
+
|
301 |
+
def post_replace_ph(ph):
    """Normalise a single symbol against the ``symbols`` inventory.

    A symbol listed in the replacement table is rewritten first (for example
    a newline becomes "." and a space becomes "空"). If the outcome is not a
    member of ``symbols`` the placeholder "停" is returned instead.
    """
    rep_map = {
        ":": ",",
        ";": ",",
        ",": ",",
        "。": ".",
        "!": "!",
        "?": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "...": "…",
        " ": "空",
    }
    mapped = rep_map.get(ph, ph)
    return mapped if mapped in symbols else "停"
|
322 |
+
|
323 |
+
|
324 |
+
def g2p(text):
    """Turn ``text`` into a list of symbols.

    The text is passed through ``latin_to_hangul``, the module-level ``_g2p``,
    ``divide_hangul`` and ``fix_g2pk2_error``. If the result ends with a
    character in U+3131-U+3163 a "." is appended. Each character is then
    mapped through ``post_replace_ph``.

    Returns:
        A list with one entry per character of the processed text.
    """
    hangul = latin_to_hangul(text)
    divided = fix_g2pk2_error(divide_hangul(_g2p(hangul)))
    # Terminate the sequence with "." when it ends on a U+3131-U+3163 character.
    divided = re.sub(r"([\u3131-\u3163])$", r"\1.", divided)
    return list(map(post_replace_ph, divided))
|
333 |
+
|
334 |
+
|
335 |
+
if __name__ == "__main__":
    # Manual smoke check: print the symbols produced for a sample string.
    sample = "안녕하세요"
    print(g2p(sample))
|
text/namedict_cache.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:559552094c4a6e995213e3fa586330e078ef8cb3a7a95a3109e945111cd2bfc1
|
3 |
+
size 760663
|
text/opencpop-strict.txt
ADDED
@@ -0,0 +1,429 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
a AA a
|
2 |
+
ai AA ai
|
3 |
+
an AA an
|
4 |
+
ang AA ang
|
5 |
+
ao AA ao
|
6 |
+
ba b a
|
7 |
+
bai b ai
|
8 |
+
ban b an
|
9 |
+
bang b ang
|
10 |
+
bao b ao
|
11 |
+
bei b ei
|
12 |
+
ben b en
|
13 |
+
beng b eng
|
14 |
+
bi b i
|
15 |
+
bian b ian
|
16 |
+
biao b iao
|
17 |
+
bie b ie
|
18 |
+
bin b in
|
19 |
+
bing b ing
|
20 |
+
bo b o
|
21 |
+
bu b u
|
22 |
+
ca c a
|
23 |
+
cai c ai
|
24 |
+
can c an
|
25 |
+
cang c ang
|
26 |
+
cao c ao
|
27 |
+
ce c e
|
28 |
+
cei c ei
|
29 |
+
cen c en
|
30 |
+
ceng c eng
|
31 |
+
cha ch a
|
32 |
+
chai ch ai
|
33 |
+
chan ch an
|
34 |
+
chang ch ang
|
35 |
+
chao ch ao
|
36 |
+
che ch e
|
37 |
+
chen ch en
|
38 |
+
cheng ch eng
|
39 |
+
chi ch ir
|
40 |
+
chong ch ong
|
41 |
+
chou ch ou
|
42 |
+
chu ch u
|
43 |
+
chua ch ua
|
44 |
+
chuai ch uai
|
45 |
+
chuan ch uan
|
46 |
+
chuang ch uang
|
47 |
+
chui ch ui
|
48 |
+
chun ch un
|
49 |
+
chuo ch uo
|
50 |
+
ci c i0
|
51 |
+
cong c ong
|
52 |
+
cou c ou
|
53 |
+
cu c u
|
54 |
+
cuan c uan
|
55 |
+
cui c ui
|
56 |
+
cun c un
|
57 |
+
cuo c uo
|
58 |
+
da d a
|
59 |
+
dai d ai
|
60 |
+
dan d an
|
61 |
+
dang d ang
|
62 |
+
dao d ao
|
63 |
+
de d e
|
64 |
+
dei d ei
|
65 |
+
den d en
|
66 |
+
deng d eng
|
67 |
+
di d i
|
68 |
+
dia d ia
|
69 |
+
dian d ian
|
70 |
+
diao d iao
|
71 |
+
die d ie
|
72 |
+
ding d ing
|
73 |
+
diu d iu
|
74 |
+
dong d ong
|
75 |
+
dou d ou
|
76 |
+
du d u
|
77 |
+
duan d uan
|
78 |
+
dui d ui
|
79 |
+
dun d un
|
80 |
+
duo d uo
|
81 |
+
e EE e
|
82 |
+
ei EE ei
|
83 |
+
en EE en
|
84 |
+
eng EE eng
|
85 |
+
er EE er
|
86 |
+
fa f a
|
87 |
+
fan f an
|
88 |
+
fang f ang
|
89 |
+
fei f ei
|
90 |
+
fen f en
|
91 |
+
feng f eng
|
92 |
+
fo f o
|
93 |
+
fou f ou
|
94 |
+
fu f u
|
95 |
+
ga g a
|
96 |
+
gai g ai
|
97 |
+
gan g an
|
98 |
+
gang g ang
|
99 |
+
gao g ao
|
100 |
+
ge g e
|
101 |
+
gei g ei
|
102 |
+
gen g en
|
103 |
+
geng g eng
|
104 |
+
gong g ong
|
105 |
+
gou g ou
|
106 |
+
gu g u
|
107 |
+
gua g ua
|
108 |
+
guai g uai
|
109 |
+
guan g uan
|
110 |
+
guang g uang
|
111 |
+
gui g ui
|
112 |
+
gun g un
|
113 |
+
guo g uo
|
114 |
+
ha h a
|
115 |
+
hai h ai
|
116 |
+
han h an
|
117 |
+
hang h ang
|
118 |
+
hao h ao
|
119 |
+
he h e
|
120 |
+
hei h ei
|
121 |
+
hen h en
|
122 |
+
heng h eng
|
123 |
+
hong h ong
|
124 |
+
hou h ou
|
125 |
+
hu h u
|
126 |
+
hua h ua
|
127 |
+
huai h uai
|
128 |
+
huan h uan
|
129 |
+
huang h uang
|
130 |
+
hui h ui
|
131 |
+
hun h un
|
132 |
+
huo h uo
|
133 |
+
ji j i
|
134 |
+
jia j ia
|
135 |
+
jian j ian
|
136 |
+
jiang j iang
|
137 |
+
jiao j iao
|
138 |
+
jie j ie
|
139 |
+
jin j in
|
140 |
+
jing j ing
|
141 |
+
jiong j iong
|
142 |
+
jiu j iu
|
143 |
+
ju j v
|
144 |
+
jv j v
|
145 |
+
juan j van
|
146 |
+
jvan j van
|
147 |
+
jue j ve
|
148 |
+
jve j ve
|
149 |
+
jun j vn
|
150 |
+
jvn j vn
|
151 |
+
ka k a
|
152 |
+
kai k ai
|
153 |
+
kan k an
|
154 |
+
kang k ang
|
155 |
+
kao k ao
|
156 |
+
ke k e
|
157 |
+
kei k ei
|
158 |
+
ken k en
|
159 |
+
keng k eng
|
160 |
+
kong k ong
|
161 |
+
kou k ou
|
162 |
+
ku k u
|
163 |
+
kua k ua
|
164 |
+
kuai k uai
|
165 |
+
kuan k uan
|
166 |
+
kuang k uang
|
167 |
+
kui k ui
|
168 |
+
kun k un
|
169 |
+
kuo k uo
|
170 |
+
la l a
|
171 |
+
lai l ai
|
172 |
+
lan l an
|
173 |
+
lang l ang
|
174 |
+
lao l ao
|
175 |
+
le l e
|
176 |
+
lei l ei
|
177 |
+
leng l eng
|
178 |
+
li l i
|
179 |
+
lia l ia
|
180 |
+
lian l ian
|
181 |
+
liang l iang
|
182 |
+
liao l iao
|
183 |
+
lie l ie
|
184 |
+
lin l in
|
185 |
+
ling l ing
|
186 |
+
liu l iu
|
187 |
+
lo l o
|
188 |
+
long l ong
|
189 |
+
lou l ou
|
190 |
+
lu l u
|
191 |
+
luan l uan
|
192 |
+
lun l un
|
193 |
+
luo l uo
|
194 |
+
lv l v
|
195 |
+
lve l ve
|
196 |
+
ma m a
|
197 |
+
mai m ai
|
198 |
+
man m an
|
199 |
+
mang m ang
|
200 |
+
mao m ao
|
201 |
+
me m e
|
202 |
+
mei m ei
|
203 |
+
men m en
|
204 |
+
meng m eng
|
205 |
+
mi m i
|
206 |
+
mian m ian
|
207 |
+
miao m iao
|
208 |
+
mie m ie
|
209 |
+
min m in
|
210 |
+
ming m ing
|
211 |
+
miu m iu
|
212 |
+
mo m o
|
213 |
+
mou m ou
|
214 |
+
mu m u
|
215 |
+
na n a
|
216 |
+
nai n ai
|
217 |
+
nan n an
|
218 |
+
nang n ang
|
219 |
+
nao n ao
|
220 |
+
ne n e
|
221 |
+
nei n ei
|
222 |
+
nen n en
|
223 |
+
neng n eng
|
224 |
+
ni n i
|
225 |
+
nian n ian
|
226 |
+
niang n iang
|
227 |
+
niao n iao
|
228 |
+
nie n ie
|
229 |
+
nin n in
|
230 |
+
ning n ing
|
231 |
+
niu n iu
|
232 |
+
nong n ong
|
233 |
+
nou n ou
|
234 |
+
nu n u
|
235 |
+
nuan n uan
|
236 |
+
nun n un
|
237 |
+
nuo n uo
|
238 |
+
nv n v
|
239 |
+
nve n ve
|
240 |
+
o OO o
|
241 |
+
ou OO ou
|
242 |
+
pa p a
|
243 |
+
pai p ai
|
244 |
+
pan p an
|
245 |
+
pang p ang
|
246 |
+
pao p ao
|
247 |
+
pei p ei
|
248 |
+
pen p en
|
249 |
+
peng p eng
|
250 |
+
pi p i
|
251 |
+
pian p ian
|
252 |
+
piao p iao
|
253 |
+
pie p ie
|
254 |
+
pin p in
|
255 |
+
ping p ing
|
256 |
+
po p o
|
257 |
+
pou p ou
|
258 |
+
pu p u
|
259 |
+
qi q i
|
260 |
+
qia q ia
|
261 |
+
qian q ian
|
262 |
+
qiang q iang
|
263 |
+
qiao q iao
|
264 |
+
qie q ie
|
265 |
+
qin q in
|
266 |
+
qing q ing
|
267 |
+
qiong q iong
|
268 |
+
qiu q iu
|
269 |
+
qu q v
|
270 |
+
qv q v
|
271 |
+
quan q van
|
272 |
+
qvan q van
|
273 |
+
que q ve
|
274 |
+
qve q ve
|
275 |
+
qun q vn
|
276 |
+
qvn q vn
|
277 |
+
ran r an
|
278 |
+
rang r ang
|
279 |
+
rao r ao
|
280 |
+
re r e
|
281 |
+
ren r en
|
282 |
+
reng r eng
|
283 |
+
ri r ir
|
284 |
+
rong r ong
|
285 |
+
rou r ou
|
286 |
+
ru r u
|
287 |
+
rua r ua
|
288 |
+
ruan r uan
|
289 |
+
rui r ui
|
290 |
+
run r un
|
291 |
+
ruo r uo
|
292 |
+
sa s a
|
293 |
+
sai s ai
|
294 |
+
san s an
|
295 |
+
sang s ang
|
296 |
+
sao s ao
|
297 |
+
se s e
|
298 |
+
sen s en
|
299 |
+
seng s eng
|
300 |
+
sha sh a
|
301 |
+
shai sh ai
|
302 |
+
shan sh an
|
303 |
+
shang sh ang
|
304 |
+
shao sh ao
|
305 |
+
she sh e
|
306 |
+
shei sh ei
|
307 |
+
shen sh en
|
308 |
+
sheng sh eng
|
309 |
+
shi sh ir
|
310 |
+
shou sh ou
|
311 |
+
shu sh u
|
312 |
+
shua sh ua
|
313 |
+
shuai sh uai
|
314 |
+
shuan sh uan
|
315 |
+
shuang sh uang
|
316 |
+
shui sh ui
|
317 |
+
shun sh un
|
318 |
+
shuo sh uo
|
319 |
+
si s i0
|
320 |
+
song s ong
|
321 |
+
sou s ou
|
322 |
+
su s u
|
323 |
+
suan s uan
|
324 |
+
sui s ui
|
325 |
+
sun s un
|
326 |
+
suo s uo
|
327 |
+
ta t a
|
328 |
+
tai t ai
|
329 |
+
tan t an
|
330 |
+
tang t ang
|
331 |
+
tao t ao
|
332 |
+
te t e
|
333 |
+
tei t ei
|
334 |
+
teng t eng
|
335 |
+
ti t i
|
336 |
+
tian t ian
|
337 |
+
tiao t iao
|
338 |
+
tie t ie
|
339 |
+
ting t ing
|
340 |
+
tong t ong
|
341 |
+
tou t ou
|
342 |
+
tu t u
|
343 |
+
tuan t uan
|
344 |
+
tui t ui
|
345 |
+
tun t un
|
346 |
+
tuo t uo
|
347 |
+
wa w a
|
348 |
+
wai w ai
|
349 |
+
wan w an
|
350 |
+
wang w ang
|
351 |
+
wei w ei
|
352 |
+
wen w en
|
353 |
+
weng w eng
|
354 |
+
wo w o
|
355 |
+
wu w u
|
356 |
+
xi x i
|
357 |
+
xia x ia
|
358 |
+
xian x ian
|
359 |
+
xiang x iang
|
360 |
+
xiao x iao
|
361 |
+
xie x ie
|
362 |
+
xin x in
|
363 |
+
xing x ing
|
364 |
+
xiong x iong
|
365 |
+
xiu x iu
|
366 |
+
xu x v
|
367 |
+
xv x v
|
368 |
+
xuan x van
|
369 |
+
xvan x van
|
370 |
+
xue x ve
|
371 |
+
xve x ve
|
372 |
+
xun x vn
|
373 |
+
xvn x vn
|
374 |
+
ya y a
|
375 |
+
yan y En
|
376 |
+
yang y ang
|
377 |
+
yao y ao
|
378 |
+
ye y E
|
379 |
+
yi y i
|
380 |
+
yin y in
|
381 |
+
ying y ing
|
382 |
+
yo y o
|
383 |
+
yong y ong
|
384 |
+
you y ou
|
385 |
+
yu y v
|
386 |
+
yv y v
|
387 |
+
yuan y van
|
388 |
+
yvan y van
|
389 |
+
yue y ve
|
390 |
+
yve y ve
|
391 |
+
yun y vn
|
392 |
+
yvn y vn
|
393 |
+
za z a
|
394 |
+
zai z ai
|
395 |
+
zan z an
|
396 |
+
zang z ang
|
397 |
+
zao z ao
|
398 |
+
ze z e
|
399 |
+
zei z ei
|
400 |
+
zen z en
|
401 |
+
zeng z eng
|
402 |
+
zha zh a
|
403 |
+
zhai zh ai
|
404 |
+
zhan zh an
|
405 |
+
zhang zh ang
|
406 |
+
zhao zh ao
|
407 |
+
zhe zh e
|
408 |
+
zhei zh ei
|
409 |
+
zhen zh en
|
410 |
+
zheng zh eng
|
411 |
+
zhi zh ir
|
412 |
+
zhong zh ong
|
413 |
+
zhou zh ou
|
414 |
+
zhu zh u
|
415 |
+
zhua zh ua
|
416 |
+
zhuai zh uai
|
417 |
+
zhuan zh uan
|
418 |
+
zhuang zh uang
|
419 |
+
zhui zh ui
|
420 |
+
zhun zh un
|
421 |
+
zhuo zh uo
|
422 |
+
zi z i0
|
423 |
+
zong z ong
|
424 |
+
zou z ou
|
425 |
+
zu z u
|
426 |
+
zuan z uan
|
427 |
+
zui z ui
|
428 |
+
zun z un
|
429 |
+
zuo z uo
|
text/symbols.py
ADDED
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Punctuation marks kept as symbols. An earlier variant also listed "@"
# (described as the SP pause); it is not part of the active list.
punctuation = ["!", "?", "…", ",", ".", "-"]
# Punctuation plus the special tokens; an "SP4" token existed in a disabled variant.
pu_symbols = [*punctuation, "SP", "SP2", "SP3", "UNK"]
pad = "_"
|
7 |
+
|
8 |
+
# Symbols of the ``c`` group (presumably the Mandarin initials, including the
# "AA"/"EE"/"OO" placeholders -- confirm against the pinyin mapping table).
c = "AA EE OO b c ch d f g h j k l m n p q r s sh t w x y z zh".split()
|
36 |
+
# Toneless symbols of the ``v`` group (presumably the Mandarin finals).
v_without_tone = (
    "E En a ai an ang ao e ei en eng er i i0 ia ian iang iao ie in ing iong "
    "ir iu o ong ou u ua uai uan uang ui un uo v van ve vn"
).split()

# Every toneless symbol suffixed with a tone digit 1-5, grouped tone by tone.
v = [final + str(tone) for tone in range(1, 6) for final in v_without_tone]
|
275 |
+
|
276 |
+
# Japanese symbols.
ja_symbols = (
    "I N U a b by ch cl d dy e f g gy h hy i j k ky m my n ny o p py r ry "
    "s sh t ts u v w y z"
).split()
# Not enabled: "[" (rising pitch pattern), "]" (falling pitch pattern),
# "$" (end marker), "^" (start marker).
|
321 |
+
|
322 |
+
# ARPAbet-style symbols, stored as a set (listed here alphabetically).
arpa = set(
    (
        "AA0 AA1 AA2 AE0 AE1 AE2 AH0 AH1 AH2 AO0 AO1 AO2 AW0 AW1 AW2 AY0 AY1 AY2 "
        "B CH D DH EH0 EH1 EH2 ER ER0 ER1 ER2 EY0 EY1 EY2 F G HH IH IH0 IH1 IH2 "
        "IY0 IY1 IY2 JH K L M N NG OW0 OW1 OW2 OY0 OY1 OY2 P R S SH T TH "
        "UH0 UH1 UH2 UW0 UW1 UW2 V W Y Z ZH"
    ).split()
)
|
395 |
+
|
396 |
+
# Merge every inventory into one de-duplicated, sorted list.
symbols = sorted({pad, *c, *v, *ja_symbols, *pu_symbols, *arpa})

if __name__ == "__main__":
    print(len(symbols))
|
text/symbols2.py
ADDED
@@ -0,0 +1,797 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Punctuation marks kept as symbols. An earlier variant also listed "@"
# (described as the SP pause); it is not part of the active list.
punctuation = ["!", "?", "…", ",", ".", "-"]
# Punctuation plus the special tokens; an "SP4" token existed in a disabled variant.
pu_symbols = [*punctuation, "SP", "SP2", "SP3", "UNK"]
pad = "_"
|
7 |
+
|
8 |
+
# Symbols of the ``c`` group (presumably the Mandarin initials, including the
# "AA"/"EE"/"OO" placeholders -- confirm against the pinyin mapping table).
c = "AA EE OO b c ch d f g h j k l m n p q r s sh t w x y z zh".split()
|
36 |
+
# Toneless symbols of the ``v`` group (presumably the Mandarin finals).
v_without_tone = (
    "E En a ai an ang ao e ei en eng er i i0 ia ian iang iao ie in ing iong "
    "ir iu o ong ou u ua uai uan uang ui un uo v van ve vn"
).split()

# Every toneless symbol suffixed with a tone digit 1-5, grouped tone by tone.
v = [final + str(tone) for tone in range(1, 6) for final in v_without_tone]
|
275 |
+
|
276 |
+
# Japanese symbols.
ja_symbols = (
    "I N U a b by ch cl d dy e f g gy h hy i j k ky m my n ny o p py r ry "
    "s sh t ts u v w y z"
).split()
# Not enabled (the original note says these are left to be added later):
# "[" (rising pitch pattern), "]" (falling pitch pattern),
# "$" (end marker), "^" (start marker).
|
322 |
+
|
323 |
+
# ARPAbet-style symbols, stored as a set (listed here alphabetically).
arpa = set(
    (
        "AA0 AA1 AA2 AE0 AE1 AE2 AH0 AH1 AH2 AO0 AO1 AO2 AW0 AW1 AW2 AY0 AY1 AY2 "
        "B CH D DH EH0 EH1 EH2 ER ER0 ER1 ER2 EY0 EY1 EY2 F G HH IH IH0 IH1 IH2 "
        "IY0 IY1 IY2 JH K L M N NG OW0 OW1 OW2 OY0 OY1 OY2 P R S SH T TH "
        "UH0 UH1 UH2 UW0 UW1 UW2 V W Y Z ZH"
    ).split()
)
|
396 |
+
|
397 |
+
# Korean symbols, one per character of the string. The trailing "空" and "停"
# take the place of the plain space used in the commented-out variant.
ko_symbols = "ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停"
|
398 |
+
# ko_symbols='ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
|
399 |
+
|
400 |
+
yue_symbols = {
|
401 |
+
"Yeot3",
|
402 |
+
"Yip1",
|
403 |
+
"Yyu3",
|
404 |
+
"Yeng4",
|
405 |
+
"Yut5",
|
406 |
+
"Yaan5",
|
407 |
+
"Ym5",
|
408 |
+
"Yaan6",
|
409 |
+
"Yang1",
|
410 |
+
"Yun4",
|
411 |
+
"Yon2",
|
412 |
+
"Yui5",
|
413 |
+
"Yun2",
|
414 |
+
"Yat3",
|
415 |
+
"Ye",
|
416 |
+
"Yeot1",
|
417 |
+
"Yoeng5",
|
418 |
+
"Yoek2",
|
419 |
+
"Yam2",
|
420 |
+
"Yeon6",
|
421 |
+
"Yu6",
|
422 |
+
"Yiu3",
|
423 |
+
"Yaang6",
|
424 |
+
"Yp5",
|
425 |
+
"Yai4",
|
426 |
+
"Yoek4",
|
427 |
+
"Yit6",
|
428 |
+
"Yam5",
|
429 |
+
"Yoeng6",
|
430 |
+
"Yg1",
|
431 |
+
"Yk3",
|
432 |
+
"Yoe4",
|
433 |
+
"Yam3",
|
434 |
+
"Yc",
|
435 |
+
"Yyu4",
|
436 |
+
"Yyut1",
|
437 |
+
"Yiu4",
|
438 |
+
"Ying3",
|
439 |
+
"Yip3",
|
440 |
+
"Yaap3",
|
441 |
+
"Yau3",
|
442 |
+
"Yan4",
|
443 |
+
"Yau1",
|
444 |
+
"Yap4",
|
445 |
+
"Yk6",
|
446 |
+
"Yok3",
|
447 |
+
"Yai1",
|
448 |
+
"Yeot6",
|
449 |
+
"Yan2",
|
450 |
+
"Yoek6",
|
451 |
+
"Yt1",
|
452 |
+
"Yoi1",
|
453 |
+
"Yit5",
|
454 |
+
"Yn4",
|
455 |
+
"Yaau3",
|
456 |
+
"Yau4",
|
457 |
+
"Yuk6",
|
458 |
+
"Ys",
|
459 |
+
"Yuk",
|
460 |
+
"Yin6",
|
461 |
+
"Yung6",
|
462 |
+
"Ya",
|
463 |
+
"You",
|
464 |
+
"Yaai5",
|
465 |
+
"Yau5",
|
466 |
+
"Yoi3",
|
467 |
+
"Yaak3",
|
468 |
+
"Yaat3",
|
469 |
+
"Ying2",
|
470 |
+
"Yok5",
|
471 |
+
"Yeng2",
|
472 |
+
"Yyut3",
|
473 |
+
"Yam1",
|
474 |
+
"Yip5",
|
475 |
+
"You1",
|
476 |
+
"Yam6",
|
477 |
+
"Yaa5",
|
478 |
+
"Yi6",
|
479 |
+
"Yek4",
|
480 |
+
"Yyu2",
|
481 |
+
"Yuk5",
|
482 |
+
"Yaam1",
|
483 |
+
"Yang2",
|
484 |
+
"Yai",
|
485 |
+
"Yiu6",
|
486 |
+
"Yin4",
|
487 |
+
"Yok4",
|
488 |
+
"Yot3",
|
489 |
+
"Yui2",
|
490 |
+
"Yeoi5",
|
491 |
+
"Yyun6",
|
492 |
+
"Yyu5",
|
493 |
+
"Yoi5",
|
494 |
+
"Yeot2",
|
495 |
+
"Yim4",
|
496 |
+
"Yeoi2",
|
497 |
+
"Yaan1",
|
498 |
+
"Yang6",
|
499 |
+
"Yong1",
|
500 |
+
"Yaang4",
|
501 |
+
"Yung5",
|
502 |
+
"Yeon1",
|
503 |
+
"Yin2",
|
504 |
+
"Ya3",
|
505 |
+
"Yaang3",
|
506 |
+
"Yg",
|
507 |
+
"Yk2",
|
508 |
+
"Yaau5",
|
509 |
+
"Yut1",
|
510 |
+
"Yt5",
|
511 |
+
"Yip4",
|
512 |
+
"Yung4",
|
513 |
+
"Yj",
|
514 |
+
"Yong3",
|
515 |
+
"Ya1",
|
516 |
+
"Yg6",
|
517 |
+
"Yaau6",
|
518 |
+
"Yit3",
|
519 |
+
"Yun3",
|
520 |
+
"Ying1",
|
521 |
+
"Yn2",
|
522 |
+
"Yg4",
|
523 |
+
"Yl",
|
524 |
+
"Yp3",
|
525 |
+
"Yn3",
|
526 |
+
"Yak1",
|
527 |
+
"Yang5",
|
528 |
+
"Yoe6",
|
529 |
+
"You2",
|
530 |
+
"Yap2",
|
531 |
+
"Yak2",
|
532 |
+
"Yt3",
|
533 |
+
"Yot5",
|
534 |
+
"Yim2",
|
535 |
+
"Yi1",
|
536 |
+
"Yn6",
|
537 |
+
"Yaat5",
|
538 |
+
"Yaam3",
|
539 |
+
"Yoek5",
|
540 |
+
"Ye3",
|
541 |
+
"Yeon4",
|
542 |
+
"Yaa2",
|
543 |
+
"Yu3",
|
544 |
+
"Yim6",
|
545 |
+
"Ym",
|
546 |
+
"Yoe3",
|
547 |
+
"Yaai2",
|
548 |
+
"Ym2",
|
549 |
+
"Ya6",
|
550 |
+
"Yeng6",
|
551 |
+
"Yik4",
|
552 |
+
"Yot4",
|
553 |
+
"Yaai4",
|
554 |
+
"Yyun3",
|
555 |
+
"Yu1",
|
556 |
+
"Yoeng1",
|
557 |
+
"Yaap2",
|
558 |
+
"Yuk3",
|
559 |
+
"Yoek3",
|
560 |
+
"Yeng5",
|
561 |
+
"Yeoi1",
|
562 |
+
"Yiu2",
|
563 |
+
"Yok1",
|
564 |
+
"Yo1",
|
565 |
+
"Yoek1",
|
566 |
+
"Yoeng2",
|
567 |
+
"Yeon5",
|
568 |
+
"Yiu1",
|
569 |
+
"Yoeng4",
|
570 |
+
"Yuk2",
|
571 |
+
"Yat4",
|
572 |
+
"Yg5",
|
573 |
+
"Yut4",
|
574 |
+
"Yan6",
|
575 |
+
"Yin3",
|
576 |
+
"Yaa6",
|
577 |
+
"Yap1",
|
578 |
+
"Yg2",
|
579 |
+
"Yoe5",
|
580 |
+
"Yt4",
|
581 |
+
"Ya5",
|
582 |
+
"Yo4",
|
583 |
+
"Yyu1",
|
584 |
+
"Yak3",
|
585 |
+
"Yeon2",
|
586 |
+
"Yong4",
|
587 |
+
"Ym1",
|
588 |
+
"Ye2",
|
589 |
+
"Yaang5",
|
590 |
+
"Yoi2",
|
591 |
+
"Yeng3",
|
592 |
+
"Yn",
|
593 |
+
"Yyut4",
|
594 |
+
"Yau",
|
595 |
+
"Yaak2",
|
596 |
+
"Yaan4",
|
597 |
+
"Yek2",
|
598 |
+
"Yin1",
|
599 |
+
"Yi5",
|
600 |
+
"Yoe2",
|
601 |
+
"Yei5",
|
602 |
+
"Yaat6",
|
603 |
+
"Yak5",
|
604 |
+
"Yp6",
|
605 |
+
"Yok6",
|
606 |
+
"Yei2",
|
607 |
+
"Yaap1",
|
608 |
+
"Yyut5",
|
609 |
+
"Yi4",
|
610 |
+
"Yim1",
|
611 |
+
"Yk5",
|
612 |
+
"Ye4",
|
613 |
+
"Yok2",
|
614 |
+
"Yaam6",
|
615 |
+
"Yat2",
|
616 |
+
"Yon6",
|
617 |
+
"Yei3",
|
618 |
+
"Yyu6",
|
619 |
+
"Yeot5",
|
620 |
+
"Yk4",
|
621 |
+
"Yai6",
|
622 |
+
"Yd",
|
623 |
+
"Yg3",
|
624 |
+
"Yei6",
|
625 |
+
"Yau2",
|
626 |
+
"Yok",
|
627 |
+
"Yau6",
|
628 |
+
"Yung3",
|
629 |
+
"Yim5",
|
630 |
+
"Yut6",
|
631 |
+
"Yit1",
|
632 |
+
"Yon3",
|
633 |
+
"Yat1",
|
634 |
+
"Yaam2",
|
635 |
+
"Yyut2",
|
636 |
+
"Yui6",
|
637 |
+
"Yt2",
|
638 |
+
"Yek6",
|
639 |
+
"Yt",
|
640 |
+
"Ye6",
|
641 |
+
"Yang3",
|
642 |
+
"Ying6",
|
643 |
+
"Yaau1",
|
644 |
+
"Yeon3",
|
645 |
+
"Yng",
|
646 |
+
"Yh",
|
647 |
+
"Yang4",
|
648 |
+
"Ying5",
|
649 |
+
"Yaap6",
|
650 |
+
"Yoeng3",
|
651 |
+
"Yyun4",
|
652 |
+
"You3",
|
653 |
+
"Yan5",
|
654 |
+
"Yat5",
|
655 |
+
"Yot1",
|
656 |
+
"Yun1",
|
657 |
+
"Yi3",
|
658 |
+
"Yaa1",
|
659 |
+
"Yaap4",
|
660 |
+
"You6",
|
661 |
+
"Yaang2",
|
662 |
+
"Yaap5",
|
663 |
+
"Yaa3",
|
664 |
+
"Yaak6",
|
665 |
+
"Yeng1",
|
666 |
+
"Yaak1",
|
667 |
+
"Yo5",
|
668 |
+
"Yoi4",
|
669 |
+
"Yam4",
|
670 |
+
"Yik1",
|
671 |
+
"Ye1",
|
672 |
+
"Yai5",
|
673 |
+
"Yung1",
|
674 |
+
"Yp2",
|
675 |
+
"Yui4",
|
676 |
+
"Yaak4",
|
677 |
+
"Yung2",
|
678 |
+
"Yak4",
|
679 |
+
"Yaat4",
|
680 |
+
"Yeoi4",
|
681 |
+
"Yut2",
|
682 |
+
"Yin5",
|
683 |
+
"Yaau4",
|
684 |
+
"Yap6",
|
685 |
+
"Yb",
|
686 |
+
"Yaam4",
|
687 |
+
"Yw",
|
688 |
+
"Yut3",
|
689 |
+
"Yong2",
|
690 |
+
"Yt6",
|
691 |
+
"Yaai6",
|
692 |
+
"Yap5",
|
693 |
+
"Yik5",
|
694 |
+
"Yun6",
|
695 |
+
"Yaam5",
|
696 |
+
"Yun5",
|
697 |
+
"Yik3",
|
698 |
+
"Ya2",
|
699 |
+
"Yyut6",
|
700 |
+
"Yon4",
|
701 |
+
"Yk1",
|
702 |
+
"Yit4",
|
703 |
+
"Yak6",
|
704 |
+
"Yaan2",
|
705 |
+
"Yuk1",
|
706 |
+
"Yai2",
|
707 |
+
"Yik2",
|
708 |
+
"Yaat2",
|
709 |
+
"Yo3",
|
710 |
+
"Ykw",
|
711 |
+
"Yn5",
|
712 |
+
"Yaa",
|
713 |
+
"Ye5",
|
714 |
+
"Yu4",
|
715 |
+
"Yei1",
|
716 |
+
"Yai3",
|
717 |
+
"Yyun5",
|
718 |
+
"Yip2",
|
719 |
+
"Yaau2",
|
720 |
+
"Yiu5",
|
721 |
+
"Ym4",
|
722 |
+
"Yeoi6",
|
723 |
+
"Yk",
|
724 |
+
"Ym6",
|
725 |
+
"Yoe1",
|
726 |
+
"Yeoi3",
|
727 |
+
"Yon",
|
728 |
+
"Yuk4",
|
729 |
+
"Yaai3",
|
730 |
+
"Yaa4",
|
731 |
+
"Yot6",
|
732 |
+
"Yaang1",
|
733 |
+
"Yei4",
|
734 |
+
"Yek1",
|
735 |
+
"Yo",
|
736 |
+
"Yp",
|
737 |
+
"Yo6",
|
738 |
+
"Yp4",
|
739 |
+
"Yan3",
|
740 |
+
"Yoi",
|
741 |
+
"Yap3",
|
742 |
+
"Yek3",
|
743 |
+
"Yim3",
|
744 |
+
"Yz",
|
745 |
+
"Yot2",
|
746 |
+
"Yoi6",
|
747 |
+
"Yit2",
|
748 |
+
"Yu5",
|
749 |
+
"Yaan3",
|
750 |
+
"Yan1",
|
751 |
+
"Yon5",
|
752 |
+
"Yp1",
|
753 |
+
"Yong5",
|
754 |
+
"Ygw",
|
755 |
+
"Yak",
|
756 |
+
"Yat6",
|
757 |
+
"Ying4",
|
758 |
+
"Yu2",
|
759 |
+
"Yf",
|
760 |
+
"Ya4",
|
761 |
+
"Yon1",
|
762 |
+
"You4",
|
763 |
+
"Yik6",
|
764 |
+
"Yui1",
|
765 |
+
"Yaat1",
|
766 |
+
"Yeot4",
|
767 |
+
"Yi2",
|
768 |
+
"Yaai1",
|
769 |
+
"Yek5",
|
770 |
+
"Ym3",
|
771 |
+
"Yong6",
|
772 |
+
"You5",
|
773 |
+
"Yyun1",
|
774 |
+
"Yn1",
|
775 |
+
"Yo2",
|
776 |
+
"Yip6",
|
777 |
+
"Yui3",
|
778 |
+
"Yaak5",
|
779 |
+
"Yyun2",
|
780 |
+
}
|
781 |
+
|
782 |
+
# symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)+list(ko_symbols)#+list(yue_symbols)###直接这么加yue顺序乱了
|
783 |
+
symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
|
784 |
+
symbols = sorted(set(symbols))
|
785 |
+
# print(len(symbols))
|
786 |
+
symbols += ["[", "]"] ##日文新增上升下降调型
|
787 |
+
symbols += sorted(list(ko_symbols))
|
788 |
+
symbols += sorted(list(yue_symbols)) ##新加的yue统一摆在后头#已查过开头加Y后没有重复,韩文显然不会重复
|
789 |
+
# print(len(symbols))
|
790 |
+
if __name__ == "__main__":
|
791 |
+
print(len(symbols))
|
792 |
+
"""
|
793 |
+
粤语:
|
794 |
+
732-353=379
|
795 |
+
韩文+粤语:
|
796 |
+
732-322=410
|
797 |
+
"""
|
text/tone_sandhi.py
ADDED
@@ -0,0 +1,778 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
from typing import List
|
15 |
+
from typing import Tuple
|
16 |
+
|
17 |
+
import jieba_fast as jieba
|
18 |
+
from pypinyin import lazy_pinyin
|
19 |
+
from pypinyin import Style
|
20 |
+
|
21 |
+
|
22 |
+
class ToneSandhi:
|
23 |
+
def __init__(self):
|
24 |
+
self.must_neural_tone_words = {
|
25 |
+
"麻烦",
|
26 |
+
"麻利",
|
27 |
+
"鸳鸯",
|
28 |
+
"高粱",
|
29 |
+
"骨头",
|
30 |
+
"骆驼",
|
31 |
+
"马虎",
|
32 |
+
"首饰",
|
33 |
+
"馒头",
|
34 |
+
"馄饨",
|
35 |
+
"风筝",
|
36 |
+
"难为",
|
37 |
+
"队伍",
|
38 |
+
"阔气",
|
39 |
+
"闺女",
|
40 |
+
"门道",
|
41 |
+
"锄头",
|
42 |
+
"铺盖",
|
43 |
+
"铃铛",
|
44 |
+
"铁匠",
|
45 |
+
"钥匙",
|
46 |
+
"里脊",
|
47 |
+
"里头",
|
48 |
+
"部分",
|
49 |
+
"那么",
|
50 |
+
"道士",
|
51 |
+
"造化",
|
52 |
+
"迷糊",
|
53 |
+
"连累",
|
54 |
+
"这么",
|
55 |
+
"这个",
|
56 |
+
"运气",
|
57 |
+
"过去",
|
58 |
+
"软和",
|
59 |
+
"转悠",
|
60 |
+
"踏实",
|
61 |
+
"跳蚤",
|
62 |
+
"跟头",
|
63 |
+
"趔趄",
|
64 |
+
"财主",
|
65 |
+
"豆腐",
|
66 |
+
"讲究",
|
67 |
+
"记性",
|
68 |
+
"记号",
|
69 |
+
"认识",
|
70 |
+
"规矩",
|
71 |
+
"见识",
|
72 |
+
"裁缝",
|
73 |
+
"补丁",
|
74 |
+
"衣裳",
|
75 |
+
"衣服",
|
76 |
+
"衙门",
|
77 |
+
"街坊",
|
78 |
+
"行李",
|
79 |
+
"行当",
|
80 |
+
"蛤蟆",
|
81 |
+
"蘑菇",
|
82 |
+
"薄荷",
|
83 |
+
"葫芦",
|
84 |
+
"葡萄",
|
85 |
+
"萝卜",
|
86 |
+
"荸荠",
|
87 |
+
"苗条",
|
88 |
+
"苗头",
|
89 |
+
"苍蝇",
|
90 |
+
"芝麻",
|
91 |
+
"舒服",
|
92 |
+
"舒坦",
|
93 |
+
"舌头",
|
94 |
+
"自在",
|
95 |
+
"膏药",
|
96 |
+
"脾气",
|
97 |
+
"脑袋",
|
98 |
+
"脊梁",
|
99 |
+
"能耐",
|
100 |
+
"胳膊",
|
101 |
+
"胭脂",
|
102 |
+
"胡萝",
|
103 |
+
"胡琴",
|
104 |
+
"胡同",
|
105 |
+
"聪明",
|
106 |
+
"耽误",
|
107 |
+
"耽搁",
|
108 |
+
"耷拉",
|
109 |
+
"耳朵",
|
110 |
+
"老爷",
|
111 |
+
"老实",
|
112 |
+
"老婆",
|
113 |
+
"老头",
|
114 |
+
"老太",
|
115 |
+
"翻腾",
|
116 |
+
"罗嗦",
|
117 |
+
"罐头",
|
118 |
+
"编辑",
|
119 |
+
"结实",
|
120 |
+
"红火",
|
121 |
+
"累赘",
|
122 |
+
"糨糊",
|
123 |
+
"糊涂",
|
124 |
+
"精神",
|
125 |
+
"粮食",
|
126 |
+
"簸箕",
|
127 |
+
"篱笆",
|
128 |
+
"算计",
|
129 |
+
"算盘",
|
130 |
+
"答应",
|
131 |
+
"笤帚",
|
132 |
+
"笑语",
|
133 |
+
"笑话",
|
134 |
+
"窟窿",
|
135 |
+
"窝囊",
|
136 |
+
"窗户",
|
137 |
+
"稳当",
|
138 |
+
"稀罕",
|
139 |
+
"称呼",
|
140 |
+
"秧歌",
|
141 |
+
"秀气",
|
142 |
+
"秀才",
|
143 |
+
"福气",
|
144 |
+
"祖宗",
|
145 |
+
"砚台",
|
146 |
+
"码头",
|
147 |
+
"石榴",
|
148 |
+
"石头",
|
149 |
+
"石匠",
|
150 |
+
"知识",
|
151 |
+
"眼睛",
|
152 |
+
"眯缝",
|
153 |
+
"眨巴",
|
154 |
+
"眉毛",
|
155 |
+
"相声",
|
156 |
+
"盘算",
|
157 |
+
"白净",
|
158 |
+
"痢疾",
|
159 |
+
"痛快",
|
160 |
+
"疟疾",
|
161 |
+
"疙瘩",
|
162 |
+
"疏忽",
|
163 |
+
"畜生",
|
164 |
+
"生意",
|
165 |
+
"甘蔗",
|
166 |
+
"琵琶",
|
167 |
+
"琢磨",
|
168 |
+
"琉璃",
|
169 |
+
"玻璃",
|
170 |
+
"玫瑰",
|
171 |
+
"玄乎",
|
172 |
+
"狐狸",
|
173 |
+
"状元",
|
174 |
+
"特务",
|
175 |
+
"牲口",
|
176 |
+
"牙碜",
|
177 |
+
"牌楼",
|
178 |
+
"爽快",
|
179 |
+
"爱人",
|
180 |
+
"热闹",
|
181 |
+
"烧饼",
|
182 |
+
"烟筒",
|
183 |
+
"烂糊",
|
184 |
+
"点心",
|
185 |
+
"炊帚",
|
186 |
+
"灯笼",
|
187 |
+
"火候",
|
188 |
+
"漂亮",
|
189 |
+
"滑溜",
|
190 |
+
"溜达",
|
191 |
+
"温和",
|
192 |
+
"清楚",
|
193 |
+
"消息",
|
194 |
+
"浪头",
|
195 |
+
"活泼",
|
196 |
+
"比方",
|
197 |
+
"正经",
|
198 |
+
"欺负",
|
199 |
+
"模糊",
|
200 |
+
"槟榔",
|
201 |
+
"棺材",
|
202 |
+
"棒槌",
|
203 |
+
"棉花",
|
204 |
+
"核桃",
|
205 |
+
"栅栏",
|
206 |
+
"柴火",
|
207 |
+
"架势",
|
208 |
+
"枕头",
|
209 |
+
"���杷",
|
210 |
+
"机灵",
|
211 |
+
"本事",
|
212 |
+
"木头",
|
213 |
+
"木匠",
|
214 |
+
"朋友",
|
215 |
+
"月饼",
|
216 |
+
"月亮",
|
217 |
+
"暖和",
|
218 |
+
"明白",
|
219 |
+
"时候",
|
220 |
+
"新鲜",
|
221 |
+
"故事",
|
222 |
+
"收拾",
|
223 |
+
"收成",
|
224 |
+
"提防",
|
225 |
+
"挖苦",
|
226 |
+
"挑剔",
|
227 |
+
"指甲",
|
228 |
+
"指头",
|
229 |
+
"拾掇",
|
230 |
+
"拳头",
|
231 |
+
"拨弄",
|
232 |
+
"招牌",
|
233 |
+
"招呼",
|
234 |
+
"抬举",
|
235 |
+
"护士",
|
236 |
+
"折腾",
|
237 |
+
"扫帚",
|
238 |
+
"打量",
|
239 |
+
"打算",
|
240 |
+
"打点",
|
241 |
+
"打扮",
|
242 |
+
"打听",
|
243 |
+
"打发",
|
244 |
+
"扎实",
|
245 |
+
"扁担",
|
246 |
+
"戒指",
|
247 |
+
"懒得",
|
248 |
+
"意识",
|
249 |
+
"意思",
|
250 |
+
"情形",
|
251 |
+
"悟性",
|
252 |
+
"怪物",
|
253 |
+
"思量",
|
254 |
+
"怎么",
|
255 |
+
"念头",
|
256 |
+
"念叨",
|
257 |
+
"快活",
|
258 |
+
"忙活",
|
259 |
+
"志气",
|
260 |
+
"心思",
|
261 |
+
"得罪",
|
262 |
+
"张罗",
|
263 |
+
"弟兄",
|
264 |
+
"开通",
|
265 |
+
"应酬",
|
266 |
+
"庄稼",
|
267 |
+
"干事",
|
268 |
+
"帮手",
|
269 |
+
"帐篷",
|
270 |
+
"希罕",
|
271 |
+
"师父",
|
272 |
+
"师傅",
|
273 |
+
"巴结",
|
274 |
+
"巴掌",
|
275 |
+
"差事",
|
276 |
+
"工夫",
|
277 |
+
"岁数",
|
278 |
+
"屁股",
|
279 |
+
"尾巴",
|
280 |
+
"少爷",
|
281 |
+
"小气",
|
282 |
+
"小伙",
|
283 |
+
"将就",
|
284 |
+
"对头",
|
285 |
+
"对付",
|
286 |
+
"寡妇",
|
287 |
+
"家伙",
|
288 |
+
"客气",
|
289 |
+
"实在",
|
290 |
+
"官司",
|
291 |
+
"学问",
|
292 |
+
"学生",
|
293 |
+
"字号",
|
294 |
+
"嫁妆",
|
295 |
+
"媳妇",
|
296 |
+
"媒人",
|
297 |
+
"婆家",
|
298 |
+
"娘家",
|
299 |
+
"委屈",
|
300 |
+
"姑娘",
|
301 |
+
"姐夫",
|
302 |
+
"妯娌",
|
303 |
+
"妥当",
|
304 |
+
"妖精",
|
305 |
+
"奴才",
|
306 |
+
"女婿",
|
307 |
+
"头发",
|
308 |
+
"太阳",
|
309 |
+
"大爷",
|
310 |
+
"大方",
|
311 |
+
"大意",
|
312 |
+
"大夫",
|
313 |
+
"多少",
|
314 |
+
"多么",
|
315 |
+
"外甥",
|
316 |
+
"壮实",
|
317 |
+
"地道",
|
318 |
+
"地方",
|
319 |
+
"在乎",
|
320 |
+
"困难",
|
321 |
+
"嘴巴",
|
322 |
+
"嘱咐",
|
323 |
+
"嘟囔",
|
324 |
+
"嘀咕",
|
325 |
+
"喜欢",
|
326 |
+
"喇嘛",
|
327 |
+
"喇叭",
|
328 |
+
"商量",
|
329 |
+
"唾沫",
|
330 |
+
"哑巴",
|
331 |
+
"哈欠",
|
332 |
+
"哆嗦",
|
333 |
+
"咳嗽",
|
334 |
+
"和尚",
|
335 |
+
"告诉",
|
336 |
+
"告示",
|
337 |
+
"含糊",
|
338 |
+
"吓唬",
|
339 |
+
"后头",
|
340 |
+
"名字",
|
341 |
+
"名堂",
|
342 |
+
"合同",
|
343 |
+
"吆喝",
|
344 |
+
"叫唤",
|
345 |
+
"口袋",
|
346 |
+
"厚道",
|
347 |
+
"厉害",
|
348 |
+
"千斤",
|
349 |
+
"包袱",
|
350 |
+
"包涵",
|
351 |
+
"匀称",
|
352 |
+
"勤快",
|
353 |
+
"动静",
|
354 |
+
"动弹",
|
355 |
+
"功夫",
|
356 |
+
"力气",
|
357 |
+
"前头",
|
358 |
+
"刺猬",
|
359 |
+
"刺激",
|
360 |
+
"别扭",
|
361 |
+
"利落",
|
362 |
+
"利索",
|
363 |
+
"利害",
|
364 |
+
"分析",
|
365 |
+
"出息",
|
366 |
+
"凑合",
|
367 |
+
"凉快",
|
368 |
+
"冷战",
|
369 |
+
"冤枉",
|
370 |
+
"冒失",
|
371 |
+
"养活",
|
372 |
+
"关系",
|
373 |
+
"先生",
|
374 |
+
"兄弟",
|
375 |
+
"便宜",
|
376 |
+
"使唤",
|
377 |
+
"佩服",
|
378 |
+
"作坊",
|
379 |
+
"体面",
|
380 |
+
"位置",
|
381 |
+
"似的",
|
382 |
+
"伙计",
|
383 |
+
"休息",
|
384 |
+
"什么",
|
385 |
+
"人家",
|
386 |
+
"亲戚",
|
387 |
+
"亲家",
|
388 |
+
"交情",
|
389 |
+
"云彩",
|
390 |
+
"事情",
|
391 |
+
"买卖",
|
392 |
+
"主意",
|
393 |
+
"丫头",
|
394 |
+
"丧气",
|
395 |
+
"两口",
|
396 |
+
"东西",
|
397 |
+
"东家",
|
398 |
+
"世故",
|
399 |
+
"不由",
|
400 |
+
"不在",
|
401 |
+
"下水",
|
402 |
+
"下巴",
|
403 |
+
"上头",
|
404 |
+
"上司",
|
405 |
+
"丈夫",
|
406 |
+
"丈人",
|
407 |
+
"一辈",
|
408 |
+
"那个",
|
409 |
+
"菩萨",
|
410 |
+
"父亲",
|
411 |
+
"母亲",
|
412 |
+
"咕噜",
|
413 |
+
"邋遢",
|
414 |
+
"费用",
|
415 |
+
"冤家",
|
416 |
+
"甜头",
|
417 |
+
"介绍",
|
418 |
+
"荒唐",
|
419 |
+
"大人",
|
420 |
+
"泥鳅",
|
421 |
+
"幸福",
|
422 |
+
"熟悉",
|
423 |
+
"计划",
|
424 |
+
"扑腾",
|
425 |
+
"蜡烛",
|
426 |
+
"姥爷",
|
427 |
+
"照顾",
|
428 |
+
"喉咙",
|
429 |
+
"吉他",
|
430 |
+
"弄堂",
|
431 |
+
"蚂蚱",
|
432 |
+
"凤凰",
|
433 |
+
"拖沓",
|
434 |
+
"寒碜",
|
435 |
+
"糟蹋",
|
436 |
+
"倒腾",
|
437 |
+
"报复",
|
438 |
+
"逻辑",
|
439 |
+
"盘缠",
|
440 |
+
"喽啰",
|
441 |
+
"牢骚",
|
442 |
+
"咖喱",
|
443 |
+
"扫把",
|
444 |
+
"惦记",
|
445 |
+
}
|
446 |
+
self.must_not_neural_tone_words = {
|
447 |
+
"男子",
|
448 |
+
"女子",
|
449 |
+
"分子",
|
450 |
+
"原子",
|
451 |
+
"量子",
|
452 |
+
"莲子",
|
453 |
+
"石子",
|
454 |
+
"瓜子",
|
455 |
+
"电子",
|
456 |
+
"人人",
|
457 |
+
"虎虎",
|
458 |
+
"幺幺",
|
459 |
+
"干嘛",
|
460 |
+
"学子",
|
461 |
+
"哈哈",
|
462 |
+
"数数",
|
463 |
+
"袅袅",
|
464 |
+
"局地",
|
465 |
+
"以下",
|
466 |
+
"娃哈哈",
|
467 |
+
"花花草草",
|
468 |
+
"留得",
|
469 |
+
"耕地",
|
470 |
+
"想想",
|
471 |
+
"熙熙",
|
472 |
+
"攘攘",
|
473 |
+
"卵子",
|
474 |
+
"死死",
|
475 |
+
"冉冉",
|
476 |
+
"恳恳",
|
477 |
+
"佼佼",
|
478 |
+
"吵吵",
|
479 |
+
"打打",
|
480 |
+
"考考",
|
481 |
+
"整整",
|
482 |
+
"莘莘",
|
483 |
+
"落地",
|
484 |
+
"算子",
|
485 |
+
"家家户户",
|
486 |
+
"青青",
|
487 |
+
}
|
488 |
+
self.punc = ":,;。?!“”‘’':,;.?!"
|
489 |
+
|
490 |
+
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
|
491 |
+
# e.g.
|
492 |
+
# word: "家里"
|
493 |
+
# pos: "s"
|
494 |
+
# finals: ['ia1', 'i3']
|
495 |
+
def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
    """Apply neutral-tone rules to ``finals`` (tone digit "5" = neutral).

    Args:
        word: the segmented word.
        pos: its jieba part-of-speech tag.
        finals: one tone-numbered final per character; mutated in place.

    Returns:
        A new list with the finals of the two halves produced by
        ``_split_word`` after all rules have been applied.
    """
    lexicon = self.must_neural_tone_words
    size = len(word)

    # Reduplicated n./v./a. words, e.g. 奶奶, 试试, 旺旺: the repeated
    # character takes the neutral tone unless the word is exempt.
    for j, (prev_ch, ch) in enumerate(zip(word, word[1:]), start=1):
        if (
            ch == prev_ch
            and pos[0] in {"n", "v", "a"}
            and word not in self.must_not_neural_tone_words
        ):
            finals[j] = finals[j][:-1] + "5"

    ge_idx = word.find("个")

    # Every alternative below neutralises the LAST syllable; they are tried
    # in order and the first match wins.
    neutral_tail = (
        # sentence-final particles
        (size >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶")
        # structural particles
        or (size >= 1 and word[-1] in "的地得")
        # aspect markers, e.g. 走了, 看着, 去过
        or (size == 1 and word in "了着过" and pos in {"ul", "uz", "ug"})
        # 们 / 子 suffixes on pronouns and nouns
        or (
            size > 1
            and word[-1] in "们子"
            and pos in {"r", "n"}
            and word not in self.must_not_neural_tone_words
        )
        # locatives, e.g. 桌上, 地下, 家里
        or (size > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"})
        # directional complements, e.g. 上来, 下去
        or (size > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开")
    )

    if neutral_tail:
        finals[-1] = finals[-1][:-1] + "5"
    elif (
        ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
    ) or word == "个":
        # "个" used as a measure word
        finals[ge_idx] = finals[ge_idx][:-1] + "5"
    elif word in lexicon or word[-2:] in lexicon:
        finals[-1] = finals[-1][:-1] + "5"

    # Re-check each half of the word against the conventional
    # neutral-tone lexicon.
    parts = self._split_word(word)
    cut = len(parts[0])
    finals_parts = [finals[:cut], finals[cut:]]
    for i, part in enumerate(parts):
        if part in lexicon or part[-2:] in lexicon:
            finals_parts[i][-1] = finals_parts[i][-1][:-1] + "5"
    return [final for group in finals_parts for final in group]
|
538 |
+
|
539 |
+
def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
|
540 |
+
# e.g. 看不懂
|
541 |
+
if len(word) == 3 and word[1] == "不":
|
542 |
+
finals[1] = finals[1][:-1] + "5"
|
543 |
+
else:
|
544 |
+
for i, char in enumerate(word):
|
545 |
+
# "不" before tone4 should be bu2, e.g. 不怕
|
546 |
+
if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
|
547 |
+
finals[i] = finals[i][:-1] + "2"
|
548 |
+
return finals
|
549 |
+
|
550 |
+
def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
|
551 |
+
# "一" in number sequences, e.g. 一零零, 二一零
|
552 |
+
if word.find("一") != -1 and all([item.isnumeric() for item in word if item != "一"]):
|
553 |
+
return finals
|
554 |
+
# "一" between reduplication words shold be yi5, e.g. 看一看
|
555 |
+
elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
|
556 |
+
finals[1] = finals[1][:-1] + "5"
|
557 |
+
# when "一" is ordinal word, it should be yi1
|
558 |
+
elif word.startswith("第一"):
|
559 |
+
finals[1] = finals[1][:-1] + "1"
|
560 |
+
else:
|
561 |
+
for i, char in enumerate(word):
|
562 |
+
if char == "一" and i + 1 < len(word):
|
563 |
+
# "一" before tone4 should be yi2, e.g. 一段
|
564 |
+
if finals[i + 1][-1] == "4":
|
565 |
+
finals[i] = finals[i][:-1] + "2"
|
566 |
+
# "一" before non-tone4 should be yi4, e.g. 一天
|
567 |
+
else:
|
568 |
+
# "一" 后面如果是标点,还读一声
|
569 |
+
if word[i + 1] not in self.punc:
|
570 |
+
finals[i] = finals[i][:-1] + "4"
|
571 |
+
return finals
|
572 |
+
|
573 |
+
def _split_word(self, word: str) -> List[str]:
|
574 |
+
word_list = jieba.cut_for_search(word)
|
575 |
+
word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
|
576 |
+
first_subword = word_list[0]
|
577 |
+
first_begin_idx = word.find(first_subword)
|
578 |
+
if first_begin_idx == 0:
|
579 |
+
second_subword = word[len(first_subword) :]
|
580 |
+
new_word_list = [first_subword, second_subword]
|
581 |
+
else:
|
582 |
+
second_subword = word[: -len(first_subword)]
|
583 |
+
new_word_list = [second_subword, first_subword]
|
584 |
+
return new_word_list
|
585 |
+
|
586 |
+
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
|
587 |
+
if len(word) == 2 and self._all_tone_three(finals):
|
588 |
+
finals[0] = finals[0][:-1] + "2"
|
589 |
+
elif len(word) == 3:
|
590 |
+
word_list = self._split_word(word)
|
591 |
+
if self._all_tone_three(finals):
|
592 |
+
# disyllabic + monosyllabic, e.g. 蒙古/包
|
593 |
+
if len(word_list[0]) == 2:
|
594 |
+
finals[0] = finals[0][:-1] + "2"
|
595 |
+
finals[1] = finals[1][:-1] + "2"
|
596 |
+
# monosyllabic + disyllabic, e.g. 纸/老虎
|
597 |
+
elif len(word_list[0]) == 1:
|
598 |
+
finals[1] = finals[1][:-1] + "2"
|
599 |
+
else:
|
600 |
+
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
|
601 |
+
if len(finals_list) == 2:
|
602 |
+
for i, sub in enumerate(finals_list):
|
603 |
+
# e.g. 所有/人
|
604 |
+
if self._all_tone_three(sub) and len(sub) == 2:
|
605 |
+
finals_list[i][0] = finals_list[i][0][:-1] + "2"
|
606 |
+
# e.g. 好/喜欢
|
607 |
+
elif (
|
608 |
+
i == 1
|
609 |
+
and not self._all_tone_three(sub)
|
610 |
+
and finals_list[i][0][-1] == "3"
|
611 |
+
and finals_list[0][-1][-1] == "3"
|
612 |
+
):
|
613 |
+
finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
|
614 |
+
finals = sum(finals_list, [])
|
615 |
+
# split idiom into two words who's length is 2
|
616 |
+
elif len(word) == 4:
|
617 |
+
finals_list = [finals[:2], finals[2:]]
|
618 |
+
finals = []
|
619 |
+
for sub in finals_list:
|
620 |
+
if self._all_tone_three(sub):
|
621 |
+
sub[0] = sub[0][:-1] + "2"
|
622 |
+
finals += sub
|
623 |
+
|
624 |
+
return finals
|
625 |
+
|
626 |
+
def _all_tone_three(self, finals: List[str]) -> bool:
|
627 |
+
return all(x[-1] == "3" for x in finals)
|
628 |
+
|
629 |
+
# merge "不" and the word behind it
|
630 |
+
# Without this merge, jieba sometimes emits "不" as a standalone token, which can cause sandhi errors.
|
631 |
+
def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
632 |
+
new_seg = []
|
633 |
+
last_word = ""
|
634 |
+
for word, pos in seg:
|
635 |
+
if last_word == "不":
|
636 |
+
word = last_word + word
|
637 |
+
if word != "不":
|
638 |
+
new_seg.append((word, pos))
|
639 |
+
last_word = word[:]
|
640 |
+
if last_word == "不":
|
641 |
+
new_seg.append((last_word, "d"))
|
642 |
+
last_word = ""
|
643 |
+
return new_seg
|
644 |
+
|
645 |
+
# function 1: merge "一" with the identical (reduplicated) words on its left and right, e.g. "听","一","听" -> "听一听"
|
646 |
+
# function 2: merge single "一" and the word behind it
|
647 |
+
# Without this merge, jieba sometimes emits "一" as a standalone token, which can cause sandhi errors.
|
648 |
+
# e.g.
|
649 |
+
# input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
|
650 |
+
# output seg: [['听一听', 'v']]
|
651 |
+
def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
652 |
+
new_seg = []
|
653 |
+
i = 0
|
654 |
+
# function 1
|
655 |
+
while i < len(seg):
|
656 |
+
word, pos = seg[i]
|
657 |
+
merged = False
|
658 |
+
if (
|
659 |
+
i - 1 >= 0
|
660 |
+
and word == "一"
|
661 |
+
and i + 1 < len(seg)
|
662 |
+
):
|
663 |
+
last = new_seg[-1] if new_seg else seg[i - 1]
|
664 |
+
if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v":
|
665 |
+
combined = last[0] + "一" + seg[i + 1][0]
|
666 |
+
new_seg[-1] = [combined, last[1]]
|
667 |
+
i += 2
|
668 |
+
merged = True
|
669 |
+
if not merged:
|
670 |
+
new_seg.append([word, pos])
|
671 |
+
i += 1
|
672 |
+
seg = new_seg
|
673 |
+
new_seg = []
|
674 |
+
# function 2
|
675 |
+
for word, pos in seg:
|
676 |
+
if new_seg and new_seg[-1][0] == "一":
|
677 |
+
new_seg[-1][0] = new_seg[-1][0] + word
|
678 |
+
else:
|
679 |
+
new_seg.append([word, pos])
|
680 |
+
return new_seg
|
681 |
+
|
682 |
+
# the first and the second words are all_tone_three
|
683 |
+
def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Merge adjacent words whose finals are all tone 3.

    A word is appended to its predecessor when both are entirely tone 3,
    the predecessor was not itself just merged, the predecessor is not a
    reduplication (those must stay separate for ``_neural_sandhi``), and
    the combined length is at most three characters. Entries of the
    returned list are two-element ``[word, pos]`` lists.
    """
    finals_per_word = [
        lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
    ]
    assert len(finals_per_word) == len(seg)

    absorbed = [False] * len(seg)
    result = []
    for idx, (token, tag) in enumerate(seg):
        should_merge = (
            idx - 1 >= 0
            and self._all_tone_three(finals_per_word[idx - 1])
            and self._all_tone_three(finals_per_word[idx])
            and not absorbed[idx - 1]
            and not self._is_reduplication(seg[idx - 1][0])
            and len(seg[idx - 1][0]) + len(seg[idx][0]) <= 3
        )
        if should_merge:
            result[-1][0] = result[-1][0] + seg[idx][0]
            absorbed[idx] = True
        else:
            result.append([token, tag])
    return result
|
707 |
+
|
708 |
+
def _is_reduplication(self, word: str) -> bool:
|
709 |
+
return len(word) == 2 and word[0] == word[1]
|
710 |
+
|
711 |
+
# the last char of first word and the first char of second word is tone_three
|
712 |
+
def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Merge adjacent words that meet tone 3 to tone 3 at their boundary.

    A word is appended to its predecessor when the predecessor's last final
    and this word's first final are both tone 3, the predecessor was not
    itself just merged, the predecessor is not a reduplication (those must
    stay separate for ``_neural_sandhi``), and the combined length is at
    most three characters. Entries of the returned list are two-element
    ``[word, pos]`` lists.
    """
    finals_per_word = [
        lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
    ]
    assert len(finals_per_word) == len(seg)

    absorbed = [False] * len(seg)
    result = []
    for idx, (token, tag) in enumerate(seg):
        should_merge = (
            idx - 1 >= 0
            and finals_per_word[idx - 1][-1][-1] == "3"
            and finals_per_word[idx][0][-1] == "3"
            and not absorbed[idx - 1]
            and not self._is_reduplication(seg[idx - 1][0])
            and len(seg[idx - 1][0]) + len(seg[idx][0]) <= 3
        )
        if should_merge:
            result[-1][0] = result[-1][0] + seg[idx][0]
            absorbed[idx] = True
        else:
            result.append([token, tag])
    return result
|
735 |
+
|
736 |
+
def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
737 |
+
new_seg = []
|
738 |
+
for i, (word, pos) in enumerate(seg):
|
739 |
+
if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
|
740 |
+
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
|
741 |
+
else:
|
742 |
+
new_seg.append([word, pos])
|
743 |
+
return new_seg
|
744 |
+
|
745 |
+
def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
746 |
+
new_seg = []
|
747 |
+
for i, (word, pos) in enumerate(seg):
|
748 |
+
if new_seg and word == new_seg[-1][0]:
|
749 |
+
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
|
750 |
+
else:
|
751 |
+
new_seg.append([word, pos])
|
752 |
+
return new_seg
|
753 |
+
|
754 |
+
def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
755 |
+
seg = self._merge_bu(seg)
|
756 |
+
try:
|
757 |
+
seg = self._merge_yi(seg)
|
758 |
+
except:
|
759 |
+
print("_merge_yi failed")
|
760 |
+
seg = self._merge_reduplication(seg)
|
761 |
+
try:
|
762 |
+
seg = self._merge_continuous_three_tones(seg)
|
763 |
+
except:
|
764 |
+
print("_merge_continuous_three_tones failed")
|
765 |
+
try:
|
766 |
+
seg = self._merge_continuous_three_tones_2(seg)
|
767 |
+
except:
|
768 |
+
print("_merge_continuous_three_tones_2 failed")
|
769 |
+
|
770 |
+
seg = self._merge_er(seg)
|
771 |
+
return seg
|
772 |
+
|
773 |
+
def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
    """Run the sandhi rules on one word and return the adjusted finals.

    The rules run in a fixed order: "不" sandhi, "一" sandhi, neutral-tone
    sandhi (the only one that uses ``pos``), then third-tone sandhi.
    """
    for rule in (self._bu_sandhi, self._yi_sandhi):
        finals = rule(word, finals)
    finals = self._neural_sandhi(word, pos, finals)
    return self._three_sandhi(word, finals)
|
text/zh_normalization/README.md
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Supported NSW (Non-Standard-Word) Normalization
|
2 |
+
|
3 |
+
|NSW type|raw|normalized|
|
4 |
+
|:--|:-|:-|
|
5 |
+
|serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九|
|
6 |
+
|cardinal|这块黄金重达324.75克<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
|
7 |
+
|numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
|
8 |
+
|date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日|
|
9 |
+
|time|等会请在12:05请通知我|等会请在十二点零五分请通知我|
|
10 |
+
|temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度|
|
11 |
+
|fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
|
12 |
+
|percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨|
|
13 |
+
|money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万|
|
14 |
+
|telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
|
15 |
+
## References
|
16 |
+
[Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files)
|
text/zh_normalization/__init__.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
from text.zh_normalization.text_normlization import *
|
text/zh_normalization/char_convert.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
"""Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters."""
|
16 |
+
|
17 |
+
simplified_charcters = "制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁���稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩
郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱
藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢��尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝
昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎���蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙
窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓��
鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤"
|
18 |
+
|
19 |
+
traditional_characters = "制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販
郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨��倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純
皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢���鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕
昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙��舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷
蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒���踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴
鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤"
|
20 |
+
|
21 |
+
# The two tables are positional: character i of one string is the counterpart
# of character i of the other, so they must have exactly the same length.
# (The previous check compared the simplified table with itself and could
# therefore never fail.)
assert len(simplified_charcters) == len(traditional_characters), (
    "simplified/traditional character tables differ in length: "
    f"{len(simplified_charcters)} != {len(traditional_characters)}"
)

# Simplified -> traditional and traditional -> simplified lookup tables.
# When a character occurs more than once in the key string, the pairing at
# the LAST occurrence wins (same as assigning in a loop).
s2t_dict = dict(zip(simplified_charcters, traditional_characters))
t2s_dict = dict(zip(traditional_characters, simplified_charcters))
|
28 |
+
|
29 |
+
|
30 |
+
def tranditional_to_simplified(text: str) -> str:
    """Map each character of *text* through ``t2s_dict``.

    Characters that have no entry in ``t2s_dict`` are copied unchanged.

    Args:
        text: Input string.

    Returns:
        A string of the same length with every mapped character replaced.
    """
    return "".join(t2s_dict.get(char, char) for char in text)
|
32 |
+
|
33 |
+
|
34 |
+
def simplified_to_traditional(text: str) -> str:
    """Map each character of *text* through ``s2t_dict``.

    Characters that have no entry in ``s2t_dict`` are copied unchanged.

    Args:
        text: Input string.

    Returns:
        A string of the same length with every mapped character replaced.
    """
    return "".join(s2t_dict.get(char, char) for char in text)
|
36 |
+
|
37 |
+
|
38 |
+
if __name__ == "__main__":
    # Round-trip demo: traditional -> simplified -> traditional.
    sample = "一般是指存取一個應用程式啟動時始終顯示在網站或網頁瀏覽器中的一個或多個初始網頁等畫面存在的站點"
    as_simplified = tranditional_to_simplified(sample)
    back_to_traditional = simplified_to_traditional(as_simplified)
    for line in (sample, as_simplified, back_to_traditional):
        print(line)
|
text/zh_normalization/chronology.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import re
|
15 |
+
|
16 |
+
from .num import DIGITS
|
17 |
+
from .num import num2str
|
18 |
+
from .num import verbalize_cardinal
|
19 |
+
from .num import verbalize_digit
|
20 |
+
|
21 |
+
|
22 |
+
def _time_num2str(num_string: str) -> str:
    """Verbalize a minute/second field, keeping a leading zero audible.

    The digits are passed to ``num2str`` with leading zeros stripped; if the
    original field started with "0", ``DIGITS["0"]`` is prepended to the
    result.

    Args:
        num_string: The numeric field as matched in the text (e.g. "05").

    Returns:
        The verbalized field.
    """
    spoken = num2str(num_string.lstrip("0"))
    return DIGITS["0"] + spoken if num_string.startswith("0") else spoken
|
28 |
+
|
29 |
+
|
30 |
+
# 时刻表达式
|
31 |
+
# Clock time H:MM or H:MM:SS.
# Groups: (1) hour 0-23, (2) minute 00-59, (3) optional ":SS", (4) SS alone.
RE_TIME = re.compile(r"([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?")
|
36 |
+
|
37 |
+
# 时间范围,如8:30-12:30
|
38 |
+
# Two clock times joined by "~" or "-".
# Groups 1-4 describe the start time (hour, minute, ":SS", SS), group 5 is the
# separator, and groups 6-9 describe the end time in the same layout.
RE_TIME_RANGE = re.compile(
    r"([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?"  # start time
    r"(~|-)"  # range separator
    r"([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?"  # end time
)
|
47 |
+
|
48 |
+
|
49 |
+
def _verbalize_clock(hour: str, minute: str, second) -> str:
    """Verbalize one clock time from its matched hour/minute/second fields.

    A minute field of 30 is spoken as "半"; an all-zero minute or second field
    is omitted. ``second`` may be ``None`` when the seconds part was absent.
    """
    spoken = f"{num2str(hour)}点"
    if minute.lstrip("0"):
        if int(minute) == 30:
            spoken += "半"
        else:
            spoken += f"{_time_num2str(minute)}分"
    if second and second.lstrip("0"):
        spoken += f"{_time_num2str(second)}秒"
    return spoken


def replace_time(match) -> str:
    """Verbalize a match of ``RE_TIME`` or ``RE_TIME_RANGE``.

    Args:
        match (re.Match): A match whose groups follow the layout of
            ``RE_TIME`` (4 groups) or ``RE_TIME_RANGE`` (9 groups).

    Returns:
        str: The spoken form; for a range the two times are joined by "至".
    """
    # RE_TIME defines 4 groups and RE_TIME_RANGE defines 9, so the group count
    # of the pattern tells which one produced this match.
    is_range = len(match.groups()) > 5

    result = _verbalize_clock(match.group(1), match.group(2), match.group(4))

    if is_range:
        # The end time is verbalized from its OWN fields (groups 6, 7, 9).
        # Previously the "半" check looked at the start minute instead of the
        # end minute, so e.g. "8:30-12:45" ended in "半".
        result += "至"
        result += _verbalize_clock(match.group(6), match.group(7), match.group(9))

    return result
|
89 |
+
|
90 |
+
|
91 |
+
# Date written with 年 / 月 / 日(号); month and day parts are optional.
# Groups used by replace_date: (1) 4- or 2-digit year, (3) month number,
# (5) day number, (9) the day suffix character (日 or 号).
RE_DATE = re.compile(
    r"(\d{4}|\d{2})年"  # year
    r"((0?[1-9]|1[0-2])月)?"  # optional month
    r"(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?"  # optional day + suffix
)
|
96 |
+
|
97 |
+
|
98 |
+
def replace_date(match) -> str:
    """Verbalize a match of ``RE_DATE``.

    The year (group 1) goes through ``verbalize_digit``; month (group 3) and
    day (group 5) go through ``verbalize_cardinal``. The day keeps whichever
    suffix character was matched as group 9.

    Args:
        match (re.Match): A match produced by ``RE_DATE``.

    Returns:
        str: The concatenated spoken year/month/day parts that were present.
    """
    year, month, day = match.group(1), match.group(3), match.group(5)
    pieces = []
    if year:
        pieces.append(f"{verbalize_digit(year)}年")
    if month:
        pieces.append(f"{verbalize_cardinal(month)}月")
    if day:
        pieces.append(f"{verbalize_cardinal(day)}{match.group(9)}")
    return "".join(pieces)
|
116 |
+
|
117 |
+
|
118 |
+
# 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期
|
119 |
+
# Numeric date with a 4-digit year and zero-padded month/day.
# Groups: (1) year, (2) separator - one of "-", " ", "/", "." - which must be
# repeated between month and day via the \2 backreference, (3) month 01-12,
# (4) day 01-31.
RE_DATE2 = re.compile(r"(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])")
|
120 |
+
|
121 |
+
|
122 |
+
def replace_date2(match) -> str:
    """Verbalize a match of ``RE_DATE2``.

    The year (group 1) goes through ``verbalize_digit``; month (group 3) and
    day (group 4) go through ``verbalize_cardinal``. The suffixes 年 / 月 / 日
    are appended to the respective parts.

    Args:
        match (re.Match): A match produced by ``RE_DATE2``.

    Returns:
        str: The concatenated spoken year/month/day parts.
    """
    year, month, day = match.group(1), match.group(3), match.group(4)
    pieces = []
    if year:
        pieces.append(f"{verbalize_digit(year)}年")
    if month:
        pieces.append(f"{verbalize_cardinal(month)}月")
    if day:
        pieces.append(f"{verbalize_cardinal(day)}日")
    return "".join(pieces)
|
text/zh_normalization/constants.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import re
|
15 |
+
import string
|
16 |
+
|
17 |
+
from pypinyin.constants import SUPPORT_UCS4
|
18 |
+
|
19 |
+
# Full-width <-> half-width conversion tables.
# Each full-width form used here is the half-width code point plus 65248.

# ASCII letters, full-width -> half-width (52 entries), keyed by code point.
F2H_ASCII_LETTERS = {half + 65248: half for half in map(ord, string.ascii_letters)}

# ASCII letters, half-width -> full-width.
H2F_ASCII_LETTERS = {half: full for full, half in F2H_ASCII_LETTERS.items()}

# Digits, full-width -> half-width (10 entries), keyed by code point.
F2H_DIGITS = {half + 65248: half for half in map(ord, string.digits)}
# Digits, half-width -> full-width.
H2F_DIGITS = {half: full for full, half in F2H_DIGITS.items()}

# Punctuation, full-width -> half-width (32 entries), keyed by code point.
F2H_PUNCTUATIONS = {half + 65248: half for half in map(ord, string.punctuation)}
# Punctuation, half-width -> full-width.
H2F_PUNCTUATIONS = {half: full for full, half in F2H_PUNCTUATIONS.items()}

# Space (1 entry).
# NOTE(review): unlike the tables above, these two are keyed by str rather
# than by code point. If they are passed to str.translate (which looks up
# integer ordinals) they would have no effect - confirm against callers.
F2H_SPACE = {"\u3000": " "}
H2F_SPACE = {" ": "\u3000"}
|
39 |
+
|
40 |
+
# Matches runs of characters that are NOT in the listed CJK ranges; per the
# original note this can be used to extract NSW (non-standard-word) spans.
# The supplementary-plane ranges are only included when pypinyin reports
# SUPPORT_UCS4. The bounds are kept exactly as they were; the original
# comments labelled two of them with nominal block starts (2A700, 2F800)
# that differ from the bounds actually used (2A703, 2F80A).
RE_NSW = re.compile(
    "".join(
        [
            r"(?:[^",
            r"\u3007",  # the character 〇
            r"\u3400-\u4dbf",  # CJK Extension A: [3400-4DBF]
            r"\u4e00-\u9fff",  # CJK basic: [4E00-9FFF]
            r"\uf900-\ufaff",  # CJK compatibility: [F900-FAFF]
        ]
        + (
            [
                r"\U00020000-\U0002A6DF",  # CJK Extension B
                r"\U0002A703-\U0002B73F",  # CJK Extension C
                r"\U0002B740-\U0002B81D",  # CJK Extension D
                r"\U0002F80A-\U0002FA1F",  # CJK compatibility supplement
            ]
            if SUPPORT_UCS4
            else []
        )
        + [r"])+"]
    )
)
|
text/zh_normalization/num.py
ADDED
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
"""
|
15 |
+
Rules to verbalize numbers into Chinese characters.
|
16 |
+
https://zh.wikipedia.org/wiki/中文数字#現代中文
|
17 |
+
"""
|
18 |
+
|
19 |
+
import re
|
20 |
+
from collections import OrderedDict
|
21 |
+
from typing import List
|
22 |
+
|
23 |
+
DIGITS = {str(i): tran for i, tran in enumerate("零一二三四五六七八九")}
|
24 |
+
UNITS = OrderedDict(
|
25 |
+
{
|
26 |
+
1: "十",
|
27 |
+
2: "百",
|
28 |
+
3: "千",
|
29 |
+
4: "万",
|
30 |
+
8: "亿",
|
31 |
+
}
|
32 |
+
)
|
33 |
+
|
34 |
+
COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)"
|
35 |
+
|
36 |
+
# 分数表达式
|
37 |
+
RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)")
|
38 |
+
|
39 |
+
|
40 |
+
def replace_frac(match) -> str:
    """Verbalize a fraction match in Chinese reading order.

    Args:
        match (re.Match): a match whose group 1 is the optional minus sign,
            group 2 the numerator digits and group 3 the denominator digits
            (the layout produced by RE_FRAC).
    Returns:
        str: "<denominator>分之<numerator>", prefixed with "负" when a sign
            was captured.
    """
    minus, upper_digits, lower_digits = match.group(1, 2, 3)
    prefix = "负" if minus else ""
    upper = num2str(upper_digits)
    lower = num2str(lower_digits)
    # Chinese reads the denominator first, e.g. 3/4 -> "四分之三".
    return f"{prefix}{lower}分之{upper}"
|
55 |
+
|
56 |
+
|
57 |
+
# 百分数表达式
|
58 |
+
RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%")
|
59 |
+
|
60 |
+
|
61 |
+
def replace_percentage(match) -> str:
    """Verbalize a percentage match as "百分之<number>".

    Args:
        match (re.Match): a match whose group 1 is the optional minus sign
            and group 2 the numeric part without the trailing "%"
            (the layout produced by RE_PERCENTAGE).
    Returns:
        str: the spoken percentage, prefixed with "负" when a sign was
            captured.
    """
    minus, amount = match.group(1, 2)
    prefix = "负" if minus else ""
    return f"{prefix}百分之{num2str(amount)}"
|
74 |
+
|
75 |
+
|
76 |
+
# 整数表达式
|
77 |
+
# 带负号的整数 -10
|
78 |
+
RE_INTEGER = re.compile(r"(-)" r"(\d+)")
|
79 |
+
|
80 |
+
|
81 |
+
def replace_negative_num(match) -> str:
    """Verbalize a signed integer match.

    Args:
        match (re.Match): a match whose group 1 is the sign and group 2 the
            digits (the layout produced by RE_INTEGER).
    Returns:
        str: the spoken number, prefixed with "负" when group 1 is non-empty.
    """
    minus, digits = match.group(1, 2)
    prefix = "负" if minus else ""
    return f"{prefix}{num2str(digits)}"
|
94 |
+
|
95 |
+
|
96 |
+
# 编号-无符号整形
|
97 |
+
# 00078
|
98 |
+
RE_DEFAULT_NUM = re.compile(r"\d{3}\d*")
|
99 |
+
|
100 |
+
|
101 |
+
def replace_default_num(match):
    """Read an identifier-like digit run one digit at a time.

    Args:
        match (re.Match): a match whose whole text (group 0) is a run of
            digits.
    Returns:
        str: the digits spelled individually via verbalize_digit, with
            "一" replaced by "幺" (alt_one=True).
    """
    digit_run = match.group(0)
    return verbalize_digit(digit_run, alt_one=True)
|
110 |
+
|
111 |
+
|
112 |
+
# 加减乘除
|
113 |
+
# RE_ASMD = re.compile(
|
114 |
+
# r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
|
115 |
+
RE_ASMD = re.compile(
|
116 |
+
r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))"
|
117 |
+
)
|
118 |
+
|
119 |
+
# Spoken form of each arithmetic / equality operator accepted by RE_ASMD.
asmd_map = {
    "+": "加",
    "-": "减",
    "×": "乘",
    "÷": "除",
    "=": "等于",
}


def replace_asmd(match) -> str:
    """Replace the operator of a binary arithmetic expression with its word.

    Args:
        match (re.Match): a match whose group 1 is the left operand,
            group 8 the operator character and group 9 the right operand
            (the layout produced by RE_ASMD).
    Returns:
        str: left operand, spoken operator and right operand concatenated;
            the operands themselves are left untouched.
    """
    left, operator, right = match.group(1, 8, 9)
    return "".join((left, asmd_map[operator], right))
|
131 |
+
|
132 |
+
|
133 |
+
# 次方专项
|
134 |
+
RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+")
|
135 |
+
|
136 |
+
# Superscript character -> plain character used when reading an exponent.
power_map = dict(zip("⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ", "0123456789xyn"))


def replace_power(match) -> str:
    """Turn a run of superscript characters into "的<exponent>次方".

    Args:
        match (re.Match): a match whose whole text (group 0) consists only
            of characters that are keys of power_map.
    Returns:
        str: the exponent rewritten with plain characters and wrapped as
            "的...次方".
    """
    exponent = "".join(power_map[superscript] for superscript in match.group(0))
    return f"的{exponent}次方"
|
165 |
+
|
166 |
+
|
167 |
+
# 数字表达式
|
168 |
+
# 纯小数
|
169 |
+
RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))" r"|(\.(\d+))")
|
170 |
+
# 正整数 + 量词
|
171 |
+
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
|
172 |
+
RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)" r"|(\.(\d+))")
|
173 |
+
|
174 |
+
|
175 |
+
def replace_positive_quantifier(match) -> str:
    """Verbalize "<digits>[modifier]<quantifier>" expressions.

    Args:
        match (re.Match): a match whose group 1 is the digits, group 2 an
            optional modifier (one of 多/余/几/+) and group 3 the quantifier
            (the layout produced by RE_POSITIVE_QUANTIFIERS).
    Returns:
        str: spoken number, modifier and quantifier concatenated.
    """
    digits, modifier, quantifier = match.group(1, 2, 3)
    if modifier == "+":
        # A trailing "+" is read the same way as "多".
        modifier = "多"
    elif not modifier:
        modifier = ""
    spoken = num2str(digits)
    if spoken == "二":
        # A lone "二" in front of a quantifier is read "两".
        spoken = "两"
    return f"{spoken}{modifier}{quantifier}"
|
192 |
+
|
193 |
+
|
194 |
+
def replace_number(match) -> str:
    """Verbalize a general number match (integer, decimal or bare ".5").

    Args:
        match (re.Match): a match whose group 1 is the optional minus sign,
            group 2 the number with an integer part, and group 5 the
            alternative form that starts directly with the decimal point
            (the layout produced by RE_NUMBER / RE_DECIMAL_NUM).
    Returns:
        str: the spoken number; "负" is only prepended for the form that
            has an integer part.
    """
    leading_point_form = match.group(5)
    if leading_point_form:
        return num2str(leading_point_form)
    prefix = "负" if match.group(1) else ""
    return f"{prefix}{num2str(match.group(2))}"
|
211 |
+
|
212 |
+
|
213 |
+
# 范围表达式
|
214 |
+
# match.group(1) and match.group(6) each follow the signed-number alternative of RE_NUMBER
|
215 |
+
|
216 |
+
RE_RANGE = re.compile(
|
217 |
+
r"""
|
218 |
+
(?<![\d\+\-\×÷=]) # 使用反向前瞻以确保数字范围之前没有其他数字和操作符
|
219 |
+
((-?)((\d+)(\.\d+)?)) # 匹配范围起始的负数或正数(整数或小数)
|
220 |
+
[-~] # 匹配范围分隔符
|
221 |
+
((-?)((\d+)(\.\d+)?)) # 匹配范围结束的负数或正数(整数或小数)
|
222 |
+
(?![\d\+\-\×÷=]) # 使用正向前瞻以确保数字范围之后没有其他数字和操作符
|
223 |
+
""",
|
224 |
+
re.VERBOSE,
|
225 |
+
)
|
226 |
+
|
227 |
+
|
228 |
+
def replace_range(match) -> str:
    """Verbalize a numeric range as "<start>到<end>".

    Args:
        match (re.Match): a match whose group 1 is the start number and
            group 6 the end number (the layout produced by RE_RANGE).
    Returns:
        str: both endpoints verbalized through RE_NUMBER / replace_number
            and joined with "到".
    """
    start, end = (
        RE_NUMBER.sub(replace_number, endpoint) for endpoint in match.group(1, 6)
    )
    return f"{start}到{end}"
|
240 |
+
|
241 |
+
|
242 |
+
# ~至表达式
|
243 |
+
RE_TO_RANGE = re.compile(
|
244 |
+
r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)"
|
245 |
+
)
|
246 |
+
|
247 |
+
|
248 |
+
def replace_to_range(match) -> str:
    """Replace every "~" in the matched text with "至".

    Args:
        match (re.Match): any match; only its whole text (group 0) is used.
    Returns:
        str: the matched text with "~" swapped for "至"; numbers and units
            are left unchanged.
    """
    pieces = match.group(0).split("~")
    return "至".join(pieces)
|
257 |
+
|
258 |
+
|
259 |
+
def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
    """Recursively split a digit string into Chinese digit and unit symbols.

    Args:
        value_string (str): digits to convert; may carry leading zeros.
        use_zero (bool): when True, a single significant digit that was
            preceded by zeros is emitted as "零" followed by the digit.
            Recursive calls always use the default.
    Returns:
        List[str]: symbols in reading order; empty when the string holds no
            non-zero digit.
    """
    significant = value_string.lstrip("0")
    if not significant:
        return []

    if len(significant) == 1:
        had_leading_zero = len(significant) < len(value_string)
        if use_zero and had_leading_zero:
            return [DIGITS["0"], DIGITS[significant]]
        return [DIGITS[significant]]

    # Largest unit exponent that is still smaller than the number of
    # significant digits.
    split_at = next(power for power in reversed(UNITS) if power < len(significant))
    # Slice the unstripped string so the recursive calls still see the
    # leading zeros and can emit "零" for them.
    head = value_string[:-split_at]
    tail = value_string[-split_at:]
    return [*_get_value(head), UNITS[split_at], *_get_value(tail)]
|
273 |
+
|
274 |
+
|
275 |
+
def verbalize_cardinal(value_string: str) -> str:
    """Read a digit string as a cardinal number.

    Args:
        value_string (str): digits of a non-negative integer.
    Returns:
        str: "" for empty input, "零" when every digit is zero, otherwise
            the number read with units; a leading "一十" is shortened to "十".
    """
    if not value_string:
        return ""

    # "000" and "0" both read as "零".
    digits = value_string.lstrip("0")
    if not digits:
        return DIGITS["0"]

    symbols = _get_value(digits)
    # A verbalized number starting with "一十" is abbreviated to "十".
    if symbols[:2] == [DIGITS["1"], UNITS[1]]:
        symbols = symbols[1:]
    return "".join(symbols)
|
289 |
+
|
290 |
+
|
291 |
+
def verbalize_digit(value_string: str, alt_one=False) -> str:
    """Spell a digit string one digit at a time.

    Args:
        value_string (str): characters that are all keys of DIGITS.
        alt_one (bool): when True, every "一" in the result is replaced
            with "幺".
    Returns:
        str: the concatenated spoken digits.
    """
    spoken = "".join(DIGITS[ch] for ch in value_string)
    return spoken.replace("一", "幺") if alt_one else spoken
|
297 |
+
|
298 |
+
|
299 |
+
def num2str(value_string: str) -> str:
    """Verbalize an unsigned integer or decimal string.

    Args:
        value_string (str): digits with at most one "."; the integer part
            may be empty (e.g. ".22").
    Returns:
        str: the integer part read as a cardinal, followed by "点" and the
            fractional digits read one by one when a non-zero fraction
            remains after dropping trailing zeros.
    Raises:
        ValueError: if value_string contains more than one ".".
    """
    parts = value_string.split(".")
    if len(parts) > 2:
        raise ValueError(f"The value string: '{value_string}' has more than one point in it.")
    integer = parts[0]
    decimal = parts[1] if len(parts) == 2 else ""

    result = verbalize_cardinal(integer)

    # Trailing zeros of the fraction are not read out.
    decimal = decimal.rstrip("0")
    if decimal:
        # '.22' is verbalized as '零点二二'
        # '3.20' is verbalized as '三点二'
        result = result if result else "零"
        result += "点" + verbalize_digit(decimal)
    return result
|
text/zh_normalization/phonecode.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
import re
|
15 |
+
|
16 |
+
from .num import verbalize_digit
|
17 |
+
|
18 |
+
# 规范化固话/手机号码
|
19 |
+
# 手机
|
20 |
+
# http://www.jihaoba.com/news/show/13680
|
21 |
+
# 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
|
22 |
+
# 联通:130、131、132、156、155、186、185、176
|
23 |
+
# 电信:133、153、189、180、181、177
|
24 |
+
RE_MOBILE_PHONE = re.compile(r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
|
25 |
+
RE_TELEPHONE = re.compile(r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
|
26 |
+
|
27 |
+
# 全国统一的号码400开头
|
28 |
+
RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
|
29 |
+
|
30 |
+
|
31 |
+
def phone2str(phone_string: str, mobile=True) -> str:
    """Read a phone number digit by digit.

    Args:
        phone_string (str): the matched phone number text.
        mobile (bool): when True, a leading/trailing "+" is stripped and
            the text is split on whitespace; otherwise it is split on "-".
    Returns:
        str: each segment spelled via verbalize_digit (with "一" read as
            "幺"), segments joined by "，".
    """
    if mobile:
        segments = phone_string.strip("+").split()
    else:
        segments = phone_string.split("-")
    return "，".join(verbalize_digit(segment, alt_one=True) for segment in segments)
|
40 |
+
|
41 |
+
|
42 |
+
def replace_phone(match) -> str:
    """Verbalize a matched phone number using the non-mobile reading.

    Args:
        match (re.Match): any match; only its whole text (group 0) is used.
    Returns:
        str: the result of phone2str with mobile=False, i.e. the text is
            split on "-".
    """
    number_text = match.group(0)
    return phone2str(number_text, mobile=False)
|
50 |
+
|
51 |
+
|
52 |
+
def replace_mobile(match) -> str:
    """Verbalize a matched mobile phone number.

    Args:
        match (re.Match): any match; only its whole text (group 0) is used.
    Returns:
        str: the result of phone2str in its mobile mode.
    """
    number_text = match.group(0)
    return phone2str(number_text, mobile=True)
|