Spaces:

LukeJacob2023
/

chinese-punc

Sleeping

App Files Files Community

LukeJacob2023 committed on Sep 1, 2024

Commit

eac1a45

verified ·

1 Parent(s): de9456f

Upload 14 files

Browse files

Files changed (14) hide show

app.py +45 -0
cttPunctuator.py +63 -0
cttpunctuator/__init__.py +5 -0
cttpunctuator/__pycache__/__init__.cpython-310.pyc +0 -0
cttpunctuator/src/__pycache__/punctuator.cpython-310.pyc +0 -0
cttpunctuator/src/onnx/configuration.json +20 -0
cttpunctuator/src/onnx/punc.bin +3 -0
cttpunctuator/src/onnx/punc.onnx +3 -0
cttpunctuator/src/punctuator.py +312 -0
cttpunctuator/src/utils/OrtInferSession.py +103 -0
cttpunctuator/src/utils/__pycache__/OrtInferSession.cpython-310.pyc +0 -0
cttpunctuator/src/utils/__pycache__/text_post_process.cpython-310.pyc +0 -0
cttpunctuator/src/utils/text_post_process.py +85 -0
requirements.txt +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import gradio as gr
+from cttPunctuator import CttPunctuator
+# Shared punctuator, built once when the module is imported. No arguments are
+# passed, so ``online`` keeps its default and the offline model is used.
+punc = CttPunctuator()
+def punctuate(text):
+    """Return ``text`` with punctuation restored by the shared model.
+    Empty, whitespace-only or missing input yields an empty string without
+    calling the model: there is nothing to punctuate, and the model is not
+    guaranteed to accept text that contains no tokens.
+    :param text: raw text taken from the input box
+    :return: the punctuated text
+    """
+    if not text or not text.strip():
+        return ""
+    # The model returns a tuple whose first element is the punctuated text.
+    return punc.punctuate(text)[0]
+def clear_text():
+    """Return one empty string for each text box that has to be reset."""
+    blank = ""
+    return blank, blank
+# Page layout: usage notes on top, input and result boxes side by side,
+# action buttons underneath.
+with gr.Blocks() as iface:
+    gr.Markdown("""
+    # 中英文标点润饰工具
+    这个工具可以帮助你自动为文本添加适当的标点符号。
+    基于项目：https://github.com/lovemefan/CT-Transformer-punctuation
+    使用说明：
+    1. 在左侧的输入框中粘贴或输入你的文本。
+    2. 点击"润饰"按钮。
+    3. 查看右侧输出框中的结果。可以使用输出框右上角复制按钮快速复制结果。
+    4. 如需清空所有内容，点击"清空"按钮。
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            input_text = gr.Textbox(lines=10, label="输入文本")
+        with gr.Column(scale=1):
+            output_text = gr.Textbox(lines=10, label="结果", show_copy_button=True)
+    with gr.Row():
+        punctuate_button = gr.Button("润饰")
+        clear_button = gr.Button("清空")
+    # First button: run punctuate() on the input box and show the result.
+    punctuate_button.click(fn=punctuate, inputs=input_text, outputs=output_text)
+    # Second button: clear_text() returns one value per listed output box.
+    clear_button.click(fn=clear_text, inputs=None, outputs=[input_text, output_text])
+# Launch the Gradio app.
+iface.launch()

cttPunctuator.py ADDED Viewed

	@@ -0,0 +1,63 @@

+# -*- coding:utf-8 -*-
+# @FileName  :cttPunctuator.py
+# @Time      :2023/4/13 15:03
+# @Author    :lovemefan
+# @Email     :[email protected]
+__author__ = "lovemefan"
+__copyright__ = "Copyright (C) 2023 lovemefan"
+__license__ = "MIT"
+__version__ = "v0.0.1"
+import logging
+import threading
+from cttpunctuator.src.punctuator import CT_Transformer, CT_Transformer_VadRealtime
+# Configure the root logger when this module is imported.
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(asctime)s %(levelname)s] [%(filename)s:%(lineno)d %(module)s.%(funcName)s] %(message)s",
+)
+# Re-entrant lock held while a shared model instance is being created.
+lock = threading.RLock()
+class CttPunctuator:
+    """Entry point that hands text to a CT-Transformer punctuation model.
+    One model per mode (offline / online) is kept in a class attribute and
+    shared by every instance, so each model is created at most once.
+    """
+    # Shared models, created by the first instance that asks for each mode.
+    _offline_model = None
+    _online_model = None
+    def __init__(self, online: bool = False):
+        """
+        punctuator with singleton pattern
+        :param online: True selects ``CT_Transformer_VadRealtime`` (called with
+            a cache dict), False selects ``CT_Transformer`` (called with text only)
+        """
+        self.online = online
+        if online:
+            if CttPunctuator._online_model is None:
+                with lock:
+                    # Checked again under the lock: another thread may have
+                    # created the model while this one was waiting.
+                    if CttPunctuator._online_model is None:
+                        logging.info("Initializing punctuator model with online mode.")
+                        CttPunctuator._online_model = CT_Transformer_VadRealtime()
+                        logging.info("Online model initialized.")
+            self.model = CttPunctuator._online_model
+            # Assigned on every online instance, not only on the one that
+            # created the model; punctuate() reads it as the default cache.
+            self.param_dict = {"cache": []}
+        else:
+            if CttPunctuator._offline_model is None:
+                with lock:
+                    if CttPunctuator._offline_model is None:
+                        logging.info("Initializing punctuator model with offline mode.")
+                        CttPunctuator._offline_model = CT_Transformer()
+                        logging.info("Offline model initialized.")
+            self.model = CttPunctuator._offline_model
+        logging.info("Model initialized.")
+    def punctuate(self, text: str, param_dict=None):
+        """Run the bound model on ``text`` and return its result unchanged.
+        :param text: text to punctuate
+        :param param_dict: online mode only; dict with a ``"cache"`` entry that
+            the model reads and updates. When omitted (or empty) the cache
+            owned by this instance is used.
+        :return: whatever the model returns; the offline model returns a
+            2-tuple and the online model a 3-tuple, both starting with the
+            punctuated text
+        """
+        if self.online:
+            # Pass the dict that was selected, so a caller-supplied cache is
+            # the one the model actually reads and updates.
+            param_dict = param_dict or self.param_dict
+            return self.model(text, param_dict)
+        return self.model(text)

cttpunctuator/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# -*- coding:utf-8 -*-
+# @FileName  :__init__.py
+# @Time      :2023/4/13 14:58
+# @Author    :lovemefan
+# @Email     :[email protected]

cttpunctuator/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (146 Bytes). View file

cttpunctuator/src/__pycache__/punctuator.cpython-310.pyc ADDED Viewed

Binary file (8.14 kB). View file

cttpunctuator/src/onnx/configuration.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "framework": "onnx",
+  "task" : "punctuation",
+  "model" : {
+    "type" : "generic-punc",
+    "punc_model_name" : "punc.pb",
+    "punc_model_config" : {
+      "type": "pytorch",
+      "code_base": "funasr",
+      "mode": "punc",
+      "lang": "zh-cn",
+      "batch_size": 1,
+      "punc_config": "punc.yaml",
+      "model": "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+    }
+  },
+  "pipeline": {
+    "type":"punc-inference"
+  }
+}

cttpunctuator/src/onnx/punc.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85a6f2ec7cfa74c1ec932223425a35ce801bb0171571330a94d5d78f9ba2e245
+size 2848807

cttpunctuator/src/onnx/punc.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ed5318d91ff9520a03a5b5a8dba264b76858931db7d914b0de6ec9e4ad35970e
+size 292001778

cttpunctuator/src/punctuator.py ADDED Viewed

	@@ -0,0 +1,312 @@

+import logging
+import os.path
+import pickle
+from pathlib import Path
+from typing import Tuple, Union
+import numpy as np
+from cttpunctuator.src.utils.OrtInferSession import ONNXRuntimeError, OrtInferSession
+from cttpunctuator.src.utils.text_post_process import (
+    TokenIDConverter,
+    code_mix_split_words,
+    split_to_mini_sentence,
+)
+class CT_Transformer:
+    """
+    Author: Speech Lab, Alibaba Group, China
+    CT-Transformer: Controllable time-delay transformer
+    for real-time punctuation prediction and disfluency detection
+    https://arxiv.org/pdf/2003.01309.pdf
+    This class punctuates a complete text: the tokens are fed to the ONNX
+    model in chunks of ``split_size`` and the results are stitched together.
+    """
+    def __init__(
+        self,
+        model_dir: Union[str, Path] = None,
+        batch_size: int = 1,
+        device_id: Union[str, int] = "-1",
+        quantize: bool = False,
+        intra_op_num_threads: int = 4,
+    ):
+        """Load the vocabulary, the punctuation table and the ONNX session.
+        :param model_dir: directory holding ``punc.bin`` and the ONNX model;
+            defaults to the ``onnx`` directory next to this file
+        :param batch_size: accepted but not used; ``self.batch_size`` is always 1
+        :param device_id: forwarded to ``OrtInferSession``
+        :param quantize: load ``model_quant.onnx`` instead of ``punc.onnx``
+        :param intra_op_num_threads: forwarded to ``OrtInferSession``
+        :raises FileNotFoundError: when ``model_dir`` does not exist
+        """
+        model_dir = model_dir or os.path.join(os.path.dirname(__file__), "onnx")
+        if not Path(model_dir).exists():
+            raise FileNotFoundError(f"{model_dir} does not exist.")
+        model_name = "model_quant.onnx" if quantize else "punc.onnx"
+        model_file = os.path.join(model_dir, model_name)
+        config_file = os.path.join(model_dir, "punc.bin")
+        # NOTE(review): pickle.load can execute code embedded in the file;
+        # only point model_dir at model directories you trust.
+        with open(config_file, "rb") as file:
+            config = pickle.load(file)
+        self.converter = TokenIDConverter(config["token_list"])
+        self.ort_infer = OrtInferSession(
+            model_file, device_id, intra_op_num_threads=intra_op_num_threads
+        )
+        self.batch_size = 1
+        self.punc_list = config["punc_list"]
+        # Index of the full stop in punc_list; stays 0 if the table has none.
+        self.period = 0
+        # ASCII comma / question mark are replaced by their full-width forms.
+        for i, punc in enumerate(self.punc_list):
+            if punc == ",":
+                self.punc_list[i] = "，"
+            elif punc == "?":
+                self.punc_list[i] = "？"
+            elif punc == "。":
+                self.period = i
+    def __call__(self, text: Union[list, str], split_size=20):
+        """Punctuate a complete text.
+        :param text: text to punctuate
+        :param split_size: number of tokens sent to the model per chunk
+        :return: ``(punctuated_text, punctuation_ids)``; ``("", [])`` when the
+            text contains no tokens
+        :raises ONNXRuntimeError: when inference fails for a chunk
+        """
+        split_text = code_mix_split_words(text)
+        if not split_text:
+            # Nothing to punctuate. The closing step below inspects the last
+            # character of the result and cannot work on an empty string.
+            return "", []
+        split_text_id = self.converter.tokens2ids(split_text)
+        mini_sentences = split_to_mini_sentence(split_text, split_size)
+        mini_sentences_id = split_to_mini_sentence(split_text_id, split_size)
+        assert len(mini_sentences) == len(mini_sentences_id)
+        cache_sent = []
+        cache_sent_id = []
+        new_mini_sentence = ""
+        new_mini_sentence_punc = []
+        cache_pop_trigger_limit = 200
+        last_index = len(mini_sentences) - 1
+        for mini_sentence_i, (chunk, chunk_id) in enumerate(
+            zip(mini_sentences, mini_sentences_id)
+        ):
+            # Tokens held back from the previous chunk are sent again together
+            # with the new ones.
+            mini_sentence = cache_sent + chunk
+            mini_sentence_id = np.array(cache_sent_id + chunk_id, dtype="int64")
+            text_lengths = np.array([len(mini_sentence)], dtype="int32")
+            try:
+                outputs = self.infer(mini_sentence_id[None, :], text_lengths)
+            except ONNXRuntimeError as e:
+                # Without predictions for this chunk the rest of the loop has
+                # nothing valid to work on, so log and propagate.
+                logging.exception(e)
+                raise
+            punctuations = np.argmax(outputs[0], axis=-1)[0]
+            assert punctuations.size == len(mini_sentence)
+            # Search for the last Period/QuestionMark as cache
+            if mini_sentence_i < last_index:
+                sentenceEnd = -1
+                last_comma_index = -1
+                for i in range(len(punctuations) - 2, 1, -1):
+                    mark = self.punc_list[punctuations[i]]
+                    if mark == "。" or mark == "？":
+                        sentenceEnd = i
+                        break
+                    if last_comma_index < 0 and mark == "，":
+                        last_comma_index = i
+                if (
+                    sentenceEnd < 0
+                    and len(mini_sentence) > cache_pop_trigger_limit
+                    and last_comma_index >= 0
+                ):
+                    # The sentence is too long, cut off at a comma.
+                    sentenceEnd = last_comma_index
+                    punctuations[sentenceEnd] = self.period
+                # Everything after the sentence end waits for the next chunk.
+                cache_sent = mini_sentence[sentenceEnd + 1 :]
+                cache_sent_id = mini_sentence_id[sentenceEnd + 1 :].tolist()
+                mini_sentence = mini_sentence[0 : sentenceEnd + 1]
+                punctuations = punctuations[0 : sentenceEnd + 1]
+            new_mini_sentence_punc += [int(x) for x in punctuations]
+            words_with_punc = []
+            for i in range(len(mini_sentence)):
+                if i > 0:
+                    # Two neighbouring tokens that both start with a
+                    # single-byte character are separated by a space.
+                    if (
+                        len(mini_sentence[i][0].encode()) == 1
+                        and len(mini_sentence[i - 1][0].encode()) == 1
+                    ):
+                        mini_sentence[i] = " " + mini_sentence[i]
+                words_with_punc.append(mini_sentence[i])
+                if self.punc_list[punctuations[i]] != "_":
+                    words_with_punc.append(self.punc_list[punctuations[i]])
+            new_mini_sentence += "".join(words_with_punc)
+            # Add Period for the end of the sentence
+            new_mini_sentence_out = new_mini_sentence
+            new_mini_sentence_punc_out = new_mini_sentence_punc
+            if mini_sentence_i == last_index:
+                if new_mini_sentence[-1] == "，" or new_mini_sentence[-1] == "、":
+                    new_mini_sentence_out = new_mini_sentence[:-1] + "。"
+                    new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [
+                        self.period
+                    ]
+                elif new_mini_sentence[-1] != "。" and new_mini_sentence[-1] != "？":
+                    new_mini_sentence_out = new_mini_sentence + "。"
+                    new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [
+                        self.period
+                    ]
+        return new_mini_sentence_out, new_mini_sentence_punc_out
+    def infer(
+        self, feats: np.ndarray, feats_len: np.ndarray
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """Run the ONNX session on one chunk.
+        :param feats: token ids with a leading batch axis
+        :param feats_len: one-element array holding the number of tokens
+        :return: the session outputs; the first one holds the punctuation scores
+        """
+        outputs = self.ort_infer([feats, feats_len])
+        return outputs
+class CT_Transformer_VadRealtime(CT_Transformer):
+    """
+    Author: Speech Lab, Alibaba Group, China
+    CT-Transformer: Controllable time-delay transformer for
+    real-time punctuation prediction and disfluency detection
+    https://arxiv.org/pdf/2003.01309.pdf
+    This class is called repeatedly with pieces of text; the tokens after the
+    last sentence end are carried to the next call through ``param_dict["cache"]``.
+    """
+    def __init__(
+        self,
+        model_dir: Union[str, Path] = None,
+        batch_size: int = 1,
+        device_id: Union[str, int] = "-1",
+        quantize: bool = False,
+        intra_op_num_threads: int = 4,
+    ):
+        """Forward every argument unchanged to ``CT_Transformer.__init__``."""
+        super().__init__(
+            model_dir, batch_size, device_id, quantize, intra_op_num_threads
+        )
+    def __call__(self, text: str, param_dict: dict, split_size=20):
+        """Punctuate the next piece of text.
+        :param text: newly arrived text
+        :param param_dict: must contain the key ``"cache"`` (list of tokens
+            kept from the previous call, or ``None`` / empty); the entry is
+            replaced with the new cache before returning
+        :param split_size: number of tokens sent to the model per chunk
+        :return: ``(sentence_out, sentence_punc_list_out, cache_out)``;
+            ``("", [], [])`` when cache and text together contain no tokens
+        :raises ONNXRuntimeError: when inference fails for a chunk
+        """
+        cache_key = "cache"
+        assert cache_key in param_dict
+        # None and an empty cache are both treated as "nothing carried over".
+        cache = param_dict[cache_key] or []
+        precache = "".join(cache)
+        full_text = precache + text
+        split_text = code_mix_split_words(full_text)
+        if not split_text:
+            # Nothing to punctuate and nothing to carry over.
+            param_dict[cache_key] = []
+            return "", [], []
+        split_text_id = self.converter.tokens2ids(split_text)
+        mini_sentences = split_to_mini_sentence(split_text, split_size)
+        mini_sentences_id = split_to_mini_sentence(split_text_id, split_size)
+        new_mini_sentence_punc = []
+        assert len(mini_sentences) == len(mini_sentences_id)
+        cache_sent = []
+        cache_sent_id = np.array([], dtype="int32")
+        sentence_punc_list = []
+        sentence_words_list = []
+        cache_pop_trigger_limit = 200
+        skip_num = 0
+        for mini_sentence_i in range(len(mini_sentences)):
+            mini_sentence = mini_sentences[mini_sentence_i]
+            mini_sentence_id = mini_sentences_id[mini_sentence_i]
+            # Tokens held back from the previous chunk are sent again together
+            # with the new ones.
+            mini_sentence = cache_sent + mini_sentence
+            mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0)
+            text_length = len(mini_sentence_id)
+            data = {
+                "input": np.array(mini_sentence_id[None, :], dtype="int64"),
+                "text_lengths": np.array([text_length], dtype="int32"),
+                "vad_mask": self.vad_mask(text_length, len(cache))[
+                    None, None, :, :
+                ].astype(np.float32),
+                "sub_masks": np.tril(
+                    np.ones((text_length, text_length), dtype=np.float32)
+                )[None, None, :, :].astype(np.float32),
+            }
+            try:
+                outputs = self.infer(
+                    data["input"],
+                    data["text_lengths"],
+                    data["vad_mask"],
+                    data["sub_masks"],
+                )
+            except ONNXRuntimeError as e:
+                # Without predictions for this chunk the rest of the loop has
+                # nothing valid to work on, so log and propagate.
+                logging.exception(e)
+                raise
+            y = outputs[0]
+            punctuations = np.argmax(y, axis=-1)[0]
+            assert punctuations.size == len(mini_sentence)
+            # Search for the last Period/QuestionMark as cache
+            if mini_sentence_i < len(mini_sentences) - 1:
+                sentenceEnd = -1
+                last_comma_index = -1
+                for i in range(len(punctuations) - 2, 1, -1):
+                    mark = self.punc_list[punctuations[i]]
+                    if mark == "。" or mark == "？":
+                        sentenceEnd = i
+                        break
+                    if last_comma_index < 0 and mark == "，":
+                        last_comma_index = i
+                if (
+                    sentenceEnd < 0
+                    and len(mini_sentence) > cache_pop_trigger_limit
+                    and last_comma_index >= 0
+                ):
+                    # The sentence is too long, cut off at a comma.
+                    sentenceEnd = last_comma_index
+                    punctuations[sentenceEnd] = self.period
+                cache_sent = mini_sentence[sentenceEnd + 1 :]
+                cache_sent_id = mini_sentence_id[sentenceEnd + 1 :]
+                mini_sentence = mini_sentence[0 : sentenceEnd + 1]
+                punctuations = punctuations[0 : sentenceEnd + 1]
+            punctuations_np = [int(x) for x in punctuations]
+            new_mini_sentence_punc += punctuations_np
+            sentence_punc_list += [self.punc_list[int(x)] for x in punctuations_np]
+            sentence_words_list += mini_sentence
+        assert len(sentence_punc_list) == len(sentence_words_list)
+        words_with_punc = []
+        sentence_punc_list_out = []
+        for i in range(0, len(sentence_words_list)):
+            if i > 0:
+                # A token starting with a single-byte character that follows a
+                # token ending with one is separated from it by a space.
+                if (
+                    len(sentence_words_list[i][0].encode()) == 1
+                    and len(sentence_words_list[i - 1][-1].encode()) == 1
+                ):
+                    sentence_words_list[i] = " " + sentence_words_list[i]
+            # The first len(cache) words are counted but not written out.
+            if skip_num < len(cache):
+                skip_num += 1
+            else:
+                words_with_punc.append(sentence_words_list[i])
+            if skip_num >= len(cache):
+                sentence_punc_list_out.append(sentence_punc_list[i])
+                if sentence_punc_list[i] != "_":
+                    words_with_punc.append(sentence_punc_list[i])
+        sentence_out = "".join(words_with_punc)
+        sentenceEnd = -1
+        for i in range(len(sentence_punc_list) - 2, 1, -1):
+            if sentence_punc_list[i] == "。" or sentence_punc_list[i] == "？":
+                sentenceEnd = i
+                break
+        # Words after the last sentence end are kept for the next call.
+        cache_out = sentence_words_list[sentenceEnd + 1 :]
+        # Drop a trailing punctuation mark from the output; the emptiness check
+        # covers calls in which nothing was written out at all.
+        if sentence_out and sentence_out[-1] in self.punc_list:
+            sentence_out = sentence_out[:-1]
+            sentence_punc_list_out[-1] = "_"
+        param_dict[cache_key] = cache_out
+        return sentence_out, sentence_punc_list_out, cache_out
+    def vad_mask(self, size, vad_pos, dtype=np.bool_):
+        """Create the square mask passed to the model as ``vad_mask``.
+        The result is all ones, except that rows ``0 .. vad_pos - 2`` are
+        zeroed from column ``vad_pos`` onwards. When ``vad_pos`` is not inside
+        ``1 .. size - 1`` the all-ones mask is returned.
+        :param int size: side length of the mask
+        :param int vad_pos: column at which the zeroed block starts
+        :param dtype: NumPy dtype of the result
+        :rtype: numpy.ndarray of shape (size, size)
+        """
+        ret = np.ones((size, size), dtype=dtype)
+        if vad_pos <= 0 or vad_pos >= size:
+            return ret
+        sub_corner = np.zeros((vad_pos - 1, size - vad_pos), dtype=dtype)
+        ret[0 : vad_pos - 1, vad_pos:] = sub_corner
+        return ret
+    def infer(
+        self,
+        feats: np.ndarray,
+        feats_len: np.ndarray,
+        vad_mask: np.ndarray,
+        sub_masks: np.ndarray,
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """Run the ONNX session on one chunk.
+        The four arrays are handed to the session in this order.
+        :return: the session outputs; the first one holds the punctuation scores
+        """
+        outputs = self.ort_infer([feats, feats_len, vad_mask, sub_masks])
+        return outputs

cttpunctuator/src/utils/OrtInferSession.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# -*- coding:utf-8 -*-
+# @FileName  :OrtInferSession.py
+# @Time      :2023/4/13 15:13
+# @Author    :lovemefan
+# @Email     :[email protected]
+import logging
+from pathlib import Path
+from typing import List, Union
+import numpy as np
+from onnxruntime import (
+    GraphOptimizationLevel,
+    InferenceSession,
+    SessionOptions,
+    get_available_providers,
+    get_device,
+)
+class ONNXRuntimeError(Exception):
+    """Raised when running the ONNX Runtime session fails."""
+class OrtInferSession:
+    """Wrapper around ``onnxruntime.InferenceSession`` for a single model file."""
+    def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
+        """Create the inference session.
+        :param model_file: path to the ONNX model
+        :param device_id: CUDA device id; ``-1`` (int or str) selects the CPU
+        :param intra_op_num_threads: copied to the session options
+        :raises FileNotFoundError: when ``model_file`` does not exist
+        :raises FileExistsError: when ``model_file`` exists but is not a file
+        """
+        import warnings
+        device_id = str(device_id)
+        sess_opt = SessionOptions()
+        sess_opt.intra_op_num_threads = intra_op_num_threads
+        sess_opt.log_severity_level = 4
+        sess_opt.enable_cpu_mem_arena = False
+        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
+        cuda_ep = "CUDAExecutionProvider"
+        cuda_provider_options = {
+            "device_id": device_id,
+            "arena_extend_strategy": "kNextPowerOfTwo",
+            "cudnn_conv_algo_search": "EXHAUSTIVE",
+            "do_copy_in_default_stream": "true",
+        }
+        cpu_ep = "CPUExecutionProvider"
+        cpu_provider_options = {
+            "arena_extend_strategy": "kSameAsRequested",
+        }
+        # CUDA is listed first only when a device was requested and is
+        # available; the CPU provider is always listed.
+        EP_list = []
+        if (
+            device_id != "-1"
+            and get_device() == "GPU"
+            and cuda_ep in get_available_providers()
+        ):
+            EP_list = [(cuda_ep, cuda_provider_options)]
+        EP_list.append((cpu_ep, cpu_provider_options))
+        self._verify_model(model_file)
+        self.session = InferenceSession(
+            model_file, sess_options=sess_opt, providers=EP_list
+        )
+        if device_id != "-1" and cuda_ep not in self.session.get_providers():
+            warnings.warn(
+                f"{cuda_ep} is not avaiable for current env, "
+                f"the inference part is automatically shifted to be executed under {cpu_ep}.\n"
+                "Please ensure the installed onnxruntime-gpu version matches your cuda and cudnn version, "
+                "you can check their relations from the offical web site: "
+                "https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html",
+                RuntimeWarning,
+            )
+    def __call__(self, input_content: List[np.ndarray]) -> List[np.ndarray]:
+        """Run the model.
+        :param input_content: arrays paired, in order, with the model's input names
+        :return: the result of ``session.run`` for all model outputs
+        :raises ONNXRuntimeError: when the run fails for any reason
+        """
+        input_dict = dict(zip(self.get_input_names(), input_content))
+        try:
+            return self.session.run(self.get_output_names(), input_dict)
+        except Exception as e:
+            raise ONNXRuntimeError("ONNXRuntime inferece failed.") from e
+    def get_input_names(
+        self,
+    ):
+        """Return the names of the model inputs."""
+        return [v.name for v in self.session.get_inputs()]
+    def get_output_names(
+        self,
+    ):
+        """Return the names of the model outputs."""
+        return [v.name for v in self.session.get_outputs()]
+    def get_character_list(self, key: str = "character"):
+        """Return the metadata entry ``key`` split into lines.
+        :raises KeyError: when the model metadata has no such entry
+        """
+        if not hasattr(self, "meta_dict"):
+            # have_key() is what loads the metadata; make this method usable
+            # even when it is called first.
+            self.have_key(key)
+        return self.meta_dict[key].splitlines()
+    def have_key(self, key: str = "character") -> bool:
+        """Reload the model's custom metadata and report whether it has ``key``."""
+        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
+        return key in self.meta_dict
+    @staticmethod
+    def _verify_model(model_path):
+        """Check that ``model_path`` points to an existing file.
+        :raises FileNotFoundError: when the path does not exist
+        :raises FileExistsError: when the path exists but is not a file
+        """
+        model_path = Path(model_path)
+        if not model_path.exists():
+            raise FileNotFoundError(f"{model_path} does not exists.")
+        if not model_path.is_file():
+            raise FileExistsError(f"{model_path} is not a file.")

cttpunctuator/src/utils/__pycache__/OrtInferSession.cpython-310.pyc ADDED Viewed

Binary file (3.82 kB). View file

cttpunctuator/src/utils/__pycache__/text_post_process.cpython-310.pyc ADDED Viewed

Binary file (3.26 kB). View file

cttpunctuator/src/utils/text_post_process.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# -*- coding:utf-8 -*-
+# @FileName  :text_post_process.py
+# @Time      :2023/4/13 15:09
+# @Author    :lovemefan
+# @Email     :[email protected]
+from pathlib import Path
+from typing import Dict, Iterable, List, Union
+import numpy as np
+import yaml
+from typeguard import check_argument_types
+class TokenIDConverterError(Exception):
+    """Raised by ``TokenIDConverter`` when it is given ids it cannot convert."""
+class TokenIDConverter:
+    """Two-way mapping between vocabulary tokens and their integer ids."""
+    def __init__(
+        self,
+        token_list: Union[List, str],
+    ):
+        """Build the lookup table.
+        The last entry of ``token_list`` is used as the unknown-token symbol.
+        :param token_list: vocabulary; the position of a token is its id
+        """
+        check_argument_types()
+        self.token_list = token_list
+        self.unk_symbol = token_list[-1]
+        self.token2id = {token: index for index, token in enumerate(self.token_list)}
+        self.unk_id = self.token2id[self.unk_symbol]
+    def get_num_vocabulary_size(self) -> int:
+        """Return the number of entries in the vocabulary."""
+        return len(self.token_list)
+    def ids2tokens(self, integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
+        """Translate ids into tokens.
+        :raises TokenIDConverterError: when given an ndarray that is not 1-D
+        """
+        if isinstance(integers, np.ndarray):
+            if integers.ndim != 1:
+                raise TokenIDConverterError(
+                    f"Must be 1 dim ndarray, but got {integers.ndim}"
+                )
+        vocabulary = self.token_list
+        return [vocabulary[index] for index in integers]
+    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
+        """Translate tokens into ids; tokens not in the vocabulary get the unknown id."""
+        lookup = self.token2id.get
+        fallback = self.unk_id
+        return [lookup(token, fallback) for token in tokens]
+def split_to_mini_sentence(words: list, word_limit: int = 20):
+    """Cut ``words`` into consecutive pieces of at most ``word_limit`` items.
+    A sequence that already fits is returned as the only element of the
+    result, without being copied.
+    :param words: sequence to cut
+    :param word_limit: maximum length of a piece; must be greater than 1
+    :return: list of pieces in their original order
+    """
+    assert word_limit > 1
+    total = len(words)
+    if total <= word_limit:
+        return [words]
+    return [words[start : start + word_limit] for start in range(0, total, word_limit)]
+def code_mix_split_words(text: str):
+    """Split mixed text into tokens.
+    The text is first split on whitespace. Inside each piece, a run of
+    characters that encode to a single byte forms one token, and every other
+    character is a token of its own.
+    :param text: text to split
+    :return: list of tokens in reading order
+    """
+    words = []
+    for seg in text.split():
+        # Single-byte characters collected since the last multi-byte one.
+        pending = []
+        for ch in seg:
+            if len(ch.encode()) == 1:
+                pending.append(ch)
+                continue
+            if pending:
+                words.append("".join(pending))
+                pending = []
+            words.append(ch)
+        if pending:
+            words.append("".join(pending))
+    return words
+def read_yaml(yaml_path: Union[str, Path]) -> Dict:
+    """Parse a YAML file and return its content.
+    :param yaml_path: location of the YAML file
+    :return: the parsed document
+    :raises FileExistsError: when ``yaml_path`` does not exist
+    """
+    if not Path(yaml_path).exists():
+        raise FileExistsError(f"The {yaml_path} does not exist.")
+    # NOTE(review): yaml.Loader is PyYAML's full loader and can build arbitrary
+    # Python objects; only use this on files you trust.
+    with open(str(yaml_path), "rb") as stream:
+        return yaml.load(stream, Loader=yaml.Loader)

requirements.txt ADDED Viewed

Binary file (5.01 kB). View file