import re
from typing import Literal

import opencc


class Corrector:
    """Post-process SenseVoice transcripts into Hong Kong Traditional Chinese.

    The SenseVoice model outputs Simplified Chinese only; this class converts
    that output to Traditional Chinese and fixes common Cantonese spelling
    errors with a list of regex substitutions.
    """

    def __init__(self, corrector: Literal["opencc"] = "opencc"):
        """Set up the conversion backend and the regex correction rules.

        Args:
            corrector: Name of the backend. Only ``"opencc"`` creates a
                converter; any other value leaves ``self.converter`` as
                ``None`` and makes ``correct`` raise ``ValueError``.
        """
        self.corrector = corrector
        self.converter = None
        # Not used by any method of this class; kept so the instance
        # attributes stay the same for external code.
        self.bert_model = None
        if corrector == "opencc":
            # "s2hk" is the OpenCC configuration name passed to the converter.
            self.converter = opencc.OpenCC("s2hk")

        # (compiled pattern, replacement) pairs, applied in order after the
        # OpenCC conversion.
        self.regular_errors: list[tuple[re.Pattern, str]] = [
            # Rewrite 俾 as 畀, except where it is directly followed by
            # 路支, 斯麥 or 益 (negative lookahead keeps those words intact).
            (re.compile(r"俾(?!(?:路支|斯麥|益))"), r"畀"),
            # NOTE(review): a second rule used to follow here with the pattern
            # r"(?" and an empty replacement. That pattern is an incomplete
            # group, so re.compile() raised re.error and the constructor
            # always failed. The intended pattern could not be recovered, so
            # the rule is omitted; restore it once the real regex is known.
        ]

    def correct(self, text: str) -> str:
        """Strip ``text`` and run it through the configured corrector.

        Args:
            text: Input text to correct.

        Returns:
            The corrected text. If ``text`` is empty after stripping, the
            empty string is returned without calling the converter.

        Raises:
            ValueError: If ``self.corrector`` is not ``"opencc"``.
        """
        text = text.strip()
        if not text:
            # Early return for empty string
            return text

        if self.corrector == "opencc":
            return self.opencc_correct(text)
        # NOTE(review): the message mentions 'bert', but no such backend is
        # implemented in this class; wording kept as-is for callers.
        raise ValueError("corrector should be either 'opencc' or 'bert'")

    def opencc_correct(self, text: str) -> str:
        """Convert ``text`` with OpenCC, then apply the regex corrections.

        Args:
            text: Input text to convert.

        Returns:
            The converted text after every rule in ``self.regular_errors``
            has been applied in order.

        Raises:
            AttributeError: If the instance was created with a corrector
                other than ``"opencc"``, because ``self.converter`` is then
                ``None``.
        """
        opencc_text = self.converter.convert(text)
        for pattern, replacement in self.regular_errors:
            opencc_text = pattern.sub(replacement, opencc_text)
        return opencc_text