Spaces:

KdaiP
/

StableTTS1.1

Running

App Files Files Community

KdaiP committed on Sep 8, 2024

Commit

3dd84f8

verified ·

1 Parent(s): 015e033

Upload 80 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
api.py +103 -0
audios/1.wav +0 -0
audios/2.wav +0 -0
audios/3.wav +0 -0
audios/4.wav +3 -0
audios/5.wav +0 -0
audios/6.wav +0 -0
audios/7.wav +0 -0
audios/8.wav +0 -0
checkpoints/.keep +0 -0
checkpoints/checkpoint_0.pt +3 -0
config.py +50 -0
datas/__init__.py +0 -0
datas/dataset.py +69 -0
datas/sampler.py +131 -0
models/__init__.py +0 -0
models/diffusion_transformer.py +205 -0
models/duration_predictor.py +40 -0
models/estimator.py +138 -0
models/flow_matching.py +100 -0
models/model.py +178 -0
models/reference_encoder.py +168 -0
models/text_encoder.py +44 -0
monotonic_align/__init__.py +16 -0
monotonic_align/core.py +46 -0
requirements.txt +33 -0
text/LICENSE +19 -0
text/__init__.py +71 -0
text/cleaners.py +58 -0
text/cn2an/__init__.py +16 -0
text/cn2an/an2cn.py +204 -0
text/cn2an/cn2an.py +294 -0
text/cn2an/conf.py +135 -0
text/cn2an/transform.py +104 -0
text/cnm3/ds_CNM3.txt +606 -0
text/custom_pypinyin_dict/__init__.py +1 -0
text/custom_pypinyin_dict/cc_cedict_0.py +0 -0
text/custom_pypinyin_dict/cc_cedict_1.py +0 -0
text/custom_pypinyin_dict/cc_cedict_2.py +0 -0
text/custom_pypinyin_dict/cc_cedict_3.py +14 -0
text/custom_pypinyin_dict/genshin.py +11 -0
text/custom_pypinyin_dict/phrase_pinyin_data.py +24 -0
text/english.py +175 -0
text/japanese.py +157 -0
text/mandarin.py +173 -0
text/symbols.py +79 -0
utils/__init__.py +0 -0
utils/audio.py +74 -0
utils/load.py +43 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+audios/4.wav filter=lfs diff=lfs merge=lfs -text

api.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import torch
+import torch.nn as nn
+from dataclasses import asdict
+from utils.audio import LogMelSpectrogram
+from config import ModelConfig, MelConfig
+from models.model import StableTTS
+from text import symbols
+from text import cleaned_text_to_sequence
+from text.mandarin import chinese_to_cnm3
+from text.english import english_to_ipa2
+from text.japanese import japanese_to_ipa2
+from datas.dataset import intersperse
+from utils.audio import load_and_resample_audio
+def get_vocoder(model_path, model_name='ffgan') -> nn.Module:
+    """
+    Build the vocoder selected by ``model_name`` from the weights at ``model_path``.
+    Args:
+        model_path: checkpoint path; handed to the FireflyGAN wrapper, or to ``torch.load`` for Vocos.
+        model_name: ``'ffgan'`` (default) or ``'vocos'``.
+    Returns:
+        The constructed vocoder module (the Vocos branch also switches it to eval mode).
+    Raises:
+        NotImplementedError: if ``model_name`` is neither ``'ffgan'`` nor ``'vocos'``.
+    """
+    if model_name not in ('ffgan', 'vocos'):
+        raise NotImplementedError(f"Unsupported model: {model_name}")
+    if model_name == 'vocos':
+        from vocoders.vocos.models.model import Vocos
+        from config import VocosConfig, MelConfig
+        vocos = Vocos(VocosConfig(), MelConfig())
+        # load on CPU; the caller decides where the module finally lives
+        weights = torch.load(model_path, weights_only=True, map_location='cpu')
+        vocos.load_state_dict(weights)
+        vocos.eval()
+        return vocos
+    # training or changing ffgan config is not supported in this repo
+    # you can train your own model at https://github.com/fishaudio/vocoder
+    from vocoders.ffgan.model import FireflyGANBaseWrapper
+    return FireflyGANBaseWrapper(model_path)
+class StableTTSAPI(nn.Module):
+    """
+    Inference wrapper that chains the StableTTS text-to-mel model with a vocoder.
+    Args:
+        tts_model_path: path of the StableTTS state dict (loaded on CPU with ``weights_only=True``).
+        vocoder_model_path: path of the vocoder weights, forwarded to ``get_vocoder``.
+        vocoder_name: vocoder type forwarded to ``get_vocoder`` (``'ffgan'`` by default).
+    """
+    def __init__(self, tts_model_path, vocoder_model_path, vocoder_name='ffgan'):
+        super().__init__()
+        self.mel_config = MelConfig()
+        self.tts_model_config = ModelConfig()
+        self.mel_extractor = LogMelSpectrogram(**asdict(self.mel_config))
+        # text to mel spectrogram
+        self.tts_model = StableTTS(len(symbols), self.mel_config.n_mels, **asdict(self.tts_model_config))
+        self.tts_model.load_state_dict(torch.load(tts_model_path, map_location='cpu', weights_only=True))
+        self.tts_model.eval()
+        # mel spectrogram to waveform
+        self.vocoder_model = get_vocoder(vocoder_model_path, vocoder_name)
+        self.vocoder_model.eval()
+        # language name -> grapheme-to-phoneme function
+        self.g2p_mapping = {
+            'chinese': chinese_to_cnm3,
+            'japanese': japanese_to_ipa2,
+            'english': english_to_ipa2,
+        }
+        self.supported_languages = self.g2p_mapping.keys()
+    @torch.inference_mode()
+    def inference(self, text, ref_audio, language, step, temperature=1.0, length_scale=1.0, solver=None, cfg=3.0):
+        """
+        Synthesise ``text`` in the voice of ``ref_audio``.
+        Args:
+            text: raw input text; converted to phonemes with the function registered for ``language``.
+            ref_audio: reference audio, passed to ``load_and_resample_audio`` together with the configured sample rate.
+            language: one of the keys of ``self.g2p_mapping``.
+            step, temperature, length_scale, solver, cfg: forwarded unchanged to ``StableTTS.synthesise``.
+        Returns:
+            ``(audio_output, mel_output)``, both moved to CPU.
+        Raises:
+            ValueError: if ``language`` has no registered phonemizer.
+        """
+        phonemizer = self.g2p_mapping.get(language)
+        if phonemizer is None:
+            # fail early with a readable message instead of "'NoneType' object is not callable"
+            raise ValueError(f"Unsupported language: {language!r}. Supported languages: {sorted(self.g2p_mapping)}")
+        device = next(self.parameters()).device
+        text = phonemizer(text)
+        # blank token (id 0) between phonemes, then add the batch dimension
+        text = torch.tensor(intersperse(cleaned_text_to_sequence(text), item=0), dtype=torch.long, device=device).unsqueeze(0)
+        text_length = torch.tensor([text.size(-1)], dtype=torch.long, device=device)
+        ref_audio = load_and_resample_audio(ref_audio, self.mel_config.sample_rate).to(device)
+        ref_audio = self.mel_extractor(ref_audio)
+        mel_output = self.tts_model.synthesise(text, text_length, step, temperature, ref_audio, length_scale, solver, cfg)['decoder_outputs']
+        audio_output = self.vocoder_model(mel_output)
+        return audio_output.cpu(), mel_output.cpu()
+    def get_params(self):
+        """Return ``(tts_param, vocoder_param)``: parameter counts of both models, in millions."""
+        tts_param = sum(p.numel() for p in self.tts_model.parameters()) / 1e6
+        vocoder_param = sum(p.numel() for p in self.vocoder_model.parameters()) / 1e6
+        return tts_param, vocoder_param
+if __name__ == '__main__':
+    # Smoke test: synthesise one Chinese sentence with the Vocos vocoder and write it to output.wav
+    tts_ckpt = './checkpoints/checkpoint_0.pt'
+    vocoder_ckpt = './vocoders/pretrained/vocos.pt'
+    run_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    api = StableTTSAPI(tts_ckpt, vocoder_ckpt, 'vocos')
+    api.to(run_device)
+    sentence = '樱落满殇祈念集……殇歌花落集思祈……樱花满地集于我心……揲舞纷飞祈愿相随……'
+    # NOTE(review): the reference clips uploaded with this commit are listed under audios/ — confirm this path exists
+    reference_wav = './audio_1.wav'
+    waveform, mel = api.inference(sentence, reference_wav, 'chinese', 10, solver='dopri5', cfg=3)
+    for produced in (waveform, mel):
+        print(produced.shape)
+    import torchaudio
+    torchaudio.save('output.wav', waveform, MelConfig().sample_rate)

audios/1.wav ADDED Viewed

Binary file (374 kB). View file

audios/2.wav ADDED Viewed

Binary file (182 kB). View file

audios/3.wav ADDED Viewed

Binary file (529 kB). View file

audios/4.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e6672b81d7dd41cac56cf49b75bb66a5486b5fe969ddab0f98f14b05be7857df
+size 1349150

audios/5.wav ADDED Viewed

Binary file (368 kB). View file

audios/6.wav ADDED Viewed

Binary file (431 kB). View file

audios/7.wav ADDED Viewed

Binary file (514 kB). View file

audios/8.wav ADDED Viewed

Binary file (420 kB). View file

checkpoints/.keep ADDED Viewed

File without changes

checkpoints/checkpoint_0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b041bea13241b402bbfcdbfffd14381774be1179bae78e99ebd505d6d89f9367
+size 126657600

config.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from dataclasses import dataclass
+from typing import Optional
+@dataclass
+class MelConfig:
+    """
+    Mel-spectrogram settings.
+    api.py expands this dataclass into the keyword arguments of LogMelSpectrogram.
+    """
+    sample_rate: int = 44100
+    n_fft: int = 2048
+    win_length: int = 2048
+    hop_length: int = 512
+    f_min: float = 0.0
+    f_max: Optional[float] = None # the default is None, so the annotation has to admit it
+    pad: int = 0 # 0 is a placeholder that __post_init__ replaces
+    n_mels: int = 128
+    center: bool = False
+    pad_mode: str = "reflect"
+    mel_scale: str = "slaney"
+    def __post_init__(self):
+        # derive the padding from n_fft and hop_length when it was left at 0
+        if self.pad == 0:
+            self.pad = (self.n_fft - self.hop_length) // 2
+@dataclass
+class ModelConfig:
+    """
+    StableTTS architecture hyperparameters.
+    api.py expands this dataclass into the keyword arguments of StableTTS.
+    """
+    hidden_channels: int = 256
+    filter_channels: int = 1024
+    n_heads: int = 4
+    n_enc_layers: int = 3
+    n_dec_layers: int = 6
+    kernel_size: int = 3
+    p_dropout: float = 0.1 # a probability, so float (was annotated as int)
+    gin_channels: int = 256
+@dataclass
+class TrainConfig:
+    """Training hyperparameters and filesystem locations."""
+    # filelist consumed for training; NOTE(review): presumably the JSON-lines file read by StableDataset — confirm
+    train_dataset_path: str = 'filelists/filelist.json'
+    test_dataset_path: str = 'filelists/filelist.json' # not used
+    batch_size: int = 32
+    learning_rate: float = 1e-4
+    num_epochs: int = 10000
+    model_save_path: str = './checkpoints'
+    log_dir: str = './runs'
+    # NOTE(review): units of the two intervals (steps vs. epochs) are not visible here — confirm in the training script
+    log_interval: int = 16
+    save_interval: int = 1
+    warmup_steps: int = 200
+@dataclass
+class VocosConfig:
+    """Settings passed to the Vocos vocoder constructor (see get_vocoder in api.py)."""
+    # NOTE(review): equals the default of MelConfig.n_mels; presumably the two must stay in sync — confirm
+    input_channels: int = 128
+    dim: int = 512
+    intermediate_dim: int = 1536
+    num_layers: int = 8

datas/__init__.py ADDED Viewed

File without changes

datas/dataset.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import os
+import random
+import json
+import torch
+from torch.utils.data import Dataset
+from text import cleaned_text_to_sequence
+def intersperse(lst: list, item: int):
+    """
+    Return a new list with ``item`` placed before, between and after the elements of ``lst``.
+    The blank token between input tokens improves pronunciation,
+    see https://github.com/jaywalnut310/glow-tts/issues/43 for more details
+    """
+    padded = [item]
+    for token in lst:
+        padded.append(token)
+        padded.append(item)
+    return padded
+class StableDataset(Dataset):
+    """
+    Dataset over a JSON-lines filelist of precomputed mel spectrograms and phoneme strings.
+    Every non-empty line must be a JSON object with the keys ``mel_path``, ``phone`` and ``mel_length``.
+    """
+    def __init__(self, filelist_path, hop_length):
+        self.filelist_path = filelist_path
+        self.hop_length = hop_length
+        self._load_filelist(filelist_path)
+    def _load_filelist(self, filelist_path):
+        """Parse the filelist into ``self.filelist`` (``(mel_path, phone)`` pairs) and ``self.lengths``."""
+        filelist, lengths = [], []
+        with open(filelist_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    # tolerate blank lines instead of crashing inside json.loads
+                    continue
+                entry = json.loads(line)
+                filelist.append((entry['mel_path'], entry['phone']))
+                lengths.append(entry['mel_length'])
+        self.filelist = filelist
+        self.lengths = lengths # length is used for DistributedBucketSampler
+    def __len__(self):
+        return len(self.filelist)
+    def __getitem__(self, idx):
+        """Return ``(mel, phone)``: the tensor stored at ``mel_path`` and the blank-interspersed phoneme ids."""
+        mel_path, phone = self.filelist[idx]
+        mel = torch.load(mel_path, map_location='cpu', weights_only=True)
+        phone = torch.tensor(intersperse(cleaned_text_to_sequence(phone), 0), dtype=torch.long)
+        return mel, phone
+def collate_fn(batch):
+    """
+    Collate ``(mel, phone)`` samples into zero-padded batch tensors.
+    Returns:
+        ``(texts_padded, text_lengths, mels_padded, mel_lengths, mels_sliced_padded, mels_sliced_lengths)``
+        where the "sliced" mels are random crops produced by ``random_slice_tensor``.
+    """
+    def lengths_of(tensors):
+        # size of the last dimension of every tensor
+        return torch.tensor([t.size(-1) for t in tensors], dtype=torch.long)
+    def pad(tensors):
+        # pad to the same length
+        return torch.nested.to_padded_tensor(torch.nested.nested_tensor(tensors), padding=0)
+    texts = [sample[1] for sample in batch]
+    mels = [sample[0] for sample in batch]
+    mels_sliced = [random_slice_tensor(mel) for mel in mels]
+    text_lengths = lengths_of(texts)
+    mel_lengths = lengths_of(mels)
+    mels_sliced_lengths = lengths_of(mels_sliced)
+    return pad(texts), text_lengths, pad(mels), mel_lengths, pad(mels_sliced), mels_sliced_lengths
+# random slice mel for reference encoder to prevent overfitting
+def random_slice_tensor(x: torch.Tensor):
+    """
+    Return a random contiguous crop of ``x`` along its last dimension.
+    Inputs shorter than 8 frames are returned unchanged; otherwise the crop length is drawn
+    uniformly from ``[max(1, length // 12), length // 3]`` and its start position uniformly
+    among all positions where the crop fits.
+    """
+    length = x.size(-1)
+    if length < 8:
+        return x
+    # length // 12 is 0 for lengths 8..11, which used to allow an empty crop; always keep at least one frame
+    segment_size = random.randint(max(1, length // 12), length // 3)
+    start = random.randint(0, length - segment_size)
+    return x[..., start : start + segment_size]

datas/sampler.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import torch
+# reference: https://github.com/jaywalnut310/vits/blob/main/data_utils.py
+class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
+    """
+    Maintain similar input lengths in a batch.
+    Length groups are specified by boundaries.
+    Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
+    It removes samples which are not included in the boundaries.
+    Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
+    Iterating the sampler yields whole batches (lists of dataset indices), not single indices.
+    """
+    def __init__(
+        self,
+        dataset,
+        batch_size,
+        boundaries,
+        num_replicas=None,
+        rank=None,
+        shuffle=True,
+    ):
+        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
+        # the dataset must expose one length per sample (see StableDataset.lengths)
+        self.lengths = dataset.lengths
+        self.batch_size = batch_size
+        # NOTE(review): the caller's list object is stored as-is and _create_buckets pops from it in place
+        self.boundaries = boundaries
+        self.buckets, self.num_samples_per_bucket = self._create_buckets()
+        # override the values computed by DistributedSampler with the padded bucket totals
+        self.total_size = sum(self.num_samples_per_bucket)
+        self.num_samples = self.total_size // self.num_replicas
+    def _create_buckets(self):
+        """
+        Assign every sample index to a length bucket, drop empty buckets and compute the padded bucket sizes.
+        Returns ``(buckets, num_samples_per_bucket)``; each padded size is the bucket length rounded up
+        to a multiple of ``num_replicas * batch_size``.
+        """
+        buckets = [[] for _ in range(len(self.boundaries) - 1)]
+        for i in range(len(self.lengths)):
+            length = self.lengths[i]
+            idx_bucket = self._bisect(length)
+            # -1 means the length falls outside all boundaries: the sample is discarded
+            if idx_bucket != -1:
+                buckets[idx_bucket].append(i)
+        # from https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/data_utils.py
+        # avoid "integer division or modulo by zero" error for very small dataset
+        try:
+            # this pass stops at index 1, so bucket 0 is never examined here
+            for i in range(len(buckets) - 1, 0, -1):
+                if len(buckets[i]) == 0:
+                    buckets.pop(i)
+                    self.boundaries.pop(i + 1)
+            # fails (and triggers the fallback below) when bucket 0 is empty
+            assert all(len(bucket) > 0 for bucket in buckets)
+        # When one bucket is not traversed
+        except Exception as e:
+            print('Bucket warning ', e)
+            # second pass that also covers index 0
+            for i in range(len(buckets) - 1, -1, -1):
+                if len(buckets[i]) == 0:
+                    buckets.pop(i)
+                    self.boundaries.pop(i + 1)
+        num_samples_per_bucket = []
+        for i in range(len(buckets)):
+            len_bucket = len(buckets[i])
+            total_batch_size = self.num_replicas * self.batch_size
+            # number of extra samples needed to reach the next multiple of total_batch_size (0 if already a multiple)
+            rem = (
+                total_batch_size - (len_bucket % total_batch_size)
+            ) % total_batch_size
+            num_samples_per_bucket.append(len_bucket + rem)
+        return buckets, num_samples_per_bucket
+    def __iter__(self):
+        """Build this rank's batches for the current epoch and return an iterator over them."""
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+        # indices[i] holds positions inside buckets[i], not dataset indices
+        indices = []
+        if self.shuffle:
+            for bucket in self.buckets:
+                indices.append(torch.randperm(len(bucket), generator=g).tolist())
+        else:
+            for bucket in self.buckets:
+                indices.append(list(range(len(bucket))))
+        batches = []
+        for i in range(len(self.buckets)):
+            bucket = self.buckets[i]
+            len_bucket = len(bucket)
+            ids_bucket = indices[i]
+            num_samples_bucket = self.num_samples_per_bucket[i]
+            # add extra samples to make it evenly divisible
+            rem = num_samples_bucket - len_bucket
+            # repeat the whole bucket as often as needed, then a partial copy for the remainder
+            ids_bucket = (
+                ids_bucket
+                + ids_bucket * (rem // len_bucket)
+                + ids_bucket[: (rem % len_bucket)]
+            )
+            # subsample
+            ids_bucket = ids_bucket[self.rank :: self.num_replicas]
+            # batching
+            for j in range(len(ids_bucket) // self.batch_size):
+                # map bucket positions back to dataset indices
+                batch = [
+                    bucket[idx]
+                    for idx in ids_bucket[
+                        j * self.batch_size : (j + 1) * self.batch_size
+                    ]
+                ]
+                batches.append(batch)
+        if self.shuffle:
+            # shuffle the order of the batches as well, with the same epoch-seeded generator
+            batch_ids = torch.randperm(len(batches), generator=g).tolist()
+            batches = [batches[i] for i in batch_ids]
+        self.batches = batches
+        assert len(self.batches) * self.batch_size == self.num_samples
+        return iter(self.batches)
+    def _bisect(self, x, lo=0, hi=None):
+        """
+        Recursive binary search over ``self.boundaries``.
+        Returns the index ``i`` with ``boundaries[i] < x <= boundaries[i + 1]``, or -1 if there is none.
+        """
+        if hi is None:
+            hi = len(self.boundaries) - 1
+        if hi > lo:
+            mid = (hi + lo) // 2
+            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
+                return mid
+            elif x <= self.boundaries[mid]:
+                return self._bisect(x, lo, mid)
+            else:
+                return self._bisect(x, mid + 1, hi)
+        else:
+            return -1
+    def __len__(self):
+        # number of batches yielded per rank
+        return self.num_samples // self.batch_size

models/__init__.py ADDED Viewed

File without changes

models/diffusion_transformer.py ADDED Viewed

	@@ -0,0 +1,205 @@

+# References:
+# https://github.com/shivammehta25/Matcha-TTS/blob/main/matcha/models/components/transformer.py
+# https://github.com/jaywalnut310/vits/blob/main/attentions.py
+# https://github.com/pytorch-labs/gpt-fast/blob/main/model.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class FFN(nn.Module):
+    """
+    Masked convolutional feed-forward block: Conv1d -> SiLU -> Dropout -> Conv1d.
+    The mask is applied before each convolution and on the output.
+    """
+    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., gin_channels=0):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.gin_channels = gin_channels
+        same_padding = kernel_size // 2
+        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=same_padding)
+        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size, padding=same_padding)
+        self.drop = nn.Dropout(p_dropout)
+        self.act1 = nn.SiLU(inplace=True)
+    def forward(self, x, x_mask):
+        """Apply the block to ``x`` and return the masked result; ``x_mask`` must broadcast against ``x``."""
+        hidden = self.conv_1(x * x_mask)
+        hidden = self.drop(self.act1(hidden))
+        return self.conv_2(hidden * x_mask) * x_mask
+class MultiHeadAttention(nn.Module):
+    """
+    Multi-head self-attention over ``[batch, channels, time]`` tensors.
+    Projections are 1x1 convolutions; queries and keys get rotary position embeddings
+    before ``F.scaled_dot_product_attention``.
+    """
+    def __init__(self, channels, out_channels, n_heads, p_dropout=0.):
+        super().__init__()
+        assert channels % n_heads == 0
+        self.channels = channels
+        self.out_channels = out_channels
+        self.n_heads = n_heads
+        self.p_dropout = p_dropout
+        self.k_channels = channels // n_heads
+        self.conv_q = torch.nn.Conv1d(channels, channels, 1)
+        self.conv_k = torch.nn.Conv1d(channels, channels, 1)
+        self.conv_v = torch.nn.Conv1d(channels, channels, 1)
+        # from https://nn.labml.ai/transformers/rope/index.html
+        # rotate only half of each head's features (RotaryPositionalEmbeddings converts the value with int())
+        rotary_features = self.k_channels * 0.5
+        self.query_rotary_pe = RotaryPositionalEmbeddings(rotary_features)
+        self.key_rotary_pe = RotaryPositionalEmbeddings(rotary_features)
+        self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
+        self.drop = torch.nn.Dropout(p_dropout)
+        for projection in (self.conv_q, self.conv_k, self.conv_v):
+            torch.nn.init.xavier_uniform_(projection.weight)
+    def forward(self, x, attn_mask=None):
+        """Self-attend over ``x`` and return the output projection of the attended values."""
+        attended = self.attention(self.conv_q(x), self.conv_k(x), self.conv_v(x), mask=attn_mask)
+        return self.conv_o(attended)
+    def attention(self, query, key, value, mask=None):
+        """Split into heads, apply RoPE to query/key, attend, and merge the heads back to ``[b, d, t_t]``."""
+        batch, channels, src_len = key.size()
+        tgt_len = query.size(2)
+        def split_heads(tensor, length):
+            # [b, d, t] -> [b, n_head, t, d // n_head]
+            return tensor.view(batch, self.n_heads, self.k_channels, length).transpose(2, 3)
+        query = self.query_rotary_pe(split_heads(query, tgt_len))
+        key = self.key_rotary_pe(split_heads(key, src_len))
+        value = split_heads(value, src_len)
+        # attention dropout is active in training mode only
+        dropout_p = self.p_dropout if self.training else 0
+        attended = F.scaled_dot_product_attention(query, key, value, attn_mask=mask, dropout_p=dropout_p)
+        # [b, n_h, t_t, d_k] -> [b, d, t_t]
+        return attended.transpose(2, 3).contiguous().view(batch, channels, tgt_len)
+# modified from https://github.com/sh-lee-prml/HierSpeechpp/blob/main/modules.py#L390
+class DiTConVBlock(nn.Module):
+    """
+    A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning:
+    self-attention and a convolutional FFN, each shifted, scaled and gated by the condition.
+    """
+    def __init__(self, hidden_channels, filter_channels, num_heads, kernel_size=3, p_dropout=0.1, gin_channels=0):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(hidden_channels, elementwise_affine=False)
+        self.attn = MultiHeadAttention(hidden_channels, hidden_channels, num_heads, p_dropout)
+        self.norm2 = nn.LayerNorm(hidden_channels, elementwise_affine=False)
+        self.mlp = FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)
+        # project the condition to hidden_channels only when its width differs
+        if gin_channels == hidden_channels:
+            cond_in = nn.Identity()
+        else:
+            cond_in = nn.Linear(gin_channels, hidden_channels)
+        self.adaLN_modulation = nn.Sequential(
+            cond_in,
+            nn.SiLU(),
+            nn.Linear(hidden_channels, 6 * hidden_channels, bias=True),
+        )
+    @staticmethod
+    def _channel_norm(norm, x):
+        # LayerNorm works on the last dimension, so normalise channels via [b, t, c] and switch back
+        return norm(x.transpose(1, 2)).transpose(1, 2)
+    def forward(self, x, c, x_mask):
+        """
+        Args:
+            x : [batch_size, channel, time]
+            c : [batch_size, channel]
+            x_mask : [batch_size, 1, time]
+        return the same shape as x
+        """
+        x = x * x_mask
+        pair_mask = x_mask.unsqueeze(1) * x_mask.unsqueeze(-1) # shape: [batch_size, 1, time, time]
+        # additive mask: 0 where both positions are valid, the most negative finite value elsewhere
+        attn_mask = torch.zeros_like(pair_mask).masked_fill(pair_mask == 0, -torch.finfo(x.dtype).max)
+        modulation = self.adaLN_modulation(c).unsqueeze(2) # shape: [batch_size, 6 * channel, 1]
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=1)
+        attn_in = self.modulate(self._channel_norm(self.norm1, x), shift_msa, scale_msa)
+        x = x + gate_msa * self.attn(attn_in, attn_mask) * x_mask
+        mlp_in = self.modulate(self._channel_norm(self.norm2, x), shift_mlp, scale_mlp)
+        x = x + gate_mlp * self.mlp(mlp_in, x_mask)
+        return x
+    @staticmethod
+    def modulate(x, shift, scale):
+        """Affine modulation ``x * (1 + scale) + shift``."""
+        return x * (1 + scale) + shift
+class RotaryPositionalEmbeddings(nn.Module):
+    r"""
+    ## RoPE module
+    Rotary encoding transforms pairs of features by rotating in the 2D plane.
+    That is, it organizes the $d$ features as $\frac{d}{2}$ pairs.
+    Each pair can be considered a coordinate in a 2D plane, and the encoding will rotate it
+    by an angle depending on the position of the token.
+    """
+    def __init__(self, d: int, base: int = 10_000):
+        r"""
+        * `d` is the number of features $d$ that get rotated (converted with `int`, so a float is truncated)
+        * `base` is the constant used for calculating $\Theta$
+        """
+        super().__init__()
+        self.base = base
+        self.d = int(d)
+        self.cos_cached = None
+        self.sin_cached = None
+    def _build_cache(self, x: torch.Tensor):
+        r"""
+        Cache $\cos$ and $\sin$ values for `x` of shape `[seq_len, batch_size, n_heads, d]`
+        """
+        # Get sequence length
+        seq_len = x.shape[0]
+        # The cache is a plain attribute, so `module.to(device)` does not move it:
+        # reuse it only when it is long enough AND already lives on the device of `x`
+        if (
+            self.cos_cached is not None
+            and seq_len <= self.cos_cached.shape[0]
+            and self.cos_cached.device == x.device
+        ):
+            return
+        # $\Theta = {\theta_i = 10000^{-\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
+        theta = 1.0 / (self.base ** (torch.arange(0, self.d, 2).float() / self.d)).to(x.device)
+        # Create position indexes `[0, 1, ..., seq_len - 1]`
+        seq_idx = torch.arange(seq_len, device=x.device).float()
+        # Calculate the product of position index and $\theta_i$
+        idx_theta = torch.einsum("n,d->nd", seq_idx, theta)
+        # Concatenate so that for row $m$ we have
+        # $[m \theta_0, m \theta_1, ..., m \theta_{\frac{d}{2}}, m \theta_0, m \theta_1, ..., m \theta_{\frac{d}{2}}]$
+        idx_theta2 = torch.cat([idx_theta, idx_theta], dim=1)
+        # Cache them
+        self.cos_cached = idx_theta2.cos()[:, None, None, :]
+        self.sin_cached = idx_theta2.sin()[:, None, None, :]
+    def _neg_half(self, x: torch.Tensor):
+        # $\frac{d}{2}$
+        d_2 = self.d // 2
+        # Calculate $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., x^{(\frac{d}{2})}]$
+        return torch.cat([-x[:, :, :, d_2:], x[:, :, :, :d_2]], dim=-1)
+    def forward(self, x: torch.Tensor):
+        """
+        * `x` is the per-head key or query tensor with shape `[batch_size, n_heads, seq_len, d_head]`
+        Returns a tensor of the same shape: the first `d` features of every head are rotated,
+        the remaining features pass through unchanged.
+        """
+        # The rotation code works on `[seq_len, batch_size, n_heads, d_head]`
+        x = x.permute(2, 0, 1, 3) # b h t d -> t b h d
+        # Cache $\cos$ and $\sin$ values
+        self._build_cache(x)
+        # Split the features, we can choose to apply rotary embeddings only to a partial set of features.
+        x_rope, x_pass = x[..., : self.d], x[..., self.d :]
+        # Calculate
+        # $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., x^{(\frac{d}{2})}]$
+        neg_half_x = self._neg_half(x_rope)
+        x_rope = (x_rope * self.cos_cached[: x.shape[0]]) + (neg_half_x * self.sin_cached[: x.shape[0]])
+        return torch.cat((x_rope, x_pass), dim=-1).permute(1, 2, 0, 3) # t b h d -> b h t d
+class Transpose(nn.Identity):
+    """Swap dimensions 1 and 2: (N, T, D) -> (N, D, T)"""
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return torch.transpose(input, 1, 2)

models/duration_predictor.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import torch
+import torch.nn as nn
+# modified from https://github.com/jaywalnut310/vits/blob/main/models.py#L98
+class DurationPredictor(nn.Module):
+    """
+    Two masked Conv1d -> ReLU -> LayerNorm -> Dropout stages followed by a 1x1 projection to one channel.
+    Both the input and the conditioning vector are detached, so no gradient flows back through them.
+    """
+    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
+        super().__init__()
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.gin_channels = gin_channels
+        same_padding = kernel_size // 2
+        self.drop = nn.Dropout(p_dropout)
+        self.conv1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=same_padding)
+        self.norm1 = nn.LayerNorm(filter_channels)
+        self.conv2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=same_padding)
+        self.norm2 = nn.LayerNorm(filter_channels)
+        self.proj = nn.Conv1d(filter_channels, 1, 1)
+        self.cond = nn.Conv1d(gin_channels, in_channels, 1)
+    def forward(self, x, x_mask, g):
+        """Return the masked single-channel prediction for ``x`` conditioned on ``g`` (``g`` gets a trailing time axis)."""
+        x = x.detach() + self.cond(g.unsqueeze(2).detach())
+        for conv, norm in ((self.conv1, self.norm1), (self.conv2, self.norm2)):
+            x = torch.relu(conv(x * x_mask))
+            # LayerNorm normalises the last dimension, so move channels there and back
+            x = norm(x.transpose(1, 2)).transpose(1, 2)
+            x = self.drop(x)
+        return self.proj(x * x_mask) * x_mask
+def duration_loss(logw, logw_, lengths):
+    """Sum of squared differences between ``logw`` and ``logw_``, divided by the sum of ``lengths``."""
+    squared_error = (logw - logw_).pow(2).sum()
+    return squared_error / lengths.sum()

models/estimator.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import math
+import torch
+import torch.nn as nn
+from models.diffusion_transformer import DiTConVBlock
+class DitWrapper(nn.Module):
+    """ add FiLM layer to condition time embedding to DiT """
+    def __init__(self, hidden_channels, filter_channels, num_heads, kernel_size=3, p_dropout=0.1, gin_channels=0, time_channels=0):
+        super().__init__()
+        self.time_fusion = FiLMLayer(hidden_channels, time_channels)
+        self.block = DiTConVBlock(hidden_channels, filter_channels, num_heads, kernel_size, p_dropout, gin_channels)
+    def forward(self, x, c, t, x_mask):
+        """Modulate ``x`` with the time embedding ``t``, mask it, then run the DiT block conditioned on ``c``."""
+        time_conditioned = self.time_fusion(x, t) * x_mask
+        return self.block(time_conditioned, c, x_mask)
+class FiLMLayer(nn.Module):
+    """
+    Feature-wise Linear Modulation (FiLM) layer
+    Reference: https://arxiv.org/abs/1709.07871
+    A 1x1 convolution maps the condition to a per-channel scale and offset.
+    """
+    def __init__(self, in_channels, cond_channels):
+        super().__init__()
+        self.in_channels = in_channels
+        self.film = nn.Conv1d(cond_channels, in_channels * 2, 1)
+    def forward(self, x, c):
+        """Return ``gamma * x + beta``; ``c`` gets a trailing time axis so both terms broadcast over time."""
+        modulation = self.film(c.unsqueeze(2))
+        gamma, beta = modulation.chunk(2, dim=1)
+        return gamma * x + beta
+class SinusoidalPosEmb(nn.Module):
+    """
+    Sinusoidal embedding of scalar values: ``dim // 2`` sines followed by ``dim // 2`` cosines.
+    Frequencies are spaced geometrically from 1 down to 1/10000.
+    """
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+        assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"
+    def forward(self, x, scale=1000):
+        """
+        Args:
+            x: tensor of values, 0-dim or shape (batch,); a 0-dim input is treated as a batch of one.
+            scale: factor applied to ``x`` before the frequencies.
+        Returns:
+            tensor of shape (batch, dim)
+        """
+        if x.ndim < 1:
+            x = x.unsqueeze(0)
+        half_dim = self.dim // 2
+        # dim == 2 leaves a single frequency (1.0); max(..., 1) avoids the division by zero of (half_dim - 1)
+        log_step = math.log(10000) / max(half_dim - 1, 1)
+        freqs = torch.exp(torch.arange(half_dim, device=x.device).float() * -log_step)
+        angles = scale * x.unsqueeze(1) * freqs.unsqueeze(0)
+        return torch.cat((angles.sin(), angles.cos()), dim=-1)
+class TimestepEmbedding(nn.Module):
+    """Two-layer MLP (Linear -> SiLU -> Linear) applied to the timestep embedding."""
+    def __init__(self, in_channels, out_channels, filter_channels):
+        super().__init__()
+        stages = [
+            nn.Linear(in_channels, filter_channels),
+            nn.SiLU(inplace=True),
+            nn.Linear(filter_channels, out_channels),
+        ]
+        self.layer = nn.Sequential(*stages)
+    def forward(self, x):
+        embedded = self.layer(x)
+        return embedded
+# reference: https://github.com/shivammehta25/Matcha-TTS/blob/main/matcha/models/components/decoder.py
+class Decoder(nn.Module):
+    """
+    DiT-based estimator: a stack of DitWrapper blocks conditioned on a timestep embedding and a
+    global vector, optionally wired with U-Net-like long skip connections.
+    """
+    def __init__(self, noise_channels, cond_channels, hidden_channels, out_channels, filter_channels, dropout=0.1, n_layers=1, n_heads=4, kernel_size=3, gin_channels=0, use_lsc=True):
+        super().__init__()
+        self.noise_channels = noise_channels
+        self.cond_channels = cond_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.use_lsc = use_lsc # whether to use unet-like long skip connection
+        same_padding = kernel_size // 2
+        self.time_embeddings = SinusoidalPosEmb(hidden_channels)
+        self.time_mlp = TimestepEmbedding(hidden_channels, hidden_channels, filter_channels)
+        self.in_proj = nn.Conv1d(hidden_channels + noise_channels, hidden_channels, 1) # cat noise and encoder output as input
+        dit_blocks = [
+            DitWrapper(hidden_channels, filter_channels, n_heads, kernel_size, dropout, gin_channels, hidden_channels)
+            for _ in range(n_layers)
+        ]
+        self.blocks = nn.ModuleList(dit_blocks)
+        self.final_proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        # prenet for encoder output
+        self.cond_proj = nn.Sequential(
+            nn.Conv1d(cond_channels, filter_channels, kernel_size, padding=same_padding),
+            nn.SiLU(inplace=True),
+            nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=same_padding), # add about 3M params
+            nn.SiLU(inplace=True),
+            nn.Conv1d(filter_channels, hidden_channels, kernel_size, padding=same_padding),
+        )
+        if use_lsc:
+            # the first half of the blocks feeds the second half, so the count must be even
+            assert n_layers % 2 == 0
+            self.n_lsc_layers = n_layers // 2
+            fusion_convs = [
+                nn.Conv1d(hidden_channels + hidden_channels, hidden_channels, kernel_size, padding=same_padding)
+                for _ in range(self.n_lsc_layers)
+            ]
+            self.lsc_layers = nn.ModuleList(fusion_convs)
+        self.initialize_weights()
+    def initialize_weights(self):
+        """Zero the last adaLN modulation layer of every block, so all shifts, scales and gates start at 0."""
+        for wrapper in self.blocks:
+            last_linear = wrapper.block.adaLN_modulation[-1]
+            nn.init.zeros_(last_linear.weight)
+            nn.init.zeros_(last_linear.bias)
+    def forward(self, t, x, mask, mu, c):
+        """Forward pass of the DiT model.
+        Args:
+            t (torch.Tensor): timestep, shape (batch_size)
+            x (torch.Tensor): noise, shape (batch_size, noise_channels, time)
+            mask (torch.Tensor): shape (batch_size, 1, time)
+            mu (torch.Tensor): output of encoder, shape (batch_size, cond_channels, time)
+            c (torch.Tensor): shape (batch_size, gin_channels)
+        Returns:
+            torch.Tensor: masked output of the final projection, shape (batch_size, out_channels, time)
+        """
+        t = self.time_mlp(self.time_embeddings(t))
+        mu = self.cond_proj(mu)
+        x = self.in_proj(torch.cat((x, mu), dim=1))
+        skip_stack = []
+        for idx, block in enumerate(self.blocks):
+            # add long skip connection, see https://arxiv.org/pdf/2209.12152 for more details
+            if self.use_lsc:
+                if idx < self.n_lsc_layers:
+                    # first half: remember the input of each block
+                    skip_stack.append(x)
+                else:
+                    # second half: fuse with the most recently stored input (last in, first out)
+                    fused = torch.cat((x, skip_stack.pop()), dim=1)
+                    x = self.lsc_layers[idx - self.n_lsc_layers](fused)
+            x = block(x, c, t, mask)
+        return self.final_proj(x * mask) * mask

models/flow_matching.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import functools
+from torchdiffeq import odeint
+from models.estimator import Decoder
+# modified from https://github.com/shivammehta25/Matcha-TTS/blob/main/matcha/models/components/flow_matching.py
+class CFMDecoder(torch.nn.Module):
+    def __init__(self, noise_channels, cond_channels, hidden_channels, out_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, gin_channels):
+        super().__init__()
+        self.noise_channels = noise_channels
+        self.cond_channels = cond_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.gin_channels = gin_channels
+        self.sigma_min = 1e-4
+        self.estimator = Decoder(noise_channels, cond_channels, hidden_channels, out_channels, filter_channels, p_dropout, n_layers, n_heads, kernel_size, gin_channels)
+    @torch.inference_mode()
+    def forward(self, mu, mask, n_timesteps, temperature=1.0, c=None, solver=None, cfg_kwargs=None):
+        """Forward diffusion
+        Args:
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (torch.Tensor): output_mask
+                shape: (batch_size, 1, mel_timesteps)
+            n_timesteps (int): number of diffusion steps
+            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
+            c (torch.Tensor, optional): speaker embedding
+                shape: (batch_size, gin_channels)
+            solver: see https://github.com/rtqichen/torchdiffeq for supported solvers
+            cfg_kwargs: used for cfg inference
+        Returns:
+            sample: generated mel-spectrogram
+                shape: (batch_size, n_feats, mel_timesteps)
+        """
+        z = torch.randn_like(mu) * temperature
+        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
+        # cfg control
+        if cfg_kwargs is None:
+            estimator = functools.partial(self.estimator, mask=mask, mu=mu, c=c)
+        else:
+            estimator = functools.partial(self.cfg_wrapper, mask=mask, mu=mu, c=c, cfg_kwargs=cfg_kwargs)
+        trajectory = odeint(estimator, z, t_span, method=solver, rtol=1e-5, atol=1e-5)
+        return trajectory[-1]
+    # cfg inference
+    def cfg_wrapper(self, t, x, mask, mu, c, cfg_kwargs):
+        fake_speaker = cfg_kwargs['fake_speaker'].repeat(x.size(0), 1)
+        fake_content = cfg_kwargs['fake_content'].repeat(x.size(0), 1, x.size(-1))
+        cfg_strength = cfg_kwargs['cfg_strength']
+        cond_output = self.estimator(t, x, mask, mu, c)
+        uncond_output = self.estimator(t, x, mask, fake_content, fake_speaker)
+        output = uncond_output + cfg_strength * (cond_output - uncond_output)
+        return output
+    def compute_loss(self, x1, mask, mu, c):
+        """Computes diffusion loss
+        Args:
+            x1 (torch.Tensor): Target
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (torch.Tensor): target mask
+                shape: (batch_size, 1, mel_timesteps)
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            c (torch.Tensor, optional): speaker condition.
+        Returns:
+            loss: conditional flow matching loss
+            y: conditional flow
+                shape: (batch_size, n_feats, mel_timesteps)
+        """
+        b, _, t = mu.shape
+        # random timestep
+        # use cosine timestep scheduler from cosyvoice: https://github.com/FunAudioLLM/CosyVoice/blob/main/cosyvoice/flow/flow_matching.py
+        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
+        t = 1 - torch.cos(t * 0.5 * torch.pi)
+        # sample noise p(x_0)
+        z = torch.randn_like(x1)
+        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
+        u = x1 - (1 - self.sigma_min) * z
+        loss = F.mse_loss(self.estimator(t.squeeze(), y, mask, mu, c), u, reduction="sum") / (torch.sum(mask) * u.size(1))
+        return loss, y

models/model.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import math
+import torch
+import torch.nn as nn
+import monotonic_align
+from models.text_encoder import TextEncoder
+from models.flow_matching import CFMDecoder
+from models.reference_encoder import MelStyleEncoder
+from models.duration_predictor import DurationPredictor, duration_loss
+from utils.mask import sequence_mask
+def convert_pad_shape(pad_shape):
+    inverted_shape = pad_shape[::-1]
+    pad_shape = [item for sublist in inverted_shape for item in sublist]
+    return pad_shape
+def generate_path(duration, mask):
+    b, t_x, t_y = mask.shape
+    cum_duration = torch.cumsum(duration, 1)
+    path = torch.zeros(b, t_x, t_y, dtype=mask.dtype, device=duration.device)
+    cum_duration_flat = cum_duration.view(b * t_x)
+    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+    path = path.view(b, t_x, t_y)
+    path = path - torch.nn.functional.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+    path = path * mask
+    return path
+# modified from https://github.com/shivammehta25/Matcha-TTS/blob/main/matcha/models/matcha_tts.py
+class StableTTS(nn.Module):
+    def __init__(self, n_vocab, mel_channels, hidden_channels, filter_channels, n_heads, n_enc_layers, n_dec_layers, kernel_size, p_dropout, gin_channels):
+        super().__init__()
+        self.n_vocab = n_vocab
+        self.mel_channels = mel_channels
+        self.encoder = TextEncoder(n_vocab, mel_channels, hidden_channels, filter_channels, n_heads, n_enc_layers, kernel_size, p_dropout, gin_channels)
+        self.ref_encoder = MelStyleEncoder(mel_channels, style_vector_dim=gin_channels, style_kernel_size=5, dropout=0.25)
+        self.dp = DurationPredictor(hidden_channels, filter_channels, kernel_size, 0.5, gin_channels)
+        self.decoder = CFMDecoder(mel_channels, mel_channels, hidden_channels, mel_channels, filter_channels, n_heads, n_dec_layers, kernel_size, p_dropout, gin_channels)
+        # uncondition input for cfg
+        self.fake_speaker = nn.Parameter(torch.zeros(1, gin_channels))
+        self.fake_content = nn.Parameter(torch.zeros(1, mel_channels, 1))
+        self.cfg_dropout = 0.2
+    @torch.inference_mode()
+    def synthesise(self, x, x_lengths, n_timesteps, temperature=1.0, y=None, length_scale=1.0, solver=None, cfg=1.0):
+        """
+        Generates mel-spectrogram from text. Returns:
+            1. encoder outputs
+            2. decoder outputs
+            3. generated alignment
+        Args:
+            x (torch.Tensor): batch of texts, converted to a tensor with phoneme embedding ids.
+                shape: (batch_size, max_text_length)
+            x_lengths (torch.Tensor): lengths of texts in batch.
+                shape: (batch_size,)
+            n_timesteps (int): number of steps to use for reverse diffusion in decoder.
+            temperature (float, optional): controls variance of terminal distribution.
+            y (torch.Tensor): mel spectrogram of reference audio
+                shape: (batch_size, mel_channels, time)
+            length_scale (float, optional): controls speech pace.
+                Increase value to slow down generated speech and vice versa.
+        Returns:
+            dict: {
+                "encoder_outputs": torch.Tensor, shape: (batch_size, n_feats, max_mel_length),
+                # Average mel spectrogram generated by the encoder
+                "decoder_outputs": torch.Tensor, shape: (batch_size, n_feats, max_mel_length),
+                # Refined mel spectrogram improved by the CFM
+                "attn": torch.Tensor, shape: (batch_size, max_text_length, max_mel_length),
+                # Alignment map between text and mel spectrogram
+        """
+        # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
+        c = self.ref_encoder(y, None)
+        x, mu_x, x_mask = self.encoder(x, c, x_lengths)
+        logw = self.dp(x, x_mask, c)
+        w = torch.exp(logw) * x_mask
+        w_ceil = torch.ceil(w) * length_scale
+        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
+        y_max_length = y_lengths.max()
+        # Using obtained durations `w` construct alignment map `attn`
+        y_mask = sequence_mask(y_lengths, y_max_length).unsqueeze(1).to(x_mask.dtype)
+        attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
+        attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
+        # Align encoded text and get mu_y
+        mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
+        mu_y = mu_y.transpose(1, 2)
+        encoder_outputs = mu_y[:, :, :y_max_length]
+        # Generate sample tracing the probability flow
+        if cfg == 1.0:
+            decoder_outputs = self.decoder(mu_y, y_mask, n_timesteps, temperature, c, solver)
+        else:
+            cfg_kwargs = {'fake_speaker': self.fake_speaker, 'fake_content': self.fake_content, 'cfg_strength': cfg}
+            decoder_outputs = self.decoder(mu_y, y_mask, n_timesteps, temperature, c, solver, cfg_kwargs)
+        decoder_outputs = decoder_outputs[:, :, :y_max_length]
+        return {
+            "encoder_outputs": encoder_outputs,
+            "decoder_outputs": decoder_outputs,
+            "attn": attn[:, :, :y_max_length],
+        }
+    def forward(self, x, x_lengths, y, y_lengths, z, z_lengths):
+        """
+        Computes 3 losses:
+            1. duration loss: loss between predicted token durations and those extracted by Monotinic Alignment Search (MAS).
+            2. prior loss: loss between mel-spectrogram and encoder outputs.
+            3. flow matching loss: loss between mel-spectrogram and decoder outputs.
+        Args:
+            x (torch.Tensor): batch of texts, converted to a tensor with phoneme embedding ids.
+                shape: (batch_size, max_text_length)
+            x_lengths (torch.Tensor): lengths of texts in batch.
+                shape: (batch_size,)
+            y (torch.Tensor): batch of corresponding mel-spectrograms.
+                shape: (batch_size, n_feats, max_mel_length)
+            y_lengths (torch.Tensor): lengths of mel-spectrograms in batch.
+                shape: (batch_size,)
+            z (torch.Tensor): batch of cliced mel-spectrograms.
+                shape: (batch_size, n_feats, max_mel_length)
+            z_lengths (torch.Tensor): lengths of sliced mel-spectrograms in batch.
+                shape: (batch_size,)
+        """
+        # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
+        y_mask = sequence_mask(y_lengths, y.size(2)).unsqueeze(1).to(y.dtype)
+        z_mask = sequence_mask(z_lengths, z.size(2)).unsqueeze(1).to(z.dtype)
+        cfg_mask = torch.rand(y.size(0), 1, device=y.device) > self.cfg_dropout
+        # compute global speaker embedding
+        c = self.ref_encoder(z, z_mask)  * cfg_mask + ~cfg_mask * self.fake_speaker.repeat(z.size(0), 1)
+        x, mu_x, x_mask = self.encoder(x, c, x_lengths)
+        logw = self.dp(x, x_mask, c)
+        attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
+        # Use MAS to find most likely alignment `attn` between text and mel-spectrogram
+        with torch.no_grad():
+            s_p_sq_r = torch.ones_like(mu_x) # [b, d, t]
+            neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi)- torch.zeros_like(mu_x), [1], keepdim=True)
+            neg_cent2 = torch.einsum("bdt, bds -> bts", -0.5 * (y**2), s_p_sq_r)
+            neg_cent3 = torch.einsum("bdt, bds -> bts", y, (mu_x * s_p_sq_r))
+            neg_cent4 = torch.sum(-0.5 * (mu_x**2) * s_p_sq_r, [1], keepdim=True)
+            neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
+            attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
+            attn = (monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach())
+        # Compute loss between predicted log-scaled durations and those obtained from MAS
+        # refered to as prior loss in the paper
+        logw_ = torch.log(1e-8 + attn.sum(2)) * x_mask
+        dur_loss = duration_loss(logw, logw_, x_lengths)
+        # Align encoded text with mel-spectrogram and get mu_y segment
+        attn = attn.squeeze(1).transpose(1,2)
+        mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
+        mu_y = mu_y.transpose(1, 2)
+        # Compute loss of the decoder
+        cfg_mask = cfg_mask.unsqueeze(-1)
+        mu_y_masked = mu_y  * cfg_mask + ~cfg_mask * self.fake_content.repeat(mu_y.size(0), 1, mu_y.size(-1)) # mask content information for better diversity for flow-matching
+        diff_loss, _ = self.decoder.compute_loss(y, y_mask, mu_y_masked, c)
+        prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask)
+        prior_loss = prior_loss / (torch.sum(y_mask) * self.mel_channels)
+        return dur_loss, diff_loss, prior_loss, attn

models/reference_encoder.py ADDED Viewed

	@@ -0,0 +1,168 @@

+import torch
+import torch.nn as nn
+class Conv1dGLU(nn.Module):
+    """
+    Conv1d + GLU(Gated Linear Unit) with residual connection.
+    For GLU refer to https://arxiv.org/abs/1612.08083 paper.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, dropout):
+        super(Conv1dGLU, self).__init__()
+        self.out_channels = out_channels
+        self.conv1 = nn.Conv1d(in_channels, 2 * out_channels, kernel_size=kernel_size, padding=kernel_size // 2)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, x):
+        residual = x
+        x = self.conv1(x)
+        x1, x2 = torch.split(x, self.out_channels, dim=1)
+        x = x1 * torch.sigmoid(x2)
+        x = residual + self.dropout(x)
+        return x
+# modified from https://github.com/RVC-Boss/GPT-SoVITS/blob/main/GPT_SoVITS/module/modules.py#L766
+class MelStyleEncoder(nn.Module):
+    """MelStyleEncoder"""
+    def __init__(
+        self,
+        n_mel_channels=80,
+        style_hidden=128,
+        style_vector_dim=256,
+        style_kernel_size=5,
+        style_head=2,
+        dropout=0.1,
+    ):
+        super(MelStyleEncoder, self).__init__()
+        self.in_dim = n_mel_channels
+        self.hidden_dim = style_hidden
+        self.out_dim = style_vector_dim
+        self.kernel_size = style_kernel_size
+        self.n_head = style_head
+        self.dropout = dropout
+        self.spectral = nn.Sequential(
+            nn.Linear(self.in_dim, self.hidden_dim),
+            nn.Mish(inplace=True),
+            nn.Dropout(self.dropout),
+            nn.Linear(self.hidden_dim, self.hidden_dim),
+            nn.Mish(inplace=True),
+            nn.Dropout(self.dropout),
+        )
+        self.temporal = nn.Sequential(
+            Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout),
+            Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout),
+        )
+        self.slf_attn = nn.MultiheadAttention(
+            self.hidden_dim,
+            self.n_head,
+            self.dropout,
+            batch_first=True
+        )
+        self.fc = nn.Linear(self.hidden_dim, self.out_dim)
+    def temporal_avg_pool(self, x, mask=None):
+        if mask is None:
+            return torch.mean(x, dim=1)
+        else:
+            return torch.sum(x * ~mask.unsqueeze(-1), dim=1) / (~mask).sum(dim=1).unsqueeze(1)
+    def forward(self, x, x_mask=None):
+        x = x.transpose(1, 2)
+        # spectral
+        x = self.spectral(x)
+        # temporal
+        x = x.transpose(1, 2)
+        x = self.temporal(x)
+        x = x.transpose(1, 2)
+        # self-attention
+        if x_mask is not None:
+            x_mask = ~x_mask.squeeze(1).to(torch.bool)
+        x, _ = self.slf_attn(x, x, x, key_padding_mask=x_mask, need_weights=False)
+        # fc
+        x = self.fc(x)
+        # temoral average pooling
+        w = self.temporal_avg_pool(x, mask=x_mask)
+        return w
+# Attention Pool version of MelStyleEncoder, not used
+class AttnMelStyleEncoder(nn.Module):
+    """MelStyleEncoder"""
+    def __init__(
+        self,
+        n_mel_channels=80,
+        style_hidden=128,
+        style_vector_dim=256,
+        style_kernel_size=5,
+        style_head=2,
+        dropout=0.1,
+    ):
+        super().__init__()
+        self.in_dim = n_mel_channels
+        self.hidden_dim = style_hidden
+        self.out_dim = style_vector_dim
+        self.kernel_size = style_kernel_size
+        self.n_head = style_head
+        self.dropout = dropout
+        self.spectral = nn.Sequential(
+            nn.Linear(self.in_dim, self.hidden_dim),
+            nn.Mish(inplace=True),
+            nn.Dropout(self.dropout),
+            nn.Linear(self.hidden_dim, self.hidden_dim),
+            nn.Mish(inplace=True),
+            nn.Dropout(self.dropout),
+        )
+        self.temporal = nn.Sequential(
+            Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout),
+            Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout),
+        )
+        self.slf_attn = nn.MultiheadAttention(
+            self.hidden_dim,
+            self.n_head,
+            self.dropout,
+            batch_first=True
+        )
+        self.fc = nn.Linear(self.hidden_dim, self.out_dim)
+    def temporal_avg_pool(self, x, mask=None):
+        if mask is None:
+            return torch.mean(x, dim=1)
+        else:
+            return torch.sum(x * ~mask.unsqueeze(-1), dim=1) / (~mask).sum(dim=1).unsqueeze(1)
+    def forward(self, x, x_mask=None):
+        x = x.transpose(1, 2)
+        # spectral
+        x = self.spectral(x)
+        # temporal
+        x = x.transpose(1, 2)
+        x = self.temporal(x)
+        x = x.transpose(1, 2)
+        # self-attention
+        if x_mask is not None:
+            x_mask = ~x_mask.squeeze(1).to(torch.bool)
+            zeros = torch.zeros(x_mask.size(0), 1, device=x_mask.device, dtype=x_mask.dtype)
+            x_attn_mask = torch.cat((zeros, x_mask), dim=1)
+        else:
+            x_attn_mask = None
+        avg = self.temporal_avg_pool(x, x_mask).unsqueeze(1)
+        x = torch.cat([avg, x], dim=1)
+        x, _ = self.slf_attn(x, x, x, key_padding_mask=x_attn_mask, need_weights=False)
+        x = x[:, 0, :]
+        # fc
+        x = self.fc(x)
+        return x

models/text_encoder.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import torch
+import torch.nn as nn
+from models.diffusion_transformer import DiTConVBlock
+from utils.mask import sequence_mask
+# modified from https://github.com/jaywalnut310/vits/blob/main/models.py
+class TextEncoder(nn.Module):
+    def __init__(self, n_vocab, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, gin_channels):
+        super().__init__()
+        self.n_vocab = n_vocab
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.scale = self.hidden_channels ** 0.5
+        self.emb = nn.Embedding(n_vocab, hidden_channels)
+        nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
+        self.encoder = nn.ModuleList([DiTConVBlock(hidden_channels, filter_channels, n_heads, kernel_size, p_dropout, gin_channels) for _ in range(n_layers)])
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.initialize_weights()
+    def initialize_weights(self):
+        for block in self.encoder:
+            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+    def forward(self, x: torch.Tensor, c: torch.Tensor, x_lengths: torch.Tensor):
+        x = self.emb(x) * self.scale  # [b, t, h]
+        x = x.transpose(1, -1)  # [b, h, t]
+        x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)
+        for layer in self.encoder:
+            x = layer(x, c, x_mask)
+        mu_x = self.proj(x) * x_mask
+        return x, mu_x, x_mask

monotonic_align/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from numpy import zeros, int32, float32
+from torch import from_numpy
+from .core import maximum_path_jit
+def maximum_path(neg_cent, mask):
+    device = neg_cent.device
+    dtype = neg_cent.dtype
+    neg_cent = neg_cent.data.cpu().numpy().astype(float32)
+    path = zeros(neg_cent.shape, dtype=int32)
+    t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(int32)
+    t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(int32)
+    maximum_path_jit(path, neg_cent, t_t_max, t_s_max)
+    return from_numpy(path).to(device=device, dtype=dtype)

monotonic_align/core.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import numba
+# Compiled eagerly for C-contiguous int32 paths / float32 values and int32 length vectors.
+@numba.jit(
+    numba.void(
+        numba.int32[:, :, ::1],
+        numba.float32[:, :, ::1],
+        numba.int32[::1],
+        numba.int32[::1],
+    ),
+    nopython=True,
+    nogil=True,
+)
+def maximum_path_jit(paths, values, t_ys, t_xs):
+    # For every batch item: accumulate best scores into `values` in place, then
+    # backtrack and mark the chosen cells with 1 in `paths`. Nothing is returned.
+    b = paths.shape[0]
+    max_neg_val = -1e9
+    for i in range(int(b)):
+        path = paths[i]
+        value = values[i]
+        t_y = t_ys[i]
+        t_x = t_xs[i]
+        v_prev = v_cur = 0.0
+        # backtracking starts from the last x position
+        index = t_x - 1
+        # forward pass: only x in [max(0, t_x + y - t_y), min(t_x, y + 1)) is visited for each y
+        for y in range(t_y):
+            for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
+                # v_cur: score of staying on the same x (coming from (y - 1, x)); ruled out on the diagonal
+                if x == y:
+                    v_cur = max_neg_val
+                else:
+                    v_cur = value[y - 1, x]
+                # v_prev: score of advancing x (coming from (y - 1, x - 1)); only (0, 0) may start a path
+                if x == 0:
+                    if y == 0:
+                        v_prev = 0.0
+                    else:
+                        v_prev = max_neg_val
+                else:
+                    v_prev = value[y - 1, x - 1]
+                value[y, x] += max(v_prev, v_cur)
+        # backward pass: walk y from the end, stepping x down when forced (index == y)
+        # or when the diagonal predecessor has the higher accumulated score
+        for y in range(t_y - 1, -1, -1):
+            path[y, index] = 1
+            if index != 0 and (
+                index == y or value[y - 1, index] < value[y - 1, index - 1]
+            ):
+                index = index - 1

requirements.txt ADDED Viewed

	@@ -0,0 +1,33 @@

+torch
+torchaudio
+tqdm
+numpy
+soundfile # to make sure that torchaudio has at least one valid backend
+tensorboard
+# for monotonic_align
+numba
+# ODE-solver
+torchdiffeq
+# for g2p
+# chinese
+pypinyin
+jieba
+# english
+eng_to_ipa
+unidecode
+inflect
+# japanese
+# if pyopenjtalk fails to download open_jtalk_dic_utf_8-1.11.tar.gz, manually download and extract the archive below
+# https://github.com/r9y9/open_jtalk/releases/download/v1.11.1/open_jtalk_dic_utf_8-1.11.tar.gz
+# and set os.environ['OPEN_JTALK_DICT_DIR'] to the folder path
+pyopenjtalk-prebuilt # if using python >= 3.12, install pyopenjtalk instead
+# for webui
+gradio
+matplotlib

text/LICENSE ADDED Viewed

	@@ -0,0 +1,19 @@

+Copyright (c) 2017 Keith Ito
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

text/__init__.py ADDED Viewed

	@@ -0,0 +1,71 @@

+""" from https://github.com/keithito/tacotron """
+from text import cleaners
+from text.symbols import symbols
+# Mappings from symbol to numeric ID and vice versa:
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+_id_to_symbol = {i: s for i, s in enumerate(symbols)}
+def text_to_sequence(text, symbols, cleaner_names):
+  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+      text: string to convert to a sequence
+      cleaner_names: names of the cleaner functions to run the text through
+    Returns:
+      List of integers corresponding to the symbols in the text
+  '''
+  sequence = []
+  symbol_to_id = {s: i for i, s in enumerate(symbols)}
+  clean_text = _clean_text(text, cleaner_names)
+  print(clean_text)
+  print(f" length:{len(clean_text)}")
+  for symbol in clean_text:
+    if symbol not in symbol_to_id.keys():
+      continue
+    symbol_id = symbol_to_id[symbol]
+    sequence += [symbol_id]
+  print(f" length:{len(sequence)}")
+  return sequence
+def cleaned_text_to_sequence(cleaned_text):
+  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+      text: string to convert to a sequence
+    Returns:
+      List of integers corresponding to the symbols in the text
+  '''
+  # symbol_to_id = {s: i for i, s in enumerate(symbols)}
+  sequence = [_symbol_to_id[symbol] for symbol in cleaned_text if symbol in _symbol_to_id.keys()]
+  return sequence
+def cleaned_text_to_sequence_chinese(cleaned_text):
+  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+      text: string to convert to a sequence
+    Returns:
+      List of integers corresponding to the symbols in the text
+  '''
+  # symbol_to_id = {s: i for i, s in enumerate(symbols)}
+  sequence = [_symbol_to_id[symbol] for symbol in cleaned_text.split(' ') if symbol in _symbol_to_id.keys()]
+  return sequence
+def sequence_to_text(sequence):
+  '''Converts a sequence of IDs back to a string'''
+  result = ''
+  for symbol_id in sequence:
+    s = _id_to_symbol[symbol_id]
+    result += s
+  return result
+def _clean_text(text, cleaner_names):
+  for name in cleaner_names:
+    cleaner = getattr(cleaners, name)
+    if not cleaner:
+      raise Exception('Unknown cleaner: %s' % name)
+    text = cleaner(text)
+  return text

text/cleaners.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import re
+from text.english import english_to_ipa2
+from text.mandarin import chinese_to_cnm3
+from text.japanese import japanese_to_ipa2
+# language tag -> numeric id ("PAD" is 0)
+language_module_map = {"PAD":0, "ZH": 1, "EN": 2, "JA": 3}
+# Precompiled regular expressions
+# NOTE(review): ZH_PATTERN and JP_PATTERN overlap on \u4E00-\u9FAF and \u3000-\u303F
+ZH_PATTERN = re.compile(r'[\u3400-\u4DBF\u4e00-\u9FFF\uF900-\uFAFF\u3000-\u303F]')
+# NOTE(review): inside the character class `*-_` is a range (U+002A..U+005F), so digits and
+# other ASCII symbols in that range also match — confirm this is intended before changing it
+EN_PATTERN = re.compile(r'[a-zA-Z.,!?\'"(){}[\]<>:;@#$%^&*-_+=/\\|~`]+')
+JP_PATTERN = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF\u31F0-\u31FF\uFF00-\uFFEF\u3000-\u303F]')
+# matches the explicit language tags [ZH], [EN] and [JA]
+CLEANER_PATTERN = re.compile(r'\[(ZH|EN|JA)\]')
+def detect_language(text: str, prev_lang=None):
+    """
+    根据给定的文本检测语言
+    :param text: 输入文本
+    :param prev_lang: 上一个检测到的语言
+    :return: 'ZH' for Chinese, 'EN' for English, 'JA' for Japanese, or prev_lang for spaces
+    """
+    if ZH_PATTERN.search(text): return 'ZH'
+    if EN_PATTERN.search(text): return 'EN'
+    if JP_PATTERN.search(text): return 'JA'
+    if text.isspace(): return prev_lang  # 若是空格，则返回前一个语言
+    return None
+# auto detect language using re
+def cjke_cleaners4(text: str):
+    """
+    根据文本内容自动检测语言并转换为IPA音标
+    :param text: 输入文本
+    :return: 转换为IPA音标的文本
+    """
+    text = CLEANER_PATTERN.sub('', text)
+    pointer = 0
+    output = ''
+    current_language = detect_language(text[pointer])
+    while pointer < len(text):
+        temp_text = ''
+        while pointer < len(text) and detect_language(text[pointer], current_language) == current_language:
+            temp_text += text[pointer]
+            pointer += 1
+        if current_language == 'ZH':
+            output += chinese_to_cnm3(temp_text)
+        elif current_language == 'JA':
+            output += japanese_to_ipa2(temp_text)
+        elif current_language == 'EN':
+            output += english_to_ipa2(temp_text)
+        if pointer < len(text):
+            current_language = detect_language(text[pointer])
+    output = re.sub(r'\s+$', '', output)
+    output = re.sub(r'([^\.,!\?\-…~])$', r'\1.', output)
+    return output

text/cn2an/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+__version__ = "0.5.22"
+from .cn2an import Cn2An
+from .an2cn import An2Cn
+from .transform import Transform
+# Module-level shortcuts: bound methods of converter instances created at import time
+cn2an = Cn2An().cn2an
+an2cn = An2Cn().an2cn
+transform = Transform().transform
+__all__ = [
+    "__version__",
+    "cn2an",
+    "an2cn",
+    "transform"
+]

text/cn2an/an2cn.py ADDED Viewed

	@@ -0,0 +1,204 @@

+from typing import Union
+from warnings import warn
+# from proces import preprocess
+from .conf import NUMBER_LOW_AN2CN, NUMBER_UP_AN2CN, UNIT_LOW_ORDER_AN2CN, UNIT_UP_ORDER_AN2CN
+class An2Cn(object):
+    def __init__(self) -> None:
+        self.all_num = "0123456789"
+        self.number_low = NUMBER_LOW_AN2CN
+        self.number_up = NUMBER_UP_AN2CN
+        self.mode_list = ["low", "up", "rmb", "direct"]
+    def an2cn(self, inputs: Union[str, int, float] = None, mode: str = "low") -> str:
+        """阿拉伯数字转中文数字
+        :param inputs: 阿拉伯数字
+        :param mode: low 小写数字，up 大写数字，rmb 人民币大写，direct 直接转化
+        :return: 中文数字
+        """
+        if inputs is not None and inputs != "":
+            if mode not in self.mode_list:
+                raise ValueError(f"mode 仅支持 {str(self.mode_list)} ！")
+            # 将数字转化为字符串，这里会有Python会自动做转化
+            # 1. -> 1.0 1.00 -> 1.0 -0 -> 0
+            if not isinstance(inputs, str):
+                inputs = self.__number_to_string(inputs)
+            # 数据预处理：
+            # 1. 繁体转简体
+            # 2. 全角转半角
+            # inputs = preprocess(inputs, pipelines=[
+            #     "traditional_to_simplified",
+            #     "full_angle_to_half_angle"
+            # ])
+            # 检查数据是否有效
+            self.__check_inputs_is_valid(inputs)
+            # 判断正负
+            if inputs[0] == "-":
+                sign = "负"
+                inputs = inputs[1:]
+            else:
+                sign = ""
+            if mode == "direct":
+                output = self.__direct_convert(inputs)
+            else:
+                # 切割整数部分和小数部分
+                split_result = inputs.split(".")
+                len_split_result = len(split_result)
+                if len_split_result == 1:
+                    # 不包含小数的输入
+                    integer_data = split_result[0]
+                    if mode == "rmb":
+                        output = self.__integer_convert(integer_data, "up") + "元整"
+                    else:
+                        output = self.__integer_convert(integer_data, mode)
+                elif len_split_result == 2:
+                    # 包含小数的输入
+                    integer_data, decimal_data = split_result
+                    if mode == "rmb":
+                        int_data = self.__integer_convert(integer_data, "up")
+                        dec_data = self.__decimal_convert(decimal_data, "up")
+                        len_dec_data = len(dec_data)
+                        if len_dec_data == 0:
+                            output = int_data + "元整"
+                        elif len_dec_data == 1:
+                            raise ValueError(f"异常输出：{dec_data}")
+                        elif len_dec_data == 2:
+                            if dec_data[1] != "零":
+                                if int_data == "零":
+                                    output = dec_data[1] + "角"
+                                else:
+                                    output = int_data + "元" + dec_data[1] + "角"
+                            else:
+                                output = int_data + "元整"
+                        else:
+                            if dec_data[1] != "零":
+                                if dec_data[2] != "零":
+                                    if int_data == "零":
+                                        output = dec_data[1] + "角" + dec_data[2] + "分"
+                                    else:
+                                        output = int_data + "元" + dec_data[1] + "角" + dec_data[2] + "分"
+                                else:
+                                    if int_data == "零":
+                                        output = dec_data[1] + "角"
+                                    else:
+                                        output = int_data + "元" + dec_data[1] + "角"
+                            else:
+                                if dec_data[2] != "零":
+                                    if int_data == "零":
+                                        output = dec_data[2] + "分"
+                                    else:
+                                        output = int_data + "元" + "零" + dec_data[2] + "分"
+                                else:
+                                    output = int_data + "元整"
+                    else:
+                        output = self.__integer_convert(integer_data, mode) + self.__decimal_convert(decimal_data, mode)
+                else:
+                    raise ValueError(f"输入格式错误：{inputs}！")
+        else:
+            raise ValueError("输入数据为空！")
+        return sign + output
+    def __direct_convert(self, inputs: str) -> str:
+        _output = ""
+        for d in inputs:
+            if d == ".":
+                _output += "点"
+            else:
+                _output += self.number_low[int(d)]
+        return _output
+    @staticmethod
+    def __number_to_string(number_data: Union[int, float]) -> str:
+        # 小数处理：python 会自动把 0.00005 转化成 5e-05，因此 str(0.00005) != "0.00005"
+        string_data = str(number_data)
+        if "e" in string_data:
+            string_data_list = string_data.split("e")
+            string_key = string_data_list[0]
+            string_value = string_data_list[1]
+            if string_value[0] == "-":
+                string_data = "0." + "0" * (int(string_value[1:]) - 1) + string_key
+            else:
+                string_data = string_key + "0" * int(string_value)
+        return string_data
+    def __check_inputs_is_valid(self, check_data: str) -> None:
+        # 检查输入数据是否在规定的字典中
+        all_check_keys = self.all_num + ".-"
+        for data in check_data:
+            if data not in all_check_keys:
+                raise ValueError(f"输入的数据不在转化范围内：{data}！")
+    def __integer_convert(self, integer_data: str, mode: str) -> str:
+        """Render the integer part ``integer_data`` as a Chinese numeral.
+        :param integer_data: string of decimal digits (leading zeros allowed).
+        :param mode: ``"low"`` for the NUMBER_LOW/UNIT_LOW_ORDER tables,
+            ``"up"`` for the NUMBER_UP/UNIT_UP_ORDER tables.
+        :return: the Chinese numeral; ``"零"`` when nothing else remains.
+        :raises ValueError: on an unknown mode, or when the number has more
+            digits than the unit table has entries.
+        """
+        if mode == "low":
+            numeral_list = NUMBER_LOW_AN2CN
+            unit_list = UNIT_LOW_ORDER_AN2CN
+        elif mode == "up":
+            numeral_list = NUMBER_UP_AN2CN
+            unit_list = UNIT_UP_ORDER_AN2CN
+        else:
+            raise ValueError(f"error mode: {mode}")
+        # Strip leading zeros, e.g. 007 => 7.
+        integer_data = str(int(integer_data))
+        len_integer_data = len(integer_data)
+        if len_integer_data > len(unit_list):
+            raise ValueError(f"超出数据范围，最长支持 {len(unit_list)} 位")
+        output_an = ""
+        for i, d in enumerate(integer_data):
+            # len_integer_data - i - 1 is the digit's position counted from the right.
+            if int(d):
+                output_an += numeral_list[int(d)] + unit_list[len_integer_data - i - 1]
+            else:
+                # A zero at a position divisible by 4 still emits its unit so the
+                # group marker is kept; the resulting "零万"/"零亿" is cleaned up below.
+                if not (len_integer_data - i - 1) % 4:
+                    output_an += numeral_list[int(d)] + unit_list[len_integer_data - i - 1]
+                # Append one zero placeholder unless the output already ends in "零".
+                if i > 0 and not output_an[-1] == "零":
+                    output_an += numeral_list[int(d)]
+        # Collapse doubled zeros, drop zeros directly before 万/亿, drop a 万 that
+        # directly follows 亿, then trim zeros from both ends.
+        output_an = output_an.replace("零零", "零").replace("零万", "万").replace("零亿", "亿").replace("亿万", "亿") \
+            .strip("零")
+        # Handle the "一十X" case: drop the leading "一" so the result starts with "十".
+        if output_an[:2] in ["一十"]:
+            output_an = output_an[1:]
+        # Nothing left after trimming (e.g. the integer part of a value between 0 and 1).
+        if not output_an:
+            output_an = "零"
+        return output_an
+    def __decimal_convert(self, decimal_data: str, o_mode: str) -> str:
+        len_decimal_data = len(decimal_data)
+        if len_decimal_data > 16:
+            warn(f"注意：小数部分长度为 {len_decimal_data} ，将自动截取前 16 位有效精度！")
+            decimal_data = decimal_data[:16]
+        if len_decimal_data:
+            output_an = "点"
+        else:
+            output_an = ""
+        if o_mode == "low":
+            numeral_list = NUMBER_LOW_AN2CN
+        elif o_mode == "up":
+            numeral_list = NUMBER_UP_AN2CN
+        else:
+            raise ValueError(f"error mode: {o_mode}")
+        for data in decimal_data:
+            output_an += numeral_list[int(data)]
+        return output_an

text/cn2an/cn2an.py ADDED Viewed

	@@ -0,0 +1,294 @@

+import re
+from warnings import warn
+from typing import Union
+# from proces import preprocess
+from .an2cn import An2Cn
+from .conf import NUMBER_CN2AN, UNIT_CN2AN, STRICT_CN_NUMBER, NORMAL_CN_NUMBER, NUMBER_LOW_AN2CN, UNIT_LOW_AN2CN
+class Cn2An(object):
+    def __init__(self) -> None:
+        self.all_num = "".join(list(NUMBER_CN2AN.keys()))
+        self.all_unit = "".join(list(UNIT_CN2AN.keys()))
+        self.strict_cn_number = STRICT_CN_NUMBER
+        self.normal_cn_number = NORMAL_CN_NUMBER
+        self.check_key_dict = {
+            "strict": "".join(self.strict_cn_number.values()) + "点负",
+            "normal": "".join(self.normal_cn_number.values()) + "点负",
+            "smart": "".join(self.normal_cn_number.values()) + "点负" + "01234567890.-"
+        }
+        self.pattern_dict = self.__get_pattern()
+        self.ac = An2Cn()
+        self.mode_list = ["strict", "normal", "smart"]
+        self.yjf_pattern = re.compile(fr"^.*?[元圆][{self.all_num}]角([{self.all_num}]分)?$")
+        self.pattern1 = re.compile(fr"^-?\d+(\.\d+)?[{self.all_unit}]?$")
+        self.ptn_all_num = re.compile(f"^[{self.all_num}]+$")
+        # "十?" is for special case "十一万三"
+        self.ptn_speaking_mode = re.compile(f"^([{self.all_num}]{{0,2}}[{self.all_unit}])+[{self.all_num}]$")
+    def cn2an(self, inputs: Union[str, int, float] = None, mode: str = "strict") -> Union[float, int]:
+        """中文数字转阿拉伯数字
+        :param inputs: 中文数字、阿拉伯数字、中文数字和阿拉伯数字
+        :param mode: strict 严格，normal 正常，smart 智能
+        :return: 阿拉伯数字
+        """
+        if inputs is not None or inputs == "":
+            if mode not in self.mode_list:
+                raise ValueError(f"mode 仅支持 {str(self.mode_list)} ！")
+            # 将数字转化为字符串
+            if not isinstance(inputs, str):
+                inputs = str(inputs)
+            # 数据预处理：
+            # 1. 繁体转简体
+            # 2. 全角转半角
+            # inputs = preprocess(inputs, pipelines=[
+            #     "traditional_to_simplified",
+            #     "full_angle_to_half_angle"
+            # ])
+            # 特殊转化 廿
+            inputs = inputs.replace("廿", "二十")
+            # 检查输入数据是否有效
+            sign, integer_data, decimal_data, is_all_num = self.__check_input_data_is_valid(inputs, mode)
+            # smart 下的特殊情况
+            if sign == 0:
+                return integer_data
+            else:
+                if not is_all_num:
+                    if decimal_data is None:
+                        output = self.__integer_convert(integer_data)
+                    else:
+                        output = self.__integer_convert(integer_data) + self.__decimal_convert(decimal_data)
+                        # fix 1 + 0.57 = 1.5699999999999998
+                        output = round(output, len(decimal_data))
+                else:
+                    if decimal_data is None:
+                        output = self.__direct_convert(integer_data)
+                    else:
+                        output = self.__direct_convert(integer_data) + self.__decimal_convert(decimal_data)
+                        # fix 1 + 0.57 = 1.5699999999999998
+                        output = round(output, len(decimal_data))
+        else:
+            raise ValueError("输入数据为空！")
+        return sign * output
+    def __get_pattern(self) -> dict:
+        # 整数严格检查
+        _0 = "[零]"
+        _1_9 = "[一二三四五六七八九]"
+        _10_99 = f"{_1_9}?[十]{_1_9}?"
+        _1_99 = f"({_10_99}|{_1_9})"
+        _100_999 = f"({_1_9}[百]([零]{_1_9})?|{_1_9}[百]{_10_99})"
+        _1_999 = f"({_100_999}|{_1_99})"
+        _1000_9999 = f"({_1_9}[千]([零]{_1_99})?|{_1_9}[千]{_100_999})"
+        _1_9999 = f"({_1000_9999}|{_1_999})"
+        _10000_99999999 = f"({_1_9999}[万]([零]{_1_999})?|{_1_9999}[万]{_1000_9999})"
+        _1_99999999 = f"({_10000_99999999}|{_1_9999})"
+        _100000000_9999999999999999 = f"({_1_99999999}[亿]([零]{_1_99999999})?|{_1_99999999}[亿]{_10000_99999999})"
+        _1_9999999999999999 = f"({_100000000_9999999999999999}|{_1_99999999})"
+        str_int_pattern = f"^({_0}|{_1_9999999999999999})$"
+        nor_int_pattern = f"^({_0}|{_1_9999999999999999})$"
+        str_dec_pattern = "^[零一二三四五六七八九]{0,15}[一二三四五六七八九]$"
+        nor_dec_pattern = "^[零一二三四五六七八九]{0,16}$"
+        for str_num in self.strict_cn_number.keys():
+            str_int_pattern = str_int_pattern.replace(str_num, self.strict_cn_number[str_num])
+            str_dec_pattern = str_dec_pattern.replace(str_num, self.strict_cn_number[str_num])
+        for nor_num in self.normal_cn_number.keys():
+            nor_int_pattern = nor_int_pattern.replace(nor_num, self.normal_cn_number[nor_num])
+            nor_dec_pattern = nor_dec_pattern.replace(nor_num, self.normal_cn_number[nor_num])
+        pattern_dict = {
+            "strict": {
+                "int": re.compile(str_int_pattern),
+                "dec": re.compile(str_dec_pattern)
+            },
+            "normal": {
+                "int": re.compile(nor_int_pattern),
+                "dec": re.compile(nor_dec_pattern)
+            }
+        }
+        return pattern_dict
+    def __copy_num(self, num):
+        cn_num = ""
+        for n in num:
+            cn_num += NUMBER_LOW_AN2CN[int(n)]
+        return cn_num
+    def __check_input_data_is_valid(self, check_data: str, mode: str) -> (int, str, str, bool):
+        # 去除 元整、圆整、元正、圆正
+        stop_words = ["元整", "圆整", "元正", "圆正"]
+        for word in stop_words:
+            if check_data[-2:] == word:
+                check_data = check_data[:-2]
+        # 去除 元、圆
+        if mode != "strict":
+            normal_stop_words = ["圆", "元"]
+            for word in normal_stop_words:
+                if check_data[-1] == word:
+                    check_data = check_data[:-1]
+        # 处理元角分
+        result = self.yjf_pattern.search(check_data)
+        if result:
+            check_data = check_data.replace("元", "点").replace("角", "").replace("分", "")
+        # 处理特殊问法：一千零十一 一万零百一十一
+        if "零十" in check_data:
+            check_data = check_data.replace("零十", "零一十")
+        if "零百" in check_data:
+            check_data = check_data.replace("零百", "零一百")
+        for data in check_data:
+            if data not in self.check_key_dict[mode]:
+                raise ValueError(f"当前为{mode}模式，输入的数据不在转化范围内：{data}！")
+        # 确定正负号
+        if check_data[0] == "负":
+            check_data = check_data[1:]
+            sign = -1
+        else:
+            sign = 1
+        if "点" in check_data:
+            split_data = check_data.split("点")
+            if len(split_data) == 2:
+                integer_data, decimal_data = split_data
+                # 将 smart 模式中的阿拉伯数字转化成中文数字
+                if mode == "smart":
+                    integer_data = re.sub(r"\d+", lambda x: self.ac.an2cn(x.group()), integer_data)
+                    decimal_data = re.sub(r"\d+", lambda x: self.__copy_num(x.group()), decimal_data)
+                    mode = "normal"
+            else:
+                raise ValueError("数据中包含不止一个点！")
+        else:
+            integer_data = check_data
+            decimal_data = None
+            # 将 smart 模式中的阿拉伯数字转化成中文数字
+            if mode == "smart":
+                # 10.1万 10.1
+                result1 = self.pattern1.search(integer_data)
+                if result1:
+                    if result1.group() == integer_data:
+                        if integer_data[-1] in UNIT_CN2AN.keys():
+                            output = int(float(integer_data[:-1]) * UNIT_CN2AN[integer_data[-1]])
+                        else:
+                            output = float(integer_data)
+                        return 0, output, None, None
+                integer_data = re.sub(r"\d+", lambda x: self.ac.an2cn(x.group()), integer_data)
+                mode = "normal"
+        result_int = self.pattern_dict[mode]["int"].search(integer_data)
+        if result_int:
+            if result_int.group() == integer_data:
+                if decimal_data is not None:
+                    result_dec = self.pattern_dict[mode]["dec"].search(decimal_data)
+                    if result_dec:
+                        if result_dec.group() == decimal_data:
+                            return sign, integer_data, decimal_data, False
+                else:
+                    return sign, integer_data, decimal_data, False
+        else:
+            if mode == "strict":
+                raise ValueError(f"不符合格式的数据：{integer_data}")
+            elif mode == "normal":
+                # 纯数模式：一二三
+                result_all_num = self.ptn_all_num.search(integer_data)
+                if result_all_num:
+                    if result_all_num.group() == integer_data:
+                        if decimal_data is not None:
+                            result_dec = self.pattern_dict[mode]["dec"].search(decimal_data)
+                            if result_dec:
+                                if result_dec.group() == decimal_data:
+                                    return sign, integer_data, decimal_data, True
+                        else:
+                            return sign, integer_data, decimal_data, True
+                # 口语模式：一万二，两千三，三百四，十三万六，一百二十五万���
+                result_speaking_mode = self.ptn_speaking_mode.search(integer_data)
+                if len(integer_data) >= 3 and result_speaking_mode and result_speaking_mode.group() == integer_data:
+                    # len(integer_data)>=3: because the minimum length of integer_data that can be matched is 3
+                    # to find the last unit
+                    last_unit = result_speaking_mode.groups()[-1][-1]
+                    _unit = UNIT_LOW_AN2CN[UNIT_CN2AN[last_unit] // 10]
+                    integer_data = integer_data + _unit
+                    if decimal_data is not None:
+                        result_dec = self.pattern_dict[mode]["dec"].search(decimal_data)
+                        if result_dec:
+                            if result_dec.group() == decimal_data:
+                                return sign, integer_data, decimal_data, False
+                    else:
+                        return sign, integer_data, decimal_data, False
+        raise ValueError(f"不符合格式的数据：{check_data}")
+    def __integer_convert(self, integer_data: str) -> int:
+        # 核心
+        output_integer = 0
+        unit = 1
+        ten_thousand_unit = 1
+        for index, cn_num in enumerate(reversed(integer_data)):
+            # 数值
+            if cn_num in NUMBER_CN2AN:
+                num = NUMBER_CN2AN[cn_num]
+                output_integer += num * unit
+            # 单位
+            elif cn_num in UNIT_CN2AN:
+                unit = UNIT_CN2AN[cn_num]
+                # 判断出万、亿、万亿
+                if unit % 10000 == 0:
+                    # 万 亿
+                    if unit > ten_thousand_unit:
+                        ten_thousand_unit = unit
+                    # 万亿
+                    else:
+                        ten_thousand_unit = unit * ten_thousand_unit
+                        unit = ten_thousand_unit
+                if unit < ten_thousand_unit:
+                    unit = unit * ten_thousand_unit
+                if index == len(integer_data) - 1:
+                    output_integer += unit
+            else:
+                raise ValueError(f"{cn_num} 不在转化范围内")
+        return int(output_integer)
+    def __decimal_convert(self, decimal_data: str) -> float:
+        len_decimal_data = len(decimal_data)
+        if len_decimal_data > 16:
+            warn(f"注意：小数部分长度为 {len_decimal_data} ，将自动截取前 16 位有效精度！")
+            decimal_data = decimal_data[:16]
+            len_decimal_data = 16
+        output_decimal = 0
+        for index in range(len(decimal_data) - 1, -1, -1):
+            unit_key = NUMBER_CN2AN[decimal_data[index]]
+            output_decimal += unit_key * 10 ** -(index + 1)
+        # 处理精度溢出问题
+        output_decimal = round(output_decimal, len_decimal_data)
+        return output_decimal
+    def __direct_convert(self, data: str) -> int:
+        output_data = 0
+        for index in range(len(data) - 1, -1, -1):
+            unit_key = NUMBER_CN2AN[data[index]]
+            output_data += unit_key * 10 ** (len(data) - index - 1)
+        return output_data

text/cn2an/conf.py ADDED Viewed

	@@ -0,0 +1,135 @@

+NUMBER_CN2AN = {
+    "零": 0,
+    "〇": 0,
+    "一": 1,
+    "壹": 1,
+    "幺": 1,
+    "二": 2,
+    "贰": 2,
+    "两": 2,
+    "三": 3,
+    "叁": 3,
+    "四": 4,
+    "肆": 4,
+    "五": 5,
+    "伍": 5,
+    "六": 6,
+    "陆": 6,
+    "七": 7,
+    "柒": 7,
+    "八": 8,
+    "捌": 8,
+    "九": 9,
+    "玖": 9,
+}
+# Chinese unit character -> multiplier, with lower-case and upper-case
+# (financial) variants of 十/百/千.
+UNIT_CN2AN = {
+    "十": 10,
+    "拾": 10,
+    "百": 100,
+    "佰": 100,
+    "千": 1000,
+    "仟": 1000,
+    "万": 10000,
+    "亿": 100000000,
+}
+# Multiplier -> lower-case unit character (reverse lookup of UNIT_CN2AN for
+# the lower-case forms).
+UNIT_LOW_AN2CN = {
+    10: "十",
+    100: "百",
+    1000: "千",
+    10000: "万",
+    100000000: "亿",
+}
+# Digit -> lower-case Chinese numeral.
+NUMBER_LOW_AN2CN = {
+    0: "零",
+    1: "一",
+    2: "二",
+    3: "三",
+    4: "四",
+    5: "五",
+    6: "六",
+    7: "七",
+    8: "八",
+    9: "九",
+}
+# Digit -> upper-case (financial) Chinese numeral.
+NUMBER_UP_AN2CN = {
+    0: "零",
+    1: "壹",
+    2: "贰",
+    3: "叁",
+    4: "肆",
+    5: "伍",
+    6: "陆",
+    7: "柒",
+    8: "捌",
+    9: "玖",
+}
+# Lower-case unit for each digit position, indexed from the right
+# (index 0 = ones place). 16 entries, so at most 16 integer digits are supported.
+UNIT_LOW_ORDER_AN2CN = [
+    "",
+    "十",
+    "百",
+    "千",
+    "万",
+    "十",
+    "百",
+    "千",
+    "亿",
+    "十",
+    "百",
+    "千",
+    "万",
+    "十",
+    "百",
+    "千",
+]
+# Upper-case (financial) unit for each digit position, indexed from the right
+# (index 0 = ones place). Same layout as UNIT_LOW_ORDER_AN2CN.
+UNIT_UP_ORDER_AN2CN = [
+    "",
+    "拾",
+    "佰",
+    "仟",
+    "万",
+    "拾",
+    "佰",
+    "仟",
+    "亿",
+    "拾",
+    "佰",
+    "仟",
+    "万",
+    "拾",
+    "佰",
+    "仟",
+]
+# Canonical character -> all characters accepted in its place in "strict"
+# mode (lower-case plus upper-case/financial forms).
+STRICT_CN_NUMBER = {
+    "零": "零",
+    "一": "一壹",
+    "二": "二贰",
+    "三": "三叁",
+    "四": "四肆",
+    "五": "五伍",
+    "六": "六陆",
+    "七": "七柒",
+    "八": "八捌",
+    "九": "九玖",
+    "十": "十拾",
+    "百": "百佰",
+    "千": "千仟",
+    "万": "万",
+    "亿": "亿",
+}
+# Canonical character -> all characters accepted in its place in "normal"
+# mode: the strict set plus the variants 〇, 幺, 两 and 仨.
+NORMAL_CN_NUMBER = {
+    "零": "零〇",
+    "一": "一壹幺",
+    "二": "二贰两",
+    "三": "三叁仨",
+    "四": "四肆",
+    "五": "五伍",
+    "六": "六陆",
+    "七": "七柒",
+    "八": "八捌",
+    "九": "九玖",
+    "十": "十拾",
+    "百": "百佰",
+    "千": "千仟",
+    "万": "万",
+    "亿": "亿",
+}

text/cn2an/transform.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import re
+from warnings import warn
+from .cn2an import Cn2An
+from .an2cn import An2Cn
+from .conf import UNIT_CN2AN
+class Transform(object):
+    def __init__(self) -> None:
+        self.all_num = "零一二三四五六七八九"
+        self.all_unit = "".join(list(UNIT_CN2AN.keys()))
+        self.cn2an = Cn2An().cn2an
+        self.an2cn = An2Cn().an2cn
+        self.cn_pattern = f"负?([{self.all_num}{self.all_unit}]+点)?[{self.all_num}{self.all_unit}]+"
+        self.smart_cn_pattern = f"-?([0-9]+.)?[0-9]+[{self.all_unit}]+"
+    def transform(self, inputs: str, method: str = "cn2an") -> str:
+        if method == "cn2an":
+            inputs = inputs.replace("廿", "二十").replace("半", "0.5").replace("两", "2")
+            # date
+            inputs = re.sub(
+                fr"((({self.smart_cn_pattern})|({self.cn_pattern}))年)?([{self.all_num}十]+月)?([{self.all_num}十]+日)?",
+                lambda x: self.__sub_util(x.group(), "cn2an", "date"), inputs)
+            # fraction
+            inputs = re.sub(fr"{self.cn_pattern}分之{self.cn_pattern}",
+                            lambda x: self.__sub_util(x.group(), "cn2an", "fraction"), inputs)
+            # percent
+            inputs = re.sub(fr"百分之{self.cn_pattern}",
+                            lambda x: self.__sub_util(x.group(), "cn2an", "percent"), inputs)
+            # celsius
+            inputs = re.sub(fr"{self.cn_pattern}摄氏度",
+                            lambda x: self.__sub_util(x.group(), "cn2an", "celsius"), inputs)
+            # number
+            output = re.sub(self.cn_pattern,
+                            lambda x: self.__sub_util(x.group(), "cn2an", "number"), inputs)
+        elif method == "an2cn":
+            # date
+            inputs = re.sub(r"(\d{2,4}年)?(\d{1,2}月)?(\d{1,2}日)?",
+                            lambda x: self.__sub_util(x.group(), "an2cn", "date"), inputs)
+            # fraction
+            inputs = re.sub(r"\d+/\d+",
+                            lambda x: self.__sub_util(x.group(), "an2cn", "fraction"), inputs)
+            # percent
+            inputs = re.sub(r"-?(\d+\.)?\d+%",
+                            lambda x: self.__sub_util(x.group(), "an2cn", "percent"), inputs)
+            # celsius
+            inputs = re.sub(r"\d+℃",
+                            lambda x: self.__sub_util(x.group(), "an2cn", "celsius"), inputs)
+            # number
+            output = re.sub(r"-?(\d+\.)?\d+",
+                            lambda x: self.__sub_util(x.group(), "an2cn", "number"), inputs)
+        else:
+            raise ValueError(f"error method: {method}, only support 'cn2an' and 'an2cn'!")
+        return output
+    def __sub_util(self, inputs, method: str = "cn2an", sub_mode: str = "number") -> str:
+        try:
+            if inputs:
+                if method == "cn2an":
+                    if sub_mode == "date":
+                        return re.sub(fr"(({self.smart_cn_pattern})|({self.cn_pattern}))",
+                                      lambda x: str(self.cn2an(x.group(), "smart")), inputs)
+                    elif sub_mode == "fraction":
+                        if inputs[0] != "百":
+                            frac_result = re.sub(self.cn_pattern,
+                                                 lambda x: str(self.cn2an(x.group(), "smart")), inputs)
+                            numerator, denominator = frac_result.split("分之")
+                            return f"{denominator}/{numerator}"
+                        else:
+                            return inputs
+                    elif sub_mode == "percent":
+                        return re.sub(f"(?<=百分之){self.cn_pattern}",
+                                      lambda x: str(self.cn2an(x.group(), "smart")), inputs).replace("百分之", "") + "%"
+                    elif sub_mode == "celsius":
+                        return re.sub(f"{self.cn_pattern}(?=摄氏度)",
+                                      lambda x: str(self.cn2an(x.group(), "smart")), inputs).replace("摄氏度", "℃")
+                    elif sub_mode == "number":
+                        return str(self.cn2an(inputs, "smart"))
+                    else:
+                        raise Exception(f"error sub_mode: {sub_mode} !")
+                else:
+                    if sub_mode == "date":
+                        inputs = re.sub(r"\d+(?=年)",
+                                        lambda x: self.an2cn(x.group(), "direct"), inputs)
+                        return re.sub(r"\d+",
+                                      lambda x: self.an2cn(x.group(), "low"), inputs)
+                    elif sub_mode == "fraction":
+                        frac_result = re.sub(r"\d+", lambda x: self.an2cn(x.group(), "low"), inputs)
+                        numerator, denominator = frac_result.split("/")
+                        return f"{denominator}分之{numerator}"
+                    elif sub_mode == "celsius":
+                        return self.an2cn(inputs[:-1], "low") + "摄氏度"
+                    elif sub_mode == "percent":
+                        return "百分之" + self.an2cn(inputs[:-1], "low")
+                    elif sub_mode == "number":
+                        return self.an2cn(inputs, "low")
+                    else:
+                        raise Exception(f"error sub_mode: {sub_mode} !")
+        except Exception as e:
+            warn(str(e))
+            return inputs

text/cnm3/ds_CNM3.txt ADDED Viewed

	@@ -0,0 +1,606 @@

+a,a
+ai,ai
+ai0,a0 I0
+an,an
+an0,a0 N0
+ang,ang
+ang0,A0 ng0
+ao,ao
+ao0,A0 O0
+ba,b a
+bai,b a0 I0
+ban,b a0 N0
+bang,b A0 ng0
+bao,b A0 O0
+be,b e
+bei,b E0 I0
+ben,b e0 N0
+beng,b e0 ng0
+ber,b er
+bi,b i
+bia,b ia
+bian,b iE0 N0
+biang,b iA0 ng0
+biao,b iA0 O0
+bie,b ie
+bin,b i N0
+bing,b i ng0
+biong,b iO0 ng0
+biu,b io0 U0
+bo,b o
+bong,b oo0 ng0
+bou,b o0 U0
+bu,b u
+bua,b ua
+buai,b ua0 I0
+buan,b ua0 N0
+buang,b uA0 ng0
+bui,b uE0 I0
+bun,b ue0 N0
+bv,b v
+bve,b ve
+ca,c a
+cai,c a0 I0
+can,c a0 N0
+cang,c A0 ng0
+cao,c A0 O0
+ce,c e
+cei,c E0 I0
+cen,c e0 N0
+ceng,c e0 ng0
+cer,c er
+cha,ch a
+chai,ch a0 I0
+chan,ch a0 N0
+chang,ch A0 ng0
+chao,ch A0 O0
+che,ch e
+chei,ch E0 I0
+chen,ch e0 N0
+cheng,ch e0 ng0
+cher,ch er
+chi,ch ir
+chong,ch oo0 ng0
+chou,ch o0 U0
+chu,ch u
+chua,ch ua
+chuai,ch ua0 I0
+chuan,ch ua0 N0
+chuang,ch uA0 ng0
+chui,ch uE0 I0
+chun,ch ue0 N0
+chuo,ch uo
+chv,ch v
+chyi,ch i
+ci,c i0
+cong,c oo0 ng0
+cou,c o0 U0
+cu,c u
+cua,c ua
+cuai,c ua0 I0
+cuan,c ua0 N0
+cuang,c uA0 ng0
+cui,c uE0 I0
+cun,c ue0 N0
+cuo,c uo
+cv,c v
+cyi,c i
+da,d a
+dai,d a0 I0
+dan,d a0 N0
+dang,d A0 ng0
+dao,d A0 O0
+de,d e
+dei,d E0 I0
+den,d e0 N0
+deng,d e0 ng0
+der,d er
+di,d i
+dia,d ia
+dian,d iE0 N0
+diang,d iA0 ng0
+diao,d iA0 O0
+die,d ie
+din,d i N0
+ding,d i ng0
+diong,d iO0 ng0
+diu,d io0 U0
+dong,d oo0 ng0
+dou,d o0 U0
+du,d u
+dua,d ua
+duai,d ua0 I0
+duan,d ua0 N0
+duang,d uA0 ng0
+dui,d uE0 I0
+dun,d ue0 N0
+duo,d uo
+dv,d v
+dve,d ve
+e,e
+ei,E0 I0
+en,e0 N0
+eng,e0 ng0
+er,er
+fa,f a
+fai,f a0 I0
+fan,f a0 N0
+fang,f A0 ng0
+fao,f A0 O0
+fe,f e
+fei,f E0 I0
+fen,f e0 N0
+feng,f e0 ng0
+fer,f er
+fi,f i
+fia,f ia
+fian,f iE0 N0
+fiang,f iA0 ng0
+fiao,f iA0 O0
+fie,f ie
+fin,f i N0
+fing,f i ng0
+fiong,f iO0 ng0
+fiu,f io0 U0
+fo,f o
+fong,f oo0 ng0
+fou,f o0 U0
+fu,f u
+fua,f ua
+fuai,f ua0 I0
+fuan,f ua0 N0
+fuang,f uA0 ng0
+fui,f uE0 I0
+fun,f ue0 N0
+fv,f v
+fve,f ve
+ga,g a
+gai,g a0 I0
+gan,g a0 N0
+gang,g A0 ng0
+gao,g A0 O0
+ge,g e
+gei,g E0 I0
+gen,g e0 N0
+geng,g e0 ng0
+ger,g er
+gi,g i
+gia,g ia
+gian,g iE0 N0
+giang,g iA0 ng0
+giao,g iA0 O0
+gie,g ie
+gin,g i N0
+ging,g i ng0
+giong,g iO0 ng0
+giu,g io0 U0
+gong,g oo0 ng0
+gou,g o0 U0
+gu,g u
+gua,g ua
+guai,g ua0 I0
+guan,g ua0 N0
+guang,g uA0 ng0
+gui,g uE0 I0
+gun,g ue0 N0
+guo,g uo
+gv,g v
+gve,g ve
+ha,h a
+hai,h a0 I0
+han,h a0 N0
+hang,h A0 ng0
+hao,h A0 O0
+he,h e
+hei,h E0 I0
+hen,h e0 N0
+heng,h e0 ng0
+her,h er
+hi,h i
+hia,h ia
+hian,h iE0 N0
+hiang,h iA0 ng0
+hiao,h iA0 O0
+hie,h ie
+hin,h i N0
+hing,h i ng0
+hiong,h iO0 ng0
+hiu,h io0 U0
+hong,h oo0 ng0
+hou,h o0 U0
+hu,h u
+hua,h ua
+huai,h ua0 I0
+huan,h ua0 N0
+huang,h uA0 ng0
+hui,h uE0 I0
+hun,h ue0 N0
+huo,h uo
+hv,h v
+hve,h ve
+ji,j i
+jia,j ia
+jian,j iE0 N0
+jiang,j iA0 ng0
+jiao,j iA0 O0
+jie,j ie
+jin,j i N0
+jing,j i ng0
+jiong,j iO0 ng0
+jiu,j io0 U0
+ju,j v
+juan,j vE0 N0
+jue,j ve
+jun,j v0 N0
+ka,k a
+kai,k a0 I0
+kan,k a0 N0
+kang,k A0 ng0
+kao,k A0 O0
+ke,k e
+kei,k E0 I0
+ken,k e0 N0
+keng,k e0 ng0
+ker,k er
+ki,k i
+kia,k ia
+kian,k iE0 N0
+kiang,k iA0 ng0
+kiao,k iA0 O0
+kie,k ie
+kin,k i N0
+king,k i ng0
+kiong,k iO0 ng0
+kiu,k io0 U0
+kong,k oo0 ng0
+kou,k o0 U0
+ku,k u
+kua,k ua
+kuai,k ua0 I0
+kuan,k ua0 N0
+kuang,k uA0 ng0
+kui,k uE0 I0
+kun,k ue0 N0
+kuo,k uo
+kv,k v
+kve,k ve
+la,l a
+lai,l a0 I0
+lan,l a0 N0
+lang,l A0 ng0
+lao,l A0 O0
+le,l e
+lei,l E0 I0
+len,l e0 N0
+leng,l e0 ng0
+ler,l er
+li,l i
+lia,l ia
+lian,l iE0 N0
+liang,l iA0 ng0
+liao,l iA0 O0
+lie,l ie
+lin,l i N0
+ling,l i ng0
+liong,l iO0 ng0
+liu,l io0 U0
+lo,l o
+long,l oo0 ng0
+lou,l o0 U0
+lu,l u
+lua,l ua
+luai,l ua0 I0
+luan,l ua0 N0
+luang,l uA0 ng0
+lui,l uE0 I0
+lun,l ue0 N0
+luo,l uo
+lv,l v
+lve,l ve
+ma,m a
+mai,m a0 I0
+man,m a0 N0
+mang,m A0 ng0
+mao,m A0 O0
+me,m e
+mei,m E0 I0
+men,m e0 N0
+meng,m e0 ng0
+mer,m er
+mi,m i
+mia,m ia
+mian,m iE0 N0
+miang,m iA0 ng0
+miao,m iA0 O0
+mie,m ie
+min,m i N0
+ming,m i ng0
+miong,m iO0 ng0
+miu,m io0 U0
+mo,m o
+mong,m oo0 ng0
+mou,m o0 U0
+mu,m u
+mua,m ua
+muai,m ua0 I0
+muan,m ua0 N0
+muang,m uA0 ng0
+mui,m uE0 I0
+mun,m ue0 N0
+mv,m v
+mve,m ve
+n,ng
+na,n a
+nai,n a0 I0
+nan,n a0 N0
+nang,n A0 ng0
+nao,n A0 O0
+ne,n e
+nei,n E0 I0
+nen,n e0 N0
+neng,n e0 ng0
+ner,n er
+ni,n i
+nia,n ia
+nian,n iE0 N0
+niang,n iA0 ng0
+niao,n iA0 O0
+nie,n ie
+nin,n i N0
+ning,n i ng0
+niong,n iO0 ng0
+niu,n io0 U0
+nong,n oo0 ng0
+nou,n o0 U0
+nu,n u
+nua,n ua
+nuai,n ua0 I0
+nuan,n ua0 N0
+nuang,n uA0 ng0
+nui,n uE0 I0
+nun,n ue0 N0
+nuo,n uo
+nv,n v
+nve,n ve
+o,o
+ong,ong
+ou,ou
+pa,p a
+pai,p a0 I0
+pan,p a0 N0
+pang,p A0 ng0
+pao,p A0 O0
+pe,p e
+pei,p E0 I0
+pen,p e0 N0
+peng,p e0 ng0
+per,p er
+pi,p i
+pia,p ia
+pian,p iE0 N0
+piang,p iA0 ng0
+piao,p iA0 O0
+pie,p ie
+pin,p i N0
+ping,p i ng0
+piong,p iO0 ng0
+piu,p io0 U0
+po,p o
+pong,p oo0 ng0
+pou,p o0 U0
+pu,p u
+pua,p ua
+puai,p ua0 I0
+puan,p ua0 N0
+puang,p uA0 ng0
+pui,p uE0 I0
+pun,p ue0 N0
+pv,p v
+pve,p ve
+qi,q i
+qia,q ia
+qian,q iE0 N0
+qiang,q iA0 ng0
+qiao,q iA0 O0
+qie,q ie
+qin,q i N0
+qing,q i ng0
+qiong,q iO0 ng0
+qiu,q io0 U0
+qu,q v
+quan,q vE0 N0
+que,q ve
+qun,q v0 N0
+ra,r a
+rai,r a0 I0
+ran,r a0 N0
+rang,r A0 ng0
+rao,r A0 O0
+re,r e
+rei,r E0 I0
+ren,r e0 N0
+reng,r e0 ng0
+rer,r er
+ri,r ir
+rong,r oo0 ng0
+rou,r o0 U0
+ru,r u
+rua,r ua
+ruai,r ua0 I0
+ruan,r ua0 N0
+ruang,r uA0 ng0
+rui,r uE0 I0
+run,r ue0 N0
+ruo,r uo
+rv,r v
+ryi,r i
+sa,s a
+sai,s a0 I0
+san,s a0 N0
+sang,s A0 ng0
+sao,s A0 O0
+se,s e
+sei,s E0 I0
+sen,s e0 N0
+seng,s e0 ng0
+ser,s er
+sha,sh a
+shai,sh a0 I0
+shan,sh a0 N0
+shang,sh A0 ng0
+shao,sh A0 O0
+she,sh e
+shei,sh E0 I0
+shen,sh e0 N0
+sheng,sh e0 ng0
+sher,sh er
+shi,sh ir
+shong,sh oo0 ng0
+shou,sh o0 U0
+shu,sh u
+shua,sh ua
+shuai,sh ua0 I0
+shuan,sh ua0 N0
+shuang,sh uA0 ng0
+shui,sh uE0 I0
+shun,sh ue0 N0
+shuo,sh uo
+shv,sh v
+shyi,sh i
+si,s i0
+song,s oo0 ng0
+sou,s o0 U0
+su,s u
+sua,s ua
+suai,s ua0 I0
+suan,s ua0 N0
+suang,s uA0 ng0
+sui,s uE0 I0
+sun,s ue0 N0
+suo,s uo
+sv,s v
+syi,s i
+ta,t a
+tai,t a0 I0
+tan,t a0 N0
+tang,t A0 ng0
+tao,t A0 O0
+te,t e
+tei,t E0 I0
+ten,t e0 N0
+teng,t e0 ng0
+ter,t er
+ti,t i
+tia,t ia
+tian,t iE0 N0
+tiang,t iA0 ng0
+tiao,t iA0 O0
+tie,t ie
+tin,t i N0
+ting,t i ng0
+tiong,t iO0 ng0
+tong,t oo0 ng0
+tou,t o0 U0
+tu,t u
+tua,t ua
+tuai,t ua0 I0
+tuan,t ua0 N0
+tuang,t uA0 ng0
+tui,t uE0 I0
+tun,t ue0 N0
+tuo,t uo
+tv,t v
+tve,t ve
+wa,w a
+wai,w a0 I0
+wan,w a0 N0
+wang,w A0 ng0
+wao,w A0 O0
+we,w e
+wei,w E0 I0
+wen,w e0 N0
+weng,w e0 ng0
+wer,w er
+wi,w i
+wo,w o
+wong,w oo0 ng0
+wou,w o0 U0
+wu,w u
+xi,x i
+xia,x ia
+xian,x iE0 N0
+xiang,x iA0 ng0
+xiao,x iA0 O0
+xie,x ie
+xin,x i N0
+xing,x i ng0
+xiong,x iO0 ng0
+xiu,x io0 U0
+xu,x v
+xuan,x vE0 N0
+xue,x ve
+xun,x v0 N0
+ya,y a
+yai,y a0 I0
+yan,y iE0 N0
+yang,y A0 ng0
+yao,y A0 O0
+ye,y E
+yei,y E0 I0
+yi,y i
+yin,y i N0
+ying,y i ng0
+yo,y o
+yong,y oo0 ng0
+you,y o0 U0
+yu,y v
+yuan,y vE0 N0
+yue,y ve
+yun,y v0 N0
+ywu,y u
+za,z a
+zai,z a0 I0
+zan,z a0 N0
+zang,z A0 ng0
+zao,z A0 O0
+ze,z e
+zei,z E0 I0
+zen,z e0 N0
+zeng,z e0 ng0
+zer,z er
+zha,zh a
+zhai,zh a0 I0
+zhan,zh a0 N0
+zhang,zh A0 ng0
+zhao,zh A0 O0
+zhe,zh e
+zhei,zh E0 I0
+zhen,zh e0 N0
+zheng,zh e0 ng0
+zher,zh er
+zhi,zh ir
+zhong,zh oo0 ng0
+zhou,zh o0 U0
+zhu,zh u
+zhua,zh ua
+zhuai,zh ua0 I0
+zhuan,zh ua0 N0
+zhuang,zh uA0 ng0
+zhui,zh uE0 I0
+zhun,zh ue0 N0
+zhuo,zh uo
+zhv,zh v
+zhyi,zh i
+zi,z i0
+zong,z oo0 ng0
+zou,z o0 U0
+zu,z u
+zua,z ua
+zuai,z ua0 I0
+zuan,z ua0 N0
+zuang,z uA0 ng0
+zui,z uE0 I0
+zun,z ue0 N0
+zuo,z uo
+zv,z v
+zyi,z i

text/custom_pypinyin_dict/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # -- coding: utf-8 --

text/custom_pypinyin_dict/cc_cedict_0.py ADDED Viewed

The diff for this file is too large to render. See raw diff

text/custom_pypinyin_dict/cc_cedict_1.py ADDED Viewed

The diff for this file is too large to render. See raw diff

text/custom_pypinyin_dict/cc_cedict_2.py ADDED Viewed

The diff for this file is too large to render. See raw diff

text/custom_pypinyin_dict/cc_cedict_3.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+# Warning: Auto-generated file, don't edit.
+phrases_dict = {
+    '𰻝𰻝面': [['biáng'], ['biáng'], ['miàn']],
+}
+from pypinyin import load_phrases_dict
def load():
    """Register this module's ``phrases_dict`` with pypinyin via ``load_phrases_dict``."""
    load_phrases_dict(phrases_dict)

text/custom_pypinyin_dict/genshin.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+phrases_dict = {
+    '㐖毒': [['xié'], ['dú']],
+    '若陀': [['rě'], ['tuó']],
+    '平藏': [['píng'], ['zàng']],
+    '派蒙': [['pài'], ['méng']],
+    '安柏': [['ān'], ['bó']],
+    '一斗': [['yī'], ['dǒu']]
+}

text/custom_pypinyin_dict/phrase_pinyin_data.py ADDED Viewed

	@@ -0,0 +1,24 @@

+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+from pypinyin import load_phrases_dict
+from text.custom_pypinyin_dict import cc_cedict_0
+from text.custom_pypinyin_dict import cc_cedict_1
+from text.custom_pypinyin_dict import cc_cedict_2
+from text.custom_pypinyin_dict import cc_cedict_3
+from text.custom_pypinyin_dict import genshin
+phrases_dict = {}
+phrases_dict.update(cc_cedict_0.phrases_dict)
+phrases_dict.update(cc_cedict_1.phrases_dict)
+phrases_dict.update(cc_cedict_2.phrases_dict)
+phrases_dict.update(cc_cedict_3.phrases_dict)
+phrases_dict.update(genshin.phrases_dict)
def load():
    """Register the merged ``phrases_dict`` with pypinyin and print a confirmation line."""
    load_phrases_dict(phrases_dict)
    print("加载自定义词典成功")
+if __name__ == '__main__':
+    print(phrases_dict)

text/english.py ADDED Viewed

	@@ -0,0 +1,175 @@

+""" from https://github.com/keithito/tacotron """
+'''
+Cleaners are transformations that run over the input text at both training and eval time.
+Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+  1. "english_cleaners" for English text
+  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+     the symbols in symbols.py to match your data).
+'''
+# Regular expression matching whitespace:
+import re
+import inflect
+from unidecode import unidecode
+import eng_to_ipa as ipa
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
+_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_number_re = re.compile(r'[0-9]+')
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('mrs', 'misess'),
+    ('mr', 'mister'),
+    ('dr', 'doctor'),
+    ('st', 'saint'),
+    ('co', 'company'),
+    ('jr', 'junior'),
+    ('maj', 'major'),
+    ('gen', 'general'),
+    ('drs', 'doctors'),
+    ('rev', 'reverend'),
+    ('lt', 'lieutenant'),
+    ('hon', 'honorable'),
+    ('sgt', 'sergeant'),
+    ('capt', 'captain'),
+    ('esq', 'esquire'),
+    ('ltd', 'limited'),
+    ('col', 'colonel'),
+    ('ft', 'fort'),
+]]
+# List of (ipa, lazy ipa) pairs:
+_lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('r', 'ɹ'),
+    ('æ', 'e'),
+    ('ɑ', 'a'),
+    ('ɔ', 'o'),
+    ('ð', 'z'),
+    ('θ', 's'),
+    ('ɛ', 'e'),
+    ('ɪ', 'i'),
+    ('ʊ', 'u'),
+    ('ʒ', 'ʥ'),
+    ('ʤ', 'ʥ'),
+    ('ˈ', '↓'),
+]]
+# List of (ipa, lazy ipa2) pairs:
+_lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('r', 'ɹ'),
+    ('ð', 'z'),
+    ('θ', 's'),
+    ('ʒ', 'ʑ'),
+    ('ʤ', 'dʑ'),
+    ('ˈ', '↓'),
+]]
+# List of (ipa, ipa2) pairs
+_ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('r', 'ɹ'),
+    ('ʤ', 'dʒ'),
+    ('ʧ', 'tʃ')
+]]
def expand_abbreviations(text):
    """Return *text* with every pattern in ``_abbreviations`` replaced by its expansion.

    The (compiled pattern, replacement) pairs are applied in list order.
    """
    expanded = text
    for pattern, full_form in _abbreviations:
        expanded = pattern.sub(full_form, expanded)
    return expanded
def collapse_whitespace(text):
    """Replace every run of whitespace in *text* with a single space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)
def _remove_commas(m):
    """Return the first capture group of match *m* with all commas removed."""
    captured = m.group(1)
    return ''.join(captured.split(','))
def _expand_decimal_point(m):
    """Return the first capture group of match *m* with each '.' spelled as ' point '."""
    captured = m.group(1)
    return ' point '.join(captured.split('.'))
def _expand_dollars(m):
    """Spell out the dollar amount captured in group 1 of match *m*.

    The capture is split on '.', giving a dollars part and an optional cents
    part. Amounts with more than one '.' are returned verbatim followed by
    ' dollars'. A zero amount yields 'zero dollars'.
    """
    amount = m.group(1)
    pieces = amount.split('.')
    if len(pieces) > 2:
        # Unexpected format: leave the digits untouched.
        return amount + ' dollars'

    dollars = int(pieces[0]) if pieces[0] else 0
    cents = int(pieces[1]) if len(pieces) > 1 and pieces[1] else 0

    spoken = []
    if dollars:
        spoken.append('%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars'))
    if cents:
        spoken.append('%s %s' % (cents, 'cent' if cents == 1 else 'cents'))
    if not spoken:
        return 'zero dollars'
    return ', '.join(spoken)
def _expand_ordinal(m):
    """Pass the whole matched ordinal (e.g. digits plus suffix) to ``_inflect.number_to_words``."""
    ordinal_text = m.group(0)
    return _inflect.number_to_words(ordinal_text)
def _expand_number(m):
    """Spell out the integer matched by *m* using the ``_inflect`` engine.

    Values strictly between 1000 and 3000 get special handling: 2000,
    2001-2009, exact hundreds, and everything else read in two-digit groups
    with 'oh' for zero. All other values are spelled without 'and'.
    """
    num = int(m.group(0))
    if not 1000 < num < 3000:
        return _inflect.number_to_words(num, andword='')
    if num == 2000:
        return 'two thousand'
    if 2000 < num < 2010:
        return 'two thousand ' + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + ' hundred'
    grouped = _inflect.number_to_words(num, andword='', zero='oh', group=2)
    return grouped.replace(', ', ' ')
def normalize_numbers(text):
    """Rewrite numeric expressions in *text* as words.

    The substitutions run in a fixed order: strip thousands commas, pounds,
    dollars, decimals, ordinals, then remaining plain integers.
    """
    steps = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in steps:
        text = pattern.sub(replacement, text)
    return text
def mark_dark_l(text):
    """Replace 'l' with 'ɫ' when only non-vowel, non-space characters separate it from a space or the end of *text*."""
    def _darken(match):
        return 'ɫ' + match.group(1)

    return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', _darken, text)
def english_to_ipa(text):
    """Convert English *text* to an IPA string.

    Pipeline: ``unidecode`` + lowercase, abbreviation expansion, number
    normalisation, ``ipa.convert``, then whitespace collapsing.
    """
    cleaned = unidecode(text).lower()
    cleaned = normalize_numbers(expand_abbreviations(cleaned))
    return collapse_whitespace(ipa.convert(cleaned))
def english_to_ipa2(text):
    """Convert English *text* to a list of IPA characters.

    Runs ``english_to_ipa``, marks dark l, applies the ``_ipa_to_ipa2``
    substitutions, replaces '...' with '…', and splits into characters.
    """
    phonemes = mark_dark_l(english_to_ipa(text))
    for pattern, substitute in _ipa_to_ipa2:
        phonemes = pattern.sub(substitute, phonemes)
    return list(phonemes.replace('...', '…'))

text/japanese.py ADDED Viewed

	@@ -0,0 +1,157 @@

+import re
+from unidecode import unidecode
+import pyopenjtalk
+# Regular expression matching Japanese without punctuation marks:
+_japanese_characters = re.compile(
+    r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
+# Regular expression matching non-Japanese characters or punctuation marks:
+_japanese_marks = re.compile(
+    r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
+# List of (symbol, Japanese) pairs for marks:
+_symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('％', 'パーセント')
+]]
+# List of (romaji, ipa) pairs for marks:
+_romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('ts', 'ʦ'),
+    ('u', 'ɯ'),
+    ('j', 'ʥ'),
+    ('y', 'j'),
+    ('ni', 'n^i'),
+    ('nj', 'n^'),
+    ('hi', 'çi'),
+    ('hj', 'ç'),
+    ('f', 'ɸ'),
+    ('I', 'i*'),
+    ('U', 'ɯ*'),
+    ('r', 'ɾ')
+]]
+# List of (romaji, ipa2) pairs for marks:
+_romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('u', 'ɯ'),
+    ('ʧ', 'tʃ'),
+    ('j', 'dʑ'),
+    ('y', 'j'),
+    ('ni', 'n^i'),
+    ('nj', 'n^'),
+    ('hi', 'çi'),
+    ('hj', 'ç'),
+    ('f', 'ɸ'),
+    ('I', 'i*'),
+    ('U', 'ɯ*'),
+    ('r', 'ɾ')
+]]
+# List of (consonant, sokuon) pairs:
+_real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [
+    (r'Q([↑↓]*[kg])', r'k#\1'),
+    (r'Q([↑↓]*[tdjʧ])', r't#\1'),
+    (r'Q([↑↓]*[sʃ])', r's\1'),
+    (r'Q([↑↓]*[pb])', r'p#\1')
+]]
+# List of (consonant, hatsuon) pairs:
+_real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [
+    (r'N([↑↓]*[pbm])', r'm\1'),
+    (r'N([↑↓]*[ʧʥj])', r'n^\1'),
+    (r'N([↑↓]*[tdn])', r'n\1'),
+    (r'N([↑↓]*[kg])', r'ŋ\1')
+]]
def symbols_to_japanese(text):
    """Apply every (pattern, replacement) pair in ``_symbols_to_japanese`` to *text*."""
    converted = text
    for pattern, japanese in _symbols_to_japanese:
        converted = pattern.sub(japanese, converted)
    return converted
def japanese_to_romaji_with_accent(text):
    '''Convert Japanese *text* to romaji with accent marks ('↑', '↓') inserted.

    Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html

    The text is split on ``_japanese_marks``; each chunk that starts with a
    Japanese character is run through ``pyopenjtalk.extract_fullcontext`` and
    its phonemes are appended, followed by an accent marker derived from the
    label fields. The separators are re-inserted (via ``unidecode``, spaces
    removed) between chunks.
    '''
    phoneme_re = r'\-([^\+]*)\+'
    silences = ['sil', 'pau']

    source = symbols_to_japanese(text)
    chunks = re.split(_japanese_marks, source)
    separators = re.findall(_japanese_marks, source)

    romaji = ''
    for idx, chunk in enumerate(chunks):
        if re.match(_japanese_characters, chunk):
            if romaji != '':
                romaji += ' '
            labels = pyopenjtalk.extract_fullcontext(chunk)
            for pos, label in enumerate(labels):
                phoneme = re.search(phoneme_re, label).group(1)
                if phoneme in silences:
                    continue
                romaji += phoneme.replace('ch', 'ʧ').replace('sh', 'ʃ').replace('cl', 'Q')

                # Numeric fields pulled out of the current label.
                a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
                a2 = int(re.search(r"\+(\d+)\+", label).group(1))
                a3 = int(re.search(r"\+(\d+)/", label).group(1))

                # Look ahead one label; a silence/pause resets the next a2 to -1.
                following = labels[pos + 1]
                if re.search(phoneme_re, following).group(1) in silences:
                    a2_next = -1
                else:
                    a2_next = int(re.search(r"\+(\d+)\+", following).group(1))

                if a3 == 1 and a2_next == 1:
                    # Accent phrase boundary
                    romaji += ' '
                elif a1 == 0 and a2_next == a2 + 1:
                    # Falling
                    romaji += '↓'
                elif a2 == 1 and a2_next == 2:
                    # Rising
                    romaji += '↑'
        if idx < len(separators):
            romaji += unidecode(separators[idx]).replace(' ', '')
    return romaji
def get_real_sokuon(text):
    """Apply the ``_real_sokuon`` substitutions (rewriting 'Q' before a consonant) to *text*."""
    result = text
    for pattern, substitute in _real_sokuon:
        result = pattern.sub(substitute, result)
    return result
def get_real_hatsuon(text):
    """Apply the ``_real_hatsuon`` substitutions (rewriting 'N' before a consonant) to *text*."""
    result = text
    for pattern, substitute in _real_hatsuon:
        result = pattern.sub(substitute, result)
    return result
def japanese_to_ipa(text):
    """Convert Japanese *text* to an IPA string.

    Starts from accented romaji, turns repeated vowels into vowel + 'ː',
    resolves sokuon and hatsuon, then applies the ``_romaji_to_ipa`` table.
    """
    def _lengthen(match):
        run = match.group(0)
        return run[0] + 'ː' * (len(run) - 1)

    romaji = japanese_to_romaji_with_accent(text).replace('...', '…')
    romaji = re.sub(r'([aiueo])\1+', _lengthen, romaji)
    romaji = get_real_hatsuon(get_real_sokuon(romaji))
    for pattern, ipa_symbol in _romaji_to_ipa:
        romaji = pattern.sub(ipa_symbol, romaji)
    return romaji
def japanese_to_ipa2(text):
    """Convert Japanese *text* to a list of IPA characters.

    Starts from accented romaji, resolves sokuon and hatsuon, applies the
    ``_romaji_to_ipa2`` table, and splits the result into single characters.
    """
    romaji = japanese_to_romaji_with_accent(text).replace('...', '…')
    romaji = get_real_hatsuon(get_real_sokuon(romaji))
    for pattern, ipa_symbol in _romaji_to_ipa2:
        romaji = pattern.sub(ipa_symbol, romaji)
    return list(romaji)
def japanese_to_ipa3(text):
    """Convert Japanese *text* to an IPA string, building on ``japanese_to_ipa2``.

    After the ``japanese_to_ipa2`` conversion this replaces 'n^' with 'ȵ' and
    'ʃ' with 'ɕ', turns the '*' and '#' markers into combining diacritics,
    rewrites repeated vowels as vowel + 'ː', and appends 'ʰ' to ts/tɕ/k/p/t
    at the start of the text or after whitespace.

    Returns:
        The converted text as a single string.
    """
    # japanese_to_ipa2 returns a list of characters; join it back into a
    # string so the str.replace / re.sub calls below work (calling .replace
    # on the list raised AttributeError). Joining is a no-op for a str.
    text = ''.join(japanese_to_ipa2(text))
    text = text.replace('n^', 'ȵ').replace(
        'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a')
    text = re.sub(
        r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
    text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text)
    return text
+if __name__ == '__main__':
+    a = japanese_to_romaji_with_accent('こんにちは！はい、元気です。あなたは？')
+    print(a)

text/mandarin.py ADDED Viewed

	@@ -0,0 +1,173 @@

+import re
+from typing import Dict, List
+from pypinyin import lazy_pinyin, Style
+from .custom_pypinyin_dict import phrase_pinyin_data
+import jieba
+from .cn2an import an2cn
+# 加载自定义拼音词典数据
+phrase_pinyin_data.load()
+# 标点符号正则
+PUNC_MAP: Dict[str, str] = {
+    "：": ",",
+    "；": ",",
+    "，": ",",
+    "。": ".",
+    "！": "!",
+    "？": "?",
+    "\n": ".",
+    "·": ",",
+    "、": ",",
+    "$": ".",
+    "/": ",",
+    "“": "'",
+    "”": "'",
+    '"': "'",
+    "‘": "'",
+    "’": "'",
+    "（": "'",
+    "）": "'",
+    "(": "'",
+    ")": "'",
+    "《": "'",
+    "》": "'",
+    "【": "'",
+    "】": "'",
+    "[": "'",
+    "]": "'",
+    "—": "-",
+    "～": "~",
+    "「": "'",
+    "」": "'",
+    "『": "'",
+    "』": "'",
+}
+# from GPT_SoVITS.text.zh_normalization.text_normlization
+PUNC_MAP.update ({
+    '/': '每',
+    '①': '一',
+    '②': '二',
+    '③': '三',
+    '④': '四',
+    '⑤': '五',
+    '⑥': '六',
+    '⑦': '七',
+    '⑧': '八',
+    '⑨': '九',
+    '⑩': '十',
+    'α': '阿尔法',
+    'β': '贝塔',
+    'γ': '伽玛',
+    'Γ': '伽玛',
+    'δ': '德尔塔',
+    'Δ': '德尔塔',
+    'ε': '艾普西龙',
+    'ζ': '捷塔',
+    'η': '依塔',
+    'θ': '西塔',
+    'Θ': '西塔',
+    'ι': '艾欧塔',
+    'κ': '喀帕',
+    'λ': '拉姆达',
+    'Λ': '拉姆达',
+    'μ': '缪',
+    'ν': '拗',
+    'ξ': '克西',
+    'Ξ': '克西',
+    'ο': '欧米克伦',
+    'π': '派',
+    'Π': '派',
+    'ρ': '肉',
+    'ς': '西格玛',
+    'σ': '西格玛',
+    'Σ': '西格玛',
+    'τ': '套',
+    'υ': '宇普西龙',
+    'φ': '服艾',
+    'Φ': '服艾',
+    'χ': '器',
+    'ψ': '普赛',
+    'Ψ': '普赛',
+    'ω': '欧米伽',
+    'Ω': '欧米伽',
+    '+': '加',
+    '-': '减',
+    '×': '乘',
+    '÷': '除',
+    '=': '等',
+    "嗯": "恩",
+    "呣": "母"
+})
+PUNC_TABLE = str.maketrans(PUNC_MAP)
+# 数字正则化
+NUMBER_PATTERN: re.Pattern = re.compile(r'\d+(?:\.?\d+)?')
+# 阿拉伯数字转汉字
def replace_number(match: re.Match) -> str:
    """Convert the matched Arabic-numeral text to Chinese numerals with ``an2cn``."""
    digits = match.group()
    return an2cn(digits)
def normalize_number(text: str) -> str:
    """Replace every ``NUMBER_PATTERN`` match in *text* using ``replace_number``."""
    return re.sub(NUMBER_PATTERN, replace_number, text)
+# get symbols of phones, not used
def load_pinyin_symbols(path):
    """Read a ``pinyin,phone phone ...`` file and return ``{pinyin: [phones]}``.

    As a side effect, prints every distinct phone suffixed with tones 1-5,
    sorted by string length (a helper for building the symbol table).
    """
    pinyin_dict = {}
    seen_phones = []
    with open(path, "r", encoding='utf-8') as f:
        for line in f.readlines():
            fields = line.strip().split(',')
            phones = fields[1].split(' ')
            pinyin_dict[fields[0]] = phones
            seen_phones.extend(phones)

    unique_phones = list(set(seen_phones))
    tone = [phone + str(i) for phone in unique_phones for i in range(1, 6)]
    print(sorted(tone, key=lambda x: len(x)))
    return pinyin_dict
def load_pinyin_dict(path: str) -> Dict[str, List[str]]:
    """Load a pinyin-to-phones table from a ``pinyin,phone phone ...`` text file.

    Args:
        path: UTF-8 text file with one ``key,value`` entry per line, where the
            value is a whitespace-separated list of phones.

    Returns:
        Mapping from the pinyin key to its list of phones.

    Raises:
        ValueError: if a non-blank line contains no comma.
    """
    pinyin_dict: Dict[str, List[str]] = {}
    with open(path, "r", encoding='utf-8') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # Blank or trailing empty lines used to fail on unpacking.
                continue
            key, value = entry.split(',', 1)
            pinyin_dict[key] = value.split()
    return pinyin_dict
+import os
+pinyin_dict = load_pinyin_dict(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cnm3', 'ds_CNM3.txt'))
+# pinyin_dict = load_pinyin_dict('text/cnm3/ds_CNM3.txt')
def chinese_to_cnm3(text: str) -> List[str]:
    """Convert Chinese *text* into a list of tone-suffixed CNM3 phones.

    Steps: translate punctuation/symbols through ``PUNC_TABLE``, spell out
    numbers, drop a few special characters, segment with jieba, then convert
    each word to tone-numbered pinyin and look each syllable up in
    ``pinyin_dict``.

    Args:
        text: input text.

    Returns:
        A flat list of phones, each suffixed with its tone digit, with
        punctuation emitted as single characters. Tokens ending in a letter
        and digit-terminated tokens whose stem is not in ``pinyin_dict`` are
        skipped.
    """
    # Normalise punctuation and numbers.
    text = text.translate(PUNC_TABLE)
    text = normalize_number(text)
    # Filter out special characters.
    text = re.sub(r'[#&@“”^_|\\]', '', text)
    words = jieba.lcut(text, cut_all=False)
    phones: List[str] = []
    for word in words:
        pinyin_list: List[str] = lazy_pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        for pinyin in pinyin_list:
            if not pinyin:
                # Guard against empty tokens; pinyin[-1] would raise IndexError.
                continue
            last = pinyin[-1]
            if last.isdigit():
                # Tone-numbered syllable: split into stem and tone digit.
                phone = pinyin_dict.get(pinyin[:-1])
                if phone is None:
                    # Stem not in the table (e.g. a passed-through token such
                    # as "m²"): skip instead of raising KeyError.
                    continue
                phones.extend(ph + last for ph in phone)
            elif last.isalpha():
                # Tokens ending in a letter are dropped.
                continue
            else:
                # Anything else (punctuation) is emitted character by character.
                phones.extend(pinyin)
    return phones

text/symbols.py ADDED Viewed

	@@ -0,0 +1,79 @@

+'''
+Defines the set of symbols used in text input to the model.
+'''
+# japanese_cleaners
+# _pad        = '_'
+# _punctuation = ',.!?-'
+# _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
+'''# japanese_cleaners2
+_pad        = '_'
+_punctuation = ',.!?-~…'
+_letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
+'''
+'''# korean_cleaners
+_pad        = '_'
+_punctuation = ',.!?…~'
+_letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
+'''
+'''# chinese_cleaners
+_pad        = '_'
+_punctuation = '，。！？—…'
+_letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
+'''
+# # zh_ja_mixture_cleaners
+# _pad        = '_'
+# _punctuation = ',.!?-~…'
+# _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
+'''# sanskrit_cleaners
+_pad        = '_'
+_punctuation = '।'
+_letters = 'ँंःअआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहऽािीुूृॄेैोौ्ॠॢ '
+'''
+'''# cjks_cleaners
+_pad        = '_'
+_punctuation = ',.!?-~…'
+_letters = 'NQabdefghijklmnopstuvwxyzʃʧʥʦɯɹəɥçɸɾβŋɦː⁼ʰ`^#*=→↓↑ '
+'''
+'''# thai_cleaners
+_pad        = '_'
+_punctuation = '.!? '
+_letters = 'กขฃคฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลวศษสหฬอฮฯะัาำิีึืุูเแโใไๅๆ็่้๊๋์'
+'''
+# # cjke_cleaners2
+_pad        = '_'
+_punctuation = ',.!?-~…' + "'"
+_IPA_letters = 'NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ '
+_CNM3_letters = ['y1', 'y2', 'y3', 'y4', 'y5', 'n1', 'n2', 'n3', 'n4', 'n5', 'p1', 'p2', 'p3', 'p4', 'p5', 'x1', 'x2', 'x3', 'x4', 'x5', 'k1', 'k2', 'k3', 'k4', 'k5', 'l1', 'l2', 'l3', 'l4', 'l5', 'q1', 'q2', 'q3', 'q4', 'q5', 'w1', 'w2', 'w3', 'w4', 'w5', 'E1', 'E2', 'E3', 'E4', 'E5', 'b1', 'b2', 'b3', 'b4', 'b5', 'c1', 'c2', 'c3', 'c4', 'c5', 'z1', 'z2', 'z3', 'z4', 'z5', 'e1', 'e2', 'e3', 'e4', 'e5', 'f1', 'f2', 'f3', 'f4', 'f5', 's1', 's2', 's3', 's4', 's5', 'j1', 'j2', 'j3', 'j4', 'j5', 'o1', 'o2', 'o3', 'o4', 'o5', 'i1', 'i2', 'i3', 'i4', 'i5', 'd1', 'd2', 'd3', 'd4', 'd5', 'm1', 'm2', 'm3', 'm4', 'm5', 't1', 't2', 't3', 't4', 't5', 'h1', 'h2', 'h3', 'h4', 'h5', 'g1', 'g2', 'g3', 'g4', 'g5', 'v1', 'v2', 'v3', 'v4', 'v5', 'r1', 'r2', 'r3', 'r4', 'r5', 'a1', 'a2', 'a3', 'a4', 'a5', 'u1', 'u2', 'u3', 'u4', 'u5', 'I01', 'I02', 'I03', 'I04', 'I05', 'i01', 'i02', 'i03', 'i04', 'i05', 'uo1', 'uo2', 'uo3', 'uo4', 'uo5', 'o01', 'o02', 'o03', 'o04', 'o05', 'U01', 'U02', 'U03', 'U04', 'U05', 'v01', 'v02', 'v03', 'v04', 'v05', 'er1', 'er2', 'er3', 'er4', 'er5', 'A01', 'A02', 'A03', 'A04', 'A05', 'ai1', 'ai2', 'ai3', 'ai4', 'ai5', 'e01', 'e02', 'e03', 'e04', 'e05', 'sh1', 'sh2', 'sh3', 'sh4', 'sh5', 'an1', 'an2', 'an3', 'an4', 'an5', 'ou1', 'ou2', 'ou3', 'ou4', 'ou5', 'ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'a01', 'a02', 'a03', 'a04', 'a05', 'N01', 'N02', 'N03', 'N04', 'N05', 'ao1', 'ao2', 'ao3', 'ao4', 'ao5', 've1', 've2', 've3', 've4', 've5', 'ir1', 'ir2', 'ir3', 'ir4', 'ir5', 'ng1', 'ng2', 'ng3', 'ng4', 'ng5', 'ua1', 'ua2', 'ua3', 'ua4', 'ua5', 'zh1', 'zh2', 'zh3', 'zh4', 'zh5', 'O01', 'O02', 'O03', 'O04', 'O05', 'ie1', 'ie2', 'ie3', 'ie4', 'ie5', 'E01', 'E02', 'E03', 'E04', 'E05', 'ia1', 'ia2', 'ia3', 'ia4', 'ia5', 'iE01', 'iE02', 'iE03', 'iE04', 'iE05', 'ang1', 'ang2', 'ang3', 'ang4', 'ang5', 'ng01', 'ng02', 'ng03', 'ng04', 'ng05', 'io01', 'io02', 'io03', 'io04', 'io05', 'iA01', 'iA02', 'iA03', 'iA04', 'iA05', 'uA01', 'uA02', 'uA03', 'uA04', 'uA05', 'ong1', 'ong2', 
'ong3', 'ong4', 'ong5', 'oo01', 'oo02', 'oo03', 'oo04', 'oo05', 'uE01', 'uE02', 'uE03', 'uE04', 'uE05', 'vE01', 'vE02', 'vE03', 'vE04', 'vE05', 'ue01', 'ue02', 'ue03', 'ue04', 'ue05', 'ua01', 'ua02', 'ua03', 'ua04', 'ua05', 'iO01', 'iO02', 'iO03', 'iO04', 'iO05']
+_additional = ['<sil>', '<asp>']
+# _CNM3_letters = []
+'''# shanghainese_cleaners
+_pad        = '_'
+_punctuation = ',.!?…'
+_letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
+'''
+'''# chinese_dialect_cleaners
+_pad        = '_'
+_punctuation = ',.!?~…─'
+_letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
+'''
+# Export all symbols:
+symbols = [_pad] + list(_punctuation) + list(_IPA_letters) + _CNM3_letters + _additional
+# Special symbol ids
+SPACE_ID = symbols.index(" ")

utils/__init__.py ADDED Viewed

File without changes

utils/audio.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import torch
+from torch import Tensor
+import torch.nn as nn
+import torchaudio
class LinearSpectrogram(nn.Module):
    """Module computing a magnitude spectrogram of a waveform with ``torch.stft``."""

    def __init__(self, n_fft, win_length, hop_length, pad, center, pad_mode):
        super().__init__()
        self.n_fft, self.win_length, self.hop_length = n_fft, win_length, hop_length
        self.pad, self.center, self.pad_mode = pad, center, pad_mode

        # Hann window stored as a buffer on the module.
        self.register_buffer("window", torch.hann_window(win_length))

    def forward(self, waveform: Tensor) -> Tensor:
        """Return the magnitude spectrogram of *waveform*.

        A 3-D input has its dimension 1 squeezed first. The signal is padded
        by ``self.pad`` samples on both sides before the STFT.
        """
        if waveform.ndim == 3:
            waveform = waveform.squeeze(1)

        # Pad on a temporary extra dimension, then remove it again.
        padded = torch.nn.functional.pad(
            waveform.unsqueeze(1), (self.pad, self.pad), self.pad_mode
        ).squeeze(1)

        complex_spec = torch.stft(
            padded,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window,
            center=self.center,
            pad_mode=self.pad_mode,
            normalized=False,
            onesided=True,
            return_complex=True,
        )
        real_imag = torch.view_as_real(complex_spec)
        # Magnitude with a small epsilon inside the square root.
        return torch.sqrt(real_imag.pow(2).sum(-1) + 1e-6)
class LogMelSpectrogram(nn.Module):
    """Module chaining ``LinearSpectrogram``, a torchaudio ``MelScale`` and log compression."""

    def __init__(self, sample_rate, n_fft, win_length, hop_length, f_min, f_max, pad, n_mels, center, pad_mode, mel_scale):
        super().__init__()
        self.sample_rate = sample_rate
        self.n_fft, self.win_length, self.hop_length = n_fft, win_length, hop_length
        self.f_min, self.f_max = f_min, f_max
        self.pad = pad
        self.n_mels = n_mels
        self.center, self.pad_mode = center, pad_mode

        self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, pad, center, pad_mode)
        n_stft = (n_fft // 2) + 1
        # ``self.mel_scale`` ends up holding the MelScale module, not the string argument.
        # NOTE(review): the same ``mel_scale`` string fills two consecutive
        # positional slots of MelScale — confirm against torchaudio's signature.
        self.mel_scale = torchaudio.transforms.MelScale(n_mels, sample_rate, f_min, f_max, n_stft, mel_scale, mel_scale)

    def compress(self, x: Tensor) -> Tensor:
        """Natural log of *x* after clamping it to a minimum of 1e-5."""
        clamped = torch.clamp(x, min=1e-5)
        return torch.log(clamped)

    def decompress(self, x: Tensor) -> Tensor:
        """Exponentiate *x*."""
        return torch.exp(x)

    def forward(self, x: Tensor) -> Tensor:
        """Return the log-compressed mel spectrogram of waveform *x*."""
        mel = self.mel_scale(self.spectrogram(x))
        return self.compress(mel)
def load_and_resample_audio(audio_path, target_sr, device='cpu') -> Tensor:
    """Load an audio file, move it to *device*, keep one channel and resample.

    Args:
        audio_path: path passed to ``torchaudio.load``.
        target_sr: sample rate the returned audio should have.
        device: device the waveform is moved to before further processing.

    Returns:
        A ``[1, time]`` tensor at ``target_sr``, or ``None`` if loading failed
        (the error message is printed).
    """
    try:
        y, sr = torchaudio.load(audio_path)
    except Exception as e:
        # Best-effort loader: report the problem and let the caller handle None.
        print(str(e))
        return None

    # Tensor.to is not in-place; the result must be re-bound, otherwise the
    # waveform silently stays on its original device.
    y = y.to(device)

    # Convert to mono by keeping only the first channel.
    if y.size(0) > 1:
        y = y[0, :].unsqueeze(0)  # shape: [channels, time] -> [time] -> [1, time]

    # Resample audio to the target sample rate.
    if sr != target_sr:
        y = torchaudio.functional.resample(y, sr, target_sr)

    return y

utils/load.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import os
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.nn.parallel import DistributedDataParallel as DDP
def continue_training(checkpoint_path, model: DDP, optimizer: optim.Optimizer) -> int:
    """Load the latest checkpoint (and optimizer state) found in a directory.

    Files are expected to be named ``checkpoint*_<epoch>.pt`` and
    ``optimizer*_<epoch>.pt``. Other ``.pt`` files, and files whose epoch
    suffix is not an integer, are ignored.

    Args:
        checkpoint_path: directory to scan.
        model: wrapper exposing the underlying network as ``model.module``.
        optimizer: optimizer whose state is restored when available.

    Returns:
        The epoch to resume from: ``latest + 1`` when both a model and an
        optimizer file exist for the same epoch, otherwise 0 (after loading
        the newest model-only checkpoint, if any, as pretrained weights).
    """
    model_dict = {}
    optimizer_dict = {}

    # Collect all checkpoint / optimizer files in the directory.
    for file in os.listdir(checkpoint_path):
        if not file.endswith(".pt") or '_' not in file:
            continue
        name, epoch_str = file.rsplit('_', 1)

        # Filter by prefix before parsing the epoch, so unrelated files such
        # as "vocoder_final.pt" cannot crash the scan.
        if name.startswith("checkpoint"):
            target = model_dict
        elif name.startswith("optimizer"):
            target = optimizer_dict
        else:
            continue

        try:
            epoch = int(epoch_str.split('.')[0])
        except ValueError:
            # Non-numeric suffix (e.g. "checkpoint_best.pt"): not an epoch file.
            continue
        target[epoch] = file

    # Resume from the largest epoch that has both model and optimizer state.
    common_epochs = set(model_dict.keys()) & set(optimizer_dict.keys())
    if common_epochs:
        max_epoch = max(common_epochs)
        model_path = os.path.join(checkpoint_path, model_dict[max_epoch])
        optimizer_path = os.path.join(checkpoint_path, optimizer_dict[max_epoch])

        # Load model and optimizer.
        model.module.load_state_dict(torch.load(model_path, map_location='cpu'))
        optimizer.load_state_dict(torch.load(optimizer_path, map_location='cpu'))

        print(f'resume model and optimizer from {max_epoch} epoch')
        return max_epoch + 1

    # No matching pair: fall back to the newest model-only (pretrained) checkpoint.
    if model_dict:
        model_path = os.path.join(checkpoint_path, model_dict[max(model_dict.keys())])
        model.module.load_state_dict(torch.load(model_path, map_location='cpu'))
    return 0