uto1125 committed on
Commit 371f188 · verified · 1 Parent(s): cab1c25

Delete fish_speech

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. fish_speech/__pycache__/conversation.cpython-310.pyc +0 -0
  2. fish_speech/__pycache__/scheduler.cpython-310.pyc +0 -0
  3. fish_speech/callbacks/__init__.py +0 -3
  4. fish_speech/callbacks/__pycache__/__init__.cpython-310.pyc +0 -0
  5. fish_speech/callbacks/__pycache__/grad_norm.cpython-310.pyc +0 -0
  6. fish_speech/callbacks/grad_norm.py +0 -113
  7. fish_speech/configs/base.yaml +0 -87
  8. fish_speech/configs/firefly_gan_vq.yaml +0 -33
  9. fish_speech/configs/lora/r_8_alpha_16.yaml +0 -4
  10. fish_speech/configs/text2semantic_finetune.yaml +0 -83
  11. fish_speech/conversation.py +0 -2
  12. fish_speech/datasets/__pycache__/semantic.cpython-310.pyc +0 -0
  13. fish_speech/datasets/concat_repeat.py +0 -53
  14. fish_speech/datasets/protos/__pycache__/text_data_pb2.cpython-310.pyc +0 -0
  15. fish_speech/datasets/protos/__pycache__/text_data_stream.cpython-310.pyc +0 -0
  16. fish_speech/datasets/protos/text-data.proto +0 -24
  17. fish_speech/datasets/protos/text_data_pb2.py +0 -33
  18. fish_speech/datasets/protos/text_data_stream.py +0 -36
  19. fish_speech/datasets/semantic.py +0 -496
  20. fish_speech/datasets/vqgan.py +0 -147
  21. fish_speech/i18n/README.md +0 -27
  22. fish_speech/i18n/__init__.py +0 -3
  23. fish_speech/i18n/__pycache__/__init__.cpython-310.pyc +0 -0
  24. fish_speech/i18n/__pycache__/core.cpython-310.pyc +0 -0
  25. fish_speech/i18n/core.py +0 -40
  26. fish_speech/i18n/locale/en_US.json +0 -122
  27. fish_speech/i18n/locale/es_ES.json +0 -122
  28. fish_speech/i18n/locale/ja_JP.json +0 -123
  29. fish_speech/i18n/locale/pt_BR.json +0 -133
  30. fish_speech/i18n/locale/zh_CN.json +0 -122
  31. fish_speech/i18n/scan.py +0 -122
  32. fish_speech/models/text2semantic/__init__.py +0 -0
  33. fish_speech/models/text2semantic/__pycache__/__init__.cpython-310.pyc +0 -0
  34. fish_speech/models/text2semantic/__pycache__/lit_module.cpython-310.pyc +0 -0
  35. fish_speech/models/text2semantic/__pycache__/llama.cpython-310.pyc +0 -0
  36. fish_speech/models/text2semantic/__pycache__/lora.cpython-310.pyc +0 -0
  37. fish_speech/models/text2semantic/lit_module.py +0 -202
  38. fish_speech/models/text2semantic/llama.py +0 -779
  39. fish_speech/models/text2semantic/lora.py +0 -92
  40. fish_speech/models/vqgan/__init__.py +0 -0
  41. fish_speech/models/vqgan/__pycache__/__init__.cpython-310.pyc +0 -0
  42. fish_speech/models/vqgan/modules/__pycache__/firefly.cpython-310.pyc +0 -0
  43. fish_speech/models/vqgan/modules/__pycache__/fsq.cpython-310.pyc +0 -0
  44. fish_speech/models/vqgan/modules/firefly.py +0 -596
  45. fish_speech/models/vqgan/modules/fsq.py +0 -116
  46. fish_speech/models/vqgan/utils.py +0 -94
  47. fish_speech/scheduler.py +0 -40
  48. fish_speech/text/__init__.py +0 -4
  49. fish_speech/text/__pycache__/__init__.cpython-310.pyc +0 -0
  50. fish_speech/text/__pycache__/clean.cpython-310.pyc +0 -0
fish_speech/__pycache__/conversation.cpython-310.pyc DELETED
Binary file (227 Bytes)
 
fish_speech/__pycache__/scheduler.cpython-310.pyc DELETED
Binary file (1.04 kB)
 
fish_speech/callbacks/__init__.py DELETED
@@ -1,3 +0,0 @@
- from .grad_norm import GradNormMonitor
-
- __all__ = ["GradNormMonitor"]
 
fish_speech/callbacks/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (239 Bytes)
 
fish_speech/callbacks/__pycache__/grad_norm.cpython-310.pyc DELETED
Binary file (3.79 kB)
 
fish_speech/callbacks/grad_norm.py DELETED
@@ -1,113 +0,0 @@
- from typing import Optional, Union
-
- import lightning.pytorch as pl
- import torch
- from lightning import LightningModule, Trainer
- from lightning.pytorch.callbacks import Callback
- from torch import Tensor, nn
- from torch.utils._foreach_utils import (
-     _group_tensors_by_device_and_dtype,
-     _has_foreach_support,
- )
-
-
- @torch.no_grad()
- def grad_norm(
-     parameters: Union[Tensor, list[Tensor]],
-     norm_type: float = 2.0,
- ) -> float:
-     """
-     Returns the norm of the gradients of the given parameters.
-
-     Args:
-         parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
-             single Tensor that will have gradients normalized
-         norm_type (float): type of the used p-norm.
-
-     Returns:
-         Total norm of the parameter gradients (viewed as a single vector).
-     """  # noqa: E501
-
-     if isinstance(parameters, Tensor):
-         parameters = [parameters]
-
-     grads = [p.grad for p in parameters if p.grad is not None]
-     if len(grads) == 0:
-         return None
-
-     first_device = grads[0].device
-     grouped_grads: dict[
-         tuple[torch.device, torch.dtype], list[list[Tensor]]
-     ] = _group_tensors_by_device_and_dtype(
-         [[g.detach() for g in grads]]
-     )  # type: ignore[assignment]
-
-     norms = []
-     for (device, _), ([grads], _) in grouped_grads.items():
-         if _has_foreach_support(grads, device=device):
-             norms.extend(torch._foreach_norm(grads, norm_type))
-         else:
-             norms.extend([torch.norm(g, norm_type) for g in grads])
-
-     return torch.norm(torch.stack([norm.to(first_device) for norm in norms]), norm_type)
-
-
- class GradNormMonitor(Callback):
-     """
-     Callback that computes the gradient norm of the model parameters.
-     """
-
-     def __init__(
-         self,
-         norm_type: float = 2.0,
-         logging_interval: str = "step",
-         sub_module: Optional[Union[str, list[str]]] = None,
-     ) -> None:
-         """
-         Args:
-             norm_type (float): type of the used p-norm.
-             logging_interval (str): "step" or "epoch".
-         """
-         super().__init__()
-
-         self.norm_type = norm_type
-         self.logging_interval = logging_interval
-         self.sub_module = sub_module
-
-     def on_after_backward(self, trainer: Trainer, model: LightningModule) -> None:
-         """
-         Computes the gradient norm of the model parameters and logs it to the logger.
-
-         Args:
-             trainer (Trainer): The trainer object
-             model (LightningModule): The current lightningModule
-         """
-
-         lightning_model = model
-
-         if self.sub_module is None:
-             return self.log_sub_module_grad_norm(lightning_model, model, "")
-
-         sub_modules = self.sub_module
-         if isinstance(sub_modules, str):
-             sub_modules = [sub_modules]
-
-         for sub_module in sub_modules:
-             self.log_sub_module_grad_norm(
-                 lightning_model, getattr(model, sub_module), f"/{sub_module}"
-             )
-
-     def log_sub_module_grad_norm(
-         self, lightning_model: LightningModule, model: nn.Module, path: str
-     ) -> None:
-         grad_norm_val = grad_norm(model.parameters(), self.norm_type)
-         if grad_norm_val is None:
-             return
-
-         on_step = self.logging_interval == "step"
-         lightning_model.log(
-             f"train{path}/grad_norm",
-             grad_norm_val,
-             on_step=on_step,
-             on_epoch=not on_step,
-         )
 
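For context, the callback removed above plugs into a standard Lightning Trainer. A minimal usage sketch (illustrative only: the trainer settings and the commented fit call are assumptions, and the import no longer resolves after this commit):

    import lightning.pytorch as pl

    from fish_speech.callbacks import GradNormMonitor  # removed by this commit

    # Log the 2-norm of all parameter gradients as "train/grad_norm" on every step.
    trainer = pl.Trainer(
        max_steps=100,
        callbacks=[GradNormMonitor(norm_type=2.0, logging_interval="step")],
    )
    # trainer.fit(model, datamodule)  # any LightningModule / DataModule pair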
fish_speech/configs/base.yaml DELETED
@@ -1,87 +0,0 @@
- # Base configuration for training a model
- paths:
-   run_dir: results/${project}
-   ckpt_dir: ${paths.run_dir}/checkpoints
-
- hydra:
-   run:
-     dir: ${paths.run_dir}
-
- # Lightning Trainer
- trainer:
-   _target_: lightning.pytorch.trainer.Trainer
-
-   default_root_dir: ${paths.run_dir}
-   accelerator: gpu
-   num_nodes: 1
-   devices: auto
-   strategy:
-     _target_: lightning.pytorch.strategies.DDPStrategy
-     process_group_backend: nccl # This should be override when training on windows
-
-   precision: bf16-mixed
-
-   # disable validation by epoch end
-   check_val_every_n_epoch: null
-   val_check_interval: 5000
-   max_steps: 100_000
-
-   # Use torch.backends.cudnn.benchmark to speed up training
-   benchmark: true
-
- # Callbacks
- callbacks:
-   model_checkpoint:
-     _target_: lightning.pytorch.callbacks.ModelCheckpoint
-     dirpath: ${paths.ckpt_dir}
-     filename: "step_{step:09d}"
-     save_last: false # additionally always save an exact copy of the last checkpoint to a file last.ckpt
-     save_top_k: 5 # save 5 latest checkpoints
-     monitor: step # use step to monitor checkpoints
-     mode: max # save the latest checkpoint with the highest global_step
-     every_n_epochs: null # don't save checkpoints by epoch end
-     every_n_train_steps: 5000 # save checkpoints every 5000 steps
-     auto_insert_metric_name: false
-
-   model_summary:
-     _target_: lightning.pytorch.callbacks.ModelSummary
-     max_depth: 2 # the maximum depth of layer nesting that the summary will include
-
-   learning_rate_monitor:
-     _target_: lightning.pytorch.callbacks.LearningRateMonitor
-     logging_interval: step
-     log_momentum: false
-
-   grad_norm_monitor:
-     _target_: fish_speech.callbacks.GradNormMonitor
-     norm_type: 2
-     logging_interval: step
-
- # Logger
- logger:
-   tensorboard:
-     _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
-     save_dir: "${paths.run_dir}/tensorboard/"
-     name: null
-     log_graph: false
-     default_hp_metric: true
-     prefix: ""
-
-   # wandb:
-   #   _target_: lightning.pytorch.loggers.wandb.WandbLogger
-   #   # name: "" # name of the run (normally generated by wandb)
-   #   save_dir: "${paths.run_dir}"
-   #   offline: False
-   #   id: null # pass correct id to resume experiment!
-   #   anonymous: null # enable anonymous logging
-   #   project: "fish-speech"
-   #   log_model: False # upload lightning ckpts
-   #   prefix: "" # a string to put at the beginning of metric keys
-   #   # entity: "" # set to name of your wandb team
-   #   group: ""
-   #   tags: ["vq", "hq", "finetune"]
-   #   job_type: ""
-
- # Loop
- train: true
- test: false
 
fish_speech/configs/firefly_gan_vq.yaml DELETED
@@ -1,33 +0,0 @@
- _target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
- spec_transform:
-   _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
-   sample_rate: 44100
-   n_mels: 160
-   n_fft: 2048
-   hop_length: 512
-   win_length: 2048
- backbone:
-   _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
-   input_channels: 160
-   depths: [3, 3, 9, 3]
-   dims: [128, 256, 384, 512]
-   drop_path_rate: 0.2
-   kernel_size: 7
- head:
-   _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
-   hop_length: 512
-   upsample_rates: [8, 8, 2, 2, 2] # aka. strides
-   upsample_kernel_sizes: [16, 16, 4, 4, 4]
-   resblock_kernel_sizes: [3, 7, 11]
-   resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-   num_mels: 512
-   upsample_initial_channel: 512
-   pre_conv_kernel_size: 13
-   post_conv_kernel_size: 13
- quantizer:
-   _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
-   input_dim: 512
-   n_groups: 8
-   n_codebooks: 1
-   levels: [8, 5, 5, 5]
-   downsample_factor: [2, 2]
 
fish_speech/configs/lora/r_8_alpha_16.yaml DELETED
@@ -1,4 +0,0 @@
- _target_: fish_speech.models.text2semantic.lora.LoraConfig
- r: 8
- lora_alpha: 16
- lora_dropout: 0.01
 
fish_speech/configs/text2semantic_finetune.yaml DELETED
@@ -1,83 +0,0 @@
- defaults:
-   - base
-   - _self_
-
- project: text2semantic_finetune_dual_ar
- max_length: 4096
- pretrained_ckpt_path: checkpoints/fish-speech-1.4
-
- # Lightning Trainer
- trainer:
-   accumulate_grad_batches: 1
-   gradient_clip_val: 1.0
-   gradient_clip_algorithm: "norm"
-   max_steps: 1000
-   precision: bf16-true
-   limit_val_batches: 10
-   val_check_interval: 100
-
- # Dataset Configuration
- tokenizer:
-   _target_: transformers.AutoTokenizer.from_pretrained
-   pretrained_model_name_or_path: ${pretrained_ckpt_path}
-
- # Dataset Configuration
- train_dataset:
-   _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
-   proto_files:
-     - data/protos
-   tokenizer: ${tokenizer}
-   causal: true
-   max_length: ${max_length}
-   use_speaker: false
-   interactive_prob: 0.7
-
- val_dataset:
-   _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
-   proto_files:
-     - data/protos
-   tokenizer: ${tokenizer}
-   causal: true
-   max_length: ${max_length}
-   use_speaker: false
-   interactive_prob: 0.7
-
- data:
-   _target_: fish_speech.datasets.semantic.SemanticDataModule
-   train_dataset: ${train_dataset}
-   val_dataset: ${val_dataset}
-   num_workers: 4
-   batch_size: 8
-   tokenizer: ${tokenizer}
-   max_length: ${max_length}
-
- # Model Configuration
- model:
-   _target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
-   model:
-     _target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
-     path: ${pretrained_ckpt_path}
-     load_weights: true
-     max_length: ${max_length}
-     lora_config: null
-
-   optimizer:
-     _target_: torch.optim.AdamW
-     _partial_: true
-     lr: 1e-4
-     weight_decay: 0
-     betas: [0.9, 0.95]
-     eps: 1e-5
-
-   lr_scheduler:
-     _target_: torch.optim.lr_scheduler.LambdaLR
-     _partial_: true
-     lr_lambda:
-       _target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
-       _partial_: true
-       num_warmup_steps: 10
-
- # Callbacks
- callbacks:
-   model_checkpoint:
-     every_n_train_steps: ${trainer.val_check_interval}
 
fish_speech/conversation.py DELETED
@@ -1,2 +0,0 @@
- SEMANTIC_TOKEN = "<|semantic|>"
- CODEBOOK_PAD_TOKEN_ID = 0
 
fish_speech/datasets/__pycache__/semantic.cpython-310.pyc DELETED
Binary file (12.4 kB)
 
fish_speech/datasets/concat_repeat.py DELETED
@@ -1,53 +0,0 @@
- import bisect
- import random
- from typing import Iterable
-
- from torch.utils.data import Dataset, IterableDataset
-
-
- class ConcatRepeatDataset(Dataset):
-     datasets: list[Dataset]
-     cumulative_sizes: list[int]
-     repeats: list[int]
-
-     @staticmethod
-     def cumsum(sequence, repeats):
-         r, s = [], 0
-         for dataset, repeat in zip(sequence, repeats):
-             l = len(dataset) * repeat
-             r.append(l + s)
-             s += l
-         return r
-
-     def __init__(self, datasets: Iterable[Dataset], repeats: list[int]):
-         super().__init__()
-
-         self.datasets = list(datasets)
-         self.repeats = repeats
-
-         assert len(self.datasets) > 0, "datasets should not be an empty iterable"
-         assert len(self.datasets) == len(
-             repeats
-         ), "datasets and repeats should have the same length"
-
-         for d in self.datasets:
-             assert not isinstance(
-                 d, IterableDataset
-             ), "ConcatRepeatDataset does not support IterableDataset"
-
-         self.cumulative_sizes = self.cumsum(self.datasets, self.repeats)
-
-     def __len__(self):
-         return self.cumulative_sizes[-1]
-
-     def __getitem__(self, idx):
-         dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
-
-         if dataset_idx == 0:
-             sample_idx = idx
-         else:
-             sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
-
-         dataset = self.datasets[dataset_idx]
-
-         return dataset[sample_idx % len(dataset)]
 
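A small usage sketch of the removed ConcatRepeatDataset (illustrative only: the toy RangeDataset and the repeat counts are made up, and the import no longer resolves after this commit):

    from torch.utils.data import Dataset

    from fish_speech.datasets.concat_repeat import ConcatRepeatDataset  # removed by this commit


    class RangeDataset(Dataset):
        """Toy map-style dataset returning the integers 0..n-1."""

        def __init__(self, n: int):
            self.n = n

        def __len__(self):
            return self.n

        def __getitem__(self, idx):
            return idx


    big = RangeDataset(1000)
    small = RangeDataset(10)

    # The small dataset is counted 100 times, so both contribute 1000 samples;
    # indexing into it wraps around via `sample_idx % len(dataset)`.
    combined = ConcatRepeatDataset([big, small], repeats=[1, 100])
    print(len(combined))    # 2000
    print(combined[1999])   # 9, the last sample of the repeated small dataset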
fish_speech/datasets/protos/__pycache__/text_data_pb2.cpython-310.pyc DELETED
Binary file (1.26 kB)
 
fish_speech/datasets/protos/__pycache__/text_data_stream.cpython-310.pyc DELETED
Binary file (1.13 kB)
 
fish_speech/datasets/protos/text-data.proto DELETED
@@ -1,24 +0,0 @@
- syntax = "proto3";
-
- package text_data;
-
- message Semantics {
-   repeated uint32 values = 1;
- }
-
- message Sentence {
-   repeated string texts = 1;
-   repeated Semantics semantics = 3;
- }
-
- message TextData {
-   string source = 1;
-   string name = 2;
-   repeated Sentence sentences = 4;
- }
-
- message SampledData {
-   string source = 1;
-   string name = 2;
-   repeated Sentence samples = 3;
- }
 
fish_speech/datasets/protos/text_data_pb2.py DELETED
@@ -1,33 +0,0 @@
- # -*- coding: utf-8 -*-
- # Generated by the protocol buffer compiler. DO NOT EDIT!
- # source: text-data.proto
- # Protobuf Python Version: 4.25.1
- """Generated protocol buffer code."""
- from google.protobuf import descriptor as _descriptor
- from google.protobuf import descriptor_pool as _descriptor_pool
- from google.protobuf import symbol_database as _symbol_database
- from google.protobuf.internal import builder as _builder
-
- # @@protoc_insertion_point(imports)
-
- _sym_db = _symbol_database.Default()
-
-
- DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
-     b'\n\x0ftext-data.proto\x12\ttext_data"\x1b\n\tSemantics\x12\x0e\n\x06values\x18\x01 \x03(\r"B\n\x08Sentence\x12\r\n\x05texts\x18\x01 \x03(\t\x12\'\n\tsemantics\x18\x03 \x03(\x0b\x32\x14.text_data.Semantics"P\n\x08TextData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12&\n\tsentences\x18\x04 \x03(\x0b\x32\x13.text_data.Sentence"Q\n\x0bSampledData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12$\n\x07samples\x18\x03 \x03(\x0b\x32\x13.text_data.Sentenceb\x06proto3'
- )
-
- _globals = globals()
- _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
- _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "text_data_pb2", _globals)
- if _descriptor._USE_C_DESCRIPTORS == False:
-     DESCRIPTOR._options = None
-     _globals["_SEMANTICS"]._serialized_start = 30
-     _globals["_SEMANTICS"]._serialized_end = 57
-     _globals["_SENTENCE"]._serialized_start = 59
-     _globals["_SENTENCE"]._serialized_end = 125
-     _globals["_TEXTDATA"]._serialized_start = 127
-     _globals["_TEXTDATA"]._serialized_end = 207
-     _globals["_SAMPLEDDATA"]._serialized_start = 209
-     _globals["_SAMPLEDDATA"]._serialized_end = 290
- # @@protoc_insertion_point(module_scope)
 
fish_speech/datasets/protos/text_data_stream.py DELETED
@@ -1,36 +0,0 @@
- import struct
-
- from .text_data_pb2 import TextData
-
-
- def read_pb_stream(f):
-     while True:
-         buf = f.read(4)
-         if len(buf) == 0:
-             break
-         size = struct.unpack("I", buf)[0]
-         buf = f.read(size)
-         text_data = TextData()
-         text_data.ParseFromString(buf)
-         yield text_data
-
-
- def write_pb_stream(f, text_data):
-     buf = text_data.SerializeToString()
-     f.write(struct.pack("I", len(buf)))
-     f.write(buf)
-
-
- def pack_pb_stream(text_data):
-     buf = text_data.SerializeToString()
-     return struct.pack("I", len(buf)) + buf
-
-
- def split_pb_stream(f):
-     while True:
-         head = f.read(4)
-         if len(head) == 0:
-             break
-         size = struct.unpack("I", head)[0]
-         buf = f.read(size)
-         yield head + buf
 
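The helpers above implement a simple length-prefixed stream of TextData protobuf messages: a 4-byte native-endian unsigned length followed by the serialized payload. A minimal round-trip sketch (illustrative only: the file name and field values are made up, and both modules are removed by this commit):

    from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData
    from fish_speech.datasets.protos.text_data_stream import read_pb_stream, write_pb_stream

    # One record: a single sentence with its text and semantic token ids.
    record = TextData(
        source="demo",
        name="speaker_0",
        sentences=[
            Sentence(texts=["hello world"], semantics=[Semantics(values=[1, 2, 3])])
        ],
    )

    # Append the record to a stream file, then read every record back.
    with open("demo.protos", "wb") as f:
        write_pb_stream(f, record)

    with open("demo.protos", "rb") as f:
        for item in read_pb_stream(f):
            print(item.name, item.sentences[0].texts[0])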
fish_speech/datasets/semantic.py DELETED
@@ -1,496 +0,0 @@
1
- import random
2
- from dataclasses import dataclass
3
- from itertools import chain
4
- from pathlib import Path
5
- from random import Random
6
- from typing import Optional, Union
7
-
8
- import numpy as np
9
- import pyarrow.parquet as pq
10
- import torch
11
- import torch.nn.functional as F
12
- from datasets.download.streaming_download_manager import xopen
13
- from huggingface_hub import HfApi
14
- from lightning import LightningDataModule
15
- from torch.distributed import get_rank, get_world_size, is_initialized
16
- from torch.utils.data import DataLoader, IterableDataset, get_worker_info
17
- from transformers import AutoTokenizer
18
-
19
- from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
20
- from fish_speech.datasets.protos.text_data_pb2 import SampledData
21
- from fish_speech.datasets.protos.text_data_stream import read_pb_stream
22
- from fish_speech.text.clean import clean_text
23
- from fish_speech.utils import RankedLogger
24
- from fish_speech.utils.braceexpand import braceexpand
25
-
26
- log = RankedLogger(__name__, rank_zero_only=True)
27
-
28
-
29
- def split_by_rank_worker(files):
30
- # We need to know the total number of devices
31
- # to split the data properly
32
-
33
- total_devices = 1
34
- if is_initialized():
35
- total_devices = get_world_size()
36
-
37
- worker_info = get_worker_info()
38
- if worker_info is not None:
39
- total_devices *= worker_info.num_workers
40
-
41
- if len(files) < total_devices:
42
- # Repeat the files N times to match the number of devices
43
- files = files * (total_devices // len(files) + 1)
44
-
45
- # DDP
46
- if is_initialized():
47
- files = files[get_rank() :: get_world_size()]
48
-
49
- # Split by worker
50
- if worker_info is not None:
51
- files = files[worker_info.id :: worker_info.num_workers]
52
-
53
- return files
54
-
55
-
56
- class AutoTextSemanticInstructionDataset(IterableDataset):
57
- """
58
- Auto Augment Dataset by Speaker
59
-
60
- 1. Random concatenate multiple sentences from the same speaker to form a longer sentence
61
- 2. Automatically normalize the text
62
-
63
- For interactive mode, we use the following format (multiple sequences):
64
- <s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
65
-
66
- For non-interactive mode, we use the following format (one long sequence):
67
- <s> [INST] text [/INST] ... </s>
68
- """
69
-
70
- def __init__(
71
- self,
72
- proto_files: list[str],
73
- seed: int = 42,
74
- interactive_prob: float = 0.5,
75
- max_length: int = 1024,
76
- tokenizer: AutoTokenizer = None,
77
- use_speaker: bool | float = True,
78
- causal: bool = True,
79
- num_codebooks: Optional[int] = None,
80
- skip_text_prob: float = 0.0,
81
- ):
82
- """
83
- Args:
84
- proto_files: proto buf files if using local data
85
- seed: random seed
86
- interactive_prob: probability to use interactive mode
87
- max_length: max length of the text
88
- tokenizer: tokenizer
89
- use_speaker: include speaker information in the prompt
90
- causal: use causal sampling when using local data, disable will lead to random sampling
91
- num_codebooks: number of codebooks, if None, it will be automatically detected
92
- skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode
93
- """
94
-
95
- super().__init__()
96
-
97
- assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
98
-
99
- self.seed = seed
100
- self.max_length = max_length
101
- self.tokenizer = tokenizer
102
- self.interactive_prob = interactive_prob
103
- self.use_speaker = use_speaker
104
- self.proto_files = proto_files
105
- self.causal = causal
106
- self.num_codebooks = num_codebooks
107
- self.skip_text_prob = skip_text_prob
108
-
109
- self.semantic_token_id = self.tokenizer.convert_tokens_to_ids("<|semantic|>")
110
- self.groups = None
111
-
112
- def init_mock_data_server(self):
113
- if self.groups is not None:
114
- return
115
-
116
- # Expand the proto files
117
- expanded_proto_files = []
118
- for filename in self.proto_files:
119
- for i in braceexpand(filename):
120
- i = Path(i)
121
- if i.is_file():
122
- expanded_proto_files.append(i)
123
- elif i.is_dir():
124
- expanded_proto_files.extend(i.rglob("*.proto"))
125
- expanded_proto_files.extend(i.rglob("*.protos"))
126
- else:
127
- raise ValueError(f"{i} is not a file or directory")
128
-
129
- expanded_proto_files = sorted(expanded_proto_files)
130
- Random(self.seed).shuffle(expanded_proto_files)
131
-
132
- self.groups = []
133
- shard_proto_files = split_by_rank_worker(expanded_proto_files)
134
- log.info(
135
- f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
136
- )
137
-
138
- count = 0
139
- for filename in shard_proto_files:
140
- with open(filename, "rb") as f:
141
- for text_data in read_pb_stream(f):
142
- self.groups.append(text_data)
143
- count += 1
144
-
145
- log.info(f"Read total {count} groups of data")
146
-
147
- # Shuffle the lines
148
- Random(self.seed).shuffle(self.groups)
149
- self.group_weights = [len(i.sentences) for i in self.groups]
150
-
151
- def __iter__(self):
152
- while True:
153
- yield self.augment()
154
-
155
- def tokenize_sentence(self, sentence: str):
156
- sentence = clean_text(sentence)
157
- tokens = self.tokenizer.encode(
158
- f"{sentence}",
159
- max_length=10**6,
160
- add_special_tokens=False,
161
- truncation=False,
162
- )
163
- return sentence, len(tokens)
164
-
165
- def sample_data(self):
166
- if self.groups is None:
167
- self.init_mock_data_server()
168
-
169
- # Shuffle unique lines, estimate that each sample is at least 20 tokens
170
- num_samples = self.max_length // 20
171
-
172
- # choice group based on their number of samples
173
- group = random.choices(self.groups, weights=self.group_weights, k=1)[0]
174
-
175
- if self.causal:
176
- # Sample in order
177
- if num_samples >= len(group.sentences):
178
- samples = group.sentences
179
- else:
180
- begin = random.randint(0, len(group.sentences) - num_samples)
181
- samples = group.sentences[begin : begin + num_samples]
182
- else:
183
- samples = random.choices(
184
- group.sentences, k=min(num_samples, len(group.sentences))
185
- )
186
-
187
- return SampledData(
188
- source=group.source,
189
- name=group.name,
190
- samples=samples,
191
- )
192
-
193
- def augment(self):
194
- final_text, final_semantic = [], []
195
- response = self.sample_data()
196
- if len(response.samples) == 0:
197
- # Invalid group
198
- return None
199
-
200
- samples = list(response.samples)
201
- idx = 0
202
- use_interactive = random.random() < self.interactive_prob
203
-
204
- if use_interactive is False:
205
- # Random sample based on speaker using a truncated normal distribution
206
- a = torch.tensor([0], dtype=torch.float32)
207
- torch.nn.init.trunc_normal_(
208
- a,
209
- mean=self.max_length // 2,
210
- std=self.max_length // 4,
211
- a=10,
212
- b=self.max_length,
213
- )
214
- remaining_tokens = a.long().item() - 4
215
- else:
216
- remaining_tokens = self.max_length
217
-
218
- # Use speaker
219
- if isinstance(self.use_speaker, float):
220
- use_speaker = random.random() < self.use_speaker
221
- else:
222
- use_speaker = self.use_speaker
223
-
224
- all_tokens, all_labels = [], []
225
- while remaining_tokens > 0 and len(samples) > 0:
226
- sentence = samples.pop(0)
227
-
228
- text = random.choice(sentence.texts)
229
- text, length = self.tokenize_sentence(text)
230
- remaining_tokens -= length + len(sentence.semantics[0].values)
231
-
232
- if use_interactive is False:
233
- final_text.append(text)
234
- final_semantic.append(sentence.semantics)
235
- else:
236
- # For interactive mode, we only apply speaker for the first sentence
237
- # [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST]
238
- tokens, labels = self.pack_sentences(
239
- sentences=[text],
240
- semantics=[sentence.semantics],
241
- speaker=response.name if use_speaker else None,
242
- skip_text=random.random() < self.skip_text_prob,
243
- )
244
-
245
- all_tokens.append(tokens)
246
- all_labels.append(labels)
247
-
248
- idx += 1
249
-
250
- if use_interactive is False:
251
- tokens, labels = self.pack_sentences(
252
- final_text,
253
- semantics=final_semantic,
254
- speaker=response.name if use_speaker else None,
255
- )
256
- all_tokens.append(tokens)
257
- all_labels.append(labels)
258
-
259
- tokens = torch.cat(all_tokens, dim=1)
260
- labels = torch.cat(all_labels, dim=1)
261
-
262
- # Verify that the length is correct
263
- assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"
264
-
265
- data = {"tokens": tokens, "labels": labels}
266
-
267
- return data
268
-
269
- def pack_sentences(
270
- self,
271
- sentences: list[str],
272
- semantics: list,
273
- speaker: Optional[str] = None,
274
- skip_text: bool = False,
275
- ):
276
- if speaker is None:
277
- speaker = "assistant"
278
-
279
- cated_sentences = " ".join(sentences)
280
- if skip_text:
281
- cated_sentences = "<|skip_text|>"
282
-
283
- final_text = "<|im_start|>user\n" + cated_sentences + "<|im_end|>"
284
- final_text = final_text + f"<|im_start|>{speaker}\n"
285
-
286
- encoded = self.tokenizer.encode(
287
- final_text,
288
- add_special_tokens=False,
289
- truncation=False,
290
- max_length=10**6,
291
- )
292
- semantic_length = sum([len(i[0].values) for i in semantics])
293
- prompt_length = len(encoded)
294
- num_codebooks = (
295
- len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
296
- )
297
-
298
- # Pack the tokens and semantics (add <s> and </s> to semantic tokens)
299
- tokens = (
300
- encoded
301
- + [self.semantic_token_id] * semantic_length
302
- + self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])
303
- )
304
-
305
- # Codebook bos/padding: 0, eos: 1
306
- codes = [[CODEBOOK_PAD_TOKEN_ID] * prompt_length for _ in range(num_codebooks)]
307
- for segment in semantics:
308
- for book_idx, book in zip(range(num_codebooks), segment):
309
- for j in book.values:
310
- codes[book_idx].append(int(j) + 1)
311
-
312
- for book in codes:
313
- book.extend([CODEBOOK_PAD_TOKEN_ID] * 1)
314
-
315
- tokens = [tokens] + codes
316
-
317
- tokens = torch.tensor(tokens, dtype=torch.long)
318
- labels = tokens.clone()
319
-
320
- if skip_text:
321
- # If text is not provided, the sentence is used for condition only, all labels are -100
322
- torch.fill_(labels, -100)
323
- return tokens, labels
324
-
325
- # Mask out the <s> tokens for semantic, predict semantic tokens only
326
- # Since we don't mask out the input tokens, the language modeling still works
327
- labels[1:, :prompt_length] = -100
328
-
329
- tokens = tokens[:, :-1]
330
- labels = labels[:, 1:]
331
-
332
- # Verify the padding is correct, and the last token is eos
333
- assert (tokens[1:, :prompt_length] == CODEBOOK_PAD_TOKEN_ID).all()
334
- assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()
335
-
336
- return tokens, labels
337
-
338
-
339
- @dataclass
340
- class TextDataCollator:
341
- tokenizer: AutoTokenizer
342
- max_length: int = 1024
343
-
344
- def __call__(self, examples):
345
- if "negative_tokens" in examples:
346
- positive_examples = []
347
- negative_examples = []
348
-
349
- for i in examples:
350
- positive_examples.append(
351
- {
352
- "tokens": i["tokens"],
353
- "labels": i["labels"],
354
- }
355
- )
356
- negative_examples.append(
357
- {
358
- "tokens": i["negative_tokens"],
359
- "labels": i["negative_labels"],
360
- }
361
- )
362
-
363
- examples = positive_examples + negative_examples
364
-
365
- return self.batchify(examples)
366
-
367
- def batchify(self, examples, tokens_key="tokens", labels_key="labels"):
368
- tokens, attention_masks, labels = [], [], []
369
-
370
- # Calculate the max length
371
- max_tokens_length = 0
372
- for example in examples:
373
- max_tokens_length = max(max_tokens_length, example[tokens_key].size(1))
374
- max_tokens_length = min(max_tokens_length, self.max_length)
375
-
376
- for example in examples:
377
- _tokens = example[tokens_key][:, :max_tokens_length]
378
- _labels = example[labels_key][:, :max_tokens_length]
379
- _attention_mask = torch.ones((max_tokens_length,), dtype=torch.bool)
380
- tokens_length = _tokens.size(1)
381
- _attention_mask[:tokens_length] = False
382
-
383
- assert tokens_length == _labels.size(
384
- 1
385
- ), f"{tokens_length} != {_labels.size(1)}"
386
-
387
- if tokens_length < max_tokens_length:
388
- _tokens = F.pad(
389
- _tokens,
390
- (0, max_tokens_length - tokens_length),
391
- value=self.tokenizer.eos_token_id,
392
- )
393
- _tokens[1:, tokens_length:] = CODEBOOK_PAD_TOKEN_ID
394
- _labels = F.pad(
395
- _labels, (0, max_tokens_length - _labels.size(1)), value=-100
396
- )
397
-
398
- tokens.append(_tokens)
399
- attention_masks.append(_attention_mask)
400
- labels.append(_labels)
401
-
402
- tokens = torch.stack(tokens, dim=0)
403
- attention_masks = torch.stack(attention_masks, dim=0)
404
- labels = torch.stack(labels, dim=0)
405
-
406
- return {
407
- "inputs": tokens,
408
- "attention_masks": attention_masks,
409
- "labels": labels,
410
- }
411
-
412
-
413
- class InterleaveDataset(IterableDataset):
414
- def __init__(
415
- self,
416
- datasets: list[IterableDataset],
417
- probabilities: list[float],
418
- seed: int = 42,
419
- ):
420
- super().__init__()
421
-
422
- self.datasets = datasets
423
- self.probabilities = probabilities
424
- self.seed = seed
425
-
426
- def __iter__(self):
427
- rng = np.random.default_rng(self.seed)
428
- dataset_iterators = [iter(dataset) for dataset in self.datasets]
429
-
430
- while True:
431
- # Random choice one
432
- dataset_idx = rng.choice(len(self.datasets), p=self.probabilities)
433
- dataset_iterator = dataset_iterators[dataset_idx]
434
-
435
- try:
436
- yield next(dataset_iterator)
437
- except StopIteration:
438
- # Exhausted, create a new iterator
439
- dataset_iterators[dataset_idx] = iter(self.datasets[dataset_idx])
440
- yield next(dataset_iterators[dataset_idx])
441
-
442
-
443
- class SemanticDataModule(LightningDataModule):
444
- def __init__(
445
- self,
446
- train_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
447
- val_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
448
- batch_size: int = 32,
449
- tokenizer: AutoTokenizer = None,
450
- max_length: int = 1024,
451
- num_workers: int = 4,
452
- ):
453
- super().__init__()
454
-
455
- self.train_dataset = train_dataset
456
- self.val_dataset = val_dataset
457
- self.batch_size = batch_size
458
- self.tokenizer = tokenizer
459
- self.max_length = max_length
460
- self.num_workers = num_workers
461
-
462
- def train_dataloader(self):
463
- return DataLoader(
464
- self.train_dataset,
465
- batch_size=self.batch_size,
466
- collate_fn=TextDataCollator(self.tokenizer, self.max_length),
467
- num_workers=self.num_workers,
468
- persistent_workers=True,
469
- )
470
-
471
- def val_dataloader(self):
472
- return DataLoader(
473
- self.val_dataset,
474
- batch_size=self.batch_size,
475
- collate_fn=TextDataCollator(self.tokenizer, self.max_length),
476
- num_workers=self.num_workers,
477
- persistent_workers=True,
478
- )
479
-
480
-
481
- if __name__ == "__main__":
482
- from tqdm import tqdm
483
-
484
- ds = AutoTextSemanticInstructionDataset(
485
- ["data/protos"],
486
- tokenizer=AutoTokenizer.from_pretrained("fishaudio/fish-speech-1"),
487
- use_speaker=False,
488
- interactive_prob=1.0,
489
- skip_text_prob=0.5,
490
- )
491
-
492
- for i in ds:
493
- print(ds.tokenizer.decode(i["tokens"][0], skip_special_tokens=False))
494
- # i["labels"][0][i["labels"][0] == -100] = 0
495
- # print(ds.tokenizer.decode(i["labels"][0], skip_special_tokens=False))
496
- break
 
fish_speech/datasets/vqgan.py DELETED
@@ -1,147 +0,0 @@
1
- from dataclasses import dataclass
2
- from pathlib import Path
3
- from typing import Optional
4
-
5
- import librosa
6
- import numpy as np
7
- import torch
8
- from lightning import LightningDataModule
9
- from torch.utils.data import DataLoader, Dataset
10
-
11
- from fish_speech.utils import RankedLogger
12
-
13
- logger = RankedLogger(__name__, rank_zero_only=False)
14
-
15
-
16
- class VQGANDataset(Dataset):
17
- def __init__(
18
- self,
19
- filelist: str,
20
- sample_rate: int = 32000,
21
- hop_length: int = 640,
22
- slice_frames: Optional[int] = None,
23
- ):
24
- super().__init__()
25
-
26
- filelist = Path(filelist)
27
- root = filelist.parent
28
-
29
- self.files = [
30
- root / line.strip()
31
- for line in filelist.read_text(encoding="utf-8").splitlines()
32
- if line.strip()
33
- ]
34
- self.sample_rate = sample_rate
35
- self.hop_length = hop_length
36
- self.slice_frames = slice_frames
37
-
38
- def __len__(self):
39
- return len(self.files)
40
-
41
- def get_item(self, idx):
42
- file = self.files[idx]
43
-
44
- audio, _ = librosa.load(file, sr=self.sample_rate, mono=True)
45
-
46
- # Slice audio and features
47
- if (
48
- self.slice_frames is not None
49
- and audio.shape[0] > self.slice_frames * self.hop_length
50
- ):
51
- start = np.random.randint(
52
- 0, audio.shape[0] - self.slice_frames * self.hop_length
53
- )
54
- audio = audio[start : start + self.slice_frames * self.hop_length]
55
-
56
- if len(audio) == 0:
57
- return None
58
-
59
- max_value = np.abs(audio).max()
60
- if max_value > 1.0:
61
- audio = audio / max_value
62
-
63
- return {
64
- "audio": torch.from_numpy(audio),
65
- }
66
-
67
- def __getitem__(self, idx):
68
- try:
69
- return self.get_item(idx)
70
- except Exception as e:
71
- import traceback
72
-
73
- traceback.print_exc()
74
- logger.error(f"Error loading {self.files[idx]}: {e}")
75
- return None
76
-
77
-
78
- @dataclass
79
- class VQGANCollator:
80
- def __call__(self, batch):
81
- batch = [x for x in batch if x is not None]
82
-
83
- audio_lengths = torch.tensor([len(x["audio"]) for x in batch])
84
- audio_maxlen = audio_lengths.max()
85
-
86
- # Rounds up to nearest multiple of 2 (audio_lengths)
87
- audios = []
88
- for x in batch:
89
- audios.append(
90
- torch.nn.functional.pad(x["audio"], (0, audio_maxlen - len(x["audio"])))
91
- )
92
-
93
- return {
94
- "audios": torch.stack(audios),
95
- "audio_lengths": audio_lengths,
96
- }
97
-
98
-
99
- class VQGANDataModule(LightningDataModule):
100
- def __init__(
101
- self,
102
- train_dataset: VQGANDataset,
103
- val_dataset: VQGANDataset,
104
- batch_size: int = 32,
105
- num_workers: int = 4,
106
- val_batch_size: Optional[int] = None,
107
- ):
108
- super().__init__()
109
-
110
- self.train_dataset = train_dataset
111
- self.val_dataset = val_dataset
112
- self.batch_size = batch_size
113
- self.val_batch_size = val_batch_size or batch_size
114
- self.num_workers = num_workers
115
-
116
- def train_dataloader(self):
117
- return DataLoader(
118
- self.train_dataset,
119
- batch_size=self.batch_size,
120
- collate_fn=VQGANCollator(),
121
- num_workers=self.num_workers,
122
- shuffle=True,
123
- persistent_workers=True,
124
- )
125
-
126
- def val_dataloader(self):
127
- return DataLoader(
128
- self.val_dataset,
129
- batch_size=self.val_batch_size,
130
- collate_fn=VQGANCollator(),
131
- num_workers=self.num_workers,
132
- persistent_workers=True,
133
- )
134
-
135
-
136
- if __name__ == "__main__":
137
- dataset = VQGANDataset("data/LibriTTS_R/vq_train_filelist.txt")
138
- dataloader = DataLoader(
139
- dataset, batch_size=4, shuffle=False, collate_fn=VQGANCollator()
140
- )
141
-
142
- for batch in dataloader:
143
- print(batch["audios"].shape)
144
- print(batch["features"].shape)
145
- print(batch["audio_lengths"])
146
- print(batch["feature_lengths"])
147
- break
 
fish_speech/i18n/README.md DELETED
@@ -1,27 +0,0 @@
- ## i18n Folder Attribution
-
- The `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below:
-
- ### fish_speech/i18n/core.py
-
- **Related code from RVC:**
- [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py)
-
- **Initial commit:**
- add localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#35](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/35)
-
- **Initial author:**
- [@L4Ph](https://github.com/L4Ph)
-
- ### fish_speech/i18n/scan.py
-
- **Related code from RVC:**
- [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py)
-
- **Initial commit:**
- File for detecting i18n missing keys [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#1058](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/1058)
-
- **Initial author:**
- [@towzeur](https://github.com/towzeur)
-
- We appreciate the contributions of the RVC project and its authors.
 
fish_speech/i18n/__init__.py DELETED
@@ -1,3 +0,0 @@
- from .core import i18n
-
- __all__ = ["i18n"]
 
fish_speech/i18n/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (218 Bytes)
 
fish_speech/i18n/__pycache__/core.cpython-310.pyc DELETED
Binary file (1.44 kB)
 
fish_speech/i18n/core.py DELETED
@@ -1,40 +0,0 @@
- import json
- import locale
- from pathlib import Path
-
- I18N_FILE_PATH = Path(__file__).parent / "locale"
- DEFAULT_LANGUAGE = "en_US"
-
-
- def load_language_list(language):
-     with open(I18N_FILE_PATH / f"{language}.json", "r", encoding="utf-8") as f:
-         language_list = json.load(f)
-
-     return language_list
-
-
- class I18nAuto:
-     def __init__(self):
-         i18n_file = Path(".locale")
-
-         if i18n_file.exists():
-             with open(i18n_file, "r", encoding="utf-8") as f:
-                 language = f.read().strip()
-         else:
-             # getlocale can't identify the system's language ((None, None))
-             language = locale.getdefaultlocale()[0]
-
-         if (I18N_FILE_PATH / f"{language}.json").exists() is False:
-             language = DEFAULT_LANGUAGE
-
-         self.language = language
-         self.language_map = load_language_list(language)
-
-     def __call__(self, key):
-         return self.language_map.get(key, key)
-
-     def __repr__(self):
-         return "Use Language: " + self.language
-
-
- i18n = I18nAuto()
 
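For reference, the removed helper resolves UI strings against the locale tables that follow, falling back to the key itself when no translation exists. A minimal sketch (illustrative only; the import no longer resolves after this commit):

    from fish_speech.i18n import i18n  # removed by this commit

    print(i18n("Start Training"))       # translated via the selected locale JSON
    print(i18n("A key with no entry"))  # unknown keys are returned unchanged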
fish_speech/i18n/locale/en_US.json DELETED
@@ -1,122 +0,0 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "16-mixed is recommended for 10+ series GPU",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
5
- "Accumulate Gradient Batches": "Accumulate Gradient Batches",
6
- "Add to Processing Area": "Add to Processing Area",
7
- "Added path successfully!": "Added path successfully!",
8
- "Advanced Config": "Advanced Config",
9
- "Base LLAMA Model": "Base LLAMA Model",
10
- "Batch Inference": "Batch Inference",
11
- "Batch Size": "Batch Size",
12
- "Changing with the Model Path": "Changing with the Model Path",
13
- "Chinese": "Chinese",
14
- "Compile Model": "Compile Model",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compile the model can significantly reduce the inference time, but will increase cold start time",
16
- "Copy": "Copy",
17
- "Data Preprocessing": "Data Preprocessing",
18
- "Data Preprocessing Path": "Data Preprocessing Path",
19
- "Data Source": "Data Source",
20
- "Decoder Model Config": "Decoder Model Config",
21
- "Decoder Model Path": "Decoder Model Path",
22
- "Disabled": "Disabled",
23
- "Enable Reference Audio": "Enable Reference Audio",
24
- "English": "English",
25
- "Error Message": "Error Message",
26
- "File Preprocessing": "File Preprocessing",
27
- "Generate": "Generate",
28
- "Generated Audio": "Generated Audio",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format",
30
- "Infer interface is closed": "Infer interface is closed",
31
- "Inference Configuration": "Inference Configuration",
32
- "Inference Server Configuration": "Inference Server Configuration",
33
- "Inference Server Error": "Inference Server Error",
34
- "Inferring interface is launched at {}": "Inferring interface is launched at {}",
35
- "Initial Learning Rate": "Initial Learning Rate",
36
- "Input Audio & Source Path for Transcription": "Input Audio & Source Path for Transcription",
37
- "Input Text": "Input Text",
38
- "Invalid path: {}": "Invalid path: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "It is recommended to use CUDA, if you have low configuration, use CPU",
40
- "Iterative Prompt Length, 0 means off": "Iterative Prompt Length, 0 means off",
41
- "Japanese": "Japanese",
42
- "LLAMA Configuration": "LLAMA Configuration",
43
- "LLAMA Model Config": "LLAMA Model Config",
44
- "LLAMA Model Path": "LLAMA Model Path",
45
- "Labeling Device": "Labeling Device",
46
- "LoRA Model to be merged": "LoRA Model to be merged",
47
- "Maximum Audio Duration": "Maximum Audio Duration",
48
- "Maximum Length per Sample": "Maximum Length per Sample",
49
- "Maximum Training Steps": "Maximum Training Steps",
50
- "Maximum tokens per batch, 0 means no limit": "Maximum tokens per batch, 0 means no limit",
51
- "Merge": "Merge",
52
- "Merge LoRA": "Merge LoRA",
53
- "Merge successfully": "Merge successfully",
54
- "Minimum Audio Duration": "Minimum Audio Duration",
55
- "Model Output Path": "Model Output Path",
56
- "Model Size": "Model Size",
57
- "Move": "Move",
58
- "Move files successfully": "Move files successfully",
59
- "No audio generated, please check the input text.": "No audio generated, please check the input text.",
60
- "No selected options": "No selected options",
61
- "Number of Workers": "Number of Workers",
62
- "Open Inference Server": "Open Inference Server",
63
- "Open Labeler WebUI": "Open Labeler WebUI",
64
- "Open Tensorboard": "Open Tensorboard",
65
- "Opened labeler in browser": "Opened labeler in browser",
66
- "Optional Label Language": "Optional Label Language",
67
- "Optional online ver": "Optional online ver",
68
- "Output Path": "Output Path",
69
- "Path error, please check the model file exists in the corresponding path": "Path error, please check the model file exists in the corresponding path",
70
- "Precision": "Precision",
71
- "Probability of applying Speaker Condition": "Probability of applying Speaker Condition",
72
- "Put your text here.": "Put your text here.",
73
- "Reference Audio": "Reference Audio",
74
- "Reference Text": "Reference Text",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.",
76
- "Remove Selected Data": "Remove Selected Data",
77
- "Removed path successfully!": "Removed path successfully!",
78
- "Repetition Penalty": "Repetition Penalty",
79
- "Save model every n steps": "Save model every n steps",
80
- "Select LLAMA ckpt": "Select LLAMA ckpt",
81
- "Select VITS ckpt": "Select VITS ckpt",
82
- "Select VQGAN ckpt": "Select VQGAN ckpt",
83
- "Select source file processing method": "Select source file processing method",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "Select the model to be trained (Depending on the Tab page you are on)",
85
- "Selected: {}": "Selected: {}",
86
- "Speaker": "Speaker",
87
- "Speaker is identified by the folder name": "Speaker is identified by the folder name",
88
- "Start Training": "Start Training",
89
- "Streaming Audio": "Streaming Audio",
90
- "Streaming Generate": "Streaming Generate",
91
- "Tensorboard Host": "Tensorboard Host",
92
- "Tensorboard Log Path": "Tensorboard Log Path",
93
- "Tensorboard Port": "Tensorboard Port",
94
- "Tensorboard interface is closed": "Tensorboard interface is closed",
95
- "Tensorboard interface is launched at {}": "Tensorboard interface is launched at {}",
96
- "Text is too long, please keep it under {} characters.": "Text is too long, please keep it under {} characters.",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.",
98
- "Training Configuration": "Training Configuration",
99
- "Training Error": "Training Error",
100
- "Training stopped": "Training stopped",
101
- "Type name of the speaker": "Type name of the speaker",
102
- "Type the path or select from the dropdown": "Type the path or select from the dropdown",
103
- "Use LoRA": "Use LoRA",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "Use LoRA can save GPU memory, but may reduce the quality of the model",
105
- "Use filelist": "Use filelist",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use large for 10G+ GPU, medium for 5G, small for 2G",
107
- "VITS Configuration": "VITS Configuration",
108
- "VQGAN Configuration": "VQGAN Configuration",
109
- "Validation Batch Size": "Validation Batch Size",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "View the status of the preprocessing folder (use the slider to control the depth of the tree)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.",
112
- "WebUI Host": "WebUI Host",
113
- "WebUI Port": "WebUI Port",
114
- "Whisper Model": "Whisper Model",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU",
117
- "latest": "latest",
118
- "new": "new",
119
- "Realtime Transform Text": "Realtime Transform Text",
120
- "Normalization Result Preview (Currently Only Chinese)": "Normalization Result Preview (Currently Only Chinese)",
121
- "Text Normalization": "Text Normalization"
122
- }
 
fish_speech/i18n/locale/es_ES.json DELETED
@@ -1,122 +0,0 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "se recomienda 16-mixed para GPU de la serie 10+",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de audio de referencia, útil para especificar el hablante.",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Un modelo de texto a voz basado en VQ-GAN y Llama desarrollado por [Fish Audio](https://fish.audio).",
5
- "Accumulate Gradient Batches": "Acumular lotes de gradientes",
6
- "Add to Processing Area": "Agregar al Área de Procesamiento",
7
- "Added path successfully!": "¡Ruta agregada exitosamente!",
8
- "Advanced Config": "Configuración Avanzada",
9
- "Base LLAMA Model": "Modelo Base LLAMA",
10
- "Batch Inference": "Inferencia por Lote",
11
- "Batch Size": "Tamaño del Lote",
12
- "Changing with the Model Path": "Cambiando con la Ruta del Modelo",
13
- "Chinese": "Chino",
14
- "Compile Model": "Compilar Modelo",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar el modelo puede reducir significativamente el tiempo de inferencia, pero aumentará el tiempo de inicio en frío",
16
- "Copy": "Copiar",
17
- "Data Preprocessing": "Preprocesamiento de Datos",
18
- "Data Preprocessing Path": "Ruta de Preprocesamiento de Datos",
19
- "Data Source": "Fuente de Datos",
20
- "Decoder Model Config": "Configuración del modelo decodificador",
21
- "Decoder Model Path": "Ruta del modelo decodificador",
22
- "Disabled": "Desactivado",
23
- "Enable Reference Audio": "Habilitar Audio de Referencia",
24
- "English": "Inglés",
25
- "Error Message": "Mensaje de Error",
26
- "File Preprocessing": "Preprocesamiento de Archivos",
27
- "Generate": "Generar",
28
- "Generated Audio": "Audio Generado",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Si no hay texto correspondiente para el audio, aplique ASR para asistencia, soporte para formato .txt o .lab",
30
- "Infer interface is closed": "La interfaz de inferencia está cerrada",
31
- "Inference Configuration": "Configuración de Inferencia",
32
- "Inference Server Configuration": "Configuración del Servidor de Inferencia",
33
- "Inference Server Error": "Error del Servidor de Inferencia",
34
- "Inferring interface is launched at {}": "La interfaz de inferencia se ha lanzado en {}",
35
- "Initial Learning Rate": "Tasa de Aprendizaje Inicial",
36
- "Input Audio & Source Path for Transcription": "Audio de Entrada y Ruta de Origen para Transcripción",
37
- "Input Text": "Texto de Entrada",
38
- "Invalid path: {}": "Ruta inválida: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "Se recomienda usar CUDA, si tiene una configuración baja, use CPU",
40
- "Iterative Prompt Length, 0 means off": "Longitud de la Indicación Iterativa, 0 significa apagado",
41
- "Japanese": "Japonés",
42
- "LLAMA Configuration": "Configuración de LLAMA",
43
- "LLAMA Model Config": "Configuración del Modelo LLAMA",
44
- "LLAMA Model Path": "Ruta del Modelo LLAMA",
45
- "Labeling Device": "Dispositivo de Etiquetado",
46
- "LoRA Model to be merged": "Modelo LoRA a fusionar",
47
- "Maximum Audio Duration": "Duración máxima de audio",
48
- "Maximum Length per Sample": "Longitud Máxima por Muestra",
49
- "Maximum Training Steps": "Pasos Máximos de Entrenamiento",
50
- "Maximum tokens per batch, 0 means no limit": "Máximo de tokens por lote, 0 significa sin límite",
51
- "Merge": "Fusionar",
52
- "Merge LoRA": "Fusionar LoRA",
53
- "Merge successfully": "Fusionado exitosamente",
54
- "Minimum Audio Duration": "Duración mínima de audio",
55
- "Model Output Path": "Ruta de Salida del Modelo",
56
- "Model Size": "Tamaño del Modelo",
57
- "Move": "Mover",
58
- "Move files successfully": "Archivos movidos exitosamente",
59
- "No audio generated, please check the input text.": "No se generó audio, por favor verifique el texto de entrada.",
60
- "No selected options": "No hay opciones seleccionadas",
61
- "Number of Workers": "Número de Trabajadores",
62
- "Open Inference Server": "Abrir Servidor de Inferencia",
63
- "Open Labeler WebUI": "Abrir Interfaz Web del Etiquetador",
64
- "Open Tensorboard": "Abrir Tensorboard",
65
- "Opened labeler in browser": "Se abrió el etiquetador en el navegador",
66
- "Optional Label Language": "Idioma de Etiquetado Opcional",
67
- "Optional online ver": "Ver en línea opcional",
68
- "Output Path": "Ruta de Salida",
69
- "Path error, please check the model file exists in the corresponding path": "Error de ruta, por favor verifique que el archivo del modelo exista en la ruta correspondiente",
70
- "Precision": "Precisión",
71
- "Probability of applying Speaker Condition": "Probabilidad de aplicar Condición de Hablante",
72
- "Put your text here.": "Ponga su texto aquí.",
73
- "Reference Audio": "Audio de Referencia",
74
- "Reference Text": "Texto de Referencia",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
76
- "Remove Selected Data": "Eliminar Datos Seleccionados",
77
- "Removed path successfully!": "¡Ruta eliminada exitosamente!",
78
- "Repetition Penalty": "Penalización por Repetición",
79
- "Save model every n steps": "Guardar modelo cada n pasos",
80
- "Select LLAMA ckpt": "Seleccionar punto de control LLAMA",
81
- "Select VITS ckpt": "Seleccionar punto de control VITS",
82
- "Select VQGAN ckpt": "Seleccionar punto de control VQGAN",
83
- "Select source file processing method": "Seleccione el método de procesamiento de archivos fuente",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "Seleccione el modelo a entrenar (Dependiendo de la pestaña en la que se encuentre)",
85
- "Selected: {}": "Seleccionado: {}",
86
- "Speaker": "Hablante",
87
- "Speaker is identified by the folder name": "El hablante se identifica por el nombre de la carpeta",
88
- "Start Training": "Iniciar Entrenamiento",
89
- "Streaming Audio": "transmisión de audio",
90
- "Streaming Generate": "síntesis en flujo",
91
- "Tensorboard Host": "Host de Tensorboard",
92
- "Tensorboard Log Path": "Ruta de Registro de Tensorboard",
93
- "Tensorboard Port": "Puerto de Tensorboard",
94
- "Tensorboard interface is closed": "La interfaz de Tensorboard está cerrada",
95
- "Tensorboard interface is launched at {}": "La interfaz de Tensorboard se ha lanzado en {}",
96
- "Text is too long, please keep it under {} characters.": "El texto es demasiado largo, por favor manténgalo por debajo de {} caracteres.",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "La ruta de la carpeta de entrada a la izquierda o la lista de archivos. Ya sea que esté marcado o no, se utilizará para el entrenamiento posterior en esta lista.",
98
- "Training Configuration": "Configuración de Entrenamiento",
99
- "Training Error": "Error de Entrenamiento",
100
- "Training stopped": "Entrenamiento detenido",
101
- "Type name of the speaker": "Escriba el nombre del hablante",
102
- "Type the path or select from the dropdown": "Escriba la ruta o seleccione de la lista desplegable",
103
- "Use LoRA": "Usar LoRA",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "Usar LoRA puede ahorrar memoria GPU, pero puede reducir la calidad del modelo",
105
- "Use filelist": "Usar lista de archivos",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use grande para GPU de 10G+, mediano para 5G, pequeño para 2G",
107
- "VITS Configuration": "Configuración de VITS",
108
- "VQGAN Configuration": "Configuración de VQGAN",
109
- "Validation Batch Size": "Tamaño del Lote de Validación",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Vea el estado de la carpeta de preprocesamiento (use el control deslizante para controlar la profundidad del árbol)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "No somos responsables de ningún mal uso del modelo, por favor considere sus leyes y regulaciones locales antes de usarlo.",
112
- "WebUI Host": "Host de WebUI",
113
- "WebUI Port": "Puerto de WebUI",
114
- "Whisper Model": "Modelo Whisper",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1).",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "Se recomienda bf16-true para GPU de la serie 30+, se recomienda 16-mixed para GPU de la serie 10+",
117
- "latest": "más reciente",
118
- "new": "nuevo",
119
- "Realtime Transform Text": "Transformación de Texto en Tiempo Real",
120
- "Normalization Result Preview (Currently Only Chinese)": "Vista Previa del Resultado de Normalización (Actualmente Solo Chino)",
121
- "Text Normalization": "Normalización de Texto"
122
- }
 
fish_speech/i18n/locale/ja_JP.json DELETED
@@ -1,123 +0,0 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "10シリーズ以降のGPUには16-mixedをお勧めします",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "話者を指定するのに役立つ、5~10秒のリファレンスオーディオ。",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)が開発したVQ-GANとLlamaに基づくテキスト音声合成モデル。",
5
- "Accumulate Gradient Batches": "勾配バッチの累積",
6
- "Add to Processing Area": "処理エリアに追加",
7
- "Added path successfully!": "パスの追加に成功しました!",
8
- "Advanced Config": "詳細設定",
9
- "Base LLAMA Model": "基本LLAMAモデル",
10
- "Batch Inference": "バッチ推論",
11
- "Batch Size": "バッチサイズ",
12
- "Changing with the Model Path": "モデルのパスに伴って変化する",
13
- "Chinese": "中国語",
14
- "Compile Model": "モデルのコンパイル",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "モデルをコンパイルすると推論時間を大幅に短縮できますが、コールドスタート時間が長くなります",
16
- "Copy": "コピー",
17
- "Data Preprocessing": "データ前処理",
18
- "Data Preprocessing Path": "データ前処理パス",
19
- "Data Source": "データソース",
20
- "Decoder Model Config": "デコーダーモデルの構成",
21
- "Decoder Model Path": "デコーダーモデルのパス",
22
- "Disabled": "無効",
23
- "Enable Reference Audio": "リファレンスオーディオを有効にする",
24
- "English": "英語",
25
- "Error Message": "エラーメッセージ",
26
- "File Preprocessing": "文書前处理",
27
- "Generate": "生成",
28
- "Generated Audio": "生成されたオーディオ",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "音声に対応するテキストがない場合は、ASRを適用してサポートします。.txtまたは.lab形式をサポートしています",
30
- "Infer interface is closed": "推論インターフェースが閉じられています",
31
- "Inference Configuration": "推論設定",
32
- "Inference Server Configuration": "推論サーバー設定",
33
- "Inference Server Error": "推論サーバーエラー",
34
- "Inferring interface is launched at {}": "推論インターフェースが{}で起動しました",
35
- "Initial Learning Rate": "初期学習率",
36
- "Input Audio & Source Path for Transcription": "入力オーディオと文字起こしのソースパス",
37
- "Input Text": "入力テキスト",
38
- "Invalid path: {}": "無効なパス: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "CUDAの使用をお勧めします。低い構成の場合はCPUを使用してください",
40
- "Iterative Prompt Length, 0 means off": "反復プロンプト長。0はオフを意味します",
41
- "Japanese": "日本語",
42
- "LLAMA Configuration": "LLAMA設定",
43
- "LLAMA Model Config": "LLAMAモデル設定",
44
- "LLAMA Model Path": "LLAMAモデルパス",
45
- "Labeling Device": "ラベリングデバイス",
46
- "LoRA Model to be merged": "マージするLoRAモデル",
47
- "Maximum Audio Duration": "最大オーディオの長さ",
48
- "Maximum Length per Sample": "サンプルあたりの最大長",
49
- "Maximum Training Steps": "最大トレーニングステップ数",
50
- "Maximum tokens per batch, 0 means no limit": "バッチあたりの最大トークン数。0は制限なしを意味します",
51
- "Merge": "マージ",
52
- "Merge LoRA": "LoRAのマージ",
53
- "Merge successfully": "マージに成功しました",
54
- "Minimum Audio Duration": "最小オーディオの長さ",
55
- "Model Output Path": "モデル出力パス",
56
- "Model Size": "モデルサイズ",
57
- "Move": "移動",
58
- "Move files successfully": "ファイルの移動に成功しました",
59
- "No audio generated, please check the input text.": "オーディオが生成されていません。入力テキストを確認してください。",
60
- "No selected options": "選択されたオプションはありません",
61
- "Number of Workers": "ワーカー数",
62
- "Open Inference Server": "推論サーバーを開く",
63
- "Open Labeler WebUI": "ラベラーWebUIを開く",
64
- "Open Tensorboard": "Tensorboardを開く",
65
- "Opened labeler in browser": "ブラウザでラベラーを開きました",
66
- "Optional Label Language": "オプションのラベル言語",
67
- "Optional online ver": "オプションのオンラインバージョン",
68
- "Output Path": "出力パス",
69
- "Path error, please check the model file exists in the corresponding path": "パスエラー。対応するパスにモデルファイルが存在するか確認してください",
70
- "Precision": "精度",
71
- "Probability of applying Speaker Condition": "話者条件を適用する確率",
72
- "Put your text here.": "ここにテキストを入力してください。",
73
- "Reference Audio": "リファレンスオーディオ",
74
- "Reference Text": "リファレンステキスト",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。",
76
- "Remove Selected Data": "選択したデータを削除",
77
- "Removed path successfully!": "パスの削除に成功しました!",
78
- "Repetition Penalty": "反復ペナルティ",
79
- "Save model every n steps": "nステップごとにモデルを保存",
80
- "Select LLAMA ckpt": " LLAMA チェックポイントを選択",
81
- "Select VITS ckpt": "VITS チェックポイントを選択",
82
- "Select VQGAN ckpt": "VQGAN チェックポイントを選択",
83
- "Select source file processing method": "ソースファイルの処理方法を選択",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "タブページに応じてトレーニングするモデルを選択してください",
85
- "Selected: {}": "選択済み: {}",
86
- "Speaker": "話者",
87
- "Speaker is identified by the folder name": "話者はフォルダ名で識別されます",
88
- "Start Training": "トレーニング開始",
89
- "Streaming Audio": "ストリーミングオーディオ",
90
- "Streaming Generate": "ストリーミング合成",
91
- "Tensorboard Host": "Tensorboardホスト",
92
- "Tensorboard Log Path": "Tensorboardログパス",
93
- "Tensorboard Port": "Tensorboardポート",
94
- "Tensorboard interface is closed": "Tensorboardインターフェースが閉じられています",
95
- "Tensorboard interface is launched at {}": "Tensorboardインターフェースが{}で起動されました",
96
- "Text is too long, please keep it under {} characters.": "テキストが長すぎます。{}文字以内に抑えてください。",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左側の入力フォルダまたはファイルリストのパス。チェックの有無にかかわらず、このリストの後続のトレーニングに使用されます。",
98
- "Training Configuration": "トレーニング設定",
99
- "Training Error": "トレーニングエラー",
100
- "Training stopped": "トレーニングが停止しました",
101
- "Type name of the speaker": "話者の名前を入力",
102
- "Type the path or select from the dropdown": "パスを入力するか、ドロップダウンから選択してください",
103
- "Use LoRA": "LoRAを使用",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRAを使用するとGPUメモリを節約できますが、モデルの品質が低下する可能性があります",
105
- "Use filelist": "ファイルリストを使用",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G以上のGPUには大、5Gには中、2Gには小を使用してください",
107
- "VITS Configuration": "VITS の構成",
108
- "VQGAN Configuration": "VQGAN の構成",
109
- "Validation Batch Size": "検証バッチサイズ",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "前処理フォルダの状態を表示(スライダーを使用してツリーの深さを制御)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "モデルの誤用については一切責任を負いません。使用する前に、現地の法律と規制を考慮してください。",
112
- "WebUI Host": "WebUIホスト",
113
- "WebUI Port": "WebUIポート",
114
- "Whisper Model": "Whisperモデル",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "ソースコードは[こちら](https://github.com/fishaudio/fish-speech)、モデルは[こちら](https://huggingface.co/fishaudio/fish-speech-1)にあります。",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30シリーズ以降のGPUにはbf16-trueを、10シリーズ以降のGPUには16-mixedをお勧めします",
117
- "latest": "最新",
118
- "new": "新規",
119
- "Realtime Transform Text": "リアルタイム変換テキスト",
120
- "Normalization Result Preview (Currently Only Chinese)": "正規化結果プレビュー(現在は中国語のみ)",
121
- "Text Normalization": "テキスト正規化"
122
-
123
- }
 
fish_speech/i18n/locale/pt_BR.json DELETED
@@ -1,133 +0,0 @@
1
- {
2
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de áudio de referência, útil para especificar o orador.",
3
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Um modelo de texto para fala baseado em VQ-GAN e Llama desenvolvido por [Fish Audio](https://fish.audio).",
4
- "Accumulate Gradient Batches": "Acumular Lotes de Gradiente",
5
- "Add to Processing Area": "Adicionar à Área de Processamento",
6
- "Added path successfully!": "Caminho adicionado com sucesso!",
7
- "Advanced Config": "Configuração Avançada",
8
- "Base LLAMA Model": "Modelo LLAMA Base",
9
- "Batch Inference": "Inferência em Lote",
10
- "Batch Size": "Tamanho do Lote",
11
- "Changing with the Model Path": "Alterando com o Caminho do Modelo",
12
-
13
- "Compile Model": "Compilar Modelo",
14
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar o modelo pode reduzir significativamente o tempo de inferência, mas aumentará a latência inicial",
15
- "Copy": "Copiar",
16
- "Data Preprocessing": "Pré-processamento de Dados",
17
- "Data Preprocessing Path": "Caminho de Pré-processamento de Dados",
18
- "Data Source": "Fonte de Dados",
19
- "Decoder Model Config": "Configuração do Modelo Decodificador",
20
- "Decoder Model Path": "Caminho do Modelo Decodificador",
21
- "Disabled": "Desativado",
22
- "Enable Initial Prompt": "Habilitar Prompt Inicial",
23
- "Enable Reference Audio": "Habilitar Áudio de Referência",
24
- "English": "Inglês",
25
- "Japanese": "Japonês",
26
- "Chinese": "Chinês",
27
- "Portuguese": "Português",
28
- "Spanish": "Espanhol",
29
- "Error Message": "Mensagem de Erro",
30
- "Faster Whisper, Up to 5g GPU memory usage": "Faster Whisper (Usa até 5 GB de vRAM)",
31
- "File Preprocessing": "Pré-processamento de Arquivos",
32
- "Generate": "Gerar",
33
- "Generated Audio": "Áudio Gerado",
34
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Se não houver texto correspondente ao áudio, utilize o ASR para assistência (formatos .txt ou .lab)",
35
- "Infer interface is closed": "A interface de inferência foi fechada",
36
- "Inference Configuration": "Configuração de Inferência",
37
- "Inference Server Configuration": "Configuração do Servidor de Inferência",
38
- "Inference Server Error": "Erro do Servidor de Inferência",
39
- "Inferring interface is launched at {}": "A interface de inferência foi iniciada em {}",
40
- "Initial Learning Rate": "Taxa de Aprendizagem Inicial",
41
- "Initial Prompt": "Prompt Inicial",
42
- "Initial prompt can provide contextual or vocabulary-specific guidance to the model.": "O prompt inicial pode fornecer orientação contextual ou específica de vocabulário para o modelo.",
43
- "Input Audio & Source Path for Transcription": "Entrada de Áudio/Caminho de Origem para Transcrição",
44
- "Input Text": "Texto de Entrada",
45
- "Invalid path: {}": "Caminho inválido: {}",
46
- "It is recommended to use CUDA, if you have low configuration, use CPU": "Para GPUs Nvidia é recomendado usar CUDA. Se não tiver uma GPU Nvidia, use CPU",
47
- "Iterative Prompt Length, 0 means off": "Comprimento do Prompt Iterativo (0 = desativado)",
48
- "LLAMA Configuration": "Configuração do LLAMA",
49
- "LLAMA Model Config": "Configuração do Modelo LLAMA",
50
- "LLAMA Model Path": "Caminho do Modelo LLAMA",
51
- "Labeling Device": "Dispositivo de Rotulagem",
52
- "LoRA Model to be merged": "Modelo LoRA para mesclagem",
53
- "Maximum Length per Sample": "Comprimento Máximo por Amostra",
54
- "Maximum Training Steps": "Etapas Máximas de Treinamento",
55
- "Maximum tokens per batch, 0 means no limit": "Número máximo de tokens por lote, 0 significa sem limite",
56
- "Merge": "Mesclar",
57
- "Merge LoRA": "Mesclar LoRA",
58
- "Merge successfully": "Mesclado com sucesso",
59
- "Model Output Path": "Caminho de Saída do Modelo",
60
- "Model Quantization": "Quantização do Modelo",
61
- "Model Size": "Tamanho do Modelo",
62
- "Move": "Mover",
63
- "Move files successfully": "Arquivos movidos com sucesso",
64
- "No audio generated, please check the input text.": "Nenhum áudio gerado, verifique o texto de entrada.",
65
- "No selected options": "Nenhuma opção selecionada",
66
- "Normalization Result Preview (Currently Only Chinese)": "Pré-visualização do Resultado da Normalização (Atualmente Apenas Chinês)",
67
- "Number of Workers": "Número de Processos",
68
- "Open Inference Server": "Abrir Servidor de Inferência",
69
- "Open Labeler WebUI": "Abrir WebUI de Rotulagem",
70
- "Open Tensorboard": "Abrir Tensorboard",
71
- "Opened labeler in browser": "WebUI de rotulagem aberta no navegador",
72
- "Optional Label Language": "Idioma do Rótulo (Opcional)",
73
- "Optional online ver": "Versão online (opcional)",
74
- "Output Path": "Caminho de Saída",
75
- "Path error, please check the model file exists in the corresponding path": "Erro de caminho, verifique se o arquivo do modelo existe no caminho correspondente",
76
- "Post-quantification Precision": "Precisão Pós-quantização",
77
- "Precision": "Precisão",
78
- "Probability of applying Speaker Condition": "Probabilidade de Aplicar Condição de Orador",
79
- "Put your text here.": "Insira seu texto aqui.",
80
- "Quantify": "Quantizar",
81
- "Quantify successfully": "Quantizado com sucesso",
82
- "Realtime Transform Text": "Transformar Texto em Tempo Real",
83
- "Reference Audio": "Áudio de Referência",
84
- "Reference Text": "Texto de Referência",
85
- "warning": "Aviso",
86
- "Pre-processing begins...": "O pré-processamento começou!",
87
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.",
88
- "Remove Selected Data": "Remover Dados Selecionados",
89
- "Removed path successfully!": "Caminho removido com sucesso!",
90
- "Repetition Penalty": "Penalidade de Repetição",
91
- "Save model every n steps": "Salvar modelo a cada n etapas",
92
- "Select LLAMA ckpt": "Selecionar .ckpt do LLAMA",
93
- "Select source file processing method": "Escolha como processar o arquivo de origem",
94
- "Select the model to be trained (Depending on the Tab page you are on)": "Selecione o modelo para o treinamento (dependendo da aba em que você está)",
95
- "Selected: {}": "Selecionado: {}",
96
- "Speaker is identified by the folder name": "O orador é identificado pelo nome da pasta",
97
- "Start Training": "Iniciar Treinamento",
98
- "Streaming Audio": "Áudio em Streaming",
99
- "Streaming Generate": "Geração em Streaming",
100
- "Tensorboard Host": "Host do Tensorboard",
101
- "Tensorboard Log Path": "Caminho de Log do Tensorboard",
102
- "Tensorboard Port": "Porta do Tensorboard",
103
- "Tensorboard interface is closed": "A interface do Tensorboard está fechada",
104
- "Tensorboard interface is launched at {}": "A interface do Tensorboard foi iniciada em {}",
105
- "Text Normalization": "Normalização de Texto",
106
- "Text is too long, please keep it under {} characters.": "O texto é muito longo. Mantenha-o com menos de {} caracteres.",
107
- "The lower the quantitative precision, the more the effectiveness may decrease, but the greater the efficiency will increase": "Quanto menor a precisão quantitativa, mais a eficácia pode diminuir, mas maior será o aumento da eficiência",
108
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "O caminho da pasta de entrada à esquerda ou a lista de arquivos. Independentemente de estar marcada ou não, ela será utilizada para o treinamento subsequente nesta lista.",
109
- "Training Configuration": "Configuração de Treinamento",
110
- "Training Error": "Erro de Treinamento",
111
- "Training stopped": "Treinamento interrompido!",
112
- "Type the path or select from the dropdown": "Digite o caminho ou selecione no menu suspenso",
113
- "Use LoRA": "Usar LoRA",
114
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "O uso de LoRAs pode economizar memória da GPU, mas também pode reduzir a qualidade",
115
- "Use filelist": "Usar lista de arquivos",
116
- "VQGAN Configuration": "Configuração do VQGAN",
117
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Visualizar o status da pasta de pré-processamento (use o controle deslizante para controlar a profundidade da árvore)",
118
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "Não nos responsabilizamos por qualquer uso indevido do modelo. Por favor, considere as leis e regulamentações locais antes de usá-lo.",
119
- "WebUI Host": "Host da WebUI",
120
- "WebUI Port": "Porta da WebUI",
121
- "Whisper Model": "Modelo Whisper",
122
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Você pode encontrar o código fonte [aqui](https://github.com/fishaudio/fish-speech) e os modelos [aqui](https://huggingface.co/fishaudio/fish-speech-1).",
123
- "auto": "automático",
124
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true é recomendado para GPUs da série 30+, 16-mixed é recomendado para GPUs da série 10+",
125
- "latest": "mais recente",
126
- "new": "novo",
127
- "This audio introduces the basic concepts and applications of artificial intelligence and machine learning.": "Este áudio introduz os conceitos básicos e aplicações de inteligência artificial e aprendizado de máquina.",
128
- "You don't need to train this model!": "Não é necessário treinar este modelo!",
129
- "Yes": "Sim",
130
- "No": "Não",
131
- "version:": "versão:",
132
- "author:": "autor:"
133
- }
 
fish_speech/i18n/locale/zh_CN.json DELETED
@@ -1,122 +0,0 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "10+ 系列 GPU 建议使用 16-mixed",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 到 10 秒的参考音频,适用于指定音色。",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.",
5
- "Accumulate Gradient Batches": "梯度累积批次",
6
- "Add to Processing Area": "加入处理区",
7
- "Added path successfully!": "添加路径成功!",
8
- "Advanced Config": "高级参数",
9
- "Base LLAMA Model": "基础 LLAMA 模型",
10
- "Batch Inference": "批量推理",
11
- "Batch Size": "批次大小",
12
- "Changing with the Model Path": "随模型路径变化",
13
- "Chinese": "中文",
14
- "Compile Model": "编译模型",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "编译模型可以显著减少推理时间,但会增加冷启动时间",
16
- "Copy": "复制",
17
- "Data Preprocessing": "数据预处理",
18
- "Data Preprocessing Path": "数据预处理路径",
19
- "Data Source": "数据源",
20
- "Decoder Model Config": "解码器模型配置",
21
- "Decoder Model Path": "解码器模型路径",
22
- "Disabled": "禁用",
23
- "Enable Reference Audio": "启用参考音频",
24
- "English": "英文",
25
- "Error Message": "错误信息",
26
- "File Preprocessing": "文件预处理",
27
- "Generate": "生成",
28
- "Generated Audio": "音频",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "如果音频没有对应的文本,可以应用 ASR 辅助,支持 .txt 或 .lab 格式",
30
- "Infer interface is closed": "推理界面已关闭",
31
- "Inference Configuration": "推理配置",
32
- "Inference Server Configuration": "推理服务器配置",
33
- "Inference Server Error": "推理服务器错误",
34
- "Inferring interface is launched at {}": "推理界面已在 {} 上启动",
35
- "Initial Learning Rate": "初始学习率",
36
- "Input Audio & Source Path for Transcription": "输入音频和转录源路径",
37
- "Input Text": "输入文本",
38
- "Invalid path: {}": "无效路径: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "建议使用 CUDA,如果配置较低,使用 CPU",
40
- "Iterative Prompt Length, 0 means off": "迭代提示长度,0 表示关闭",
41
- "Japanese": "日文",
42
- "LLAMA Configuration": "LLAMA 配置",
43
- "LLAMA Model Config": "LLAMA 模型配置",
44
- "LLAMA Model Path": "LLAMA 模型路径",
45
- "Labeling Device": "标注加速设备",
46
- "LoRA Model to be merged": "要合并的 LoRA 模型",
47
- "Maximum Audio Duration": "最大音频时长",
48
- "Maximum Length per Sample": "每个样本的最大长度",
49
- "Maximum Training Steps": "最大训练步数",
50
- "Maximum tokens per batch, 0 means no limit": "每批最大令牌数,0 表示无限制",
51
- "Merge": "合并",
52
- "Merge LoRA": "合并 LoRA",
53
- "Merge successfully": "合并成功",
54
- "Minimum Audio Duration": "最小音频时长",
55
- "Model Output Path": "模型输出路径",
56
- "Model Size": "模型规模",
57
- "Move": "移动",
58
- "Move files successfully": "移动文件成功",
59
- "No audio generated, please check the input text.": "没有生成音频,请检查输入文本.",
60
- "No selected options": "没有选择的选项",
61
- "Number of Workers": "数据加载进程数",
62
- "Open Inference Server": "打开推理服务器",
63
- "Open Labeler WebUI": "打开标注工具",
64
- "Open Tensorboard": "打开 Tensorboard",
65
- "Opened labeler in browser": "在浏览器中打开标注工具",
66
- "Optional Label Language": "[可选] 标注语言",
67
- "Optional online ver": "[可选] 使用在线版",
68
- "Output Path": "输出路径",
69
- "Path error, please check the model file exists in the corresponding path": "路径错误,请检查模型文件是否存在于相应路径",
70
- "Precision": "精度",
71
- "Probability of applying Speaker Condition": "应用说话人条件的概率",
72
- "Put your text here.": "在此处输入文本.",
73
- "Reference Audio": "参考音频",
74
- "Reference Text": "参考文本",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "相关代码和权重使用 CC BY-NC-SA 4.0 许可证发布.",
76
- "Remove Selected Data": "移除选中数据",
77
- "Removed path successfully!": "移除路径成功!",
78
- "Repetition Penalty": "重复惩罚",
79
- "Save model every n steps": "每 n 步保存模型",
80
- "Select LLAMA ckpt": "选择 LLAMA 检查点",
81
- "Select VITS ckpt": "选择 VITS 检查点",
82
- "Select VQGAN ckpt": "选择 VQGAN 检查点",
83
- "Select source file processing method": "选择源文件处理方法",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "根据您所在的选项卡页面选择要训练的模型",
85
- "Selected: {}": "已选择: {}",
86
- "Speaker": "说话人",
87
- "Speaker is identified by the folder name": "自动根据父目录名称识别说话人",
88
- "Start Training": "开始训练",
89
- "Streaming Audio": "流式音频",
90
- "Streaming Generate": "流式合成",
91
- "Tensorboard Host": "Tensorboard 监听地址",
92
- "Tensorboard Log Path": "Tensorboard 日志路径",
93
- "Tensorboard Port": "Tensorboard 端口",
94
- "Tensorboard interface is closed": "Tensorboard 界面已关闭",
95
- "Tensorboard interface is launched at {}": "Tensorboard 界面已在 {} 上启动",
96
- "Text is too long, please keep it under {} characters.": "文本太长,请保持在 {} 个字符以内.",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左侧输入文件夹的路径或文件列表。无论是否选中,都将在此列表中用于后续训练.",
98
- "Training Configuration": "训练配置",
99
- "Training Error": "训练错误",
100
- "Training stopped": "训练已停止",
101
- "Type name of the speaker": "输入说话人的名称",
102
- "Type the path or select from the dropdown": "输入路径或从下拉菜单中选择",
103
- "Use LoRA": "使用 LoRA",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "使用 LoRA 可以节省 GPU 内存,但可能会降低模型质量",
105
- "Use filelist": "使用文件列表",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 使用 large, 5G 使用 medium, 2G 使用 small",
107
- "VITS Configuration": "VITS 配置",
108
- "VQGAN Configuration": "VQGAN 配置",
109
- "Validation Batch Size": "验证批次大小",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "查看预处理文件夹的状态 (使用滑块控制树的深度)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.",
112
- "WebUI Host": "WebUI 监听地址",
113
- "WebUI Port": "WebUI 端口",
114
- "Whisper Model": "Whisper 模型",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 系列 GPU 建议使用 bf16-true, 10+ 系列 GPU 建议使用 16-mixed",
117
- "latest": "最近的检查点",
118
- "new": "创建新的检查点",
119
- "Realtime Transform Text": "实时规范化文本",
120
- "Normalization Result Preview (Currently Only Chinese)": "规范化结果预览",
121
- "Text Normalization": "文本规范化"
122
- }
 
fish_speech/i18n/scan.py DELETED
@@ -1,122 +0,0 @@
1
- import ast
2
- import glob
3
- import json
4
- from collections import OrderedDict
5
- from pathlib import Path
6
-
7
- from loguru import logger
8
-
9
- from .core import DEFAULT_LANGUAGE, I18N_FILE_PATH
10
-
11
-
12
- def extract_i18n_strings(node):
13
- i18n_strings = []
14
-
15
- if (
16
- isinstance(node, ast.Call)
17
- and isinstance(node.func, ast.Name)
18
- and node.func.id == "i18n"
19
- ):
20
- for arg in node.args:
21
- if isinstance(arg, ast.Str):
22
- i18n_strings.append(arg.s)
23
-
24
- for child_node in ast.iter_child_nodes(node):
25
- i18n_strings.extend(extract_i18n_strings(child_node))
26
-
27
- return i18n_strings
28
-
29
-
30
- # scan the directory for all .py files (recursively)
31
- # for each file, parse the code into an AST
32
- # for each AST, extract the i18n strings
33
-
34
- strings = []
35
- folders = ["fish_speech", "tools"]
36
- # for filename in glob.iglob("**/*.py", recursive=True):
37
- for folder in folders:
38
- for f in Path(folder).rglob("*.py"):
39
- code = f.read_text(encoding="utf-8")
40
- if "i18n(" in code:
41
- tree = ast.parse(code)
42
- i18n_strings = extract_i18n_strings(tree)
43
- logger.info(f"Found {len(i18n_strings)} i18n strings in {f}")
44
- strings.extend(i18n_strings)
45
-
46
- code_keys = set(strings)
47
- logger.info(f"Total unique: {len(code_keys)}")
48
-
49
-
50
- standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
51
- with open(standard_file, "r", encoding="utf-8") as f:
52
- standard_data = json.load(f, object_pairs_hook=OrderedDict)
53
- standard_keys = set(standard_data.keys())
54
-
55
- # Compare the keys used in code against the standard locale file
56
- unused_keys = standard_keys - code_keys
57
- logger.info(f"Found {len(unused_keys)} unused keys in {standard_file}")
58
- for unused_key in unused_keys:
59
- logger.info(f"\t{unused_key}")
60
-
61
- missing_keys = code_keys - standard_keys
62
- logger.info(f"Found {len(missing_keys)} missing keys in {standard_file}")
63
- for missing_key in missing_keys:
64
- logger.info(f"\t{missing_key}")
65
-
66
- code_keys_dict = OrderedDict()
67
- for s in strings:
68
- code_keys_dict[s] = s
69
-
70
- # write back
71
- with open(standard_file, "w", encoding="utf-8") as f:
72
- json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
73
- f.write("\n")
74
-
75
- logger.info(f"Updated {standard_file}")
76
-
77
-
78
- # Define the standard file name
79
- standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
80
-
81
- # Find all JSON files in the directory
82
- dir_path = I18N_FILE_PATH
83
- languages = [f for f in dir_path.glob("*.json") if f.stem != DEFAULT_LANGUAGE]
84
-
85
- # Load the standard file
86
- with open(standard_file, "r", encoding="utf-8") as f:
87
- standard_data = json.load(f, object_pairs_hook=OrderedDict)
88
-
89
- # Loop through each language file
90
- for lang_file in languages:
91
- # Load the language file
92
- with open(lang_file, "r", encoding="utf-8") as f:
93
- lang_data = json.load(f, object_pairs_hook=OrderedDict)
94
-
95
- # Find the difference between the language file and the standard file
96
- diff = set(standard_data.keys()) - set(lang_data.keys())
97
-
98
- miss = set(lang_data.keys()) - set(standard_data.keys())
99
-
100
- # Add any missing keys to the language file
101
- for key in diff:
102
- lang_data[key] = "#!" + key
103
- logger.info(f"Added missing key: {key} to {lang_file}")
104
-
105
- # Del any extra keys to the language file
106
- for key in miss:
107
- del lang_data[key]
108
- logger.info(f"Del extra key: {key} from {lang_file}")
109
-
110
- # Sort the keys of the language file to match the order of the standard file
111
- lang_data = OrderedDict(
112
- sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
113
- )
114
-
115
- # Save the updated language file
116
- with open(lang_file, "w", encoding="utf-8") as f:
117
- json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
118
- f.write("\n")
119
-
120
- logger.info(f"Updated {lang_file}")
121
-
122
- logger.info("Done")
 
fish_speech/models/text2semantic/__init__.py DELETED
File without changes
fish_speech/models/text2semantic/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (179 Bytes)
 
fish_speech/models/text2semantic/__pycache__/lit_module.cpython-310.pyc DELETED
Binary file (5.41 kB)
 
fish_speech/models/text2semantic/__pycache__/llama.cpython-310.pyc DELETED
Binary file (20.8 kB)
 
fish_speech/models/text2semantic/__pycache__/lora.cpython-310.pyc DELETED
Binary file (1.79 kB)
 
fish_speech/models/text2semantic/lit_module.py DELETED
@@ -1,202 +0,0 @@
1
- from typing import Any, Optional
2
-
3
- import lightning as L
4
- import torch
5
- import torch.nn.functional as F
6
- from lightning.pytorch.utilities.types import OptimizerLRScheduler
7
-
8
- import fish_speech.utils as utils
9
- from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
10
- from fish_speech.models.text2semantic.llama import NaiveTransformer
11
-
12
- log = utils.RankedLogger(__name__, rank_zero_only=True)
13
-
14
-
15
- class TextToSemantic(L.LightningModule):
16
- def __init__(
17
- self,
18
- model: NaiveTransformer,
19
- optimizer: Any,
20
- lr_scheduler: Any,
21
- ):
22
- super().__init__()
23
-
24
- self.model = model
25
- self.optimizer_builder = optimizer
26
- self.lr_scheduler_builder = lr_scheduler
27
-
28
- def forward(self, x):
29
- return self.model(x)
30
-
31
- def on_save_checkpoint(self, checkpoint):
32
- # Save only LoRA parameters
33
- state_dict = checkpoint["state_dict"]
34
- use_lora = any("lora" in name for name in state_dict.keys())
35
- if not use_lora:
36
- return
37
-
38
- for name in list(state_dict.keys()):
39
- if "lora" not in name:
40
- state_dict.pop(name)
41
-
42
- def configure_optimizers(self) -> OptimizerLRScheduler:
43
- # Get weight decay parameters
44
- weight_decay_parameters, other_parameters = [], []
45
- for name, param in self.named_parameters():
46
- if ".bias" in name or "norm.weight" in name or ".embeddings." in name:
47
- other_parameters.append(param)
48
- else:
49
- weight_decay_parameters.append(param)
50
-
51
- optimizer = self.optimizer_builder(
52
- [
53
- {"params": weight_decay_parameters},
54
- {"params": other_parameters, "weight_decay": 0.0},
55
- ]
56
- )
57
-
58
- # Print the parameters and their weight decay
59
- for i in optimizer.param_groups:
60
- log.info(
61
- f"Set weight decay: {i['weight_decay']} for {len(i['params'])} parameters"
62
- )
63
-
64
- lr_scheduler = self.lr_scheduler_builder(optimizer)
65
-
66
- return {
67
- "optimizer": optimizer,
68
- "lr_scheduler": {
69
- "scheduler": lr_scheduler,
70
- "interval": "step",
71
- },
72
- }
73
-
74
- # Copied from https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py#L90
75
- def get_batch_logps(
76
- self,
77
- logits: torch.FloatTensor,
78
- labels: torch.LongTensor,
79
- average_log_prob: bool = False,
80
- ) -> torch.FloatTensor:
81
- """Compute the log probabilities of the given labels under the given logits.
82
-
83
- Args:
84
- logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, codebook_size, vocab_size)
85
- labels: Labels for which to compute the log probabilities. Label tokens with a value of -100 are ignored. Shape: (batch_size, sequence_length, codebook_size)
86
- average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens.
87
-
88
- Returns:
89
- A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits.
90
- """
91
- assert logits.shape[:-1] == labels.shape
92
-
93
- labels = labels.clone()
94
- loss_mask = labels != -100
95
-
96
- # dummy token; we'll ignore the losses on these tokens later
97
- labels[labels == -100] = 0
98
-
99
- per_token_logps = torch.gather(
100
- logits.log_softmax(-1), dim=-1, index=labels.unsqueeze(-1)
101
- ).squeeze(-1)
102
-
103
- if average_log_prob:
104
- return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
105
- else:
106
- return (per_token_logps * loss_mask).sum(-1)
107
-
108
- def _step(self, batch, batch_idx, stage: str):
109
- is_train = stage == "train"
110
-
111
- if is_train:
112
- # Key part to make lora work
113
- # Otherwise the parameters are merged, which leads to incorrect gradients
114
- self.model.train()
115
-
116
- # Do positive and negative samples in the same batch to speed up training
117
- labels = batch["labels"]
118
- outputs = self.model(
119
- inp=batch["inputs"],
120
- key_padding_mask=batch["attention_masks"],
121
- )
122
- token_logits = outputs.token_logits
123
- codebook_logits = outputs.codebook_logits
124
-
125
- # Generate labels
126
- base_loss = F.cross_entropy(
127
- token_logits.view(-1, token_logits.size(-1)),
128
- labels[:, 0].reshape(-1),
129
- ignore_index=-100,
130
- )
131
-
132
- codebook_labels = labels[:, 1 : 1 + self.model.config.num_codebooks].mT
133
- semantic_loss = F.cross_entropy(
134
- codebook_logits.view(-1, codebook_logits.size(-1)),
135
- codebook_labels.reshape(-1),
136
- ignore_index=-100,
137
- )
138
-
139
- loss = base_loss + semantic_loss
140
-
141
- self.log(
142
- f"{stage}/loss",
143
- loss,
144
- on_step=is_train,
145
- on_epoch=not is_train,
146
- prog_bar=True,
147
- logger=True,
148
- sync_dist=not is_train,
149
- )
150
-
151
- self.log(
152
- f"{stage}/base_loss",
153
- base_loss,
154
- on_step=is_train,
155
- on_epoch=not is_train,
156
- prog_bar=False,
157
- logger=True,
158
- sync_dist=not is_train,
159
- )
160
-
161
- self.log(
162
- f"{stage}/semantic_loss",
163
- semantic_loss,
164
- on_step=is_train,
165
- on_epoch=not is_train,
166
- prog_bar=False,
167
- logger=True,
168
- sync_dist=not is_train,
169
- )
170
-
171
- # Top-5 accuracy
172
- accuracy = self.get_accuracy(codebook_logits, codebook_labels)
173
- self.log(
174
- f"{stage}/top_5_accuracy",
175
- accuracy,
176
- on_step=is_train,
177
- on_epoch=not is_train,
178
- prog_bar=True,
179
- logger=True,
180
- sync_dist=not is_train,
181
- )
182
-
183
- return loss
184
-
185
- def get_accuracy(self, logits, labels):
186
- mask = (labels != -100) & (labels != CODEBOOK_PAD_TOKEN_ID)
187
- if mask.sum() == 0:
188
- return torch.tensor(0.0, device=logits.device)
189
-
190
- _, indices = logits.topk(5, dim=-1)
191
- correct = indices.eq(labels.unsqueeze(-1))
192
- correct[~mask] = 0
193
- correct = correct.sum()
194
- accuracy = correct / mask.sum()
195
-
196
- return accuracy
197
-
198
- def training_step(self, batch, batch_idx):
199
- return self._step(batch, batch_idx, "train")
200
-
201
- def validation_step(self, batch, batch_idx):
202
- return self._step(batch, batch_idx, "val")
 
 
 
fish_speech/models/text2semantic/llama.py DELETED
@@ -1,779 +0,0 @@
1
- import json
2
- import math
3
- from collections import OrderedDict
4
- from dataclasses import dataclass
5
- from pathlib import Path
6
- from typing import Optional
7
-
8
- import torch
9
- import torch.nn as nn
10
- from einops import rearrange
11
- from loguru import logger
12
- from torch import Tensor
13
- from torch.nn import functional as F
14
- from torch.nn.attention import SDPBackend, sdpa_kernel
15
- from torch.utils.checkpoint import checkpoint
16
- from transformers import AutoTokenizer
17
-
18
- from fish_speech.conversation import SEMANTIC_TOKEN
19
- from fish_speech.utils import RankedLogger
20
-
21
- from .lora import LoraConfig, setup_lora
22
-
23
- log = RankedLogger(__name__, rank_zero_only=True)
24
-
25
-
26
- def find_multiple(n: int, k: int) -> int:
27
- if n % k == 0:
28
- return n
29
- return n + k - (n % k)
30
-
31
-
32
- @dataclass
33
- class BaseModelArgs:
34
- model_type: str = "base"
35
-
36
- vocab_size: int = 32000
37
- n_layer: int = 32
38
- n_head: int = 32
39
- dim: int = 4096
40
- intermediate_size: int = None
41
- n_local_heads: int = -1
42
- head_dim: int = 64
43
- rope_base: float = 10000
44
- norm_eps: float = 1e-5
45
- max_seq_len: int = 2048
46
- dropout: float = 0.0
47
- tie_word_embeddings: bool = True
48
- attention_qkv_bias: bool = False
49
-
50
- # Codebook configs
51
- codebook_size: int = 160
52
- num_codebooks: int = 4
53
-
54
- # Gradient checkpointing
55
- use_gradient_checkpointing: bool = True
56
-
57
- # Initialize the model
58
- initializer_range: float = 0.02
59
-
60
- def __post_init__(self):
61
- if self.n_local_heads == -1:
62
- self.n_local_heads = self.n_head
63
- if self.intermediate_size is None:
64
- hidden_dim = 4 * self.dim
65
- n_hidden = int(2 * hidden_dim / 3)
66
- self.intermediate_size = find_multiple(n_hidden, 256)
67
- self.head_dim = self.dim // self.n_head
68
-
69
- @staticmethod
70
- def from_pretrained(path: str):
71
- path = Path(path)
72
-
73
- if path.is_dir():
74
- path = path / "config.json"
75
-
76
- with open(path, "r", encoding="utf-8") as f:
77
- data = json.load(f)
78
-
79
- match data["model_type"]:
80
- case "naive":
81
- cls = NaiveModelArgs
82
- case "dual_ar":
83
- cls = DualARModelArgs
84
- case _:
85
- raise ValueError(f"Unknown model type: {data['model_type']}")
86
-
87
- return cls(**data)
88
-
89
- def save(self, path: str):
90
- with open(path, "w") as f:
91
- json.dump(self.__dict__, f, indent=4, sort_keys=True, ensure_ascii=False)
92
-
93
-
94
- @dataclass
95
- class NaiveModelArgs(BaseModelArgs):
96
- model_type: str = "naive"
97
-
98
-
99
- @dataclass
100
- class DualARModelArgs(BaseModelArgs):
101
- model_type: str = "dual_ar"
102
- n_fast_layer: int = 4
103
-
104
-
105
- class KVCache(nn.Module):
106
- def __init__(
107
- self, max_batch_size, max_seq_len, n_heads, head_dim, dtype=torch.bfloat16
108
- ):
109
- super().__init__()
110
- cache_shape = (max_batch_size, n_heads, max_seq_len, head_dim)
111
- self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype))
112
- self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype))
113
-
114
- def update(self, input_pos, k_val, v_val):
115
- # input_pos: [S], k_val: [B, H, S, D]
116
- assert input_pos.shape[0] == k_val.shape[2]
117
-
118
- k_out = self.k_cache
119
- v_out = self.v_cache
120
- k_out[:, :, input_pos] = k_val
121
- v_out[:, :, input_pos] = v_val
122
-
123
- return k_out, v_out
124
-
125
-
126
- @dataclass
127
- class TransformerForwardResult:
128
- token_logits: Tensor
129
- codebook_logits: Tensor
130
-
131
-
132
- @dataclass
133
- class BaseTransformerForwardResult:
134
- logits: Tensor
135
- hidden_states: Tensor
136
-
137
-
138
- class BaseTransformer(nn.Module):
139
- def __init__(
140
- self, config: BaseModelArgs, tokenizer: AutoTokenizer, init_weights: bool = True
141
- ) -> None:
142
- super().__init__()
143
- self.config = config
144
- self.tokenizer = tokenizer
145
-
146
- self.semantic_token_id = tokenizer.convert_tokens_to_ids(SEMANTIC_TOKEN)
147
-
148
- # Slow transformer
149
- self.embeddings = nn.Embedding(
150
- config.vocab_size,
151
- config.dim,
152
- )
153
- self.codebook_embeddings = nn.Embedding(
154
- config.codebook_size * config.num_codebooks,
155
- config.dim,
156
- )
157
- self.layers = nn.ModuleList(
158
- TransformerBlock(config, use_sdpa=True) for _ in range(config.n_layer)
159
- )
160
- self.norm = RMSNorm(config.dim, eps=config.norm_eps)
161
-
162
- if self.config.tie_word_embeddings is False:
163
- self.output = nn.Linear(
164
- config.dim,
165
- config.vocab_size,
166
- bias=False,
167
- )
168
-
169
- self.register_buffer(
170
- "freqs_cis",
171
- precompute_freqs_cis(
172
- config.max_seq_len,
173
- config.dim // config.n_head,
174
- config.rope_base,
175
- ),
176
- persistent=False,
177
- )
178
- self.register_buffer(
179
- "causal_mask",
180
- torch.tril(
181
- torch.ones(
182
- config.max_seq_len,
183
- config.max_seq_len,
184
- dtype=torch.bool,
185
- )
186
- ),
187
- persistent=False,
188
- )
189
-
190
- # For kv cache
191
- self.max_batch_size = -1
192
- self.max_seq_len = -1
193
-
194
- if init_weights:
195
- self.apply(self._init_weights)
196
-
197
- def setup_caches(
198
- self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
199
- ):
200
- if self.max_seq_len >= max_seq_len and self.max_batch_size >= max_batch_size:
201
- return
202
-
203
- head_dim = self.config.dim // self.config.n_head
204
- max_seq_len = find_multiple(max_seq_len, 8)
205
- self.max_seq_len = max_seq_len
206
- self.max_batch_size = max_batch_size
207
-
208
- for b in self.layers:
209
- b.attention.kv_cache = KVCache(
210
- max_batch_size,
211
- max_seq_len,
212
- self.config.n_local_heads,
213
- head_dim,
214
- dtype=dtype,
215
- )
216
-
217
- def embed(self, x: Tensor) -> Tensor:
218
- vocab_embeds = [self.embeddings(x[:, 0])]
219
- for i in range(self.config.num_codebooks):
220
- emb = self.codebook_embeddings(x[:, i + 1] + i * self.config.codebook_size)
221
- emb[x[:, 0] != self.semantic_token_id] = 0
222
- vocab_embeds.append(emb)
223
-
224
- x = torch.stack(vocab_embeds, dim=3)
225
- x = x.sum(dim=3)
226
-
227
- return x
228
-
229
- def forward(
230
- self,
231
- inp: Tensor,
232
- key_padding_mask: Optional[Tensor] = None,
233
- ) -> BaseTransformerForwardResult:
234
- seq_len = inp.size(2)
235
-
236
- # Here we want to merge the embeddings of the codebooks
237
- x = self.embed(inp)
238
-
239
- freqs_cis = self.freqs_cis[:seq_len]
240
-
241
- # Note that the causal mask here follows the definition of scaled_dot_product_attention
242
- # That is, FALSE means masked out
243
- # To maintain consistency, key_padding_mask use TRUE to mask out
244
- mask = None
245
- if key_padding_mask is not None:
246
- mask = self.causal_mask[None, None, :seq_len, :seq_len] # (B, N, Q, K)
247
- mask = mask & key_padding_mask[:, None, None, :].logical_not()
248
-
249
- for layer in self.layers:
250
- if self.config.use_gradient_checkpointing and self.training:
251
- x = checkpoint(layer, x, freqs_cis, mask, use_reentrant=True)
252
- else:
253
- x = layer(x, freqs_cis, mask)
254
-
255
- # We got slow_out here
256
- slow_out = self.norm(x)
257
-
258
- if self.config.tie_word_embeddings:
259
- token_logits = F.linear(slow_out, self.embeddings.weight)
260
- else:
261
- token_logits = self.output(slow_out)
262
-
263
- return BaseTransformerForwardResult(
264
- logits=token_logits,
265
- hidden_states=x,
266
- )
267
-
268
- def forward_generate(
269
- self,
270
- x: Tensor,
271
- input_pos: Optional[Tensor] = None,
272
- return_all: bool = False,
273
- ) -> BaseTransformerForwardResult:
274
- # This is used for generation, optimized for torch compile
275
- assert (
276
- self.max_seq_len != -1 and self.max_batch_size != -1
277
- ), "Please call setup_caches before forward_generate"
278
-
279
- x = self.embed(x)
280
-
281
- mask = self.causal_mask[
282
- None, None, input_pos, : self.max_seq_len
283
- ] # (B, N, Q, K)
284
- freqs_cis = self.freqs_cis[input_pos]
285
-
286
- for layer in self.layers:
287
- x = layer(x, freqs_cis, mask, input_pos=input_pos)
288
-
289
- # If prefill, we only calculate the logits of last token
290
- if x.size(1) > 1 and not return_all:
291
- x = x[:, -1:]
292
-
293
- # We got slow_out here
294
- slow_out = self.norm(x)
295
-
296
- if self.config.tie_word_embeddings:
297
- token_logits = F.linear(slow_out, self.embeddings.weight)
298
- else:
299
- token_logits = self.output(slow_out)
300
-
301
- return BaseTransformerForwardResult(
302
- logits=token_logits,
303
- hidden_states=x,
304
- )
305
-
306
- def _init_weights(self, module):
307
- std = self.config.initializer_range
308
- if isinstance(module, nn.Linear):
309
- module.weight.data.normal_(mean=0.0, std=std)
310
- if module.bias is not None:
311
- module.bias.data.zero_()
312
- elif isinstance(module, nn.Embedding):
313
- module.weight.data.normal_(mean=0.0, std=std)
314
- if module.padding_idx is not None:
315
- module.weight.data[module.padding_idx].zero_()
316
-
317
- @staticmethod
318
- def from_pretrained(
319
- path: str,
320
- load_weights: bool = False,
321
- max_length: int | None = None,
322
- lora_config: LoraConfig | None = None,
323
- rope_base: int | None = None,
324
- ) -> "BaseTransformer":
325
- config = BaseModelArgs.from_pretrained(str(path))
326
- if max_length is not None:
327
- config.max_seq_len = max_length
328
- log.info(f"Override max_seq_len to {max_length}")
329
-
330
- if rope_base is not None:
331
- config.rope_base = rope_base
332
- log.info(f"Override rope_base to {rope_base}")
333
-
334
- match config.model_type:
335
- case "naive":
336
- model_cls = NaiveTransformer
337
- case "dual_ar":
338
- model_cls = DualARTransformer
339
- case _:
340
- raise ValueError(f"Unknown model type: {config.model_type}")
341
-
342
- tokenizer = AutoTokenizer.from_pretrained(str(path))
343
- log.info(f"Loading model from {path}, config: {config}")
344
- model = model_cls(config, tokenizer=tokenizer)
345
-
346
- if lora_config is not None:
347
- setup_lora(model, lora_config)
348
- log.info(f"LoRA setup: {lora_config}")
349
-
350
- if load_weights is False:
351
- log.info("Randomly initialized model")
352
- else:
353
-
354
- if "int8" in str(Path(path)):
355
- logger.info("Using int8 weight-only quantization!")
356
- from tools.llama.quantize import WeightOnlyInt8QuantHandler
357
-
358
- simple_quantizer = WeightOnlyInt8QuantHandler(model)
359
- model = simple_quantizer.convert_for_runtime()
360
-
361
- if "int4" in str(Path(path)):
362
- logger.info("Using int4 quantization!")
363
- path_comps = path.name.split("-")
364
- assert path_comps[-2].startswith("g")
365
- groupsize = int(path_comps[-2][1:])
366
- from tools.llama.quantize import WeightOnlyInt4QuantHandler
367
-
368
- simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
369
- model = simple_quantizer.convert_for_runtime()
370
-
371
- weights = torch.load(
372
- Path(path) / "model.pth", map_location="cpu", mmap=True
373
- )
374
-
375
- if "state_dict" in weights:
376
- logger.warning(
377
- "Using a TextToSemantic LightningModule checkpoint, "
378
- "please make sure it is a full model, not a LoRA model."
379
- )
380
- weights = weights["state_dict"]
381
-
382
- if next(iter(weights.keys())).startswith("model."):
383
- logger.info(
384
- f"Remove prefix 'model.' created by TextToSemantic LightningModule from keys"
385
- )
386
- new_weights = OrderedDict()
387
- for k, v in weights.items():
388
- new_weights[k.replace("model.", "")] = v
389
- weights = new_weights
390
-
391
- # Verify the name and shape of parameters since strict=False in load_state_dict.
392
- for k, v in model.named_parameters():
393
- if k not in weights:
394
- logger.warning(f"No weight for {k}")
395
- elif v.shape != weights[k].shape:
396
- logger.warning(
397
- f"Shape mismatch for {k}: {v.shape} vs {weights[k].shape}"
398
- )
399
-
400
- err = model.load_state_dict(weights, strict=False, assign=True)
401
- log.info(f"Loaded weights with error: {err}")
402
-
403
- return model
404
-
405
- def save_pretrained(self, path: str, drop_lora: bool = False):
406
- path = Path(path)
407
- path.mkdir(parents=True, exist_ok=True)
408
-
409
- self.config.save(path / "config.json")
410
- state_dict = self.state_dict()
411
-
412
- if drop_lora:
413
- for key in list(state_dict.keys()):
414
- if "lora" not in key:
415
- continue
416
-
417
- state_dict.pop(key)
418
- log.info(f"Drop LoRA parameter: {key}")
419
-
420
- torch.save(state_dict, path / "model.pth")
421
- self.tokenizer.save_pretrained(path)
422
-
423
-
424
- class NaiveTransformer(BaseTransformer):
425
- def __init__(self, config: NaiveModelArgs, tokenizer: AutoTokenizer) -> None:
426
- super().__init__(config, init_weights=False, tokenizer=tokenizer)
427
-
428
- self.codebook_norm = RMSNorm(config.dim, eps=config.norm_eps)
429
- self.codebook_output = nn.Linear(
430
- config.dim,
431
- config.codebook_size * config.num_codebooks,
432
- bias=False,
433
- )
434
-
435
- self.apply(self._init_weights)
436
-
437
- def decode(self, result: BaseTransformerForwardResult) -> TransformerForwardResult:
438
- token_logits = result.logits
439
- x = result.hidden_states
440
-
441
- # Codebook
442
- codebook_logits = self.codebook_output(self.codebook_norm(x))
443
- codebook_logits = rearrange(
444
- codebook_logits, "b n (c d) -> b n c d", c=self.config.num_codebooks
445
- )
446
-
447
- return TransformerForwardResult(
448
- token_logits=token_logits,
449
- codebook_logits=codebook_logits,
450
- )
451
-
452
- def forward(
453
- self,
454
- inp: Tensor,
455
- key_padding_mask: Optional[Tensor] = None,
456
- ) -> TransformerForwardResult:
457
- result = super().forward(
458
- inp=inp,
459
- key_padding_mask=key_padding_mask,
460
- )
461
- return self.decode(result)
462
-
463
- def forward_generate(
464
- self, x: Tensor, input_pos: Optional[Tensor] = None
465
- ) -> TransformerForwardResult:
466
- result = super().forward_generate(x, input_pos)
467
- return self.decode(result)
468
-
469
-
470
- class DualARTransformer(BaseTransformer):
471
- def __init__(self, config: NaiveModelArgs, tokenizer: AutoTokenizer) -> None:
472
- super().__init__(config, init_weights=False, tokenizer=tokenizer)
473
-
474
- # Fast transformer
475
- self.fast_embeddings = nn.Embedding(config.codebook_size, config.dim)
476
-
477
- # The equivalent bs is so large that sdpa doesn't work
478
- self.fast_layers = nn.ModuleList(
479
- TransformerBlock(config, use_sdpa=False) for _ in range(config.n_fast_layer)
480
- )
481
- self.fast_norm = RMSNorm(config.dim, eps=config.norm_eps)
482
- self.fast_output = nn.Linear(
483
- config.dim,
484
- config.codebook_size,
485
- bias=False,
486
- )
487
-
488
- self.apply(self._init_weights)
489
-
490
- def setup_caches(
491
- self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
492
- ):
493
- super().setup_caches(max_batch_size, max_seq_len, dtype)
494
-
495
- head_dim = self.config.dim // self.config.n_head
496
-
497
- # Fast transformer
498
- # The max seq len here is the number of codebooks
499
- for b in self.fast_layers:
500
- b.attention.kv_cache = KVCache(
501
- max_batch_size,
502
- self.config.num_codebooks,
503
- self.config.n_local_heads,
504
- head_dim,
505
- dtype=dtype,
506
- )
507
-
508
- def forward(
509
- self,
510
- inp: Tensor,
511
- key_padding_mask: Optional[Tensor] = None,
512
- ) -> TransformerForwardResult:
513
- parent_result = super().forward(inp, key_padding_mask)
514
- token_logits = parent_result.logits
515
- x = parent_result.hidden_states
516
-
517
- # Fast transformer
518
- fast_seq_len = self.config.num_codebooks
519
- fast_mask = self.causal_mask[
520
- None, None, :fast_seq_len, :fast_seq_len
521
- ] # (B, N, Q, K)
522
- fast_freqs_cis = self.freqs_cis[:fast_seq_len]
523
-
524
- # Drop the last token and rotate left
525
- codebooks = inp[:, 1:-1, 1:]
526
- codebooks = F.pad(codebooks, (0, 1), value=0)
527
- codebook_embeddings = self.fast_embeddings(codebooks)
528
- x = torch.cat([x[:, None], codebook_embeddings], dim=1)
529
- b, s = x.size(0), x.size(2)
530
- x = rearrange(x, "b n s d -> (b s) n d") # flatten the batch and seq_len
531
-
532
- # Remove padded part
533
- codebooks = rearrange(codebooks, "b n s -> (b s) n")
534
- codebook_mask = (codebooks == 0).all(dim=-1)
535
-
536
- if torch.all(codebook_mask):
537
- # If all codebooks are padded, we keep first 8 to make sure the model runs
538
- codebook_mask[:8] = False
539
-
540
- x_bs, x_len = x.size(0), x.size(1)
541
- x = x[~codebook_mask]
542
-
543
- for layer in self.fast_layers:
544
- if self.config.use_gradient_checkpointing and self.training:
545
- x = checkpoint(layer, x, fast_freqs_cis, fast_mask, use_reentrant=True)
546
- else:
547
- x = layer(x, fast_freqs_cis, fast_mask)
548
-
549
- # unflatten the batch and num_codebooks
550
- fast_out = self.fast_norm(x)
551
- codebook_logits = self.fast_output(fast_out)
552
-
553
- # Re-pad the codebook_logits
554
- buffer = torch.zeros(
555
- x_bs,
556
- x_len,
557
- codebook_logits.size(-1),
558
- device=codebook_logits.device,
559
- dtype=codebook_logits.dtype,
560
- )
561
- buffer[~codebook_mask] = codebook_logits
562
- codebook_logits = buffer
563
-
564
- assert codebook_logits.shape[1] == self.config.num_codebooks
565
- codebook_logits = rearrange(
566
- codebook_logits,
567
- "(b s) n d -> b s n d",
568
- b=b,
569
- s=s,
570
- n=self.config.num_codebooks,
571
- )
572
-
573
- return TransformerForwardResult(
574
- token_logits=token_logits,
575
- codebook_logits=codebook_logits,
576
- )
577
-
578
- def forward_generate_fast(
579
- self, x: Tensor, input_pos: Optional[Tensor] = None
580
- ) -> Tensor:
581
- # Fast transformer
582
- x = x.view(1, 1, -1)
583
-
584
- fast_mask = self.causal_mask[
585
- None, None, input_pos, : self.config.num_codebooks
586
- ] # (B, N, Q, K)
587
- fast_freqs_cis = self.freqs_cis[input_pos]
588
-
589
- for layer in self.fast_layers:
590
- x = layer(x, fast_freqs_cis, fast_mask, input_pos=input_pos)
591
-
592
- # unflatten the batch and num_codebooks
593
- fast_out = self.fast_norm(x) # only take the last token
594
- codebook_logits = self.fast_output(fast_out)
595
-
596
- return codebook_logits
597
-
598
-
599
- class TransformerBlock(nn.Module):
600
- def __init__(self, config: BaseModelArgs, use_sdpa: bool = True) -> None:
601
- super().__init__()
602
- self.attention = Attention(config, use_sdpa=use_sdpa)
603
- self.feed_forward = FeedForward(config)
604
- self.ffn_norm = RMSNorm(config.dim, config.norm_eps)
605
- self.attention_norm = RMSNorm(config.dim, config.norm_eps)
606
-
607
- def forward(
608
- self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Tensor = None
609
- ) -> Tensor:
610
- h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)
611
- out = h + self.feed_forward(self.ffn_norm(h))
612
- return out
613
-
614
-
615
- class Attention(nn.Module):
616
- def __init__(self, config: BaseModelArgs, use_sdpa: bool = True):
617
- super().__init__()
618
- assert config.dim % config.n_head == 0
619
-
620
- total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
621
- # key, query, value projections for all heads, but in a batch
622
- self.wqkv = nn.Linear(
623
- config.dim, total_head_dim, bias=config.attention_qkv_bias
624
- )
625
- self.wo = nn.Linear(config.dim, config.dim, bias=False)
626
- self.kv_cache = None
627
-
628
- self.dropout = config.dropout
629
- self.n_head = config.n_head
630
- self.head_dim = config.head_dim
631
- self.n_local_heads = config.n_local_heads
632
- self.dim = config.dim
633
- self.use_sdpa = use_sdpa
634
- self._register_load_state_dict_pre_hook(self.load_hook)
635
-
636
- def load_hook(self, state_dict, prefix, *args):
637
- if prefix + "wq.weight" in state_dict:
638
- wq = state_dict.pop(prefix + "wq.weight")
639
- wk = state_dict.pop(prefix + "wk.weight")
640
- wv = state_dict.pop(prefix + "wv.weight")
641
- state_dict[prefix + "wqkv.weight"] = torch.cat([wq, wk, wv])
642
-
643
- def forward(
644
- self,
645
- x: Tensor,
646
- freqs_cis: Tensor,
647
- mask: Tensor,
648
- input_pos: Optional[Tensor] = None,
649
- ) -> Tensor:
650
- bsz, seqlen, _ = x.shape
651
-
652
- kv_size = self.n_local_heads * self.head_dim
653
- q, k, v = self.wqkv(x).split([self.dim, kv_size, kv_size], dim=-1)
654
-
655
- q = q.view(bsz, seqlen, self.n_head, self.head_dim)
656
- k = k.view(bsz, seqlen, self.n_local_heads, self.head_dim)
657
- v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)
658
-
659
- q = apply_rotary_emb(q, freqs_cis)
660
- k = apply_rotary_emb(k, freqs_cis)
661
-
662
- q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
663
-
664
- if self.kv_cache is not None:
665
- k, v = self.kv_cache.update(input_pos, k, v)
666
-
667
- k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
668
- v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
669
-
670
- if self.use_sdpa:
671
- if mask is None:
672
- with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
673
- y = F.scaled_dot_product_attention(
674
- q,
675
- k,
676
- v,
677
- dropout_p=self.dropout if self.training else 0.0,
678
- is_causal=True,
679
- # No attn_mask is passed here, so the FlashAttention backend can be used
680
- )
681
- else:
682
- y = F.scaled_dot_product_attention(
683
- q,
684
- k,
685
- v,
686
- attn_mask=mask,
687
- dropout_p=self.dropout if self.training else 0.0,
688
- )
689
- else:
690
- y = self.eq_scaled_dot_product_attention(
691
- q,
692
- k,
693
- v,
694
- attn_mask=mask,
695
- dropout_p=self.dropout if self.training else 0.0,
696
- )
697
-
698
- y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
699
-
700
- return self.wo(y)
701
-
702
- def eq_scaled_dot_product_attention(
703
- self,
704
- query,
705
- key,
706
- value,
707
- attn_mask=None,
708
- dropout_p=0.0,
709
- ) -> torch.Tensor:
710
- # This is a standard scaled dot product attention
711
- # It is less efficient, but it does not raise CUDA errors
712
-
713
- L, S = query.size(-2), key.size(-2)
714
- scale_factor = 1 / math.sqrt(query.size(-1))
715
- attn_bias = torch.zeros(1, 1, L, S, dtype=query.dtype, device=query.device)
716
-
717
- if attn_mask is not None:
718
- if attn_mask.dtype == torch.bool:
719
- attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
720
- else:
721
- attn_bias += attn_mask
722
-
723
- attn_weight = query @ key.transpose(-2, -1) * scale_factor
724
- attn_weight += attn_bias
725
- attn_weight = torch.softmax(attn_weight, dim=-1)
726
- attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
727
-
728
- return attn_weight @ value
729
-
730
-
731
- class FeedForward(nn.Module):
732
- def __init__(self, config: BaseModelArgs) -> None:
733
- super().__init__()
734
- self.w1 = nn.Linear(config.dim, config.intermediate_size, bias=False)
735
- self.w3 = nn.Linear(config.dim, config.intermediate_size, bias=False)
736
- self.w2 = nn.Linear(config.intermediate_size, config.dim, bias=False)
737
-
738
- def forward(self, x: Tensor) -> Tensor:
739
- return self.w2(F.silu(self.w1(x)) * self.w3(x))
740
-
741
-
742
- class RMSNorm(nn.Module):
743
- def __init__(self, dim: int, eps: float = 1e-5):
744
- super().__init__()
745
- self.eps = eps
746
- self.weight = nn.Parameter(torch.ones(dim))
747
-
748
- def _norm(self, x):
749
- return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
750
-
751
- def forward(self, x: Tensor) -> Tensor:
752
- output = self._norm(x.float()).type_as(x)
753
- return output * self.weight
754
-
755
-
756
- def precompute_freqs_cis(seq_len: int, n_elem: int, base: int = 10000) -> Tensor:
757
- freqs = 1.0 / (
758
- base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem)
759
- )
760
- t = torch.arange(seq_len, device=freqs.device)
761
- freqs = torch.outer(t, freqs)
762
- freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
763
- cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
764
- return cache.to(dtype=torch.bfloat16)
765
-
766
-
767
- def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
768
- xshaped = x.float().reshape(*x.shape[:-1], -1, 2)
769
- freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)
770
- x_out2 = torch.stack(
771
- [
772
- xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
773
- xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
774
- ],
775
- -1,
776
- )
777
-
778
- x_out2 = x_out2.flatten(3)
779
- return x_out2.type_as(x)
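The rotary-embedding helpers at the end of this file expect activations laid out as (batch, seq, heads, head_dim) and a cos/sin cache of shape (seq_len, head_dim // 2, 2). A minimal shape sketch follows, assuming the deleted llama.py were still importable; all sizes are illustrative.

# Shape sketch for precompute_freqs_cis / apply_rotary_emb (illustrative sizes only,
# assuming the module above is still importable).
import torch

from fish_speech.models.text2semantic.llama import apply_rotary_emb, precompute_freqs_cis

seq_len, n_heads, head_dim = 16, 8, 64

freqs_cis = precompute_freqs_cis(seq_len, head_dim)  # (16, 32, 2): bfloat16 cos/sin pairs
q = torch.randn(2, seq_len, n_heads, head_dim)       # (batch, seq, heads, head_dim)
q_rot = apply_rotary_emb(q, freqs_cis)               # same shape, rotated per position

print(q_rot.shape)  # torch.Size([2, 16, 8, 64])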
fish_speech/models/text2semantic/lora.py DELETED
@@ -1,92 +0,0 @@
- from dataclasses import dataclass
-
- import loralib as lora
-
-
- @dataclass
- class LoraConfig:
-     r: int
-     lora_alpha: float
-     lora_dropout: float = 0.0
-
-
- def setup_lora(model, lora_config):
-     # Replace the embedding layer with a LoRA layer
-     model.embeddings = lora.Embedding(
-         num_embeddings=model.embeddings.num_embeddings,
-         embedding_dim=model.embeddings.embedding_dim,
-         padding_idx=model.embeddings.padding_idx,
-         r=lora_config.r,
-         lora_alpha=lora_config.lora_alpha,
-     )
-
-     model.codebook_embeddings = lora.Embedding(
-         num_embeddings=model.codebook_embeddings.num_embeddings,
-         embedding_dim=model.codebook_embeddings.embedding_dim,
-         padding_idx=model.codebook_embeddings.padding_idx,
-         r=lora_config.r,
-         lora_alpha=lora_config.lora_alpha,
-     )
-
-     # Replace output layer with a LoRA layer
-     linears = [(model, "output")]
-
-     # Replace all linear layers with LoRA layers
-     for layer in model.layers:
-         linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
-         linears.extend(
-             [
-                 (layer.feed_forward, "w1"),
-                 (layer.feed_forward, "w2"),
-                 (layer.feed_forward, "w3"),
-             ]
-         )
-
-     if hasattr(model, "fast_layers"):
-         model.fast_embeddings = lora.Embedding(
-             num_embeddings=model.fast_embeddings.num_embeddings,
-             embedding_dim=model.fast_embeddings.embedding_dim,
-             padding_idx=model.fast_embeddings.padding_idx,
-             r=lora_config.r,
-             lora_alpha=lora_config.lora_alpha,
-         )
-
-         # Dual-AR model
-         linears.append((model, "fast_output"))
-
-         for layer in model.fast_layers:
-             linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
-             linears.extend(
-                 [
-                     (layer.feed_forward, "w1"),
-                     (layer.feed_forward, "w2"),
-                     (layer.feed_forward, "w3"),
-                 ]
-             )
-
-     for module, layer in linears:
-         updated_linear = lora.Linear(
-             in_features=getattr(module, layer).in_features,
-             out_features=getattr(module, layer).out_features,
-             bias=getattr(module, layer).bias,
-             r=lora_config.r,
-             lora_alpha=lora_config.lora_alpha,
-             lora_dropout=lora_config.lora_dropout,
-         )
-         setattr(module, layer, updated_linear)
-
-     # Mark only the LoRA layers as trainable
-     lora.mark_only_lora_as_trainable(model, bias="none")
-
-
- def get_merged_state_dict(model):
-     # This line will merge the state dict of the model and the LoRA parameters
-     model.eval()
-
-     # Then we need to remove the LoRA parameters from the state dict
-     state_dict = model.state_dict()
-     for name in list(state_dict.keys()):
-         if "lora" in name:
-             state_dict.pop(name)
-
-     return state_dict
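setup_lora expects a text2semantic-style model whose projections are named wqkv/wo and w1/w2/w3. Below is a self-contained sketch of the fine-tune-then-merge flow on a tiny stand-in module; the stand-in, its sizes, and the output filename are hypothetical, and it assumes loralib is installed and the deleted lora.py is importable.

# Sketch of the LoRA flow above on a hypothetical stand-in model (not the real
# text2semantic architecture). Assumes loralib and the deleted lora.py are available.
import torch
from torch import nn

from fish_speech.models.text2semantic.lora import (
    LoraConfig,
    get_merged_state_dict,
    setup_lora,
)


class TinyLayer(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.attention = nn.Module()
        self.attention.wqkv = nn.Linear(dim, 3 * dim, bias=False)
        self.attention.wo = nn.Linear(dim, dim, bias=False)
        self.feed_forward = nn.Module()
        self.feed_forward.w1 = nn.Linear(dim, 4 * dim, bias=False)
        self.feed_forward.w2 = nn.Linear(4 * dim, dim, bias=False)
        self.feed_forward.w3 = nn.Linear(dim, 4 * dim, bias=False)


class TinyModel(nn.Module):
    def __init__(self, vocab: int = 32, dim: int = 16):
        super().__init__()
        self.embeddings = nn.Embedding(vocab, dim)
        self.codebook_embeddings = nn.Embedding(vocab, dim)
        self.layers = nn.ModuleList(TinyLayer(dim) for _ in range(2))
        self.output = nn.Linear(dim, vocab, bias=False)


model = TinyModel()
setup_lora(model, LoraConfig(r=8, lora_alpha=16))  # swap Embedding/Linear for LoRA versions

# After setup_lora, only the injected LoRA matrices remain trainable.
trainable = [name for name, p in model.named_parameters() if p.requires_grad]
assert all("lora_" in name for name in trainable)

# model.eval() (inside get_merged_state_dict) folds the LoRA deltas back into the base
# weights, and the lora_* entries are dropped so the checkpoint matches the original model.
torch.save(get_merged_state_dict(model), "model.lora-merged.pth")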
fish_speech/models/vqgan/__init__.py DELETED
File without changes
fish_speech/models/vqgan/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (171 Bytes)
 
fish_speech/models/vqgan/modules/__pycache__/firefly.cpython-310.pyc DELETED
Binary file (18.3 kB)
 
fish_speech/models/vqgan/modules/__pycache__/fsq.cpython-310.pyc DELETED
Binary file (3.72 kB)
 
fish_speech/models/vqgan/modules/firefly.py DELETED
@@ -1,596 +0,0 @@
1
- import math
2
- from functools import partial
3
- from math import prod
4
- from typing import Callable
5
-
6
- import torch
7
- import torch.nn.functional as F
8
- from torch import nn
9
- from torch.nn.utils.parametrizations import weight_norm
10
- from torch.nn.utils.parametrize import remove_parametrizations
11
- from torch.utils.checkpoint import checkpoint
12
-
13
-
14
- def sequence_mask(length, max_length=None):
15
- if max_length is None:
16
- max_length = length.max()
17
- x = torch.arange(max_length, dtype=length.dtype, device=length.device)
18
- return x.unsqueeze(0) < length.unsqueeze(1)
19
-
20
-
21
- def init_weights(m, mean=0.0, std=0.01):
22
- classname = m.__class__.__name__
23
- if classname.find("Conv1D") != -1:
24
- m.weight.data.normal_(mean, std)
25
-
26
-
27
- def get_padding(kernel_size, dilation=1):
28
- return (kernel_size * dilation - dilation) // 2
29
-
30
-
31
- def unpad1d(x: torch.Tensor, paddings: tuple[int, int]):
32
- """Remove padding from x, handling properly zero padding. Only for 1d!"""
33
- padding_left, padding_right = paddings
34
- assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
35
- assert (padding_left + padding_right) <= x.shape[-1]
36
- end = x.shape[-1] - padding_right
37
- return x[..., padding_left:end]
38
-
39
-
40
- def get_extra_padding_for_conv1d(
41
- x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
42
- ) -> int:
43
- """See `pad_for_conv1d`."""
44
- length = x.shape[-1]
45
- n_frames = (length - kernel_size + padding_total) / stride + 1
46
- ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
47
- return ideal_length - length
48
-
49
-
50
- def pad1d(
51
- x: torch.Tensor,
52
- paddings: tuple[int, int],
53
- mode: str = "zeros",
54
- value: float = 0.0,
55
- ):
56
- """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
57
- If this is the case, we insert extra 0 padding to the right
58
- before the reflection happens.
59
- """
60
- length = x.shape[-1]
61
- padding_left, padding_right = paddings
62
- assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
63
- if mode == "reflect":
64
- max_pad = max(padding_left, padding_right)
65
- extra_pad = 0
66
- if length <= max_pad:
67
- extra_pad = max_pad - length + 1
68
- x = F.pad(x, (0, extra_pad))
69
- padded = F.pad(x, paddings, mode, value)
70
- end = padded.shape[-1] - extra_pad
71
- return padded[..., :end]
72
- else:
73
- return F.pad(x, paddings, mode, value)
74
-
75
-
76
- class FishConvNet(nn.Module):
77
- def __init__(
78
- self, in_channels, out_channels, kernel_size, dilation=1, stride=1, groups=1
79
- ):
80
- super(FishConvNet, self).__init__()
81
- self.conv = nn.Conv1d(
82
- in_channels,
83
- out_channels,
84
- kernel_size,
85
- stride=stride,
86
- dilation=dilation,
87
- groups=groups,
88
- )
89
- self.stride = stride
90
- self.kernel_size = (kernel_size - 1) * dilation + 1
91
- self.dilation = dilation
92
-
93
- def forward(self, x):
94
- pad = self.kernel_size - self.stride
95
- extra_padding = get_extra_padding_for_conv1d(
96
- x, self.kernel_size, self.stride, pad
97
- )
98
- x = pad1d(x, (pad, extra_padding), mode="constant", value=0)
99
- return self.conv(x).contiguous()
100
-
101
- def weight_norm(self, name="weight", dim=0):
102
- self.conv = weight_norm(self.conv, name=name, dim=dim)
103
- return self
104
-
105
- def remove_weight_norm(self):
106
- self.conv = remove_parametrizations(self.conv)
107
- return self
108
-
109
-
110
- class FishTransConvNet(nn.Module):
111
- def __init__(self, in_channels, out_channels, kernel_size, dilation=1, stride=1):
112
- super(FishTransConvNet, self).__init__()
113
- self.conv = nn.ConvTranspose1d(
114
- in_channels, out_channels, kernel_size, stride=stride, dilation=dilation
115
- )
116
- self.stride = stride
117
- self.kernel_size = kernel_size
118
-
119
- def forward(self, x):
120
- x = self.conv(x)
121
- pad = self.kernel_size - self.stride
122
- padding_right = math.ceil(pad)
123
- padding_left = pad - padding_right
124
- x = unpad1d(x, (padding_left, padding_right))
125
- return x.contiguous()
126
-
127
- def weight_norm(self, name="weight", dim=0):
128
- self.conv = weight_norm(self.conv, name=name, dim=dim)
129
- return self
130
-
131
- def remove_weight_norm(self):
132
- self.conv = remove_parametrizations(self.conv)
133
- return self
134
-
135
-
136
- class ResBlock1(torch.nn.Module):
137
- def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
138
- super().__init__()
139
-
140
- self.convs1 = nn.ModuleList(
141
- [
142
- FishConvNet(
143
- channels, channels, kernel_size, stride=1, dilation=dilation[0]
144
- ).weight_norm(),
145
- FishConvNet(
146
- channels, channels, kernel_size, stride=1, dilation=dilation[1]
147
- ).weight_norm(),
148
- FishConvNet(
149
- channels, channels, kernel_size, stride=1, dilation=dilation[2]
150
- ).weight_norm(),
151
- ]
152
- )
153
- self.convs1.apply(init_weights)
154
-
155
- self.convs2 = nn.ModuleList(
156
- [
157
- FishConvNet(
158
- channels, channels, kernel_size, stride=1, dilation=dilation[0]
159
- ).weight_norm(),
160
- FishConvNet(
161
- channels, channels, kernel_size, stride=1, dilation=dilation[1]
162
- ).weight_norm(),
163
- FishConvNet(
164
- channels, channels, kernel_size, stride=1, dilation=dilation[2]
165
- ).weight_norm(),
166
- ]
167
- )
168
- self.convs2.apply(init_weights)
169
-
170
- def forward(self, x):
171
- for c1, c2 in zip(self.convs1, self.convs2):
172
- xt = F.silu(x)
173
- xt = c1(xt)
174
- xt = F.silu(xt)
175
- xt = c2(xt)
176
- x = xt + x
177
- return x
178
-
179
- def remove_parametrizations(self):
180
- for conv in self.convs1:
181
- remove_parametrizations(conv, tensor_name="weight")
182
- for conv in self.convs2:
183
- remove_parametrizations(conv, tensor_name="weight")
184
-
185
-
186
- class ParallelBlock(nn.Module):
187
- def __init__(
188
- self,
189
- channels: int,
190
- kernel_sizes: tuple[int] = (3, 7, 11),
191
- dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
192
- ):
193
- super().__init__()
194
-
195
- assert len(kernel_sizes) == len(dilation_sizes)
196
-
197
- self.blocks = nn.ModuleList()
198
- for k, d in zip(kernel_sizes, dilation_sizes):
199
- self.blocks.append(ResBlock1(channels, k, d))
200
-
201
- def forward(self, x):
202
- return torch.stack([block(x) for block in self.blocks], dim=0).mean(dim=0)
203
-
204
- def remove_parametrizations(self):
205
- for block in self.blocks:
206
- block.remove_parametrizations()
207
-
208
-
209
- class HiFiGANGenerator(nn.Module):
210
- def __init__(
211
- self,
212
- *,
213
- hop_length: int = 512,
214
- upsample_rates: tuple[int] = (8, 8, 2, 2, 2),
215
- upsample_kernel_sizes: tuple[int] = (16, 16, 8, 2, 2),
216
- resblock_kernel_sizes: tuple[int] = (3, 7, 11),
217
- resblock_dilation_sizes: tuple[tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
218
- num_mels: int = 128,
219
- upsample_initial_channel: int = 512,
220
- pre_conv_kernel_size: int = 7,
221
- post_conv_kernel_size: int = 7,
222
- post_activation: Callable = partial(nn.SiLU, inplace=True),
223
- ):
224
- super().__init__()
225
-
226
- assert (
227
- prod(upsample_rates) == hop_length
228
- ), f"hop_length must be {prod(upsample_rates)}"
229
-
230
- self.conv_pre = FishConvNet(
231
- num_mels,
232
- upsample_initial_channel,
233
- pre_conv_kernel_size,
234
- stride=1,
235
- ).weight_norm()
236
-
237
- self.num_upsamples = len(upsample_rates)
238
- self.num_kernels = len(resblock_kernel_sizes)
239
-
240
- self.noise_convs = nn.ModuleList()
241
- self.ups = nn.ModuleList()
242
-
243
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
244
- self.ups.append(
245
- FishTransConvNet(
246
- upsample_initial_channel // (2**i),
247
- upsample_initial_channel // (2 ** (i + 1)),
248
- k,
249
- stride=u,
250
- ).weight_norm()
251
- )
252
-
253
- self.resblocks = nn.ModuleList()
254
- for i in range(len(self.ups)):
255
- ch = upsample_initial_channel // (2 ** (i + 1))
256
- self.resblocks.append(
257
- ParallelBlock(ch, resblock_kernel_sizes, resblock_dilation_sizes)
258
- )
259
-
260
- self.activation_post = post_activation()
261
- self.conv_post = FishConvNet(
262
- ch, 1, post_conv_kernel_size, stride=1
263
- ).weight_norm()
264
- self.ups.apply(init_weights)
265
- self.conv_post.apply(init_weights)
266
-
267
- def forward(self, x):
268
- x = self.conv_pre(x)
269
-
270
- for i in range(self.num_upsamples):
271
- x = F.silu(x, inplace=True)
272
- x = self.ups[i](x)
273
-
274
- if self.training and self.checkpointing:
275
- x = checkpoint(
276
- self.resblocks[i],
277
- x,
278
- use_reentrant=False,
279
- )
280
- else:
281
- x = self.resblocks[i](x)
282
-
283
- x = self.activation_post(x)
284
- x = self.conv_post(x)
285
- x = torch.tanh(x)
286
-
287
- return x
288
-
289
- def remove_parametrizations(self):
290
- for up in self.ups:
291
- remove_parametrizations(up, tensor_name="weight")
292
- for block in self.resblocks:
293
- block.remove_parametrizations()
294
- remove_parametrizations(self.conv_pre, tensor_name="weight")
295
- remove_parametrizations(self.conv_post, tensor_name="weight")
296
-
297
-
298
- # DropPath copied from timm library
299
- def drop_path(
300
- x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
301
- ):
302
- """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
303
-
304
- This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
305
- the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
306
- See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
307
- changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
308
- 'survival rate' as the argument.
309
-
310
- """ # noqa: E501
311
-
312
- if drop_prob == 0.0 or not training:
313
- return x
314
- keep_prob = 1 - drop_prob
315
- shape = (x.shape[0],) + (1,) * (
316
- x.ndim - 1
317
- ) # work with diff dim tensors, not just 2D ConvNets
318
- random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
319
- if keep_prob > 0.0 and scale_by_keep:
320
- random_tensor.div_(keep_prob)
321
- return x * random_tensor
322
-
323
-
324
- class DropPath(nn.Module):
325
- """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" # noqa: E501
326
-
327
- def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
328
- super(DropPath, self).__init__()
329
- self.drop_prob = drop_prob
330
- self.scale_by_keep = scale_by_keep
331
-
332
- def forward(self, x):
333
- return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
334
-
335
- def extra_repr(self):
336
- return f"drop_prob={round(self.drop_prob,3):0.3f}"
337
-
338
-
339
- class LayerNorm(nn.Module):
340
- r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
341
- The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
342
- shape (batch_size, height, width, channels) while channels_first corresponds to inputs
343
- with shape (batch_size, channels, height, width).
344
- """ # noqa: E501
345
-
346
- def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
347
- super().__init__()
348
- self.weight = nn.Parameter(torch.ones(normalized_shape))
349
- self.bias = nn.Parameter(torch.zeros(normalized_shape))
350
- self.eps = eps
351
- self.data_format = data_format
352
- if self.data_format not in ["channels_last", "channels_first"]:
353
- raise NotImplementedError
354
- self.normalized_shape = (normalized_shape,)
355
-
356
- def forward(self, x):
357
- if self.data_format == "channels_last":
358
- return F.layer_norm(
359
- x, self.normalized_shape, self.weight, self.bias, self.eps
360
- )
361
- elif self.data_format == "channels_first":
362
- u = x.mean(1, keepdim=True)
363
- s = (x - u).pow(2).mean(1, keepdim=True)
364
- x = (x - u) / torch.sqrt(s + self.eps)
365
- x = self.weight[:, None] * x + self.bias[:, None]
366
- return x
367
-
368
-
369
- # ConvNeXt Block copied from https://github.com/fishaudio/fish-diffusion/blob/main/fish_diffusion/modules/convnext.py
370
- class ConvNeXtBlock(nn.Module):
371
- r"""ConvNeXt Block. There are two equivalent implementations:
372
- (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
373
- (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
374
- We use (2) as we find it slightly faster in PyTorch
375
-
376
- Args:
377
- dim (int): Number of input channels.
378
- drop_path (float): Stochastic depth rate. Default: 0.0
379
- layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
380
- mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
381
- kernel_size (int): Kernel size for depthwise conv. Default: 7.
382
- dilation (int): Dilation for depthwise conv. Default: 1.
383
- """ # noqa: E501
384
-
385
- def __init__(
386
- self,
387
- dim: int,
388
- drop_path: float = 0.0,
389
- layer_scale_init_value: float = 1e-6,
390
- mlp_ratio: float = 4.0,
391
- kernel_size: int = 7,
392
- dilation: int = 1,
393
- ):
394
- super().__init__()
395
-
396
- self.dwconv = FishConvNet(
397
- dim,
398
- dim,
399
- kernel_size=kernel_size,
400
- # padding=int(dilation * (kernel_size - 1) / 2),
401
- groups=dim,
402
- ) # depthwise conv
403
- self.norm = LayerNorm(dim, eps=1e-6)
404
- self.pwconv1 = nn.Linear(
405
- dim, int(mlp_ratio * dim)
406
- ) # pointwise/1x1 convs, implemented with linear layers
407
- self.act = nn.GELU()
408
- self.pwconv2 = nn.Linear(int(mlp_ratio * dim), dim)
409
- self.gamma = (
410
- nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
411
- if layer_scale_init_value > 0
412
- else None
413
- )
414
- self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
415
-
416
- def forward(self, x, apply_residual: bool = True):
417
- input = x
418
-
419
- x = self.dwconv(x)
420
- x = x.permute(0, 2, 1) # (N, C, L) -> (N, L, C)
421
- x = self.norm(x)
422
- x = self.pwconv1(x)
423
- x = self.act(x)
424
- x = self.pwconv2(x)
425
-
426
- if self.gamma is not None:
427
- x = self.gamma * x
428
-
429
- x = x.permute(0, 2, 1) # (N, L, C) -> (N, C, L)
430
- x = self.drop_path(x)
431
-
432
- if apply_residual:
433
- x = input + x
434
-
435
- return x
436
-
437
-
438
- class ConvNeXtEncoder(nn.Module):
439
- def __init__(
440
- self,
441
- input_channels: int = 3,
442
- depths: list[int] = [3, 3, 9, 3],
443
- dims: list[int] = [96, 192, 384, 768],
444
- drop_path_rate: float = 0.0,
445
- layer_scale_init_value: float = 1e-6,
446
- kernel_size: int = 7,
447
- ):
448
- super().__init__()
449
- assert len(depths) == len(dims)
450
-
451
- self.downsample_layers = nn.ModuleList()
452
- stem = nn.Sequential(
453
- FishConvNet(
454
- input_channels,
455
- dims[0],
456
- kernel_size=7,
457
- # padding=3,
458
- # padding_mode="replicate",
459
- # padding_mode="zeros",
460
- ),
461
- LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
462
- )
463
- self.downsample_layers.append(stem)
464
-
465
- for i in range(len(depths) - 1):
466
- mid_layer = nn.Sequential(
467
- LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
468
- nn.Conv1d(dims[i], dims[i + 1], kernel_size=1),
469
- )
470
- self.downsample_layers.append(mid_layer)
471
-
472
- self.stages = nn.ModuleList()
473
- dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
474
-
475
- cur = 0
476
- for i in range(len(depths)):
477
- stage = nn.Sequential(
478
- *[
479
- ConvNeXtBlock(
480
- dim=dims[i],
481
- drop_path=dp_rates[cur + j],
482
- layer_scale_init_value=layer_scale_init_value,
483
- kernel_size=kernel_size,
484
- )
485
- for j in range(depths[i])
486
- ]
487
- )
488
- self.stages.append(stage)
489
- cur += depths[i]
490
-
491
- self.norm = LayerNorm(dims[-1], eps=1e-6, data_format="channels_first")
492
- self.apply(self._init_weights)
493
-
494
- def _init_weights(self, m):
495
- if isinstance(m, (nn.Conv1d, nn.Linear)):
496
- nn.init.trunc_normal_(m.weight, std=0.02)
497
- nn.init.constant_(m.bias, 0)
498
-
499
- def forward(
500
- self,
501
- x: torch.Tensor,
502
- ) -> torch.Tensor:
503
- for i in range(len(self.downsample_layers)):
504
- x = self.downsample_layers[i](x)
505
- x = self.stages[i](x)
506
-
507
- return self.norm(x)
508
-
509
-
510
- class FireflyArchitecture(nn.Module):
511
- def __init__(
512
- self,
513
- backbone: nn.Module,
514
- head: nn.Module,
515
- quantizer: nn.Module,
516
- spec_transform: nn.Module,
517
- ):
518
- super().__init__()
519
-
520
- self.backbone = backbone
521
- self.head = head
522
- self.quantizer = quantizer
523
- self.spec_transform = spec_transform
524
- self.downsample_factor = math.prod(self.quantizer.downsample_factor)
525
-
526
- def forward(self, x: torch.Tensor, template=None, mask=None) -> torch.Tensor:
527
- if self.spec_transform is not None:
528
- x = self.spec_transform(x)
529
-
530
- x = self.backbone(x)
531
- if mask is not None:
532
- x = x * mask
533
-
534
- if self.quantizer is not None:
535
- vq_result = self.quantizer(x)
536
- x = vq_result.z
537
-
538
- if mask is not None:
539
- x = x * mask
540
-
541
- x = self.head(x, template=template)
542
-
543
- if x.ndim == 2:
544
- x = x[:, None, :]
545
-
546
- if self.quantizer is not None:
547
- return x, vq_result
548
-
549
- return x
550
-
551
- def encode(self, audios, audio_lengths):
552
- audios = audios.float()
553
-
554
- mels = self.spec_transform(audios)
555
- mel_lengths = audio_lengths // self.spec_transform.hop_length
556
- mel_masks = sequence_mask(mel_lengths, mels.shape[2])
557
- mel_masks_float_conv = mel_masks[:, None, :].float()
558
- mels = mels * mel_masks_float_conv
559
-
560
- # Encode
561
- encoded_features = self.backbone(mels) * mel_masks_float_conv
562
- feature_lengths = mel_lengths // self.downsample_factor
563
-
564
- return self.quantizer.encode(encoded_features), feature_lengths
565
-
566
- def decode(self, indices, feature_lengths) -> torch.Tensor:
567
- mel_masks = sequence_mask(
568
- feature_lengths * self.downsample_factor,
569
- indices.shape[2] * self.downsample_factor,
570
- )
571
- mel_masks_float_conv = mel_masks[:, None, :].float()
572
- audio_lengths = (
573
- feature_lengths * self.downsample_factor * self.spec_transform.hop_length
574
- )
575
-
576
- audio_masks = sequence_mask(
577
- audio_lengths,
578
- indices.shape[2] * self.downsample_factor * self.spec_transform.hop_length,
579
- )
580
- audio_masks_float_conv = audio_masks[:, None, :].float()
581
-
582
- z = self.quantizer.decode(indices) * mel_masks_float_conv
583
- x = self.head(z) * audio_masks_float_conv
584
-
585
- return x, audio_lengths
586
-
587
- def remove_parametrizations(self):
588
- if hasattr(self.backbone, "remove_parametrizations"):
589
- self.backbone.remove_parametrizations()
590
-
591
- if hasattr(self.head, "remove_parametrizations"):
592
- self.head.remove_parametrizations()
593
-
594
- @property
595
- def device(self):
596
- return next(self.parameters()).device
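The convolution wrappers defined near the top of this file (FishConvNet / FishTransConvNet) pad and trim so that sequence lengths stay aligned with the stride: the forward conv left-pads so its output has ceil(length / stride) frames, and the transposed conv trims back to length * stride. A quick shape check, assuming the deleted firefly.py were still importable; channel counts and lengths are illustrative.

# Length-preservation check for the padded conv wrappers above (illustrative sizes,
# assuming the deleted firefly.py is importable).
import torch

from fish_speech.models.vqgan.modules.firefly import FishConvNet, FishTransConvNet

x = torch.randn(1, 4, 100)                        # (batch, channels, time)

down = FishConvNet(4, 8, kernel_size=7, stride=2)
y = down(x)
print(y.shape)                                    # torch.Size([1, 8, 50])  == 100 / stride

up = FishTransConvNet(8, 4, kernel_size=4, stride=2)
z = up(y)
print(z.shape)                                    # torch.Size([1, 4, 100]) == 50 * stride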
fish_speech/models/vqgan/modules/fsq.py DELETED
@@ -1,116 +0,0 @@
- from dataclasses import dataclass
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- from einops import rearrange
- from vector_quantize_pytorch import GroupedResidualFSQ
-
- from .firefly import ConvNeXtBlock, FishConvNet, FishTransConvNet
-
-
- @dataclass
- class FSQResult:
-     z: torch.Tensor
-     codes: torch.Tensor
-     latents: torch.Tensor
-
-
- class DownsampleFiniteScalarQuantize(nn.Module):
-     def __init__(
-         self,
-         input_dim: int = 512,
-         n_codebooks: int = 9,
-         n_groups: int = 1,
-         levels: tuple[int] = (8, 5, 5, 5),  # Approximate 2**10
-         downsample_factor: tuple[int] = (2, 2),
-         downsample_dims: tuple[int] | None = None,
-     ):
-         super().__init__()
-
-         if downsample_dims is None:
-             downsample_dims = [input_dim for _ in range(len(downsample_factor))]
-
-         all_dims = (input_dim,) + tuple(downsample_dims)
-
-         self.residual_fsq = GroupedResidualFSQ(
-             dim=all_dims[-1],
-             levels=levels,
-             num_quantizers=n_codebooks,
-             groups=n_groups,
-         )
-
-         self.downsample_factor = downsample_factor
-         self.downsample_dims = downsample_dims
-
-         self.downsample = nn.Sequential(
-             *[
-                 nn.Sequential(
-                     FishConvNet(
-                         all_dims[idx],
-                         all_dims[idx + 1],
-                         kernel_size=factor,
-                         stride=factor,
-                     ),
-                     ConvNeXtBlock(dim=all_dims[idx + 1]),
-                 )
-                 for idx, factor in enumerate(downsample_factor)
-             ]
-         )
-
-         self.upsample = nn.Sequential(
-             *[
-                 nn.Sequential(
-                     FishTransConvNet(
-                         all_dims[idx + 1],
-                         all_dims[idx],
-                         kernel_size=factor,
-                         stride=factor,
-                     ),
-                     ConvNeXtBlock(dim=all_dims[idx]),
-                 )
-                 for idx, factor in reversed(list(enumerate(downsample_factor)))
-             ]
-         )
-
-         self.apply(self._init_weights)
-
-     def _init_weights(self, m):
-         if isinstance(m, (nn.Conv1d, nn.Linear)):
-             nn.init.trunc_normal_(m.weight, std=0.02)
-             nn.init.constant_(m.bias, 0)
-
-     def forward(self, z) -> FSQResult:
-         original_shape = z.shape
-         z = self.downsample(z)
-         quantized, indices = self.residual_fsq(z.mT)
-         result = FSQResult(
-             z=quantized.mT,
-             codes=indices.mT,
-             latents=z,
-         )
-         result.z = self.upsample(result.z)
-
-         # Pad or crop z to match original shape
-         diff = original_shape[-1] - result.z.shape[-1]
-         left = diff // 2
-         right = diff - left
-
-         if diff > 0:
-             result.z = F.pad(result.z, (left, right))
-         elif diff < 0:
-             result.z = result.z[..., left:-right]
-
-         return result
-
-     def encode(self, z):
-         z = self.downsample(z)
-         _, indices = self.residual_fsq(z.mT)
-         indices = rearrange(indices, "g b l r -> b (g r) l")
-         return indices
-
-     def decode(self, indices: torch.Tensor):
-         indices = rearrange(indices, "b (g r) l -> g b l r", g=self.residual_fsq.groups)
-         z_q = self.residual_fsq.get_output_from_indices(indices)
-         z_q = self.upsample(z_q.mT)
-         return z_q
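The quantizer above downsamples features by prod(downsample_factor) before applying grouped residual FSQ, so with the defaults a (batch, 512, T) feature map becomes a (batch, n_codebooks, T / 4) index tensor and decodes back to the original resolution. A small round-trip sketch, assuming vector_quantize_pytorch is installed and the deleted fsq.py/firefly.py are importable; all sizes are illustrative.

# Round-trip sketch for DownsampleFiniteScalarQuantize (illustrative sizes, assuming
# the deleted modules and vector_quantize_pytorch are available).
import torch

from fish_speech.models.vqgan.modules.fsq import DownsampleFiniteScalarQuantize

quantizer = DownsampleFiniteScalarQuantize(input_dim=512, n_codebooks=9)

features = torch.randn(1, 512, 80)       # (batch, channels, frames)
indices = quantizer.encode(features)     # (1, 9, 20): two 2x downsamples -> 80 / 4 frames
recovered = quantizer.decode(indices)    # (1, 512, 80): upsampled back to input resolution

result = quantizer(features)             # FSQResult with .z, .codes, .latents
assert result.z.shape == features.shape  # forward() pads/crops back to the original length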
fish_speech/models/vqgan/utils.py DELETED
@@ -1,94 +0,0 @@
- import matplotlib
- import torch
- from matplotlib import pyplot as plt
-
- matplotlib.use("Agg")
-
-
- def convert_pad_shape(pad_shape):
-     l = pad_shape[::-1]
-     pad_shape = [item for sublist in l for item in sublist]
-     return pad_shape
-
-
- def sequence_mask(length, max_length=None):
-     if max_length is None:
-         max_length = length.max()
-     x = torch.arange(max_length, dtype=length.dtype, device=length.device)
-     return x.unsqueeze(0) < length.unsqueeze(1)
-
-
- def init_weights(m, mean=0.0, std=0.01):
-     classname = m.__class__.__name__
-     if classname.find("Conv") != -1:
-         m.weight.data.normal_(mean, std)
-
-
- def get_padding(kernel_size, dilation=1):
-     return int((kernel_size * dilation - dilation) / 2)
-
-
- def plot_mel(data, titles=None):
-     fig, axes = plt.subplots(len(data), 1, squeeze=False)
-
-     if titles is None:
-         titles = [None for i in range(len(data))]
-
-     plt.tight_layout()
-
-     for i in range(len(data)):
-         mel = data[i]
-
-         if isinstance(mel, torch.Tensor):
-             mel = mel.float().detach().cpu().numpy()
-
-         axes[i][0].imshow(mel, origin="lower")
-         axes[i][0].set_aspect(2.5, adjustable="box")
-         axes[i][0].set_ylim(0, mel.shape[0])
-         axes[i][0].set_title(titles[i], fontsize="medium")
-         axes[i][0].tick_params(labelsize="x-small", left=False, labelleft=False)
-         axes[i][0].set_anchor("W")
-
-     return fig
-
-
- def slice_segments(x, ids_str, segment_size=4):
-     ret = torch.zeros_like(x[:, :, :segment_size])
-     for i in range(x.size(0)):
-         idx_str = ids_str[i]
-         idx_end = idx_str + segment_size
-         ret[i] = x[i, :, idx_str:idx_end]
-
-     return ret
-
-
- def rand_slice_segments(x, x_lengths=None, segment_size=4):
-     b, d, t = x.size()
-     if x_lengths is None:
-         x_lengths = t
-     ids_str_max = torch.clamp(x_lengths - segment_size + 1, min=0)
-     ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
-     ret = slice_segments(x, ids_str, segment_size)
-     return ret, ids_str
-
-
- @torch.jit.script
- def fused_add_tanh_sigmoid_multiply(in_act, n_channels):
-     n_channels_int = n_channels[0]
-     t_act = torch.tanh(in_act[:, :n_channels_int, :])
-     s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
-     acts = t_act * s_act
-
-     return acts
-
-
- def avg_with_mask(x, mask):
-     assert mask.dtype == torch.float, "Mask should be float"
-
-     if mask.ndim == 2:
-         mask = mask.unsqueeze(1)
-
-     if mask.shape[1] == 1:
-         mask = mask.expand_as(x)
-
-     return (x * mask).sum() / mask.sum()
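Most of these helpers are shape-sensitive (masks are (batch, time), features are (batch, channels, time)). A short usage sketch, assuming the deleted utils.py were still importable; the tensors are illustrative.

# Usage sketch for the masking/slicing helpers above (illustrative tensors, assuming
# the deleted utils.py is importable).
import torch

from fish_speech.models.vqgan.utils import avg_with_mask, rand_slice_segments, sequence_mask

lengths = torch.tensor([3, 5])
mask = sequence_mask(lengths, max_length=6)  # (2, 6) bool, True inside each sequence
x = torch.randn(2, 8, 6)                     # (batch, channels, time)

mean = avg_with_mask(x, mask.float())        # masked global mean; the mask must be float

segments, start_ids = rand_slice_segments(x, lengths, segment_size=2)
print(segments.shape)                        # torch.Size([2, 8, 2])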
fish_speech/scheduler.py DELETED
@@ -1,40 +0,0 @@
- import math
-
-
- def get_cosine_schedule_with_warmup_lr_lambda(
-     current_step: int,
-     *,
-     num_warmup_steps: int | float,
-     num_training_steps: int,
-     num_cycles: float = 0.5,
-     final_lr_ratio: float = 0.0,
- ):
-     if 0 < num_warmup_steps < 1:  # float mode
-         num_warmup_steps = int(num_warmup_steps * num_training_steps)
-
-     if current_step < num_warmup_steps:
-         return float(current_step) / float(max(1, num_warmup_steps))
-
-     progress = float(current_step - num_warmup_steps) / float(
-         max(1, num_training_steps - num_warmup_steps)
-     )
-
-     return max(
-         final_lr_ratio,
-         0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)),
-     )
-
-
- def get_constant_schedule_with_warmup_lr_lambda(
-     current_step: int,
-     *,
-     num_warmup_steps: int | float,
-     num_training_steps: int | None = None,
- ):
-     if 0 < num_warmup_steps < 1:  # float mode
-         num_warmup_steps = int(num_warmup_steps * num_training_steps)
-
-     if current_step < num_warmup_steps:
-         return float(current_step) / float(max(1, num_warmup_steps))
-
-     return 1.0
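Both functions return a multiplier for the base learning rate, so they are meant to be bound with functools.partial and handed to torch.optim.lr_scheduler.LambdaLR. A minimal wiring sketch, assuming the deleted fish_speech.scheduler module were still importable; the optimizer and all hyperparameter values are illustrative.

# Wiring sketch: cosine schedule with warmup via LambdaLR (illustrative values,
# assuming the deleted scheduler module is importable).
from functools import partial

import torch
from torch.optim.lr_scheduler import LambdaLR

from fish_speech.scheduler import get_cosine_schedule_with_warmup_lr_lambda

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

lr_lambda = partial(
    get_cosine_schedule_with_warmup_lr_lambda,
    num_warmup_steps=0.01,     # a float in (0, 1) is treated as a fraction of total steps
    num_training_steps=10_000,
    final_lr_ratio=0.1,        # never decay below 10% of the base learning rate
)
scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)

for step in range(3):
    optimizer.step()
    scheduler.step()           # base_lr * lr_lambda(current_step)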
fish_speech/text/__init__.py DELETED
@@ -1,4 +0,0 @@
- from .clean import clean_text
- from .spliter import split_text
-
- __all__ = ["clean_text", "split_text"]
fish_speech/text/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (274 Bytes)
 
fish_speech/text/__pycache__/clean.cpython-310.pyc DELETED
Binary file (840 Bytes)