Delete xttsv2_gpt2

Browse files

Files changed (8) hide show

xttsv2_gpt2/config.json +0 -44
xttsv2_gpt2/gpt2_model.safetensors +0 -3
xttsv2_gpt2/gpt_config.py +0 -143
xttsv2_gpt2/special_tokens_map.json +0 -6
xttsv2_gpt2/tokenizer.json +0 -0
xttsv2_gpt2/tokenizer.py +0 -887
xttsv2_gpt2/tokenizer_config.json +0 -192
xttsv2_gpt2/xtts2_gpt_modeling.py +0 -505

xttsv2_gpt2/config.json DELETED Viewed

@@ -1,44 +0,0 @@
-{
-  "activation_function": "gelu",
-  "architectures": [
-    "XttsGPT"
-  ],
-  "attn_pdrop": 0.1,
-  "audio_config": {
-    "mel_channels": 80,
-    "output_sample_rate": 24000,
-    "sample_rate": 22050
-  },
-  "auto_map": {
-    "AutoConfig": "AstraMindAI/xtts2-gpt--gpt_config.XTTSGPTConfig",
-    "AutoModelForCausalLM": "AstraMindAI/xtts2-gpt--xtts2_gpt_modeling.XttsGPT",
-    "AutoTokenizer": "AstraMindAI/xtts2-gpt--tokenizer.XTTSTokenizerFast"
-  },
-  "decoder_input_dim": 1024,
-  "enable_redaction": false,
-  "gpt_batch_size": 1,
-  "gpt_max_audio_tokens": 605,
-  "hidden_size": 1024,
-  "initializer_range": 0.02,
-  "kv_cache": true,
-  "layer_norm_epsilon": 1e-05,
-  "max_audio_tokens": 605,
-  "max_prompt_tokens": 70,
-  "max_text_tokens": 402,
-  "model_type": "xtts_gpt",
-  "n_inner": 4096,
-  "num_attention_heads": 16,
-  "num_audio_tokens": 1026,
-  "num_hidden_layers": 30,
-  "number_text_tokens": 6681,
-  "reorder_and_upcast_attn": false,
-  "scale_attn_by_inverse_layer_idx": false,
-  "start_audio_token": 1024,
-  "start_text_token": null,
-  "stop_audio_token": 1025,
-  "stop_text_token": null,
-  "transformers_version": "4.46.0",
-  "use_masking_gt_prompt_approach": true,
-  "use_perceiver_resampler": true,
-  "vocab_size": 6681
-}

xttsv2_gpt2/gpt2_model.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:104d92b2297c243b64d1417bd5cfda015faca0a670e9bc90088eed0e844f8e35
-size 1522497936

xttsv2_gpt2/gpt_config.py DELETED Viewed

@@ -1,143 +0,0 @@
-from dataclasses import asdict, dataclass
-from typing import Dict, Optional, List
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-logger = logging.get_logger(__name__)
-@dataclass
-class GPTAudioConfig:
-    """Configuration for GPT audio processing parameters"""
-    # Number of mel channels (default 80).
-    mel_channels: int = 80
-    # Default 22050 -- presumably the rate in Hz of the audio fed to the GPT; TODO confirm against the caller.
-    sample_rate: int = 22050
-    # Default 24000 -- presumably the rate in Hz of the synthesized audio; TODO confirm against the caller.
-    output_sample_rate: int = 24000
-@dataclass
-class XTTSAudioConfig:
-    """Configuration for audio processing parameters"""
-    # NOTE(review): XTTSGPTConfig in this file only uses GPTAudioConfig; this class is not referenced here.
-    # The first three fields repeat the GPTAudioConfig defaults (in a different order).
-    sample_rate: int = 22050
-    output_sample_rate: int = 24000
-    mel_channels: int = 80
-    # Presumably STFT / mel-spectrogram parameters (hop, window and FFT sizes in samples) -- TODO confirm.
-    hop_length: int = 256
-    win_length: int = 1024
-    n_fft: int = 1024
-    # Presumably the mel filterbank frequency range in Hz -- TODO confirm.
-    fmin: int = 0
-    fmax: int = 8000
-    power: float = 1.0
-    # Optional path-like string; None by default. How it is consumed is not visible in this file.
-    mel_norms_file: Optional[str] = None
-class XTTSGPTConfig(PretrainedConfig):
-    """Configuration class for the GPT component of XTTS.
-    Every constructor argument is stored as an attribute of the same name.
-    ``audio_config`` is stored as a ``GPTAudioConfig`` instance and is turned
-    back into a plain dict by ``to_dict`` so the config stays JSON-serialisable.
-    """
-    model_type = "xtts_gpt"
-    def __init__(
-            self,
-            # Model architecture
-            hidden_size: int = 1024,  # gpt_n_model_channels in original
-            n_inner: int = 4096,
-            num_hidden_layers: int = 30,  # gpt_layers in original
-            num_attention_heads: int = 16,  # gpt_n_heads in original
-            # Tokenizer settings
-            vocab_size: int = 6681,  # gpt_number_text_tokens in original
-            number_text_tokens: int = 6681,  # Explicit text token vocabulary size
-            start_text_token: Optional[int] = None,
-            stop_text_token: Optional[int] = None,
-            # Audio token settings
-            num_audio_tokens: int = 1026,  # gpt_num_audio_tokens in original
-            start_audio_token: int = 1024,  # gpt_start_audio_token in original
-            stop_audio_token: int = 1025,  # gpt_stop_audio_token in original
-            # Sequence length settings
-            max_audio_tokens: int = 605,  # gpt_max_audio_tokens in original
-            max_text_tokens: int = 402,  # gpt_max_text_tokens in original
-            max_prompt_tokens: int = 70,  # gpt_max_prompt_tokens in original
-            gpt_max_audio_tokens: int = 605,  # Used for generation
-            # Model behavior settings
-            use_masking_gt_prompt_approach: bool = True,  # gpt_use_masking_gt_prompt_approach in original
-            use_perceiver_resampler: bool = True,  # gpt_use_perceiver_resampler in original
-            kv_cache: bool = True,
-            enable_redaction: bool = False,
-            # GPT batch settings
-            gpt_batch_size: int = 1,
-            # Audio processing
-            audio_config: Optional[Dict] = None,
-            # Architecture specifics
-            layer_norm_epsilon: float = 1e-5,
-            initializer_range: float = 0.02,
-            add_cross_attention: bool = False,
-            scale_attn_by_inverse_layer_idx: bool = False,
-            reorder_and_upcast_attn: bool = False,
-            # Size settings for the decoder
-            decoder_input_dim: int = 1024,
-            # None selects ["XttsGPT"] / the AstraMindAI auto_map built below
-            architectures: Optional[List[str]] = None,
-            auto_map: Optional[Dict[str, str]] = None,
-            activation_function: str = "gelu",
-            attn_pdrop: float = 0.1,
-            **kwargs
-    ):
-        """Store all settings; any extra ``kwargs`` are forwarded to ``PretrainedConfig.__init__``.
-        ``audio_config`` may be ``None`` (dataclass defaults), a dict of
-        ``GPTAudioConfig`` fields, or an existing ``GPTAudioConfig`` instance.
-        """
-        super().__init__(**kwargs)
-        # Build the default containers per instance: as literals in the signature
-        # they were a single list/dict object shared by every config.
-        self.architectures = ["XttsGPT"] if architectures is None else architectures
-        self.auto_map = {
-            "AutoConfig": "AstraMindAI/xtts2-gpt--gpt_config.XTTSGPTConfig",
-            "AutoModelForCausalLM": "AstraMindAI/xtts2-gpt--xtts2_gpt_modeling.XttsGPT",
-        } if auto_map is None else auto_map
-        if isinstance(audio_config, GPTAudioConfig):
-            # Already structured (e.g. taken from another config object); keep it as is.
-            self.audio_config = audio_config
-        else:
-            self.audio_config = GPTAudioConfig(**(audio_config or {}))
-        self.activation_function = activation_function
-        self.attn_pdrop = attn_pdrop
-        self.hidden_size = hidden_size
-        self.n_inner = n_inner
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.vocab_size = vocab_size
-        self.number_text_tokens = number_text_tokens
-        self.start_text_token = start_text_token
-        self.stop_text_token = stop_text_token
-        self.num_audio_tokens = num_audio_tokens
-        self.start_audio_token = start_audio_token
-        self.stop_audio_token = stop_audio_token
-        self.max_audio_tokens = max_audio_tokens
-        self.max_text_tokens = max_text_tokens
-        self.max_prompt_tokens = max_prompt_tokens
-        self.gpt_max_audio_tokens = gpt_max_audio_tokens
-        self.use_masking_gt_prompt_approach = use_masking_gt_prompt_approach
-        self.use_perceiver_resampler = use_perceiver_resampler
-        self.kv_cache = kv_cache
-        self.enable_redaction = enable_redaction
-        self.gpt_batch_size = gpt_batch_size
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        self.add_cross_attention = add_cross_attention
-        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
-        self.reorder_and_upcast_attn = reorder_and_upcast_attn
-        self.decoder_input_dim = decoder_input_dim
-    def to_dict(self) -> Dict:
-        """Convert the config to a dictionary, with ``audio_config`` flattened to a plain dict."""
-        output = super().to_dict()
-        output["audio_config"] = asdict(self.audio_config)
-        return output
-    @classmethod
-    def from_dict(cls, config_dict: Dict, *args, **kwargs) -> "XTTSGPTConfig":
-        """Create a config from a dictionary.
-        Keyword arguments naming an existing attribute override the value from
-        ``config_dict``. When ``return_unused_kwargs=True`` is passed, a
-        ``(config, unused_kwargs)`` tuple is returned instead of the bare config.
-        NOTE(review): this follows the contract of ``PretrainedConfig.from_dict``;
-        confirm against the pinned transformers version (4.46.0 per config.json).
-        """
-        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
-        # Loader bookkeeping flags, not configuration fields.
-        kwargs.pop("_from_auto", None)
-        kwargs.pop("_from_pipeline", None)
-        config = cls(**config_dict)
-        unused_kwargs = {}
-        for key, value in kwargs.items():
-            if not hasattr(config, key):
-                unused_kwargs[key] = value
-            elif key == "audio_config" and isinstance(value, dict):
-                # Keep the attribute a dataclass so to_dict() keeps working.
-                config.audio_config = GPTAudioConfig(**value)
-            else:
-                setattr(config, key, value)
-        if return_unused_kwargs:
-            return config, unused_kwargs
-        return config

xttsv2_gpt2/special_tokens_map.json DELETED Viewed

@@ -1,6 +0,0 @@
-{
-  "bos_token": "[START]",
-  "eos_token": "[STOP]",
-  "pad_token": "[PAD]",
-  "unk_token": "[UNK]"
-}

xttsv2_gpt2/tokenizer.json DELETED Viewed

The diff for this file is too large to render. See raw diff

xttsv2_gpt2/tokenizer.py DELETED Viewed

@@ -1,887 +0,0 @@
-import os
-import re
-import textwrap
-from typing import List, Optional, Union, Dict, Any
-from functools import cached_property
-import pypinyin
-import torch
-from hangul_romanize import Transliter
-from hangul_romanize.rule import academic
-from num2words import num2words
-from spacy.lang.ar import Arabic
-from spacy.lang.en import English
-from spacy.lang.es import Spanish
-from spacy.lang.ja import Japanese
-from spacy.lang.zh import Chinese
-from transformers import PreTrainedTokenizerFast, BatchEncoding
-from transformers.tokenization_utils_base import TruncationStrategy, PaddingStrategy
-from tokenizers import Tokenizer
-from tokenizers.pre_tokenizers import WhitespaceSplit
-from tokenizers.processors import TemplateProcessing
-from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
-import cutlet
-# Text preprocessing functions
-def get_spacy_lang(lang):
-    """Return a new spaCy language object for the language code *lang*.
-    Chinese, Japanese, Arabic and Spanish get their own class; every other
-    code falls back to ``English``.
-    """
-    dedicated = {"zh": Chinese, "ja": Japanese, "ar": Arabic, "es": Spanish}
-    # For most languages, English does the job
-    language_class = dedicated.get(lang, English)
-    return language_class()
-def split_sentence(text, lang, text_split_length=250):
-    """Preprocess the input text and split into sentences based on language.
-    When ``text_split_length`` is ``None`` or *text* is shorter than it, the
-    result is ``[text.lstrip()]``. Otherwise the text is cut into sentences with
-    a spaCy ``sentencizer`` for *lang*; consecutive sentences are merged while the
-    merged chunk (including the joining space) stays within ``text_split_length``,
-    and a single sentence longer than the limit is hard-wrapped with ``textwrap``.
-    Returns the list of chunks in order.
-    """
-    if text_split_length is None or len(text) < text_split_length:
-        return [text.lstrip()]
-    nlp = get_spacy_lang(lang)
-    nlp.add_pipe("sentencizer")
-    text_splits = [""]
-    for sentence in nlp(text).sents:
-        sentence = str(sentence)
-        current = text_splits[-1]
-        # Count the joining space too: the previous check compared only the two
-        # raw lengths, so a merged chunk could end up one character over the limit.
-        merged_length = len(current) + len(sentence) + (1 if current else 0)
-        if merged_length <= text_split_length:
-            text_splits[-1] = (current + " " + sentence).lstrip()
-        elif len(sentence) > text_split_length:
-            text_splits.extend(
-                textwrap.wrap(
-                    sentence,
-                    width=text_split_length,
-                    drop_whitespace=True,
-                    break_on_hyphens=False,
-                    tabsize=1,
-                )
-            )
-        else:
-            text_splits.append(sentence)
-    # Drop the empty seed chunk when the first sentence did not fit into it.
-    if len(text_splits) > 1 and text_splits[0] == "":
-        del text_splits[0]
-    return text_splits
-_whitespace_re = re.compile(r"\s+")
-# List of (regular expression, replacement) pairs for abbreviations, keyed by language code.
-# Each abbreviation is matched case-insensitively at a word boundary and must be
-# followed by a literal "." -- except for Russian, where a trailing word boundary is used instead.
-_abbreviations = {
-    code: [
-        (re.compile("\\b%s%s" % (short, "\\b" if code == "ru" else "\\."), re.IGNORECASE), full)
-        for short, full in pairs
-    ]
-    for code, pairs in {
-        "en": [
-            ("mrs", "misess"),
-            ("mr", "mister"),
-            ("dr", "doctor"),
-            ("st", "saint"),
-            ("co", "company"),
-            ("jr", "junior"),
-            ("maj", "major"),
-            ("gen", "general"),
-            ("drs", "doctors"),
-            ("rev", "reverend"),
-            ("lt", "lieutenant"),
-            ("hon", "honorable"),
-            ("sgt", "sergeant"),
-            ("capt", "captain"),
-            ("esq", "esquire"),
-            ("ltd", "limited"),
-            ("col", "colonel"),
-            ("ft", "fort"),
-        ],
-        "es": [
-            ("sra", "señora"),
-            ("sr", "señor"),
-            ("dr", "doctor"),
-            ("dra", "doctora"),
-            ("st", "santo"),
-            ("co", "compañía"),
-            ("jr", "junior"),
-            ("ltd", "limitada"),
-        ],
-        "fr": [
-            ("mme", "madame"),
-            ("mr", "monsieur"),
-            ("dr", "docteur"),
-            ("st", "saint"),
-            ("co", "compagnie"),
-            ("jr", "junior"),
-            ("ltd", "limitée"),
-        ],
-        "de": [
-            ("fr", "frau"),
-            ("dr", "doktor"),
-            ("st", "sankt"),
-            ("co", "firma"),
-            ("jr", "junior"),
-        ],
-        "pt": [
-            ("sra", "senhora"),
-            ("sr", "senhor"),
-            ("dr", "doutor"),
-            ("dra", "doutora"),
-            ("st", "santo"),
-            ("co", "companhia"),
-            ("jr", "júnior"),
-            ("ltd", "limitada"),
-        ],
-        "it": [
-            # ("sig.ra", "signora"),
-            ("sig", "signore"),
-            ("dr", "dottore"),
-            ("st", "santo"),
-            ("co", "compagnia"),
-            ("jr", "junior"),
-            ("ltd", "limitata"),
-        ],
-        "pl": [
-            ("p", "pani"),
-            ("m", "pan"),
-            ("dr", "doktor"),
-            ("sw", "święty"),
-            ("jr", "junior"),
-        ],
-        # There are not many common abbreviations in Arabic as in English.
-        "ar": [],
-        # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
-        "zh": [],
-        "cs": [
-            ("dr", "doktor"),  # doctor
-            ("ing", "inženýr"),  # engineer
-            ("p", "pan"),  # Could also map to pani for woman but no easy way to do it
-            # Other abbreviations would be specialized and not as common.
-        ],
-        "ru": [
-            ("г-жа", "госпожа"),  # Mrs.
-            ("г-н", "господин"),  # Mr.
-            ("д-р", "доктор"),  # doctor
-            # Other abbreviations are less common or specialized.
-        ],
-        "nl": [
-            ("dhr", "de heer"),  # Mr.
-            ("mevr", "mevrouw"),  # Mrs.
-            ("dr", "dokter"),  # doctor
-            ("jhr", "jonkheer"),  # young lord or nobleman
-            # Dutch uses more abbreviations, but these are the most common ones.
-        ],
-        "tr": [
-            ("b", "bay"),  # Mr.
-            ("byk", "büyük"),  # büyük
-            ("dr", "doktor"),  # doctor
-            # Add other Turkish abbreviations here if needed.
-        ],
-        "hu": [
-            ("dr", "doktor"),  # doctor
-            ("b", "bácsi"),  # Mr.
-            ("nőv", "nővér"),  # nurse
-            # Add other Hungarian abbreviations here if needed.
-        ],
-        # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
-        "ko": [],
-    }.items()
-}
-def expand_abbreviations_multilingual(text, lang="en"):
-    """Replace the abbreviations known for *lang* with their expansions.
-    Rules come from ``_abbreviations[lang]`` and are applied in table order;
-    *text* is returned unchanged when *lang* has no entry.
-    """
-    for pattern, expansion in _abbreviations.get(lang, ()):
-        text = pattern.sub(expansion, text)
-    return text
-# Per-language list of (regular expression, replacement) pairs for symbols.
-# Every language spells out the same seven symbols, in this order: & @ % # $ £ °
-_symbols_multilingual = {
-    code: [
-        (re.compile(r"%s" % re.escape(symbol), re.IGNORECASE), spoken)
-        for symbol, spoken in zip("&@%#$£°", words)
-    ]
-    for code, words in {
-        "en": (" and ", " at ", " percent ", " hash ", " dollar ", " pound ", " degree "),
-        "es": (" y ", " arroba ", " por ciento ", " numeral ", " dolar ", " libra ", " grados "),
-        "fr": (" et ", " arobase ", " pour cent ", " dièse ", " dollar ", " livre ", " degrés "),
-        "de": (" und ", " at ", " prozent ", " raute ", " dollar ", " pfund ", " grad "),
-        "pt": (" e ", " arroba ", " por cento ", " cardinal ", " dólar ", " libra ", " graus "),
-        "it": (" e ", " chiocciola ", " per cento ", " cancelletto ", " dollaro ", " sterlina ", " gradi "),
-        "pl": (" i ", " małpa ", " procent ", " krzyżyk ", " dolar ", " funt ", " stopnie "),
-        # Arabic
-        "ar": (" و ", " على ", " في المئة ", " رقم ", " دولار ", " جنيه ", " درجة "),
-        # Chinese
-        "zh": (" 和 ", " 在 ", " 百分之 ", " 号 ", " 美元 ", " 英镑 ", " 度 "),
-        # Czech
-        "cs": (" a ", " na ", " procento ", " křížek ", " dolar ", " libra ", " stupně "),
-        # Russian
-        "ru": (" и ", " собака ", " процентов ", " номер ", " доллар ", " фунт ", " градус "),
-        # Dutch
-        "nl": (" en ", " bij ", " procent ", " hekje ", " dollar ", " pond ", " graden "),
-        "tr": (" ve ", " at ", " yüzde ", " diyez ", " dolar ", " sterlin ", " derece "),
-        "hu": (" és ", " kukac ", " százalék ", " kettőskereszt ", " dollár ", " font ", " fok "),
-        # Korean
-        "ko": (" 그리고 ", " 에 ", " 퍼센트 ", " 번호 ", " 달러 ", " 파운드 ", " 도 "),
-    }.items()
-}
-def expand_symbols_multilingual(text, lang="en"):
-    """Spell out the symbols listed for *lang* in ``_symbols_multilingual``.
-    After each substitution one pass of double-space-to-single-space replacement is applied.
-    The result is always stripped, even when *lang* has no symbol table.
-    """
-    for pattern, spoken in _symbols_multilingual.get(lang, ()):
-        # Ensure there are no double spaces left behind by the padded replacement
-        text = pattern.sub(spoken, text).replace("  ", " ")
-    return text.strip()
-# Per-language patterns for digits immediately followed by an ordinal marker; group 1 captures the digits.
-# NOTE(review): alternatives are tried left to right, so "os"/"as" (es, pt) and "ödike" (hu) can never be
-# chosen because a shorter alternative listed earlier ("o"/"a", "ödik") already matches. "tr" lists "\."
-# twice and "hu" lists "edik" twice. Left as is: reordering would change the normalised text.
-_ordinal_re = {
-    "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
-    "es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"),
-    "fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"),
-    "de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"),
-    "pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"),
-    "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"),
-    "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"),
-    "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"),
-    "cs": re.compile(r"([0-9]+)\.(?=\s|$)"),  # In Czech, a dot is often used after the number to indicate ordinals.
-    "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
-    "nl": re.compile(r"([0-9]+)(de|ste|e)"),
-    "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
-    "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
-    "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
-}
-# Any run of ASCII digits.
-_number_re = re.compile(r"[0-9]+")
-# Amounts with the currency sign directly before or after the digits; "." and "," may appear inside the number.
-_currency_re = {
-    "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
-    "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
-    "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
-}
-# Numbers grouped in threes with "," and an optional "." fraction (e.g. 1,234.5).
-_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
-# Numbers grouped in threes with "." and an optional "," fraction (e.g. 1.234,5).
-_dot_number_re = re.compile(r"\b\d{1,3}(\.\d{3})*(\,\d+)?\b")
-# Digits, one "." or ",", then digits; group 1 is the whole number.
-_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")
-def _remove_commas(m):
-    """Return the text matched by *m* with every "," removed."""
-    return m.group(0).replace(",", "")
-def _remove_dots(m):
-    """Return the text matched by *m* with every "." removed."""
-    return m.group(0).replace(".", "")
-def _expand_decimal_point(m, lang="en"):
-    """Spell out the decimal number in group 1 of *m* with ``num2words``.
-    A decimal comma is treated as a decimal point; language code "cs" is passed to ``num2words`` as "cz".
-    """
-    value = float(m.group(1).replace(",", "."))
-    num2words_lang = "cz" if lang == "cs" else lang
-    return num2words(value, lang=num2words_lang)
-def _expand_currency(m, lang="en", currency="USD"):
-    """Spell out the currency amount matched by *m* with ``num2words``.
-    Commas are read as decimal points and everything except digits and "." is dropped before parsing.
-    For whole amounts, the text from the last language-specific joiner onwards is cut off.
-    """
-    digits = re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))
-    amount = float(digits)
-    spoken = num2words(amount, to="currency", currency=currency, lang="cz" if lang == "cs" else lang)
-    if not amount.is_integer():
-        return spoken
-    # Joiner searched for in the spoken form; every language not listed here uses ", ".
-    joiners = {
-        "es": " con ",
-        "fr": " et ",
-        "de": " und ",
-        "pt": " e ",
-        "it": " e ",
-    }
-    cut = spoken.rfind(joiners.get(lang, ", "))
-    return spoken if cut == -1 else spoken[:cut]
-def _expand_ordinal(m, lang="en"):
-    """Spell out the digits in group 1 of *m* as an ordinal; "cs" is passed to ``num2words`` as "cz"."""
-    num2words_lang = "cz" if lang == "cs" else lang
-    return num2words(int(m.group(1)), ordinal=True, lang=num2words_lang)
-def _expand_number(m, lang="en"):
-    """Spell out the whole match of *m* as a cardinal number; "cs" is passed to ``num2words`` as "cz"."""
-    num2words_lang = "cz" if lang == "cs" else lang
-    return num2words(int(m.group(0)), lang=num2words_lang)
-def expand_numbers_multilingual(text, lang="en"):
-    """Spell out the numbers in *text* for language *lang*.
-    Chinese is delegated entirely to ``zh_num2words``. For other languages the steps are, in order:
-    strip thousands separators ("," for en/ru, "." otherwise), expand GBP/USD/EUR amounts,
-    expand decimals (skipped for Turkish), expand ordinals when *lang* has an ordinal pattern,
-    then expand the remaining integers.
-    Currency expansion is best effort: a failure leaves the amounts of that currency untouched.
-    """
-    import logging
-    if lang == "zh":
-        return zh_num2words()(text)
-    if lang in ["en", "ru"]:
-        text = re.sub(_comma_number_re, _remove_commas, text)
-    else:
-        text = re.sub(_dot_number_re, _remove_dots, text)
-    # Each currency gets its own guard: previously one shared try block meant a failure on
-    # GBP silently skipped USD and EUR as well.
-    for currency in ("GBP", "USD", "EUR"):
-        try:
-            text = re.sub(_currency_re[currency], lambda m, c=currency: _expand_currency(m, lang, c), text)
-        except Exception:
-            # Deliberately broad, as before: the amount is left as written and the
-            # generic number expansion below still processes its digits.
-            logging.getLogger(__name__).debug(
-                "Could not expand %s amounts for lang=%r", currency, lang, exc_info=True
-            )
-    if lang != "tr":
-        text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text)
-    if lang in _ordinal_re:
-        text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
-    text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
-    return text
-def lowercase(text):
-    """Return *text* lowercased via its ``lower()`` method."""
-    return text.lower()
-def collapse_whitespace(text):
-    """Replace every run of whitespace in *text* with a single space."""
-    return _whitespace_re.sub(" ", text)
-def multilingual_cleaners(text, lang):
-    """Run the full cleaning pipeline for *lang*.
-    Steps, in order: drop double quotes, lowercase (for Turkish, map the capitals İ/Ö/Ü first),
-    expand numbers, expand abbreviations, expand symbols, collapse whitespace.
-    """
-    cleaned = text.replace('"', "")
-    if lang == "tr":
-        # Handle these capitals explicitly before the generic lowercase step
-        for upper, lower in (("İ", "i"), ("Ö", "ö"), ("Ü", "ü")):
-            cleaned = cleaned.replace(upper, lower)
-    cleaned = expand_numbers_multilingual(lowercase(cleaned), lang)
-    cleaned = expand_abbreviations_multilingual(cleaned, lang)
-    cleaned = expand_symbols_multilingual(cleaned, lang=lang)
-    return collapse_whitespace(cleaned)
-def basic_cleaners(text):
-    """Minimal pipeline: lowercase, then collapse whitespace; no transliteration."""
-    return collapse_whitespace(lowercase(text))
-def chinese_transliterate(text):
-    """Convert *text* to pinyin with ``pypinyin`` (TONE3 style, neutral tone written as 5).
-    The first reading of each item returned by ``pypinyin.pinyin`` is kept and the
-    readings are concatenated without a separator.
-    """
-    readings = pypinyin.pinyin(
-        text,
-        style=pypinyin.Style.TONE3,
-        heteronym=False,
-        neutral_tone_with_five=True,
-    )
-    return "".join(candidates[0] for candidates in readings)
-def japanese_cleaners(text, katsu):
-    """Romanise *text* with ``katsu.romaji`` and lowercase the result."""
-    return lowercase(katsu.romaji(text))
-def korean_transliterate(text, transliter):
-    """Return ``transliter.translit(text)``.
-    The tokenizer class in this file passes a ``hangul_romanize.Transliter`` built with the ``academic`` rule.
-    """
-    return transliter.translit(text)
-# Fast Tokenizer Class
-class XTTSTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Fast Tokenizer implementation for XTTS model using HuggingFace's PreTrainedTokenizerFast
-    """
-    def __init__(
-            self,
-            vocab_file: Optional[str] = None,
-            tokenizer_object: Optional[Tokenizer] = None,
-            unk_token: str = "[UNK]",
-            pad_token: str = "[PAD]",
-            bos_token: str = "[START]",
-            eos_token: str = "[STOP]",
-            auto_map: Optional[dict] = None,
-            clean_up_tokenization_spaces: bool = True,
-            **kwargs
-    ):
-        """Build the tokenizer from a ``tokenizers.Tokenizer`` object or a tokenizer JSON file.
-        ``tokenizer_object`` wins when both are given; ``vocab_file`` is only loaded when no object is
-        passed. The backend is configured to split on whitespace and to wrap single sequences
-        in ``bos_token`` / ``eos_token``.
-        ``auto_map`` is accepted but not used or forwarded by this constructor -- presumably so that a
-        value coming from a saved tokenizer config does not reach the base class; TODO confirm.
-        Remaining ``kwargs`` go to ``PreTrainedTokenizerFast.__init__``.
-        """
-        if tokenizer_object is None and vocab_file is not None:
-            tokenizer_object = Tokenizer.from_file(vocab_file)
-        if tokenizer_object is not None:
-            # Configure the tokenizer
-            tokenizer_object.pre_tokenizer = WhitespaceSplit()
-            tokenizer_object.post_processor = TemplateProcessing(
-                single=f"{bos_token} $A {eos_token}",
-                special_tokens=[
-                    (bos_token, tokenizer_object.token_to_id(bos_token)),
-                    (eos_token, tokenizer_object.token_to_id(eos_token)),
-                ],
-            )
-        super().__init__(
-            tokenizer_object=tokenizer_object,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            **kwargs
-        )
-        # Character limits per language
-        self.char_limits = {
-            "en": 250, "de": 253, "fr": 273, "es": 239,
-            "it": 213, "pt": 203, "pl": 224, "zh": 82,
-            "ar": 166, "cs": 186, "ru": 182, "nl": 251,
-            "tr": 226, "ja": 71, "hu": 224, "ko": 95,
-        }
-        # Initialize language tools; the Japanese converter is created on first use (see ``katsu``)
-        self._katsu = None
-        self._korean_transliter = Transliter(academic)
-        # Ensure pad_token_id is set
-        if self.pad_token_id is None:
-            # The wrapped ``tokenizers.Tokenizer`` is exposed as ``backend_tokenizer``;
-            # nothing visible in this class defines the ``self.tokenizer`` that was read here before.
-            self.pad_token_id = self.backend_tokenizer.token_to_id(self.pad_token)
-    @cached_property
-    def katsu(self):
-        """Japanese romaji converter (``cutlet.Cutlet``), created the first time it is needed."""
-        converter = self._katsu
-        if converter is None:
-            converter = self._katsu = cutlet.Cutlet()
-        return converter
-    def preprocess_text(self, text: str, lang: str) -> str:
-        """Clean *text* with the pipeline that matches *lang*.
-        Only the part of *lang* before the first "-" is used. Japanese goes through
-        ``japanese_cleaners``; the languages listed below go through ``multilingual_cleaners``
-        (followed by transliteration for Chinese and Korean); anything else gets ``basic_cleaners``.
-        """
-        base_lang = lang.split("-")[0]  # remove region
-        if base_lang == "ja":
-            return japanese_cleaners(text, self.katsu)
-        multilingual = {"ar", "cs", "de", "en", "es", "fr", "hu", "it",
-                        "nl", "pl", "pt", "ru", "tr", "zh", "ko"}
-        if base_lang not in multilingual:
-            return basic_cleaners(text)
-        cleaned = multilingual_cleaners(text, base_lang)
-        if base_lang == "zh":
-            return chinese_transliterate(cleaned)
-        if base_lang == "ko":
-            return korean_transliterate(cleaned, self._korean_transliter)
-        return cleaned
-    def batch_encode_with_split(self, texts: Union[str, List[str]], lang: Union[str, List[str]],
-                                **kwargs) -> torch.Tensor:
-        """
-        Clean, chunk and encode one or more texts into a single padded tensor of token ids.
-        Each text is preprocessed for its language, split with ``split_sentence`` using the limit from
-        ``self.char_limits`` (250 when the language is not listed), prefixed with its ``[lang]`` tag and has
-        spaces replaced by ``[SPACE]`` before being encoded. A single language is applied to every text.
-        Returns a ``torch.long`` tensor of shape (number of texts, largest chunk count, longest encoded chunk + 2)
-        where each chunk is wrapped in BOS/EOS and every other position holds ``self.pad_token_id``.
-        Raises ``ValueError`` when the number of languages does not match the number of texts, or when this
-        is not a fast tokenizer. Extra ``kwargs`` are forwarded to the encoding call.
-        """
-        # Convert single inputs to lists
-        texts = [texts] if isinstance(texts, str) else texts
-        lang = [lang] if isinstance(lang, str) else lang
-        # Broadcast a single language over all texts
-        if len(texts) > 1 and len(lang) == 1:
-            lang = lang * len(texts)
-        if len(lang) != len(texts):
-            raise ValueError(f"Number of texts ({len(texts)}) does not match number of languages ({len(lang)}).")
-        # One list of formatted chunks per input text
-        grouped_chunks = []
-        for raw_text, lang_tag in zip(texts, lang):
-            base_lang = lang_tag.split("-")[0]
-            limit = self.char_limits.get(base_lang, 250)
-            cleaned = self.preprocess_text(raw_text, lang_tag)
-            # Chinese chunks are tagged "zh-cn"; every other language uses its base code
-            tag = "zh-cn" if base_lang == "zh" else base_lang
-            grouped_chunks.append([
-                f"[{tag}]{piece}".replace(" ", "[SPACE]")
-                for piece in split_sentence(cleaned, base_lang, text_split_length=limit)
-            ])
-        max_splits = max((len(group) for group in grouped_chunks), default=0)
-        # Flatten so that all chunks are encoded in one call
-        flat_chunks = [piece for group in grouped_chunks for piece in group]
-        if not self.is_fast:
-            raise ValueError("The tokenizer must be a fast tokenizer.")
-        encoding: BatchEncoding = self(
-            flat_chunks,
-            add_special_tokens=False,
-            padding=True,
-            return_tensors='pt',
-            **kwargs
-        )
-        # One row per chunk, padded to the longest chunk
-        input_ids = encoding['input_ids']
-        # Leave room for the BOS and EOS tokens added below
-        padded_batch = torch.full(
-            (len(texts), max_splits, input_ids.size(1) + 2),
-            fill_value=self.pad_token_id,
-            dtype=torch.long
-        )
-        # Rows of input_ids follow the flattened order, so a running row index maps them back to their text
-        row = 0
-        for text_idx, group in enumerate(grouped_chunks):
-            for chunk_idx in range(len(group)):
-                token_ids = input_ids[row]
-                row += 1
-                # Length = number of non-pad ids; this assumes padding only follows the real tokens
-                length = (token_ids != self.pad_token_id).sum().item()
-                padded_batch[text_idx, chunk_idx, 0] = self.bos_token_id
-                padded_batch[text_idx, chunk_idx, 1:length + 1] = token_ids[:length]
-                padded_batch[text_idx, chunk_idx, length + 1] = self.eos_token_id
-        return padded_batch
-    def _batch_encode_plus(
-            self,
-            batch_text_or_text_pairs,
-            add_special_tokens: bool = True,
-            padding_strategy=PaddingStrategy.DO_NOT_PAD,
-            truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,
-            max_length: Optional[int] = None,
-            stride: int = 0,
-            is_split_into_words: bool = False,
-            pad_to_multiple_of: Optional[int] = None,
-            return_tensors: Optional[str] = None,
-            return_token_type_ids: Optional[bool] = None,
-            return_attention_mask: Optional[bool] = None,
-            return_overflowing_tokens: bool = False,
-            return_special_tokens_mask: bool = False,
-            return_offsets_mapping: bool = False,
-            return_length: bool = False,
-            verbose: bool = True,
-            **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Batch-encode texts after applying language-specific preprocessing.
-        Every ``str`` entry is run through ``self.preprocess_text``, prefixed with a
-        ``[<lang>]`` tag (base language ``zh`` is written as ``zh-cn``) and has each
-        space replaced by the literal ``[SPACE]``. Entries that are not ``str`` are
-        forwarded to the parent implementation unchanged.
-        Args:
-            batch_text_or_text_pairs: The batch to encode.
-            **kwargs: May contain ``lang`` (a language code or a list of codes, one
-                per entry; defaults to ``"en"`` for every entry). A single code is
-                repeated for the whole batch. Remaining keys go to the parent call.
-        Returns:
-            Whatever the parent class's ``_batch_encode_plus`` returns.
-        Raises:
-            ValueError: If the number of languages does not match the number of texts.
-        """
-        batch_len = len(batch_text_or_text_pairs)
-        languages = kwargs.pop("lang", ["en"] * batch_len)
-        if isinstance(languages, str):
-            languages = [languages]
-        # A single language applies to every entry of a larger batch
-        if batch_len > 1 and len(languages) == 1:
-            languages = languages * batch_len
-        if batch_len != len(languages):
-            raise ValueError(f"Number of texts ({len(batch_text_or_text_pairs)}) does not match number of languages ({len(languages)}).")
-        def _tag_and_escape(entry, entry_lang):
-            # Non-string entries are passed through untouched
-            if not isinstance(entry, str):
-                return entry
-            cleaned = self.preprocess_text(entry, entry_lang)
-            primary = entry_lang.split("-")[0]
-            tag = "zh-cn" if primary == "zh" else primary
-            return f"[{tag}]{cleaned}".replace(" ", "[SPACE]")
-        prepared = [
-            _tag_and_escape(entry, entry_lang)
-            for entry, entry_lang in zip(batch_text_or_text_pairs, languages)
-        ]
-        # Options forwarded verbatim to the parent encoder
-        forwarded = dict(
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            is_split_into_words=is_split_into_words,
-            pad_to_multiple_of=pad_to_multiple_of,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-        )
-        return super()._batch_encode_plus(prepared, **forwarded, **kwargs)
-    def __call__(
-            self,
-            text: Union[str, List[str]],
-            lang: Union[str, List[str]] = "en",
-            add_special_tokens: bool = True,
-            padding: Union[bool, str, PaddingStrategy] = False,
-            truncation: Union[bool, str, TruncationStrategy] = False,
-            max_length: Optional[int] = None,
-            stride: int = 0,
-            return_tensors: Optional[str] = None,
-            return_token_type_ids: Optional[bool] = None,
-            return_attention_mask: Optional[bool] = True,
-            **kwargs
-    ):
-        """
-        Tokenize one text or a batch of texts, each with its own language code.
-        Args:
-            text: A single string or a list of strings.
-            lang: A language code or a list of codes; a single code is repeated
-                for every text of a larger batch.
-            padding: ``True`` selects ``PaddingStrategy.LONGEST``, ``False`` selects
-                ``PaddingStrategy.DO_NOT_PAD``; any other value is converted with
-                ``PaddingStrategy(padding)``.
-            truncation: ``True`` selects ``TruncationStrategy.LONGEST_FIRST``,
-                ``False`` selects ``TruncationStrategy.DO_NOT_TRUNCATE``; any other
-                value is converted with ``TruncationStrategy(truncation)``.
-        Returns:
-            The result of ``self._batch_encode_plus`` for the normalised batch.
-        Raises:
-            ValueError: If the number of languages does not match the number of texts.
-        """
-        # Normalise scalar inputs to one-element batches
-        texts = [text] if isinstance(text, str) else text
-        languages = [lang] if isinstance(lang, str) else lang
-        if len(texts) > 1 and len(languages) == 1:
-            languages = languages * len(texts)
-        if len(texts) != len(languages):
-            raise ValueError(f"Number of texts ({len(texts)}) does not match number of languages ({len(languages)}).")
-        # Resolve the padding strategy
-        if not isinstance(padding, bool):
-            pad_mode = PaddingStrategy(padding)
-        elif padding:
-            pad_mode = PaddingStrategy.LONGEST
-        else:
-            pad_mode = PaddingStrategy.DO_NOT_PAD
-        # Resolve the truncation strategy
-        if not isinstance(truncation, bool):
-            trunc_mode = TruncationStrategy(truncation)
-        elif truncation:
-            trunc_mode = TruncationStrategy.LONGEST_FIRST
-        else:
-            trunc_mode = TruncationStrategy.DO_NOT_TRUNCATE
-        return self._batch_encode_plus(
-            texts,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=pad_mode,
-            truncation_strategy=trunc_mode,
-            max_length=max_length,
-            stride=stride,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            lang=languages,
-            **kwargs
-        )

xttsv2_gpt2/tokenizer_config.json DELETED Viewed

@@ -1,192 +0,0 @@
-{
-  "added_tokens_decoder": {
-    "0": {
-      "content": "[STOP]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "[UNK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "[SPACE]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "259": {
-      "content": "[en]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "260": {
-      "content": "[de]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "261": {
-      "content": "[START]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "262": {
-      "content": "[fr]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "267": {
-      "content": "[ru]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "284": {
-      "content": "[es]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "285": {
-      "content": "[it]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "286": {
-      "content": "[pt]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "293": {
-      "content": "[cs]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "294": {
-      "content": "[pl]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "295": {
-      "content": "[tr]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "297": {
-      "content": "[nl]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "5022": {
-      "content": "[ar]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "5023": {
-      "content": "[zh-cn]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "5412": {
-      "content": "[ja]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "5753": {
-      "content": "[hu]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "6152": {
-      "content": "[ko]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "6680": {
-      "content": "[hi]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "6681": {
-      "content": "[PAD]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "auto_map": {"AutoTokenizer": ["AstraMindAI/xtts2-gpt--tokenizer.XTTSTokenizerFast", null]},
-  "bos_token": "[START]",
-  "clean_up_tokenization_spaces": true,
-  "eos_token": "[STOP]",
-  "max_length": null,
-  "model_max_length": 1000000000000000019884624838656,
-  "pad_to_multiple_of": null,
-  "pad_token": "[PAD]",
-  "pad_token_type_id": 0,
-  "padding_side": "right",
-  "tokenizer_class": "XTTSTokenizerFast",
-  "unk_token": "[UNK]"
-}

xttsv2_gpt2/xtts2_gpt_modeling.py DELETED Viewed

@@ -1,505 +0,0 @@
-import functools
-import math
-import random
-import uuid
-from array import array
-import numpy as np
-import torch
-import torch.nn as nn
-from typing import List, Optional, Union, Iterable, Tuple, Mapping, Dict
-from torch import Tensor
-from transformers import PretrainedConfig, GPT2Config
-from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, MultiModalConfig
-from vllm.distributed import get_pp_group
-from vllm.inputs import InputContext, INPUT_REGISTRY, DecoderOnlyInputs, token_inputs
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
-from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.gpt2 import GPT2Block
-from vllm.model_executor.models.utils import make_layers, make_empty_intermediate_tensors_factory
-from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
-from vllm.sequence import IntermediateTensors, SequenceData, VLLM_TOKEN_ID_ARRAY_TYPE
-from vllm.model_executor.models.interfaces import SupportsMultiModal, SupportsPP
-class LearnedPositionEmbeddings(nn.Module):
-    """Learned position-embedding table with ``seq_len`` rows of size ``model_dim``.
-    Args:
-        seq_len: Number of positions in the table.
-        model_dim: Embedding dimension.
-        init: Standard deviation of the normal initialisation of the table.
-        relative: If True, ``forward`` picks a random start offset for the window.
-        supports_pp: If True, the table is a ``VocabParallelEmbedding``; otherwise
-            it is a plain ``nn.Embedding``.
-    """
-    def __init__(self, seq_len, model_dim, init=0.02, relative=False, supports_pp=False):
-        super().__init__()
-        # nn.Embedding
-        self.emb = VocabParallelEmbedding(seq_len, model_dim) if supports_pp else nn.Embedding(seq_len, model_dim)
-        # Initializing this way is standard for GPT-2
-        self.emb.weight.data.normal_(mean=0.0, std=init)
-        self.relative = relative
-        self.seq_len = seq_len
-    def forward(self, x):
-        """Return embeddings for ``x.shape[1]`` consecutive positions.
-        Only the second dimension of ``x`` and its device are read. With
-        ``relative`` set, the window starts at a random offset that keeps it inside
-        the table; otherwise it starts at position 0.
-        """
-        sl = x.shape[1]
-        if self.relative:
-            start = random.randint(sl, self.seq_len) - sl
-            return self.emb(torch.arange(start, start + sl, device=x.device))
-        else:
-            return self.emb(torch.arange(0, sl, device=x.device))
-    def get_fixed_embedding(self, ind: torch.Tensor, dev: torch.device) -> torch.Tensor:
-        """Get position embeddings with batch support.
-        Handles both single and batched inputs, returning embeddings that can be
-        directly added to input embeddings of the same shape.
-        Args:
-            ind: Integer position indices tensor. Can be single or batched
-                 Shape: [..., seq_len] or [seq_len]
-            dev: Target device, used only for the single-index path
-        Returns:
-            When ``ind.shape[0] > 1``: embeddings of shape ``ind.shape + (model_dim,)``.
-            Otherwise: the embedding of the single index with a leading dimension
-            added (expected shape [1, 1, model_dim]).
-        Example:
-            >>> pos_emb = LearnedPositionEmbeddings(100, 64)
-            >>> # Batched input
-            >>> batch_indices = torch.zeros((3, 5), dtype=torch.long)  # batch_size=3, seq_len=5
-            >>> embeddings = pos_emb.get_fixed_embedding(batch_indices, 'cpu')
-            >>> embeddings.shape  # Returns: [3, 5, 64]
-        """
-        if ind.shape[0] > 1:
-            # The lookup is elementwise over the index tensor, so one call yields the
-            # same values as looking up each row separately and stacking the results.
-            return self.emb(ind)
-        else:
-            # Handle single input
-            # Shape: [1, 1, model_dim]
-            return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0)
-def get_xtts_max_audio_tokens(ctx: InputContext) -> int:
-    """Return the fixed upper bound on audio tokens.
-    ``ctx`` is accepted but not consulted; the value is a constant.
-    """
-    # NOTE(review): presumably max_audio_tokens (605) + 3, mirroring the size of the
-    # position-embedding table built in GPT2Model -- confirm against the config.
-    max_audio_tokens = 608
-    return max_audio_tokens
-def dummy_seq_data_for_xtts(
-        ctx: InputContext,
-        seq_len: int,
-        audio_count: int,
-) -> SequenceData:
-    """Build placeholder token ids for XTTS profiling.
-    Every token id is 1. Each audio item takes ``max_prompt_tokens`` placeholder
-    ids plus one extra id; the rest of the sequence is filled up to ``seq_len``.
-    """
-    # In XTTS the "prompt" is the voice conditioning
-    conditioning_len = ctx.model_config.hf_config.max_prompt_tokens
-    single = array(VLLM_TOKEN_ID_ARRAY_TYPE, [1])
-    # Conditioning placeholders followed by one extra id per audio item
-    per_audio = single * conditioning_len + single
-    audio_ids = per_audio * audio_count
-    # Remaining positions; a non-positive count yields an empty array
-    filler = single * (seq_len - len(audio_ids))
-    return SequenceData(audio_ids + filler)
-def dummy_conditioning_for_xtts(
-        ctx: InputContext,
-        seq_len: int,
-        audio_count: int,
-) -> dict:
-    """Build zero-filled conditioning tensors for XTTS profiling.
-    Returns a dict with an ``"audio"`` entry holding ``audio_count`` zero tensors
-    of shape ``(seq_len, hidden_size)`` in the model dtype, with
-    ``is_logits_only_mode`` set to False.
-    """
-    zero_embeds = []
-    for _ in range(audio_count):
-        model_config = ctx.model_config
-        zero_embeds.append(
-            torch.zeros(
-                (seq_len, model_config.hf_config.hidden_size),
-                dtype=model_config.dtype,
-            )
-        )
-    audio_payload = {
-        "embeds": zero_embeds,
-        "is_logits_only_mode": False,
-    }
-    return {"audio": audio_payload}
-def dummy_data_for_xtts(
-        ctx: InputContext,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-) -> Tuple[SequenceData, dict]:
-    """Assemble the dummy (sequence data, conditioning data) pair for XTTS profiling.
-    The number of audio items is read from ``mm_counts["audio"]``.
-    """
-    n_audio = mm_counts["audio"]
-    return (
-        dummy_seq_data_for_xtts(ctx, seq_len, n_audio),
-        dummy_conditioning_for_xtts(ctx, seq_len, n_audio),
-    )
-def input_mapper_for_xtts(ctx: InputContext, data: Union[Dict, List[Tensor]]) -> MultiModalInputs:
-    """Convert the XTTS audio payload into ``MultiModalInputs``.
-    ``data`` must be a dict; its ``"embeds"`` entry is exposed as ``cond_latents``
-    and ``"is_logits_only_mode"`` (default False) is forwarded as-is.
-    Raises:
-        NotImplementedError: If any item of ``embeds`` is not a tensor.
-    """
-    assert isinstance(data, dict), "XTTS MultiModal input data must be a dictionary with keys: 'embeds', 'is_logits_only_mode'"
-    cond_embeds = data.get("embeds")
-    logits_only = data.get("is_logits_only_mode", False)
-    # Every conditioning item has to be a torch tensor
-    for item in cond_embeds:
-        if isinstance(item, Tensor):
-            continue
-        raise NotImplementedError(f"Unsupported data type: {type(item)}")
-    mapped = {
-        "cond_latents": cond_embeds,
-        "is_logits_only_mode": logits_only,
-    }
-    return MultiModalInputs(mapped)
-def input_processor_for_xtts2_gpt(ctx: InputContext, inputs: DecoderOnlyInputs):
-    """
-    Rewrite the prompt token ids so their count matches the conditioning input.
-    One placeholder id (value 1) is emitted per row of the conditioning embeddings.
-    In the normal pass one more placeholder is appended for the start-audio token;
-    in logits-only mode the original prompt token ids are appended instead.
-    Args:
-        ctx: Input context (not used here).
-        inputs: Decoder-only inputs carrying ``multi_modal_data["audio"]`` with the
-            keys ``embeds`` and, optionally, ``is_logits_only_mode``.
-    Returns:
-        Token inputs with the rewritten ids, no prompt text, and the original
-        multi-modal data.
-    """
-    mm_data = inputs.get("multi_modal_data")
-    audio_entry = mm_data['audio']
-    conditioning = audio_entry.get('embeds')
-    logits_only = audio_entry.get("is_logits_only_mode", False)
-    original_ids = inputs.get("prompt_token_ids")
-    # The text token ids are not needed here: the embeddings were computed externally
-    placeholders = [1] * conditioning.shape[0]
-    if logits_only:
-        rewritten_ids = placeholders + original_ids
-    else:
-        # one extra slot for the start audio generation token
-        rewritten_ids = placeholders + [1]
-    return token_inputs(prompt_token_ids=rewritten_ids,
-                 prompt=None,
-                 multi_modal_data=mm_data)
-@MULTIMODAL_REGISTRY.register_input_mapper("audio", input_mapper_for_xtts)
-@MULTIMODAL_REGISTRY.register_max_multimodal_tokens("audio", get_xtts_max_audio_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_xtts)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_xtts2_gpt)
-class XttsGPT(nn.Module, SupportsMultiModal, SupportsPP):
-    """XTTS GPT model for vLLM: GPT-2 backbone, final LayerNorm and mel-token head."""
-    def __init__(
-            self,
-            config: PretrainedConfig,
-            multimodal_config: MultiModalConfig,
-            cache_config: Optional[CacheConfig] = None,
-            quant_config: Optional[QuantizationConfig] = None,
-    ):
-        super().__init__()
-        self.config = config
-        self.quant_config = quant_config
-        # Core GPT components
-        self.gpt = GPT2Model(
-            config,
-            cache_config,
-            quant_config,
-            prefix="gpt"
-        )
-        self.final_norm =  nn.LayerNorm(config.hidden_size, bias=True, eps=config.layer_norm_epsilon)
-        # Output head for mel tokens
-        self.mel_head = ParallelLMHead(
-            config.num_audio_tokens,
-            config.hidden_size,
-            bias=True,
-            quant_config=quant_config,
-            prefix="mel_head"
-        )
-        self.audio_start_generation_token = config.start_audio_token
-        # Initialize logits processor and sampler
-        logit_scale = getattr(config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(config.num_audio_tokens,
-                                                config.num_audio_tokens,
-                                                logit_scale)
-        self.sampler = Sampler()
-    @staticmethod
-    def check_is_logits_only_mode(is_logits_only_mode):
-        """Normalise the logits-only flag to a plain ``bool``.
-        Accepts a bool, a tensor, or any other truthy/falsy value. A tensor with
-        several elements counts as True when at least one element is truthy.
-        """
-        # First check if it's a boolean
-        if isinstance(is_logits_only_mode, bool):
-            return is_logits_only_mode
-        # Then check if it's a tensor
-        if torch.is_tensor(is_logits_only_mode):
-            # if it's a single-element tensor, return the value
-            if is_logits_only_mode.numel() == 1:
-                return bool(is_logits_only_mode.item())
-            # for other tensors, the flag is set if any element is set;
-            # convert so that every path returns a Python bool, not a 0-d tensor
-            return bool(is_logits_only_mode.any().item())
-        # Fallback
-        return bool(is_logits_only_mode)
-    def _calculate_start_token_indices(self, cond_latents: List[torch.Tensor]) -> List[int]:
-        """Compute the indices at which the start tokens are inserted.
-        Args:
-            cond_latents: List of conditioning tensors
-        Returns:
-            List of indices at which to insert the start tokens
-        """
-        indices = []
-        current_idx = 0
-        for cond_latent in cond_latents:
-            # Add the length of the current segment
-            current_idx += cond_latent.shape[0]
-            # Record the index of the start token that follows this segment
-            indices.append(current_idx)
-            # Advance past the start token that will be added
-            current_idx += 1
-        return indices
-    # noinspection PyMethodOverriding
-    def forward(
-            self,
-            input_ids: torch.Tensor,
-            positions: torch.Tensor,
-            kv_caches: List[torch.Tensor],
-            attn_metadata: AttentionMetadata,
-            intermediate_tensors: Optional["IntermediateTensors"] = None,
-            cond_latents: Optional[torch.Tensor] = None,
-            is_logits_only_mode: bool = False,
-            **kwargs,
-    ) -> Union[torch.Tensor, "IntermediateTensors"]:
-        """Forward pass following VLLM pattern.
-        A batch whose ``input_ids`` are all 1 (the placeholder id) is treated as the
-        first iteration; in that case the last id is overwritten in place with the
-        start-audio token before the GPT backbone is called.
-        """
-        # it is not the first iter either if the cond latents are empty or if the kv_caches are not empty
-        is_first_iteration = (input_ids==1).all()
-        #assert len(input_ids) == 1 or (cond_latents is not None and not is_first_iteration), "Conditioning data (voice conditioning+text_embeddings) is required for XTTS"
-        is_logits_only_mode = self.check_is_logits_only_mode(is_logits_only_mode)
-        if is_first_iteration:
-            # we add it to enable the model to start the generation
-            input_ids[-1] = self.audio_start_generation_token
-        hidden_states = self.gpt(
-            input_ids=input_ids,
-            position_ids=positions,
-            kv_caches=kv_caches,
-            attn_metadata=attn_metadata,
-            intermediate_tensors=intermediate_tensors,
-            # this is the conditioning input ( voice conditioning + text_embeds )
-            input_embeds=cond_latents,
-            is_first_iteration=is_first_iteration,
-            is_logits_only_mode=is_logits_only_mode
-        )
-        return hidden_states
-    def compute_logits(
-            self,
-            hidden_states: torch.Tensor,
-            sampling_metadata: SamplingMetadata,
-    ) -> Optional[torch.Tensor]:
-        """Apply the final norm and project hidden states to mel-token logits.
-        If the sampling params of the first sequence group expose a
-        ``hidden_state_collector``, it is called with the normalised hidden states.
-        """
-        # normalize the hidden states
-        hidden_states = self.final_norm(hidden_states)
-        # Check if we need to collect hidden states
-        sampling_params = sampling_metadata.seq_groups[0].sampling_params
-        if hasattr(sampling_params, 'hidden_state_collector'):
-            # Call the collector directly with the hidden states
-            sampling_params.hidden_state_collector(hidden_states, None)  # The request_id is already bound
-        # Compute logits using the mel_head
-        logits = self.logits_processor(self.mel_head, hidden_states, sampling_metadata)
-        return logits
-    def sample(
-            self,
-            logits: torch.Tensor,
-            sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        """Sample the next tokens from ``logits`` with the model's sampler."""
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        """Load weights following VLLM pattern.
-        Checkpoint entries whose name is not a parameter of this module are skipped.
-        ``.weight`` tensors of ``c_attn``/``c_proj``/``c_fc`` are transposed before
-        loading. Fails if any parameter of the module received no weight.
-        """
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_names = set()
-        for name, loaded_weight in weights:
-            if name not in params_dict:
-                #print(f"Skipping loading of {name} bc it is not found") # used to check if all weights were loaded
-                continue
-            param = params_dict[name]
-            if "c_attn" in name or "c_proj" in name or "c_fc" in name:
-                if name.endswith(".weight"):
-                    loaded_weight = loaded_weight.t()
-            weight_loader = getattr(param, "weight_loader", default_weight_loader)
-            weight_loader(param, loaded_weight)
-            loaded_names.add(name)
-        # used to check if all weights were loaded
-        assert set(params_dict.keys()) - loaded_names == set(), \
-            (f"Missing weights: {set(params_dict.keys()) - loaded_names}, "
-             f"this probably means you are using an incompatible model ")
-class GPT2Model(nn.Module):
-    def __init__(
-            self,
-            config: GPT2Config,
-            cache_config: Optional[CacheConfig] = None,
-            quant_config: Optional[QuantizationConfig] = None,
-            prefix: str = "",
-    ):
-        """Build token/position embeddings, the GPT-2 block stack and the final norm.
-        Cross attention, ``scale_attn_by_inverse_layer_idx`` and
-        ``reorder_and_upcast_attn`` must all be disabled in ``config``.
-        """
-        super().__init__()
-        self.config = config
-        # Unsupported GPT-2 options
-        assert not config.add_cross_attention
-        assert not config.scale_attn_by_inverse_layer_idx
-        assert not config.reorder_and_upcast_attn
-        self.embed_dim = config.hidden_size
-        # Token embedding over the audio-token vocabulary
-        self.wte = VocabParallelEmbedding(config.num_audio_tokens, self.embed_dim)
-        # Learned positions unless max_audio_tokens is the -1 sentinel
-        if config.max_audio_tokens != -1:
-            self.wpe = LearnedPositionEmbeddings(config.max_audio_tokens + 3, config.decoder_input_dim)
-        else:
-            self.wpe = functools.partial(config.null_position_embeddings, dim=config.decoder_input_dim)
-        def build_block(prefix):
-            # The parameter keeps the name `prefix` in case the factory is called by keyword
-            return GPT2Block(config, cache_config, quant_config, prefix=prefix)
-        self.start_layer, self.end_layer, self.h = make_layers(
-            config.num_hidden_layers,
-            build_block,
-            prefix=f"{prefix}.h")
-        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(["hidden_states"],
-                                                    config.hidden_size))
-    def forward(
-            self,
-            input_ids: torch.Tensor,
-            position_ids: torch.Tensor,
-            kv_caches: List[torch.Tensor],
-            attn_metadata: AttentionMetadata,
-            intermediate_tensors: Optional[IntermediateTensors],
-            # we pass this so that we can concatenate the text and conditioning input
-            input_embeds: Optional[torch.Tensor] = None,
-            is_first_iteration: bool = False,
-            is_logits_only_mode: bool = False,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        """Embed the audio tokens, prepend the conditioning input, run the GPT-2 blocks.
-        On the first pipeline rank the hidden states are built from ``input_ids`` and,
-        on the first iteration or in logits-only mode, concatenated after
-        ``input_embeds``. On other ranks they are read from ``intermediate_tensors``.
-        Args:
-            input_ids: Token ids; sliced below as a flat tensor.
-            position_ids: Position of each token.
-            kv_caches: One cache per layer owned by this rank.
-            attn_metadata: Attention metadata; ``seq_lens`` and
-                ``num_prefill_tokens`` are read here.
-            intermediate_tensors: Hidden states from the previous pipeline rank;
-                required when this is not the first rank.
-            input_embeds: Conditioning embeddings, either one tensor or a list of
-                tensors (one per sequence).
-            is_first_iteration: Whether this is the first (prefill) pass.
-            is_logits_only_mode: Whether the pass only converts tokens to latents.
-        Returns:
-            ``IntermediateTensors`` with ``hidden_states`` when this is not the last
-            pipeline rank; otherwise the hidden states after ``ln_f``.
-        """
-        if get_pp_group().is_first_rank:
-            # if we are not doing the final conversion from token to latent and it is first pass(prefill)
-            if is_first_iteration and not is_logits_only_mode:
-                # keep only the last id, as a (1, 1) tensor
-                input_ids = input_ids[-1].reshape(1, 1)
-            elif is_logits_only_mode:
-                # we remove the conditioning input and keep just the audio token
-                if isinstance(input_embeds, list):
-                    # length of the conditioning prefix of each sequence
-                    starting_idx = []
-                    for input_embed in input_embeds:
-                        starting_idx.append(input_embed.shape[0])
-                    ending_ids = attn_metadata.seq_lens  # list
-                    # First sequence: from starting_idx[0] to ending_ids[0]
-                    cumulative_starts = [starting_idx[0]]  # First starts at its own index
-                    cumulative_ends = [ending_ids[0]]  # First ends at its ending_id
-                    # For subsequent sequences:
-                    # Start = previous_end + current_start
-                    # End = previous_end + current_end
-                    for i in range(1, len(starting_idx)):
-                        next_start = cumulative_ends[i - 1] + starting_idx[i]
-                        next_end = cumulative_ends[i - 1] + ending_ids[i]
-                        cumulative_starts.append(next_start)
-                        cumulative_ends.append(next_end)
-                    # number of kept tokens per sequence, used further down to split the hidden states again
-                    ids_for_unpacking = [end-start for start, end in zip(cumulative_starts, cumulative_ends)]
-                    # keep only the [start, end) slice of every sequence and join them into one row
-                    input_ids = torch.cat([
-                        input_ids[start:end].reshape(1, -1)
-                        for start, end in zip(cumulative_starts, cumulative_ends)
-                    ], dim=-1)
-                    position_ids = torch.cat([
-                        position_ids[start:end].reshape(1, -1)
-                        for start, end in zip(cumulative_starts, cumulative_ends)
-                    ], dim= -1).squeeze(0)
-                else:
-                    # NOTE(review): the prefix length is taken from shape[1] here but from shape[0]
-                    # in the list branch -- presumably input_embeds is (1, seq, dim) here; confirm.
-                    input_ids = input_ids[input_embeds.shape[1]:].reshape(1, -1)
-                    position_ids = position_ids[input_embeds.shape[1]:]#.reshape(1, -1)
-            else:
-                # regular decoding step: ids are used as they are (no-op assignment)
-                input_ids = input_ids
-            audio_inputs_embeds = self.wte(input_ids).squeeze(0)
-            # weird but they do it like this in the xtts2 model
-            # Not first iteration: look up the given position ids.
-            # First iteration: wpe.forward only reads x.shape[1], which is 1 after this reshape.
-            position_embeds = self.wpe.get_fixed_embedding(
-                    position_ids, input_ids.device
-            ) if not is_first_iteration \
-                    else self.wpe(audio_inputs_embeds.reshape(-1, 1)) # we need to reshape to 2D tensor or useless?
-            hidden_states = audio_inputs_embeds + position_embeds
-            if isinstance(input_embeds, list) and is_logits_only_mode:
-                # back to one chunk per sequence so each can follow its own conditioning
-                hidden_states = list(hidden_states.split(ids_for_unpacking, dim=0))
-            if is_first_iteration or is_logits_only_mode:
-                # We concat the text and audio conditioning input in the sequence dimension
-                if isinstance(input_embeds, list):
-                    input_embeds = [input_embed.view(-1, input_embed.shape[-1]) for input_embed in input_embeds]
-                else:
-                    input_embeds = input_embeds.view(-1, input_embeds.shape[-1]) # we ensure we have a 2D tensor
-                if not isinstance(input_embeds, list) and input_embeds.shape[0] == attn_metadata.num_prefill_tokens:
-                    # this is during profiling, we need to remove the last token
-                    # the attn_metadata.num_prefill_tokens(prompt len) should be == to input_embeds.shape[0] - 1
-                    # to account for the start audio gen embedding that will be cat to the text embeddings
-                    input_embeds = input_embeds[:-1]
-            if is_first_iteration or is_logits_only_mode:
-                # we concatenate the conditioning input to the text conditioning input
-                if isinstance(input_embeds, list):
-                        # interleave: conditioning of sequence k, then its hidden states;
-                        # a single hidden_states tensor is reused for every sequence
-                        hidden_states = torch.cat([
-                                tensor for pair in zip(input_embeds, [hidden_states] * len(input_embeds)
-                                                    if not isinstance(hidden_states, list) else hidden_states)
-                                for tensor in pair
-                            ], dim=0)
-                else:
-                    hidden_states = torch.cat([input_embeds, hidden_states], dim=0)
-            #flatten the hidden state
-            hidden_states = hidden_states.view(-1, self.embed_dim)
-        else:
-            # later pipeline ranks continue from the previous rank's output
-            assert intermediate_tensors is not None
-            hidden_states = intermediate_tensors["hidden_states"]
-        # run only the layers owned by this rank; kv_caches is indexed relative to start_layer
-        for i in range(self.start_layer, self.end_layer):
-            layer = self.h[i]
-            hidden_states = layer(hidden_states,
-                                  kv_caches[i - self.start_layer],
-                                  attn_metadata)
-        if not get_pp_group().is_last_rank:
-            return IntermediateTensors({"hidden_states": hidden_states})
-        hidden_states = self.ln_f(hidden_states)
-        return hidden_states