Upload folder using huggingface_hub
- Dockerfile +30 -0
- README.md +32 -0
- chunk_silence.py +53 -0
- config.yaml +6 -0
- hotkey.txt +1 -0
- inference.py +244 -0
- mimi_tokenizer.py +48 -0
- models/model.safetensors +3 -0
- models/tokenizer-e351c8d8-checkpoint125.safetensors +3 -0
- models/tokenizer_spm_32k_3.model +3 -0
- moshi/chunk_silence.py +53 -0
- moshi/models/__init__.py +14 -0
- moshi/models/compression.py +474 -0
- moshi/models/lm.py +487 -0
- moshi/models/loaders.py +159 -0
- moshi/modules/__init__.py +23 -0
- moshi/modules/conv.py +329 -0
- moshi/modules/gating.py +82 -0
- moshi/modules/resample.py +119 -0
- moshi/modules/rope.py +90 -0
- moshi/modules/seanet.py +395 -0
- moshi/modules/streaming.py +363 -0
- moshi/modules/transformer.py +750 -0
- moshi/quantization/__init__.py +13 -0
- moshi/quantization/base.py +170 -0
- moshi/quantization/core_vq.py +384 -0
- moshi/quantization/vq.py +340 -0
- moshi/utils/__init__.py +10 -0
- moshi/utils/autocast.py +45 -0
- moshi/utils/compile.py +284 -0
- moshi/utils/sampling.py +126 -0
- pyproject.toml +39 -0
- requirements.txt +43 -0
- server.py +121 -0
- setup.py +78 -0
Dockerfile
ADDED
@@ -0,0 +1,30 @@
FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-runtime

WORKDIR /app

# Install system dependencies
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    build-essential \
    python3-dev \
    git \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Pre-install critical dependencies
RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# Copy entire directory
COPY . .

# Install package in editable mode
RUN pip install -e .

# Environment variables
ENV MODEL_PATH=/app/models \
    PYTHONUNBUFFERED=1

EXPOSE 8000

CMD ["python", "server.py"]
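A typical way to exercise this image (the tag and host port below are illustrative, not part of the upload) would be `docker build -t omega-a2a .` followed by `docker run --gpus all -p 8000:8000 omega-a2a`: the `CMD` launches `server.py` on the exposed port, and the base image assumes a CUDA 12.1-capable runtime.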
README.md
ADDED
@@ -0,0 +1,32 @@
---
license: mit
tags:
- any-to-any
- omega
- omegalabs
- bittensor
- agi
---

This is an Any-to-Any model checkpoint for the OMEGA Labs x Bittensor Any-to-Any subnet.

Check out the [git repo](https://github.com/omegalabsinc/omegalabs-anytoany-bittensor) and find OMEGA on X: [@omegalabsai](https://x.com/omegalabsai).

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Links

- GitHub Repository: [omegalabs-anytoany-bittensor](https://github.com/omegalabsinc/omegalabs-anytoany-bittensor)
- OMEGA Labs on X: [@omegalabsai](https://x.com/omegalabsai)

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

## Support

For support and questions, please:
1. Open an issue on GitHub
2. Follow OMEGA Labs on X [@omegalabsai](https://x.com/omegalabsai)
chunk_silence.py
ADDED
@@ -0,0 +1,53 @@
import os
import glob

import numpy as np
import torch
import torchaudio
from pydub import AudioSegment
from pydub.silence import detect_silence
from tqdm import tqdm

def detect_and_trim_silence(audio_array, frame_rate, min_silence_duration=1000, silence_threshold=-40):
    # Convert the waveform tensor to 16-bit PCM bytes for pydub
    audio_np = audio_array.detach().cpu().numpy()
    audio_bytes = (audio_np * 32767).astype(np.int16).tobytes()

    # Create AudioSegment from raw audio bytes
    audio = AudioSegment(
        data=audio_bytes,
        sample_width=2,  # 16-bit audio = 2 bytes
        frame_rate=frame_rate,
        channels=1  # Mono audio
    )

    # Detect silence (intervals are returned in milliseconds)
    silence_intervals = detect_silence(
        audio,
        min_silence_len=min_silence_duration,
        silence_thresh=silence_threshold
    )

    # Convert milliseconds to seconds
    silence_intervals_seconds = [(start / 1000, end / 1000) for start, end in silence_intervals]

    # Slice the waveform from the end of the first silent interval to the end
    first_silence_end = silence_intervals_seconds[0][1]
    trimmed_audio = audio_np[..., int(first_silence_end * frame_rate):]

    return trimmed_audio

def process_all_audio_files(root_dir, output_dir):
    # Use glob to find all .wav files in the directory and its subdirectories
    wav_files = glob.glob(os.path.join(root_dir, '**', '*.wav'), recursive=True)
    for input_path in tqdm(wav_files, desc="Processing audio files"):
        relative_path = os.path.relpath(input_path, root_dir)
        target_dir = os.path.join(output_dir, os.path.dirname(relative_path))
        os.makedirs(target_dir, exist_ok=True)
        output_path = os.path.join(target_dir, os.path.basename(input_path))
        # Load the audio, down-mix to mono, trim the leading silence, and save
        waveform, sample_rate = torchaudio.load(input_path)
        waveform = waveform.mean(dim=0, keepdim=True)
        trimmed = detect_and_trim_silence(waveform, sample_rate)
        torchaudio.save(output_path, torch.from_numpy(trimmed), sample_rate)

# Use the function to process all audio files
if __name__ == "__main__":
    root_directory = "/workspace/tezuesh/omega-v2v/.predictions_warmup/moshi/audio/"
    output_directory = "/workspace/tezuesh/omega-v2v/.predictions_warmup/moshi/trimmed/"
    process_all_audio_files(root_directory, output_directory)
config.yaml
ADDED
@@ -0,0 +1,6 @@
model:
  name: moshi
  version: 1.0
  description: "Moshi Pretrained Model"
  author: "Tezuesh"
  license: "MIT"
hotkey.txt
ADDED
@@ -0,0 +1 @@
5FeqmebkCWfepQPgSkrEHRwtpUmHGASF4BNERZDs9pvKFtcD
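This file holds a single SS58-encoded Bittensor hotkey address, presumably used to associate this checkpoint with the miner that submitted it to the subnet.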
inference.py
ADDED
@@ -0,0 +1,244 @@
import torch
import numpy as np
import torchaudio
import sentencepiece
import logging
from pathlib import Path
from moshi.models import loaders, LMGen

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class InferenceRecipe:
    """Handles model inference for the Any-to-Any model."""

    def __init__(self, model_path: str, device: str='cuda'):
        """Initialize the model.

        Args:
            model_path (str): Path to model directory with pre-downloaded files
            device (str): Device to run on ('cuda' or 'cpu')
        """
        self.device = torch.device(device)
        self.model_path = Path(model_path)

        # Set sample rate and frame rate
        self.sample_rate = 24000  # Based on model config in loaders.py
        self.frame_rate = 12.5  # Based on model config in loaders.py

        # Initialize all model components
        logger.info(f"Initializing models from {model_path}")
        self.mimi, self.text_tokenizer, self.lm_gen = self._initialize_models()
        logger.info("Model initialization complete")

    def _initialize_models(self):
        """Initialize all required model components."""
        print("Initializing models...")

        try:
            # Load MIMI model for encoding/decoding
            mimi_path = self.model_path / loaders.MIMI_NAME
            if not mimi_path.exists():
                raise RuntimeError(f"MIMI model not found at {mimi_path}")
            logger.info(f"Loading MIMI model from {mimi_path}")
            mimi = loaders.get_mimi(str(mimi_path), device=self.device)
            mimi.set_num_codebooks(8)

            # Load text tokenizer
            tokenizer_path = self.model_path / loaders.TEXT_TOKENIZER_NAME
            if not tokenizer_path.exists():
                raise RuntimeError(f"Text tokenizer not found at {tokenizer_path}")
            logger.info(f"Loading text tokenizer from {tokenizer_path}")
            text_tokenizer = sentencepiece.SentencePieceProcessor(str(tokenizer_path))

            # Load language model
            moshi_path = self.model_path / loaders.MOSHI_NAME
            if not moshi_path.exists():
                raise RuntimeError(f"Language model not found at {moshi_path}")
            logger.info(f"Loading language model from {moshi_path}")
            moshi = loaders.get_moshi_lm(str(moshi_path), device=self.device)
            lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)

            return mimi, text_tokenizer, lm_gen

        except Exception as e:
            logger.error(f"Model initialization failed: {str(e)}")
            raise

    def _load_audio(self, audio_array: np.ndarray, sample_rate: int):
        """Load and preprocess audio."""
        try:
            # Convert to tensor
            wav = torch.from_numpy(audio_array).float().unsqueeze(0).unsqueeze(0)

            # Resample if needed
            if sample_rate != self.sample_rate:
                logger.info(f"Resampling from {sample_rate} to {self.sample_rate}")
                wav = torchaudio.transforms.Resample(
                    orig_freq=sample_rate,
                    new_freq=self.sample_rate
                )(wav)

            # Ensure frame alignment
            frame_size = int(self.sample_rate / self.frame_rate)
            orig_length = wav.shape[-1]
            wav = wav[:, :, :(wav.shape[-1] // frame_size) * frame_size]
            if wav.shape[-1] != orig_length:
                logger.info(f"Trimmed audio from {orig_length} to {wav.shape[-1]} samples for frame alignment")

            return wav

        except Exception as e:
            logger.error(f"Audio loading failed: {str(e)}")
            raise

    def _pad_codes(self, all_codes, time_seconds=30):
        """Pad codes to minimum length if needed."""
        try:
            min_frames = int(time_seconds * self.frame_rate)
            frame_size = int(self.sample_rate / self.frame_rate)

            if len(all_codes) < min_frames:
                frames_to_add = min_frames - len(all_codes)
                logger.info(f"Padding {frames_to_add} frames to reach minimum length")
                with torch.no_grad(), self.mimi.streaming(batch_size=1):
                    chunk = torch.zeros(1, 1, frame_size, dtype=torch.float32, device=self.device)
                    for _ in range(frames_to_add):
                        additional_code = self.mimi.encode(chunk)
                        all_codes.append(additional_code)

            return all_codes

        except Exception as e:
            logger.error(f"Code padding failed: {str(e)}")
            raise

    def _encode_audio(self, wav: torch.Tensor):
        """Convert audio to codes."""
        try:
            frame_size = int(self.sample_rate / self.frame_rate)
            all_codes = []

            with torch.no_grad(), self.mimi.streaming(batch_size=1):
                for offset in range(0, wav.shape[-1], frame_size):
                    frame = wav[:, :, offset: offset + frame_size]
                    codes = self.mimi.encode(frame.to(self.device))
                    assert codes.shape[-1] == 1, f"Expected code shape (*, *, 1), got {codes.shape}"
                    all_codes.append(codes)

            logger.info(f"Encoded {len(all_codes)} frames")
            return all_codes

        except Exception as e:
            logger.error(f"Audio encoding failed: {str(e)}")
            raise

    def _warmup(self):
        """Run a warmup pass."""
        try:
            frame_size = int(self.sample_rate / self.frame_rate)
            chunk = torch.zeros(1, 1, frame_size, dtype=torch.float32, device=self.device)
            codes = self.mimi.encode(chunk)

            with torch.no_grad(), self.lm_gen.streaming(1), self.mimi.streaming(1):
                tokens = self.lm_gen.step(codes[:, :, 0:1])
                if tokens is not None:
                    _ = self.mimi.decode(tokens[:, 1:])

            torch.cuda.synchronize()
            logger.info("Warmup pass completed")

        except Exception as e:
            logger.error(f"Warmup failed: {str(e)}")
            raise

    def _generate(self, all_codes):
        """Generate audio and text from codes."""
        try:
            out_wav_chunks = []
            text_output = []

            with torch.no_grad(), self.lm_gen.streaming(1), self.mimi.streaming(1):
                for i, code in enumerate(all_codes):
                    assert code.shape == (1, 8, 1), f"Expected code shape (1, 8, 1), got {code.shape}"
                    tokens_out = self.lm_gen.step(code.to(self.device))

                    if tokens_out is not None:
                        # Generate audio
                        wav_chunk = self.mimi.decode(tokens_out[:, 1:])
                        out_wav_chunks.append(wav_chunk)

                        # Generate text if available
                        text_token = tokens_out[0, 0, 0].item()
                        if text_token not in (0, 3):
                            _text = self.text_tokenizer.id_to_piece(text_token)
                            _text = _text.replace("▁", " ")
                            text_output.append(_text)

                    if (i + 1) % 100 == 0:
                        logger.info(f"Processed {i + 1}/{len(all_codes)} frames")

            wav = torch.cat(out_wav_chunks, dim=-1)
            text = ''.join(text_output)

            logger.info(f"Generated {wav.shape[-1]} samples of audio and {len(text)} characters of text")
            return wav, text

        except Exception as e:
            logger.error(f"Generation failed: {str(e)}")
            raise

    def inference(self, audio_array: np.ndarray, sample_rate: int) -> dict:
        """Run inference on input audio.

        Args:
            audio_array (np.ndarray): Input audio as numpy array
            sample_rate (int): Sample rate of input audio

        Returns:
            dict: Contains generated audio array and optional transcribed text
        """
        try:
            logger.info(f"Starting inference on {len(audio_array)} samples at {sample_rate}Hz")

            # Load and preprocess audio
            wav = self._load_audio(audio_array, sample_rate)
            wav = wav.to(self.device)

            # Convert to codes
            all_codes = self._encode_audio(wav)
            all_codes = self._pad_codes(all_codes)

            # Warmup pass
            self._warmup()

            # Generate output
            out_wav, text = self._generate(all_codes)

            # Convert output to numpy
            output = out_wav.cpu().numpy().squeeze()

            logger.info("Inference completed successfully")
            return {
                "audio": output,
                "text": text
            }

        except Exception as e:
            logger.error(f"Inference failed: {str(e)}")
            raise

if __name__ == "__main__":
    # Example usage
    import librosa

    # Initialize model
    model = InferenceRecipe("/path/to/models", device="cuda")

    # Load test audio
    audio, sr = librosa.load("test.wav", sr=None)

    # Run inference
    result = model.inference(audio, sr)
    print(f"Generated {len(result['audio'])} samples of audio")
    print(f"Generated text: {result['text']}")
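The frame bookkeeping in `InferenceRecipe` is worth making concrete. The following is a minimal sanity check of the arithmetic, derived directly from the constants and slicing above (no model required):

```python
# Frame arithmetic used throughout InferenceRecipe.
sample_rate = 24000
frame_rate = 12.5
frame_size = int(sample_rate / frame_rate)        # 1920 samples per Mimi frame
min_frames = int(30 * frame_rate)                 # _pad_codes pads up to 375 frames (30 s)

# _load_audio trims the input to a whole number of frames:
n_samples = 100_000
aligned = (n_samples // frame_size) * frame_size  # 99840 samples -> 52 full frames
print(frame_size, min_frames, aligned)
```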
mimi_tokenizer.py
ADDED
@@ -0,0 +1,48 @@
from moshi import models
loaders = models.loaders
from huggingface_hub import hf_hub_download
import torch
from pydub import AudioSegment
import numpy as np

MIMI_NAME = 'tokenizer-e351c8d8-checkpoint125.safetensors'
DEFAULT_REPO = 'kyutai/moshiko-pytorch-bf16'


device = "cuda" if torch.cuda.is_available() else "cpu"
mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
mimi = loaders.get_mimi(mimi_weight, device=device)

def encode_audio(mimi, wav, device):
    frame_size = int(mimi.sample_rate / mimi.frame_rate)
    all_codes = []
    with torch.no_grad(), mimi.streaming(batch_size=1):
        for offset in range(0, wav.shape[-1], frame_size):
            frame = wav[:, :, offset: offset + frame_size]
            codes = mimi.encode(frame.to(device))
            assert codes.shape[-1] == 1, codes.shape
            all_codes.append(codes)

    return all_codes


def load_audio(wav_path, mimi):
    audio = AudioSegment.from_wav(wav_path)
    samples = np.array(audio.get_array_of_samples())
    samples = samples.astype(np.float32) / (2**15 if audio.sample_width == 2 else 2**31)
    wav = torch.from_numpy(samples).float().unsqueeze(0).unsqueeze(0)

    if audio.frame_rate != mimi.sample_rate:
        wav = torch.nn.functional.interpolate(wav, scale_factor=mimi.sample_rate/audio.frame_rate, mode='linear', align_corners=False)

    frame_size = int(mimi.sample_rate / mimi.frame_rate)
    wav = wav[:, :, :(wav.shape[-1] // frame_size) * frame_size]

    return wav
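For reference, a minimal sketch of how the two helpers compose, if appended to mimi_tokenizer.py (the `sample.wav` path is illustrative):

```python
# Hypothetical usage of the helpers above: tokenize a local wav file.
wav = load_audio("sample.wav", mimi)        # [1, 1, T], trimmed to whole frames
codes = encode_audio(mimi, wav, device)     # list of [1, K, 1] code tensors
codes = torch.cat(codes, dim=-1)            # [1, K, num_frames]
print(codes.shape)
```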
models/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5b835c664f3830bf453808cbca9bfbcc9de332c328cc01cbffdfbaba2a8838a7
size 15375500136
models/tokenizer-e351c8d8-checkpoint125.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:09b782f0629851a271227fb9d36db65c041790365f11bbe5d3d59369cf863f50
size 384644900
models/tokenizer_spm_32k_3.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:78d4336533ddc26f9acf7250d7fb83492152196c6ea4212c841df76933f18d2d
size 552778
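These three entries are Git LFS pointer stubs rather than the weights themselves: each records the LFS spec version, the SHA-256 of the real file, and its size in bytes (about 15.4 GB for the language model, 385 MB for the Mimi tokenizer checkpoint, and 553 KB for the SentencePiece model). The actual binaries live in LFS storage.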
moshi/chunk_silence.py
ADDED
@@ -0,0 +1,53 @@
import os
import glob

import numpy as np
import torch
import torchaudio
from pydub import AudioSegment
from pydub.silence import detect_silence
from tqdm import tqdm

def detect_and_trim_silence(audio_array, frame_rate, min_silence_duration=1000, silence_threshold=-40):
    # Convert the waveform tensor to 16-bit PCM bytes for pydub
    audio_np = audio_array.detach().cpu().numpy()
    audio_bytes = (audio_np * 32767).astype(np.int16).tobytes()

    # Create AudioSegment from raw audio bytes
    audio = AudioSegment(
        data=audio_bytes,
        sample_width=2,  # 16-bit audio = 2 bytes
        frame_rate=frame_rate,
        channels=1  # Mono audio
    )

    # Detect silence (intervals are returned in milliseconds)
    silence_intervals = detect_silence(
        audio,
        min_silence_len=min_silence_duration,
        silence_thresh=silence_threshold
    )

    # Convert milliseconds to seconds
    silence_intervals_seconds = [(start / 1000, end / 1000) for start, end in silence_intervals]

    # Slice the waveform from the end of the first silent interval to the end
    first_silence_end = silence_intervals_seconds[0][1]
    trimmed_audio = audio_np[..., int(first_silence_end * frame_rate):]

    return trimmed_audio

def process_all_audio_files(root_dir, output_dir):
    # Use glob to find all .wav files in the directory and its subdirectories
    wav_files = glob.glob(os.path.join(root_dir, '**', '*.wav'), recursive=True)
    for input_path in tqdm(wav_files, desc="Processing audio files"):
        relative_path = os.path.relpath(input_path, root_dir)
        target_dir = os.path.join(output_dir, os.path.dirname(relative_path))
        os.makedirs(target_dir, exist_ok=True)
        output_path = os.path.join(target_dir, os.path.basename(input_path))
        # Load the audio, down-mix to mono, trim the leading silence, and save
        waveform, sample_rate = torchaudio.load(input_path)
        waveform = waveform.mean(dim=0, keepdim=True)
        trimmed = detect_and_trim_silence(waveform, sample_rate)
        torchaudio.save(output_path, torch.from_numpy(trimmed), sample_rate)

# Use the function to process all audio files
if __name__ == "__main__":
    root_directory = "/workspace/tezuesh/omega-v2v/.predictions_warmup/moshi/audio/"
    output_directory = "/workspace/tezuesh/omega-v2v/.predictions_warmup/moshi/trimmed/"
    process_all_audio_files(root_directory, output_directory)
moshi/models/__init__.py
ADDED
@@ -0,0 +1,14 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
Models for the compression model Moshi,
"""

# flake8: noqa
from moshi.models.compression import (
    CompressionModel,
    MimiModel,
)
from moshi.models.lm import LMModel, LMGen
from moshi.models.loaders import get_mimi, get_moshi_lm
moshi/models/compression.py
ADDED
@@ -0,0 +1,474 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Part of this file is adapted from encodec.py in https://github.com/facebookresearch/audiocraft
# released under the following license.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Compression models or wrapper around existing models. In particular, provides the implementation
for Mimi. Also defines the main interface that a model must follow to be usable as an audio tokenizer.
"""

from abc import abstractmethod
from contextlib import nullcontext
from dataclasses import dataclass
import logging
import typing as tp

import torch
from torch import nn


from moshi.quantization import (
    QuantizedResult,
    BaseQuantizer,
    SplitResidualVectorQuantizer,
    ResidualVectorQuantizer,
)
from moshi.modules.resample import ConvDownsample1d, ConvTrUpsample1d
from moshi.modules.streaming import StreamingModule, State
from moshi.utils.compile import no_compile, CUDAGraphed


logger = logging.getLogger()


class CompressionModel(StreamingModule[State]):
    """Base API for all compression models that aim at being used as audio tokenizers
    with a language model.
    """

    @abstractmethod
    def forward(self, x: torch.Tensor) -> QuantizedResult: ...

    @abstractmethod
    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """See `MimiModel.encode`."""
        ...

    @abstractmethod
    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """See `MimiModel.decode`."""
        ...

    @abstractmethod
    def decode_latent(self, codes: torch.Tensor) -> torch.Tensor:
        """Decode from the discrete codes to continuous latent space."""
        ...

    @property
    @abstractmethod
    def channels(self) -> int: ...

    @property
    @abstractmethod
    def frame_rate(self) -> float: ...

    @property
    @abstractmethod
    def sample_rate(self) -> int: ...

    @property
    @abstractmethod
    def cardinality(self) -> int: ...

    @property
    @abstractmethod
    def num_codebooks(self) -> int: ...

    @property
    @abstractmethod
    def total_codebooks(self) -> int: ...

    @abstractmethod
    def set_num_codebooks(self, n: int):
        """Set the active number of codebooks used by the quantizer."""
        ...


@dataclass
class _MimiState:
    graphed_tr_enc: CUDAGraphed | None
    graphed_tr_dec: CUDAGraphed | None

    def reset(self):
        pass


class MimiModel(CompressionModel[_MimiState]):
    """Mimi model operating on the raw waveform.

    Args:
        encoder (nn.Module): Encoder network.
        decoder (nn.Module): Decoder network.
        quantizer (qt.BaseQuantizer): Quantizer network.
        frame_rate (float): Final frame rate of the quantized representation.
        encoder_frame_rate (float): frame rate of the encoder model. Note that if `frame_rate != encoder_frame_rate`,
            the latent will be resampled linearly to match the desired `frame_rate` before and after quantization.
        sample_rate (int): Audio sample rate.
        channels (int): Number of audio channels.
        causal (bool): Whether to use a causal version of the model.
        encoder_transformer (nn.Module or None): optional transformer for the encoder.
        decoder_transformer (nn.Module or None): optional transformer for the decoder.
        resample_method (str): method to use for resampling the latent space before the quantizer.
        upsample_channel_wise_bug (bool): controls whether the upsampling is channel wise.
            Defaults to true to reproduce bug in original implementation.
        freeze_encoder: whether to freeze the encoder weights.
        freeze_quantizer: whether to freeze the quantizer weights.
        freeze_quantizer_level: If positive, freeze the quantizer up to this level.
        torch_compile_encoder_decoder (bool): if True, uses torch.compile on the encoder / decoder.
            Deactivated by default for training as this is incompatible at the moment with weight norm.
            See https://github.com/pytorch/pytorch/issues/121902
            Also this seems to work well with 2.2.0, but completely fail with 2.4.0.
    """

    def __init__(
        self,
        encoder: nn.Module,
        decoder: nn.Module,
        quantizer: BaseQuantizer,
        frame_rate: float,
        encoder_frame_rate: float,
        sample_rate: int,
        channels: int,
        causal: bool = False,
        encoder_transformer: tp.Optional[nn.Module] = None,
        decoder_transformer: tp.Optional[nn.Module] = None,
        resample_method: str = "interpolate",
        upsample_channel_wise_bug: bool = True,
        freeze_encoder: bool = False,
        freeze_quantizer: bool = False,
        freeze_quantizer_level: int = -1,
        torch_compile_encoder_decoder: bool = False,
    ):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.encoder_transformer = encoder_transformer
        self.decoder_transformer = decoder_transformer
        self.quantizer = quantizer
        self._frame_rate = frame_rate
        self._sample_rate = sample_rate
        self._channels = channels
        self.encoder_frame_rate = encoder_frame_rate
        self.torch_compile_encoder_decoder = torch_compile_encoder_decoder

        if freeze_encoder:
            for p in self.encoder.parameters():
                p.requires_grad = False
            if self.encoder_transformer is not None:
                for p in self.encoder_transformer.parameters():
                    p.requires_grad = False
            for name, p in self.quantizer.named_parameters():
                if name.endswith("input_proj.weight"):
                    p.requires_grad = False
        if freeze_quantizer:
            self.quantizer.ema_frozen_(True)
        self.freeze_quantizer = freeze_quantizer
        self.freeze_quantizer_level = (
            freeze_quantizer_level
            if freeze_quantizer_level > 0
            else self.quantizer.num_codebooks
        )

        # We will need the dimension for the resampling. In general the encoder will be a SeanetEncoder
        # which exposes a `dimension` attribute.
        dimension = encoder.dimension
        assert isinstance(
            dimension, int
        ), f"Dimension should be int, got {dimension} of type {type(dimension)}."
        self.dimension = dimension

        assert resample_method in [
            "interpolate",
            "conv",
            "avg_pool",
        ], f"Invalid resample_method {resample_method}"
        self.resample_method = resample_method
        if encoder_frame_rate != frame_rate:
            assert not (
                causal and resample_method == "interpolate"
            ), "Cannot interpolate with causal model."
            if resample_method in ["conv", "avg_pool"]:
                assert (
                    self.encoder_frame_rate > self.frame_rate
                ), "Cannot upsample with conv."
                downsample_stride = self.encoder_frame_rate / self.frame_rate
                assert downsample_stride == int(
                    downsample_stride
                ), f"Only integer strides are supported, got {downsample_stride}"
                learnt = resample_method == "conv"
                self.downsample = ConvDownsample1d(
                    int(downsample_stride),
                    dimension=dimension,
                    learnt=learnt,
                    causal=causal,
                )
                if freeze_encoder:
                    for p in self.downsample.parameters():
                        p.requires_grad = False
                self.upsample = ConvTrUpsample1d(
                    int(downsample_stride),
                    dimension=dimension,
                    learnt=learnt,
                    causal=causal,
                    channel_wise=upsample_channel_wise_bug,
                )

    def _init_streaming_state(self, batch_size: int) -> _MimiState:
        device = next(self.parameters()).device
        disable = device.type != 'cuda'
        graphed_tr_dec = None
        graphed_tr_enc = None
        if self.encoder_transformer is not None:
            graphed_tr_enc = CUDAGraphed(self.encoder_transformer, disable=disable)
        if self.decoder_transformer is not None:
            graphed_tr_dec = CUDAGraphed(self.decoder_transformer, disable=disable)
        return _MimiState(graphed_tr_enc, graphed_tr_dec)

    @property
    def channels(self) -> int:
        return self._channels

    @property
    def frame_rate(self) -> float:
        return self._frame_rate

    @property
    def sample_rate(self) -> int:
        return self._sample_rate

    @property
    def total_codebooks(self):
        """Total number of quantizer codebooks available."""
        return self.quantizer.total_codebooks

    @property
    def num_codebooks(self):
        """Active number of codebooks used by the quantizer."""
        return self.quantizer.num_codebooks

    def set_num_codebooks(self, n: int):
        """Set the active number of codebooks used by the quantizer."""
        self.quantizer.set_num_codebooks(n)

    @property
    def cardinality(self):
        """Cardinality of each codebook."""
        return self.quantizer.cardinality

    def _to_framerate(self, x: torch.Tensor):
        # Convert from the encoder frame rate to the overall framerate.
        _, _, length = x.shape
        frame_rate = self.encoder_frame_rate
        new_frame_rate = self.frame_rate
        if frame_rate == new_frame_rate:
            return x
        if self.resample_method == "interpolate":
            target_length = int(length * new_frame_rate / frame_rate)
            return nn.functional.interpolate(x, size=target_length, mode="linear")
        else:
            return self.downsample(x)

    def _to_encoder_framerate(self, x: torch.Tensor):
        # Convert from overall framerate to the encoder frame rate.
        _, _, length = x.shape
        frame_rate = self.encoder_frame_rate
        new_frame_rate = self.frame_rate
        if frame_rate == new_frame_rate:
            return x
        if self.resample_method == "interpolate":
            target_length = int(length * new_frame_rate / frame_rate)
            return nn.functional.interpolate(x, size=target_length, mode="linear")
        else:
            return self.upsample(x)

    @property
    def _context_for_encoder_decoder(self):
        if self.torch_compile_encoder_decoder:
            return nullcontext()
        else:
            return no_compile()

    def forward(self, x: torch.Tensor) -> QuantizedResult:
        assert x.dim() == 3
        length = x.shape[-1]
        extra_metrics: tp.Dict[str, torch.Tensor] = {}

        if self.freeze_quantizer:
            if isinstance(self.quantizer, SplitResidualVectorQuantizer):
                self.quantizer.rvq_first.eval()
                for i in range(
                    self.freeze_quantizer_level - self.quantizer.n_q_semantic
                ):
                    self.quantizer.rvq_rest.vq.layers[i].eval()
            elif isinstance(self.quantizer, ResidualVectorQuantizer):
                for i in range(self.freeze_quantizer_level):
                    self.quantizer.vq.layers[i].eval()
            else:
                raise ValueError(f"Unsupported quantizer type {type(self.quantizer)}")

        with self._context_for_encoder_decoder:
            emb = self.encoder(x)
        if self.encoder_transformer is not None:
            (emb,) = self.encoder_transformer(emb)
        emb = self._to_framerate(emb)
        expected_length = self.frame_rate * length / self.sample_rate
        # Checking that we have the proper length given the advertised frame rate.
        assert abs(emb.shape[-1] - expected_length) < 1, (
            emb.shape[-1],
            expected_length,
        )

        q_res = self.quantizer(emb, self.frame_rate)
        emb = q_res.x
        emb = self._to_encoder_framerate(emb)
        if self.decoder_transformer is not None:
            (emb,) = self.decoder_transformer(emb)

        with self._context_for_encoder_decoder:
            out = self.decoder(emb)

        # remove extra padding added by the encoder and decoder
        assert out.shape[-1] >= length, (out.shape[-1], length)
        out = out[..., :length]

        q_res.x = out
        q_res.metrics.update(extra_metrics)
        return q_res

    def _encode_to_unquantized_latent(self, x: torch.Tensor) -> torch.Tensor:
        """Projects a batch of waveforms to unquantized latent space.

        Args:
            x (torch.Tensor): Float tensor of shape [B, C, T].

        Returns:
            Unquantized embeddings.
        """
        assert (
            x.dim() == 3
        ), f"CompressionModel._encode_to_unquantized_latent expects audio of shape [B, C, T] but got {x.shape}"
        state = self._streaming_state
        with self._context_for_encoder_decoder:
            emb = self.encoder(x)
        if self.encoder_transformer is not None:
            if state is None:
                (emb,) = self.encoder_transformer(emb)
            else:
                assert state.graphed_tr_enc is not None
                (emb,) = state.graphed_tr_enc(emb)
        emb = self._to_framerate(emb)
        return emb

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode the given input tensor to quantized representation.

        Args:
            x (torch.Tensor): Float tensor of shape [B, C, T]

        Returns:
            codes (torch.Tensor): an int tensor of shape [B, K, T]
                with K the number of codebooks used and T the timestep.
        """
        emb = self._encode_to_unquantized_latent(x)
        codes = self.quantizer.encode(emb)
        return codes

    def encode_to_latent(self, x: torch.Tensor, quantize: bool = True) -> torch.Tensor:
        """Projects a batch of waveforms to latent space.

        Args:
            x (torch.Tensor): Float tensor of shape [B, C, T].

        Returns:
            Embeddings, either quantized or not.
        """
        emb = self._encode_to_unquantized_latent(x)
        if not quantize:
            return emb
        else:
            codes = self.quantizer.encode(emb)
            return self.decode_latent(codes)

    def decode(self, codes: torch.Tensor):
        """Decode the given codes to a reconstructed representation.

        Args:
            codes (torch.Tensor): Int tensor of shape [B, K, T]

        Returns:
            out (torch.Tensor): Float tensor of shape [B, C, T], the reconstructed audio.
        """
        state = self._streaming_state
        emb = self.decode_latent(codes)
        emb = self._to_encoder_framerate(emb)
        if self.decoder_transformer is not None:
            if state is None:
                (emb,) = self.decoder_transformer(emb)
            else:
                assert state.graphed_tr_dec is not None
                (emb,) = state.graphed_tr_dec(emb)
        with self._context_for_encoder_decoder:
            out = self.decoder(emb)
        # out contains extra padding added by the encoder and decoder
        return out

    def decode_latent(self, codes: torch.Tensor) -> torch.Tensor:
        """Decode from the discrete codes to continuous latent space."""
        return self.quantizer.decode(codes)


class WrapperCompressionModel(CompressionModel[State]):
    """Base API for CompressionModel wrappers that do not depend on external frameworks."""

    def __init__(self, model: CompressionModel):
        super().__init__()
        self.model = model

    def forward(self, x: torch.Tensor) -> QuantizedResult:
        return self.model.forward(x)

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        return self.model.encode(x)

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        return self.model.decode(codes)

    def decode_latent(self, codes: torch.Tensor) -> torch.Tensor:
        return self.model.decode_latent(codes)

    def set_num_codebooks(self, n: int):
        self.model.set_num_codebooks(n)

    @property
    def quantizer(self):
        return self.model.quantizer

    @property
    def channels(self) -> int:
        return self.model.channels

    @property
    def frame_rate(self) -> float:
        return self.model.frame_rate

    @property
    def sample_rate(self) -> int:
        return self.model.sample_rate

    @property
    def cardinality(self) -> int:
        return self.model.cardinality

    @property
    def num_codebooks(self) -> int:
        return self.model.num_codebooks

    @property
    def total_codebooks(self) -> int:
        return self.model.total_codebooks
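To make the encode/decode contract concrete, here is a minimal round-trip sketch. Assumptions: the Mimi checkpoint shipped in models/ above, CPU execution, and the 24 kHz / 12.5 Hz defaults; the two-second input length is illustrative.

```python
import torch
from moshi.models import loaders

# Load the Mimi audio tokenizer, as inference.py does.
mimi = loaders.get_mimi(
    "models/tokenizer-e351c8d8-checkpoint125.safetensors", device="cpu")
mimi.set_num_codebooks(8)

frame_size = int(mimi.sample_rate / mimi.frame_rate)  # 24000 / 12.5 = 1920
wav = torch.zeros(1, 1, frame_size * 25)   # [B, C, T]: 2 s of silence
codes = mimi.encode(wav)                   # int codes, [1, 8, 25]
recon = mimi.decode(codes)                 # [1, 1, ~48000]; may carry extra padding
print(codes.shape, recon.shape)
```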
moshi/models/lm.py
ADDED
@@ -0,0 +1,487 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass
from functools import partial
import logging
import typing as tp

import torch
from torch import nn

from moshi.utils.sampling import sample_token
from moshi.utils.compile import CUDAGraphed
from moshi.modules.streaming import StreamingContainer, StreamingModule
from moshi.modules.transformer import (
    StreamingTransformer,
    create_norm_fn,
)


logger = logging.getLogger(__name__)


class ScaledEmbedding(nn.Embedding):
    """Boost learning rate for embeddings (with `scale`).

    Args:
        norm (bool): if True, uses a layer norm after the embedding.
        zero_idx (int): special value indicating that the output should be exactly 0.
    """

    def __init__(self, *args, norm: bool = False, zero_idx: int = -1, **kwargs):
        super().__init__(*args, **kwargs)
        self.norm = None
        if norm:
            self.norm = create_norm_fn("layer_norm", self.embedding_dim)
        assert zero_idx < 0, "Please use negative values for the zero_idx."
        self.zero_idx = zero_idx

    def forward(self, input, *args, **kwargs):
        is_zero = input == self.zero_idx
        zero = torch.zeros(1, dtype=input.dtype, device=input.device)
        input = input.clamp(min=0)
        y = super().forward(input, *args, **kwargs)
        if self.norm is not None:
            y = self.norm(y)
        y = torch.where(is_zero[..., None], zero, y)
        return y


class LMModel(StreamingContainer):
    """Transformer-based language model on multiple streams of codes.

    Args:
        n_q (int): Number of parallel streams to model as input.
        dep_q (int): Number of parallel streams to model in the depformer.
        card (int): Cardinality, vocabulary size.
        text_card (int): Cardinality of the text vocabulary.
        dim (int): Dimension of the transformer encoder.
        num_heads (int): Number of heads for the transformer encoder.
        hidden_scale (int): Scale for hidden feed forward dimension of the transformer encoder.
        norm (str): Normalization method.
        norm_emb (bool): Whether to normalize embeddings.
        bias_proj (bool): Use bias for output projections.
        depformer_*: params used for the Depformer Transformer, all the others will be shared.
        depformer_multi_linear (bool): if True, uses one linear layer per codebook to project the
            output of the main transformer to the Depformer latent space.
        depformer_dim_feedforward (int | list[int] | None): If None, defaults to hidden_scale * depformer_dim.
        existing_text_padding_id (bool): if True, will use a different token for the initial text token, and
            the text padding token.
        same_initial (bool): if True, uses the same initial tokens for both text and audio mode.
        **kwargs: Additional parameters for the transformer encoder.
    """

    def __init__(
        self,
        delays: tp.List[int] = [0],
        n_q: int = 8,
        dep_q: int = 8,
        card: int = 1024,
        text_card: int = 32000,
        dim: int = 128,
        num_heads: int = 8,
        hidden_scale: int = 4,
        norm: str = "layer_norm",
        norm_emb: bool = False,
        bias_proj: bool = False,
        depformer_dim: int = 256,
        depformer_dim_feedforward: int | list[int] | None = None,
        depformer_multi_linear: bool = False,
        depformer_weights_per_step: bool = False,
        depformer_pos_emb: str = "sin",
        existing_text_padding_id: tp.Optional[int] = None,
        context: tp.Optional[int] = None,
        device=None,
        dtype=None,
        **kwargs,
    ):
        super().__init__()
        self.n_q = n_q
        self.dep_q = dep_q
        self.card = card
        self.text_card = text_card
        assert len(delays) == self.num_codebooks, "unexpected number of delays"
        self.delays = delays
        self.dim = dim
        self.existing_text_padding_id = existing_text_padding_id
        self.context = context
        kwargs["context"] = context
        EmbeddingFactory = partial(
            ScaledEmbedding,
            norm=norm_emb,
            device=device,
            dtype=dtype,
            zero_idx=self.zero_token_id,
        )
        self.emb = nn.ModuleList(
            [EmbeddingFactory(self.card + 1, dim) for _ in range(n_q)]
        )
        # Text card + padding token (if not in the original tokenizer)
        extra_text = self.existing_text_padding_id is None
        # Unlike for audio, here we authorize the model to output the special token.
        self.text_emb = EmbeddingFactory(text_card + 1, dim)
        self.text_linear = nn.Linear(dim, text_card + extra_text, bias=bias_proj)
        depformer_prefix = "depformer_"
        main_kwargs = {
            k: v for k, v in kwargs.items() if not k.startswith(depformer_prefix)
        }
        self.transformer = StreamingTransformer(
            d_model=dim,
            num_heads=num_heads,
            dim_feedforward=int(hidden_scale * dim),
            norm=norm,
            device=device,
            dtype=dtype,
            **main_kwargs,
        )
        self.out_norm = create_norm_fn(norm, dim)
        self.depformer_multi_linear = depformer_multi_linear
        kwargs_dep = main_kwargs.copy()
        kwargs_dep.update(
            {
                k.removeprefix(depformer_prefix): v
                for k, v in kwargs.items()
                if k.startswith(depformer_prefix)
            }
        )
        kwargs_dep["positional_embedding"] = depformer_pos_emb
        kwargs_dep["context"] = None
        if depformer_weights_per_step:
            kwargs_dep["weights_per_step"] = dep_q
        if depformer_multi_linear:
            # One linear layer per codebook to project different information from the main model.
            self.depformer_in = nn.ModuleList(
                [nn.Linear(dim, depformer_dim, bias=False) for _ in range(dep_q)]
            )
        else:
            self.depformer_in = nn.ModuleList(
                [nn.Linear(dim, depformer_dim, bias=False)]
            )
        # Only using up to dep_q - 1 because the last codebook is never an input to Depformer.
        self.depformer_emb = nn.ModuleList(
            [EmbeddingFactory(self.card + 1, depformer_dim) for _ in range(dep_q - 1)]
        )
        self.depformer_text_emb = EmbeddingFactory(text_card + 1, depformer_dim)
        if depformer_dim_feedforward is None:
            depformer_dim_feedforward = int(hidden_scale * depformer_dim)
        self.depformer = StreamingTransformer(
            d_model=depformer_dim,
            dim_feedforward=depformer_dim_feedforward,
            norm=norm,
            device=device,
            dtype=dtype,
            **kwargs_dep,
        )
        self.depformer.set_streaming_propagate(False)
        dim = depformer_dim  # we will directly apply the next linears to the output of the Depformer.

        self.linears = nn.ModuleList(
            [nn.Linear(dim, self.card, bias=bias_proj) for _ in range(dep_q)]
        )

    @property
    def initial_token_id(self) -> int:
        """Token id for the start of sequence (audio)."""
        return self.card

    @property
    def text_initial_token_id(self) -> int:
        """Token id for the start of sequence (text)."""
        return self.text_card

    @property
    def text_padding_token_id(self) -> int:
        """Token id for text padding."""
        if self.existing_text_padding_id is None:
            return self.text_card
        else:
            return self.existing_text_padding_id

    @property
    def end_of_text_padding_id(self) -> int:
        """Token id for optionally marking the last padding step for a word."""
        return 0

    @property
    def zero_token_id(self) -> int:
        """Special value in the input tokens, indicating that no sampling should
        happen for that value, and no input should be given to the model."""
        return -1

    @property
    def ungenerated_token_id(self) -> int:
        """Special value that can be provided in the prompt to indicate that this specific
        value should be predicted and sampled. This allows for partial teacher forcing, by generating
        one modality, with the other one fixed.
        """
        return -2

    @property
    def device(self):
        first_param = next(iter(self.parameters()))
        return first_param.device

    @property
    def num_codebooks(self) -> int:
        return self.n_q + 1

    @property
    def num_audio_codebooks(self) -> int:
        return self.n_q

    @property
    def audio_offset(self) -> int:
        return 1

    def _get_initial_token(self) -> torch.Tensor:
        # Returns the initial token that will be fed to the model to predict the very first timestep.
        # The output shape will be [B, K, 1].
        device = next(iter(self.parameters())).device
        zero = torch.full(
            [1, 1, 1], self.zero_token_id, device=device, dtype=torch.long
        )
        special = torch.full_like(zero, self.initial_token_id)

        text_special = torch.full_like(zero, self.text_initial_token_id)
        audio_token = special
        text_token = text_special
        audio_token = audio_token.expand(-1, self.num_audio_codebooks, -1)
        token = torch.cat([text_token, audio_token], dim=1)
        return token

    def forward_text(
        self,
        sequence: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        B, K, S = sequence.shape
        assert (
            K == self.num_codebooks
        ), f"Sequence shape {sequence.shape} must match the number of codebooks."
        input_sequence = sequence
        input_ = None
        for cb_index in range(self.num_audio_codebooks):
            audio_emb = self.emb[cb_index](
                input_sequence[:, cb_index + self.audio_offset]
            )
            input_ = audio_emb if input_ is None else input_ + audio_emb
        text_emb = self.text_emb(input_sequence[:, 0])
        input_ = text_emb if input_ is None else input_ + text_emb
        transformer_out = self.transformer(input_)

        if self.out_norm:
            transformer_out = self.out_norm(transformer_out)
        assert isinstance(transformer_out, torch.Tensor)
        text_logits = self.text_linear(transformer_out)
        text_logits = text_logits[:, None]
        return transformer_out, text_logits

    def forward_depformer(
        self,
        depformer_cb_index: int,
        sequence: torch.Tensor,
        transformer_out: torch.Tensor,
    ) -> torch.Tensor:
        B, K, S = sequence.shape
        assert (
            K == 1
        ), f"Codebooks for Depformer streaming should be passed 1 by 1, got {K}."
        assert (
            S == 1
        ), f"Steps for Depformer streaming should be passed 1 by 1, got {S}."
        assert (
            transformer_out.shape[1] == 1
        ), "Transformer out should be for a single step."
        last_token_input: tp.Optional[torch.Tensor] = None
        depformer_input = transformer_out
        if self.depformer_multi_linear:
            depformer_input = self.depformer_in[depformer_cb_index](depformer_input)
        else:
            depformer_input = self.depformer_in[0](depformer_input)
        if depformer_cb_index == 0:
            last_token_input = self.depformer_text_emb(sequence[:, 0])
        else:
            last_token_input = self.depformer_emb[depformer_cb_index - 1](
                sequence[:, 0]
            )
        depformer_input = depformer_input + last_token_input
        assert depformer_input.shape[1] == 1
        # depformer_input is [B, 1, depformer_dim].
        # The streaming state of the depformer ensures that the proper layer is run.
        dep_output = self.depformer(depformer_input)
        logits = self.linears[depformer_cb_index](dep_output)
        logits = logits[:, None]
        assert logits.dim() == 4, logits.shape  # [B, Ka, S, card]
        return logits


@dataclass
class _LMGenState:
    cache: torch.Tensor
    initial: torch.Tensor
    graphed_main: CUDAGraphed
    graphed_depth: CUDAGraphed
    offset: int = 0

    def reset(self):
        self.offset = 0


class LMGen(StreamingModule[_LMGenState]):
    def __init__(
        self,
        lm_model: LMModel,
        use_sampling: bool = True,
        temp: float = 0.8,
        temp_text: float = 0.7,
        top_k: int = 250,
        top_k_text: int = 25,
        check: bool = False,
    ):
        assert not lm_model.training, "generation shouldn't be used in training mode."
        super().__init__()

        self.lm_model = lm_model
        self.use_sampling = use_sampling
        self.temp = temp
        self.temp_text = temp_text
        self.top_k = top_k
        self.top_k_text = top_k_text
        self.check = check
        self.max_delay = max(
            lm_model.delays
        )  # with delays, we need to generate a few more time steps.
        self.delays_cuda = torch.tensor(
            lm_model.delays, device=lm_model.device, dtype=torch.long
        )

    def _init_streaming_state(self, batch_size: int) -> _LMGenState:
        lm_model = self.lm_model
        initial = lm_model._get_initial_token()
        cache = torch.full(
            (batch_size, self.lm_model.num_codebooks, self.max_delay + 2),
            lm_model.ungenerated_token_id,
            device=lm_model.device,
            dtype=torch.long,
        )

        disable = lm_model.device.type != 'cuda'
        graphed_main = CUDAGraphed(lm_model.forward_text, disable=disable)
        graphed_depth = CUDAGraphed(self.depformer_step, disable=disable)

        return _LMGenState(cache, initial, graphed_main, graphed_depth)

    @torch.no_grad()
    def step(self, input_tokens: torch.Tensor) -> torch.Tensor | None:
        state = self._streaming_state
        if state is None:
            raise RuntimeError(
                "You should wrap those calls with a `with lm_gen.streaming(): ...`."
            )
|
387 |
+
)
|
388 |
+
lm_model = self.lm_model
|
389 |
+
|
390 |
+
assert input_tokens.dim() == 3, "Shape should be [B, K, T]."
|
391 |
+
B, Ki, S = input_tokens.shape
|
392 |
+
assert S == 1, "Only support being given steps one by one."
|
393 |
+
needed_tokens = lm_model.num_codebooks - lm_model.dep_q - 1
|
394 |
+
assert (
|
395 |
+
Ki == needed_tokens
|
396 |
+
), f"We expect {needed_tokens} tokens from the user stream, got {Ki}."
|
397 |
+
|
398 |
+
CT = state.cache.shape[2]
|
399 |
+
for q_other in range(input_tokens.shape[1]):
|
400 |
+
k = lm_model.dep_q + 1 + q_other
|
401 |
+
delay = lm_model.delays[k]
|
402 |
+
write_position = (state.offset + delay) % CT
|
403 |
+
state.cache[:, k, write_position : write_position + 1] = input_tokens[
|
404 |
+
:, q_other
|
405 |
+
]
|
406 |
+
|
407 |
+
position = state.offset % CT
|
408 |
+
for k, delay in enumerate(lm_model.delays):
|
409 |
+
# Only for the very beginning, we extend the initial token for the acoustic
|
410 |
+
# token that are delayed, and thus have no good value to take.
|
411 |
+
if state.offset <= delay:
|
412 |
+
state.cache[:, k, position] = state.initial[:, k, 0]
|
413 |
+
input_ = state.cache[:, :, position : position + 1]
|
414 |
+
|
415 |
+
if self.check:
|
416 |
+
# Check that we are not feeding in any value that is not generated yet.
|
417 |
+
assert not (input_ == lm_model.ungenerated_token_id).any(), (
|
418 |
+
state.offset,
|
419 |
+
input_,
|
420 |
+
)
|
421 |
+
assert (input_[:, lm_model.audio_offset :] <= lm_model.card).all(), input_
|
422 |
+
assert (input_[:, :1] <= lm_model.text_card).all()
|
423 |
+
|
424 |
+
transformer_out, text_logits = state.graphed_main(input_)
|
425 |
+
# Shape of text_logits should be [B, K_text=1, T=1, Card_text]
|
426 |
+
text_token = sample_token(
|
427 |
+
text_logits.float(),
|
428 |
+
self.use_sampling,
|
429 |
+
self.temp_text,
|
430 |
+
self.top_k_text,
|
431 |
+
)
|
432 |
+
assert text_token.dim() == 3, text_token.shape
|
433 |
+
assert text_token.shape[2] == 1
|
434 |
+
assert text_token.shape[1] == 1, "Only one text stream supported."
|
435 |
+
text_token = text_token[:, 0, 0] # shape is [B]
|
436 |
+
audio_tokens = state.graphed_depth(text_token, transformer_out)
|
437 |
+
|
438 |
+
# ensure we don't overwrite prompt tokens, we only write over ungenerated tokens
|
439 |
+
state.offset += 1
|
440 |
+
position = state.offset % CT
|
441 |
+
state.cache[:, 0, position] = text_token
|
442 |
+
state.cache[:, 1 : lm_model.dep_q + 1, position] = audio_tokens
|
443 |
+
|
444 |
+
if state.offset <= self.max_delay:
|
445 |
+
return None
|
446 |
+
B = state.cache.shape[0]
|
447 |
+
gen_delays_cuda = self.delays_cuda[: lm_model.dep_q + 1]
|
448 |
+
index = (
|
449 |
+
((state.offset - self.max_delay + gen_delays_cuda) % CT)
|
450 |
+
.view(1, -1, 1)
|
451 |
+
.expand(B, -1, 1)
|
452 |
+
)
|
453 |
+
out = state.cache.gather(dim=2, index=index)
|
454 |
+
return out
|
455 |
+
|
456 |
+
def depformer_step(
|
457 |
+
self,
|
458 |
+
text_token: torch.Tensor,
|
459 |
+
transformer_out: torch.Tensor,
|
460 |
+
) -> torch.Tensor:
|
461 |
+
(B,) = text_token.shape
|
462 |
+
prev_token = text_token
|
463 |
+
lm_model = self.lm_model
|
464 |
+
depformer_tokens: list[torch.Tensor] = []
|
465 |
+
assert not lm_model.depformer.is_streaming
|
466 |
+
with lm_model.depformer.streaming(B):
|
467 |
+
for cb_index in range(lm_model.dep_q):
|
468 |
+
input_ = prev_token[:, None, None]
|
469 |
+
logits = lm_model.forward_depformer(cb_index, input_, transformer_out)
|
470 |
+
next_token = sample_token(
|
471 |
+
logits.float(),
|
472 |
+
self.use_sampling,
|
473 |
+
self.temp,
|
474 |
+
self.top_k,
|
475 |
+
)
|
476 |
+
assert next_token.shape == (B, 1, 1)
|
477 |
+
next_token = next_token[:, 0, 0] # shape is B
|
478 |
+
depformer_tokens.append(next_token)
|
479 |
+
prev_token = next_token
|
480 |
+
|
481 |
+
assert len(depformer_tokens) == lm_model.dep_q, (
|
482 |
+
len(depformer_tokens),
|
483 |
+
lm_model.dep_q,
|
484 |
+
)
|
485 |
+
out = torch.stack(depformer_tokens, dim=1)
|
486 |
+
assert out.shape == (B, lm_model.dep_q), out.shape
|
487 |
+
return out
|
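Note: `LMGen.step` consumes exactly one timestep of user codes per call and returns `None` until the largest codebook delay has been primed, after which each call yields one delay-aligned frame. A minimal driver sketch, not part of the source: `lm` is assumed to be a pretrained, eval-mode `LMModel` built elsewhere, and `user_frames` a hypothetical `[B, K, T]` tensor of Mimi codes for the user stream.

import torch

lm_gen = LMGen(lm)  # assumption: `lm` is an LMModel loaded elsewhere
B, K, T = user_frames.shape
assert K == lm.num_codebooks - lm.dep_q - 1  # codebooks expected from the user stream
frames = []
with lm_gen.streaming(B):
    for t in range(T):
        out = lm_gen.step(user_frames[:, :, t : t + 1])  # one [B, K, 1] step at a time
        if out is not None:  # None while the delay cache is still being primed
            frames.append(out)  # [B, dep_q + 1, 1]: text token plus generated audio codes

The circular cache (`% CT`) bounds the streaming state to `max_delay + 2` timesteps, so this loop runs in constant memory no matter how long generation continues.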
moshi/models/loaders.py
ADDED
@@ -0,0 +1,159 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Retrieves the pretrained models for Moshi and Mimi."""
from pathlib import Path

from safetensors.torch import load_model
import torch

from moshi.models.compression import MimiModel
from moshi.models.lm import LMModel
from moshi.modules import SEANetEncoder, SEANetDecoder, transformer
from moshi.quantization import SplitResidualVectorQuantizer

SAMPLE_RATE = 24000
FRAME_RATE = 12.5

TEXT_TOKENIZER_NAME = 'tokenizer_spm_32k_3.model'
MOSHI_NAME = 'model.safetensors'
MIMI_NAME = 'tokenizer-e351c8d8-checkpoint125.safetensors'
DEFAULT_REPO = 'kyutai/moshiko-pytorch-bf16'


_seanet_kwargs = {
    "channels": 1,
    "dimension": 512,
    "causal": True,
    "n_filters": 64,
    "n_residual_layers": 1,
    "activation": "ELU",
    "compress": 2,
    "dilation_base": 2,
    "disable_norm_outer_blocks": 0,
    "kernel_size": 7,
    "residual_kernel_size": 3,
    "last_kernel_size": 3,
    # We train using weight_norm but then the weights are pre-processed for inference so
    # that we can use a normal convolution.
    "norm": "none",
    "pad_mode": "constant",
    "ratios": [8, 6, 5, 4],
    "true_skip": True,
}
_quantizer_kwargs = {
    "dimension": 256,
    "n_q": 32,
    "bins": 2048,
    "input_dimension": _seanet_kwargs["dimension"],
    "output_dimension": _seanet_kwargs["dimension"],
}
_transformer_kwargs = {
    "d_model": _seanet_kwargs["dimension"],
    "num_heads": 8,
    "num_layers": 8,
    "causal": True,
    "layer_scale": 0.01,
    "context": 250,
    "conv_layout": True,
    "max_period": 10000,
    "gating": "none",
    "norm": "layer_norm",
    "positional_embedding": "rope",
    "dim_feedforward": 2048,
    "input_dimension": _seanet_kwargs["dimension"],
    "output_dimensions": [_seanet_kwargs["dimension"]],
}

_lm_kwargs = {
    "dim": 4096,
    "text_card": 32000,
    "existing_text_padding_id": 3,
    "n_q": 16,
    "dep_q": 8,
    "card": _quantizer_kwargs["bins"],
    "num_heads": 32,
    "num_layers": 32,
    "hidden_scale": 4.125,
    "causal": True,
    "layer_scale": None,
    "context": 3000,
    "max_period": 10000,
    "gating": "silu",
    "norm": "rms_norm_f32",
    "positional_embedding": "rope",
    "depformer_dim": 1024,
    "depformer_dim_feedforward": int(4.125 * 1024),
    "depformer_num_heads": 16,
    "depformer_num_layers": 6,
    "depformer_causal": True,
    "depformer_layer_scale": None,
    "depformer_multi_linear": True,
    "depformer_context": 8,
    "depformer_max_period": 10000,
    "depformer_gating": "silu",
    "depformer_pos_emb": "none",
    "depformer_weights_per_step": True,
    "delays": [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1],
}


def _is_safetensors(path: Path | str) -> bool:
    return Path(path).suffix in (".safetensors", ".sft", ".sfts")


def get_mimi(filename: str | Path,
             device: torch.device | str = 'cpu') -> MimiModel:
    """Return a pretrained Mimi model."""
    encoder = SEANetEncoder(**_seanet_kwargs)
    decoder = SEANetDecoder(**_seanet_kwargs)
    encoder_transformer = transformer.ProjectedTransformer(
        device=device, **_transformer_kwargs
    )
    decoder_transformer = transformer.ProjectedTransformer(
        device=device, **_transformer_kwargs
    )
    quantizer = SplitResidualVectorQuantizer(
        **_quantizer_kwargs,
    )
    model = MimiModel(
        encoder,
        decoder,
        quantizer,
        channels=1,
        sample_rate=SAMPLE_RATE,
        frame_rate=FRAME_RATE,
        encoder_frame_rate=SAMPLE_RATE / encoder.hop_length,
        causal=True,
        resample_method="conv",
        encoder_transformer=encoder_transformer,
        decoder_transformer=decoder_transformer,
    ).to(device=device)
    model.eval()
    if _is_safetensors(filename):
        load_model(model, filename)
    else:
        pkg = torch.load(filename, "cpu")
        model.load_state_dict(pkg["model"])
    model.set_num_codebooks(8)
    return model


def get_moshi_lm(filename: str | Path,
                 device: torch.device | str = 'cpu') -> LMModel:
    dtype = torch.bfloat16
    model = LMModel(
        device=device,
        dtype=dtype,
        **_lm_kwargs,
    ).to(device=device, dtype=dtype)
    model.eval()
    if _is_safetensors(filename):
        load_model(model, filename)
    else:
        pkg = torch.load(
            filename,
            "cpu",
        )
        model.load_state_dict(pkg["fsdp_best_state"]["model"])
    return model
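Note: the constants above name checkpoint files hosted in the `DEFAULT_REPO` Hugging Face repository. A sketch of how a caller might fetch and load them, assuming the `huggingface_hub` package is available; the download step itself is not part of this file:

from huggingface_hub import hf_hub_download

from moshi.models.loaders import (
    DEFAULT_REPO, MIMI_NAME, MOSHI_NAME, get_mimi, get_moshi_lm)

mimi_path = hf_hub_download(DEFAULT_REPO, MIMI_NAME)
moshi_path = hf_hub_download(DEFAULT_REPO, MOSHI_NAME)
mimi = get_mimi(mimi_path, device='cuda')        # 24 kHz codec at 12.5 Hz frames, 8 codebooks
moshi = get_moshi_lm(moshi_path, device='cuda')  # bfloat16 LM built with the kwargs above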
moshi/modules/__init__.py
ADDED
@@ -0,0 +1,23 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Modules used for building the models."""

# flake8: noqa
from .conv import (
    NormConv1d,
    NormConvTranspose1d,
    StreamingConv1d,
    StreamingConvTranspose1d,
    pad_for_conv1d,
    pad1d,
    unpad1d,
)
from .seanet import SEANetEncoder, SEANetDecoder
from .transformer import StreamingTransformer
moshi/modules/conv.py
ADDED
@@ -0,0 +1,329 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass
import math
import typing as tp
import warnings

import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.utils import weight_norm

from .streaming import RawStreamingConv1d, RawStreamingConvTranspose1d, StreamingModule


CONV_NORMALIZATIONS = frozenset(["none", "weight_norm"])


class TransposedLayerNorm(nn.Module):
    """LayerNorm for [B, C, T] inputs."""

    def __init__(self, **kwargs):
        super().__init__()
        self.layer_norm = nn.LayerNorm(**kwargs)

    def forward(self, x):
        x = x.transpose(1, 2)
        x = self.layer_norm(x)
        return x.transpose(1, 2)


def apply_parametrization_norm(module: nn.Module, norm: str = "none"):
    assert norm in CONV_NORMALIZATIONS
    if norm == "weight_norm":
        return weight_norm(module)
    else:
        # We already checked that `norm` is in CONV_NORMALIZATIONS, so any other
        # choice doesn't need reparametrization.
        return module


def get_extra_padding_for_conv1d(
    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
) -> int:
    """See `pad_for_conv1d`."""
    length = x.shape[-1]
    n_frames = (length - kernel_size + padding_total) / stride + 1
    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
    return ideal_length - length


def pad_for_conv1d(
    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
):
    """Pad for a convolution to make sure that the last window is full.
    Extra padding is added at the end. This is required to ensure that we can rebuild
    an output of the same length, as otherwise, even with padding, some time steps
    might get removed.
    For instance, with total padding = 4, kernel size = 4, stride = 2:
        0 0 1 2 3 4 5 0 0   # (0s are padding)
        1   2   3           # (output frames of a convolution, last 0 is never used)
        0 0 1 2 3 4 5 0     # (output of tr. conv., but pos. 5 is going to get removed as padding)
        1 2 3 4             # once you remove padding, we are missing one time step!
    """
    extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
    return F.pad(x, (0, extra_padding))


def pad1d(
    x: torch.Tensor,
    paddings: tp.Tuple[int, int],
    mode: str = "constant",
    value: float = 0.0,
):
    """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
    If this is the case, we insert extra 0 padding to the right before the reflection happens.
    """
    length = x.shape[-1]
    padding_left, padding_right = paddings
    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
    if mode == "reflect":
        max_pad = max(padding_left, padding_right)
        extra_pad = 0
        if length <= max_pad:
            extra_pad = max_pad - length + 1
            x = F.pad(x, (0, extra_pad))
        padded = F.pad(x, paddings, mode, value)
        end = padded.shape[-1] - extra_pad
        return padded[..., :end]
    else:
        return F.pad(x, paddings, mode, value)


def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
    """Remove padding from x, handling properly zero padding. Only for 1d!"""
    padding_left, padding_right = paddings
    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
    assert (padding_left + padding_right) <= x.shape[-1]
    end = x.shape[-1] - padding_right
    return x[..., padding_left:end]


class NormConv1d(nn.Module):
    """Wrapper around Conv1d and normalization applied to this conv
    to provide a uniform interface across normalization approaches.
    """

    def __init__(
        self,
        *args,
        causal: bool = False,
        norm: str = "none",
        norm_kwargs: tp.Dict[str, tp.Any] = {},
        **kwargs,
    ):
        super().__init__()
        self.conv = apply_parametrization_norm(
            RawStreamingConv1d(*args, **kwargs), norm
        )
        self.norm_type = norm

    def forward(self, x):
        x = self.conv(x)
        return x


class NormConvTranspose1d(nn.Module):
    """Wrapper around ConvTranspose1d and normalization applied to this conv
    to provide a uniform interface across normalization approaches.
    """

    def __init__(
        self,
        *args,
        causal: bool = False,
        norm: str = "none",
        norm_kwargs: tp.Dict[str, tp.Any] = {},
        **kwargs,
    ):
        super().__init__()
        self.convtr = apply_parametrization_norm(
            RawStreamingConvTranspose1d(*args, **kwargs), norm
        )
        self.norm_type = norm

    def forward(self, x):
        x = self.convtr(x)
        return x


@dataclass
class _StreamingConv1dState:
    padding_to_add: int
    original_padding_to_add: int

    def reset(self):
        self.padding_to_add = self.original_padding_to_add


class StreamingConv1d(StreamingModule[_StreamingConv1dState]):
    """Conv1d with some builtin handling of asymmetric or causal padding
    and normalization.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
        causal: bool = False,
        norm: str = "none",
        norm_kwargs: tp.Dict[str, tp.Any] = {},
        pad_mode: str = "reflect",
    ):
        super().__init__()
        # warn user on unusual setup between dilation and stride
        if stride > 1 and dilation > 1:
            warnings.warn(
                "StreamingConv1d has been initialized with stride > 1 and dilation > 1"
                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
            )
        self.conv = NormConv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            dilation=dilation,
            groups=groups,
            bias=bias,
            causal=causal,
            norm=norm,
            norm_kwargs=norm_kwargs,
        )
        self.causal = causal
        self.pad_mode = pad_mode

    @property
    def _stride(self) -> int:
        return self.conv.conv.stride[0]

    @property
    def _kernel_size(self) -> int:
        return self.conv.conv.kernel_size[0]

    @property
    def _effective_kernel_size(self) -> int:
        dilation = self.conv.conv.dilation[0]
        return (
            self._kernel_size - 1
        ) * dilation + 1  # effective kernel size with dilations

    @property
    def _padding_total(self) -> int:
        return self._effective_kernel_size - self._stride

    def _init_streaming_state(self, batch_size: int) -> _StreamingConv1dState:
        assert self.causal, "streaming is only supported for causal convs"
        return _StreamingConv1dState(self._padding_total, self._padding_total)

    def forward(self, x):
        B, C, T = x.shape
        padding_total = self._padding_total
        extra_padding = get_extra_padding_for_conv1d(
            x, self._effective_kernel_size, self._stride, padding_total
        )
        state = self._streaming_state
        if state is None:
            if self.causal:
                # Left padding for causal
                x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)
            else:
                # Asymmetric padding required for odd strides
                padding_right = padding_total // 2
                padding_left = padding_total - padding_right
                x = pad1d(
                    x, (padding_left, padding_right + extra_padding), mode=self.pad_mode
                )
        else:
            if state.padding_to_add > 0 and x.shape[-1] > 0:
                x = pad1d(x, (state.padding_to_add, 0), mode=self.pad_mode)
                state.padding_to_add = 0
        return self.conv(x)


@dataclass
class _StreamingConvTr1dState:
    pass

    def reset(self):
        pass


class StreamingConvTranspose1d(StreamingModule[_StreamingConvTr1dState]):
    """ConvTranspose1d with some builtin handling of asymmetric or causal padding
    and normalization.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        groups: int = 1,
        bias: bool = True,
        causal: bool = False,
        norm: str = "none",
        trim_right_ratio: float = 1.0,
        norm_kwargs: tp.Dict[str, tp.Any] = {},
    ):
        super().__init__()
        self.convtr = NormConvTranspose1d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            groups=groups,
            bias=bias,
            causal=causal,
            norm=norm,
            norm_kwargs=norm_kwargs,
        )
        self.causal = causal
        self.trim_right_ratio = trim_right_ratio
        assert (
            self.causal or self.trim_right_ratio == 1.0
        ), "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
        assert self.trim_right_ratio >= 0.0 and self.trim_right_ratio <= 1.0

    def _init_streaming_state(self, batch_size: int) -> _StreamingConvTr1dState:
        assert self.causal, "streaming is only supported for causal convtrs"
        return _StreamingConvTr1dState()

    def forward(self, x):
        kernel_size = self.convtr.convtr.kernel_size[0]
        stride = self.convtr.convtr.stride[0]
        padding_total = kernel_size - stride

        y = self.convtr(x)

        if not self.is_streaming:
            # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
            # removed at the very end, when keeping only the right length for the output,
            # as removing it here would require also passing the length at the matching layer
            # in the encoder.
            if self.causal:
                # Trim the padding on the right according to the specified ratio
                # if trim_right_ratio = 1.0, trim everything from right
                padding_right = math.ceil(padding_total * self.trim_right_ratio)
                padding_left = padding_total - padding_right
                y = unpad1d(y, (padding_left, padding_right))
            else:
                # Asymmetric padding required for odd strides
                padding_right = padding_total // 2
                padding_left = padding_total - padding_right
                y = unpad1d(y, (padding_left, padding_right))
        return y
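Note: to make the `pad_for_conv1d` docstring concrete, here is the arithmetic of `get_extra_padding_for_conv1d` run on the exact example it describes (length 5, kernel 4, stride 2, total padding 4); illustrative only:

import torch
from moshi.modules.conv import get_extra_padding_for_conv1d, pad_for_conv1d

x = torch.randn(1, 1, 5)
# n_frames = (5 - 4 + 4) / 2 + 1 = 3.5, so ceil gives 4 full frames;
# ideal_length = (4 - 1) * 2 + (4 - 4) = 6, hence one extra sample of end padding.
print(get_extra_padding_for_conv1d(x, kernel_size=4, stride=2, padding_total=4))  # 1
print(pad_for_conv1d(x, 4, 2, 4).shape)  # torch.Size([1, 1, 6])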
moshi/modules/gating.py
ADDED
@@ -0,0 +1,82 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
from torch import nn
from torch.nn import functional as F

from ..utils.compile import torch_compile_lazy


@torch_compile_lazy
def gating_forward_kernel(
    weight_in: torch.Tensor, weight_out: torch.Tensor, activation, x: torch.Tensor
):
    x = F.linear(x, weight_in)
    B, T, _ = x.shape
    x = x.view(B, T, 2, -1)
    x = activation(x[..., 0, :]) * x[..., 1, :]
    x = F.linear(x, weight_out)
    return x


class ActivationGating(nn.Module):
    """
    Gating FFN layer, using the given activation.
    Args:
        dim (int): dimension of the input and output of the transformer.
        activation (any callable Tensor to Tensor): activation function to use.
        **factory_kwargs: other kwargs passed to the linear layer, in particular device and dtype.
    """

    _fsdp_final = True

    def __init__(self, dim: int, dim_feedforward: int, activation, **factory_kwargs):
        super().__init__()
        # We should have 8 d^2 param, instead we will have
        # 2 * h * d + h * d = 3 h * d = 8 d^2
        # so h = 8 d / 3 but following Hervé's advice we use 21 / 8 as an approx.
        if dim_feedforward == 4 * dim:
            hidden = (21 * dim) // 8
        else:
            hidden = (2 * dim_feedforward) // 3
        self.linear_in = nn.Linear(dim, 2 * hidden, bias=False, **factory_kwargs)
        self.linear_out = nn.Linear(hidden, dim, bias=False, **factory_kwargs)
        self.activation = activation

    def forward(self, x: torch.Tensor):
        return gating_forward_kernel(
            self.linear_in.weight, self.linear_out.weight, self.activation, x
        )


def _get_activation(name: str):
    if name in ["sigmoid", "tanh", "relu"]:
        return getattr(torch, name)
    elif name in ["leaky_relu", "elu", "gelu", "silu", "mish", "softsign"]:
        return getattr(torch.nn.functional, name)
    elif name == "identity":
        return torch.nn.Identity()
    else:
        raise ValueError(f"Unknown activation {name}")


def _make_gating(
    name: str, dim: int, dim_feedforward: int, **factory_kwargs
) -> nn.Module:
    return ActivationGating(
        dim, dim_feedforward, _get_activation(name), **factory_kwargs
    )


def make_gating(
    name: str, dim: int, dim_feedforward: int, **factory_kwargs
) -> nn.Module:
    gating = _make_gating(name, dim, dim_feedforward, **factory_kwargs)
    max_params = 2 * dim * dim_feedforward
    params = sum(p.numel() for p in gating.parameters())
    assert (
        params <= max_params
    ), f"{name} gating has {params} params, max is {max_params}"
    return gating
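Note: the hidden-size comment in `ActivationGating` can be checked numerically. A plain FFN with `dim_feedforward = 4 * dim` costs `2 * dim * 4 * dim = 8 d^2` parameters; the gated variant costs `3 * hidden * dim`, and `hidden = (21 * dim) // 8` gives about `7.875 d^2`, just under the budget that `make_gating` asserts. A small check, with `dim = 512` chosen purely for illustration:

from moshi.modules.gating import make_gating

d = 512
gate = make_gating("silu", dim=d, dim_feedforward=4 * d)
params = sum(p.numel() for p in gate.parameters())
print(params)           # 2064384 == 3 * ((21 * 512) // 8) * 512
print(2 * d * (4 * d))  # 2097152, the 8 d^2 budget of an ungated FFN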
moshi/modules/resample.py
ADDED
@@ -0,0 +1,119 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import typing as tp

from einops import rearrange
import torch
from torch import nn

from .conv import StreamingConv1d, StreamingConvTranspose1d


class ConvDownsample1d(nn.Module):
    """
    Downsampling by some integer amount `stride` using convolutions
    with a kernel size of twice the stride.
    If `causal` is True, the output uses a causal convolution.
    """

    def __init__(
        self,
        stride: int,
        dimension: tp.Optional[int] = None,
        causal: bool = False,
        learnt: bool = False,
        channel_wise: bool = False,
    ):
        super().__init__()
        self.learnt = learnt
        self.channel_wise = channel_wise
        groups = 1
        if learnt:
            assert dimension is not None, "Dimension required for learnt convolutions."
            in_channels = dimension
            out_channels = dimension
            if channel_wise:
                groups = dimension
        else:
            in_channels = 1
            out_channels = 1

        self.conv = StreamingConv1d(
            in_channels,
            out_channels,
            kernel_size=2 * stride,
            stride=stride,
            causal=causal,
            groups=groups,
            bias=False,
            pad_mode="replicate",
        )
        if not learnt:
            actual_conv = self.conv.conv.conv
            actual_conv.weight.requires_grad_(False)
            actual_conv.weight.data.fill_(1.0 / (2 * stride))

    def forward(self, x: torch.Tensor):
        batch_size = len(x)
        if not self.learnt:
            x = rearrange(x, "b c t -> (b c) () t")
        y = self.conv(x)
        if not self.learnt:
            y = rearrange(y, "(b c) () t -> b c t", b=batch_size)
        return y


class ConvTrUpsample1d(nn.Module):
    """
    Upsample by some integer amount `stride` using transposed convolutions.
    """

    def __init__(
        self,
        stride: int,
        dimension: tp.Optional[int] = None,
        causal: bool = False,
        learnt: bool = False,
        channel_wise: bool = False,
    ):
        super().__init__()
        self.learnt = learnt
        self.channel_wise = channel_wise
        groups = 1
        if learnt:
            assert dimension is not None, "Dimension required for learnt convolutions."
            in_channels = dimension
            out_channels = dimension
            if channel_wise:
                groups = dimension
        else:
            in_channels = 1
            out_channels = 1

        self.convtr = StreamingConvTranspose1d(
            in_channels,
            out_channels,
            kernel_size=2 * stride,
            stride=stride,
            causal=causal,
            groups=groups,
            bias=False,
        )
        if not learnt:
            actual_convtr = self.convtr.convtr.convtr
            actual_convtr.weight.requires_grad_(False)
            actual_convtr.weight.data.fill_(1.0)

    def forward(self, x: torch.Tensor):
        batch_size = len(x)
        if not self.learnt:
            x = rearrange(x, "b c t -> (b c) () t")
        y = self.convtr(x)
        if not self.learnt:
            x_for_normalization = torch.ones_like(x[:1])
            normalization = self.convtr(x_for_normalization)
            y = y / normalization
            y = rearrange(y, "(b c) () t -> b c t", b=batch_size)
        return y
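Note: with `learnt=False`, `ConvDownsample1d` freezes its kernel at `1 / (2 * stride)`, i.e. it acts as a fixed box filter averaging each window. A quick sketch of that behavior on a constant signal (illustrative; the exact values rely on the replicate padding of the causal path):

import torch
from moshi.modules.resample import ConvDownsample1d

down = ConvDownsample1d(stride=2, causal=True)  # non-learnt: fixed averaging kernel
x = torch.ones(1, 3, 8)  # any channel count works; channels are folded into the batch
y = down(x)
print(y.shape)  # torch.Size([1, 3, 4]): time divided by the stride
print(y)        # all ones: averaging a constant signal is the identity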
moshi/modules/rope.py
ADDED
@@ -0,0 +1,90 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from torch import nn
import math
import torch
from ..utils.compile import torch_compile_lazy


@torch_compile_lazy
def apply_rope(
    q: torch.Tensor,
    k: torch.Tensor,
    offset: torch.Tensor,
    max_period: float = 10_000,
    time_before_heads: bool = False,
):
    """
    Args:
        q (torch.Tensor): queries, shape `[B, T, H, D]`.
        k (torch.Tensor): keys, shape `[B, T, H, D]`.
        offset (torch.Tensor): current offset, e.g. when streaming.
        max_period (float): maximum period for the cos and sin.
        time_before_heads (bool): if True, expected shape is [B, T, H, D], else [B, H, T, D].
    """

    if time_before_heads:
        B, T, H, D = q.shape
    else:
        B, H, T, D = q.shape
    assert k.shape == q.shape
    assert D > 0
    assert D % 2 == 0
    assert max_period > 0

    ds = torch.arange(D // 2, device=q.device, dtype=torch.float32)
    freqs = torch.exp(ds * (-math.log(max_period) * 2 / D))
    ts = offset.float() + torch.arange(T, device=q.device, dtype=torch.float32)
    if time_before_heads:
        ts = ts.view(-1, 1, 1)
    else:
        ts = ts.view(1, -1, 1)

    dims = q.shape[:-1]
    q = q.view(*dims, D // 2, 2)
    k = k.view(*dims, D // 2, 2)

    # convention is `r` suffix is real part, `i` is imaginary.
    qr = q[..., 0].float()
    qi = q[..., 1].float()

    kr = k[..., 0].float()
    ki = k[..., 1].float()

    rotr = torch.cos(freqs * ts)
    roti = torch.sin(freqs * ts)
    qor = qr * rotr - qi * roti
    qoi = qr * roti + qi * rotr

    kor = kr * rotr - ki * roti
    koi = kr * roti + ki * rotr

    dtype = q.dtype
    qo = torch.stack([qor.to(dtype), qoi.to(dtype)], dim=-1)
    ko = torch.stack([kor.to(dtype), koi.to(dtype)], dim=-1)

    return qo.view(*dims, D), ko.view(*dims, D)


class RotaryEmbedding(nn.Module):
    """Rotary positional embedding (RoPE) from [Su et al 2022](https://arxiv.org/abs/2104.09864).

    Args:
        max_period (float): Maximum period of the rotation frequencies.
    """

    def __init__(self, max_period: float = 10000.0):
        super().__init__()
        self.max_period = max_period

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        offset: torch.Tensor,
        time_before_heads: bool = False,
    ):
        """Apply rope rotation to query and key tensors."""
        return apply_rope(q, k, offset, self.max_period, time_before_heads)
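Note: `apply_rope` rotates each `(even, odd)` channel pair as a complex number by the angle `freqs * t`, so it should leave per-position vector norms unchanged. A small sanity sketch (not from the source), with shapes following the default `[B, H, T, D]` convention of the docstring:

import torch
from moshi.modules.rope import RotaryEmbedding

rope = RotaryEmbedding(max_period=10000.0)
q = torch.randn(2, 8, 16, 64)
k = torch.randn(2, 8, 16, 64)
offset = torch.zeros(1, dtype=torch.long)  # start of the stream
q_rot, k_rot = rope(q, k, offset)
print(q_rot.shape == q.shape)                                         # True
print(torch.allclose(q_rot.norm(dim=-1), q.norm(dim=-1), atol=1e-4))  # True: rotation preserves norms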
moshi/modules/seanet.py
ADDED
@@ -0,0 +1,395 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import typing as tp

import numpy as np
import torch.nn as nn

from .conv import StreamingConv1d, StreamingConvTranspose1d
from .streaming import StreamingContainer, StreamingAdd
from ..utils.compile import torch_compile_lazy


class SEANetResnetBlock(StreamingContainer):
    """Residual block from SEANet model.

    Args:
        dim (int): Dimension of the input/output.
        kernel_sizes (list): List of kernel sizes for the convolutions.
        dilations (list): List of dilations for the convolutions.
        activation (str): Activation function.
        activation_params (dict): Parameters to provide to the activation function.
        norm (str): Normalization method.
        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
        causal (bool): Whether to use fully causal convolution.
        pad_mode (str): Padding mode for the convolutions.
        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
        true_skip (bool): Whether to use true skip connection or a simple
            (streamable) convolution as the skip connection.
    """

    def __init__(
        self,
        dim: int,
        kernel_sizes: tp.List[int] = [3, 1],
        dilations: tp.List[int] = [1, 1],
        activation: str = "ELU",
        activation_params: dict = {"alpha": 1.0},
        norm: str = "none",
        norm_params: tp.Dict[str, tp.Any] = {},
        causal: bool = False,
        pad_mode: str = "reflect",
        compress: int = 2,
        true_skip: bool = True,
    ):
        super().__init__()
        assert len(kernel_sizes) == len(
            dilations
        ), "Number of kernel sizes should match number of dilations"
        act = getattr(nn, activation)
        hidden = dim // compress
        block = []
        for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
            in_chs = dim if i == 0 else hidden
            out_chs = dim if i == len(kernel_sizes) - 1 else hidden
            block += [
                act(**activation_params),
                StreamingConv1d(
                    in_chs,
                    out_chs,
                    kernel_size=kernel_size,
                    dilation=dilation,
                    norm=norm,
                    norm_kwargs=norm_params,
                    causal=causal,
                    pad_mode=pad_mode,
                ),
            ]
        self.block = nn.Sequential(*block)
        self.add = StreamingAdd()
        self.shortcut: nn.Module
        if true_skip:
            self.shortcut = nn.Identity()
        else:
            self.shortcut = StreamingConv1d(
                dim,
                dim,
                kernel_size=1,
                norm=norm,
                norm_kwargs=norm_params,
                causal=causal,
                pad_mode=pad_mode,
            )

    def forward(self, x):
        u, v = self.shortcut(x), self.block(x)
        return self.add(u, v)


class SEANetEncoder(StreamingContainer):
    """SEANet encoder.

    Args:
        channels (int): Audio channels.
        dimension (int): Intermediate representation dimension.
        n_filters (int): Base width for the model.
        n_residual_layers (int): nb of residual layers.
        ratios (Sequence[int]): kernel size and stride ratios. The encoder uses downsampling ratios instead of
            upsampling ratios, hence it will use the ratios in the reverse order to the ones specified here
            that must match the decoder order. We use the decoder order as some models may only employ the decoder.
        activation (str): Activation function.
        activation_params (dict): Parameters to provide to the activation function.
        norm (str): Normalization method.
        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
        kernel_size (int): Kernel size for the initial convolution.
        last_kernel_size (int): Kernel size for the last convolution.
        residual_kernel_size (int): Kernel size for the residual layers.
        dilation_base (int): How much to increase the dilation with each layer.
        causal (bool): Whether to use fully causal convolution.
        pad_mode (str): Padding mode for the convolutions.
        true_skip (bool): Whether to use true skip connection or a simple
            (streamable) convolution as the skip connection in the residual network blocks.
        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
        disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
            For the encoder, it corresponds to the N first blocks.
        mask_fn (nn.Module): Optional mask function to apply after convolution layers.
        mask_position (int): Position of the mask function, with mask_position == 0 for the first convolution layer,
            mask_position == 1 for the first conv block, etc.
    """

    def __init__(
        self,
        channels: int = 1,
        dimension: int = 128,
        n_filters: int = 32,
        n_residual_layers: int = 3,
        ratios: tp.List[int] = [8, 5, 4, 2],
        activation: str = "ELU",
        activation_params: dict = {"alpha": 1.0},
        norm: str = "none",
        norm_params: tp.Dict[str, tp.Any] = {},
        kernel_size: int = 7,
        last_kernel_size: int = 7,
        residual_kernel_size: int = 3,
        dilation_base: int = 2,
        causal: bool = False,
        pad_mode: str = "reflect",
        true_skip: bool = True,
        compress: int = 2,
        disable_norm_outer_blocks: int = 0,
        mask_fn: tp.Optional[nn.Module] = None,
        mask_position: tp.Optional[int] = None,
    ):
        super().__init__()
        self.channels = channels
        self.dimension = dimension
        self.n_filters = n_filters
        self.ratios = list(reversed(ratios))
        del ratios
        self.n_residual_layers = n_residual_layers
        self.hop_length = int(np.prod(self.ratios))
        self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
        self.disable_norm_outer_blocks = disable_norm_outer_blocks
        assert (
            self.disable_norm_outer_blocks >= 0
            and self.disable_norm_outer_blocks <= self.n_blocks
        ), (
            "Number of blocks for which to disable norm is invalid."
            " It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."
        )

        act = getattr(nn, activation)
        mult = 1
        model: tp.List[nn.Module] = [
            StreamingConv1d(
                channels,
                mult * n_filters,
                kernel_size,
                norm="none" if self.disable_norm_outer_blocks >= 1 else norm,
                norm_kwargs=norm_params,
                causal=causal,
                pad_mode=pad_mode,
            )
        ]
        if mask_fn is not None and mask_position == 0:
            model += [mask_fn]
        # Downsample to raw audio scale
        for i, ratio in enumerate(self.ratios):
            block_norm = "none" if self.disable_norm_outer_blocks >= i + 2 else norm
            # Add residual layers
            for j in range(n_residual_layers):
                model += [
                    SEANetResnetBlock(
                        mult * n_filters,
                        kernel_sizes=[residual_kernel_size, 1],
                        dilations=[dilation_base**j, 1],
                        norm=block_norm,
                        norm_params=norm_params,
                        activation=activation,
                        activation_params=activation_params,
                        causal=causal,
                        pad_mode=pad_mode,
                        compress=compress,
                        true_skip=true_skip,
                    )
                ]

            # Add downsampling layers
            model += [
                act(**activation_params),
                StreamingConv1d(
                    mult * n_filters,
                    mult * n_filters * 2,
                    kernel_size=ratio * 2,
                    stride=ratio,
                    norm=block_norm,
                    norm_kwargs=norm_params,
                    causal=causal,
                    pad_mode=pad_mode,
                ),
            ]
            mult *= 2
            if mask_fn is not None and mask_position == i + 1:
                model += [mask_fn]

        model += [
            act(**activation_params),
            StreamingConv1d(
                mult * n_filters,
                dimension,
                last_kernel_size,
                norm=(
                    "none" if self.disable_norm_outer_blocks == self.n_blocks else norm
                ),
                norm_kwargs=norm_params,
                causal=causal,
                pad_mode=pad_mode,
            ),
        ]

        self.model = nn.Sequential(*model)

    @torch_compile_lazy
    def forward(self, x):
        return self.model(x)


class SEANetDecoder(StreamingContainer):
    """SEANet decoder.

    Args:
        channels (int): Audio channels.
        dimension (int): Intermediate representation dimension.
        n_filters (int): Base width for the model.
        n_residual_layers (int): nb of residual layers.
        ratios (Sequence[int]): kernel size and stride ratios.
        activation (str): Activation function.
        activation_params (dict): Parameters to provide to the activation function.
        final_activation (str): Final activation function after all convolutions.
        final_activation_params (dict): Parameters to provide to the activation function.
        norm (str): Normalization method.
        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
        kernel_size (int): Kernel size for the initial convolution.
        last_kernel_size (int): Kernel size for the last convolution.
        residual_kernel_size (int): Kernel size for the residual layers.
        dilation_base (int): How much to increase the dilation with each layer.
        causal (bool): Whether to use fully causal convolution.
        pad_mode (str): Padding mode for the convolutions.
        true_skip (bool): Whether to use true skip connection or a simple
            (streamable) convolution as the skip connection in the residual network blocks.
        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
        disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
            For the decoder, it corresponds to the N last blocks.
        trim_right_ratio (float): Ratio for trimming at the right of the transposed convolution under the causal setup.
            If equal to 1.0, it means that all the trimming is done at the right.
    """

    def __init__(
        self,
        channels: int = 1,
        dimension: int = 128,
        n_filters: int = 32,
        n_residual_layers: int = 3,
        ratios: tp.List[int] = [8, 5, 4, 2],
        activation: str = "ELU",
        activation_params: dict = {"alpha": 1.0},
        final_activation: tp.Optional[str] = None,
        final_activation_params: tp.Optional[dict] = None,
        norm: str = "none",
        norm_params: tp.Dict[str, tp.Any] = {},
        kernel_size: int = 7,
        last_kernel_size: int = 7,
        residual_kernel_size: int = 3,
        dilation_base: int = 2,
        causal: bool = False,
        pad_mode: str = "reflect",
        true_skip: bool = True,
        compress: int = 2,
        disable_norm_outer_blocks: int = 0,
        trim_right_ratio: float = 1.0,
    ):
        super().__init__()
        self.dimension = dimension
        self.channels = channels
        self.n_filters = n_filters
        self.ratios = ratios
        del ratios
        self.n_residual_layers = n_residual_layers
        self.hop_length = int(np.prod(self.ratios))
        self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
        self.disable_norm_outer_blocks = disable_norm_outer_blocks
        assert (
            self.disable_norm_outer_blocks >= 0
            and self.disable_norm_outer_blocks <= self.n_blocks
        ), (
            "Number of blocks for which to disable norm is invalid."
            " It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."
        )

        act = getattr(nn, activation)
        mult = int(2 ** len(self.ratios))
        model: tp.List[nn.Module] = [
            StreamingConv1d(
                dimension,
                mult * n_filters,
                kernel_size,
                norm=(
                    "none" if self.disable_norm_outer_blocks == self.n_blocks else norm
                ),
                norm_kwargs=norm_params,
                causal=causal,
                pad_mode=pad_mode,
            )
        ]

        # Upsample to raw audio scale
        for i, ratio in enumerate(self.ratios):
            block_norm = (
                "none"
                if self.disable_norm_outer_blocks >= self.n_blocks - (i + 1)
                else norm
            )
            # Add upsampling layers
            model += [
                act(**activation_params),
                StreamingConvTranspose1d(
                    mult * n_filters,
                    mult * n_filters // 2,
                    kernel_size=ratio * 2,
                    stride=ratio,
                    norm=block_norm,
                    norm_kwargs=norm_params,
                    causal=causal,
                    trim_right_ratio=trim_right_ratio,
                ),
            ]
            # Add residual layers
            for j in range(n_residual_layers):
                model += [
                    SEANetResnetBlock(
                        mult * n_filters // 2,
                        kernel_sizes=[residual_kernel_size, 1],
                        dilations=[dilation_base**j, 1],
                        activation=activation,
                        activation_params=activation_params,
                        norm=block_norm,
                        norm_params=norm_params,
                        causal=causal,
                        pad_mode=pad_mode,
                        compress=compress,
                        true_skip=true_skip,
                    )
                ]

            mult //= 2

        # Add final layers
        model += [
            act(**activation_params),
            StreamingConv1d(
                n_filters,
                channels,
                last_kernel_size,
                norm="none" if self.disable_norm_outer_blocks >= 1 else norm,
                norm_kwargs=norm_params,
                causal=causal,
                pad_mode=pad_mode,
            ),
        ]
        # Add optional final activation to decoder (eg. tanh)
        if final_activation is not None:
            final_act = getattr(nn, final_activation)
            final_activation_params = final_activation_params or {}
            model += [final_act(**final_activation_params)]
        self.model = nn.Sequential(*model)

    @torch_compile_lazy
    def forward(self, z):
        y = self.model(z)
        return y
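Note: connecting this back to loaders.py: with `ratios = [8, 6, 5, 4]` the encoder hop length is `8 * 6 * 5 * 4 = 960` samples, so at 24 kHz the encoder emits `24000 / 960 = 25` frames per second (Mimi then resamples these to `FRAME_RATE = 12.5` Hz). A sketch using only a subset of the kwargs, for illustration:

import torch
from moshi.modules.seanet import SEANetEncoder

enc = SEANetEncoder(channels=1, dimension=512, n_filters=64, n_residual_layers=1,
                    ratios=[8, 6, 5, 4], causal=True, pad_mode="constant")
print(enc.hop_length)         # 960
x = torch.randn(1, 1, 24000)  # one second of 24 kHz audio
print(enc(x).shape)           # torch.Size([1, 512, 25])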
moshi/modules/streaming.py
ADDED
@@ -0,0 +1,363 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Streaming module API that should be implemented by all streaming components.
"""

import abc
from contextlib import contextmanager
from dataclasses import dataclass
import itertools
import math
import typing as tp
from torch import nn
import torch


class Resetable(tp.Protocol):
    def reset(self) -> None:
        pass


State = tp.TypeVar("State", bound=Resetable)


class StreamingModule(abc.ABC, nn.Module, tp.Generic[State]):
    """Common API for streaming components.

    Each streaming component has a streaming state, which is just a dict[str, Tensor].
    By convention, the first dim of each tensor must be the batch size.
    Don't use dots in the key names, as this would clash with submodules
    (like in state_dict).

    If `self._is_streaming` is True, the component should use and remember
    the proper state inside `self._streaming_state`.

    To set a streaming component in streaming state, use

        with module.streaming():
            ...

    This will automatically reset the streaming state when exiting the context manager.
    This also automatically propagates to all streaming child modules.

    Some modules might also implement the `StreamingModule.flush` method, although
    this one is trickier, as all parent modules must be StreamingModule and implement
    it as well for it to work properly. See `StreamingSequential` after.
    """

    def __init__(self) -> None:
        super().__init__()
        self._streaming_state: State | None = None
        self._streaming_propagate: bool = True

    @property
    def is_streaming(self):
        return self._streaming_state is not None

    def set_streaming_propagate(self, streaming_propagate: bool):
        self._streaming_propagate = streaming_propagate

    def _apply_named_streaming(self, fn: tp.Any):
        def _handle_module(prefix: str, module: nn.Module, recurse: bool = True):
            propagate = True
            if isinstance(module, StreamingModule):
                if module._streaming_propagate:
                    fn(prefix, module)
                else:
                    propagate = False
            if not recurse:
                return
            if propagate:
                for name, child in module.named_children():
                    _handle_module(prefix + "." + name, child)

        _handle_module("", self, recurse=False)
        for name, child in self.named_children():
            _handle_module(name, child)

    def _start_streaming(self, batch_size: int):
        def _start_streaming(name: str, module: StreamingModule):
            module._streaming_state = module._init_streaming_state(batch_size)

        self._apply_named_streaming(_start_streaming)

    def _stop_streaming(self):
        def _stop_streaming(name: str, module: StreamingModule):
            module._streaming_state = None

        self._apply_named_streaming(_stop_streaming)

    @abc.abstractmethod
    def _init_streaming_state(self, batch_size: int) -> State: ...

    def streaming_forever(self, batch_size: int):
        self._start_streaming(batch_size)

    @contextmanager
    def streaming(self, batch_size: int):
        """Context manager to enter streaming mode. Reset streaming state on exit."""

        self._start_streaming(batch_size)
        try:
            yield
        finally:
            self._stop_streaming()

    def reset_streaming(self):
        """Reset the streaming state."""

        def _reset(name: str, module: StreamingModule):
            state = module._streaming_state
            if state is None:
                raise ValueError(
                    f"Trying to reset streaming, but {name} wasn't streaming."
                )
            state.reset()

        self._apply_named_streaming(_reset)

    def get_streaming_state(self) -> dict[str, tp.Any]:
        """Return the complete streaming state, including that of sub-modules."""
        state: dict[str, tp.Any] = {}

        def _add(name: str, module: StreamingModule):
            state[name] = module._streaming_state

        self._apply_named_streaming(_add)
        return state

    def set_streaming_state(self, state: dict[str, tp.Any]):
        """Set the streaming state, including that of sub-modules."""
        state = dict(state)

        def _set(name: str, module: StreamingModule):
            if name in state:
                module._streaming_state = state[name]
                state.pop(name)
            else:
                raise RuntimeError(f"Expected to find a streaming state for {name}.")

        self._apply_named_streaming(_set)
        if state:
            raise RuntimeError(f"Some states were not consumed: {list(state.keys())}")


@dataclass
class _NullState:
    pass

    def reset(self) -> None:
        pass


class StreamingContainer(StreamingModule[_NullState]):
    def _init_streaming_state(self, batch_size: int) -> _NullState:
        return _NullState()


@dataclass
class _StreamingAddState:
    previous_x: torch.Tensor | None = None
    previous_y: torch.Tensor | None = None

    def reset(self):
        self.previous_x = None
        self.previous_y = None


class StreamingAdd(StreamingModule[_StreamingAddState]):
    def _init_streaming_state(self, batch_size: int) -> _StreamingAddState:
        return _StreamingAddState()

    def forward(self, x: torch.Tensor, y: torch.Tensor):
        if self._streaming_state is None:
            return x + y
        else:
            prev_x = self._streaming_state.previous_x
            prev_y = self._streaming_state.previous_y
            if prev_x is not None:
                x = torch.cat([prev_x, x], dim=-1)
            if prev_y is not None:
                y = torch.cat([prev_y, y], dim=-1)
            m_l = min(x.shape[-1], y.shape[-1])
            self._streaming_state.previous_x = x[..., m_l:]
            self._streaming_state.previous_y = y[..., m_l:]
            return x[..., :m_l] + y[..., :m_l]


@dataclass
class _StreamingConvState:
    previous: torch.Tensor | None = None

    def reset(self):
        self.previous = None


class RawStreamingConv1d(nn.Conv1d, StreamingModule[_StreamingConvState]):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.padding[0] == 0, "Padding should be handled outside."
        assert (
            self.stride[0] <= self.kernel_size[0]
        ), "stride must be less than or equal to kernel_size."

    def _init_streaming_state(self, batch_size: int) -> _StreamingConvState:
        return _StreamingConvState()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        stride = self.stride[0]
        # Effective kernel size accounting for dilation.
        kernel = (self.kernel_size[0] - 1) * self.dilation[0] + 1
        if self._streaming_state is None:
            return super().forward(input)
        else:
            # Due to the potential overlap, we might have some cache of the previous time steps.
            previous = self._streaming_state.previous
            if previous is not None:
                input = torch.cat([previous, input], dim=-1)
            B, C, T = input.shape
            # We now compute the number of full convolution frames, i.e. the frames
            # that are ready to be computed.
            num_frames = max(0, int(math.floor((T - kernel) / stride) + 1))
            offset = num_frames * stride
            # We will compute `num_frames` outputs, and we are advancing by `stride`
            # for each frame, so we know the data before `stride * num_frames`
            # will never be used again.
            self._streaming_state.previous = input[..., offset:]
            if num_frames > 0:
                input_length = (num_frames - 1) * stride + kernel
                out = super().forward(input[..., :input_length])
            else:
                # Not enough data at this point to output any new frames.
                out = torch.empty(
                    B, self.out_channels, 0, device=input.device, dtype=input.dtype
                )
            return out


@dataclass
class _StreamingConvTrState:
    partial: torch.Tensor | None = None

    def reset(self):
        self.partial = None


class RawStreamingConvTranspose1d(
    nn.ConvTranspose1d, StreamingModule[_StreamingConvTrState]
):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.padding[0] == 0, "Padding should be handled outside."
        assert self.dilation[0] == 1, "No dilation for now"
        assert (
            self.stride[0] <= self.kernel_size[0]
        ), "stride must be less than or equal to kernel_size."
        assert self.output_padding[0] == 0, "Output padding not supported."

    def _init_streaming_state(self, batch_size: int) -> _StreamingConvTrState:
        return _StreamingConvTrState()

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore
        B, C, T = x.shape
        stride = self.stride[0]
        kernel = self.kernel_size[0]
        if self._streaming_state is None:
            return super().forward(x)
        else:
            if T == 0:
                return torch.empty(
                    B, self.out_channels, 0, device=x.device, dtype=x.dtype
                )
            out = super().forward(x)
            OT = out.shape[-1]
            partial = self._streaming_state.partial
            if partial is not None:
                # Due to the potential overlap, the rightmost output of the conv transpose is not
                # ready to be output, as it will receive contributions from the next input frames.
                # Here we recover those `partial` output frames. We know that the first time step
                # of the `partial` tensor corresponds to the first time step of `out` as anything
                # coming before the first time step of `out` would have been already flushed.
                PT = partial.shape[-1]
                if self.bias is not None:
                    out[..., :PT] += partial - self.bias[:, None]
                else:
                    out[..., :PT] += partial
            # The input is T, the output is S * (T - 1) + K.
            # The offset of the left of the next frame will be S * T,
            # so everything between 0 and S * T is ready to be output, and we need
            # to keep in the internal state everything beyond that, i.e. S * (T - 1) + K - S * T = K - S.
            invalid_steps = kernel - stride
            partial = out[..., OT - invalid_steps :]
            out = out[..., : OT - invalid_steps]
            self._streaming_state.partial = partial
            return out


def test():
    torch.manual_seed(1234)
    device = "cpu"
    if torch.cuda.is_available():
        # Avoid the cuda optimizations that would take place on single precision
        # floats for convolutions.
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cudnn.allow_tf32 = False
        device = "cuda:0"

    kernel_sizes = [1, 3, 4, 8, 15, 16]
    strides = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    chin = 6
    chout = 12

    for kernel, stride in itertools.product(kernel_sizes, strides):
        if stride > kernel:
            continue
        conv = RawStreamingConv1d(chin, chout, kernel, stride).to(device)
        convtr = RawStreamingConvTranspose1d(chout, chin, kernel, stride).to(device)

        for length in [4, 8, 32, 54, 65, 128, 1043]:
            print(f"ksize {kernel} strides {stride} len {length}")
            if length < kernel:
                continue
            batch_size = 3
            x = torch.randn(batch_size, chin, length).to(device)
            y = conv(x)
            z = convtr(y)
            for chunk_size in [1, 3, 5, 8]:
                ys = []
                zs = []
                with conv.streaming(batch_size), convtr.streaming(batch_size):
                    for offset in range(0, length, chunk_size):
                        chunk = x[..., offset : offset + chunk_size]
                        ys.append(conv(chunk))
                        zs.append(convtr(ys[-1]))
                y_stream = torch.cat(ys, dim=-1)
                z_stream = torch.cat(zs, dim=-1)
                y = y[..., : y_stream.shape[-1]]
                z = z[..., : z_stream.shape[-1]]
                assert y.shape == y_stream.shape, (y.shape, y_stream.shape)
                delta = (y_stream - y).norm() / y.norm()
                assert delta <= 1e-6, delta
                num_frames = int((length - kernel) / stride) + 1
                assert num_frames == y_stream.shape[-1]

                assert z.shape == z_stream.shape, (z.shape, z_stream.shape)
                delta = (z_stream - z).norm() / z.norm()
                assert delta <= 1e-6, (delta, (z_stream - z).abs().mean(dim=(0, 1)))


if __name__ == "__main__":
    with torch.no_grad():
        test()
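Note — a minimal sketch of the streaming contract above (an editorial illustration, not part of the uploaded files); it assumes the module is importable as `moshi.modules.streaming`. An offline run and a chunked streaming run must agree on every frame that is ready, because the internal cache carries the (kernel - stride) overlap between chunks:

import torch
from moshi.modules.streaming import RawStreamingConv1d

conv = RawStreamingConv1d(2, 4, kernel_size=8, stride=4)
x = torch.randn(1, 2, 64)
ref = conv(x)  # offline: all frames at once

chunks = []
with conv.streaming(batch_size=1):
    for t in range(0, 64, 16):
        chunks.append(conv(x[..., t:t + 16]))  # cache carries the overlap
out = torch.cat(chunks, dim=-1)
assert out.shape == ref.shape
assert torch.allclose(ref, out, atol=1e-5)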
moshi/modules/transformer.py
ADDED
@@ -0,0 +1,750 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Transformer model with streaming support, CUDA-graphable and
optimized for inference.

See `StreamingTransformer` for more information.
"""

from contextlib import ExitStack
from dataclasses import dataclass
import typing as tp

from einops import rearrange
import torch
import torch.nn as nn
from torch.nn import functional as F

from ..utils.compile import no_compile
from .gating import make_gating
from .rope import RotaryEmbedding
from .streaming import StreamingModule, StreamingContainer


class LayerNormF32(nn.LayerNorm):
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        x_f32 = input.float()
        out_f32 = super().forward(x_f32)
        return out_f32.to(input.dtype)


def _rms_norm(
    x: torch.Tensor,
    alpha: torch.Tensor,
    dtype: tp.Optional[torch.dtype],
    eps: float,
):
    assert x.dim() == 3, f"RMSNorm expects 3D inputs but got {x.shape}"
    x_dtype = x.dtype
    if dtype is not None:
        x = x.to(dtype)
    var = eps + torch.mean(x**2, dim=2, keepdim=True)
    y = (x * (alpha.to(var) * torch.rsqrt(var))).to(x_dtype)
    return y


class RMSNorm(nn.Module):
    def __init__(
        self,
        dim: int,
        eps: float = 1e-5,
        dtype: tp.Optional[torch.dtype] = None,
        device=None,
    ):
        super().__init__()
        self.eps = eps
        self.dtype = dtype
        self.alpha = nn.Parameter(
            torch.full((1, 1, dim), 1.0, requires_grad=True, device=device, dtype=dtype)
        )

    def forward(self, x: torch.Tensor):
        return _rms_norm(x, self.alpha, self.dtype, self.eps)


class LayerScale(nn.Module):
    """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
    This rescales diagonally the residual outputs close to 0, with a learnt scale.

    Args:
        channels (int): Number of channels.
        init (float): Initial scale.
        channel_last (bool): If True, expect `[*, C]` shaped tensors, otherwise, `[*, C, T]`.
        device (torch.device or str, optional): Device on which to initialize the module.
        dtype (torch.dtype, optional): dtype to use to initialize the module.
    """

    def __init__(
        self,
        channels: int,
        init: float = 1e-4,
        channel_last: bool = True,
        device=None,
        dtype=None,
    ):
        super().__init__()
        self.channel_last = channel_last
        self.scale = nn.Parameter(
            torch.full(
                (channels,), init, requires_grad=True, device=device, dtype=dtype
            )
        )

    def forward(self, x: torch.Tensor):
        if self.channel_last:
            return self.scale * x
        else:
            return self.scale[:, None] * x


def create_norm_fn(norm_type: str, dim: int, **kwargs) -> nn.Module:
    """Create normalization module for transformer encoder layer.

    Args:
        norm_type (str): Normalization method.
        dim (int): Dimension of the normalized layer.
        **kwargs (dict): Additional parameters for normalization layer.
    Returns:
        nn.Module: Normalization module.
    """
    if norm_type == "layer_norm":
        return nn.LayerNorm(dim, eps=1e-5, **kwargs)
    elif norm_type == "layer_norm_f32":
        kwargs.pop("dtype", None)
        return LayerNormF32(dim, eps=1e-8, **kwargs)
    elif norm_type in {"rms_norm"}:
        return RMSNorm(dim, eps=1e-5, **kwargs)
    elif norm_type in {"rms_norm_f32"}:
        kwargs.pop("dtype", None)
        return RMSNorm(dim, eps=1e-8, dtype=torch.float, **kwargs)
    else:
        raise ValueError(f"Unknown norm type: {norm_type}")


def create_sin_embedding(
    positions: torch.Tensor,
    dim: int,
    max_period: float = 10000,
    dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Create sinusoidal positional embedding, with shape `[B, T, C]`.

    Args:
        positions (torch.Tensor): LongTensor of positions.
        dim (int): Dimension of the embedding.
        max_period (float): Maximum period of the cosine/sine functions.
        dtype (torch.dtype or str): dtype to use to generate the embedding.
    Returns:
        torch.Tensor: Sinusoidal positional embedding.
    """
    # We aim for BTC format
    assert dim % 2 == 0
    half_dim = dim // 2
    positions = positions.to(dtype)
    adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
    max_period_tensor = torch.full(
        [], max_period, device=positions.device, dtype=dtype
    )  # avoid sync point
    phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)


def multi_linear(
    num_linear: int,
    weight: torch.Tensor,
    x: torch.Tensor,
    offset: int,
):
    """Utility to apply a multi linear layer to the given input. A multi linear layer
    applies a different set of weights for each time step.

    Args:
        num_linear (int): Number of possible time steps and so number of linears.
        weight (torch.Tensor): Weight tensor, with shape `[num_linear * chout, chin]`.
        x (torch.Tensor): Input tensor, with shape `[B, T, C]`.
        offset (int): offset for the current time step, in particular for decoding, with
            time steps provided one by one.
    """
    B, T, C = x.shape
    ys = []
    chout, chin = weight.shape
    weight = weight.view(num_linear, -1, chin)
    for t in range(T):
        y = F.linear(x[:, t], weight[t + offset])
        ys.append(y)
    out = torch.stack(ys, 1)
    return out


def set_attention_context(model: nn.Module, context: tp.Optional[int] = None) -> None:
    """Deactivates or changes the context span (in time steps) in a model.
    Args:
        model (nn.Module): model over which to look for attentions.
        context (int or None): new temporary context value.

    ..Note:: this is not a context manager but a plain function changing the context forever.
        Initially, it was a context manager, but that led to interesting bugs when using
        activation checkpointing, with the context being inconsistent between the forward
        and backward.
    """
    for module in model.modules():
        if isinstance(module, StreamingMultiheadAttention):
            module.context = context


class KVCacheResult(tp.NamedTuple):
    keys: torch.Tensor
    values: torch.Tensor
    positions: torch.Tensor

    @staticmethod
    def from_kv(keys: torch.Tensor, values: torch.Tensor) -> "KVCacheResult":
        B, H, T, D = keys.shape
        assert tuple(values.shape[:-1]) == (B, H, T)
        positions = torch.arange(T, device=keys.device, dtype=torch.long)
        return KVCacheResult(keys, values, positions)


class RingKVCache:
    """Efficient streaming KVCache to be compatible with CUDA Graph.

    Args:
        batch_size (int): Batch size.
        num_heads (int): Number of heads in the attention.
        dim_per_head (int): Dimension per head.
        capacity (int): Number of time steps the ring buffer can hold.
        device (torch.device): Device on which to initialize the cache.
        dtype (torch.dtype): dtype to use for the cache.
    """

    def __init__(
        self,
        batch_size: int,
        num_heads: int,
        dim_per_head: int,
        capacity: int,
        device: torch.device = torch.device("cuda"),
        dtype: torch.dtype = torch.bfloat16,
    ):
        self.capacity = capacity
        self.cache = torch.zeros(
            (2, batch_size, num_heads, capacity, dim_per_head),
            device=device,
            dtype=dtype,
        )
        self.end_offset = torch.zeros(1, device=device, dtype=torch.long)

    def reset(self):
        self.end_offset.zero_()

    def complete(self, k: torch.Tensor, v: torch.Tensor) -> KVCacheResult:
        assert k.shape[:-1] == v.shape[:-1], (k.shape, v.shape)
        B, H, T, D = k.shape
        indexes = torch.arange(T, device=self.end_offset.device, dtype=self.end_offset.dtype) + self.end_offset
        indexes = indexes % self.capacity
        self.cache[0].index_copy_(2, indexes, k)
        self.cache[1].index_copy_(2, indexes, v)
        self.end_offset.add_(T)

        keys = self.cache[0]
        values = self.cache[1]

        indexes = torch.arange(
            self.capacity, device=self.end_offset.device, dtype=torch.long
        )
        invalid = indexes >= self.end_offset

        end_index = self.end_offset % self.capacity
        delta = indexes - end_index

        # If the last key written was for step S with capacity C, it went to index
        # S % C, so end_offset = S + 1 and end_index = (S + 1) % C.
        # For an index with delta <= 0, the entry was written during the current lap
        # of the ring buffer, e.g. for index = S % C, delta = -1 and
        # position(index) = end_offset - 1 = S, all good.
        # For an index with delta > 0, the entry survives from the previous lap,
        # e.g. for index = end_index + 1, delta = 1 and
        # position(index) = end_offset + 1 - C.
        # (The slot at end_index itself maps to end_offset, which the causal mask
        # rejects until that slot is overwritten.)

        positions = torch.where(
            delta <= 0,
            self.end_offset + delta,
            self.end_offset + delta - self.capacity,
        )
        positions = torch.where(invalid, torch.full_like(positions, -1), positions)

        return KVCacheResult(keys, values, positions)


@dataclass
class _MHAState:
    kv_cache: RingKVCache
    offset: torch.Tensor
    offset_cpu: int

    def reset(self):
        self.kv_cache.reset()
        self.offset.zero_()
        self.offset_cpu = 0


class StreamingMultiheadAttention(StreamingModule[_MHAState]):
    """Similar to `nn.MultiheadAttention` but with support for streaming, causal evaluation.

    Args:
        embed_dim (int): Dimension to project to.
        num_heads (int): Number of heads.
        causal (bool): Causal mask applied automatically.
        context (int, optional): Number of time steps the attention can access.
            When causal, can access `context` time steps into the past, and when non causal,
            can access `context // 2` steps in the past, and the same in the future.
        rope (`RotaryEmbedding`, optional): Rope embedding to use.
        weights_per_step (int): use different weights per time step. If non zero, should correspond to the
            number of possible time steps.
        device (torch.device, optional): Device on which to initialize.
        dtype (torch.dtype, optional): dtype to use.
    """

    _fsdp_final = True

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        causal: bool = False,
        context: tp.Optional[int] = None,
        rope: tp.Optional[RotaryEmbedding] = None,
        weights_per_step: int = 0,
        device=None,
        dtype=None,
    ):
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}

        self.embed_dim = embed_dim
        self.causal = causal
        self.context = context
        self.rope = rope
        self.num_heads = num_heads

        out_dim = 3 * embed_dim
        mult = 1
        self.weights_per_step = weights_per_step
        if weights_per_step:
            mult = weights_per_step
        in_proj = nn.Linear(embed_dim, mult * out_dim, bias=False, **factory_kwargs)
        # We try to follow the default PyTorch MHA convention, to easily compare results.
        self.in_proj_weight = in_proj.weight
        self.in_proj_bias = in_proj.bias
        self.out_proj = nn.Linear(
            embed_dim, mult * embed_dim, bias=False, **factory_kwargs
        )

    def _init_streaming_state(self, batch_size: int) -> _MHAState:
        if self.context is None:
            if self.weights_per_step:
                capacity = self.weights_per_step
            else:
                raise RuntimeError(
                    "Cannot create a streaming KVCache without a context to estimate capacity."
                )
        else:
            capacity = self.context
        device = self.in_proj_weight.device
        # TODO: the following estimation will not work great with FSDP.
        dtype = self.in_proj_weight.dtype
        dim_per_head = self.embed_dim // self.num_heads
        kv_cache = RingKVCache(
            batch_size, self.num_heads, dim_per_head, capacity, device, dtype
        )
        return _MHAState(
            kv_cache,
            offset=torch.zeros(1, device=device, dtype=torch.long),
            offset_cpu=0,
        )

    def _complete_kv(self, k, v) -> KVCacheResult:
        state = self._streaming_state
        if state is None:
            return KVCacheResult.from_kv(k, v)
        else:
            return state.kv_cache.complete(k, v)

    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
        state = self._streaming_state
        T = query.shape[1]

        if state is None:
            offset = torch.zeros(1, device=query.device, dtype=torch.long)
            offset_cpu = 0
        else:
            assert self.causal, "Streaming only available for causal"
            offset = state.offset
            offset_cpu = state.offset_cpu

        if self.weights_per_step:
            projected = multi_linear(
                self.weights_per_step, self.in_proj_weight, query, offset_cpu
            )
        else:
            projected = nn.functional.linear(query, self.in_proj_weight)
        q, k, v = rearrange(
            projected, "b t (p h d) -> p b h t d", p=3, h=self.num_heads
        )

        if self.rope:
            q, k = self.rope(q, k, offset, time_before_heads=False)

        k, v, pos_k = self._complete_kv(k, v)
        if self.causal:
            pos_k = pos_k.view(1, -1)
            pos_q = offset + torch.arange(T, device=q.device, dtype=torch.long).view(
                -1, 1
            )
            delta = pos_q - pos_k
            attn_bias = (pos_k >= 0) & (delta >= 0)
            if self.context is not None:
                attn_bias = attn_bias & (delta < self.context)
        else:
            attn_bias = None
        x = F.scaled_dot_product_attention(q, k, v, attn_bias, dropout_p=0.0)

        x = rearrange(x, "b h t d -> b t (h d)")
        if self.weights_per_step:
            x = multi_linear(self.weights_per_step, self.out_proj.weight, x, offset_cpu)
        else:
            x = self.out_proj(x)
        if state is not None:
            state.offset.add_(T)
            state.offset_cpu += T
        return x


@dataclass
class _LayerState:
    offset_cpu: int

    def reset(self):
        self.offset_cpu = 0


class StreamingTransformerLayer(StreamingModule[_LayerState]):
    """TransformerLayer with Streaming / Causal support.

    Args:
        d_model (int): Dimension of the data.
        num_heads (int): Number of heads.
        dim_feedforward (int): Intermediate dimension of FF module.
        causal (bool): Causal mask applied automatically.
        context (int, optional): Receptive field for the causal mask, infinite if None.
        rope (`RotaryEmbedding`, optional): Rope embedding to use.
        norm (str): Normalization to use. See `create_norm_fn` for the supported values.
        layer_scale (float, optional): If not None, LayerScale will be used with the given value as initial scale.
        gating (str): if provided, replaces FFN with special gating, like GLU, GSiGLU etc.
        weights_per_step (int): use different weights per time step. If non zero, should correspond to the
            number of possible time steps.
        skip_self_attn (bool): If True, skips the self-attention module and its norm.
        device (torch.device, optional): Device on which to initialize.
        dtype (torch.dtype, optional): dtype to use.
    """

    _fsdp_final = True

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        dim_feedforward: int | list[int] = 2048,
        causal: bool = False,
        context: tp.Optional[int] = None,
        rope: tp.Optional[RotaryEmbedding] = None,
        norm: str = "layer_norm",
        layer_scale: tp.Optional[float] = None,
        gating: str = "none",
        weights_per_step: int = 0,
        activation=F.gelu,
        skip_self_attn: bool = False,
        device=None,
        dtype=None,
    ):
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}
        # Redefine self_attn to our streaming multi-head attention
        attn_kwargs: tp.Dict[str, tp.Any] = {
            "embed_dim": d_model,
            "num_heads": num_heads,
        }
        if not skip_self_attn:
            self.self_attn: StreamingMultiheadAttention = StreamingMultiheadAttention(
                causal=causal,
                context=context,
                rope=rope,
                weights_per_step=weights_per_step,
                **attn_kwargs,  # type: ignore
                **factory_kwargs,  # type: ignore
            )  # type: ignore
            self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs)
        self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs)
        # Redefine feedforward layers to expose bias parameter
        self.weights_per_step = weights_per_step
        self.gating: tp.Optional[nn.Module] = None
        self.linear1: tp.Optional[nn.Module] = None
        self.linear2: tp.Optional[nn.Module] = None
        self.activation = activation
        self.skip_self_attn = skip_self_attn

        if isinstance(dim_feedforward, list):
            assert dim_feedforward
            assert len(dim_feedforward) == weights_per_step, (
                "Length of dim_feedforward must match weights_per_step,"
                f" got {len(dim_feedforward)} != {weights_per_step}"
            )
        if gating == "none":
            assert (
                not weights_per_step
            ), "weights_per_step without gating not supported for now."
            assert not isinstance(
                dim_feedforward, list
            ), "List dim_feedforward without gating not supported for now."
            self.linear1 = nn.Linear(
                d_model, dim_feedforward, bias=False, **factory_kwargs
            )
            self.linear2 = nn.Linear(
                dim_feedforward, d_model, bias=False, **factory_kwargs
            )
        else:
            self.linear1 = None
            self.linear2 = None
            if weights_per_step:
                if isinstance(dim_feedforward, int):
                    dim_feedforward = [dim_feedforward] * weights_per_step
                assert isinstance(dim_feedforward, list), dim_feedforward
                self.gating = nn.ModuleList(
                    [
                        make_gating(gating, d_model, dim, **factory_kwargs)
                        for dim in dim_feedforward
                    ]
                )
            else:
                assert isinstance(dim_feedforward, int)
                self.gating = make_gating(
                    gating, d_model, dim_feedforward, **factory_kwargs
                )

        self.layer_scale_1: nn.Module
        self.layer_scale_2: nn.Module
        if layer_scale is None:
            self.layer_scale_1 = nn.Identity()
            self.layer_scale_2 = nn.Identity()
        else:
            self.layer_scale_1 = LayerScale(d_model, layer_scale, **factory_kwargs)  # type: ignore
            self.layer_scale_2 = LayerScale(d_model, layer_scale, **factory_kwargs)  # type: ignore

    def _init_streaming_state(self, batch_size: int) -> _LayerState:
        return _LayerState(offset_cpu=0)

    # feed forward block
    def _ff_block(self, x: torch.Tensor) -> torch.Tensor:
        state = self._streaming_state
        offset = 0
        if state is not None:
            offset = state.offset_cpu
        x_orig = x
        x = self.norm2(x)
        if self.gating is None:
            assert self.linear1 is not None
            assert self.linear2 is not None
            update = self.linear2(self.activation(self.linear1(x)))
        else:
            if self.weights_per_step:
                assert isinstance(self.gating, nn.ModuleList)
                B, T, D = x.shape
                ys = []
                for t in range(T):
                    y = self.gating[offset + t](x[:, t : t + 1])
                    ys.append(y)
                update = torch.cat(ys, dim=1)
            else:
                update = self.gating(x)
        return x_orig + self.layer_scale_2(update)

    def _sa_block(self, x: torch.Tensor):
        if self.skip_self_attn:
            return x
        x_orig = x
        x = self.norm1(x)
        update = self.self_attn(x, x, x)
        return x_orig + self.layer_scale_1(update)

    def forward(self, x: torch.Tensor):
        with ExitStack() as stack:
            if x.device.type != 'cuda':
                stack.enter_context(no_compile())
            x = self._sa_block(x)
            x = self._ff_block(x)
            state = self._streaming_state
            if state:
                state.offset_cpu += x.shape[1]
            return x


@dataclass
class _TransformerState:
    offset: torch.Tensor

    def reset(self):
        self.offset.zero_()


class StreamingTransformer(StreamingModule[_TransformerState]):
    """Transformer with Streaming / Causal support.

    Args:
        d_model (int): Dimension of the data.
        num_heads (int): Number of heads.
        dim_feedforward (int): Intermediate dimension of FF module.
        causal (bool): Causal mask applied automatically.
        context (int, optional): Receptive field for the causal mask, infinite if None.
        layer_scale (float, optional): If not None, LayerScale will be used
            with the given value as initial scale.
        positional_embedding (str): Positional embedding strategy (sin, rope, sin_rope, or none).
        max_period (float): Maximum period of the time embedding.
        positional_scale (float): Scale of positional embedding, set to 0 to deactivate.
        layer_class (subclass of `StreamingTransformerLayer`): class to use
            to initialize the layers, allowing further customization outside of AudioCraft.
        device (torch.device, optional): Device on which to initialize.
        dtype (torch.dtype, optional): dtype to use.
        **kwargs: See `StreamingTransformerLayer`.
    """

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        num_layers: int,
        dim_feedforward: int | list[int] = 2048,
        causal: bool = False,
        context: tp.Optional[int] = None,
        positional_embedding: str = "sin",
        max_period: float = 10_000,
        positional_scale: float = 1.0,
        betas: tp.Optional[tp.Tuple[float, float]] = None,
        layer_class: tp.Type[StreamingTransformerLayer] = StreamingTransformerLayer,
        device=None,
        dtype=None,
        **kwargs,
    ):
        super().__init__()
        assert d_model % num_heads == 0

        self.positional_embedding = positional_embedding
        self.max_period = max_period
        self.positional_scale = positional_scale
        self.betas = betas

        assert positional_embedding in {"sin", "rope", "sin_rope", "none"}
        self.rope: tp.Optional[RotaryEmbedding] = None
        if self.positional_embedding in {"rope", "sin_rope"}:
            self.rope = RotaryEmbedding(max_period=max_period)

        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(
                layer_class(
                    d_model=d_model,
                    num_heads=num_heads,
                    dim_feedforward=dim_feedforward,
                    causal=causal,
                    context=context,
                    rope=self.rope,
                    device=device,
                    dtype=dtype,
                    **kwargs,
                )
            )

    def _init_streaming_state(self, batch_size: int) -> _TransformerState:
        device = next(self.parameters()).device
        return _TransformerState(offset=torch.zeros(1, device=device, dtype=torch.long))

    def forward(self, x: torch.Tensor, *args, **kwargs):
        B, T, C = x.shape

        state = self._streaming_state
        if state is None:
            offset = torch.zeros(1, dtype=torch.long, device=x.device)
        else:
            offset = state.offset

        if self.positional_embedding in {"sin", "sin_rope"}:
            positions = torch.arange(T, device=x.device).view(1, -1, 1)
            positions = positions + offset.view(-1, 1, 1)
            pos_emb = create_sin_embedding(
                positions, C, max_period=self.max_period, dtype=x.dtype
            )
            x = x + self.positional_scale * pos_emb

        for layer in self.layers:
            x = layer(x, *args, **kwargs)

        if state is not None:
            state.offset.add_(T)
        return x


class ProjectedTransformer(StreamingContainer):
    """Transformer with optional projections of the input and output to different dimensions when needed.
    Supports multiple outputs.

    Args:
        input_dimension (int): dimension of the input.
        output_dimensions (tuple[int]): dimensions of the outputs.
        d_model (int): inner dimension of the Transformer.
        conv_layout (bool): If True, expects `[B, C, T]` shaped tensors, otherwise, `[B, T, C]`.
            Similarly, the output will have the same layout.
    """

    def __init__(
        self,
        input_dimension: int,
        output_dimensions: tp.Tuple[int, ...],
        d_model: int,
        *,
        conv_layout: bool = False,
        **kwargs,
    ):
        super().__init__()
        self.transformer = StreamingTransformer(d_model=d_model, **kwargs)
        self.input_dimension = input_dimension
        self.output_dimensions = output_dimensions
        self.conv_layout = conv_layout
        self.input_proj = None
        if d_model != input_dimension:
            self.input_proj = nn.Linear(input_dimension, d_model, bias=False)

        self.output_projs = nn.ModuleList()
        for output_dimension in output_dimensions:
            if d_model == output_dimension:
                self.output_projs.append(nn.Identity())
            else:
                self.output_projs.append(
                    nn.Linear(d_model, output_dimension, bias=False)
                )

    def forward(self, x, *args, **kwargs):
        if self.conv_layout:
            x = x.transpose(1, 2)
        if self.input_proj is not None:
            x = self.input_proj(x)
        z = self.transformer(x, *args, **kwargs)
        ys = []
        for output_proj in self.output_projs:
            y = output_proj(z)
            if self.conv_layout:
                y = y.transpose(1, 2)
            ys.append(y)
        return ys
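Note — a minimal sketch of the streaming transformer path above (an editorial illustration, not part of the uploaded files); it assumes the module is importable as `moshi.modules.transformer` and a CPU run with the default layer settings. Offline and chunked streaming passes should agree, since the per-layer RingKVCache replays the same causal attention pattern:

import torch
from moshi.modules.transformer import StreamingTransformer

model = StreamingTransformer(
    d_model=64, num_heads=4, num_layers=2, dim_feedforward=128,
    causal=True, context=32, positional_embedding="sin",
)
model.eval()
x = torch.randn(1, 16, 64)
with torch.no_grad():
    ref = model(x)  # offline: causal mask over the full sequence
    with model.streaming(batch_size=1):
        chunks = [model(x[:, t:t + 4]) for t in range(0, 16, 4)]
out = torch.cat(chunks, dim=1)
print((ref - out).abs().max())  # expected to be on the order of 1e-6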
moshi/quantization/__init__.py
ADDED
@@ -0,0 +1,13 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""RVQ."""
# flake8: noqa
from .vq import ResidualVectorQuantizer, SplitResidualVectorQuantizer
from .base import BaseQuantizer, DummyQuantizer, QuantizedResult
moshi/quantization/base.py
ADDED
@@ -0,0 +1,170 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Base class for all quantizers.
"""

from dataclasses import dataclass, field
import typing as tp

import torch
from torch import nn


@dataclass
class QuantizedResult:
    x: torch.Tensor
    codes: torch.Tensor
    bandwidth: torch.Tensor  # bandwidth in kb/s used, per batch item.
    penalty: tp.Optional[torch.Tensor] = None
    metrics: dict = field(default_factory=dict)


class BaseQuantizer(nn.Module):
    """Base class for quantizers."""

    def __init__(self):
        super().__init__()
        self._ema_frozen = False

    def forward(self, x: torch.Tensor, frame_rate: int) -> QuantizedResult:
        """
        Given input tensor x, returns first the quantized (or approximately quantized)
        representation along with quantized codes, bandwidth, and any penalty term for the loss.
        Finally, this returns a dict of metrics to update logging etc.
        Frame rate must be passed so that the bandwidth is properly computed.
        """
        raise NotImplementedError()

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode a given input tensor with the specified sample rate at the given bandwidth."""
        raise NotImplementedError()

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """Decode the given codes to the quantized representation."""
        raise NotImplementedError()

    @property
    def cardinality(self) -> int:
        """Cardinality of each codebook."""
        raise NotImplementedError()

    @property
    def total_codebooks(self) -> int:
        """Total number of codebooks."""
        raise NotImplementedError()

    @property
    def num_codebooks(self) -> int:
        """Number of active codebooks."""
        raise NotImplementedError()

    @property
    def semantic_quantizer(self) -> 'BaseQuantizer':
        """This returns the quantizer that models the first level of the hierarchy (typically semantic).

        In this case, it's the quantizer itself.
        """
        return self

    @property
    def acoustic_quantizer(self) -> 'BaseQuantizer':
        """This returns the quantizer that models the higher levels of the hierarchy (typically acoustic).

        In this case, it's the quantizer itself.
        """
        return self

    def set_num_codebooks(self, n: int) -> None:
        """Set the number of active codebooks."""
        raise NotImplementedError()

    @property
    def ema_frozen(self) -> bool:
        """Whether to apply ema to the codebooks."""
        return self._ema_frozen

    def ema_frozen_(self, ema_frozen: bool) -> None:
        """Set whether ema should be applied to the codebooks."""
        self._ema_frozen = ema_frozen


class DummyQuantizer(BaseQuantizer):
    """Fake quantizer that actually does not perform any quantization."""

    def __init__(
        self,
        dimension: int,
        input_dimension: tp.Optional[int] = None,
        output_dimension: tp.Optional[int] = None,
    ):
        super().__init__()
        self.dimension = dimension
        self.input_dimension = input_dimension or dimension
        self.output_dimension = output_dimension or dimension
        self.input_proj: torch.nn.Module
        self.output_proj: torch.nn.Module
        if self.input_dimension == self.dimension:
            self.input_proj = torch.nn.Identity()
        else:
            self.input_proj = torch.nn.Conv1d(
                self.input_dimension, self.dimension, 1, bias=False
            )
        if self.output_dimension == self.dimension:
            self.output_proj = torch.nn.Identity()
        else:
            self.output_proj = torch.nn.Conv1d(
                self.dimension, self.output_dimension, 1, bias=False
            )

    def forward(self, x: torch.Tensor, frame_rate: int):
        q = x.unsqueeze(1)
        x = self.output_proj(self.input_proj(x))
        return QuantizedResult(
            x, q, torch.tensor(q.numel() * 32 * frame_rate / 1000 / len(x)).to(x)
        )

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode a given input tensor with the specified sample rate at the given bandwidth.
        In the case of the DummyQuantizer, the codes are actually identical
        to the input and resulting quantized representation as no quantization is done.
        """
        x = self.input_proj(x)
        return x.unsqueeze(1)

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """Decode the given codes to the quantized representation.
        In the case of the DummyQuantizer, the codes are actually identical
        to the input and resulting quantized representation as no quantization is done.
        """
        y = codes.squeeze(1)
        return self.output_proj(y)

    @property
    def total_codebooks(self):
        """Total number of codebooks."""
        return 1

    @property
    def num_codebooks(self):
        """Number of active codebooks."""
        return self.total_codebooks

    def set_num_codebooks(self, n: int):
        """Set the number of active codebooks."""
        raise AttributeError(
            "Cannot override the number of codebooks for the dummy quantizer"
        )

    @property
    def cardinality(self) -> int:
        """Cardinality of each codebook."""
        return 1
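Note — a minimal sketch of the quantizer interface above (an editorial illustration, not part of the uploaded files); it assumes the package is importable as `moshi.quantization`. The DummyQuantizer round-trips its input unchanged (up to the optional projections), which makes it handy for ablating quantization while keeping the rest of the pipeline intact:

import torch
from moshi.quantization import DummyQuantizer

q = DummyQuantizer(dimension=8)
x = torch.randn(2, 8, 50)      # [B, C, T] frames
codes = q.encode(x)            # [B, 1, C, T]: the "codes" are the input itself
y = q.decode(codes)
assert torch.equal(x, y)
res = q(x, frame_rate=25)      # QuantizedResult with bandwidth bookkeeping
print(res.bandwidth)           # kb/s per batch item for float32 "codes"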
moshi/quantization/core_vq.py
ADDED
@@ -0,0 +1,384 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import typing as tp

from einops import rearrange
import torch
from torch import nn
from torch import distributed
import torch.nn.functional as F


class _CodebookForwardResult(tp.NamedTuple):
    quantized: torch.Tensor
    codes: torch.Tensor
    metrics: tp.Dict[str, torch.Tensor]


class _VQForwardResult(tp.NamedTuple):
    quantized: torch.Tensor
    codes: torch.Tensor
    loss: torch.Tensor
    metrics: tp.Dict[str, torch.Tensor]


def _ema_inplace(moving_avg: torch.Tensor, new: torch.Tensor, decay: float) -> None:
    moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))


def _uniform_init(*shape: int) -> torch.Tensor:
    t = torch.empty(shape)
    nn.init.kaiming_uniform_(t)
    return t


def _sample_vectors(samples: torch.Tensor, num: int) -> torch.Tensor:
    num_samples, device = samples.shape[0], samples.device

    if num_samples >= num:
        indices = torch.randperm(num_samples, device=device)[:num]
    else:
        indices = torch.randint(0, num_samples, (num,), device=device)

    return samples[indices]


def _compute_entropy(usage: torch.Tensor) -> torch.Tensor:
    # Usage is some unnormalized distribution.
    proba = usage / usage.sum()
    p_log_p = torch.where(
        proba == 0, zero_scalar(usage.device), proba * torch.log(proba)
    )
    return -p_log_p.sum()


def _is_distributed() -> bool:
    # Checks if we need to use distributed routines.
    return distributed.is_initialized() and distributed.get_world_size() > 1


def zero_scalar(device) -> torch.Tensor:
    """Returns a 0. value on the given device without introducing a synchronization point."""
    return torch.zeros([1], device=device)[0]


class EuclideanCodebook(nn.Module):
    """Codebook with Euclidean distance.

    Args:
        dim (int): Dimension.
        codebook_size (int): Codebook size.
        decay (float): Decay for exponential moving average over the codebooks.
        epsilon (float): Epsilon value for numerical stability.
        threshold_usage_ratio (float): Defines the threshold for the cluster usage under which a centroid
            is replaced. This is expressed as a fraction of the usage a centroid would get under
            a uniform distribution, so that it doesn't depend on the batch size etc.
        replaced_usage_ratio (float): When replacing a centroid, use this as an initial centroid usage,
            to avoid the centroid getting replaced too quickly.
        check_unused_every (int): Check for unused centroids every `check_unused_every` iterations.
            This is to avoid too many synchronization points.

    Buffers:
        cluster_usage (torch.Tensor): EMA of the cluster usage per batch, e.g. this will
            be dependent on the batch size etc.
        embedding_sum (torch.Tensor): EMA of the sum of the assigned points to each cluster.
            In particular, this can be normalized by `cluster_usage` to obtain the
            actual cluster centroids.
    """

    def __init__(
        self,
        dim: int,
        codebook_size: int,
        decay: float = 0.99,
        epsilon: float = 1e-5,
        threshold_usage_ratio: float = 0.1,
        replaced_usage_ratio: float = 1.0,
        check_unused_every: int = 5,
    ):
        super().__init__()
        self.decay = decay
        embedding = torch.zeros(codebook_size, dim)

        self.dim = dim
        self.codebook_size = codebook_size

        self.epsilon = epsilon
        self.threshold_usage_ratio = threshold_usage_ratio
        self.replaced_usage_ratio = replaced_usage_ratio
        self.check_unused_every = check_unused_every
        self._next_unused_check = check_unused_every

        self.register_buffer("_initialized", torch.tensor([False], dtype=torch.float))
        self.register_buffer("cluster_usage", torch.ones(codebook_size))
        self.register_buffer("embedding_sum", embedding)
        self.register_buffer("_embedding", None, persistent=False)
        self._cached_initialized = False

    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs) -> None:
        # Mapping old names to new names
        mappings = {
            "inited": "_initialized",
            "cluster_size": "cluster_usage",
            "embed_avg": "embedding_sum",
            "embed_sum": "embedding_sum",
        }
        for old_name, new_name in mappings.items():
            old_name = prefix + old_name
            if old_name in state_dict:
                value = state_dict.pop(old_name)
                if new_name is not None:
                    state_dict[prefix + new_name] = value
        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)

    @property
    def embedding(self) -> torch.Tensor:
        if self._embedding is None:
            embedding = (
                self.embedding_sum / self.cluster_usage.clamp(min=self.epsilon)[:, None]
            )
            self.register_buffer("_embedding", embedding, persistent=False)
            return embedding
        return self._embedding

    def _broadcast_buffers(self) -> None:
        if _is_distributed():
            for buffer in self.buffers():
                distributed.broadcast(buffer, 0)

    def _replace_expired_codes(self, samples: torch.Tensor, mask: torch.Tensor) -> None:
        # Replaces expired centroids, as indicated by `mask` (a true value indicates the code needs to be replaced).
        # The new codes are sampled from the batch `samples`.
        new_vectors = _sample_vectors(samples, self.codebook_size)
        replace_cluster_usage = (
            self.replaced_usage_ratio * self.cluster_usage.sum() / self.codebook_size
        )
        self.embedding_sum[:] = torch.where(
            mask[:, None], replace_cluster_usage * new_vectors, self.embedding_sum
        )
        self.cluster_usage[:] = torch.where(
            mask, replace_cluster_usage, self.cluster_usage
        )

    def _reshape_input(self, x: torch.Tensor) -> torch.Tensor:
        # Flattens all the dimensions but the last one, e.g. returns a tensor of shape `[N, D]`.
        x = rearrange(x, "... d -> (...) d")
        return x

    def _reshape_codes(self, codes: torch.Tensor, shape: torch.Size) -> torch.Tensor:
        return codes.view(*shape[:-1])

    def _quantize(self, x: torch.Tensor) -> torch.Tensor:
        # Projects each vector in `x` over the nearest centroid and returns its index.
        # `x` should be `[N, D]` with `N` the number of input vectors and `D` the dimension.
        assert x.dim() == 2
        dists = torch.cdist(x[None], self.embedding[None], p=2)[0]
        codes = dists.argmin(dim=-1)
        return codes

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Given a tensor `x` of shape `[*, D]`, returns a tensor of integer codes of shape `[*]`.
        The codes are defined as the indexes of the centroids nearest to each vector in `x`.
        """
        assert x.dtype.is_floating_point, f"Input should be floats, got {x.dtype}"
        shape = x.shape
        x = self._reshape_input(x)
        codes = self._quantize(x)
        codes = self._reshape_codes(codes, shape)
        return codes

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """Given a tensor of codes of shape `[*]`, returns a tensor of shape `[*, D]`,
        corresponding to the centroids associated to each code index.
        """
        assert (
            not codes.dtype.is_floating_point
        ), f"Codes should be integers, got {codes.dtype}"
        quantized = F.embedding(codes, self.embedding)
        return quantized

    def forward(
        self, x: torch.Tensor, initialize: bool = True
    ) -> _CodebookForwardResult:
        shape = x.shape
        x = self._reshape_input(x)

        flat_codes = self._quantize(x)
        codes = self._reshape_codes(flat_codes, shape)
        quantized = self.decode(codes)
        metrics: tp.Dict[str, torch.Tensor] = {}

        return _CodebookForwardResult(quantized, codes, metrics)


class VectorQuantization(nn.Module):
    """Vector quantization implementation.
    Currently supports only euclidean distance.

    Args:
        dim (int): Dimension
        codebook_size (int): Codebook size
        codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
        decay (float): Decay for exponential moving average over the codebooks.
        epsilon (float): Epsilon value for numerical stability.
        threshold_usage_ratio (float): Defines the threshold for the cluster usage under which a centroid
            is replaced. This is expressed as a fraction of the usage a centroid would get under
            a uniform distribution, so that it doesn't depend on the batch size etc.
        replaced_usage_ratio (float): When replacing a centroid, use this as an initial centroid usage,
            to avoid the centroid getting replaced too quickly.
        check_unused_every (int): Check for unused centroids every `check_unused_every` iterations.
            This is to avoid too many synchronization points.
    """

    def __init__(
        self,
        dim: int,
        codebook_size: int,
        codebook_dim: tp.Optional[int] = None,
        decay: float = 0.99,
        epsilon: float = 1e-5,
        threshold_usage_ratio: float = 0.1,
        **kwargs,
    ):
        super().__init__()
        if codebook_dim is None:
            codebook_dim = dim

        requires_projection = codebook_dim != dim
        self.project_in = (
            nn.Linear(dim, codebook_dim) if requires_projection else nn.Identity()
        )
        self.project_out = (
            nn.Linear(codebook_dim, dim) if requires_projection else nn.Identity()
        )
        self.epsilon = epsilon
        self._codebook = EuclideanCodebook(
            dim=codebook_dim,
            codebook_size=codebook_size,
            decay=decay,
            epsilon=epsilon,
            threshold_usage_ratio=threshold_usage_ratio,
            **kwargs,
        )
        self.codebook_size = codebook_size

    @property
    def embedding(self):
        return self._codebook.embedding

    def _rearrange_input(self, x):
        x = rearrange(x, "b d n -> b n d")
        return x

    def _rearrange_output(self, quantized):
        quantized = rearrange(quantized, "b n d -> b d n")
        return quantized

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encodes `x` into discrete integer codes."""
        x = self._rearrange_input(x)
        x = self.project_in(x)
        codes = self._codebook.encode(x)
        return codes

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """Converts integer codes into quantized vectors."""
        quantized = self._codebook.decode(codes)
        quantized = self.project_out(quantized)
        quantized = self._rearrange_output(quantized)
        return quantized

    def forward(self, x: torch.Tensor, initialize: bool = True) -> _VQForwardResult:
        x = self._rearrange_input(x)
        quantized, codes, metrics = self._codebook(x, initialize=initialize)

        loss = zero_scalar(x.device)

        quantized = self.project_out(quantized)
        quantized = self._rearrange_output(quantized)

        return _VQForwardResult(quantized, codes, loss, metrics)


class ResidualVectorQuantization(nn.Module):
    """Residual vector quantization implementation.

    Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
    """

    def __init__(self, *, num_quantizers: int, codebook_offset: int, **kwargs):
        super().__init__()
        self.layers = nn.ModuleList(
            [VectorQuantization(**kwargs) for _ in range(num_quantizers)]
        )
        self.codebook_offset = codebook_offset

    def forward(
        self, x: torch.Tensor, n_q: tp.Optional[int] = None
    ) -> _VQForwardResult:
        """
        Args:
            x (torch.Tensor): input tensor to quantize, of shape `[B, C, T]`.
            n_q (int or None): if provided, number of codebook levels to use in RVQ.
        """

        quantized_out = zero_scalar(x.device)
        residual = x

        all_losses = []
        all_codes = []
        all_metrics: tp.Dict[str, torch.Tensor] = {}

        n_q = n_q or len(self.layers)
        previous_layer_is_initialized = True

        for i, layer in enumerate(self.layers[:n_q]):  # type: ignore
            quantized, codes, loss, metrics = layer(
                residual, initialize=previous_layer_is_initialized
            )

            quantized = quantized.detach()
            residual = residual - quantized
            quantized_out = quantized_out + quantized

            all_codes.append(codes)
            all_losses.append(loss)

            for key, value in metrics.items():
                if key in all_metrics:
                    all_metrics[key] += value / n_q
                else:
                    all_metrics[key] = value / n_q
                all_metrics[key + f"_{i + self.codebook_offset}"] = value

        out_losses, out_codes = map(torch.stack, (all_losses, all_codes))
        return _VQForwardResult(quantized_out, out_codes, out_losses, all_metrics)

    def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
        """Encodes `x` into discrete integer codes. If `n_q` is provided, only uses the first `n_q` codebook levels."""
        residual = x
        all_indices = []
        n_q = n_q or len(self.layers)
        for layer in self.layers[:n_q]:  # type: ignore
            indices = layer.encode(residual)
            quantized = layer.decode(indices)
            residual = residual - quantized
            all_indices.append(indices)
        out_indices = torch.stack(all_indices)
        return out_indices

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """Converts the integer codes into quantized vectors."""
        quantized = zero_scalar(codes.device)
        for idx, layer_codes in enumerate(codes):
            layer = self.layers[idx]
            quantized = quantized + layer.decode(layer_codes)
        return quantized
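A minimal usage sketch for the `ResidualVectorQuantization` stack above; the sizes are illustrative and the module is untrained here, so only the shapes are meaningful. Inputs follow the `[B, D, T]` convention implied by `VectorQuantization._rearrange_input`.

import torch

rvq = ResidualVectorQuantization(
    num_quantizers=8, codebook_offset=0, dim=128, codebook_size=1024
)
x = torch.randn(4, 128, 75)       # [B, D, T] latent frames, D matching `dim`
codes = rvq.encode(x)             # [K, B, T] with K = 8 codebook levels
y = rvq.decode(codes)             # [B, D, T] reconstruction from all levels
coarse = rvq.decode(codes[:2])    # fewer levels -> coarser reconstruction

Each level quantizes the residual left by the previous one, which is why dropping the last levels degrades quality gracefully rather than breaking decoding.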
moshi/quantization/vq.py
ADDED
@@ -0,0 +1,340 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math
import typing as tp

import torch

from .base import BaseQuantizer, QuantizedResult
from .core_vq import ResidualVectorQuantization


class ResidualVectorQuantizer(BaseQuantizer):
    """Residual Vector Quantizer.

    Args:
        dimension (int): Dimension of the codebooks.
        input_dimension (None or int): dimension of the input, defaults to `dimension` if not provided.
        output_dimension (None or int): dimension of the output, defaults to `dimension` if not provided.
        n_q (int): Number of vector quantizers used.
        q_dropout (bool): Random quantizer drop out at train time.
        no_quantization_rate (float): Gives the probability of applying no quantization at all
            at train time. The RVQ codebooks will still get the input value to learn the proper codebook.
        bins (int): Codebook size.
        decay (float): Decay for exponential moving average over the codebooks.
        threshold_usage_ratio (float): Defines the threshold for the cluster usage under which a centroid
            is replaced. This is expressed as a fraction of the usage a centroid would get under
            a uniform distribution, so that it doesn't depend on the batch size etc.
        replaced_usage_ratio (float): When replacing a centroid, use this as an initial centroid usage,
            to avoid the centroid getting replaced too quickly.
        codebook_offset (int): Offset to use for the codebook indices. This is useful when using multiple quantizers
            such as in SplitResidualVectorQuantizer.
        force_projection (bool): Whether to force input and output projections even when dimension is constant.
        generator_seed (int or None): seed used to initialize the RNG used for no quantization.
    """

    def __init__(
        self,
        dimension: int = 128,
        input_dimension: tp.Optional[int] = None,
        output_dimension: tp.Optional[int] = None,
        n_q: int = 8,
        q_dropout: bool = False,
        q_first_only_proba: float = 0.0,
        no_quantization_rate: float = 0.0,
        bins: int = 1024,
        decay: float = 0.99,
        threshold_usage_ratio: float = 0.1,
        replaced_usage_ratio: float = 1.0,
        codebook_offset: int = 0,
        force_projection: bool = False,
        generator_seed: tp.Optional[int] = None,
    ):
        super().__init__()
        self.max_n_q = n_q
        self.n_q = n_q
        self.q_dropout = q_dropout
        self.no_quantization_rate = no_quantization_rate
        self.q_first_only_proba = q_first_only_proba
        self.dimension = dimension
        self.input_dimension = input_dimension or dimension
        self.output_dimension = output_dimension or dimension
        self.bins = bins
        self.decay = decay
        self.input_proj: torch.nn.Module
        self.output_proj: torch.nn.Module
        self.generator = None
        if generator_seed is not None:
            self.generator = torch.Generator(
                device="cuda" if torch.cuda.is_available() else "cpu"
            )
            self.generator.manual_seed(generator_seed)
        if self.input_dimension == self.dimension and not force_projection:
            self.input_proj = torch.nn.Identity()
        else:
            self.input_proj = torch.nn.Conv1d(
                self.input_dimension, self.dimension, 1, bias=False
            )
        if self.output_dimension == self.dimension and not force_projection:
            self.output_proj = torch.nn.Identity()
        else:
            self.output_proj = torch.nn.Conv1d(
                self.dimension, self.output_dimension, 1, bias=False
            )
        self.vq = ResidualVectorQuantization(
            dim=self.dimension,
            codebook_size=self.bins,
            num_quantizers=self.n_q,
            decay=self.decay,
            threshold_usage_ratio=threshold_usage_ratio,
            replaced_usage_ratio=replaced_usage_ratio,
            codebook_offset=codebook_offset,
        )

    def forward(self, x: torch.Tensor, frame_rate: int):
        """
        Args:
            x (torch.Tensor): Input tensor of shape [B, C, T] with `C` number of channels.
            frame_rate (int): frame rate of the input (e.g. `T = frame_rate * duration`), used to compute
                the bandwidth.

        Returns:
            QuantizedResult: Quantized result with the following attributes:
                - `x` (torch.Tensor): Quantized tensor of shape [B, C, T].
                - `codes` (torch.Tensor): Quantized codes of shape [B, K, T] with `K` number of codebooks.
                - `bw` (torch.Tensor): Bandwidth of the quantized tensor in kbits per second.
                - `penalty` (torch.Tensor): Commitment loss.
                - `metrics` (dict): RVQ metrics, in particular rate of dead code replacement, and entropy.
        """
        n_q = self.n_q
        x = self.input_proj(x)

        bw_per_q = math.log2(self.bins) * frame_rate / 1000
        quantized, codes, commit_loss, metrics = self.vq(x, n_q=n_q)
        B, _, _ = quantized.shape
        quantized = self.output_proj(quantized)
        codes = codes.transpose(0, 1)
        # codes is [B, K, T], with T frames, K nb of codebooks.
        bw = torch.tensor(n_q * bw_per_q).to(x)
        return QuantizedResult(
            quantized, codes, bw, penalty=torch.mean(commit_loss), metrics=metrics
        )

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode a given input tensor with the specified frame rate at the given bandwidth.
        The RVQ encode method sets the appropriate number of quantizers to use
        and returns indices for each quantizer.
        """
        n_q = self.n_q
        if x.shape[-1] == 0:
            return torch.empty((x.shape[0], n_q, 0), device=x.device, dtype=torch.int64)

        x = self.input_proj(x)
        codes = self.vq.encode(x, n_q=n_q)
        codes = codes.transpose(0, 1)
        # codes is [B, K, T], with T frames, K nb of codebooks.
        return codes

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """Decode the given codes to the quantized representation."""
        # codes is [B, K, T], with T frames, K nb of codebooks, vq.decode expects [K, B, T].
        codes = codes.transpose(0, 1)
        quantized = self.vq.decode(codes)
        quantized = self.output_proj(quantized)
        return quantized

    @property
    def total_codebooks(self):
        return self.max_n_q

    @property
    def num_codebooks(self):
        return self.n_q

    def set_num_codebooks(self, n: int):
        assert n >= 0 and n <= self.max_n_q
        self.n_q = n

    @property
    def cardinality(self) -> int:
        return self.bins


class SplitResidualVectorQuantizer(BaseQuantizer):
    """Residual Vector Quantizer with separate projections for the first quantizer and the rest.

    Args:
        n_q (int): Number of residual vector quantizers used.
        n_q_semantic (int): Number of residual vector quantizers used for the semantic quantizer.
        no_quantization_mode (str): if 'true_skip', when doing no quantization, the input will not go
            through the sub quantizers. If `independent`, independent decisions are taken by
            the semantic and acoustic quantizers. If `same` (the default), the same decision is taken by both.
        **kwargs: Arguments to the constructor of `ResidualVectorQuantizer` that are shared between both.
    """

    def __init__(
        self,
        *,
        n_q: int = 8,
        no_quantization_rate: float = 0.0,
        no_quantization_mode: str = "same",
        n_q_semantic: int = 1,
        **kwargs,
    ):
        super().__init__()
        assert n_q > n_q_semantic, (
            f"Number of quantizers {n_q} must be larger "
            f"than the number of semantic quantizers {n_q_semantic}."
        )
        self.max_n_q = n_q
        self.n_q_semantic = n_q_semantic
        self.n_q_acoustic = n_q - n_q_semantic
        if no_quantization_mode == "true_skip":
            self.no_quantization_rate = no_quantization_rate
            # Setting to zero for the underlying RVQ.
            no_quantization_rate = 0.0
        else:
            self.no_quantization_rate = 0.0
        if no_quantization_mode == "same":
            kwargs["generator_seed"] = 1234
        kwargs["no_quantization_rate"] = no_quantization_rate
        q_dropout = kwargs.pop("q_dropout", False)
        self.rvq_first = ResidualVectorQuantizer(
            n_q=n_q_semantic, force_projection=True, q_dropout=False, **kwargs
        )
        self.rvq_rest = ResidualVectorQuantizer(
            n_q=n_q - n_q_semantic,
            codebook_offset=1,
            force_projection=True,
            q_dropout=q_dropout,
            **kwargs,
        )
        if no_quantization_mode == "true_skip":
            assert self.rvq_first.input_dimension == self.rvq_first.output_dimension
            assert self.rvq_rest.input_dimension == self.rvq_rest.output_dimension

    def _renorm_and_add(
        self,
        first_val: torch.Tensor,
        rest_val: torch.Tensor,
        n_q_semantic: int,
        n_q_acoustic: int,
    ):
        """Renormalizes values from `rvq_first` and `rvq_rest` and adds them.

        This allows correcting statistics that are normalized by the number of quantizers. To renormalize, we use the
        number of quantizers that are actually used, e.g. taking into account quantizer dropout.
        """
        n_q = n_q_semantic + n_q_acoustic
        renorm_first_val = first_val * n_q_semantic / n_q
        renorm_rest_val = rest_val * n_q_acoustic / n_q
        return renorm_first_val + renorm_rest_val

    def forward(self, x: torch.Tensor, frame_rate: int):
        """
        Args:
            x (torch.Tensor): Input tensor of shape [B, C, T] with `C` number of channels.
            frame_rate (int): frame rate of the input (e.g. `T = frame_rate * duration`), used to compute
                the bandwidth.

        Returns:
            QuantizedResult: Quantized result with the following attributes:
                - `x` (torch.Tensor): Quantized tensor of shape [B, C, T].
                - `codes` (torch.Tensor): Quantized codes of shape [B, K, T] with `K` number of codebooks.
                - `bw` (torch.Tensor): Bandwidth of the quantized tensor in kbits per second.
                - `penalty` (torch.Tensor): Commitment loss.
                - `metrics` (dict): RVQ metrics, in particular rate of dead code replacement, and entropy.
        """
        semantic_result = self.rvq_first(x, frame_rate)
        if self.n_q == self.n_q_semantic:
            return semantic_result
        acoustic_result = self.rvq_rest(x, frame_rate)
        full_quantized_emb = semantic_result.x + acoustic_result.x
        full_quantized_codes = torch.cat(
            [semantic_result.codes, acoustic_result.codes], dim=1
        )
        # This is the actual number of quantizers used, e.g. taking into account quantizer dropout.
        n_q_semantic = semantic_result.codes.shape[1]
        n_q_acoustic = acoustic_result.codes.shape[1]
        full_quantized_bandwidth = semantic_result.bandwidth + acoustic_result.bandwidth
        full_quantized_penalty = self._renorm_and_add(
            semantic_result.penalty, acoustic_result.penalty, n_q_semantic, n_q_acoustic
        )
        full_quantized_metrics = semantic_result.metrics
        for key, value in acoustic_result.metrics.items():
            if key in full_quantized_metrics:
                full_quantized_metrics[key] = self._renorm_and_add(
                    full_quantized_metrics[key], value, n_q_semantic, n_q_acoustic
                )
            else:
                full_quantized_metrics[key] = value
        return QuantizedResult(
            full_quantized_emb,
            full_quantized_codes,
            full_quantized_bandwidth,
            penalty=full_quantized_penalty,
            metrics=full_quantized_metrics,
        )

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode a given input tensor with the specified frame rate at the given bandwidth.
        The RVQ encode method sets the appropriate number of quantizers to use
        and returns indices for each quantizer.
        """
        codes = self.rvq_first.encode(x)
        if self.n_q > self.n_q_semantic:
            acoustic_codes = self.rvq_rest.encode(x)
            codes = torch.cat([codes, acoustic_codes], dim=1)
        # codes is [B, K, T], with T frames, K nb of codebooks.
        return codes

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """Decode the given codes to the quantized representation."""
        # codes is [B, K, T], with T frames, K nb of codebooks.
        quantized = self.rvq_first.decode(codes[:, : self.n_q_semantic])
        if codes.shape[1] > self.n_q_semantic:
            quantized += self.rvq_rest.decode(codes[:, self.n_q_semantic :])
        return quantized

    @property
    def total_codebooks(self):
        return self.rvq_first.max_n_q + self.rvq_rest.max_n_q

    @property
    def num_codebooks(self):
        return self.rvq_first.num_codebooks + self.rvq_rest.num_codebooks

    @property
    def n_q(self):
        return self.rvq_first.n_q + self.rvq_rest.n_q

    @property
    def dimension(self):
        return self.rvq_first.dimension

    @property
    def semantic_quantizer(self) -> ResidualVectorQuantizer:
        """This returns the quantizer that models the first level of the hierarchy (typically semantic)."""
        return self.rvq_first

    @property
    def acoustic_quantizer(self) -> ResidualVectorQuantizer:
        """This returns the quantizer that models the higher levels of the hierarchy (typically acoustic)."""
        return self.rvq_rest

    def set_num_codebooks(self, n: int):
        assert n >= self.n_q_semantic and n <= self.total_codebooks
        self.rvq_rest.set_num_codebooks(n - self.n_q_semantic)

    @property
    def cardinality(self) -> int:
        assert self.rvq_rest.cardinality == self.rvq_first.cardinality
        return self.rvq_first.cardinality
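A corresponding sketch for `SplitResidualVectorQuantizer`, with illustrative sizes; note that `encode` stacks the semantic codebook(s) before the acoustic ones along dim 1, which is what the slicing in `decode` relies on.

import torch

quantizer = SplitResidualVectorQuantizer(
    n_q=8, n_q_semantic=1, dimension=256, bins=2048   # illustrative sizes
)
x = torch.randn(2, 256, 100)                     # [B, C, T] latent frames
codes = quantizer.encode(x)                      # [B, 8, T]: 1 semantic + 7 acoustic codebooks
y = quantizer.decode(codes)                      # [B, C, T]
semantic_only = quantizer.decode(codes[:, :1])   # decode from the semantic level alone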
moshi/utils/__init__.py
ADDED
@@ -0,0 +1,10 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Utilities."""
moshi/utils/autocast.py
ADDED
@@ -0,0 +1,45 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch


class TorchAutocast:
    """TorchAutocast utility class.
    Allows you to enable and disable autocast. This is especially useful
    when dealing with different architectures and clusters with different
    levels of support.

    Args:
        enabled (bool): Whether to enable torch.autocast or not.
        args: Additional args for torch.autocast.
        kwargs: Additional kwargs for torch.autocast.
    """

    def __init__(self, enabled: bool, *args, **kwargs):
        self.autocast = torch.autocast(*args, **kwargs) if enabled else None

    def __enter__(self):
        if self.autocast is None:
            return
        try:
            self.autocast.__enter__()
        except RuntimeError:
            device = self.autocast.device
            dtype = self.autocast.fast_dtype
            raise RuntimeError(
                f"There was an error autocasting with dtype={dtype} device={device}\n"
                "If you are on the FAIR Cluster, you might need to use autocast_dtype=float16"
            )

    def __exit__(self, *args, **kwargs):
        if self.autocast is None:
            return
        self.autocast.__exit__(*args, **kwargs)
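A short usage sketch for `TorchAutocast`: the point of the wrapper is that `enabled=False` turns the whole block into a no-op, so call sites stay uniform across CPU and GPU.

import torch

use_amp = torch.cuda.is_available()
device = "cuda" if use_amp else "cpu"
model = torch.nn.Linear(16, 16).to(device)
x = torch.randn(8, 16, device=device)
with TorchAutocast(enabled=use_amp, device_type="cuda", dtype=torch.float16):
    y = model(x)   # runs in float16 on GPU, plain float32 otherwise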
moshi/utils/compile.py
ADDED
@@ -0,0 +1,284 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Provides some extra utilities around torch compile, in particular a way
to fully deactivate it easily with a context manager.
Provides a simple activation checkpointing that is compatible with FSDP and torch compile.
Finally, provides some utilities for CUDA graphing functions.
"""
from contextlib import contextmanager
from functools import wraps
import inspect
import os
import typing as tp

import torch
from torch import cuda


_compile_disabled: bool = False


@contextmanager
def no_compile():
    """Disable torch.compile locally. Now PyTorch 2.4 provides a function to do that."""
    global _compile_disabled

    prev_disabled = _compile_disabled
    _compile_disabled = True
    try:
        yield
    finally:
        _compile_disabled = prev_disabled


def torch_compile_lazy(fun):
    """torch.compile creates a huge pool of processes, even when not using the function at all,
    e.g. with Dora. This can pollute stderr when doing CTRL+C. So we do it in a lazy way.
    """
    if os.environ.get("NO_TORCH_COMPILE"):
        return fun
    fun_compiled = None

    @wraps(fun)
    def _wrapped(*args, **kwargs):
        nonlocal fun_compiled
        if _compile_disabled:
            return fun(*args, **kwargs)
        if fun_compiled is None:
            fun_compiled = torch.compile(fun)
        return fun_compiled(*args, **kwargs)

    return _wrapped


class Checkpoint(torch.autograd.Function):
    @staticmethod
    def forward(ctx, function, *args) -> tp.Any:
        to_save = []
        ctx.others = []
        ctx.function = function
        # Sources will indicate whether the arg in position N is
        # a tensor stored in ctx.save_for_backward, or inside ctx.others.
        ctx.sources = []
        new_args = []
        for arg in args:
            if isinstance(arg, torch.Tensor):
                to_save.append(arg)
                ctx.sources.append("tensor")
                new_args.append(arg.detach())
            else:
                ctx.sources.append("other")
                ctx.others.append(arg)
                new_args.append(arg)
        ctx.save_for_backward(*to_save)
        # During the forward, we just make a pass with no gradient computed.
        with torch.no_grad():
            res = function(*new_args)
        return res

    @staticmethod
    def backward(ctx, *grads) -> tp.Tuple[tp.Optional[torch.Tensor], ...]:
        pseudo_tensors = []
        with torch.set_grad_enabled(True):
            # We create leaf tensors to collect the output gradients.
            # We call them pseudo_tensors because they are pretending to be the input
            # to `function` but are not directly.
            for tensor in ctx.saved_tensors:
                pseudo_tensor = tensor.detach()
                pseudo_tensor.requires_grad_(True)
                pseudo_tensors.append(pseudo_tensor)
            pseudo_tensors_copy = list(pseudo_tensors)
            args = []
            for source in ctx.sources:
                if source == "other":
                    args.append(ctx.others.pop(0))
                else:
                    assert source == "tensor"
                    args.append(pseudo_tensors_copy.pop(0))
            res = ctx.function(*args)
            # The second forward with grad computation allows us to connect the input leaf tensors
            # inside pseudo_tensors, to the outputs of the function called.
            if not isinstance(res, tuple):
                res = (res,)
            # Now we just ask Torch to compute the derivative of `res` given the gradient coming from above
            # `grads`. The computed gradient will end up into the `pseudo_tensors` grad attributes.
            torch.autograd.backward(res, grads)
            out: tp.List[tp.Optional[torch.Tensor]] = [None]
            for source in ctx.sources:
                # We still need to output `None` values for non tensor parameters.
                if source == "other":
                    out.append(None)
                else:
                    assert source == "tensor"
                    out.append(pseudo_tensors.pop(0).grad)
            return tuple(out)


def simple_checkpoint(module: torch.nn.Module, *args, **kwargs):
    """Custom implementation of checkpointing in PyTorch as the builtin implementation is broken
    when using torch compile. Only supports wrapping a `nn.Module` with a forward with no `*args` or `**kwargs`.

    https://github.com/pytorch/pytorch/issues/97436.
    Should be resolved in nightlies, but it is quite fun and simple to code it ourselves.
    """
    if hasattr(module, "_fsdp_wrapped_module"):
        module_for_sig = module._fsdp_wrapped_module
    else:
        module_for_sig = module
    sig = inspect.signature(module_for_sig.forward)
    # We first flatten all arguments to use only *args, to make things easier and because
    # torch.autograd.Function has weird support for kwargs.
    bounded = sig.bind(*args, **kwargs)
    new_args = []
    for name, param in sig.parameters.items():
        if param.kind in {
            inspect.Parameter.VAR_POSITIONAL,
            inspect.Parameter.VAR_KEYWORD,
        }:
            raise RuntimeError("simple_checkpoint doesn't support var args.")
        if name not in bounded.arguments:
            break
        new_args.append(bounded.arguments[name])
    return Checkpoint.apply(module, *new_args)


_in_cuda_graph = False
_disable_cuda_graph = False


def in_cuda_graph() -> bool:
    """Indicate whether we are in a function that is CUDA Graphed (or will be soon)."""
    return _in_cuda_graph


@contextmanager
def _set_in_cuda_graph():
    global _in_cuda_graph
    assert not _in_cuda_graph
    _in_cuda_graph = True
    try:
        yield
    finally:
        _in_cuda_graph = False


def _is_cuda_graph_enabled() -> bool:
    if _disable_cuda_graph:
        return False
    no_cuda_graph = os.environ.get("NO_CUDA_GRAPH", "")
    if no_cuda_graph.lower() not in {"0", "no", "n", ""}:
        return False
    return True


@contextmanager
def no_cuda_graph():
    """Deactivate CUDA Graphing for all the calls in this context manager."""
    global _disable_cuda_graph
    old_value = _disable_cuda_graph
    _disable_cuda_graph = True
    try:
        yield
    finally:
        _disable_cuda_graph = old_value


class CUDAGraphed:
    """Allow simple CUDA Graphing of a function.

    Args:
        func: callable, taking any number of arguments. Its tensors arguments should
            be top level args, not nested in structures (tuples, dicts, etc). Keyword
            arguments are NOT supported for simplicity.
        warmup_steps: how many calls to make normally before CUDA Graphing. In particular, this
            allows torch.compiled functions to get properly compiled.
        disable: if True, just call the func directly, useful to quickly deactivate on CPU.
    """

    def __init__(self, func: tp.Callable, warmup_steps: int = 1, disable: bool = False):
        self.func = func
        self.warmup_steps = warmup_steps
        self.disable = disable
        self._graph: cuda.CUDAGraph | None = None
        self._output: tuple | None = None
        self._args: tuple | None = None

    def reset(self, warmup_steps: int = 0) -> None:
        """Reset the state, meaning the next call we get CUDA Graphed again. Useful if some
        shapes have changed, or external state (e.g. KVCache) has changed."""
        self.warmup_steps = warmup_steps
        self._graph = None
        self._output = None
        self._args = None

    def __call__(self, *args, **kwargs) -> tp.Any:
        if kwargs:
            raise RuntimeError("Named arguments not supported for now.")
        if self.disable or not _is_cuda_graph_enabled() or in_cuda_graph():
            return self.func(*args, **kwargs)

        def _clone_tensors(args: tuple) -> tuple:
            out: list = []
            for arg in args:
                if isinstance(arg, torch.Tensor):
                    arg = arg.clone()
                out.append(arg)
            return tuple(out)

        def _match_values_copy_tensors(args: tuple, target_args: tuple) -> None:
            if len(args) != len(target_args):
                raise ValueError(
                    f"Expected {len(target_args)}, but got {args} for CUDA Graphed function."
                )
            for idx, (source, target) in enumerate(zip(args, target_args)):
                if isinstance(target, torch.Tensor):
                    if not isinstance(source, torch.Tensor):
                        raise ValueError(
                            f"Argument #{idx} was a tensor, and is no longer (now {source})."
                        )
                    if source.shape != target.shape:
                        raise ValueError(
                            f"Argument #{idx} had shape {target.shape}, but got shape {source.shape}"
                        )
                    target.copy_(source)
                else:
                    if isinstance(source, torch.Tensor):
                        raise ValueError(
                            f"Argument #{idx} was not a tensor {target}, but is now one."
                        )
                    if source is not target and source != target:
                        raise ValueError(
                            f"Argument #{idx} changed value from {target} to {source}."
                        )

        with _set_in_cuda_graph():
            # Prevent anyone under us from trying to CUDA Graph things.
            if self._graph is None:
                if self.warmup_steps <= 0:
                    self._graph = cuda.CUDAGraph()
                    # Making a copy just to ensure those are not used elsewhere.
                    self._args = _clone_tensors(args)
                    with cuda.graph(self._graph):
                        self._output = self.func(*self._args)
                    # At this point nothing really happened, so we have to make it run for real.
                    self._graph.replay()
                    return self._output
                else:
                    self.warmup_steps -= 1
                    return self.func(*args)
            else:
                assert self._args is not None
                assert self._output is not None
                _match_values_copy_tensors(args, self._args)
                self._graph.replay()
                return self._output


def cuda_graph(func: tp.Callable, warmup_steps: int = 1):
    """Just calls `CUDAGraphed` on the given function."""
    if not _is_cuda_graph_enabled():
        return func
    return CUDAGraphed(func, warmup_steps)
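A sketch of how these pieces are meant to compose, assuming a CUDA device: compile lazily, warm up so the compiled kernels exist, then let `CUDAGraphed` capture and replay the step. The `decode_step` function below is a stand-in, not one from this repo.

import torch

@torch_compile_lazy
def decode_step(x: torch.Tensor) -> torch.Tensor:
    # Stand-in for a real per-frame decoding step.
    return torch.relu(x) + 1.0

if torch.cuda.is_available():
    step = CUDAGraphed(decode_step, warmup_steps=2)
    x = torch.randn(4, 8, device="cuda")
    for _ in range(3):     # two warmup calls, then capture + replay
        out = step(x)
    step.reset()           # e.g. after a shape or KV cache change
else:
    with no_compile():     # opt out of torch.compile entirely on CPU
        out = decode_step(torch.randn(4, 8))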
moshi/utils/sampling.py
ADDED
@@ -0,0 +1,126 @@
# Copyright (c) Kyutai, all rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.


import torch


def multinomial(
    input: torch.Tensor, num_samples: int, replacement=False, *, generator=None
):
    """torch.multinomial with arbitrary number of dimensions, and number of candidates on the last dimension.

    Args:
        input (torch.Tensor): The input tensor containing probabilities.
        num_samples (int): Number of samples to draw.
        replacement (bool): Whether to draw with replacement or not.
    Keyword args:
        generator (torch.Generator): A pseudorandom number generator for sampling.
    Returns:
        torch.Tensor: Last dimension contains num_samples indices
            sampled from the multinomial probability distribution
            located in the last dimension of tensor input.
    """
    input_ = input.reshape(-1, input.shape[-1])
    # We should probably be able to remove this once the following PR has landed:
    # https://github.com/pytorch/pytorch/pull/134818/files
    # In the meantime, we specialize the case no-replacement, nsamples=1 so as not
    # to have a synchronization point.
    if replacement or num_samples != 1:
        output_ = torch.multinomial(
            input_,
            num_samples=num_samples,
            replacement=replacement,
            generator=generator,
        )
    else:
        q = torch.empty_like(input_).exponential_(1, generator=generator)
        q = input_ / q
        output_ = q.argmax(dim=-1, keepdim=True)
    output = output_.reshape(*list(input.shape[:-1]), -1)
    return output


def sample_top_k(probs: torch.Tensor, k: int) -> torch.Tensor:
    """Sample next token from top K values along the last dimension of the input probs tensor.

    Args:
        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
        k (int): The k in “top-k”.
    Returns:
        torch.Tensor: Sampled tokens.
    """
    probs, indices = torch.topk(probs, k, dim=-1)
    next_token = multinomial(probs, num_samples=1)
    next_token = indices.gather(-1, next_token)
    return next_token


def sample_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
    """Sample next token from top P probabilities along the last dimension of the input probs tensor.

    Args:
        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
        p (float): The p in “top-p”.
    Returns:
        torch.Tensor: Sampled tokens.
    """
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort *= (~mask).float()
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = multinomial(probs_sort, num_samples=1)
    next_token = torch.gather(probs_idx, -1, next_token)
    return next_token


def sample_token(
    logits: torch.Tensor,
    use_sampling: bool = False,
    temp: float = 1.0,
    top_k: int = 0,
    top_p: float = 0.0,
) -> torch.Tensor:
    """Given logits of shape [*, Card], returns a LongTensor of shape [*]."""
    # Apply softmax for sampling if temp > 0. Else, do greedy sampling to avoid zero division error.
    if use_sampling and temp > 0.0:
        probs = torch.softmax(logits / temp, dim=-1)
        if top_p > 0.0:
            next_token = sample_top_p(probs, p=top_p)
        elif top_k > 0:
            next_token = sample_top_k(probs, k=top_k)
        else:
            next_token = multinomial(probs, num_samples=1)
    else:
        next_token = torch.argmax(logits, dim=-1, keepdim=True)
    assert next_token.shape[-1] == 1
    return next_token[..., 0]


if __name__ == "__main__":
    torch.manual_seed(1234)
    device = "cpu"
    if torch.cuda.is_available():
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cudnn.allow_tf32 = False
        device = "cuda:0"

    ps = torch.tensor([5.0, 2.0, 12.0, 6.0, 8.0, 1.0, 0.0, 4.0], device=device)
    cnts = torch.zeros(ps.shape, dtype=torch.long, device=device)
    total_samples = 1000
    for _ in range(total_samples):
        vs = multinomial(ps, num_samples=1, replacement=False)
        cnts[vs] += 1
    diff = cnts / cnts.sum() - ps / ps.sum()
    max_diff = diff.abs().max().cpu().item()
    print(ps / ps.sum())
    print(cnts / cnts.sum())
    assert max_diff < 1.5e-2
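A usage sketch for the samplers above; `sample_token` is the entry point and dispatches to top-p, top-k, or plain multinomial sampling depending on its arguments. The vocabulary size below is illustrative.

import torch

logits = torch.randn(2, 4, 32000)                      # [*, Card]
greedy = sample_token(logits)                          # argmax, shape [2, 4]
topk = sample_token(logits, use_sampling=True, temp=0.8, top_k=250)
topp = sample_token(logits, use_sampling=True, temp=1.0, top_p=0.95)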
pyproject.toml
ADDED
@@ -0,0 +1,39 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "moshi-hf"
version = "0.1.0"
description = "Moshi HuggingFace inference server"
requires-python = ">=3.8"
dependencies = [
    "torch==2.4.1",
    "torchaudio==2.4.1",
    "torchvision==0.19.1",
    "torchdata==0.10.0",
    "transformers==4.46.3",
    "huggingface-hub==0.27.1",
    "safetensors==0.5.1",
    "accelerate>=0.20.0",
    "datasets==3.1.0",
    "requests==2.32.3",
    "urllib3==2.2.3",
    "pyyaml==6.0.2",
    "einops>=0.6.1",
    "ulid-py==1.1.0",
    "tqdm>=4.65.0",
    "sentencepiece==0.2.0",
    "jiwer==3.0.5",
    "numpy==1.24.4",
    "pandas==2.0.3",
    "scikit-learn==1.5.2",
    "pydub==0.25.1",
    "librosa==0.10.2.post1",
    "pyannote.audio==3.1.1",
    "pesq==0.0.4",
    "torchmetrics==1.6.0",
    "uvicorn==0.25.0",
    "fastapi==0.104.1",
    "pydantic==2.5.2"
]
requirements.txt
ADDED
@@ -0,0 +1,43 @@
# PyTorch ecosystem with fixed versions
torch==2.4.1
torchaudio==2.4.1
torchvision==0.19.1
torchdata==0.10.0

# HuggingFace ecosystem
transformers==4.46.3
huggingface-hub==0.27.1
safetensors==0.5.1
accelerate>=0.20.0
datasets==3.1.0

# HTTP and utils
requests==2.32.3
urllib3==2.2.3
pyyaml==6.0.2
einops>=0.6.1
ulid-py==1.1.0
tqdm>=4.65.0

# NLP and text processing
sentencepiece==0.2.0
jiwer==3.0.5

# Data processing
numpy==1.24.4
pandas==2.0.3
scikit-learn==1.5.2

# Audio processing
pydub==0.25.1
librosa==0.10.2.post1
pyannote.audio==3.1.1
pesq==0.0.4

# Metrics and evaluation
torchmetrics==1.6.0

# Server
uvicorn==0.25.0
fastapi==0.104.1
pydantic==2.5.2
server.py
ADDED
@@ -0,0 +1,121 @@
1 |
+
from fastapi import FastAPI, HTTPException
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
from pydantic import BaseModel
|
5 |
+
import base64
|
6 |
+
import io
|
7 |
+
import os
|
8 |
+
import logging
|
9 |
+
from pathlib import Path
|
10 |
+
from inference import InferenceRecipe
|
11 |
+
from fastapi.middleware.cors import CORSMiddleware
|
12 |
+
|
13 |
+
logging.basicConfig(level=logging.INFO)
|
14 |
+
logger = logging.getLogger(__name__)
|
15 |
+
|
16 |
+
app = FastAPI()
|
17 |
+
|
18 |
+
# Add CORS middleware
|
19 |
+
app.add_middleware(
|
20 |
+
CORSMiddleware,
|
21 |
+
allow_origins=["*"],
|
22 |
+
allow_credentials=True,
|
23 |
+
allow_methods=["*"],
|
24 |
+
allow_headers=["*"],
|
25 |
+
)
|
26 |
+
|
27 |
+
class AudioRequest(BaseModel):
|
28 |
+
audio_data: str
|
29 |
+
sample_rate: int
|
30 |
+
|
31 |
+
class AudioResponse(BaseModel):
|
32 |
+
audio_data: str
|
33 |
+
text: str = ""
|
34 |
+
|
35 |
+
# Model initialization status
|
36 |
+
INITIALIZATION_STATUS = {
|
37 |
+
"model_loaded": False,
|
38 |
+
"error": None
|
39 |
+
}
|
40 |
+
|
41 |
+
# Global model instance
|
42 |
+
model = None
|
43 |
+
|
44 |
+
def initialize_model():
|
45 |
+
"""Initialize the model from mounted directory"""
|
46 |
+
global model, INITIALIZATION_STATUS
|
47 |
+
try:
|
48 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
49 |
+
logger.info(f"Initializing model on device: {device}")
|
50 |
+
|
51 |
+
model_path = os.getenv("MODEL_PATH", "/app/models")
|
52 |
+
if not os.path.exists(model_path):
|
53 |
+
raise RuntimeError(f"Model path {model_path} does not exist")
|
54 |
+
|
55 |
+
model = InferenceRecipe(model_path, device=device)
|
56 |
+
INITIALIZATION_STATUS["model_loaded"] = True
|
57 |
+
logger.info("Model initialized successfully")
|
58 |
+
return True
|
59 |
+
except Exception as e:
|
60 |
+
INITIALIZATION_STATUS["error"] = str(e)
|
61 |
+
logger.error(f"Failed to initialize model: {e}")
|
62 |
+
return False
|
63 |
+
|
64 |
+
@app.on_event("startup")
|
65 |
+
async def startup_event():
|
66 |
+
"""Initialize model on startup"""
|
67 |
+
initialize_model()
|
68 |
+
|
69 |
+
@app.get("/api/v1/health")
|
70 |
+
def health_check():
|
71 |
+
"""Health check endpoint"""
|
72 |
+
status = {
|
73 |
+
"status": "healthy" if INITIALIZATION_STATUS["model_loaded"] else "initializing",
|
74 |
+
"gpu_available": torch.cuda.is_available(),
|
75 |
+
"initialization_status": INITIALIZATION_STATUS
|
76 |
+
}
|
77 |
+
|
78 |
+
if model is not None:
|
79 |
+
status.update({
|
80 |
+
"device": str(model.device),
|
81 |
+
"model_path": str(model.model_path),
|
82 |
+
"mimi_loaded": model.mimi is not None,
|
83 |
+
"tokenizer_loaded": model.text_tokenizer is not None,
|
84 |
+
"lm_loaded": model.lm_gen is not None
|
85 |
+
})
|
86 |
+
|
87 |
+
return status
|
88 |
+
|
89 |
+
@app.post("/api/v1/inference")
|
90 |
+
async def inference(request: AudioRequest) -> AudioResponse:
|
91 |
+
"""Run inference on audio input"""
|
92 |
+
if not INITIALIZATION_STATUS["model_loaded"]:
|
93 |
+
raise HTTPException(
|
94 |
+
status_code=503,
|
95 |
+
detail=f"Model not ready. Status: {INITIALIZATION_STATUS}"
|
96 |
+
)
|
97 |
+
|
98 |
+
try:
|
99 |
+
# Decode audio from base64
|
100 |
+
audio_bytes = base64.b64decode(request.audio_data)
|
101 |
+
audio_array = np.load(io.BytesIO(audio_bytes))
|
102 |
+
|
103 |
+
# Run inference
|
104 |
+
result = model.inference(audio_array, request.sample_rate)
|
105 |
+
|
106 |
+
# Encode output audio
|
107 |
+
buffer = io.BytesIO()
|
108 |
+
np.save(buffer, result['audio'])
|
109 |
+
audio_b64 = base64.b64encode(buffer.getvalue()).decode()
|
110 |
+
|
111 |
+
return AudioResponse(
|
112 |
+
audio_data=audio_b64,
|
113 |
+
text=result.get("text", "")
|
114 |
+
)
|
115 |
+
except Exception as e:
|
116 |
+
logger.error(f"Inference failed: {str(e)}")
|
117 |
+
raise HTTPException(status_code=500, detail=str(e))
|
118 |
+
|
119 |
+
if __name__ == "__main__":
|
120 |
+
import uvicorn
|
121 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
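For reference, a minimal client sketch for the two endpoints above. It assumes a local deployment on port 8000; the 24 kHz sample rate and the one-second sine input are illustrative placeholders, not values mandated by the server:

# Sketch: call /api/v1/health, then round-trip audio through /api/v1/inference.
import base64
import io

import numpy as np
import requests

SERVER_URL = "http://localhost:8000"  # assumption: local deployment

# Check that the model finished loading before sending audio
health = requests.get(f"{SERVER_URL}/api/v1/health").json()
print("server status:", health["status"])

# Build a dummy mono waveform and serialize it the way the server
# expects: np.save bytes, base64-encoded into the JSON payload
sample_rate = 24000
t = np.linspace(0, 1.0, sample_rate, dtype=np.float32)
waveform = 0.1 * np.sin(2 * np.pi * 440.0 * t)

buffer = io.BytesIO()
np.save(buffer, waveform)
payload = {
    "audio_data": base64.b64encode(buffer.getvalue()).decode(),
    "sample_rate": sample_rate,
}

response = requests.post(f"{SERVER_URL}/api/v1/inference", json=payload)
response.raise_for_status()
body = response.json()

# Decode the returned audio back into a numpy array
audio_out = np.load(io.BytesIO(base64.b64decode(body["audio_data"])))
print("generated text:", body["text"])
print("output audio shape:", audio_out.shape)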
setup.py
ADDED
@@ -0,0 +1,78 @@
# The MIT License (MIT)
# Copyright © 2023 Yuma Rao
# Copyright © 2024 Omega Labs, Inc.

# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all copies or substantial portions of
# the Software.

# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

import re
import os
import codecs
from os import path
from io import open
from setuptools import setup, find_packages


def read_requirements(path):
    with open(path, "r") as f:
        requirements = f.read().splitlines()
    return requirements


requirements = read_requirements("requirements.txt")
here = path.abspath(path.dirname(__file__))

with open(path.join(here, "README.md"), encoding="utf-8") as f:
    long_description = f.read()

# Load the version string from template/__init__.py
with codecs.open(
    os.path.join(here, "template/__init__.py"), encoding="utf-8"
) as init_file:
    version_match = re.search(
        r"^__version__ = ['\"]([^'\"]*)['\"]", init_file.read(), re.M
    )
    version_string = version_match.group(1)

setup(
    name="omegalabs-anytoany-bittensor",
    version=version_string,
    description="omegalabs-anytoany-bittensor",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/omegalabsinc/omegalabs-anytoany-bittensor",
    author="OMEGA Labs, Inc.",
    packages=find_packages(),
    include_package_data=True,
    author_email="[email protected]",
    license="MIT",
    python_requires=">=3.8",
    install_requires=requirements,
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Topic :: Software Development :: Build Tools",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3 :: Only",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Topic :: Scientific/Engineering",
        "Topic :: Scientific/Engineering :: Mathematics",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Software Development",
        "Topic :: Software Development :: Libraries",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
)
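Note that the version lookup above requires template/__init__.py to define __version__ at the start of a line. A hypothetical example of that file's contents (the "0.1.0" value mirrors pyproject.toml and is otherwise an assumption):

# template/__init__.py (hypothetical contents): the regex in setup.py,
# r"^__version__ = ['\"]([^'\"]*)['\"]", matches exactly this form.
__version__ = "0.1.0"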