roychao19477 committed
Commit bd9ffb1 · 1 Parent(s): a66fd6a

Add application file

README.md CHANGED
@@ -1,13 +1,12 @@
 ---
-title: Avse Dev Only
-emoji: 🔥
-colorFrom: green
+title: Dev
+colorFrom: purple
 colorTo: indigo
 sdk: gradio
-sdk_version: 5.35.0
+sdk_version: 5.31.0
 app_file: app.py
 pinned: false
-short_description: 'avse_dev_only (Free HF user version : limited resolution)'
+short_description: Dev
+tags:
+- Useless
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,376 @@
import shlex
import subprocess
import spaces
import torch
import os
import shutil
import glob
import gradio as gr

# install packages for mamba
def install_mamba():
    subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.2/mamba_ssm-2.2.2+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))

def clone_github():
    subprocess.run([
        "git", "clone",
        f"https://RoyChao19477:{os.environ['GITHUB_TOKEN']}@github.com/RoyChao19477/for_HF_AVSEMamba.git",
    ])
    # move all files except README.md
    for item in glob.glob("for_HF_AVSEMamba/*"):
        if os.path.basename(item) != "README.md":
            if os.path.isdir(item):
                shutil.move(item, ".")
            else:
                shutil.move(item, os.path.join(".", os.path.basename(item)))

    #shutil.rmtree("tmp_repo")
    #subprocess.run(["ls"], check=True)

install_mamba()
clone_github()

ABOUT = """
# SEMamba: Speech Enhancement
A Mamba-based model that denoises real-world audio.
Upload or record a noisy clip and click **Enhance** to hear + see its spectrogram.
"""


import torch
import ffmpeg
import torchaudio
import torchaudio.transforms as T
import yaml
import librosa
import librosa.display
import matplotlib
import numpy as np
import soundfile as sf
import matplotlib.pyplot as plt
from models.stfts import mag_phase_stft, mag_phase_istft
from models.generator import SEMamba
from models.pcs400 import cal_pcs
from ultralytics import YOLO
import supervision as sv

import gradio as gr
import cv2
import os
import tempfile
from ultralytics import YOLO
from moviepy import ImageSequenceClip
from moviepy.video import fx as vfx
from scipy.io import wavfile
from avse_code import run_avse

# Load face detector
model = YOLO("yolov8n-face.pt").cuda()  # assumes CUDA available


from decord import VideoReader, cpu
from model import AVSEModule
from config import sampling_rate
import spaces

# Load model once globally
#ckpt_path = "ckpts/ep215_0906.oat.ckpt"
#model = AVSEModule.load_from_checkpoint(ckpt_path)
avse_model = AVSEModule()
#avse_state_dict = torch.load("ckpts/ep215_0906.oat.ckpt")
avse_state_dict = torch.load("ckpts/ep220_0908.oat.ckpt")
avse_model.load_state_dict(avse_state_dict, strict=True)
avse_model.to("cuda")
avse_model.eval()

@spaces.GPU
def run_avse_inference(video_path, audio_path):
    estimated = run_avse(video_path, audio_path)
    # Load audio
    #noisy, _ = sf.read(audio_path, dtype='float32')  # (N, )
    #noisy = torch.tensor(noisy).unsqueeze(0)  # (1, N)
    noisy = wavfile.read(audio_path)[1].astype(np.float32) / (2 ** 15)

    # Norm.
    #noisy = noisy * (0.8 / np.max(np.abs(noisy)))

    # Load grayscale video
    vr = VideoReader(video_path, ctx=cpu(0))
    frames = vr.get_batch(list(range(len(vr)))).asnumpy()
    bg_frames = np.array([
        cv2.cvtColor(frames[i], cv2.COLOR_RGB2GRAY) for i in range(len(frames))
    ]).astype(np.float32)
    bg_frames /= 255.0


    # Combine into input dict (match what model.enhance expects)
    data = {
        "noisy_audio": noisy,
        "video_frames": bg_frames[np.newaxis, ...]
    }

    with torch.no_grad():
        estimated = avse_model.enhance(data).reshape(-1)

    # Save result
    tmp_wav = audio_path.replace(".wav", "_enhanced.wav")
    sf.write(tmp_wav, estimated, samplerate=sampling_rate)

    return tmp_wav


def extract_resampled_audio(video_path, target_sr=16000):
    # Step 1: extract audio with ffmpeg
    # (written to a temporary 44.1 kHz wav file first)
    tmp_audio_path = tempfile.mktemp(suffix=".wav")
    subprocess.run(["ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "pcm_s16le", "-ar", "44100", tmp_audio_path])

    # Step 2: Load and resample
    waveform, sr = torchaudio.load(tmp_audio_path)
    if sr != target_sr:
        resampler = T.Resample(orig_freq=sr, new_freq=target_sr)
        waveform = resampler(waveform)

    # Step 3: Save resampled audio
    resampled_audio_path = tempfile.mktemp(suffix="_16k.wav")
    torchaudio.save(resampled_audio_path, waveform, sample_rate=target_sr)
    return resampled_audio_path

@spaces.GPU
def extract_faces(video_file):
    # Step 0: Check resolution
    cap = cv2.VideoCapture(video_file)
    width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    cap.release()

    # Step 1: Downsample if needed
    if width > 1280 or height > 720:
        resized_path = tempfile.mktemp(suffix=".mp4")
        subprocess.run([
            "ffmpeg", "-y", "-i", video_file,
            "-vf", "scale='min(1280,iw)':-2",
            "-c:v", "libx264", "-crf", "28",
            "-preset", "fast", "-an", resized_path
        ])
        video_file = resized_path

    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frames = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Inference
        results = model(frame, verbose=False)[0]
        for box in results.boxes:
            # version 1
            # x1, y1, x2, y2 = map(int, box.xyxy[0])

            # version 2
            h, w, _ = frame.shape
            x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
            pad_ratio = 0.5  # 50% padding around the detected box

            dx = (x2 - x1) * pad_ratio
            dy = (y2 - y1) * pad_ratio

            x1 = int(max(0, x1 - dx))
            y1 = int(max(0, y1 - dy))
            x2 = int(min(w, x2 + dx))
            y2 = int(min(h, y2 + dy))
            # Added for v3
            shift_down = int(0.1 * (y2 - y1))
            y1 = int(min(max(0, y1 + shift_down), h))
            y2 = int(min(max(0, y2 + shift_down), h))
            face_crop = frame[y1:y2, x1:x2]
            if face_crop.size != 0:
                resized = cv2.resize(face_crop, (224, 224))
                frames.append(resized)

                #h_crop, w_crop = face_crop.shape[:2]
                #side = min(h_crop, w_crop)
                #start_y = (h_crop - side) // 2
                #start_x = (w_crop - side) // 2
                #square_crop = face_crop[start_y:start_y+side, start_x:start_x+side]
                #resized = cv2.resize(square_crop, (224, 224))
                #frames.append(resized)

            break  # only one face per frame

    cap.release()

    # Save as video
    tmpdir = tempfile.mkdtemp()
    output_path = os.path.join(tmpdir, "face_only_video.mp4")
    #clip = ImageSequenceClip([cv2.cvtColor(f, cv2.COLOR_BGR2RGB) for f in frames], fps=25)
    #clip = ImageSequenceClip([cv2.cvtColor(f, cv2.COLOR_BGR2RGB) for f in frames], fps=fps)
    clip = ImageSequenceClip(
        [cv2.cvtColor(cv2.resize(f, (224, 224)), cv2.COLOR_BGR2RGB) for f in frames],
        fps=fps
    ).fx(vfx.flip_vertical)
    clip.write_videofile(output_path, codec="libx264", audio=False, fps=25)

    # Save audio from original, resampled to 16kHz
    audio_path = os.path.join(tmpdir, "audio_16k.wav")

    # Extract audio using ffmpeg-python (more robust than moviepy)
    ffmpeg.input(video_file).output(
        audio_path,
        ar=16000,        # resample to 16k
        ac=1,            # mono
        format='wav',
        vn=None          # no video
    ).run(overwrite_output=True)


    # ------------------------------- #
    # AVSE models

    enhanced_audio_path = run_avse_inference(output_path, audio_path)

    return output_path, enhanced_audio_path
    #return output_path, audio_path

iface = gr.Interface(
    fn=extract_faces,
    inputs=gr.Video(label="Upload or record your video"),
    outputs=[
        gr.Video(label="Detected Face Only Video"),
        #gr.Audio(label="Extracted Audio (16kHz)", type="filepath"),
        gr.Audio(label="Enhanced Audio", type="filepath")
    ],
    title="Face Detector",
    description="Upload or record a video. We'll crop face regions and return a face-only video and its 16kHz audio."
)

iface.launch()

# NOTE: iface.launch() blocks while the Space is serving, so the SEMamba
# audio-only demo below is effectively unreachable in this app.

ckpt = "ckpts/SEMamba_advanced.pth"
cfg_f = "recipes/SEMamba_advanced.yaml"

# load config
with open(cfg_f, 'r') as f:
    cfg = yaml.safe_load(f)


# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cuda"
model = SEMamba(cfg).to(device)
#sdict = torch.load(ckpt, map_location=device)
#model.load_state_dict(sdict["generator"])
#model.eval()

@spaces.GPU
def enhance(filepath, model_name):
    # Load model based on selection
    ckpt_path = {
        "VCTK-Demand": "ckpts/SEMamba_advanced.pth",
        "VCTK+DNS": "ckpts/vd.pth"
    }[model_name]

    print("Loading:", ckpt_path)
    model.load_state_dict(torch.load(ckpt_path, map_location=device)["generator"])
    model.eval()
    with torch.no_grad():
        # load & resample
        wav, orig_sr = librosa.load(filepath, sr=None)
        noisy_wav = wav.copy()
        if orig_sr != 16000:
            wav = librosa.resample(wav, orig_sr=orig_sr, target_sr=16000)
        x = torch.from_numpy(wav).float().to(device)
        norm = torch.sqrt(len(x) / torch.sum(x ** 2))
        #x = (x * norm).unsqueeze(0)
        x = (x * norm)

        # split into 4s segments (64000 samples)
        segment_len = 4 * 16000
        chunks = x.split(segment_len)
        enhanced_chunks = []

        for chunk in chunks:
            if len(chunk) < segment_len:
                #pad = torch.zeros(segment_len - len(chunk), device=chunk.device)
                pad = (torch.randn(segment_len - len(chunk), device=chunk.device) * 1e-4)
                chunk = torch.cat([chunk, pad])
            chunk = chunk.unsqueeze(0)

            amp, pha, _ = mag_phase_stft(chunk, 400, 100, 400, 0.3)
            amp2, pha2, _ = model(amp, pha)
            out = mag_phase_istft(amp2, pha2, 400, 100, 400, 0.3)
            out = (out / norm).squeeze(0)
            enhanced_chunks.append(out)

        out = torch.cat(enhanced_chunks)[:len(x)].cpu().numpy()  # trim padding

        # back to original rate
        if orig_sr != 16000:
            out = librosa.resample(out, orig_sr=16000, target_sr=orig_sr)

        # Normalize
        peak = np.max(np.abs(out))
        if peak > 0.05:
            out = out / peak * 0.85

        # write file
        sf.write("enhanced.wav", out, orig_sr)

        # spectrograms
        fig, axs = plt.subplots(1, 2, figsize=(16, 4))

        # noisy
        D_noisy = librosa.stft(noisy_wav, n_fft=512, hop_length=256)
        S_noisy = librosa.amplitude_to_db(np.abs(D_noisy), ref=np.max)
        librosa.display.specshow(S_noisy, sr=orig_sr, hop_length=256, x_axis="time", y_axis="hz", ax=axs[0], vmax=0)
        axs[0].set_title("Noisy Spectrogram")

        # enhanced
        D_clean = librosa.stft(out, n_fft=512, hop_length=256)
        S_clean = librosa.amplitude_to_db(np.abs(D_clean), ref=np.max)
        librosa.display.specshow(S_clean, sr=orig_sr, hop_length=256, x_axis="time", y_axis="hz", ax=axs[1], vmax=0)
        #librosa.display.specshow(S_clean, sr=16000, hop_length=512, x_axis="time", y_axis="hz", ax=axs[1], vmax=0)
        axs[1].set_title("Enhanced Spectrogram")

        plt.tight_layout()

    return "enhanced.wav", fig

#with gr.Blocks() as demo:
#    gr.Markdown(ABOUT)
#    input_audio = gr.Audio(label="Input Audio", type="filepath", interactive=True)
#    enhance_btn = gr.Button("Enhance")
#    output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
#    plot_output = gr.Plot(label="Spectrograms")
#
#    enhance_btn.click(fn=enhance, inputs=input_audio, outputs=[output_audio, plot_output])
#
#demo.queue().launch()

with gr.Blocks() as demo:
    gr.Markdown(ABOUT)
    input_audio = gr.Audio(label="Input Audio", type="filepath", interactive=True)
    model_choice = gr.Radio(
        label="Choose Model (The use of VCTK+DNS is recommended)",
        choices=["VCTK-Demand", "VCTK+DNS"],
        value="VCTK-Demand"
    )
    enhance_btn = gr.Button("Enhance")
    output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
    plot_output = gr.Plot(label="Spectrograms")

    enhance_btn.click(
        fn=enhance,
        inputs=[input_audio, model_choice],
        outputs=[output_audio, plot_output]
    )
    gr.Markdown("**Note**: The current models are trained on 16kHz audio. Therefore, any input audio not sampled at 16kHz will be automatically resampled before enhancement.")

demo.queue().launch()
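For reference, the segment-wise scheme that `enhance()` uses (split the waveform into 4-second windows, pad the last window with very low-level noise, process each window, then trim back to the input length) can be isolated into a small helper. This is only a sketch: `enhance_in_chunks` and the identity `enhance_fn` below are illustrative stand-ins, not part of the repository.

import torch

def enhance_in_chunks(x: torch.Tensor, enhance_fn, segment_len: int = 4 * 16000) -> torch.Tensor:
    """Process a 1-D waveform in fixed-size segments and trim back to the input length."""
    outputs = []
    for chunk in x.split(segment_len):
        if len(chunk) < segment_len:
            # Pad the tail with very low-level noise, as app.py does,
            # so the model never sees an exact-zero segment.
            pad = torch.randn(segment_len - len(chunk)) * 1e-4
            chunk = torch.cat([chunk, pad])
        outputs.append(enhance_fn(chunk.unsqueeze(0)).squeeze(0))
    return torch.cat(outputs)[: len(x)]  # drop the padded samples

# Example with an identity "enhancer" standing in for the SEMamba STFT/ISTFT pipeline.
noisy = torch.randn(5 * 16000)            # 5 s at 16 kHz
enhanced = enhance_in_chunks(noisy, lambda c: c)
assert enhanced.shape == noisy.shape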
mamba_ssm/.DS_Store ADDED
Binary file (6.15 kB)
 
mamba_ssm/__init__.py ADDED
@@ -0,0 +1,5 @@
__version__ = "1.2.2"

from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
from mamba_ssm.modules.mamba_simple import Mamba
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
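The package re-exports the pieces used by the rest of this Space: the scan functions, the `Mamba` block, and `MambaLMHeadModel`. A minimal smoke test of the `Mamba` block might look like the sketch below; it assumes a CUDA device and the compiled `mamba_ssm`/`causal_conv1d` wheels installed by `install_mamba()` in app.py, and the sizes are arbitrary.

import torch
from mamba_ssm import Mamba

# The fused selective-scan kernels are CUDA-only, so a GPU is required.
block = Mamba(d_model=256, d_state=16, d_conv=4, expand=2).to("cuda")

x = torch.randn(2, 1000, 256, device="cuda")   # (batch, seq_len, d_model)
y = block(x)                                   # same shape as the input
print(y.shape)                                 # torch.Size([2, 1000, 256])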
mamba_ssm/models/__init__.py ADDED
File without changes
mamba_ssm/models/config_mamba.py ADDED
@@ -0,0 +1,15 @@
from dataclasses import dataclass, field


@dataclass
class MambaConfig:

    d_model: int = 2560
    n_layer: int = 64
    vocab_size: int = 50277
    ssm_cfg: dict = field(default_factory=dict)
    rms_norm: bool = True
    residual_in_fp32: bool = True
    fused_add_norm: bool = True
    pad_vocab_size_multiple: int = 8
    tie_embeddings: bool = True
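The defaults above describe the large configuration. One detail worth noting: `MambaLMHeadModel` pads `vocab_size` up to a multiple of `pad_vocab_size_multiple`, so the default 50277 becomes 50280. A tiny sketch of that arithmetic (the variable names are illustrative):

from mamba_ssm.models.config_mamba import MambaConfig

cfg = MambaConfig()                                   # d_model=2560, n_layer=64, vocab_size=50277
pad = cfg.pad_vocab_size_multiple
padded_vocab = cfg.vocab_size + (-cfg.vocab_size) % pad
print(cfg.vocab_size, "->", padded_vocab)             # 50277 -> 50280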
mamba_ssm/models/mixer_seq_simple.py ADDED
@@ -0,0 +1,264 @@
1
+ # Copyright (c) 2023, Albert Gu, Tri Dao.
2
+
3
+ import math
4
+ from functools import partial
5
+ import json
6
+ import os
7
+
8
+ from collections import namedtuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+
13
+ from mamba_ssm.models.config_mamba import MambaConfig
14
+ from mamba_ssm.modules.mamba_simple import Mamba, Block
15
+ from mamba_ssm.utils.generation import GenerationMixin
16
+ from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf
17
+
18
+ try:
19
+ from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn
20
+ except ImportError:
21
+ RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
22
+
23
+
24
+ def create_block(
25
+ d_model,
26
+ ssm_cfg=None,
27
+ norm_epsilon=1e-5,
28
+ rms_norm=False,
29
+ residual_in_fp32=False,
30
+ fused_add_norm=False,
31
+ layer_idx=None,
32
+ device=None,
33
+ dtype=None,
34
+ ):
35
+ if ssm_cfg is None:
36
+ ssm_cfg = {}
37
+ factory_kwargs = {"device": device, "dtype": dtype}
38
+ mixer_cls = partial(Mamba, layer_idx=layer_idx, **ssm_cfg, **factory_kwargs)
39
+ norm_cls = partial(
40
+ nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
41
+ )
42
+ block = Block(
43
+ d_model,
44
+ mixer_cls,
45
+ norm_cls=norm_cls,
46
+ fused_add_norm=fused_add_norm,
47
+ residual_in_fp32=residual_in_fp32,
48
+ )
49
+ block.layer_idx = layer_idx
50
+ return block
51
+
52
+
53
+ # https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
54
+ def _init_weights(
55
+ module,
56
+ n_layer,
57
+ initializer_range=0.02, # Now only used for embedding layer.
58
+ rescale_prenorm_residual=True,
59
+ n_residuals_per_layer=1, # Change to 2 if we have MLP
60
+ ):
61
+ if isinstance(module, nn.Linear):
62
+ if module.bias is not None:
63
+ if not getattr(module.bias, "_no_reinit", False):
64
+ nn.init.zeros_(module.bias)
65
+ elif isinstance(module, nn.Embedding):
66
+ nn.init.normal_(module.weight, std=initializer_range)
67
+
68
+ if rescale_prenorm_residual:
69
+ # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
70
+ # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
71
+ # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
72
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
73
+ #
74
+ # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
75
+ for name, p in module.named_parameters():
76
+ if name in ["out_proj.weight", "fc2.weight"]:
77
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
78
+ # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
79
+ # We need to reinit p since this code could be called multiple times
80
+ # Having just p *= scale would repeatedly scale it down
81
+ nn.init.kaiming_uniform_(p, a=math.sqrt(5))
82
+ with torch.no_grad():
83
+ p /= math.sqrt(n_residuals_per_layer * n_layer)
84
+
85
+
86
+ class MixerModel(nn.Module):
87
+ def __init__(
88
+ self,
89
+ d_model: int,
90
+ n_layer: int,
91
+ vocab_size: int,
92
+ ssm_cfg=None,
93
+ norm_epsilon: float = 1e-5,
94
+ rms_norm: bool = False,
95
+ initializer_cfg=None,
96
+ fused_add_norm=False,
97
+ residual_in_fp32=False,
98
+ device=None,
99
+ dtype=None,
100
+ ) -> None:
101
+ factory_kwargs = {"device": device, "dtype": dtype}
102
+ super().__init__()
103
+ self.residual_in_fp32 = residual_in_fp32
104
+
105
+ self.embedding = nn.Embedding(vocab_size, d_model, **factory_kwargs)
106
+
107
+ # We change the order of residual and layer norm:
108
+ # Instead of LN -> Attn / MLP -> Add, we do:
109
+ # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
110
+ # the main branch (output of MLP / Mixer). The model definition is unchanged.
111
+ # This is for performance reason: we can fuse add + layer_norm.
112
+ self.fused_add_norm = fused_add_norm
113
+ if self.fused_add_norm:
114
+ if layer_norm_fn is None or rms_norm_fn is None:
115
+ raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")
116
+
117
+ self.layers = nn.ModuleList(
118
+ [
119
+ create_block(
120
+ d_model,
121
+ ssm_cfg=ssm_cfg,
122
+ norm_epsilon=norm_epsilon,
123
+ rms_norm=rms_norm,
124
+ residual_in_fp32=residual_in_fp32,
125
+ fused_add_norm=fused_add_norm,
126
+ layer_idx=i,
127
+ **factory_kwargs,
128
+ )
129
+ for i in range(n_layer)
130
+ ]
131
+ )
132
+
133
+ self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(
134
+ d_model, eps=norm_epsilon, **factory_kwargs
135
+ )
136
+
137
+ self.apply(
138
+ partial(
139
+ _init_weights,
140
+ n_layer=n_layer,
141
+ **(initializer_cfg if initializer_cfg is not None else {}),
142
+ )
143
+ )
144
+
145
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
146
+ return {
147
+ i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
148
+ for i, layer in enumerate(self.layers)
149
+ }
150
+
151
+ def forward(self, input_ids, inference_params=None):
152
+ hidden_states = self.embedding(input_ids)
153
+ residual = None
154
+ for layer in self.layers:
155
+ hidden_states, residual = layer(
156
+ hidden_states, residual, inference_params=inference_params
157
+ )
158
+ if not self.fused_add_norm:
159
+ residual = (hidden_states + residual) if residual is not None else hidden_states
160
+ hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
161
+ else:
162
+ # Set prenorm=False here since we don't need the residual
163
+ fused_add_norm_fn = rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn
164
+ hidden_states = fused_add_norm_fn(
165
+ hidden_states,
166
+ self.norm_f.weight,
167
+ self.norm_f.bias,
168
+ eps=self.norm_f.eps,
169
+ residual=residual,
170
+ prenorm=False,
171
+ residual_in_fp32=self.residual_in_fp32,
172
+ )
173
+ return hidden_states
174
+
175
+
176
+ class MambaLMHeadModel(nn.Module, GenerationMixin):
177
+
178
+ def __init__(
179
+ self,
180
+ config: MambaConfig,
181
+ initializer_cfg=None,
182
+ device=None,
183
+ dtype=None,
184
+ ) -> None:
185
+ self.config = config
186
+ d_model = config.d_model
187
+ n_layer = config.n_layer
188
+ vocab_size = config.vocab_size
189
+ ssm_cfg = config.ssm_cfg
190
+ rms_norm = config.rms_norm
191
+ residual_in_fp32 = config.residual_in_fp32
192
+ fused_add_norm = config.fused_add_norm
193
+ pad_vocab_size_multiple = config.pad_vocab_size_multiple
194
+ factory_kwargs = {"device": device, "dtype": dtype}
195
+
196
+ super().__init__()
197
+ if vocab_size % pad_vocab_size_multiple != 0:
198
+ vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple)
199
+ self.backbone = MixerModel(
200
+ d_model=d_model,
201
+ n_layer=n_layer,
202
+ vocab_size=vocab_size,
203
+ ssm_cfg=ssm_cfg,
204
+ rms_norm=rms_norm,
205
+ initializer_cfg=initializer_cfg,
206
+ fused_add_norm=fused_add_norm,
207
+ residual_in_fp32=residual_in_fp32,
208
+ **factory_kwargs,
209
+ )
210
+ self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)
211
+
212
+ # Initialize weights and apply final processing
213
+ self.apply(
214
+ partial(
215
+ _init_weights,
216
+ n_layer=n_layer,
217
+ **(initializer_cfg if initializer_cfg is not None else {}),
218
+ )
219
+ )
220
+ self.tie_weights()
221
+
222
+ def tie_weights(self):
223
+ if self.config.tie_embeddings:
224
+ self.lm_head.weight = self.backbone.embedding.weight
225
+
226
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
227
+ return self.backbone.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
228
+
229
+ def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0):
230
+ """
231
+ "position_ids" is just to be compatible with Transformer generation. We don't use it.
232
+ num_last_tokens: if > 0, only return the logits for the last n tokens
233
+ """
234
+ hidden_states = self.backbone(input_ids, inference_params=inference_params)
235
+ if num_last_tokens > 0:
236
+ hidden_states = hidden_states[:, -num_last_tokens:]
237
+ lm_logits = self.lm_head(hidden_states)
238
+ CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
239
+ return CausalLMOutput(logits=lm_logits)
240
+
241
+ @classmethod
242
+ def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
243
+ config_data = load_config_hf(pretrained_model_name)
244
+ config = MambaConfig(**config_data)
245
+ model = cls(config, device=device, dtype=dtype, **kwargs)
246
+ model.load_state_dict(load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype))
247
+ return model
248
+
249
+ def save_pretrained(self, save_directory):
250
+ """
251
+ Minimal implementation of save_pretrained for MambaLMHeadModel.
252
+ Save the model and its configuration file to a directory.
253
+ """
254
+ # Ensure save_directory exists
255
+ os.makedirs(save_directory, exist_ok=True)
256
+
257
+ # Save the model's state_dict
258
+ model_path = os.path.join(save_directory, 'pytorch_model.bin')
259
+ torch.save(self.state_dict(), model_path)
260
+
261
+ # Save the configuration of the model
262
+ config_path = os.path.join(save_directory, 'config.json')
263
+ with open(config_path, 'w') as f:
264
+ json.dump(self.config.__dict__, f)
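Putting `MambaConfig` and `MambaLMHeadModel` together, a reduced-size language model can be instantiated and run on token ids; the result is a `CausalLMOutput` namedtuple whose `.logits` has shape (batch, seq_len, padded_vocab_size). The sketch below uses arbitrary small dimensions and assumes a CUDA device with the fused kernels installed.

import torch
from mamba_ssm.models.config_mamba import MambaConfig
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel

cfg = MambaConfig(d_model=256, n_layer=4, vocab_size=1000)    # small, arbitrary sizes
lm = MambaLMHeadModel(cfg, device="cuda")

input_ids = torch.randint(0, 1000, (2, 64), device="cuda")    # (batch, seq_len)
out = lm(input_ids)                                           # CausalLMOutput namedtuple
print(out.logits.shape)                                       # torch.Size([2, 64, 1000])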
mamba_ssm/modules/__init__.py ADDED
File without changes
mamba_ssm/modules/mamba_simple.py ADDED
@@ -0,0 +1,353 @@
1
+ # Copyright (c) 2023, Tri Dao, Albert Gu.
2
+
3
+ import math
4
+ from typing import Optional
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from torch import Tensor
10
+
11
+ from einops import rearrange, repeat
12
+
13
+ from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
14
+
15
+ try:
16
+ from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
17
+ except ImportError:
18
+ causal_conv1d_fn, causal_conv1d_update = None, None
19
+
20
+ try:
21
+ from mamba_ssm.ops.triton.selective_state_update import selective_state_update
22
+ except ImportError:
23
+ selective_state_update = None
24
+
25
+ try:
26
+ from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn
27
+ except ImportError:
28
+ RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
29
+
30
+
31
+ class Mamba(nn.Module):
32
+ def __init__(
33
+ self,
34
+ d_model,
35
+ d_state=16,
36
+ d_conv=4,
37
+ expand=2,
38
+ dt_rank="auto",
39
+ dt_min=0.001,
40
+ dt_max=0.1,
41
+ dt_init="random",
42
+ dt_scale=1.0,
43
+ dt_init_floor=1e-4,
44
+ conv_bias=True,
45
+ bias=False,
46
+ use_fast_path=True, # Fused kernel options
47
+ layer_idx=None,
48
+ device=None,
49
+ dtype=None,
50
+ ):
51
+ factory_kwargs = {"device": device, "dtype": dtype}
52
+ super().__init__()
53
+ self.d_model = d_model
54
+ self.d_state = d_state
55
+ self.d_conv = d_conv
56
+ self.expand = expand
57
+ self.d_inner = int(self.expand * self.d_model)
58
+ self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank
59
+ self.use_fast_path = use_fast_path
60
+ self.layer_idx = layer_idx
61
+
62
+ self.in_proj = nn.Linear(self.d_model, self.d_inner * 2, bias=bias, **factory_kwargs)
63
+
64
+ self.conv1d = nn.Conv1d(
65
+ in_channels=self.d_inner,
66
+ out_channels=self.d_inner,
67
+ bias=conv_bias,
68
+ kernel_size=d_conv,
69
+ groups=self.d_inner,
70
+ padding=d_conv - 1,
71
+ **factory_kwargs,
72
+ )
73
+
74
+ self.activation = "silu"
75
+ self.act = nn.SiLU()
76
+
77
+ self.x_proj = nn.Linear(
78
+ self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs
79
+ )
80
+ self.dt_proj = nn.Linear(self.dt_rank, self.d_inner, bias=True, **factory_kwargs)
81
+
82
+ # Initialize special dt projection to preserve variance at initialization
83
+ dt_init_std = self.dt_rank**-0.5 * dt_scale
84
+ if dt_init == "constant":
85
+ nn.init.constant_(self.dt_proj.weight, dt_init_std)
86
+ elif dt_init == "random":
87
+ nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std)
88
+ else:
89
+ raise NotImplementedError
90
+
91
+ # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
92
+ dt = torch.exp(
93
+ torch.rand(self.d_inner, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
94
+ + math.log(dt_min)
95
+ ).clamp(min=dt_init_floor)
96
+ # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
97
+ inv_dt = dt + torch.log(-torch.expm1(-dt))
98
+ with torch.no_grad():
99
+ self.dt_proj.bias.copy_(inv_dt)
100
+ # Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit
101
+ self.dt_proj.bias._no_reinit = True
102
+
103
+ # S4D real initialization
104
+ A = repeat(
105
+ torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
106
+ "n -> d n",
107
+ d=self.d_inner,
108
+ ).contiguous()
109
+ A_log = torch.log(A) # Keep A_log in fp32
110
+ self.A_log = nn.Parameter(A_log)
111
+ self.A_log._no_weight_decay = True
112
+
113
+ # D "skip" parameter
114
+ self.D = nn.Parameter(torch.ones(self.d_inner, device=device)) # Keep in fp32
115
+ self.D._no_weight_decay = True
116
+
117
+ self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)
118
+
119
+ def forward(self, hidden_states, inference_params=None):
120
+ """
121
+ hidden_states: (B, L, D)
122
+ Returns: same shape as hidden_states
123
+ """
124
+ batch, seqlen, dim = hidden_states.shape
125
+
126
+ conv_state, ssm_state = None, None
127
+ if inference_params is not None:
128
+ conv_state, ssm_state = self._get_states_from_cache(inference_params, batch)
129
+ if inference_params.seqlen_offset > 0:
130
+ # The states are updated inplace
131
+ out, _, _ = self.step(hidden_states, conv_state, ssm_state)
132
+ return out
133
+
134
+ # We do matmul and transpose BLH -> HBL at the same time
135
+ xz = rearrange(
136
+ self.in_proj.weight @ rearrange(hidden_states, "b l d -> d (b l)"),
137
+ "d (b l) -> b d l",
138
+ l=seqlen,
139
+ )
140
+ if self.in_proj.bias is not None:
141
+ xz = xz + rearrange(self.in_proj.bias.to(dtype=xz.dtype), "d -> d 1")
142
+
143
+ A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
144
+ # In the backward pass we write dx and dz next to each other to avoid torch.cat
145
+ if self.use_fast_path and causal_conv1d_fn is not None and inference_params is None: # Doesn't support outputting the states
146
+ out = mamba_inner_fn(
147
+ xz,
148
+ self.conv1d.weight,
149
+ self.conv1d.bias,
150
+ self.x_proj.weight,
151
+ self.dt_proj.weight,
152
+ self.out_proj.weight,
153
+ self.out_proj.bias,
154
+ A,
155
+ None, # input-dependent B
156
+ None, # input-dependent C
157
+ self.D.float(),
158
+ delta_bias=self.dt_proj.bias.float(),
159
+ delta_softplus=True,
160
+ )
161
+ else:
162
+ x, z = xz.chunk(2, dim=1)
163
+ # Compute short convolution
164
+ if conv_state is not None:
165
+ # If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
166
+ # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
167
+ conv_state.copy_(F.pad(x, (self.d_conv - x.shape[-1], 0))) # Update state (B D W)
168
+ if causal_conv1d_fn is None:
169
+ x = self.act(self.conv1d(x)[..., :seqlen])
170
+ else:
171
+ assert self.activation in ["silu", "swish"]
172
+ x = causal_conv1d_fn(
173
+ x=x,
174
+ weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
175
+ bias=self.conv1d.bias,
176
+ activation=self.activation,
177
+ )
178
+
179
+ # We're careful here about the layout, to avoid extra transposes.
180
+ # We want dt to have d as the slowest moving dimension
181
+ # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
182
+ x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d")) # (bl d)
183
+ dt, B, C = torch.split(x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1)
184
+ dt = self.dt_proj.weight @ dt.t()
185
+ dt = rearrange(dt, "d (b l) -> b d l", l=seqlen)
186
+ B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
187
+ C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
188
+ assert self.activation in ["silu", "swish"]
189
+ y = selective_scan_fn(
190
+ x,
191
+ dt,
192
+ A,
193
+ B,
194
+ C,
195
+ self.D.float(),
196
+ z=z,
197
+ delta_bias=self.dt_proj.bias.float(),
198
+ delta_softplus=True,
199
+ return_last_state=ssm_state is not None,
200
+ )
201
+ if ssm_state is not None:
202
+ y, last_state = y
203
+ ssm_state.copy_(last_state)
204
+ y = rearrange(y, "b d l -> b l d")
205
+ out = self.out_proj(y)
206
+ return out
207
+
208
+ def step(self, hidden_states, conv_state, ssm_state):
209
+ dtype = hidden_states.dtype
210
+ assert hidden_states.shape[1] == 1, "Only support decoding with 1 token at a time for now"
211
+ xz = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
212
+ x, z = xz.chunk(2, dim=-1) # (B D)
213
+
214
+ # Conv step
215
+ if causal_conv1d_update is None:
216
+ conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W)
217
+ conv_state[:, :, -1] = x
218
+ x = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D)
219
+ if self.conv1d.bias is not None:
220
+ x = x + self.conv1d.bias
221
+ x = self.act(x).to(dtype=dtype)
222
+ else:
223
+ x = causal_conv1d_update(
224
+ x,
225
+ conv_state,
226
+ rearrange(self.conv1d.weight, "d 1 w -> d w"),
227
+ self.conv1d.bias,
228
+ self.activation,
229
+ )
230
+
231
+ x_db = self.x_proj(x) # (B dt_rank+2*d_state)
232
+ dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
233
+ # Don't add dt_bias here
234
+ dt = F.linear(dt, self.dt_proj.weight) # (B d_inner)
235
+ A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
236
+
237
+ # SSM step
238
+ if selective_state_update is None:
239
+ # Discretize A and B
240
+ dt = F.softplus(dt + self.dt_proj.bias.to(dtype=dt.dtype))
241
+ dA = torch.exp(torch.einsum("bd,dn->bdn", dt, A))
242
+ dB = torch.einsum("bd,bn->bdn", dt, B)
243
+ ssm_state.copy_(ssm_state * dA + rearrange(x, "b d -> b d 1") * dB)
244
+ y = torch.einsum("bdn,bn->bd", ssm_state.to(dtype), C)
245
+ y = y + self.D.to(dtype) * x
246
+ y = y * self.act(z) # (B D)
247
+ else:
248
+ y = selective_state_update(
249
+ ssm_state, x, dt, A, B, C, self.D, z=z, dt_bias=self.dt_proj.bias, dt_softplus=True
250
+ )
251
+
252
+ out = self.out_proj(y)
253
+ return out.unsqueeze(1), conv_state, ssm_state
254
+
255
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
256
+ device = self.out_proj.weight.device
257
+ conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
258
+ conv_state = torch.zeros(
259
+ batch_size, self.d_model * self.expand, self.d_conv, device=device, dtype=conv_dtype
260
+ )
261
+ ssm_dtype = self.dt_proj.weight.dtype if dtype is None else dtype
262
+ # ssm_dtype = torch.float32
263
+ ssm_state = torch.zeros(
264
+ batch_size, self.d_model * self.expand, self.d_state, device=device, dtype=ssm_dtype
265
+ )
266
+ return conv_state, ssm_state
267
+
268
+ def _get_states_from_cache(self, inference_params, batch_size, initialize_states=False):
269
+ assert self.layer_idx is not None
270
+ if self.layer_idx not in inference_params.key_value_memory_dict:
271
+ batch_shape = (batch_size,)
272
+ conv_state = torch.zeros(
273
+ batch_size,
274
+ self.d_model * self.expand,
275
+ self.d_conv,
276
+ device=self.conv1d.weight.device,
277
+ dtype=self.conv1d.weight.dtype,
278
+ )
279
+ ssm_state = torch.zeros(
280
+ batch_size,
281
+ self.d_model * self.expand,
282
+ self.d_state,
283
+ device=self.dt_proj.weight.device,
284
+ dtype=self.dt_proj.weight.dtype,
285
+ # dtype=torch.float32,
286
+ )
287
+ inference_params.key_value_memory_dict[self.layer_idx] = (conv_state, ssm_state)
288
+ else:
289
+ conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_idx]
290
+ # TODO: What if batch size changes between generation, and we reuse the same states?
291
+ if initialize_states:
292
+ conv_state.zero_()
293
+ ssm_state.zero_()
294
+ return conv_state, ssm_state
295
+
296
+
297
+ class Block(nn.Module):
298
+ def __init__(
299
+ self, dim, mixer_cls, norm_cls=nn.LayerNorm, fused_add_norm=False, residual_in_fp32=False
300
+ ):
301
+ """
302
+ Simple block wrapping a mixer class with LayerNorm/RMSNorm and residual connection"
303
+
304
+ This Block has a slightly different structure compared to a regular
305
+ prenorm Transformer block.
306
+ The standard block is: LN -> MHA/MLP -> Add.
307
+ [Ref: https://arxiv.org/abs/2002.04745]
308
+ Here we have: Add -> LN -> Mixer, returning both
309
+ the hidden_states (output of the mixer) and the residual.
310
+ This is purely for performance reasons, as we can fuse add and LayerNorm.
311
+ The residual needs to be provided (except for the very first block).
312
+ """
313
+ super().__init__()
314
+ self.residual_in_fp32 = residual_in_fp32
315
+ self.fused_add_norm = fused_add_norm
316
+ self.mixer = mixer_cls(dim)
317
+ self.norm = norm_cls(dim)
318
+ if self.fused_add_norm:
319
+ assert RMSNorm is not None, "RMSNorm import fails"
320
+ assert isinstance(
321
+ self.norm, (nn.LayerNorm, RMSNorm)
322
+ ), "Only LayerNorm and RMSNorm are supported for fused_add_norm"
323
+
324
+ def forward(
325
+ self, hidden_states: Tensor, residual: Optional[Tensor] = None, inference_params=None
326
+ ):
327
+ r"""Pass the input through the encoder layer.
328
+
329
+ Args:
330
+ hidden_states: the sequence to the encoder layer (required).
331
+ residual: hidden_states = Mixer(LN(residual))
332
+ """
333
+ if not self.fused_add_norm:
334
+ residual = (hidden_states + residual) if residual is not None else hidden_states
335
+ hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
336
+ if self.residual_in_fp32:
337
+ residual = residual.to(torch.float32)
338
+ else:
339
+ fused_add_norm_fn = rms_norm_fn if isinstance(self.norm, RMSNorm) else layer_norm_fn
340
+ hidden_states, residual = fused_add_norm_fn(
341
+ hidden_states,
342
+ self.norm.weight,
343
+ self.norm.bias,
344
+ residual=residual,
345
+ prenorm=True,
346
+ residual_in_fp32=self.residual_in_fp32,
347
+ eps=self.norm.eps,
348
+ )
349
+ hidden_states = self.mixer(hidden_states, inference_params=inference_params)
350
+ return hidden_states, residual
351
+
352
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
353
+ return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
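One easy-to-miss detail in `Mamba.__init__` above is the dt bias initialization: dt is sampled log-uniformly in [dt_min, dt_max] and the bias stores its inverse softplus, so that `F.softplus(dt_proj.bias)` lands back in that range. A standalone check of that identity, with values matching the defaults above:

import math
import torch
import torch.nn.functional as F

dt_min, dt_max, d_inner, dt_init_floor = 0.001, 0.1, 512, 1e-4

# Sample dt log-uniformly in [dt_min, dt_max], exactly as Mamba.__init__ does.
dt = torch.exp(
    torch.rand(d_inner) * (math.log(dt_max) - math.log(dt_min)) + math.log(dt_min)
).clamp(min=dt_init_floor)

# Inverse of softplus, stored in dt_proj.bias so that softplus(bias) recovers dt.
inv_dt = dt + torch.log(-torch.expm1(-dt))
assert torch.allclose(F.softplus(inv_dt), dt, atol=1e-5)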
mamba_ssm/ops/__init__.py ADDED
File without changes
mamba_ssm/ops/selective_scan_interface.py ADDED
@@ -0,0 +1,357 @@
1
+ # Copyright (c) 2023, Tri Dao, Albert Gu.
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from torch.cuda.amp import custom_bwd, custom_fwd
6
+
7
+ from einops import rearrange, repeat
8
+
9
+ try:
10
+ from causal_conv1d import causal_conv1d_fn
11
+ import causal_conv1d_cuda
12
+ except ImportError:
13
+ causal_conv1d_fn = None
14
+ causal_conv1d_cuda = None
15
+
16
+ import selective_scan_cuda
17
+
18
+
19
+ class SelectiveScanFn(torch.autograd.Function):
20
+
21
+ @staticmethod
22
+ def forward(ctx, u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False,
23
+ return_last_state=False):
24
+ if u.stride(-1) != 1:
25
+ u = u.contiguous()
26
+ if delta.stride(-1) != 1:
27
+ delta = delta.contiguous()
28
+ if D is not None:
29
+ D = D.contiguous()
30
+ if B.stride(-1) != 1:
31
+ B = B.contiguous()
32
+ if C.stride(-1) != 1:
33
+ C = C.contiguous()
34
+ if z is not None and z.stride(-1) != 1:
35
+ z = z.contiguous()
36
+ if B.dim() == 3:
37
+ B = rearrange(B, "b dstate l -> b 1 dstate l")
38
+ ctx.squeeze_B = True
39
+ if C.dim() == 3:
40
+ C = rearrange(C, "b dstate l -> b 1 dstate l")
41
+ ctx.squeeze_C = True
42
+ out, x, *rest = selective_scan_cuda.fwd(u, delta, A, B, C, D, z, delta_bias, delta_softplus)
43
+ ctx.delta_softplus = delta_softplus
44
+ ctx.has_z = z is not None
45
+ last_state = x[:, :, -1, 1::2] # (batch, dim, dstate)
46
+ if not ctx.has_z:
47
+ ctx.save_for_backward(u, delta, A, B, C, D, delta_bias, x)
48
+ return out if not return_last_state else (out, last_state)
49
+ else:
50
+ ctx.save_for_backward(u, delta, A, B, C, D, z, delta_bias, x, out)
51
+ out_z = rest[0]
52
+ return out_z if not return_last_state else (out_z, last_state)
53
+
54
+ @staticmethod
55
+ def backward(ctx, dout, *args):
56
+ if not ctx.has_z:
57
+ u, delta, A, B, C, D, delta_bias, x = ctx.saved_tensors
58
+ z = None
59
+ out = None
60
+ else:
61
+ u, delta, A, B, C, D, z, delta_bias, x, out = ctx.saved_tensors
62
+ if dout.stride(-1) != 1:
63
+ dout = dout.contiguous()
64
+ # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
65
+ # backward of selective_scan_cuda with the backward of chunk).
66
+ # Here we just pass in None and dz will be allocated in the C++ code.
67
+ du, ddelta, dA, dB, dC, dD, ddelta_bias, *rest = selective_scan_cuda.bwd(
68
+ u, delta, A, B, C, D, z, delta_bias, dout, x, out, None, ctx.delta_softplus,
69
+ False # option to recompute out_z, not used here
70
+ )
71
+ dz = rest[0] if ctx.has_z else None
72
+ dB = dB.squeeze(1) if getattr(ctx, "squeeze_B", False) else dB
73
+ dC = dC.squeeze(1) if getattr(ctx, "squeeze_C", False) else dC
74
+ return (du, ddelta, dA, dB, dC,
75
+ dD if D is not None else None,
76
+ dz,
77
+ ddelta_bias if delta_bias is not None else None,
78
+ None,
79
+ None)
80
+
81
+
82
+ def selective_scan_fn(u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False,
83
+ return_last_state=False):
84
+ """if return_last_state is True, returns (out, last_state)
85
+ last_state has shape (batch, dim, dstate). Note that the gradient of the last state is
86
+ not considered in the backward pass.
87
+ """
88
+ return SelectiveScanFn.apply(u, delta, A, B, C, D, z, delta_bias, delta_softplus, return_last_state)
89
+
90
+
91
+ def selective_scan_ref(u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False,
92
+ return_last_state=False):
93
+ """
94
+ u: r(B D L)
95
+ delta: r(B D L)
96
+ A: c(D N) or r(D N)
97
+ B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
98
+ C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
99
+ D: r(D)
100
+ z: r(B D L)
101
+ delta_bias: r(D), fp32
102
+
103
+ out: r(B D L)
104
+ last_state (optional): r(B D dstate) or c(B D dstate)
105
+ """
106
+ dtype_in = u.dtype
107
+ u = u.float()
108
+ delta = delta.float()
109
+ if delta_bias is not None:
110
+ delta = delta + delta_bias[..., None].float()
111
+ if delta_softplus:
112
+ delta = F.softplus(delta)
113
+ batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1]
114
+ is_variable_B = B.dim() >= 3
115
+ is_variable_C = C.dim() >= 3
116
+ if A.is_complex():
117
+ if is_variable_B:
118
+ B = torch.view_as_complex(rearrange(B.float(), "... (L two) -> ... L two", two=2))
119
+ if is_variable_C:
120
+ C = torch.view_as_complex(rearrange(C.float(), "... (L two) -> ... L two", two=2))
121
+ else:
122
+ B = B.float()
123
+ C = C.float()
124
+ x = A.new_zeros((batch, dim, dstate))
125
+ ys = []
126
+ deltaA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A))
127
+ if not is_variable_B:
128
+ deltaB_u = torch.einsum('bdl,dn,bdl->bdln', delta, B, u)
129
+ else:
130
+ if B.dim() == 3:
131
+ deltaB_u = torch.einsum('bdl,bnl,bdl->bdln', delta, B, u)
132
+ else:
133
+ B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
134
+ deltaB_u = torch.einsum('bdl,bdnl,bdl->bdln', delta, B, u)
135
+ if is_variable_C and C.dim() == 4:
136
+ C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
137
+ last_state = None
138
+ for i in range(u.shape[2]):
139
+ x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
140
+ if not is_variable_C:
141
+ y = torch.einsum('bdn,dn->bd', x, C)
142
+ else:
143
+ if C.dim() == 3:
144
+ y = torch.einsum('bdn,bn->bd', x, C[:, :, i])
145
+ else:
146
+ y = torch.einsum('bdn,bdn->bd', x, C[:, :, :, i])
147
+ if i == u.shape[2] - 1:
148
+ last_state = x
149
+ if y.is_complex():
150
+ y = y.real * 2
151
+ ys.append(y)
152
+ y = torch.stack(ys, dim=2) # (batch dim L)
153
+ out = y if D is None else y + u * rearrange(D, "d -> d 1")
154
+ if z is not None:
155
+ out = out * F.silu(z)
156
+ out = out.to(dtype=dtype_in)
157
+ return out if not return_last_state else (out, last_state)
158
+
159
+
160
+ class MambaInnerFn(torch.autograd.Function):
161
+
162
+ @staticmethod
163
+ @custom_fwd
164
+ def forward(ctx, xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
165
+ out_proj_weight, out_proj_bias,
166
+ A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
167
+ C_proj_bias=None, delta_softplus=True, checkpoint_lvl=1):
168
+ """
169
+ xz: (batch, dim, seqlen)
170
+ """
171
+ assert causal_conv1d_cuda is not None, "causal_conv1d_cuda is not available. Please install causal-conv1d."
172
+ assert checkpoint_lvl in [0, 1]
173
+ L = xz.shape[-1]
174
+ delta_rank = delta_proj_weight.shape[1]
175
+ d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
176
+ if torch.is_autocast_enabled():
177
+ x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
178
+ delta_proj_weight = delta_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
179
+ out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
180
+ out_proj_bias = (out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype())
181
+ if out_proj_bias is not None else None)
182
+ if xz.stride(-1) != 1:
183
+ xz = xz.contiguous()
184
+ conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w")
185
+ x, z = xz.chunk(2, dim=1)
186
+ conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None
187
+ conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(
188
+ x, conv1d_weight, conv1d_bias, None, None, None, True
189
+ )
190
+ # We're being very careful here about the layout, to avoid extra transposes.
191
+ # We want delta to have d as the slowest moving dimension
192
+ # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
193
+ x_dbl = F.linear(rearrange(conv1d_out, 'b d l -> (b l) d'), x_proj_weight) # (bl d)
194
+ delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l = L)
195
+ ctx.is_variable_B = B is None
196
+ ctx.is_variable_C = C is None
197
+ ctx.B_proj_bias_is_None = B_proj_bias is None
198
+ ctx.C_proj_bias_is_None = C_proj_bias is None
199
+ if B is None: # variable B
200
+ B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl dstate)
201
+ if B_proj_bias is not None:
202
+ B = B + B_proj_bias.to(dtype=B.dtype)
203
+ if not A.is_complex():
204
+ # B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
205
+ B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
206
+ else:
207
+ B = rearrange(B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
208
+ else:
209
+ if B.stride(-1) != 1:
210
+ B = B.contiguous()
211
+ if C is None: # variable C
212
+ C = x_dbl[:, -d_state:] # (bl dstate)
213
+ if C_proj_bias is not None:
214
+ C = C + C_proj_bias.to(dtype=C.dtype)
215
+ if not A.is_complex():
216
+ # C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
217
+ C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
218
+ else:
219
+ C = rearrange(C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
220
+ else:
221
+ if C.stride(-1) != 1:
222
+ C = C.contiguous()
223
+ if D is not None:
224
+ D = D.contiguous()
225
+ out, scan_intermediates, out_z = selective_scan_cuda.fwd(
226
+ conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus
227
+ )
228
+ ctx.delta_softplus = delta_softplus
229
+ ctx.out_proj_bias_is_None = out_proj_bias is None
230
+ ctx.checkpoint_lvl = checkpoint_lvl
231
+ if checkpoint_lvl >= 1: # Will recompute conv1d_out and delta in the backward pass
232
+ conv1d_out, delta = None, None
233
+ ctx.save_for_backward(xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight,
234
+ delta_proj_weight, out_proj_weight, conv1d_out, delta,
235
+ A, B, C, D, delta_bias, scan_intermediates, out)
236
+ return F.linear(rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias)
237
+
238
+ @staticmethod
239
+ @custom_bwd
240
+ def backward(ctx, dout):
241
+ # dout: (batch, seqlen, dim)
242
+ assert causal_conv1d_cuda is not None, "causal_conv1d_cuda is not available. Please install causal-conv1d."
243
+ (xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight, delta_proj_weight, out_proj_weight,
244
+ conv1d_out, delta, A, B, C, D, delta_bias, scan_intermediates, out) = ctx.saved_tensors
245
+ L = xz.shape[-1]
246
+ delta_rank = delta_proj_weight.shape[1]
247
+ d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
248
+ x, z = xz.chunk(2, dim=1)
249
+ if dout.stride(-1) != 1:
250
+ dout = dout.contiguous()
251
+ if ctx.checkpoint_lvl == 1:
252
+ conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(
253
+ x, conv1d_weight, conv1d_bias, None, None, None, True
254
+ )
255
+ delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(),
256
+ "d (b l) -> b d l", l = L)
257
+ # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
258
+ # backward of selective_scan_cuda with the backward of chunk).
259
+ dxz = torch.empty_like(xz) # (batch, dim, seqlen)
260
+ dx, dz = dxz.chunk(2, dim=1)
261
+ dout = rearrange(dout, "b l e -> e (b l)")
262
+ dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L)
263
+ dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = selective_scan_cuda.bwd(
264
+ conv1d_out, delta, A, B, C, D, z, delta_bias, dout_y, scan_intermediates, out, dz,
265
+ ctx.delta_softplus,
266
+ True # option to recompute out_z
267
+ )
268
+ dout_proj_weight = torch.einsum("eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)"))
269
+ dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None
270
+ dD = dD if D is not None else None
271
+ dx_dbl = torch.empty_like(x_dbl)
272
+ dB_proj_bias = None
273
+ if ctx.is_variable_B:
274
+ if not A.is_complex():
275
+ dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous()
276
+ else:
277
+ dB = rearrange(dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
278
+ dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None
279
+ dx_dbl[:, delta_rank:delta_rank + d_state] = dB # (bl d)
280
+ dB = None
281
+ dC_proj_bias = None
282
+ if ctx.is_variable_C:
283
+ if not A.is_complex():
284
+ dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous()
285
+ else:
286
+ dC = rearrange(dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
287
+ dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None
288
+ dx_dbl[:, -d_state:] = dC # (bl d)
289
+ dC = None
290
+ ddelta = rearrange(ddelta, "b d l -> d (b l)")
291
+ ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank])
292
+ dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight)
293
+ dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)")
294
+ dx_proj_weight = torch.einsum("Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d"))
295
+ dconv1d_out = torch.addmm(dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out)
296
+ dconv1d_out = rearrange(dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1])
297
+ # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the
298
+ # backward of conv1d with the backward of chunk).
299
+ dx, dconv1d_weight, dconv1d_bias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
300
+ x, conv1d_weight, conv1d_bias, dconv1d_out, None, None, None, dx, False, True
301
+ )
302
+ dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None
303
+ dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w")
304
+ return (dxz, dconv1d_weight, dconv1d_bias, dx_proj_weight, ddelta_proj_weight,
305
+ dout_proj_weight, dout_proj_bias,
306
+ dA, dB, dC, dD,
307
+ ddelta_bias if delta_bias is not None else None,
308
+ dB_proj_bias, dC_proj_bias, None)
309
+
310
+
311
+ def mamba_inner_fn(
312
+ xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
313
+ out_proj_weight, out_proj_bias,
314
+ A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
315
+ C_proj_bias=None, delta_softplus=True
316
+ ):
317
+ return MambaInnerFn.apply(xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
318
+ out_proj_weight, out_proj_bias,
319
+ A, B, C, D, delta_bias, B_proj_bias, C_proj_bias, delta_softplus)
320
+
321
+
322
+ def mamba_inner_ref(
323
+ xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
324
+ out_proj_weight, out_proj_bias,
325
+ A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
326
+ C_proj_bias=None, delta_softplus=True
327
+ ):
328
+ assert causal_conv1d_fn is not None, "causal_conv1d_fn is not available. Please install causal-conv1d."
329
+ L = xz.shape[-1]
330
+ delta_rank = delta_proj_weight.shape[1]
331
+ d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
332
+ x, z = xz.chunk(2, dim=1)
333
+ x = causal_conv1d_fn(x, rearrange(conv1d_weight, "d 1 w -> d w"), conv1d_bias, activation="silu")
334
+ # We're being very careful here about the layout, to avoid extra transposes.
335
+ # We want delta to have d as the slowest moving dimension
336
+ # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
337
+ x_dbl = F.linear(rearrange(x, 'b d l -> (b l) d'), x_proj_weight) # (bl d)
338
+ delta = delta_proj_weight @ x_dbl[:, :delta_rank].t()
339
+ delta = rearrange(delta, "d (b l) -> b d l", l=L)
340
+ if B is None: # variable B
341
+ B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl d)
342
+ if B_proj_bias is not None:
343
+ B = B + B_proj_bias.to(dtype=B.dtype)
344
+ if not A.is_complex():
345
+ B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
346
+ else:
347
+ B = rearrange(B, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2).contiguous()
348
+ if C is None: # variable B
349
+ C = x_dbl[:, -d_state:] # (bl d)
350
+ if C_proj_bias is not None:
351
+ C = C + C_proj_bias.to(dtype=C.dtype)
352
+ if not A.is_complex():
353
+ C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
354
+ else:
355
+ C = rearrange(C, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2).contiguous()
356
+ y = selective_scan_fn(x, delta, A, B, C, D, z=z, delta_bias=delta_bias, delta_softplus=True)
357
+ return F.linear(rearrange(y, "b d l -> b l d"), out_proj_weight, out_proj_bias)
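`selective_scan_ref` documents the expected shapes and serves as a reference implementation for the fused kernel. A quick numerical comparison between the two could be run as below; this is only a sketch, it assumes a CUDA device with the compiled `selective_scan_cuda` extension, and the shapes are arbitrary.

import torch
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, selective_scan_ref

# Shapes follow the selective_scan_ref docstring: u/delta/z are (B, D, L),
# A is (D, N), variable B/C are (B, N, L), D is (D,).
b, d, n, l = 2, 64, 16, 128
dev = "cuda"
u = torch.randn(b, d, l, device=dev)
delta = torch.rand(b, d, l, device=dev)
A = -torch.rand(d, n, device=dev)          # negative A for a stable scan
B = torch.randn(b, n, l, device=dev)
C = torch.randn(b, n, l, device=dev)
D = torch.randn(d, device=dev)
z = torch.randn(b, d, l, device=dev)

out_cuda = selective_scan_fn(u, delta, A, B, C, D=D, z=z, delta_softplus=True)
out_ref = selective_scan_ref(u, delta, A, B, C, D=D, z=z, delta_softplus=True)
print((out_cuda - out_ref).abs().max())    # kernel vs. reference, should be small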
mamba_ssm/ops/triton/__init__.py ADDED
File without changes
mamba_ssm/ops/triton/layernorm.py ADDED
@@ -0,0 +1,635 @@
1
+ # Copyright (c) 2023, Tri Dao.
2
+ # Implement residual + layer_norm / rms_norm.
3
+
4
+ # Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
5
+ # For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
6
+ # This is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
7
+ # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
8
+
9
+ import math
10
+
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from torch.cuda.amp import custom_fwd, custom_bwd
14
+
15
+ import triton
16
+ import triton.language as tl
17
+
18
+
19
+ def layer_norm_ref(x, weight, bias, residual=None, eps=1e-6, prenorm=False, upcast=False):
20
+ dtype = x.dtype
21
+ if upcast:
22
+ weight = weight.float()
23
+ bias = bias.float() if bias is not None else None
24
+ if upcast:
25
+ x = x.float()
26
+ residual = residual.float() if residual is not None else residual
27
+ if residual is not None:
28
+ x = (x + residual).to(x.dtype)
29
+ out = F.layer_norm(x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps).to(
30
+ dtype
31
+ )
32
+ return out if not prenorm else (out, x)
33
+
34
+
35
+ def rms_norm_ref(x, weight, bias, residual=None, eps=1e-6, prenorm=False, upcast=False):
36
+ dtype = x.dtype
37
+ if upcast:
38
+ weight = weight.float()
39
+ bias = bias.float() if bias is not None else None
40
+ if upcast:
41
+ x = x.float()
42
+ residual = residual.float() if residual is not None else residual
43
+ if residual is not None:
44
+ x = (x + residual).to(x.dtype)
45
+ rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
46
+ out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight)
47
+ out = out.to(dtype)
48
+ return out if not prenorm else (out, x)
49
+
50
+
51
+ @triton.autotune(
52
+ configs=[
53
+ triton.Config({}, num_warps=1),
54
+ triton.Config({}, num_warps=2),
55
+ triton.Config({}, num_warps=4),
56
+ triton.Config({}, num_warps=8),
57
+ triton.Config({}, num_warps=16),
58
+ triton.Config({}, num_warps=32),
59
+ ],
60
+ key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
61
+ )
62
+ # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
63
+ # @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
64
+ @triton.jit
65
+ def _layer_norm_fwd_1pass_kernel(
66
+ X, # pointer to the input
67
+ Y, # pointer to the output
68
+ W, # pointer to the weights
69
+ B, # pointer to the biases
70
+ RESIDUAL, # pointer to the residual
71
+ RESIDUAL_OUT, # pointer to the residual
72
+ Mean, # pointer to the mean
73
+ Rstd, # pointer to the 1/std
74
+ stride_x_row, # how much to increase the pointer when moving by 1 row
75
+ stride_y_row,
76
+ stride_res_row,
77
+ stride_res_out_row,
78
+ N, # number of columns in X
79
+ eps, # epsilon to avoid division by zero
80
+ IS_RMS_NORM: tl.constexpr,
81
+ BLOCK_N: tl.constexpr,
82
+ HAS_RESIDUAL: tl.constexpr,
83
+ STORE_RESIDUAL_OUT: tl.constexpr,
84
+ HAS_BIAS: tl.constexpr,
85
+ ):
86
+ # Map the program id to the row of X and Y it should compute.
87
+ row = tl.program_id(0)
88
+ X += row * stride_x_row
89
+ Y += row * stride_y_row
90
+ if HAS_RESIDUAL:
91
+ RESIDUAL += row * stride_res_row
92
+ if STORE_RESIDUAL_OUT:
93
+ RESIDUAL_OUT += row * stride_res_out_row
94
+ # Compute mean and variance
95
+ cols = tl.arange(0, BLOCK_N)
96
+ x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
97
+ if HAS_RESIDUAL:
98
+ residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
99
+ x += residual
100
+ if STORE_RESIDUAL_OUT:
101
+ tl.store(RESIDUAL_OUT + cols, x, mask=cols < N)
102
+ if not IS_RMS_NORM:
103
+ mean = tl.sum(x, axis=0) / N
104
+ tl.store(Mean + row, mean)
105
+ xbar = tl.where(cols < N, x - mean, 0.0)
106
+ var = tl.sum(xbar * xbar, axis=0) / N
107
+ else:
108
+ xbar = tl.where(cols < N, x, 0.0)
109
+ var = tl.sum(xbar * xbar, axis=0) / N
110
+ rstd = 1 / tl.sqrt(var + eps)
111
+ tl.store(Rstd + row, rstd)
112
+ # Normalize and apply linear transformation
113
+ mask = cols < N
114
+ w = tl.load(W + cols, mask=mask).to(tl.float32)
115
+ if HAS_BIAS:
116
+ b = tl.load(B + cols, mask=mask).to(tl.float32)
117
+ x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
118
+ y = x_hat * w + b if HAS_BIAS else x_hat * w
119
+ # Write output
120
+ tl.store(Y + cols, y, mask=mask)
121
+
122
+
123
+ def _layer_norm_fwd(
124
+ x, weight, bias, eps, residual=None, out_dtype=None, residual_dtype=None, is_rms_norm=False
125
+ ):
126
+ if residual is not None:
127
+ residual_dtype = residual.dtype
128
+ M, N = x.shape
129
+ assert x.stride(-1) == 1
130
+ if residual is not None:
131
+ assert residual.stride(-1) == 1
132
+ assert residual.shape == (M, N)
133
+ assert weight.shape == (N,)
134
+ assert weight.stride(-1) == 1
135
+ if bias is not None:
136
+ assert bias.stride(-1) == 1
137
+ assert bias.shape == (N,)
138
+ # allocate output
139
+ y = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
140
+ assert y.stride(-1) == 1
141
+ if residual is not None or (residual_dtype is not None and residual_dtype != x.dtype):
142
+ residual_out = torch.empty(M, N, device=x.device, dtype=residual_dtype)
143
+ assert residual_out.stride(-1) == 1
144
+ else:
145
+ residual_out = None
146
+ mean = torch.empty((M,), dtype=torch.float32, device=x.device) if not is_rms_norm else None
147
+ rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
148
+ # Less than 64KB per feature: enqueue fused kernel
149
+ MAX_FUSED_SIZE = 65536 // x.element_size()
150
+ BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
151
+ if N > BLOCK_N:
152
+ raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
153
+ # heuristics for number of warps
154
+ with torch.cuda.device(x.device.index):
155
+ _layer_norm_fwd_1pass_kernel[(M,)](
156
+ x,
157
+ y,
158
+ weight,
159
+ bias,
160
+ residual,
161
+ residual_out,
162
+ mean,
163
+ rstd,
164
+ x.stride(0),
165
+ y.stride(0),
166
+ residual.stride(0) if residual is not None else 0,
167
+ residual_out.stride(0) if residual_out is not None else 0,
168
+ N,
169
+ eps,
170
+ is_rms_norm,
171
+ BLOCK_N,
172
+ residual is not None,
173
+ residual_out is not None,
174
+ bias is not None,
175
+ )
176
+ # residual_out is None if residual is None and residual_dtype == input_dtype
177
+ return y, mean, rstd, residual_out if residual_out is not None else x
178
+
179
+
180
+ @triton.autotune(
181
+ configs=[
182
+ triton.Config({}, num_warps=1),
183
+ triton.Config({}, num_warps=2),
184
+ triton.Config({}, num_warps=4),
185
+ triton.Config({}, num_warps=8),
186
+ triton.Config({}, num_warps=16),
187
+ triton.Config({}, num_warps=32),
188
+ ],
189
+ key=["N", "HAS_DRESIDUAL", "STORE_DRESIDUAL", "IS_RMS_NORM", "HAS_BIAS"],
190
+ )
191
+ # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
192
+ # @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
193
+ # @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
194
+ @triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
195
+ @triton.jit
196
+ def _layer_norm_bwd_kernel(
197
+ X, # pointer to the input
198
+ W, # pointer to the weights
199
+ B, # pointer to the biases
200
+ Y, # pointer to the output to be recomputed
201
+ DY, # pointer to the output gradient
202
+ DX, # pointer to the input gradient
203
+ DW, # pointer to the partial sum of weights gradient
204
+ DB, # pointer to the partial sum of biases gradient
205
+ DRESIDUAL,
206
+ DRESIDUAL_IN,
207
+ Mean, # pointer to the mean
208
+ Rstd, # pointer to the 1/std
209
+ stride_x_row, # how much to increase the pointer when moving by 1 row
210
+ stride_y_row,
211
+ stride_dy_row,
212
+ stride_dx_row,
213
+ stride_dres_row,
214
+ stride_dres_in_row,
215
+ M, # number of rows in X
216
+ N, # number of columns in X
217
+ eps, # epsilon to avoid division by zero
218
+ rows_per_program,
219
+ IS_RMS_NORM: tl.constexpr,
220
+ BLOCK_N: tl.constexpr,
221
+ HAS_DRESIDUAL: tl.constexpr,
222
+ STORE_DRESIDUAL: tl.constexpr,
223
+ HAS_BIAS: tl.constexpr,
224
+ RECOMPUTE_OUTPUT: tl.constexpr,
225
+ ):
226
+ # Map the program id to the elements of X, DX, and DY it should compute.
227
+ row_block_id = tl.program_id(0)
228
+ row_start = row_block_id * rows_per_program
229
+ cols = tl.arange(0, BLOCK_N)
230
+ mask = cols < N
231
+ X += row_start * stride_x_row
232
+ if HAS_DRESIDUAL:
233
+ DRESIDUAL += row_start * stride_dres_row
234
+ if STORE_DRESIDUAL:
235
+ DRESIDUAL_IN += row_start * stride_dres_in_row
236
+ DY += row_start * stride_dy_row
237
+ DX += row_start * stride_dx_row
238
+ if RECOMPUTE_OUTPUT:
239
+ Y += row_start * stride_y_row
240
+ w = tl.load(W + cols, mask=mask).to(tl.float32)
241
+ if RECOMPUTE_OUTPUT and HAS_BIAS:
242
+ b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
243
+ dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
244
+ if HAS_BIAS:
245
+ db = tl.zeros((BLOCK_N,), dtype=tl.float32)
246
+ row_end = min((row_block_id + 1) * rows_per_program, M)
247
+ for row in range(row_start, row_end):
248
+ # Load data to SRAM
249
+ x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
250
+ dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
251
+ if not IS_RMS_NORM:
252
+ mean = tl.load(Mean + row)
253
+ rstd = tl.load(Rstd + row)
254
+ # Compute dx
255
+ xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
256
+ xhat = tl.where(mask, xhat, 0.0)
257
+ if RECOMPUTE_OUTPUT:
258
+ y = xhat * w + b if HAS_BIAS else xhat * w
259
+ tl.store(Y + cols, y, mask=mask)
260
+ wdy = w * dy
261
+ dw += dy * xhat
262
+ if HAS_BIAS:
263
+ db += dy
264
+ if not IS_RMS_NORM:
265
+ c1 = tl.sum(xhat * wdy, axis=0) / N
266
+ c2 = tl.sum(wdy, axis=0) / N
267
+ dx = (wdy - (xhat * c1 + c2)) * rstd
268
+ else:
269
+ c1 = tl.sum(xhat * wdy, axis=0) / N
270
+ dx = (wdy - xhat * c1) * rstd
271
+ if HAS_DRESIDUAL:
272
+ dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32)
273
+ dx += dres
274
+ # Write dx
275
+ if STORE_DRESIDUAL:
276
+ tl.store(DRESIDUAL_IN + cols, dx, mask=mask)
277
+ tl.store(DX + cols, dx, mask=mask)
278
+
279
+ X += stride_x_row
280
+ if HAS_DRESIDUAL:
281
+ DRESIDUAL += stride_dres_row
282
+ if STORE_DRESIDUAL:
283
+ DRESIDUAL_IN += stride_dres_in_row
284
+ if RECOMPUTE_OUTPUT:
285
+ Y += stride_y_row
286
+ DY += stride_dy_row
287
+ DX += stride_dx_row
288
+ tl.store(DW + row_block_id * N + cols, dw, mask=mask)
289
+ if HAS_BIAS:
290
+ tl.store(DB + row_block_id * N + cols, db, mask=mask)
291
+
292
+
293
+ def _layer_norm_bwd(
294
+ dy,
295
+ x,
296
+ weight,
297
+ bias,
298
+ eps,
299
+ mean,
300
+ rstd,
301
+ dresidual=None,
302
+ has_residual=False,
303
+ is_rms_norm=False,
304
+ x_dtype=None,
305
+ recompute_output=False,
306
+ ):
307
+ M, N = x.shape
308
+ assert x.stride(-1) == 1
309
+ assert dy.stride(-1) == 1
310
+ assert dy.shape == (M, N)
311
+ if dresidual is not None:
312
+ assert dresidual.stride(-1) == 1
313
+ assert dresidual.shape == (M, N)
314
+ assert weight.shape == (N,)
315
+ assert weight.stride(-1) == 1
316
+ if bias is not None:
317
+ assert bias.stride(-1) == 1
318
+ assert bias.shape == (N,)
319
+ # allocate output
320
+ dx = (
321
+ torch.empty_like(x)
322
+ if x_dtype is None
323
+ else torch.empty(M, N, dtype=x_dtype, device=x.device)
324
+ )
325
+ dresidual_in = torch.empty_like(x) if has_residual and dx.dtype != x.dtype else None
326
+ y = torch.empty(M, N, dtype=dy.dtype, device=dy.device) if recompute_output else None
327
+
328
+ # Less than 64KB per feature: enqueue fused kernel
329
+ MAX_FUSED_SIZE = 65536 // x.element_size()
330
+ BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
331
+ if N > BLOCK_N:
332
+ raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
333
+ sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
334
+ _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
335
+ _db = (
336
+ torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
337
+ if bias is not None
338
+ else None
339
+ )
340
+ rows_per_program = math.ceil(M / sm_count)
341
+ grid = (sm_count,)
342
+ with torch.cuda.device(x.device.index):
343
+ _layer_norm_bwd_kernel[grid](
344
+ x,
345
+ weight,
346
+ bias,
347
+ y,
348
+ dy,
349
+ dx,
350
+ _dw,
351
+ _db,
352
+ dresidual,
353
+ dresidual_in,
354
+ mean,
355
+ rstd,
356
+ x.stride(0),
357
+ 0 if not recompute_output else y.stride(0),
358
+ dy.stride(0),
359
+ dx.stride(0),
360
+ dresidual.stride(0) if dresidual is not None else 0,
361
+ dresidual_in.stride(0) if dresidual_in is not None else 0,
362
+ M,
363
+ N,
364
+ eps,
365
+ rows_per_program,
366
+ is_rms_norm,
367
+ BLOCK_N,
368
+ dresidual is not None,
369
+ dresidual_in is not None,
370
+ bias is not None,
371
+ )
372
+ dw = _dw.sum(0).to(weight.dtype)
373
+ db = _db.sum(0).to(bias.dtype) if bias is not None else None
374
+ # Don't need to compute dresidual_in separately in this case
375
+ if has_residual and dx.dtype == x.dtype:
376
+ dresidual_in = dx
377
+ return (dx, dw, db, dresidual_in) if not recompute_output else (dx, dw, db, dresidual_in, y)
378
+
379
+
380
+ class LayerNormFn(torch.autograd.Function):
381
+ @staticmethod
382
+ def forward(
383
+ ctx,
384
+ x,
385
+ weight,
386
+ bias,
387
+ residual=None,
388
+ eps=1e-6,
389
+ prenorm=False,
390
+ residual_in_fp32=False,
391
+ is_rms_norm=False,
392
+ ):
393
+ x_shape_og = x.shape
394
+ # reshape input data into 2D tensor
395
+ x = x.reshape(-1, x.shape[-1])
396
+ if x.stride(-1) != 1:
397
+ x = x.contiguous()
398
+ if residual is not None:
399
+ assert residual.shape == x_shape_og
400
+ residual = residual.reshape(-1, residual.shape[-1])
401
+ if residual.stride(-1) != 1:
402
+ residual = residual.contiguous()
403
+ weight = weight.contiguous()
404
+ if bias is not None:
405
+ bias = bias.contiguous()
406
+ residual_dtype = (
407
+ residual.dtype
408
+ if residual is not None
409
+ else (torch.float32 if residual_in_fp32 else None)
410
+ )
411
+ y, mean, rstd, residual_out = _layer_norm_fwd(
412
+ x, weight, bias, eps, residual, residual_dtype=residual_dtype, is_rms_norm=is_rms_norm
413
+ )
414
+ ctx.save_for_backward(residual_out, weight, bias, mean, rstd)
415
+ ctx.x_shape_og = x_shape_og
416
+ ctx.eps = eps
417
+ ctx.is_rms_norm = is_rms_norm
418
+ ctx.has_residual = residual is not None
419
+ ctx.prenorm = prenorm
420
+ ctx.x_dtype = x.dtype
421
+ y = y.reshape(x_shape_og)
422
+ return y if not prenorm else (y, residual_out.reshape(x_shape_og))
423
+
424
+ @staticmethod
425
+ def backward(ctx, dy, *args):
426
+ x, weight, bias, mean, rstd = ctx.saved_tensors
427
+ dy = dy.reshape(-1, dy.shape[-1])
428
+ if dy.stride(-1) != 1:
429
+ dy = dy.contiguous()
430
+ assert dy.shape == x.shape
431
+ if ctx.prenorm:
432
+ dresidual = args[0]
433
+ dresidual = dresidual.reshape(-1, dresidual.shape[-1])
434
+ if dresidual.stride(-1) != 1:
435
+ dresidual = dresidual.contiguous()
436
+ assert dresidual.shape == x.shape
437
+ else:
438
+ dresidual = None
439
+ dx, dw, db, dresidual_in = _layer_norm_bwd(
440
+ dy,
441
+ x,
442
+ weight,
443
+ bias,
444
+ ctx.eps,
445
+ mean,
446
+ rstd,
447
+ dresidual,
448
+ ctx.has_residual,
449
+ ctx.is_rms_norm,
450
+ x_dtype=ctx.x_dtype,
451
+ )
452
+ return (
453
+ dx.reshape(ctx.x_shape_og),
454
+ dw,
455
+ db,
456
+ dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
457
+ None,
458
+ None,
459
+ None,
460
+ None,
461
+ )
462
+
463
+
464
+ def layer_norm_fn(
465
+ x,
466
+ weight,
467
+ bias,
468
+ residual=None,
469
+ eps=1e-6,
470
+ prenorm=False,
471
+ residual_in_fp32=False,
472
+ is_rms_norm=False,
473
+ ):
474
+ return LayerNormFn.apply(x, weight, bias, residual, eps, prenorm, residual_in_fp32, is_rms_norm)
475
+
476
+
477
+ def rms_norm_fn(x, weight, bias, residual=None, prenorm=False, residual_in_fp32=False, eps=1e-6):
478
+ return LayerNormFn.apply(x, weight, bias, residual, eps, prenorm, residual_in_fp32, True)
479
+
480
+
481
+ class RMSNorm(torch.nn.Module):
482
+ def __init__(self, hidden_size, eps=1e-5, device=None, dtype=None):
483
+ factory_kwargs = {"device": device, "dtype": dtype}
484
+ super().__init__()
485
+ self.eps = eps
486
+ self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
487
+ self.register_parameter("bias", None)
488
+ self.reset_parameters()
489
+
490
+ def reset_parameters(self):
491
+ torch.nn.init.ones_(self.weight)
492
+
493
+ def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
494
+ return rms_norm_fn(
495
+ x,
496
+ self.weight,
497
+ self.bias,
498
+ residual=residual,
499
+ eps=self.eps,
500
+ prenorm=prenorm,
501
+ residual_in_fp32=residual_in_fp32,
502
+ )
503
+
504
+
505
+ class LayerNormLinearFn(torch.autograd.Function):
506
+ @staticmethod
507
+ @custom_fwd
508
+ def forward(
509
+ ctx,
510
+ x,
511
+ norm_weight,
512
+ norm_bias,
513
+ linear_weight,
514
+ linear_bias,
515
+ residual=None,
516
+ eps=1e-6,
517
+ prenorm=False,
518
+ residual_in_fp32=False,
519
+ is_rms_norm=False,
520
+ ):
521
+ x_shape_og = x.shape
522
+ # reshape input data into 2D tensor
523
+ x = x.reshape(-1, x.shape[-1])
524
+ if x.stride(-1) != 1:
525
+ x = x.contiguous()
526
+ if residual is not None:
527
+ assert residual.shape == x_shape_og
528
+ residual = residual.reshape(-1, residual.shape[-1])
529
+ if residual.stride(-1) != 1:
530
+ residual = residual.contiguous()
531
+ norm_weight = norm_weight.contiguous()
532
+ if norm_bias is not None:
533
+ norm_bias = norm_bias.contiguous()
534
+ residual_dtype = (
535
+ residual.dtype
536
+ if residual is not None
537
+ else (torch.float32 if residual_in_fp32 else None)
538
+ )
539
+ y, mean, rstd, residual_out = _layer_norm_fwd(
540
+ x,
541
+ norm_weight,
542
+ norm_bias,
543
+ eps,
544
+ residual,
545
+ out_dtype=None if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype(),
546
+ residual_dtype=residual_dtype,
547
+ is_rms_norm=is_rms_norm,
548
+ )
549
+ y = y.reshape(x_shape_og)
550
+ dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype
551
+ linear_weight = linear_weight.to(dtype)
552
+ linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
553
+ out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
554
+ # We don't store y, will be recomputed in the backward pass to save memory
555
+ ctx.save_for_backward(residual_out, norm_weight, norm_bias, linear_weight, mean, rstd)
556
+ ctx.x_shape_og = x_shape_og
557
+ ctx.eps = eps
558
+ ctx.is_rms_norm = is_rms_norm
559
+ ctx.has_residual = residual is not None
560
+ ctx.prenorm = prenorm
561
+ ctx.x_dtype = x.dtype
562
+ ctx.linear_bias_is_none = linear_bias is None
563
+ return out if not prenorm else (out, residual_out.reshape(x_shape_og))
564
+
565
+ @staticmethod
566
+ @custom_bwd
567
+ def backward(ctx, dout, *args):
568
+ x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
569
+ dout = dout.reshape(-1, dout.shape[-1])
570
+ dy = F.linear(dout, linear_weight.t())
571
+ dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
572
+ if dy.stride(-1) != 1:
573
+ dy = dy.contiguous()
574
+ assert dy.shape == x.shape
575
+ if ctx.prenorm:
576
+ dresidual = args[0]
577
+ dresidual = dresidual.reshape(-1, dresidual.shape[-1])
578
+ if dresidual.stride(-1) != 1:
579
+ dresidual = dresidual.contiguous()
580
+ assert dresidual.shape == x.shape
581
+ else:
582
+ dresidual = None
583
+ dx, dnorm_weight, dnorm_bias, dresidual_in, y = _layer_norm_bwd(
584
+ dy,
585
+ x,
586
+ norm_weight,
587
+ norm_bias,
588
+ ctx.eps,
589
+ mean,
590
+ rstd,
591
+ dresidual,
592
+ ctx.has_residual,
593
+ ctx.is_rms_norm,
594
+ x_dtype=ctx.x_dtype,
595
+ recompute_output=True,
596
+ )
597
+ dlinear_weight = torch.einsum("bo,bi->oi", dout, y)
598
+ return (
599
+ dx.reshape(ctx.x_shape_og),
600
+ dnorm_weight,
601
+ dnorm_bias,
602
+ dlinear_weight,
603
+ dlinear_bias,
604
+ dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None,
605
+ None,
606
+ None,
607
+ None,
608
+ None,
609
+ )
610
+
611
+
612
+ def layer_norm_linear_fn(
613
+ x,
614
+ norm_weight,
615
+ norm_bias,
616
+ linear_weight,
617
+ linear_bias,
618
+ residual=None,
619
+ eps=1e-6,
620
+ prenorm=False,
621
+ residual_in_fp32=False,
622
+ is_rms_norm=False,
623
+ ):
624
+ return LayerNormLinearFn.apply(
625
+ x,
626
+ norm_weight,
627
+ norm_bias,
628
+ linear_weight,
629
+ linear_bias,
630
+ residual,
631
+ eps,
632
+ prenorm,
633
+ residual_in_fp32,
634
+ is_rms_norm,
635
+ )
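
Usage sketch for the reference functions above: `rms_norm_ref` is plain PyTorch, so it can be sanity-checked on CPU, while `RMSNorm` / `rms_norm_fn` go through the Triton kernel and need a GPU. This assumes the vendored package is importable (the module still imports `triton` at load time).

```python
import torch
from mamba_ssm.ops.triton.layernorm import rms_norm_ref  # pure-PyTorch reference

x = torch.randn(2, 10, 64)           # (batch, seqlen, hidden)
weight = torch.ones(64)
residual = torch.randn(2, 10, 64)

# Plain RMSNorm (no bias).
y = rms_norm_ref(x, weight, bias=None)

# Fused "add residual, then normalize" pattern used by the Mamba blocks:
# prenorm=True also returns the updated residual stream (x + residual).
y2, new_residual = rms_norm_ref(x, weight, bias=None, residual=residual, prenorm=True)
print(y.shape, y2.shape, new_residual.shape)  # all (2, 10, 64)
```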
mamba_ssm/ops/triton/selective_state_update.py ADDED
@@ -0,0 +1,263 @@
1
+ # Copyright (c) 2024, Tri Dao, Albert Gu.
2
+
3
+ """We want triton==2.1.0 or triton==2.2.0 or triton==2.3.0 for this
4
+ """
5
+
6
+ import math
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ import triton
11
+ import triton.language as tl
12
+
13
+ from einops import rearrange, repeat
14
+
15
+
16
+ @triton.heuristics({"HAS_DT_BIAS": lambda args: args["dt_bias_ptr"] is not None})
17
+ @triton.heuristics({"HAS_D": lambda args: args["D_ptr"] is not None})
18
+ @triton.heuristics({"HAS_Z": lambda args: args["z_ptr"] is not None})
19
+ @triton.heuristics({"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])})
20
+ @triton.jit
21
+ def _selective_scan_update_kernel(
22
+ # Pointers to matrices
23
+ state_ptr, x_ptr, dt_ptr, dt_bias_ptr, A_ptr, B_ptr, C_ptr, D_ptr, z_ptr, out_ptr,
24
+ # Matrix dimensions
25
+ batch, nheads, dim, dstate, nheads_ngroups_ratio,
26
+ # Strides
27
+ stride_state_batch, stride_state_head, stride_state_dim, stride_state_dstate,
28
+ stride_x_batch, stride_x_head, stride_x_dim,
29
+ stride_dt_batch, stride_dt_head, stride_dt_dim,
30
+ stride_dt_bias_head, stride_dt_bias_dim,
31
+ stride_A_head, stride_A_dim, stride_A_dstate,
32
+ stride_B_batch, stride_B_group, stride_B_dstate,
33
+ stride_C_batch, stride_C_group, stride_C_dstate,
34
+ stride_D_head, stride_D_dim,
35
+ stride_z_batch, stride_z_head, stride_z_dim,
36
+ stride_out_batch, stride_out_head, stride_out_dim,
37
+ # Meta-parameters
38
+ DT_SOFTPLUS: tl.constexpr,
39
+ TIE_HDIM: tl.constexpr,
40
+ BLOCK_SIZE_M: tl.constexpr,
41
+ HAS_DT_BIAS: tl.constexpr,
42
+ HAS_D: tl.constexpr,
43
+ HAS_Z: tl.constexpr,
44
+ BLOCK_SIZE_DSTATE: tl.constexpr,
45
+ ):
46
+ pid_m = tl.program_id(axis=0)
47
+ pid_b = tl.program_id(axis=1)
48
+ pid_h = tl.program_id(axis=2)
49
+ state_ptr += pid_b * stride_state_batch + pid_h * stride_state_head
50
+ x_ptr += pid_b * stride_x_batch + pid_h * stride_x_head
51
+ dt_ptr += pid_b * stride_dt_batch + pid_h * stride_dt_head
52
+ if HAS_DT_BIAS:
53
+ dt_bias_ptr += pid_h * stride_dt_bias_head
54
+ A_ptr += pid_h * stride_A_head
55
+ B_ptr += pid_b * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group
56
+ C_ptr += pid_b * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group
57
+ if HAS_Z:
58
+ z_ptr += pid_b * stride_z_batch + pid_h * stride_z_head
59
+ out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
60
+
61
+ offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
62
+ offs_n = tl.arange(0, BLOCK_SIZE_DSTATE)
63
+ state_ptrs = state_ptr + (offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate)
64
+ x_ptrs = x_ptr + offs_m * stride_x_dim
65
+ dt_ptrs = dt_ptr + offs_m * stride_dt_dim
66
+ if HAS_DT_BIAS:
67
+ dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim
68
+ if HAS_D:
69
+ D_ptr += pid_h * stride_D_head
70
+ A_ptrs = A_ptr + (offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate)
71
+ B_ptrs = B_ptr + offs_n * stride_B_dstate
72
+ C_ptrs = C_ptr + offs_n * stride_C_dstate
73
+ if HAS_D:
74
+ D_ptrs = D_ptr + offs_m * stride_D_dim
75
+ if HAS_Z:
76
+ z_ptrs = z_ptr + offs_m * stride_z_dim
77
+ out_ptrs = out_ptr + offs_m * stride_out_dim
78
+
79
+ state = tl.load(state_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0)
80
+ x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
81
+ if not TIE_HDIM:
82
+ dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
83
+ if HAS_DT_BIAS:
84
+ dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
85
+ if DT_SOFTPLUS:
86
+ dt = tl.where(dt <= 20.0, tl.math.log1p(tl.exp(dt)), dt)
87
+ A = tl.load(A_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0).to(tl.float32)
88
+ dA = tl.exp(A * dt[:, None])
89
+ else:
90
+ dt = tl.load(dt_ptr).to(tl.float32)
91
+ if HAS_DT_BIAS:
92
+ dt += tl.load(dt_bias_ptr).to(tl.float32)
93
+ if DT_SOFTPLUS:
94
+ dt = tl.where(dt <= 20.0, tl.math.log1p(tl.exp(dt)), dt)
95
+ A = tl.load(A_ptr).to(tl.float32)
96
+ dA = tl.exp(A * dt) # scalar, not a matrix
97
+
98
+ B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
99
+ C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
100
+ if HAS_D:
101
+ D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
102
+ if HAS_Z:
103
+ z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
104
+
105
+ if not TIE_HDIM:
106
+ dB = B[None, :] * dt[:, None]
107
+ else:
108
+ dB = B * dt # vector of size (dstate,)
109
+ state = state * dA + dB * x[:, None]
110
+ tl.store(state_ptrs, state, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate))
111
+ out = tl.sum(state * C[None, :], axis=1)
112
+ if HAS_D:
113
+ out += x * D
114
+ if HAS_Z:
115
+ out *= z * tl.sigmoid(z)
116
+ tl.store(out_ptrs, out, mask=offs_m < dim)
117
+
118
+
119
+ def selective_state_update(state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False):
120
+ """
121
+ Argument:
122
+ state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
123
+ x: (batch, dim) or (batch, nheads, dim)
124
+ dt: (batch, dim) or (batch, nheads, dim)
125
+ A: (dim, dstate) or (nheads, dim, dstate)
126
+ B: (batch, dstate) or (batch, ngroups, dstate)
127
+ C: (batch, dstate) or (batch, ngroups, dstate)
128
+ D: (dim,) or (nheads, dim)
129
+ z: (batch, dim) or (batch, nheads, dim)
130
+ dt_bias: (dim,) or (nheads, dim)
131
+ Return:
132
+ out: (batch, dim) or (batch, nheads, dim)
133
+ """
134
+ has_heads = state.dim() > 3
135
+ if state.dim() == 3:
136
+ state = state.unsqueeze(1)
137
+ if x.dim() == 2:
138
+ x = x.unsqueeze(1)
139
+ if dt.dim() == 2:
140
+ dt = dt.unsqueeze(1)
141
+ if A.dim() == 2:
142
+ A = A.unsqueeze(0)
143
+ if B.dim() == 2:
144
+ B = B.unsqueeze(1)
145
+ if C.dim() == 2:
146
+ C = C.unsqueeze(1)
147
+ if D is not None and D.dim() == 1:
148
+ D = D.unsqueeze(0)
149
+ if z is not None and z.dim() == 2:
150
+ z = z.unsqueeze(1)
151
+ if dt_bias is not None and dt_bias.dim() == 1:
152
+ dt_bias = dt_bias.unsqueeze(0)
153
+ batch, nheads, dim, dstate = state.shape
154
+ assert x.shape == (batch, nheads, dim)
155
+ assert dt.shape == x.shape
156
+ assert A.shape == (nheads, dim, dstate)
157
+ ngroups = B.shape[1]
158
+ assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
159
+ assert B.shape == (batch, ngroups, dstate)
160
+ assert C.shape == B.shape
161
+ if D is not None:
162
+ assert D.shape == (nheads, dim)
163
+ if z is not None:
164
+ assert z.shape == x.shape
165
+ if dt_bias is not None:
166
+ assert dt_bias.shape == (nheads, dim)
167
+ out = torch.empty_like(x)
168
+ grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE_M']), batch, nheads)
169
+ z_strides = ((z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0))
170
+ # We don't want autotune since it will overwrite the state
171
+ # We instead tune by hand.
172
+ BLOCK_SIZE_M, num_warps = ((32, 4) if dstate <= 16
173
+ else ((16, 4) if dstate <= 32 else
174
+ ((8, 4) if dstate <= 64 else
175
+ ((4, 4) if dstate <= 128 else
176
+ ((4, 8))))))
177
+ tie_hdim = A.stride(-1) == 0 and A.stride(-2) == 0 and dt.stride(-1) == 0 and dt_bias.stride(-1) == 0
178
+ with torch.cuda.device(x.device.index):
179
+ _selective_scan_update_kernel[grid](
180
+ state, x, dt, dt_bias, A, B, C, D, z, out,
181
+ batch, nheads, dim, dstate, nheads // ngroups,
182
+ state.stride(0), state.stride(1), state.stride(2), state.stride(3),
183
+ x.stride(0), x.stride(1), x.stride(2),
184
+ dt.stride(0), dt.stride(1), dt.stride(2),
185
+ *(dt_bias.stride(0), dt_bias.stride(1)) if dt_bias is not None else 0,
186
+ A.stride(0), A.stride(1), A.stride(2),
187
+ B.stride(0), B.stride(1), B.stride(2),
188
+ C.stride(0), C.stride(1), C.stride(2),
189
+ *(D.stride(0), D.stride(1)) if D is not None else 0,
190
+ z_strides[0], z_strides[1], z_strides[2],
191
+ out.stride(0), out.stride(1), out.stride(2),
192
+ dt_softplus,
193
+ tie_hdim,
194
+ BLOCK_SIZE_M,
195
+ num_warps=num_warps,
196
+ )
197
+ if not has_heads:
198
+ out = out.squeeze(1)
199
+ return out
200
+
201
+
202
+ def selective_state_update_ref(state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False):
203
+ """
204
+ Argument:
205
+ state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
206
+ x: (batch, dim) or (batch, nheads, dim)
207
+ dt: (batch, dim) or (batch, nheads, dim)
208
+ A: (dim, dstate) or (nheads, dim, dstate)
209
+ B: (batch, dstate) or (batch, ngroups, dstate)
210
+ C: (batch, dstate) or (batch, ngroups, dstate)
211
+ D: (dim,) or (nheads, dim)
212
+ z: (batch, dim) or (batch, nheads, dim)
213
+ dt_bias: (dim,) or (nheads, dim)
214
+ Return:
215
+ out: (batch, dim) or (batch, nheads, dim)
216
+ """
217
+ has_heads = state.dim() > 3
218
+ if state.dim() == 3:
219
+ state = state.unsqueeze(1)
220
+ if x.dim() == 2:
221
+ x = x.unsqueeze(1)
222
+ if dt.dim() == 2:
223
+ dt = dt.unsqueeze(1)
224
+ if A.dim() == 2:
225
+ A = A.unsqueeze(0)
226
+ if B.dim() == 2:
227
+ B = B.unsqueeze(1)
228
+ if C.dim() == 2:
229
+ C = C.unsqueeze(1)
230
+ if D is not None and D.dim() == 1:
231
+ D = D.unsqueeze(0)
232
+ if z is not None and z.dim() == 2:
233
+ z = z.unsqueeze(1)
234
+ if dt_bias is not None and dt_bias.dim() == 1:
235
+ dt_bias = dt_bias.unsqueeze(0)
236
+ batch, nheads, dim, dstate = state.shape
237
+ assert x.shape == (batch, nheads, dim)
238
+ assert dt.shape == x.shape
239
+ assert A.shape == (nheads, dim, dstate)
240
+ ngroups = B.shape[1]
241
+ assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
242
+ assert B.shape == (batch, ngroups, dstate)
243
+ assert C.shape == B.shape
244
+ if D is not None:
245
+ assert D.shape == (nheads, dim)
246
+ if z is not None:
247
+ assert z.shape == x.shape
248
+ if dt_bias is not None:
249
+ assert dt_bias.shape == (nheads, dim)
250
+ dt = dt + dt_bias
251
+ dt = F.softplus(dt) if dt_softplus else dt
252
+ dA = torch.exp(rearrange(dt, "b h d -> b h d 1") * A) # (batch, nheads, dim, dstate)
253
+ B = repeat(B, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate)
254
+ C = repeat(C, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate)
255
+ dB = rearrange(dt, "b h d -> b h d 1") * rearrange(B, "b h n -> b h 1 n") # (batch, nheads, dim, dstate)
256
+ state.copy_(state * dA + dB * rearrange(x, "b h d -> b h d 1"))  # (batch, nheads, dim, dstate)
257
+ out = torch.einsum("bhdn,bhn->bhd", state.to(C.dtype), C)
258
+ if D is not None:
259
+ out += (x * D).to(out.dtype)
260
+ out = (out if z is None else out * F.silu(z)).to(x.dtype)
261
+ if not has_heads:
262
+ out = out.squeeze(1)
263
+ return out
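
Usage sketch: `selective_state_update_ref` is the pure-PyTorch fallback for a single decoding step, so its shapes are easy to check on CPU; the Triton `selective_state_update` takes the same arguments but requires CUDA. Sizes below are illustrative.

```python
import torch
from mamba_ssm.ops.triton.selective_state_update import selective_state_update_ref

batch, dim, dstate = 2, 32, 16
state = torch.zeros(batch, dim, dstate)   # recurrent SSM state, updated in place
x = torch.randn(batch, dim)               # input for the current step
dt = torch.rand(batch, dim)               # per-channel step size
A = -torch.rand(dim, dstate)              # negative real A
B = torch.randn(batch, dstate)
C = torch.randn(batch, dstate)
D = torch.ones(dim)

out = selective_state_update_ref(state, x, dt, A, B, C, D=D, dt_softplus=True)
print(out.shape)                          # (batch, dim); `state` now holds the updated SSM state
```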
mamba_ssm/utils/__init__.py ADDED
File without changes
mamba_ssm/utils/generation.py ADDED
@@ -0,0 +1,387 @@
1
+ # Copyright (c) 2023, Albert Gu, Tri Dao.
2
+ import gc
3
+ import time
4
+ from collections import namedtuple
5
+ from dataclasses import dataclass, field
6
+ from functools import partial
7
+ from typing import Callable, Optional, Sequence, Union
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from einops import rearrange, repeat
12
+ from torch import Tensor
13
+ from torch.profiler import ProfilerActivity, profile, record_function
14
+ from transformers.generation import GreedySearchDecoderOnlyOutput, SampleDecoderOnlyOutput, TextStreamer
15
+
16
+
17
+ @dataclass
18
+ class InferenceParams:
19
+ """Inference parameters that are passed to the main model in order
20
+ to efficienly calculate and store the context during inference."""
21
+
22
+ max_seqlen: int
23
+ max_batch_size: int
24
+ seqlen_offset: int = 0
25
+ batch_size_offset: int = 0
26
+ key_value_memory_dict: dict = field(default_factory=dict)
27
+ lengths_per_sample: Optional[Tensor] = None
28
+
29
+ def reset(self, max_seqlen, max_batch_size):
30
+ self.max_seqlen = max_seqlen
31
+ self.max_batch_size = max_batch_size
32
+ self.seqlen_offset = 0
33
+ if self.lengths_per_sample is not None:
34
+ self.lengths_per_sample.zero_()
35
+
36
+
37
+ def modify_logits_for_min_p_filtering(logits, min_p):
38
+ """Set the logits for none min_p values to -inf. Done in-place."""
39
+ if min_p <= 0.0 or min_p >= 1.0:
40
+ return
41
+ indices_to_remove = logits < min_p
42
+ logits.masked_fill_(indices_to_remove, float("-Inf"))
43
+ # https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py
44
+ # https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L231
45
+ def modify_logits_for_top_k_filtering(logits, top_k):
46
+ """Set the logits for none top-k values to -inf. Done in-place."""
47
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
48
+ logits.masked_fill_(indices_to_remove, float("-Inf"))
49
+
50
+
51
+ # https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py
52
+ # https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L170
53
+ def modify_logits_for_top_p_filtering(logits, top_p):
54
+ """Set the logits for none top-p values to -inf. Done in-place."""
55
+ if top_p <= 0.0 or top_p >= 1.0:
56
+ return
57
+ # First sort and calculate cumulative sum of probabilities.
58
+ sorted_logits, sorted_indices = torch.sort(logits, descending=False)
59
+ cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
60
+ # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
61
+ sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
62
+ # scatter sorted tensors to original indexing
63
+ indices_to_remove = sorted_indices_to_remove.scatter(
64
+ 1, sorted_indices, sorted_indices_to_remove
65
+ )
66
+ logits.masked_fill_(indices_to_remove, float("-inf"))
67
+
68
+
69
+ def modify_logit_for_repetition_penalty(logits, prev_output_tokens, repetition_penalty=1.0):
70
+ """Apply repetition penalty. See https://arxiv.org/abs/1909.05858
71
+ logits: (batch_size, vocab_size)
72
+ prev_output_tokens: (batch_size, seq_len)
73
+ """
74
+ if repetition_penalty == 1.0:
75
+ return logits
76
+ score = torch.gather(logits, 1, prev_output_tokens)
77
+ # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
78
+ score = torch.where(score < 0, score * repetition_penalty, score / repetition_penalty)
79
+ logits.scatter_(1, prev_output_tokens, score)
80
+ return logits
81
+
82
+
83
+ def sample(logits, top_k=1, top_p=0.0, min_p=0.0, temperature=1.0):
84
+ """Sample from top-k logits.
85
+ Arguments:
86
+ logits: Tensor of shape (batch_size, vocab_size)
87
+ """
88
+ if top_k == 1: # Short-circuit for greedy decoding
89
+ return logits.argmax(dim=-1)
90
+ else:
91
+ if top_p > 0.0:
92
+ assert top_p <= 1.0, "top-p should be in (0, 1]."
93
+ if top_k > 0:
94
+ top_k = min(top_k, logits.size(-1)) # Safety check
95
+ logits_top, indices = torch.topk(logits, top_k, dim=-1)
96
+ if temperature != 1.0:
97
+ logits_top /= temperature
98
+ modify_logits_for_top_p_filtering(logits_top, top_p)
99
+ return indices[
100
+ torch.arange(indices.shape[0], device=indices.device),
101
+ torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1),
102
+ ]
103
+ else:
104
+ if min_p > 0.0:
105
+ logits_top = logits.clone()
106
+ max_prob = logits_top[..., 0].item()
107
+ min_prob = max_prob * min_p
108
+ modify_logits_for_min_p_filtering(logits_top, min_p)
109
+ if temperature != 1.0:
110
+ logits_top /= temperature
111
+ return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1)
112
+ # Clone so that when we modify for top_p we don't change the original logits
113
+ logits_top = logits / temperature if temperature != 1.0 else logits.clone()
114
+ modify_logits_for_top_p_filtering(logits_top, top_p)
115
+ return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(
116
+ dim=-1
117
+ )
118
+
119
+
120
+ @torch.inference_mode()
121
+ def decode(
122
+ input_ids,
123
+ model,
124
+ max_length,
125
+ top_k=1,
126
+ top_p=0.0,
127
+ min_p=0.0,
128
+ temperature=1.0,
129
+ repetition_penalty=1.0,
130
+ eos_token_id=None,
131
+ teacher_outputs=None,
132
+ vocab_size=None,
133
+ cg=False,
134
+ enable_timing=False,
135
+ streamer: Optional[TextStreamer] = None
136
+ ):
137
+ """Decoding, either greedy or with top-k or top-p sampling.
138
+ If top-k = 0, don't limit the number of candidates (pure sampling).
139
+ Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first,
140
+ then top-p.
141
+ We assume that all sequences in the same batch have the same length.
142
+
143
+ Arguments:
144
+ input_ids: (batch, seq_len)
145
+ max_length: int
146
+ teacher_outputs (optional): (batch, seq_len). If provided, instead of sampling from the
147
+ logits, the next token is taken from the teacher_outputs. Useful for testing.
148
+ Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields:
149
+ sequences: (batch, max_length)
150
+ scores: tuples of (batch, vocab_size)
151
+ """
152
+ if streamer is not None:
153
+ streamer.put(input_ids.cpu())
154
+
155
+ batch_size, seqlen_og = input_ids.shape
156
+ teacher_output_len = teacher_outputs.shape[1] if teacher_outputs is not None else 0
157
+ if cg:
158
+ if not hasattr(model, "_decoding_cache"):
159
+ model._decoding_cache = None
160
+ model._decoding_cache = update_graph_cache(
161
+ model,
162
+ model._decoding_cache,
163
+ batch_size,
164
+ seqlen_og,
165
+ max_length,
166
+ )
167
+ inference_params = model._decoding_cache.inference_params
168
+ inference_params.reset(max_length, batch_size)
169
+ else:
170
+ inference_params = InferenceParams(max_seqlen=max_length, max_batch_size=batch_size)
171
+
172
+ def get_logits(input_ids, inference_params):
173
+ decoding = inference_params.seqlen_offset > 0
174
+ if decoding:
175
+ position_ids = torch.full(
176
+ (batch_size, 1),
177
+ inference_params.seqlen_offset,
178
+ dtype=torch.long,
179
+ device=input_ids.device,
180
+ )
181
+ else:
182
+ position_ids = None
183
+ if not cg or not decoding:
184
+ logits = model(
185
+ input_ids,
186
+ position_ids=position_ids,
187
+ inference_params=inference_params,
188
+ num_last_tokens=1,
189
+ ).logits.squeeze(dim=1)
190
+ else:
191
+ logits = model._decoding_cache.run(
192
+ input_ids, position_ids, inference_params.seqlen_offset
193
+ ).squeeze(dim=1)
194
+ return logits[..., :vocab_size] if vocab_size is not None else logits
195
+
196
+ def sample_tokens(logits, inference_params):
197
+ if teacher_outputs is None or teacher_output_len <= inference_params.seqlen_offset:
198
+ token = sample(logits, top_k=top_k, top_p=top_p, min_p=min_p, temperature=temperature)
199
+ else:
200
+ token = teacher_outputs[:, inference_params.seqlen_offset]
201
+ # return rearrange(token, "b -> b 1")
202
+ return token.unsqueeze(1)
203
+
204
+ def should_stop(current_token, inference_params):
205
+ if inference_params.seqlen_offset == 0:
206
+ return False
207
+ if eos_token_id is not None and (current_token == eos_token_id).all():
208
+ return True
209
+ if inference_params.seqlen_offset >= max_length - 1:
210
+ return True
211
+ return False
212
+
213
+ start = torch.cuda.Event(enable_timing=enable_timing)
214
+ end = torch.cuda.Event(enable_timing=enable_timing)
215
+
216
+ if enable_timing:
217
+ start.record()
218
+ scores, sequences = [], [input_ids]
219
+ sequences_cat = input_ids
220
+ while not should_stop(sequences[-1], inference_params):
221
+ scores.append(get_logits(sequences[-1], inference_params))
222
+ inference_params.seqlen_offset += sequences[-1].shape[1]
223
+ if repetition_penalty == 1.0:
224
+ sampled_tokens = sample_tokens(scores[-1], inference_params)
225
+ else:
226
+ logits = modify_logit_for_repetition_penalty(
227
+ scores[-1].clone(), sequences_cat, repetition_penalty
228
+ )
229
+ sampled_tokens = sample_tokens(logits, inference_params)
230
+ sequences_cat = torch.cat([sequences_cat, sampled_tokens], dim=1)
231
+ sequences.append(sampled_tokens)
232
+ if streamer is not None:
233
+ streamer.put(sampled_tokens.cpu())
234
+ if streamer is not None:
235
+ streamer.end()
236
+ if enable_timing:
237
+ end.record()
238
+ torch.cuda.synchronize()
239
+ print(f"Prompt processing + decoding time: {(start.elapsed_time(end)):.0f}ms")
240
+ output_cls = GreedySearchDecoderOnlyOutput if top_k == 1 else SampleDecoderOnlyOutput
241
+ return output_cls(sequences=torch.cat(sequences, dim=1), scores=tuple(scores))
242
+
243
+
244
+ class GenerationMixin:
245
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
246
+ raise NotImplementedError
247
+
248
+ def generate(
249
+ self,
250
+ input_ids,
251
+ max_length,
252
+ top_k=1,
253
+ top_p=0.0,
254
+ min_p=0.0,
255
+ temperature=1.0,
256
+ return_dict_in_generate=False,
257
+ output_scores=False,
258
+ **kwargs,
259
+ ):
260
+ output = decode(
261
+ input_ids, self, max_length, top_k=top_k, top_p=top_p, min_p = min_p, temperature=temperature, **kwargs
262
+ )
263
+ if not output_scores:
264
+ output.scores = None
265
+ return output if return_dict_in_generate else output.sequences
266
+
267
+
268
+ @dataclass
269
+ class DecodingCGCache:
270
+ max_batch_size: int = 0
271
+ max_seqlen: int = 0
272
+ device = None
273
+ dtype = None
274
+ callables: dict = field(default_factory=dict)
275
+ mempool = None
276
+ inference_params: Optional[InferenceParams] = None
277
+ run: Optional[Callable] = None
278
+
279
+
280
+ @torch.inference_mode()
281
+ def update_graph_cache(
282
+ model,
283
+ cache,
284
+ batch_size,
285
+ seqlen_og,
286
+ max_seqlen,
287
+ decoding_seqlens=(1,),
288
+ dtype=None,
289
+ n_warmups=2,
290
+ ):
291
+ if cache is None:
292
+ cache = DecodingCGCache()
293
+ param_example = next(iter(model.parameters()))
294
+ device = param_example.device
295
+ if dtype is None:
296
+ dtype = param_example.dtype
297
+ if (
298
+ (device, dtype) != (cache.device, cache.dtype)
299
+ or batch_size > cache.max_batch_size
300
+ or max_seqlen > cache.max_seqlen
301
+ ): # Invalidate the cache
302
+ cache.callables = {}
303
+ cache.mempool = None
304
+ cache.inference_params = None
305
+ gc.collect()
306
+ cache.device, cache.dtype = device, dtype
307
+ cache.max_batch_size, cache.max_seqlen = batch_size, max_seqlen
308
+ assert hasattr(model, "allocate_inference_cache"), "CUDA graph decoding requires that the model has a method allocate_inference_cache"
309
+ inf_cache = model.allocate_inference_cache(batch_size, max_seqlen, dtype)
310
+ lengths_per_sample = torch.full((batch_size,), seqlen_og, dtype=torch.int32, device=device)
311
+ cache.inference_params = InferenceParams(
312
+ max_seqlen=max_seqlen,
313
+ max_batch_size=batch_size,
314
+ seqlen_offset=seqlen_og,
315
+ key_value_memory_dict=inf_cache,
316
+ lengths_per_sample=lengths_per_sample,
317
+ )
318
+ cache.mempool = torch.cuda.graphs.graph_pool_handle()
319
+ for decoding_seqlen in decoding_seqlens:
320
+ if (batch_size, decoding_seqlen) not in cache.callables:
321
+ cache.callables[batch_size, decoding_seqlen] = capture_graph(
322
+ model,
323
+ cache.inference_params,
324
+ batch_size,
325
+ max_seqlen,
326
+ decoding_seqlen=decoding_seqlen,
327
+ mempool=cache.mempool,
328
+ n_warmups=n_warmups,
329
+ )
330
+
331
+ def dispatch(input_ids, position_ids, seqlen):
332
+ batch_size, decoding_seqlen = input_ids.shape[:2]
333
+ return cache.callables[batch_size, decoding_seqlen](input_ids, position_ids, seqlen)
334
+
335
+ cache.run = dispatch
336
+ cache.inference_params.seqlen_offset = 0 # Reset so it's not confusing
337
+ return cache
338
+
339
+
340
+ def capture_graph(
341
+ model, inference_params, batch_size, max_seqlen, decoding_seqlen=1, mempool=None, n_warmups=2
342
+ ):
343
+ device = next(iter(model.parameters())).device
344
+ input_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device)
345
+ position_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device)
346
+ seqlen_offset_og = inference_params.seqlen_offset
347
+ inference_params.seqlen_offset = max_seqlen - decoding_seqlen
348
+ inference_params.lengths_per_sample[:] = inference_params.seqlen_offset
349
+
350
+ # Warmup before capture
351
+ s = torch.cuda.Stream()
352
+ s.wait_stream(torch.cuda.current_stream())
353
+ with torch.cuda.stream(s):
354
+ for _ in range(n_warmups):
355
+ logits = model(
356
+ input_ids,
357
+ position_ids=position_ids,
358
+ inference_params=inference_params,
359
+ num_last_tokens=decoding_seqlen,
360
+ ).logits
361
+ s.synchronize()
362
+ # This might be needed for correctness if we run with NCCL_GRAPH_MIXING_SUPPORT=0,
363
+ # which requires that graph launch and non-captured launch to not overlap (I think,
364
+ # that's how I interpret the documentation). I'm not sure if this is required.
365
+ if torch.distributed.is_initialized():
366
+ torch.distributed.barrier()
367
+ torch.cuda.current_stream().wait_stream(s)
368
+ # Captures the graph
369
+ # To allow capture, automatically sets a side stream as the current stream in the context
370
+ graph = torch.cuda.CUDAGraph()
371
+ with torch.cuda.graph(graph, pool=mempool):
372
+ logits = model(
373
+ input_ids,
374
+ position_ids=position_ids,
375
+ inference_params=inference_params,
376
+ num_last_tokens=decoding_seqlen,
377
+ ).logits
378
+
379
+ def run(new_input_ids, new_position_ids, seqlen):
380
+ inference_params.lengths_per_sample[:] = seqlen
381
+ input_ids.copy_(new_input_ids)
382
+ position_ids.copy_(new_position_ids)
383
+ graph.replay()
384
+ return logits.clone()
385
+
386
+ inference_params.seqlen_offset = seqlen_offset_og
387
+ return run
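
Usage sketch: the `sample` helper above is self-contained and CPU-friendly, which makes it a convenient way to see how `top_k`, `top_p`, and `temperature` interact before wiring up `decode` or `GenerationMixin.generate`.

```python
import torch
from mamba_ssm.utils.generation import sample

logits = torch.randn(4, 50_000)                   # (batch, vocab_size)

greedy = sample(logits, top_k=1)                  # plain argmax decoding
top_k = sample(logits, top_k=50, temperature=0.8) # temperature-scaled top-k sampling
nucleus = sample(logits, top_k=0, top_p=0.9)      # pure top-p (nucleus) sampling
print(greedy.shape, top_k.shape, nucleus.shape)   # each (4,)
```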
mamba_ssm/utils/hf.py ADDED
@@ -0,0 +1,23 @@
+ import json
+
+ import torch
+
+ from transformers.utils import WEIGHTS_NAME, CONFIG_NAME
+ from transformers.utils.hub import cached_file
+
+
+ def load_config_hf(model_name):
+     resolved_archive_file = cached_file(model_name, CONFIG_NAME, _raise_exceptions_for_missing_entries=False)
+     return json.load(open(resolved_archive_file))
+
+
+ def load_state_dict_hf(model_name, device=None, dtype=None):
+     # If not fp32, then we don't want to load directly to the GPU
+     mapped_device = "cpu" if dtype not in [torch.float32, None] else device
+     resolved_archive_file = cached_file(model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False)
+     state_dict = torch.load(resolved_archive_file, map_location=mapped_device)
+     # Convert dtype before moving to GPU to save memory
+     if dtype is not None:
+         state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()}
+     state_dict = {k: v.to(device=device) for k, v in state_dict.items()}
+     return state_dict
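
Usage sketch for the two helpers above; the checkpoint name is only illustrative, and the calls download from the Hugging Face Hub.

```python
from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf

repo = "state-spaces/mamba-130m"               # illustrative checkpoint name
config = load_config_hf(repo)                  # plain dict parsed from config.json
state_dict = load_state_dict_hf(repo, device="cpu", dtype=None)
print(config.get("d_model"), len(state_dict))  # e.g. hidden size and number of tensors
```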
models/codec_module.py ADDED
@@ -0,0 +1,183 @@
1
+ # Reference: https://github.com/yxlu-0102/MP-SENet/blob/main/models/generator.py
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from einops import rearrange
6
+ from .lsigmoid import LearnableSigmoid2D
7
+
8
+ def get_padding(kernel_size, dilation=1):
9
+ """
10
+ Calculate the padding size for a convolutional layer.
11
+
12
+ Args:
13
+ - kernel_size (int): Size of the convolutional kernel.
14
+ - dilation (int, optional): Dilation rate of the convolution. Defaults to 1.
15
+
16
+ Returns:
17
+ - int: Calculated padding size.
18
+ """
19
+ return int((kernel_size * dilation - dilation) / 2)
20
+
21
+ def get_padding_2d(kernel_size, dilation=(1, 1)):
22
+ """
23
+ Calculate the padding size for a 2D convolutional layer.
24
+
25
+ Args:
26
+ - kernel_size (tuple): Size of the convolutional kernel (height, width).
27
+ - dilation (tuple, optional): Dilation rate of the convolution (height, width). Defaults to (1, 1).
28
+
29
+ Returns:
30
+ - tuple: Calculated padding size (height, width).
31
+ """
32
+ return (int((kernel_size[0] * dilation[0] - dilation[0]) / 2),
33
+ int((kernel_size[1] * dilation[1] - dilation[1]) / 2))
34
+
35
+ class DenseBlock(nn.Module):
36
+ """
37
+ DenseBlock module consisting of multiple convolutional layers with dilation.
38
+ """
39
+ def __init__(self, cfg, kernel_size=(3, 3), depth=4):
40
+ super(DenseBlock, self).__init__()
41
+ self.cfg = cfg
42
+ self.depth = depth
43
+ self.dense_block = nn.ModuleList()
44
+ self.hid_feature = cfg['model_cfg']['hid_feature']
45
+
46
+ for i in range(depth):
47
+ dil = 2 ** i
48
+ dense_conv = nn.Sequential(
49
+ nn.Conv2d(self.hid_feature * (i + 1), self.hid_feature, kernel_size,
50
+ dilation=(dil, 1), padding=get_padding_2d(kernel_size, (dil, 1))),
51
+ nn.InstanceNorm2d(self.hid_feature, affine=True),
52
+ nn.PReLU(self.hid_feature)
53
+ )
54
+ self.dense_block.append(dense_conv)
55
+
56
+ def forward(self, x):
57
+ """
58
+ Forward pass for the DenseBlock module.
59
+
60
+ Args:
61
+ - x (torch.Tensor): Input tensor.
62
+
63
+ Returns:
64
+ - torch.Tensor: Output tensor after processing through the dense block.
65
+ """
66
+ skip = x
67
+ for i in range(self.depth):
68
+ x = self.dense_block[i](skip)
69
+ skip = torch.cat([x, skip], dim=1)
70
+ return x
71
+
72
+ class DenseEncoder(nn.Module):
73
+ """
74
+ DenseEncoder module consisting of initial convolution, dense block, and a final convolution.
75
+ """
76
+ def __init__(self, cfg):
77
+ super(DenseEncoder, self).__init__()
78
+ self.cfg = cfg
79
+ self.input_channel = cfg['model_cfg']['input_channel']
80
+ self.hid_feature = cfg['model_cfg']['hid_feature']
81
+
82
+ self.dense_conv_1 = nn.Sequential(
83
+ nn.Conv2d(self.input_channel, self.hid_feature, (1, 1)),
84
+ nn.InstanceNorm2d(self.hid_feature, affine=True),
85
+ nn.PReLU(self.hid_feature)
86
+ )
87
+
88
+ self.dense_block = DenseBlock(cfg, depth=4)
89
+
90
+ self.dense_conv_2 = nn.Sequential(
91
+ nn.Conv2d(self.hid_feature, self.hid_feature, (1, 3), stride=(1, 2)),
92
+ nn.InstanceNorm2d(self.hid_feature, affine=True),
93
+ nn.PReLU(self.hid_feature)
94
+ )
95
+
96
+ def forward(self, x):
97
+ """
98
+ Forward pass for the DenseEncoder module.
99
+
100
+ Args:
101
+ - x (torch.Tensor): Input tensor.
102
+
103
+ Returns:
104
+ - torch.Tensor: Encoded tensor.
105
+ """
106
+ x = self.dense_conv_1(x) # [batch, hid_feature, time, freq]
107
+ x = self.dense_block(x) # [batch, hid_feature, time, freq]
108
+ x = self.dense_conv_2(x) # [batch, hid_feature, time, freq//2]
109
+ return x
110
+
111
+ class MagDecoder(nn.Module):
112
+ """
113
+ MagDecoder module for decoding magnitude information.
114
+ """
115
+ def __init__(self, cfg):
116
+ super(MagDecoder, self).__init__()
117
+ self.dense_block = DenseBlock(cfg, depth=4)
118
+ self.hid_feature = cfg['model_cfg']['hid_feature']
119
+ self.output_channel = cfg['model_cfg']['output_channel']
120
+ self.n_fft = cfg['stft_cfg']['n_fft']
121
+ self.beta = cfg['model_cfg']['beta']
122
+
123
+ self.mask_conv = nn.Sequential(
124
+ nn.ConvTranspose2d(self.hid_feature, self.hid_feature, (1, 3), stride=(1, 2)),
125
+ nn.Conv2d(self.hid_feature, self.output_channel, (1, 1)),
126
+ nn.InstanceNorm2d(self.output_channel, affine=True),
127
+ nn.PReLU(self.output_channel),
128
+ nn.Conv2d(self.output_channel, self.output_channel, (1, 1))
129
+ )
130
+ self.lsigmoid = LearnableSigmoid2D(self.n_fft // 2 + 1, beta=self.beta)
131
+
132
+ def forward(self, x):
133
+ """
134
+ Forward pass for the MagDecoder module.
135
+
136
+ Args:
137
+ - x (torch.Tensor): Input tensor.
138
+
139
+ Returns:
140
+ - torch.Tensor: Decoded tensor with magnitude information.
141
+ """
142
+ x = self.dense_block(x)
143
+ x = self.mask_conv(x)
144
+ x = rearrange(x, 'b c t f -> b f t c').squeeze(-1)
145
+ x = self.lsigmoid(x)
146
+ x = rearrange(x, 'b f t -> b t f').unsqueeze(1)
147
+ return x
148
+
149
+ class PhaseDecoder(nn.Module):
150
+ """
151
+ PhaseDecoder module for decoding phase information.
152
+ """
153
+ def __init__(self, cfg):
154
+ super(PhaseDecoder, self).__init__()
155
+ self.dense_block = DenseBlock(cfg, depth=4)
156
+ self.hid_feature = cfg['model_cfg']['hid_feature']
157
+ self.output_channel = cfg['model_cfg']['output_channel']
158
+
159
+ self.phase_conv = nn.Sequential(
160
+ nn.ConvTranspose2d(self.hid_feature, self.hid_feature, (1, 3), stride=(1, 2)),
161
+ nn.InstanceNorm2d(self.hid_feature, affine=True),
162
+ nn.PReLU(self.hid_feature)
163
+ )
164
+
165
+ self.phase_conv_r = nn.Conv2d(self.hid_feature, self.output_channel, (1, 1))
166
+ self.phase_conv_i = nn.Conv2d(self.hid_feature, self.output_channel, (1, 1))
167
+
168
+ def forward(self, x):
169
+ """
170
+ Forward pass for the PhaseDecoder module.
171
+
172
+ Args:
173
+ - x (torch.Tensor): Input tensor.
174
+
175
+ Returns:
176
+ - torch.Tensor: Decoded tensor with phase information.
177
+ """
178
+ x = self.dense_block(x)
179
+ x = self.phase_conv(x)
180
+ x_r = self.phase_conv_r(x)
181
+ x_i = self.phase_conv_i(x)
182
+ x = torch.atan2(x_i, x_r)
183
+ return x
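
Shape sketch for the encoder/decoder trio above. The config values are illustrative only (the real ones come from this repo's YAML config), and `models/lsigmoid.py` is assumed to be importable from the repo root.

```python
import torch
from models.codec_module import DenseEncoder, MagDecoder, PhaseDecoder

cfg = {
    "model_cfg": {"input_channel": 2, "hid_feature": 32, "output_channel": 1, "beta": 2.0},
    "stft_cfg": {"n_fft": 400},
}
enc, mag_dec, pha_dec = DenseEncoder(cfg), MagDecoder(cfg), PhaseDecoder(cfg)

x = torch.randn(1, 2, 50, 201)   # [B, mag+phase, T, F] with F = n_fft // 2 + 1
h = enc(x)                       # [1, 32, 50, 100]  frequency axis halved by the strided conv
mask = mag_dec(h)                # [1, 1, 50, 201]   bounded magnitude mask (learnable sigmoid)
phase = pha_dec(h)               # [1, 1, 50, 201]   wrapped phase from atan2
print(h.shape, mask.shape, phase.shape)
```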
models/discriminator.py ADDED
@@ -0,0 +1,56 @@
1
+ # References: https://github.com/yxlu-0102/MP-SENet/blob/main/models/discriminator.py
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import numpy as np
6
+ from pesq import pesq
7
+ from joblib import Parallel, delayed
8
+ from models.lsigmoid import LearnableSigmoid1D
9
+
10
+ def pesq_loss(clean, noisy, sr=16000):
11
+ try:
12
+ pesq_score = pesq(sr, clean, noisy, 'wb')
13
+ except:
14
+ # error can happen due to silent period
15
+ pesq_score = -1
16
+ return pesq_score
17
+
18
+
19
+ def batch_pesq(clean, noisy, cfg):
20
+ num_worker = cfg['env_setting']['num_workers']
21
+ pesq_score = Parallel(n_jobs=num_worker)(delayed(pesq_loss)(c, n) for c, n in zip(clean, noisy))
22
+ pesq_score = np.array(pesq_score)
23
+ if -1 in pesq_score:
24
+ return None
25
+ pesq_score = (pesq_score - 1) / 3.5
26
+ return torch.FloatTensor(pesq_score)
27
+
28
+
29
+ class MetricDiscriminator(nn.Module):
30
+ def __init__(self, dim=16, in_channel=2):
31
+ super(MetricDiscriminator, self).__init__()
32
+ self.layers = nn.Sequential(
33
+ nn.utils.spectral_norm(nn.Conv2d(in_channel, dim, (4,4), (2,2), (1,1), bias=False)),
34
+ nn.InstanceNorm2d(dim, affine=True),
35
+ nn.PReLU(dim),
36
+ nn.utils.spectral_norm(nn.Conv2d(dim, dim*2, (4,4), (2,2), (1,1), bias=False)),
37
+ nn.InstanceNorm2d(dim*2, affine=True),
38
+ nn.PReLU(dim*2),
39
+ nn.utils.spectral_norm(nn.Conv2d(dim*2, dim*4, (4,4), (2,2), (1,1), bias=False)),
40
+ nn.InstanceNorm2d(dim*4, affine=True),
41
+ nn.PReLU(dim*4),
42
+ nn.utils.spectral_norm(nn.Conv2d(dim*4, dim*8, (4,4), (2,2), (1,1), bias=False)),
43
+ nn.InstanceNorm2d(dim*8, affine=True),
44
+ nn.PReLU(dim*8),
45
+ nn.AdaptiveMaxPool2d(1),
46
+ nn.Flatten(),
47
+ nn.utils.spectral_norm(nn.Linear(dim*8, dim*4)),
48
+ nn.Dropout(0.3),
49
+ nn.PReLU(dim*4),
50
+ nn.utils.spectral_norm(nn.Linear(dim*4, 1)),
51
+ LearnableSigmoid1D(1)
52
+ )
53
+
54
+ def forward(self, x, y):
55
+ xy = torch.stack((x, y), dim=1)
56
+ return self.layers(xy)
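
A brief, hedged usage sketch (not part of this commit): `batch_pesq` turns per-utterance PESQ scores into 0-1 training targets, and `MetricDiscriminator` predicts that score from a pair of magnitude spectrograms. The variable names, shapes, and the minimal config dict below are illustrative assumptions.

```python
# Hedged sketch: data and shapes are placeholders, not values from this repo.
import numpy as np
import torch
from models.discriminator import MetricDiscriminator, batch_pesq

cfg = {'env_setting': {'num_workers': 2}}      # assumed minimal config
disc = MetricDiscriminator()

# Real training passes actual clean/enhanced waveforms; PESQ may fail on
# synthetic noise like this, in which case batch_pesq returns None.
clean = [np.random.randn(16000).astype(np.float32) for _ in range(2)]
enhanced = [c + 0.01 * np.random.randn(16000).astype(np.float32) for c in clean]
targets = batch_pesq(clean, enhanced, cfg)     # FloatTensor in [0, 1], or None

clean_mag = torch.rand(2, 201, 321)            # [B, F, T] magnitude placeholders
enhanced_mag = torch.rand(2, 201, 321)
pred = disc(clean_mag, enhanced_mag)           # [B, 1] predicted quality score
```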
models/generator.py ADDED
@@ -0,0 +1,72 @@
+ import torch
+ import torch.nn as nn
+ from einops import rearrange
+ from .mamba_block import TFMambaBlock
+ from .codec_module import DenseEncoder, MagDecoder, PhaseDecoder
+ 
+ class SEMamba(nn.Module):
+     """
+     SEMamba model for speech enhancement using Mamba blocks.
+ 
+     This model uses a dense encoder, multiple Mamba blocks, and separate magnitude
+     and phase decoders to process noisy magnitude and phase inputs.
+     """
+     def __init__(self, cfg):
+         """
+         Initialize the SEMamba model.
+ 
+         Args:
+         - cfg: Configuration object containing model parameters.
+         """
+         super(SEMamba, self).__init__()
+         self.cfg = cfg
+         self.num_tscblocks = cfg['model_cfg']['num_tfmamba'] if cfg['model_cfg']['num_tfmamba'] is not None else 4  # default tfmamba: 4
+ 
+         # Initialize dense encoder
+         self.dense_encoder = DenseEncoder(cfg)
+ 
+         # Initialize Mamba blocks
+         self.TSMamba = nn.ModuleList([TFMambaBlock(cfg) for _ in range(self.num_tscblocks)])
+ 
+         # Initialize decoders
+         self.mask_decoder = MagDecoder(cfg)
+         self.phase_decoder = PhaseDecoder(cfg)
+ 
+     def forward(self, noisy_mag, noisy_pha):
+         """
+         Forward pass for the SEMamba model.
+ 
+         Args:
+         - noisy_mag (torch.Tensor): Noisy magnitude input tensor [B, F, T].
+         - noisy_pha (torch.Tensor): Noisy phase input tensor [B, F, T].
+ 
+         Returns:
+         - denoised_mag (torch.Tensor): Denoised magnitude tensor [B, F, T].
+         - denoised_pha (torch.Tensor): Denoised phase tensor [B, F, T].
+         - denoised_com (torch.Tensor): Denoised complex tensor [B, F, T, 2].
+         """
+         # Reshape inputs
+         noisy_mag = rearrange(noisy_mag, 'b f t -> b t f').unsqueeze(1)  # [B, 1, T, F]
+         noisy_pha = rearrange(noisy_pha, 'b f t -> b t f').unsqueeze(1)  # [B, 1, T, F]
+ 
+         # Concatenate magnitude and phase inputs
+         x = torch.cat((noisy_mag, noisy_pha), dim=1)  # [B, 2, T, F]
+ 
+         # Encode input
+         x = self.dense_encoder(x)
+ 
+         # Apply Mamba blocks
+         for block in self.TSMamba:
+             x = block(x)
+ 
+         # Decode magnitude and phase
+         denoised_mag = rearrange(self.mask_decoder(x) * noisy_mag, 'b c t f -> b f t c').squeeze(-1)
+         denoised_pha = rearrange(self.phase_decoder(x), 'b c t f -> b f t c').squeeze(-1)
+ 
+         # Combine denoised magnitude and phase into a complex representation
+         denoised_com = torch.stack(
+             (denoised_mag * torch.cos(denoised_pha), denoised_mag * torch.sin(denoised_pha)),
+             dim=-1
+         )
+ 
+         return denoised_mag, denoised_pha, denoised_com
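
A minimal, hedged inference sketch showing how the generator ties into the STFT helpers and the recipe. The recipe path, the dummy waveform, and the absence of checkpoint loading are assumptions for illustration; the Mamba kernels used by this repo require a CUDA device.

```python
import torch
import yaml
from models.generator import SEMamba
from models.stfts import mag_phase_stft, mag_phase_istft

with open("recipes/SEMamba_advanced.yaml") as f:   # assumed recipe path
    cfg = yaml.safe_load(f)
stft = cfg['stft_cfg']
comp = cfg['model_cfg']['compress_factor']

device = "cuda"                                    # mamba_ssm kernels are CUDA-only
model = SEMamba(cfg).to(device).eval()
# In practice, trained weights would be loaded here from a checkpoint.

wav = torch.randn(1, 32000, device=device)         # placeholder 2 s of 16 kHz audio
mag, pha, _ = mag_phase_stft(wav, stft['n_fft'], stft['hop_size'], stft['win_size'], comp)
with torch.no_grad():
    mag_g, pha_g, _ = model(mag, pha)
enhanced = mag_phase_istft(mag_g, pha_g, stft['n_fft'], stft['hop_size'], stft['win_size'], comp)
```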
models/loss.py ADDED
@@ -0,0 +1,145 @@
+ # Reference: https://github.com/yxlu-0102/MP-SENet/blob/main/models/generator.py
+ 
+ import torch
+ import torch.nn as nn
+ import numpy as np
+ from pesq import pesq
+ from joblib import Parallel, delayed
+ 
+ def phase_losses(phase_r, phase_g, cfg):
+     """
+     Calculate phase losses including in-phase loss, gradient delay loss,
+     and integrated absolute frequency loss between reference and generated phases.
+ 
+     Args:
+         phase_r (torch.Tensor): Reference phase tensor of shape (batch, freq, time).
+         phase_g (torch.Tensor): Generated phase tensor of shape (batch, freq, time).
+         cfg (dict): Configuration dictionary containing parameters like n_fft.
+ 
+     Returns:
+         tuple: Tuple containing in-phase loss, gradient delay loss, and integrated absolute frequency loss.
+     """
+     dim_freq = cfg['stft_cfg']['n_fft'] // 2 + 1  # Frequency dimension
+     dim_time = phase_r.size(-1)                   # Time dimension
+ 
+     # Construct gradient delay matrix
+     gd_matrix = (torch.triu(torch.ones(dim_freq, dim_freq), diagonal=1) -
+                  torch.triu(torch.ones(dim_freq, dim_freq), diagonal=2) -
+                  torch.eye(dim_freq)).to(phase_g.device)
+ 
+     # Apply gradient delay matrix to reference and generated phases
+     gd_r = torch.matmul(phase_r.permute(0, 2, 1), gd_matrix)
+     gd_g = torch.matmul(phase_g.permute(0, 2, 1), gd_matrix)
+ 
+     # Construct integrated absolute frequency matrix
+     iaf_matrix = (torch.triu(torch.ones(dim_time, dim_time), diagonal=1) -
+                   torch.triu(torch.ones(dim_time, dim_time), diagonal=2) -
+                   torch.eye(dim_time)).to(phase_g.device)
+ 
+     # Apply integrated absolute frequency matrix to reference and generated phases
+     iaf_r = torch.matmul(phase_r, iaf_matrix)
+     iaf_g = torch.matmul(phase_g, iaf_matrix)
+ 
+     # Calculate losses
+     ip_loss = torch.mean(anti_wrapping_function(phase_r - phase_g))
+     gd_loss = torch.mean(anti_wrapping_function(gd_r - gd_g))
+     iaf_loss = torch.mean(anti_wrapping_function(iaf_r - iaf_g))
+ 
+     return ip_loss, gd_loss, iaf_loss
+ 
+ def anti_wrapping_function(x):
+     """
+     Anti-wrapping function to adjust phase values within the range of -pi to pi.
+ 
+     Args:
+         x (torch.Tensor): Input tensor representing phase differences.
+ 
+     Returns:
+         torch.Tensor: Adjusted tensor with phase values wrapped within -pi to pi.
+     """
+     return torch.abs(x - torch.round(x / (2 * np.pi)) * 2 * np.pi)
+ 
+ def compute_stft(y: torch.Tensor, n_fft: int, hop_size: int, win_size: int, center: bool, compress_factor: float = 1.0) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Compute the Short-Time Fourier Transform (STFT) and return magnitude, phase, and complex components.
+ 
+     Args:
+         y (torch.Tensor): Input signal tensor.
+         n_fft (int): Number of FFT points.
+         hop_size (int): Hop size for STFT.
+         win_size (int): Window size for STFT.
+         center (bool): Whether to pad the input on both sides.
+         compress_factor (float, optional): Compression factor for magnitude. Defaults to 1.0.
+ 
+     Returns:
+         tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Magnitude, phase, and complex components.
+     """
+     eps = torch.finfo(y.dtype).eps
+     hann_window = torch.hann_window(win_size).to(y.device)
+ 
+     stft_spec = torch.stft(
+         y,
+         n_fft=n_fft,
+         hop_length=hop_size,
+         win_length=win_size,
+         window=hann_window,
+         center=center,
+         pad_mode='reflect',
+         normalized=False,
+         return_complex=True
+     )
+ 
+     real_part = stft_spec.real
+     imag_part = stft_spec.imag
+ 
+     mag = torch.sqrt(real_part.pow(2) + imag_part.pow(2) + eps)
+     pha = torch.atan2(imag_part + eps, real_part + eps)
+ 
+     mag = torch.pow(mag, compress_factor)
+     com = torch.stack((mag * torch.cos(pha), mag * torch.sin(pha)), dim=-1)
+ 
+     return mag, pha, com
+ 
+ def pesq_score(utts_r, utts_g, cfg):
+     """
+     Calculate PESQ (Perceptual Evaluation of Speech Quality) score for pairs of reference and generated utterances.
+ 
+     Args:
+         utts_r (list of torch.Tensor): List of reference utterances.
+         utts_g (list of torch.Tensor): List of generated utterances.
+         cfg (dict): Configuration dictionary containing parameters like sampling_rate.
+ 
+     Returns:
+         float: Mean PESQ score across all pairs of utterances.
+     """
+     def eval_pesq(clean_utt, esti_utt, sr):
+         """
+         Evaluate PESQ score for a single pair of clean and estimated utterances.
+ 
+         Args:
+             clean_utt (np.ndarray): Clean reference utterance.
+             esti_utt (np.ndarray): Estimated generated utterance.
+             sr (int): Sampling rate.
+ 
+         Returns:
+             float: PESQ score or -1 in case of an error.
+         """
+         try:
+             pesq_score = pesq(sr, clean_utt, esti_utt)
+         except Exception as e:
+             # Error can happen due to silent period or other issues
+             print(f"Error computing PESQ score: {e}")
+             pesq_score = -1
+         return pesq_score
+ 
+     # Parallel processing of PESQ score computation
+     pesq_scores = Parallel(n_jobs=30)(delayed(eval_pesq)(
+         utts_r[i].squeeze().cpu().numpy(),
+         utts_g[i].squeeze().cpu().numpy(),
+         cfg['stft_cfg']['sampling_rate']
+     ) for i in range(len(utts_r)))
+ 
+     # Calculate mean PESQ score
+     pesq_score = np.mean(pesq_scores)
+     return pesq_score
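
A small, hedged sanity-check sketch for `phase_losses`: tensors follow the (batch, freq, time) convention, and the random phases plus the minimal config dict are placeholders rather than values from this commit.

```python
import math
import torch
from models.loss import phase_losses

cfg = {'stft_cfg': {'n_fft': 400}}             # assumed minimal config
freq_bins = cfg['stft_cfg']['n_fft'] // 2 + 1  # 201 bins for n_fft=400

phase_ref = (torch.rand(2, freq_bins, 100) * 2 - 1) * math.pi
phase_gen = (torch.rand(2, freq_bins, 100) * 2 - 1) * math.pi

ip_loss, gd_loss, iaf_loss = phase_losses(phase_ref, phase_gen, cfg)
# During training these terms are weighted by training_cfg.loss before summing.
total_phase_loss = ip_loss + gd_loss + iaf_loss
```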
models/lsigmoid.py ADDED
@@ -0,0 +1,66 @@
+ # Reference: https://github.com/yxlu-0102/MP-SENet/blob/main/utils.py
+ 
+ import torch
+ import torch.nn as nn
+ 
+ class LearnableSigmoid1D(nn.Module):
+     """
+     Learnable Sigmoid Activation Function for 1D inputs.
+ 
+     This module applies a learnable slope parameter to the sigmoid activation function.
+     """
+     def __init__(self, in_features, beta=1):
+         """
+         Initialize the LearnableSigmoid1D module.
+ 
+         Args:
+         - in_features (int): Number of input features.
+         - beta (float, optional): Scaling factor for the sigmoid function. Defaults to 1.
+         """
+         super(LearnableSigmoid1D, self).__init__()
+         self.beta = beta
+         self.slope = nn.Parameter(torch.ones(in_features))
+         self.slope.requires_grad = True
+ 
+     def forward(self, x):
+         """
+         Forward pass for the LearnableSigmoid1D module.
+ 
+         Args:
+         - x (torch.Tensor): Input tensor.
+ 
+         Returns:
+         - torch.Tensor: Output tensor after applying the learnable sigmoid activation.
+         """
+         return self.beta * torch.sigmoid(self.slope * x)
+ 
+ class LearnableSigmoid2D(nn.Module):
+     """
+     Learnable Sigmoid Activation Function for 2D inputs.
+ 
+     This module applies a learnable slope parameter to the sigmoid activation function for 2D inputs.
+     """
+     def __init__(self, in_features, beta=1):
+         """
+         Initialize the LearnableSigmoid2D module.
+ 
+         Args:
+         - in_features (int): Number of input features.
+         - beta (float, optional): Scaling factor for the sigmoid function. Defaults to 1.
+         """
+         super(LearnableSigmoid2D, self).__init__()
+         self.beta = beta
+         self.slope = nn.Parameter(torch.ones(in_features, 1))
+         self.slope.requires_grad = True
+ 
+     def forward(self, x):
+         """
+         Forward pass for the LearnableSigmoid2D module.
+ 
+         Args:
+         - x (torch.Tensor): Input tensor.
+ 
+         Returns:
+         - torch.Tensor: Output tensor after applying the learnable sigmoid activation.
+         """
+         return self.beta * torch.sigmoid(self.slope * x)
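
For reference, a tiny shape sketch (values are placeholders): the 2D variant keeps one learnable slope per frequency bin, and `beta` sets the upper bound of the output, which is why the mask decoder above can exceed 1.0 with the recipe's `beta: 2.0`.

```python
import torch
from models.lsigmoid import LearnableSigmoid2D

act = LearnableSigmoid2D(201, beta=2.0)   # 201 bins for n_fft=400; beta from the recipe
mask = act(torch.randn(4, 201, 321))      # [B, F, T] -> values in (0, beta)
```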
models/mamba_block.py ADDED
@@ -0,0 +1,110 @@
+ # Reference: https://github.com/state-spaces/mamba/blob/9127d1f47f367f5c9cc49c73ad73557089d02cb8/mamba_ssm/models/mixer_seq_simple.py
+ 
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.nn import init
+ from torch.nn.parameter import Parameter
+ from functools import partial
+ from einops import rearrange
+ 
+ from mamba_ssm.modules.mamba_simple import Mamba, Block
+ from mamba_ssm.models.mixer_seq_simple import _init_weights
+ from mamba_ssm.ops.triton.layernorm import RMSNorm
+ 
+ # github: https://github.com/state-spaces/mamba/blob/9127d1f47f367f5c9cc49c73ad73557089d02cb8/mamba_ssm/models/mixer_seq_simple.py
+ def create_block(
+     d_model, cfg, layer_idx=0, rms_norm=True, fused_add_norm=False, residual_in_fp32=False,
+ ):
+     d_state = cfg['model_cfg']['d_state']            # 16
+     d_conv = cfg['model_cfg']['d_conv']              # 4
+     expand = cfg['model_cfg']['expand']              # 4
+     norm_epsilon = cfg['model_cfg']['norm_epsilon']  # 0.00001
+ 
+     mixer_cls = partial(Mamba, layer_idx=layer_idx, d_state=d_state, d_conv=d_conv, expand=expand)
+     norm_cls = partial(
+         nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon
+     )
+     block = Block(
+         d_model,
+         mixer_cls,
+         norm_cls=norm_cls,
+         fused_add_norm=fused_add_norm,
+         residual_in_fp32=residual_in_fp32,
+     )
+     block.layer_idx = layer_idx
+     return block
+ 
+ class MambaBlock(nn.Module):
+     def __init__(self, in_channels, cfg):
+         super(MambaBlock, self).__init__()
+         n_layer = 1
+         self.forward_blocks = nn.ModuleList(create_block(in_channels, cfg) for i in range(n_layer))
+         self.backward_blocks = nn.ModuleList(create_block(in_channels, cfg) for i in range(n_layer))
+ 
+         self.apply(
+             partial(
+                 _init_weights,
+                 n_layer=n_layer,
+             )
+         )
+ 
+     def forward(self, x):
+         x_forward, x_backward = x.clone(), torch.flip(x, [1])
+         resi_forward, resi_backward = None, None
+ 
+         # Forward
+         for layer in self.forward_blocks:
+             x_forward, resi_forward = layer(x_forward, resi_forward)
+         y_forward = (x_forward + resi_forward) if resi_forward is not None else x_forward
+ 
+         # Backward
+         for layer in self.backward_blocks:
+             x_backward, resi_backward = layer(x_backward, resi_backward)
+         y_backward = torch.flip((x_backward + resi_backward), [1]) if resi_backward is not None else torch.flip(x_backward, [1])
+ 
+         return torch.cat([y_forward, y_backward], -1)
+ 
+ class TFMambaBlock(nn.Module):
+     """
+     Temporal-Frequency Mamba block for sequence modeling.
+ 
+     Attributes:
+         cfg (Config): Configuration for the block.
+         time_mamba (MambaBlock): Mamba block for temporal dimension.
+         freq_mamba (MambaBlock): Mamba block for frequency dimension.
+         tlinear (ConvTranspose1d): ConvTranspose1d layer for temporal dimension.
+         flinear (ConvTranspose1d): ConvTranspose1d layer for frequency dimension.
+     """
+     def __init__(self, cfg):
+         super(TFMambaBlock, self).__init__()
+         self.cfg = cfg
+         self.hid_feature = cfg['model_cfg']['hid_feature']
+ 
+         # Initialize Mamba blocks
+         self.time_mamba = MambaBlock(in_channels=self.hid_feature, cfg=cfg)
+         self.freq_mamba = MambaBlock(in_channels=self.hid_feature, cfg=cfg)
+ 
+         # Initialize ConvTranspose1d layers
+         self.tlinear = nn.ConvTranspose1d(self.hid_feature * 2, self.hid_feature, 1, stride=1)
+         self.flinear = nn.ConvTranspose1d(self.hid_feature * 2, self.hid_feature, 1, stride=1)
+ 
+     def forward(self, x):
+         """
+         Forward pass of the TFMamba block.
+ 
+         Parameters:
+         x (Tensor): Input tensor with shape (batch, channels, time, freq).
+ 
+         Returns:
+         Tensor: Output tensor after applying temporal and frequency Mamba blocks.
+         """
+         b, c, t, f = x.size()
+ 
+         x = x.permute(0, 3, 2, 1).contiguous().view(b * f, t, c)
+         x = self.tlinear(self.time_mamba(x).permute(0, 2, 1)).permute(0, 2, 1) + x
+         x = x.view(b, f, t, c).permute(0, 2, 1, 3).contiguous().view(b * t, f, c)
+         x = self.flinear(self.freq_mamba(x).permute(0, 2, 1)).permute(0, 2, 1) + x
+         x = x.view(b, t, f, c).permute(0, 3, 1, 2)
+         return x
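
TFMambaBlock is shape-preserving: it scans the (B*F, T, C) view with the time Mamba and the (B*T, F, C) view with the frequency Mamba, fusing each bidirectional output back to C channels. A hedged shape check (CUDA-only because of the mamba_ssm kernels; the recipe path and the time/frequency sizes are placeholders):

```python
import torch
import yaml
from models.mamba_block import TFMambaBlock

with open("recipes/SEMamba_advanced.yaml") as f:   # assumed recipe path
    cfg = yaml.safe_load(f)

block = TFMambaBlock(cfg).cuda()                   # mamba_ssm kernels require CUDA
x = torch.randn(1, cfg['model_cfg']['hid_feature'], 321, 101, device="cuda")
y = block(x)
assert y.shape == x.shape                          # (B, hid_feature, T, F) in and out
```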
models/pcs400.py ADDED
@@ -0,0 +1,53 @@
+ import os
+ import torch
+ import torchaudio
+ import numpy as np
+ import argparse
+ import librosa
+ import scipy
+ 
+ # PCS400 parameters
+ PCS400 = np.ones(201)
+ PCS400[0:3] = 1
+ PCS400[3:5] = 1.070175439
+ PCS400[5:8] = 1.182456140
+ PCS400[8:10] = 1.287719298
+ PCS400[10:110] = 1.4  # Pre Set
+ PCS400[110:130] = 1.322807018
+ PCS400[130:160] = 1.238596491
+ PCS400[160:190] = 1.161403509
+ PCS400[190:202] = 1.077192982
+ 
+ maxv = np.iinfo(np.int16).max
+ 
+ def Sp_and_phase(signal):
+     signal_length = signal.shape[0]
+     n_fft = 400
+     hop_length = 100
+     y_pad = librosa.util.fix_length(signal, size=signal_length + n_fft // 2)
+ 
+     F = librosa.stft(y_pad, n_fft=400, hop_length=100, win_length=400, window=scipy.signal.windows.hamming(400))
+     Lp = PCS400 * np.transpose(np.log1p(np.abs(F)), (1, 0))
+     phase = np.angle(F)
+ 
+     NLp = np.transpose(Lp, (1, 0))
+ 
+     return NLp, phase, signal_length
+ 
+ 
+ def SP_to_wav(mag, phase, signal_length):
+     mag = np.expm1(mag)
+     Rec = np.multiply(mag, np.exp(1j * phase))
+     result = librosa.istft(Rec,
+                            hop_length=100,
+                            win_length=400,
+                            window=scipy.signal.windows.hamming(400),
+                            length=signal_length)
+     return result
+ 
+ def cal_pcs(signal_wav):
+     noisy_LP, Nphase, signal_length = Sp_and_phase(signal_wav.squeeze())
+     enhanced_wav = SP_to_wav(noisy_LP, Nphase, signal_length)
+     enhanced_wav = enhanced_wav / np.max(abs(enhanced_wav))
+ 
+     return enhanced_wav
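
`cal_pcs` is an offline post-processing step (Perceptual Contrast Stretching) applied to an already-enhanced 16 kHz mono waveform. A hedged sketch of typical use; the file names below are assumptions, not paths from this commit:

```python
import librosa
import soundfile as sf
from models.pcs400 import cal_pcs

wav, sr = librosa.load("enhanced.wav", sr=16000)   # assumed input: mono 16 kHz audio
out = cal_pcs(wav)                                  # contrast-stretched, peak-normalised
sf.write("enhanced_pcs.wav", out, sr)
```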
models/stfts.py ADDED
@@ -0,0 +1,73 @@
+ import torch
+ import torch.nn as nn
+ 
+ def mag_phase_stft(y, n_fft, hop_size, win_size, compress_factor=1.0, center=True, addeps=False):
+     """
+     Compute magnitude and phase using STFT.
+ 
+     Args:
+         y (torch.Tensor): Input audio signal.
+         n_fft (int): FFT size.
+         hop_size (int): Hop size.
+         win_size (int): Window size.
+         compress_factor (float, optional): Magnitude compression factor. Defaults to 1.0.
+         center (bool, optional): Whether to center the signal before padding. Defaults to True.
+         addeps (bool, optional): Whether to add a small epsilon when computing magnitude and phase. Defaults to False.
+ 
+     Returns:
+         tuple: Magnitude, phase, and complex representation of the STFT.
+     """
+     # eps = torch.finfo(y.dtype).eps
+     eps = 1e-10
+     hann_window = torch.hann_window(win_size).to(y.device)
+     stft_spec = torch.stft(
+         y, n_fft,
+         hop_length=hop_size,
+         win_length=win_size,
+         window=hann_window,
+         center=center,
+         pad_mode='reflect',
+         normalized=False,
+         return_complex=True)
+ 
+     if not addeps:
+         mag = torch.abs(stft_spec)
+         pha = torch.angle(stft_spec)
+     else:
+         real_part = stft_spec.real
+         imag_part = stft_spec.imag
+         mag = torch.sqrt(real_part.pow(2) + imag_part.pow(2) + eps)
+         pha = torch.atan2(imag_part + eps, real_part + eps)
+     # Compress the magnitude
+     mag = torch.pow(mag, compress_factor)
+     com = torch.stack((mag * torch.cos(pha), mag * torch.sin(pha)), dim=-1)
+     return mag, pha, com
+ 
+ 
+ def mag_phase_istft(mag, pha, n_fft, hop_size, win_size, compress_factor=1.0, center=True):
+     """
+     Inverse STFT to reconstruct the audio signal from magnitude and phase.
+ 
+     Args:
+         mag (torch.Tensor): Magnitude of the STFT.
+         pha (torch.Tensor): Phase of the STFT.
+         n_fft (int): FFT size.
+         hop_size (int): Hop size.
+         win_size (int): Window size.
+         compress_factor (float, optional): Magnitude compression factor. Defaults to 1.0.
+         center (bool, optional): Whether to center the signal before padding. Defaults to True.
+ 
+     Returns:
+         torch.Tensor: Reconstructed audio signal.
+     """
+     mag = torch.pow(mag, 1.0 / compress_factor)
+     com = torch.complex(mag * torch.cos(pha), mag * torch.sin(pha))
+     hann_window = torch.hann_window(win_size).to(com.device)
+     wav = torch.istft(
+         com,
+         n_fft,
+         hop_length=hop_size,
+         win_length=win_size,
+         window=hann_window,
+         center=center)
+     return wav
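
A quick round-trip sketch with the recipe's STFT settings (the waveform is a placeholder): the magnitude compression applied by `mag_phase_stft` is undone by `mag_phase_istft`, so the pair should reconstruct the signal up to numerical error when `center=True`.

```python
import torch
from models.stfts import mag_phase_stft, mag_phase_istft

n_fft, hop_size, win_size = 400, 100, 400      # values from recipes/SEMamba_advanced.yaml
x = torch.randn(1, 16000)                      # 1 s of placeholder 16 kHz audio

mag, pha, com = mag_phase_stft(x, n_fft, hop_size, win_size, compress_factor=0.3)
y = mag_phase_istft(mag, pha, n_fft, hop_size, win_size, compress_factor=0.3)

print((x - y).abs().max())                     # small reconstruction error
```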
recipes/SEMamba_advanced.yaml ADDED
@@ -0,0 +1,66 @@
+ # Environment Settings
+ # These settings specify the hardware and distributed setup for the model training.
+ # Adjust `num_gpus` and `dist_cfg` according to your distributed training environment.
+ env_setting:
+   num_gpus: 2                        # Number of GPUs. CPU-only training is not supported.
+   num_workers: 20                    # Number of worker threads for data loading.
+   seed: 1234                         # Seed for random number generators to ensure reproducibility.
+   stdout_interval: 10
+   checkpoint_interval: 1000          # Save the model to a checkpoint every N steps.
+   validation_interval: 1000
+   summary_interval: 100
+   dist_cfg:
+     dist_backend: nccl               # Distributed training backend, 'nccl' for NVIDIA GPUs.
+     dist_url: tcp://localhost:19477  # URL for initializing distributed training.
+     world_size: 1                    # Total number of processes in the distributed training.
+ 
+ # Datapath Configuration
+ data_cfg:
+   train_clean_json: data/train_clean.json
+   train_noisy_json: data/train_noisy.json
+   valid_clean_json: data/valid_clean.json
+   valid_noisy_json: data/valid_noisy.json
+   test_clean_json: data/test_clean.json
+   test_noisy_json: data/test_noisy.json
+ 
+ # Training Configuration
+ # This section details parameters that directly influence the training process,
+ # including batch sizes, learning rates, and optimizer specifics.
+ training_cfg:
+   training_epochs: 200               # Number of training epochs.
+   batch_size: 4                      # Training batch size.
+   learning_rate: 0.0005              # Initial learning rate.
+   adam_b1: 0.8                       # Beta1 hyperparameter for the AdamW optimizer.
+   adam_b2: 0.99                      # Beta2 hyperparameter for the AdamW optimizer.
+   lr_decay: 0.99                     # Learning rate decay per epoch.
+   segment_size: 32000                # Audio segment size used during training, dependent on sampling rate.
+   loss:
+     metric: 0.05
+     magnitude: 0.9
+     phase: 0.3
+     complex: 0.1
+     time: 0.2
+     consistancy: 0.1
+   use_PCS400: False                  # Whether to apply PCS400 post-processing.
+ 
+ # STFT Configuration
+ # Configuration for Short-Time Fourier Transform (STFT), crucial for audio processing models.
+ stft_cfg:
+   sampling_rate: 16000               # Audio sampling rate in Hz.
+   n_fft: 400                         # FFT components for transforming audio signals.
+   hop_size: 100                      # Samples between successive frames.
+   win_size: 400                      # Window size used in FFT.
+ 
+ # Model Configuration
+ # Defines the architecture specifics of the model, including layer configurations and feature compression.
+ model_cfg:
+   hid_feature: 64                    # Channels in dense layers.
+   compress_factor: 0.3               # Compression factor applied to extracted features.
+   num_tfmamba: 4                     # Number of Time-Frequency Mamba (TFMamba) blocks in the model.
+   d_state: 16                        # Dimensionality of the state vector in Mamba blocks.
+   d_conv: 4                          # Convolutional layer dimensionality within Mamba blocks.
+   expand: 4                          # Expansion factor for the layers within the Mamba blocks.
+   norm_epsilon: 0.00001              # Numerical stability in normalization layers within the Mamba blocks.
+   beta: 2.0                          # Hyperparameter for the Learnable Sigmoid function.
+   input_channel: 2                   # Magnitude and phase.
+   output_channel: 1                  # Single-channel speech enhancement.
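
The recipe is consumed as a plain nested dictionary: every module above indexes it as `cfg['model_cfg'][...]`, `cfg['stft_cfg'][...]`, and so on. A minimal loading sketch (path assumed to be relative to the Space root):

```python
import yaml

with open("recipes/SEMamba_advanced.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg['stft_cfg']['n_fft'])            # 400
print(cfg['model_cfg']['hid_feature'])     # 64
print(cfg['training_cfg']['loss'])         # per-term loss weights used during training
```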
requirements.txt ADDED
@@ -0,0 +1,22 @@
+ packaging
+ librosa
+ soundfile
+ pyyaml
+ argparse
+ tensorboard
+ pesq
+ einops
+ matplotlib
+ torch==2.5.1
+ torchaudio==2.5.1
+ numpy==1.26.4
+ ultralytics
+ moviepy
+ supervision
+ opencv-python
+ ffmpeg-python
+ decord==0.6.0
+ pytorch_lightning==1.9.0
+ typeguard==2.13.3
+ torch_complex
+ rich
yolov8n-face.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d17b38523a994b13ee604b67f02791ca0f43b9f446a32fd7bc44e17c56ead077
+ size 6250099