uto1125 committed
Commit c667b6b · verified · 1 Parent(s): 28c311d

Upload 37 files

tools/__pycache__/api.cpython-310.pyc ADDED
Binary file (22.2 kB).
 
tools/__pycache__/api.cpython-311.pyc ADDED
Binary file (45 kB).
 
tools/__pycache__/auto_rerank.cpython-310.pyc ADDED
Binary file (3.49 kB).
 
tools/__pycache__/commons.cpython-310.pyc ADDED
Binary file (1.49 kB).
 
tools/__pycache__/file.cpython-310.pyc ADDED
Binary file (2.99 kB).
 
tools/__pycache__/schema.cpython-310.pyc ADDED
Binary file (7.67 kB).
 
tools/__pycache__/webui.cpython-310.pyc ADDED
Binary file (11.6 kB).
 
tools/api.py ADDED
@@ -0,0 +1,943 @@
1
+ import io
2
+ import os
3
+ import queue
4
+ import re
5
+ import time
6
+ import traceback
7
+ import wave
8
+ from argparse import ArgumentParser
9
+ from http import HTTPStatus
10
+ from pathlib import Path
11
+ from typing import Annotated, Any
12
+
13
+ import librosa
14
+ import numpy as np
15
+ import ormsgpack
16
+ import pyrootutils
17
+ import soundfile as sf
18
+ import torch
19
+ import torchaudio
20
+ from baize.datastructures import ContentType
21
+ from kui.asgi import (
22
+ Body,
23
+ FactoryClass,
24
+ HTTPException,
25
+ HttpRequest,
26
+ HttpView,
27
+ JSONResponse,
28
+ Kui,
29
+ OpenAPI,
30
+ StreamResponse,
31
+ request,
32
+ )
33
+ from kui.asgi.routing import MultimethodRoutes
34
+ from loguru import logger
35
+ from transformers import AutoTokenizer
36
+
37
+ pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
38
+ import struct
39
+ from threading import Lock
40
+
41
+ import httpx
42
+ from cachetools import LRUCache, cached
43
+ from funasr import AutoModel
44
+ from silero_vad import get_speech_timestamps, load_silero_vad
45
+
46
+ from fish_speech.conversation import IM_END_TOKEN, SEMANTIC_TOKEN
47
+ from fish_speech.models.text2semantic.llama import BaseModelArgs
48
+
49
+ # from fish_speech.models.vqgan.lit_module import VQGAN
50
+ from fish_speech.models.vqgan.modules.firefly import FireflyArchitecture
51
+ from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
52
+ from fish_speech.utils import autocast_exclude_mps, set_seed
53
+ from tools.file import AUDIO_EXTENSIONS, audio_to_bytes, list_files, read_ref_text
54
+ from tools.llama.generate import (
55
+ GenerateRequest,
56
+ GenerateResponse,
57
+ WrappedGenerateResponse,
58
+ launch_thread_safe_queue,
59
+ launch_thread_safe_queue_agent,
60
+ )
61
+ from tools.schema import (
62
+ GLOBAL_NUM_SAMPLES,
63
+ ASRPackRequest,
64
+ ServeASRRequest,
65
+ ServeASRResponse,
66
+ ServeASRSegment,
67
+ ServeAudioPart,
68
+ ServeForwardMessage,
69
+ ServeMessage,
70
+ ServeRequest,
71
+ ServeResponse,
72
+ ServeStreamDelta,
73
+ ServeStreamResponse,
74
+ ServeTextPart,
75
+ ServeTimedASRResponse,
76
+ ServeTTSRequest,
77
+ ServeVQGANDecodeRequest,
78
+ ServeVQGANDecodeResponse,
79
+ ServeVQGANEncodeRequest,
80
+ ServeVQGANEncodeResponse,
81
+ ServeVQPart,
82
+ )
83
+ from tools.vqgan.inference import load_model as load_decoder_model
84
+
85
+ global_lock = Lock()
86
+
87
+ # Whether to disable keepalive (which is helpful if the server is in the same cluster)
88
+ DISABLE_KEEPALIVE = os.getenv("DISABLE_KEEPALIVE", "false").lower() == "true"
89
+ async_client = httpx.AsyncClient(
90
+ timeout=120, limits=httpx.Limits(keepalive_expiry=0 if DISABLE_KEEPALIVE else None)
91
+ )
92
+ backends = torchaudio.list_audio_backends()
93
+
94
+ if "ffmpeg" in backends:
95
+ backend = "ffmpeg"
96
+ else:
97
+ backend = "soundfile"
98
+
99
+
100
+ def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
101
+ buffer = io.BytesIO()
102
+
103
+ with wave.open(buffer, "wb") as wav_file:
104
+ wav_file.setnchannels(channels)
105
+ wav_file.setsampwidth(bit_depth // 8)
106
+ wav_file.setframerate(sample_rate)
107
+
108
+ wav_header_bytes = buffer.getvalue()
109
+ buffer.close()
110
+ return wav_header_bytes
111
+
112
+
113
+ # Define utils for web server
114
+ async def http_exception_handler(exc: HTTPException):
115
+ return JSONResponse(
116
+ dict(
117
+ statusCode=exc.status_code,
118
+ message=exc.content,
119
+ error=HTTPStatus(exc.status_code).phrase,
120
+ ),
121
+ exc.status_code,
122
+ exc.headers,
123
+ )
124
+
125
+
126
+ async def other_exception_handler(exc: "Exception"):
127
+ traceback.print_exc()
128
+
129
+ status = HTTPStatus.INTERNAL_SERVER_ERROR
130
+ return JSONResponse(
131
+ dict(statusCode=status, message=str(exc), error=status.phrase),
132
+ status,
133
+ )
134
+
135
+
136
+ def load_audio(reference_audio, sr):
137
+ if len(reference_audio) > 255 or not Path(reference_audio).exists():
138
+ audio_data = reference_audio
139
+ reference_audio = io.BytesIO(audio_data)
140
+
141
+ waveform, original_sr = torchaudio.load(reference_audio, backend=backend)
142
+
143
+ if waveform.shape[0] > 1:
144
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
145
+
146
+ if original_sr != sr:
147
+ resampler = torchaudio.transforms.Resample(orig_freq=original_sr, new_freq=sr)
148
+ waveform = resampler(waveform)
149
+
150
+ audio = waveform.squeeze().numpy()
151
+ return audio
152
+
153
+
154
+ def encode_reference(*, decoder_model, reference_audio, enable_reference_audio):
155
+ if enable_reference_audio and reference_audio is not None:
156
+ # Load audios, and prepare basic info here
157
+ reference_audio_content = load_audio(
158
+ reference_audio, decoder_model.spec_transform.sample_rate
159
+ )
160
+
161
+ audios = torch.from_numpy(reference_audio_content).to(decoder_model.device)[
162
+ None, None, :
163
+ ]
164
+ audio_lengths = torch.tensor(
165
+ [audios.shape[2]], device=decoder_model.device, dtype=torch.long
166
+ )
167
+ logger.info(
168
+ f"Loaded audio with {audios.shape[2] / decoder_model.spec_transform.sample_rate:.2f} seconds"
169
+ )
170
+
171
+ # VQ Encoder
172
+ if isinstance(decoder_model, FireflyArchitecture):
173
+ prompt_tokens = decoder_model.encode(audios, audio_lengths)[0][0]
174
+
175
+ logger.info(f"Encoded prompt: {prompt_tokens.shape}")
176
+ else:
177
+ prompt_tokens = None
178
+ logger.info("No reference audio provided")
179
+
180
+ return prompt_tokens
181
+
182
+
183
+ def decode_vq_tokens(
184
+ *,
185
+ decoder_model,
186
+ codes,
187
+ ):
188
+ feature_lengths = torch.tensor([codes.shape[1]], device=decoder_model.device)
189
+ logger.info(f"VQ features: {codes.shape}")
190
+
191
+ if isinstance(decoder_model, FireflyArchitecture):
192
+ # VQGAN Inference
193
+ return decoder_model.decode(
194
+ indices=codes[None],
195
+ feature_lengths=feature_lengths,
196
+ )[0].squeeze()
197
+
198
+ raise ValueError(f"Unknown model type: {type(decoder_model)}")
199
+
200
+
201
+ routes = MultimethodRoutes(base_class=HttpView)
202
+
203
+
204
+ def get_content_type(audio_format):
205
+ if audio_format == "wav":
206
+ return "audio/wav"
207
+ elif audio_format == "flac":
208
+ return "audio/flac"
209
+ elif audio_format == "mp3":
210
+ return "audio/mpeg"
211
+ else:
212
+ return "application/octet-stream"
213
+
214
+
215
+ @torch.no_grad()
216
+ @torch.autocast(device_type="cuda", dtype=torch.half)
217
+ def batch_encode(model, audios: list[bytes | torch.Tensor]):
218
+ audios = [
219
+ (
220
+ torch.from_numpy(
221
+ librosa.load(io.BytesIO(audio), sr=model.spec_transform.sample_rate)[0]
222
+ )[None]
223
+ if isinstance(audio, bytes)
224
+ else audio
225
+ )
226
+ for audio in audios
227
+ ]
228
+
229
+ # if any(audio.shape[-1] > model.spec_transform.sample_rate * 120 for audio in audios):
230
+ # raise ValueError("Single audio length is too long (>120s)")
231
+
232
+ max_length = max(audio.shape[-1] for audio in audios)
233
+ print(f"Encode max length: {max_length / model.spec_transform.sample_rate:.2f}s")
234
+
235
+ lengths = torch.tensor([audio.shape[-1] for audio in audios], device=model.device)
236
+ max_length = lengths.max().item()
237
+ padded = torch.stack(
238
+ [
239
+ torch.nn.functional.pad(audio, (0, max_length - audio.shape[-1]))
240
+ for audio in audios
241
+ ]
242
+ ).to(model.device)
243
+
244
+ features, feature_lengths = model.encode(padded, audio_lengths=lengths)
245
+ features, feature_lengths = features.cpu(), feature_lengths.cpu()
246
+
247
+ return [feature[..., :length] for feature, length in zip(features, feature_lengths)]
248
+
249
+
250
+ @cached(
251
+ cache=LRUCache(maxsize=10000),
252
+ key=lambda model, audios: (model.device, tuple(audios)),
253
+ )
254
+ def cached_vqgan_batch_encode(model, audios: list[bytes]):
255
+ return batch_encode(model, audios)
256
+
257
+
258
+ @routes.http.post("/v1/vqgan/encode")
259
+ def api_vqgan_encode(payload: Annotated[ServeVQGANEncodeRequest, Body(exclusive=True)]):
260
+
261
+ start_time = time.time()
262
+ tokens = cached_vqgan_batch_encode(decoder_model, payload.audios)
263
+ logger.info(f"[EXEC] VQGAN encode time: {(time.time() - start_time) * 1000:.2f}ms")
264
+
265
+ return ormsgpack.packb(
266
+ ServeVQGANEncodeResponse(tokens=[i.tolist() for i in tokens]),
267
+ option=ormsgpack.OPT_SERIALIZE_PYDANTIC,
268
+ )
269
+
270
+
271
+ @torch.no_grad()
272
+ @torch.autocast(device_type="cuda", dtype=torch.half)
273
+ def vqgan_decode(model, features):
274
+ lengths = torch.tensor(
275
+ [feature.shape[-1] for feature in features], device=model.device
276
+ )
277
+ max_length = lengths.max().item()
278
+ padded = torch.stack(
279
+ [
280
+ torch.nn.functional.pad(feature, (0, max_length - feature.shape[-1]))
281
+ for feature in features
282
+ ]
283
+ ).to(model.device)
284
+
285
+ # If the batch size is too large, decode in micro-batches of 8
286
+ audios, audio_lengths = [], []
287
+ for i in range(0, padded.shape[0], 8):
288
+ audio, audio_length = model.decode(
289
+ padded[i : i + 8], feature_lengths=lengths[i : i + 8]
290
+ )
291
+ audios.append(audio)
292
+ audio_lengths.append(audio_length)
293
+ audios = torch.cat(audios, dim=0)
294
+ audio_lengths = torch.cat(audio_lengths, dim=0)
295
+ audios, audio_lengths = audios.cpu(), audio_lengths.cpu()
296
+
297
+ return [audio[..., :length].numpy() for audio, length in zip(audios, audio_lengths)]
298
+
299
+
300
+ @routes.http.post("/v1/vqgan/decode")
301
+ def api_vqgan_decode(payload: Annotated[ServeVQGANDecodeRequest, Body(exclusive=True)]):
302
+ tokens = [torch.tensor(token, dtype=torch.int) for token in payload.tokens]
303
+ start_time = time.time()
304
+ audios = vqgan_decode(decoder_model, tokens)
305
+ logger.info(f"[EXEC] VQGAN decode time: {(time.time() - start_time) * 1000:.2f}ms")
306
+ audios = [audio.astype(np.float16).tobytes() for audio in audios]
307
+ return ormsgpack.packb(
308
+ ServeVQGANDecodeResponse(audios=audios), option=ormsgpack.OPT_SERIALIZE_PYDANTIC
309
+ )
310
+
311
+
312
+ @torch.no_grad()
313
+ def batch_asr(model, audios, sr, language="auto"):
314
+ resampled_audios = []
315
+ for audio in audios:
316
+ audio = torchaudio.functional.resample(audio, sr, 16000)
317
+ assert audio.ndim == 1
318
+ resampled_audios.append(audio)
319
+
320
+ with global_lock:
321
+ res = model.generate(
322
+ input=resampled_audios,
323
+ batch_size=len(resampled_audios),
324
+ language=language,
325
+ use_itn=True,
326
+ )
327
+
328
+ results = []
329
+ for r, audio in zip(res, audios):
330
+ text = r["text"]
331
+ text = re.sub(r"<\|.*?\|>", "", text)
332
+ duration = len(audio) / sr * 1000
333
+ huge_gap = False
334
+
335
+ if "timestamp" in r and len(r["timestamp"]) > 2:
336
+ for timestamp_a, timestamp_b in zip(
337
+ r["timestamp"][:-1], r["timestamp"][1:]
338
+ ):
339
+ # If there is a gap of more than 5 seconds, we consider it as a huge gap
340
+ if timestamp_b[0] - timestamp_a[1] > 5000:
341
+ huge_gap = True
342
+ break
343
+
344
+ # Doesn't make sense to have a huge gap at the end
345
+ if duration - r["timestamp"][-1][1] > 3000:
346
+ huge_gap = True
347
+
348
+ results.append(
349
+ {
350
+ "text": text,
351
+ "duration": duration,
352
+ "huge_gap": huge_gap,
353
+ }
354
+ )
355
+
356
+ return results
357
+
358
+
359
+ @routes.http.post("/v1/asr")
360
+ def api_invoke_asr(payload: Annotated[ServeASRRequest, Body(exclusive=True)]):
361
+ start_time = time.time()
362
+ audios = [np.frombuffer(audio, dtype=np.float16) for audio in payload.audios]
363
+ audios = [torch.from_numpy(audio).float() for audio in audios]
364
+
365
+ if any(audio.shape[-1] >= 30 * payload.sample_rate for audio in audios):
366
+ raise HTTPException(status_code=400, detail="Audio length is too long")
367
+
368
+ transcriptions = batch_asr(
369
+ asr_model, audios=audios, sr=payload.sample_rate, language=payload.language
370
+ )
371
+ logger.info(f"[EXEC] ASR time: {(time.time() - start_time) * 1000:.2f}ms")
372
+
373
+ return ormsgpack.packb(
374
+ ServeASRResponse(transcriptions=transcriptions),
375
+ option=ormsgpack.OPT_SERIALIZE_PYDANTIC,
376
+ )
377
+
378
+
379
+ from fish_speech.conversation import Conversation, Message
380
+
381
+
382
+ def execute_request(
383
+ input_queue: queue.Queue,
384
+ tokenizer: AutoTokenizer,
385
+ config: BaseModelArgs,
386
+ request: ServeRequest,
387
+ device: str = "cuda:0",
388
+ ):
389
+ semantic_id, im_end_id = tokenizer.convert_tokens_to_ids(
390
+ [SEMANTIC_TOKEN, IM_END_TOKEN]
391
+ )
392
+ messages = []
393
+ for message in request.messages:
394
+ messages.append(message.to_conversation_message())
395
+
396
+ assert len(messages) >= 1, "At least one message is required"
397
+ # assert messages[-1].role == "user", "The last message must be from the user"
398
+
399
+ if messages[-1].role == "user":
400
+ messages.append(Message(role="assistant", parts=[], add_im_end=False))
401
+ else:
402
+ assert (
403
+ messages[-1].role == "assistant"
404
+ ), "The last message must be from the assistant"
405
+ messages[-1].add_im_end = False
406
+
407
+ conv = Conversation(messages=messages)
408
+ prompt = conv.encode_for_inference(
409
+ tokenizer=tokenizer, num_codebooks=config.num_codebooks
410
+ ).to(device)
411
+
412
+ if request.streaming:
413
+ for i in range(request.num_samples):
414
+ yield ServeStreamResponse(
415
+ sample_id=i,
416
+ delta=ServeStreamDelta(
417
+ role="assistant",
418
+ ),
419
+ )
420
+
421
+ req = {
422
+ "prompt": prompt,
423
+ "max_new_tokens": request.max_new_tokens,
424
+ "im_end_id": im_end_id,
425
+ "semantic_id": semantic_id,
426
+ "temperature": request.temperature,
427
+ "top_p": request.top_p,
428
+ "repetition_penalty": request.repetition_penalty,
429
+ "num_samples": request.num_samples,
430
+ "early_stop_threshold": request.early_stop_threshold,
431
+ }
432
+
433
+ start = time.time()
434
+ response_queue = queue.Queue()
435
+ input_queue.put(GenerateRequest(req, response_queue))
436
+
437
+ # Decoding
438
+ decode_buffer = [[] for _ in range(request.num_samples)]
439
+ parts = [[] for _ in range(request.num_samples)]
440
+
441
+ def send_reset_buffer(sample_id):
442
+ nonlocal decode_buffer
443
+ if len(decode_buffer[sample_id]) == 0:
444
+ return
445
+
446
+ decoded = tokenizer.decode(decode_buffer[sample_id])
447
+ part = ServeTextPart(text=decoded)
448
+
449
+ if request.streaming:
450
+ yield ServeStreamResponse(delta=ServeStreamDelta(part=part))
451
+ else:
452
+ parts[sample_id].append(part)
453
+
454
+ decode_buffer[sample_id] = []
455
+
456
+ # Decode process
457
+ finished = [False for _ in range(request.num_samples)]
458
+ stats = {}
459
+ idx = 0
460
+ while True:
461
+ response = response_queue.get()
462
+
463
+ if response in ["stop", "error"]:
464
+ break
465
+
466
+ for sample_id, tokens in enumerate(response):
467
+ if finished[sample_id]:
468
+ continue
469
+
470
+ if tokens[0] == im_end_id:
471
+ finished[sample_id] = True
472
+ if request.streaming:
473
+ yield from send_reset_buffer(sample_id)
474
+ yield ServeStreamResponse(
475
+ sample_id=sample_id,
476
+ finish_reason="stop",
477
+ stats=stats,
478
+ )
479
+ continue
480
+
481
+ if tokens[0] == semantic_id and request.streaming:
482
+ yield from send_reset_buffer(sample_id)
483
+ # Streaming vq
484
+ _tokens = tokens[1:].clone() - 1
485
+
486
+ if config.share_codebook_embeddings is False:
487
+ for i in range(len(_tokens)):
488
+ _tokens[i] -= config.codebook_size * i
489
+
490
+ yield ServeStreamResponse(
491
+ sample_id=sample_id,
492
+ delta=ServeStreamDelta(part=ServeVQPart(codes=_tokens.tolist())),
493
+ )
494
+ continue
495
+
496
+ # Not streaming vq
497
+ if tokens[0] == semantic_id:
498
+ yield from send_reset_buffer(sample_id)
499
+ # Non-streaming vq
500
+ if len(parts[sample_id]) == 0 or not isinstance(
501
+ parts[sample_id][-1], ServeVQPart
502
+ ):
503
+ _tokens = tokens[1:].clone() - 1
504
+
505
+ if config.share_codebook_embeddings is False:
506
+ for i in range(len(_tokens)):
507
+ _tokens[i] -= config.codebook_size * i
508
+
509
+ parts[sample_id].append(ServeVQPart(codes=_tokens.tolist()))
510
+ else:
511
+ for codebook_id, value in enumerate(tokens[1:, :]):
512
+ val = value.item() - 1
513
+ if config.share_codebook_embeddings is False:
514
+ val -= config.codebook_size * codebook_id
515
+
516
+ parts[sample_id][-1].codes[codebook_id].append(val)
517
+ continue
518
+
519
+ if tokens[0] != semantic_id:
520
+ # Stream text decode is not supported now
521
+ decode_buffer[sample_id].append(tokens[0, 0])
522
+
523
+ if idx == 0:
524
+ stats["time_to_first_token"] = (time.time() - start) * 1000
525
+
526
+ idx += 1
527
+
528
+ for sample_id in range(request.num_samples):
529
+ yield from send_reset_buffer(sample_id)
530
+
531
+ stats["total_time"] = (time.time() - start) * 1000
532
+ stats["total_tokens"] = idx
533
+
534
+ if request.streaming:
535
+ for sample_id in range(request.num_samples):
536
+ if finished[sample_id]:
537
+ continue
538
+ yield ServeStreamResponse(
539
+ finish_reason=response, stats=stats, sample_id=sample_id
540
+ )
541
+ return
542
+
543
+ yield ServeResponse(
544
+ messages=[
545
+ ServeMessage(role="assistant", parts=parts[i])
546
+ for i in range(request.num_samples)
547
+ ],
548
+ finish_reason=response,
549
+ stats=stats,
550
+ )
551
+
552
+
553
+ @routes.http.post("/v1/chat")
554
+ def api_invoke_chat(
555
+ req: Annotated[ServeRequest, Body(exclusive=True)],
556
+ ):
557
+ """
558
+ Invoke model and generate audio
559
+ """
560
+
561
+ # This makes torch compile happy
562
+ assert (
563
+ req.num_samples == GLOBAL_NUM_SAMPLES
564
+ ), f"num_samples must be {GLOBAL_NUM_SAMPLES}"
565
+
566
+ content_type = request.headers.get("Content-Type", "application/json")
567
+ json_mode = "application/json" in content_type
568
+
569
+ async def wrapped_generator():
570
+ generator = execute_request(llama_queue, tokenizer, config, req, args.device)
571
+
572
+ for i in generator:
573
+ if json_mode:
574
+ body = i.model_dump_json().encode("utf-8")
575
+ yield b"data: " + body + b"\n\n"
576
+ else:
577
+ body = ormsgpack.packb(i, option=ormsgpack.OPT_SERIALIZE_PYDANTIC)
578
+ yield struct.pack("I", len(body)) + body
579
+
580
+ # Naive mode
581
+ if req.streaming is False:
582
+ result = next(execute_request(llama_queue, tokenizer, config, req, args.device))
583
+
584
+ if json_mode:
585
+ return JSONResponse(result.model_dump())
586
+ else:
587
+ return ormsgpack.packb(result, option=ormsgpack.OPT_SERIALIZE_PYDANTIC)
588
+
589
+ return StreamResponse(
590
+ iterable=wrapped_generator(), content_type="text/event-stream"
591
+ )
592
+
593
+
594
+ @torch.inference_mode()
595
+ def inference(req: ServeTTSRequest):
596
+
597
+ global prompt_tokens, prompt_texts
598
+
599
+ idstr: str | None = req.reference_id
600
+ if idstr is not None:
601
+ ref_folder = Path("references") / idstr
602
+ ref_folder.mkdir(parents=True, exist_ok=True)
603
+ ref_audios = list_files(
604
+ ref_folder, AUDIO_EXTENSIONS, recursive=True, sort=False
605
+ )
606
+
607
+ if req.use_memory_cache == "never" or (
608
+ req.use_memory_cache == "on-demand" and len(prompt_tokens) == 0
609
+ ):
610
+ prompt_tokens = [
611
+ encode_reference(
612
+ decoder_model=decoder_model,
613
+ reference_audio=audio_to_bytes(str(ref_audio)),
614
+ enable_reference_audio=True,
615
+ )
616
+ for ref_audio in ref_audios
617
+ ]
618
+ prompt_texts = [
619
+ read_ref_text(str(ref_audio.with_suffix(".lab")))
620
+ for ref_audio in ref_audios
621
+ ]
622
+ else:
623
+ logger.info("Use same references")
624
+
625
+ else:
626
+ # Parse reference audio aka prompt
627
+ refs = req.references
628
+
629
+ if req.use_memory_cache == "never" or (
630
+ req.use_memory_cache == "on-demand" and len(prompt_tokens) == 0
631
+ ):
632
+ prompt_tokens = [
633
+ encode_reference(
634
+ decoder_model=decoder_model,
635
+ reference_audio=ref.audio,
636
+ enable_reference_audio=True,
637
+ )
638
+ for ref in refs
639
+ ]
640
+ prompt_texts = [ref.text for ref in refs]
641
+ else:
642
+ logger.info("Use same references")
643
+
644
+ if req.seed is not None:
645
+ set_seed(req.seed)
646
+ logger.warning(f"set seed: {req.seed}")
647
+
648
+ # LLAMA Inference
649
+ request = dict(
650
+ device=decoder_model.device,
651
+ max_new_tokens=req.max_new_tokens,
652
+ text=(
653
+ req.text
654
+ if not req.normalize
655
+ else ChnNormedText(raw_text=req.text).normalize()
656
+ ),
657
+ top_p=req.top_p,
658
+ repetition_penalty=req.repetition_penalty,
659
+ temperature=req.temperature,
660
+ compile=args.compile,
661
+ iterative_prompt=req.chunk_length > 0,
662
+ chunk_length=req.chunk_length,
663
+ max_length=4096,
664
+ prompt_tokens=prompt_tokens,
665
+ prompt_text=prompt_texts,
666
+ )
667
+
668
+ response_queue = queue.Queue()
669
+ llama_queue.put(
670
+ GenerateRequest(
671
+ request=request,
672
+ response_queue=response_queue,
673
+ )
674
+ )
675
+
676
+ if req.streaming:
677
+ yield wav_chunk_header()
678
+
679
+ segments = []
680
+ while True:
681
+ result: WrappedGenerateResponse = response_queue.get()
682
+ if result.status == "error":
683
+ raise result.response
684
+ break
685
+
686
+ result: GenerateResponse = result.response
687
+ if result.action == "next":
688
+ break
689
+
690
+ with autocast_exclude_mps(
691
+ device_type=decoder_model.device.type, dtype=args.precision
692
+ ):
693
+ fake_audios = decode_vq_tokens(
694
+ decoder_model=decoder_model,
695
+ codes=result.codes,
696
+ )
697
+
698
+ fake_audios = fake_audios.float().cpu().numpy()
699
+
700
+ if req.streaming:
701
+ yield (fake_audios * 32768).astype(np.int16).tobytes()
702
+ else:
703
+ segments.append(fake_audios)
704
+
705
+ if req.streaming:
706
+ return
707
+
708
+ if len(segments) == 0:
709
+ raise HTTPException(
710
+ HTTPStatus.INTERNAL_SERVER_ERROR,
711
+ content="No audio generated, please check the input text.",
712
+ )
713
+
714
+ fake_audios = np.concatenate(segments, axis=0)
715
+ yield fake_audios
716
+
717
+
718
+ async def inference_async(req: ServeTTSRequest):
719
+ for chunk in inference(req):
720
+ yield chunk
721
+
722
+
723
+ async def buffer_to_async_generator(buffer):
724
+ yield buffer
725
+
726
+
727
+ @routes.http.post("/v1/tts")
728
+ async def api_invoke_model(
729
+ req: Annotated[ServeTTSRequest, Body(exclusive=True)],
730
+ ):
731
+ """
732
+ Invoke model and generate audio
733
+ """
734
+
735
+ if args.max_text_length > 0 and len(req.text) > args.max_text_length:
736
+ raise HTTPException(
737
+ HTTPStatus.BAD_REQUEST,
738
+ content=f"Text is too long, max length is {args.max_text_length}",
739
+ )
740
+
741
+ if req.streaming and req.format != "wav":
742
+ raise HTTPException(
743
+ HTTPStatus.BAD_REQUEST,
744
+ content="Streaming only supports WAV format",
745
+ )
746
+
747
+ if req.streaming:
748
+ return StreamResponse(
749
+ iterable=inference_async(req),
750
+ headers={
751
+ "Content-Disposition": f"attachment; filename=audio.{req.format}",
752
+ },
753
+ content_type=get_content_type(req.format),
754
+ )
755
+ else:
756
+ fake_audios = next(inference(req))
757
+ buffer = io.BytesIO()
758
+ sf.write(
759
+ buffer,
760
+ fake_audios,
761
+ decoder_model.spec_transform.sample_rate,
762
+ format=req.format,
763
+ )
764
+
765
+ return StreamResponse(
766
+ iterable=buffer_to_async_generator(buffer.getvalue()),
767
+ headers={
768
+ "Content-Disposition": f"attachment; filename=audio.{req.format}",
769
+ },
770
+ content_type=get_content_type(req.format),
771
+ )
772
+
773
+
774
+ @routes.http.post("/v1/health")
775
+ async def api_health():
776
+ """
777
+ Health check
778
+ """
779
+
780
+ return JSONResponse({"status": "ok"})
781
+
782
+
783
+ def parse_args():
784
+ parser = ArgumentParser()
785
+ parser.add_argument("--mode", type=str, choices=["agent", "tts"], default="tts")
786
+ parser.add_argument("--load-asr-model", action="store_true")
787
+ parser.add_argument(
788
+ "--llama-checkpoint-path",
789
+ type=str,
790
+ default="checkpoints/fish-speech-1.4",
791
+ )
792
+ parser.add_argument(
793
+ "--decoder-checkpoint-path",
794
+ type=str,
795
+ default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
796
+ )
797
+ parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
798
+ parser.add_argument("--device", type=str, default="cuda")
799
+ parser.add_argument("--half", action="store_true")
800
+ parser.add_argument("--compile", action="store_true")
801
+ parser.add_argument("--max-text-length", type=int, default=0)
802
+ parser.add_argument("--listen", type=str, default="127.0.0.1:8080")
803
+ parser.add_argument("--workers", type=int, default=1)
804
+
805
+ return parser.parse_args()
806
+
807
+
808
+ # Define Kui app
809
+ openapi = OpenAPI(
810
+ {
811
+ "title": "Fish Speech API",
812
+ "version": "1.4.2",
813
+ },
814
+ ).routes
815
+
816
+
817
+ class MsgPackRequest(HttpRequest):
818
+ async def data(
819
+ self,
820
+ ) -> Annotated[
821
+ Any, ContentType("application/msgpack"), ContentType("application/json")
822
+ ]:
823
+ if self.content_type == "application/msgpack":
824
+ return ormsgpack.unpackb(await self.body)
825
+
826
+ elif self.content_type == "application/json":
827
+ return await self.json
828
+
829
+ raise HTTPException(
830
+ HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
831
+ headers={"Accept": "application/msgpack, application/json"},
832
+ )
833
+
834
+
835
+ app = Kui(
836
+ routes=routes + openapi[1:], # Remove the default route
837
+ exception_handlers={
838
+ HTTPException: http_exception_handler,
839
+ Exception: other_exception_handler,
840
+ },
841
+ factory_class=FactoryClass(http=MsgPackRequest),
842
+ cors_config={},
843
+ )
844
+
845
+
846
+ def load_asr_model(*, device="cuda", hub="ms"):
847
+ return AutoModel(
848
+ model="iic/SenseVoiceSmall",
849
+ device=device,
850
+ disable_pbar=True,
851
+ hub=hub,
852
+ )
853
+
854
+
855
+ # Each worker process created by Uvicorn has its own memory space,
856
+ # meaning that models and variables are not shared between processes.
857
+ # Therefore, any global variables (like `llama_queue` or `decoder_model`)
858
+ # will not be shared across workers.
859
+
860
+
861
+ # Multi-threading for deep learning can cause issues, such as inconsistent
862
+ # outputs if multiple threads access the same buffers simultaneously.
863
+ # Instead, it's better to use multiprocessing or independent models per thread.
864
+ @app.on_startup
865
+ def initialize_app(app: Kui):
866
+
867
+ global args, llama_queue, tokenizer, config, decoder_model, vad_model, asr_model, prompt_tokens, prompt_texts
868
+
869
+ prompt_tokens, prompt_texts = [], []
870
+
871
+ args = parse_args() # args same as ones in other processes
872
+ args.precision = torch.half if args.half else torch.bfloat16
873
+
874
+ if args.load_asr_model:
875
+ logger.info(f"Loading ASR model...")
876
+ asr_model = load_asr_model(device=args.device)
877
+
878
+ logger.info("Loading Llama model...")
879
+
880
+ if args.mode == "tts":
881
+ llama_queue = launch_thread_safe_queue(
882
+ checkpoint_path=args.llama_checkpoint_path,
883
+ device=args.device,
884
+ precision=args.precision,
885
+ compile=args.compile,
886
+ )
887
+ else:
888
+ llama_queue, tokenizer, config = launch_thread_safe_queue_agent(
889
+ checkpoint_path=args.llama_checkpoint_path,
890
+ device=args.device,
891
+ precision=args.precision,
892
+ compile=args.compile,
893
+ )
894
+
895
+ logger.info("Llama model loaded, loading VQ-GAN model...")
896
+
897
+ decoder_model = load_decoder_model(
898
+ config_name=args.decoder_config_name,
899
+ checkpoint_path=args.decoder_checkpoint_path,
900
+ device=args.device,
901
+ )
902
+
903
+ logger.info("VQ-GAN model loaded, warming up...")
904
+
905
+ vad_model = load_silero_vad()
906
+
907
+ logger.info("VAD model loaded, warming up...")
908
+
909
+ if args.mode == "tts":
910
+ # Dry run to ensure models work and avoid first-time latency
911
+ list(
912
+ inference(
913
+ ServeTTSRequest(
914
+ text="Hello world.",
915
+ references=[],
916
+ reference_id=None,
917
+ max_new_tokens=0,
918
+ chunk_length=200,
919
+ top_p=0.7,
920
+ repetition_penalty=1.2,
921
+ temperature=0.7,
922
+ emotion=None,
923
+ format="wav",
924
+ )
925
+ )
926
+ )
927
+
928
+ logger.info(f"Warming up done, starting server at http://{args.listen}")
929
+
930
+
931
+ if __name__ == "__main__":
932
+
933
+ import uvicorn
934
+
935
+ args = parse_args()
936
+ host, port = args.listen.split(":")
937
+ uvicorn.run(
938
+ "tools.api:app",
939
+ host='0.0.0.0',
940
+ port=int(port),
941
+ workers=args.workers,
942
+ log_level="info",
943
+ )
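For orientation, a minimal client sketch for the /v1/tts route defined above. It is a sketch only: it assumes the server is reachable on the default port 8080, that httpx and ormsgpack are installed, and that the request fields match the ServeTTSRequest schema the API imports from tools/schema.py (the tools/commons.py model later in this commit lists the same fields).

import httpx
import ormsgpack

# Request body mirroring the ServeTTSRequest fields (non-streaming WAV output).
payload = {
    "text": "Hello world.",
    "format": "wav",
    "streaming": False,
    "references": [],
    "reference_id": None,
}

response = httpx.post(
    "http://127.0.0.1:8080/v1/tts",
    content=ormsgpack.packb(payload),
    headers={"Content-Type": "application/msgpack"},
    timeout=120,
)
response.raise_for_status()

# The endpoint answers with raw audio bytes in the requested format.
with open("output.wav", "wb") as f:
    f.write(response.content)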
tools/auto_rerank.py ADDED
@@ -0,0 +1,159 @@
1
+ import os
2
+
3
+ os.environ["MODELSCOPE_CACHE"] = ".cache/"
4
+
5
+ import string
6
+ import time
7
+ from threading import Lock
8
+
9
+ import librosa
10
+ import numpy as np
11
+ import opencc
12
+ import torch
13
+ from faster_whisper import WhisperModel
14
+
15
+ t2s_converter = opencc.OpenCC("t2s")
16
+
17
+
18
+ def load_model(*, device="cuda"):
19
+ model = WhisperModel(
20
+ "medium",
21
+ device=device,
22
+ compute_type="float16",
23
+ download_root="faster_whisper",
24
+ )
25
+ print("faster_whisper loaded!")
26
+ return model
27
+
28
+
29
+ @torch.no_grad()
30
+ def batch_asr_internal(model: WhisperModel, audios, sr):
31
+ resampled_audios = []
32
+ for audio in audios:
33
+
34
+ if isinstance(audio, np.ndarray):
35
+ audio = torch.from_numpy(audio).float()
36
+
37
+ if audio.dim() > 1:
38
+ audio = audio.squeeze()
39
+
40
+ assert audio.dim() == 1
41
+ audio_np = audio.numpy()
42
+ resampled_audio = librosa.resample(audio_np, orig_sr=sr, target_sr=16000)
43
+ resampled_audios.append(resampled_audio)
44
+
45
+ trans_results = []
46
+
47
+ for resampled_audio in resampled_audios:
48
+ segments, info = model.transcribe(
49
+ resampled_audio,
50
+ language=None,
51
+ beam_size=5,
52
+ initial_prompt="Punctuation is needed in any language.",
53
+ )
54
+ trans_results.append(list(segments))
55
+
56
+ results = []
57
+ for trans_res, audio in zip(trans_results, audios):
58
+
59
+ duration = len(audio) / sr * 1000
60
+ huge_gap = False
61
+ max_gap = 0.0
62
+
63
+ text = None
64
+ last_tr = None
65
+
66
+ for tr in trans_res:
67
+ delta = tr.text.strip()
68
+ if tr.id > 1:
69
+ max_gap = max(tr.start - last_tr.end, max_gap)
70
+ text += delta
71
+ else:
72
+ text = delta
73
+
74
+ last_tr = tr
75
+ if max_gap > 3.0:
76
+ huge_gap = True
77
+ break
78
+
79
+ sim_text = t2s_converter.convert(text)
80
+ results.append(
81
+ {
82
+ "text": sim_text,
83
+ "duration": duration,
84
+ "huge_gap": huge_gap,
85
+ }
86
+ )
87
+
88
+ return results
89
+
90
+
91
+ global_lock = Lock()
92
+
93
+
94
+ def batch_asr(model, audios, sr):
95
+ return batch_asr_internal(model, audios, sr)
96
+
97
+
98
+ def is_chinese(text):
99
+ return True
100
+
101
+
102
+ def calculate_wer(text1, text2, debug=False):
103
+ chars1 = remove_punctuation(text1)
104
+ chars2 = remove_punctuation(text2)
105
+
106
+ m, n = len(chars1), len(chars2)
107
+
108
+ if m > n:
109
+ chars1, chars2 = chars2, chars1
110
+ m, n = n, m
111
+
112
+ prev = list(range(m + 1)) # row 0 distance: [0, 1, 2, ...]
113
+ curr = [0] * (m + 1)
114
+
115
+ for j in range(1, n + 1):
116
+ curr[0] = j
117
+ for i in range(1, m + 1):
118
+ if chars1[i - 1] == chars2[j - 1]:
119
+ curr[i] = prev[i - 1]
120
+ else:
121
+ curr[i] = min(prev[i], curr[i - 1], prev[i - 1]) + 1
122
+ prev, curr = curr, prev
123
+
124
+ edits = prev[m]
125
+ tot = max(len(chars1), len(chars2))
126
+ wer = edits / tot
127
+
128
+ if debug:
129
+ print(" gt: ", chars1)
130
+ print(" pred: ", chars2)
131
+ print(" edits/tot = wer: ", edits, "/", tot, "=", wer)
132
+
133
+ return wer
134
+
135
+
136
+ def remove_punctuation(text):
137
+ chinese_punctuation = (
138
+ " \n\t”“!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—"
139
+ '‛""„‟…‧﹏'
140
+ )
141
+ all_punctuation = string.punctuation + chinese_punctuation
142
+ translator = str.maketrans("", "", all_punctuation)
143
+ text_without_punctuation = text.translate(translator)
144
+ return text_without_punctuation
145
+
146
+
147
+ if __name__ == "__main__":
148
+ model = load_model()
149
+ audios = [
150
+ librosa.load("44100.wav", sr=44100)[0],
151
+ librosa.load("lengyue.wav", sr=44100)[0],
152
+ ]
153
+ print(np.array(audios[0]))
154
+ print(batch_asr(model, audios, 44100))
155
+
156
+ start_time = time.time()
157
+ for _ in range(10):
158
+ print(batch_asr(model, audios, 44100))
159
+ print("Time taken:", time.time() - start_time)
tools/commons.py ADDED
@@ -0,0 +1,35 @@
1
+ from typing import Annotated, Literal, Optional
2
+
3
+ from pydantic import BaseModel, Field, conint
4
+
5
+
6
+ class ServeReferenceAudio(BaseModel):
7
+ audio: bytes
8
+ text: str
9
+
10
+
11
+ class ServeTTSRequest(BaseModel):
12
+ text: str
13
+ chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
14
+ # Audio format
15
+ format: Literal["wav", "pcm", "mp3"] = "wav"
16
+ mp3_bitrate: Literal[64, 128, 192] = 128
17
+ # Reference audios for in-context learning
18
+ references: list[ServeReferenceAudio] = []
19
+ # Reference id
20
+ # For example, if you want use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/
21
+ # Just pass 7f92f8afb8ec43bf81429cc1c9199cb1
22
+ reference_id: str | None = None
23
+ # Normalize text for en & zh; this increases stability for numbers
24
+ normalize: bool = True
26
+ opus_bitrate: Optional[int] = -1000
27
+ # Balance mode will reduce latency to 300ms, but may decrease stability
28
+ latency: Literal["normal", "balanced"] = "normal"
29
+ # not usually used below
30
+ streaming: bool = False
31
+ emotion: Optional[str] = None
32
+ max_new_tokens: int = 1024
33
+ top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
34
+ repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
35
+ temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
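As a quick illustration of how the field constraints in this schema behave, a small sketch (it assumes pydantic v2, which the api.py above appears to target, and that tools.commons is importable from the project root):

from pydantic import ValidationError

from tools.commons import ServeTTSRequest

# Values inside the declared bounds validate normally.
req = ServeTTSRequest(text="Hello world.", top_p=0.8, temperature=0.7)
print(req.format, req.chunk_length)  # wav 200

# top_p is constrained to the range 0.1-1.0, so 1.5 is rejected.
try:
    ServeTTSRequest(text="Hello world.", top_p=1.5)
except ValidationError as err:
    print(err.errors()[0]["loc"])  # ('top_p',)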
tools/download_models.py ADDED
@@ -0,0 +1,55 @@
1
+ import os
2
+
3
+ from huggingface_hub import hf_hub_download
4
+
5
+
6
+ # Download
7
+ def check_and_download_files(repo_id, file_list, local_dir):
8
+ os.makedirs(local_dir, exist_ok=True)
9
+ for file in file_list:
10
+ file_path = os.path.join(local_dir, file)
11
+ if not os.path.exists(file_path):
12
+ print(f"{file} does not exist, downloading from the Hugging Face repository...")
13
+ hf_hub_download(
14
+ repo_id=repo_id,
15
+ filename=file,
16
+ resume_download=True,
17
+ local_dir=local_dir,
18
+ local_dir_use_symlinks=False,
19
+ )
20
+ else:
21
+ print(f"{file} already exists, skipping download.")
22
+
23
+
24
+ # 1st
25
+ repo_id_1 = "fishaudio/fish-speech-1.4"
26
+ local_dir_1 = "./checkpoints/fish-speech-1.4"
27
+ files_1 = [
28
+ "model.pth",
29
+ "README.md",
30
+ "special_tokens_map.json",
31
+ "tokenizer_config.json",
32
+ "tokenizer.json",
33
+ "config.json",
34
+ "firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
35
+ ]
36
+
37
+ # 3rd
38
+ repo_id_3 = "fishaudio/fish-speech-1"
39
+ local_dir_3 = "./"
40
+ files_3 = [
41
+ "ffmpeg.exe",
42
+ "ffprobe.exe",
43
+ ]
44
+
45
+ # 4th
46
+ repo_id_4 = "SpicyqSama007/fish-speech-packed"
47
+ local_dir_4 = "./"
48
+ files_4 = [
49
+ "asr-label-win-x64.exe",
50
+ ]
51
+
52
+ check_and_download_files(repo_id_1, files_1, local_dir_1)
53
+
54
+ check_and_download_files(repo_id_3, files_3, local_dir_3)
55
+ check_and_download_files(repo_id_4, files_4, local_dir_4)
tools/e2e_webui.py ADDED
@@ -0,0 +1,232 @@
1
+ import io
2
+ import re
3
+ import wave
4
+
5
+ import gradio as gr
6
+ import numpy as np
7
+
8
+ from .fish_e2e import FishE2EAgent, FishE2EEventType
9
+ from .schema import ServeMessage, ServeTextPart, ServeVQPart
10
+
11
+
12
+ def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
13
+ buffer = io.BytesIO()
14
+
15
+ with wave.open(buffer, "wb") as wav_file:
16
+ wav_file.setnchannels(channels)
17
+ wav_file.setsampwidth(bit_depth // 8)
18
+ wav_file.setframerate(sample_rate)
19
+
20
+ wav_header_bytes = buffer.getvalue()
21
+ buffer.close()
22
+ return wav_header_bytes
23
+
24
+
25
+ class ChatState:
26
+ def __init__(self):
27
+ self.conversation = []
28
+ self.added_systext = False
29
+ self.added_sysaudio = False
30
+
31
+ def get_history(self):
32
+ results = []
33
+ for msg in self.conversation:
34
+ results.append({"role": msg.role, "content": self.repr_message(msg)})
35
+
36
+ # Process assistant messages to extract questions and update user messages
37
+ for i, msg in enumerate(results):
38
+ if msg["role"] == "assistant":
39
+ match = re.search(r"Question: (.*?)\n\nAnswer:", msg["content"])
40
+ if match and i > 0 and results[i - 1]["role"] == "user":
41
+ # Update previous user message with extracted question
42
+ results[i - 1]["content"] += "\n" + match.group(1)
43
+ # Remove the Question/Answer format from assistant message
44
+ msg["content"] = msg["content"].split("\n\nAnswer: ", 1)[1]
45
+ return results
46
+
47
+ def repr_message(self, msg: ServeMessage):
48
+ response = ""
49
+ for part in msg.parts:
50
+ if isinstance(part, ServeTextPart):
51
+ response += part.text
52
+ elif isinstance(part, ServeVQPart):
53
+ response += f"<audio {len(part.codes[0]) / 21:.2f}s>"
54
+ return response
55
+
56
+
57
+ def clear_fn():
58
+ return [], ChatState(), None, None, None
59
+
60
+
61
+ async def process_audio_input(
62
+ sys_audio_input, sys_text_input, audio_input, state: ChatState, text_input: str
63
+ ):
64
+ if audio_input is None and not text_input:
65
+ raise gr.Error("No input provided")
66
+
67
+ agent = FishE2EAgent() # Create new agent instance for each request
68
+
69
+ # Convert audio input to numpy array
70
+ if isinstance(audio_input, tuple):
71
+ sr, audio_data = audio_input
72
+ elif text_input:
73
+ sr = 44100
74
+ audio_data = None
75
+ else:
76
+ raise gr.Error("Invalid audio format")
77
+
78
+ if isinstance(sys_audio_input, tuple):
79
+ sr, sys_audio_data = sys_audio_input
80
+ else:
81
+ sr = 44100
82
+ sys_audio_data = None
83
+
84
+ def append_to_chat_ctx(
85
+ part: ServeTextPart | ServeVQPart, role: str = "assistant"
86
+ ) -> None:
87
+ if not state.conversation or state.conversation[-1].role != role:
88
+ state.conversation.append(ServeMessage(role=role, parts=[part]))
89
+ else:
90
+ state.conversation[-1].parts.append(part)
91
+
92
+ if state.added_systext is False and sys_text_input:
93
+ state.added_systext = True
94
+ append_to_chat_ctx(ServeTextPart(text=sys_text_input), role="system")
95
+ if text_input:
96
+ append_to_chat_ctx(ServeTextPart(text=text_input), role="user")
97
+ audio_data = None
98
+
99
+ result_audio = b""
100
+ async for event in agent.stream(
101
+ sys_audio_data,
102
+ audio_data,
103
+ sr,
104
+ 1,
105
+ chat_ctx={
106
+ "messages": state.conversation,
107
+ "added_sysaudio": state.added_sysaudio,
108
+ },
109
+ ):
110
+ if event.type == FishE2EEventType.USER_CODES:
111
+ append_to_chat_ctx(ServeVQPart(codes=event.vq_codes), role="user")
112
+ elif event.type == FishE2EEventType.SPEECH_SEGMENT:
113
+ append_to_chat_ctx(ServeVQPart(codes=event.vq_codes))
114
+ yield state.get_history(), wav_chunk_header() + event.frame.data, None, None
115
+ elif event.type == FishE2EEventType.TEXT_SEGMENT:
116
+ append_to_chat_ctx(ServeTextPart(text=event.text))
117
+ yield state.get_history(), None, None, None
118
+
119
+ yield state.get_history(), None, None, None
120
+
121
+
122
+ async def process_text_input(
123
+ sys_audio_input, sys_text_input, state: ChatState, text_input: str
124
+ ):
125
+ async for event in process_audio_input(
126
+ sys_audio_input, sys_text_input, None, state, text_input
127
+ ):
128
+ yield event
129
+
130
+
131
+ def create_demo():
132
+ with gr.Blocks() as demo:
133
+ state = gr.State(ChatState())
134
+
135
+ with gr.Row():
136
+ # Left column (70%) for chatbot and notes
137
+ with gr.Column(scale=7):
138
+ chatbot = gr.Chatbot(
139
+ [],
140
+ elem_id="chatbot",
141
+ bubble_full_width=False,
142
+ height=600,
143
+ type="messages",
144
+ )
145
+
146
+ # notes = gr.Markdown(
147
+ # """
148
+ # # Fish Agent
149
+ # 1. 此Demo为Fish Audio自研端到端语言模型Fish Agent 3B版本.
150
+ # 2. 你可以在我们的官方仓库找到代码以及权重,但是相关内容全部基于 CC BY-NC-SA 4.0 许可证发布.
151
+ # 3. Demo为早期灰度测试版本,推理速度尚待优化.
152
+ # # 特色
153
+ # 1. 该模型自动集成ASR与TTS部分,不需要外挂其它模型,即真正的端到端,而非三段式(ASR+LLM+TTS).
154
+ # 2. 模型可以使用reference audio控制说话音色.
155
+ # 3. 可以生成具有较强情感与韵律的音频.
156
+ # """
157
+ # )
158
+ notes = gr.Markdown(
159
+ """
160
+ # Fish Agent
161
+ 1. This demo is Fish Audio's self-researched end-to-end language model, Fish Agent version 3B.
162
+ 2. You can find the code and weights in our official repo on [GitHub](https://github.com/fishaudio/fish-speech) and [Hugging Face](https://huggingface.co/fishaudio/fish-agent-v0.1-3b), but the content is released under a CC BY-NC-SA 4.0 licence.
163
+ 3. The demo is an early alpha test version; the inference speed still needs to be optimised.
164
+ # Features
165
+ 1. The model automatically integrates the ASR and TTS parts, so no other models need to be plugged in, i.e., it is truly end-to-end rather than three-stage (ASR+LLM+TTS).
166
+ 2. The model can use reference audio to control the speech timbre.
167
+ 3. The model can generate speech with strong emotion.
168
+ """
169
+ )
170
+
171
+ # Right column (30%) for controls
172
+ with gr.Column(scale=3):
173
+ sys_audio_input = gr.Audio(
174
+ sources=["upload"],
175
+ type="numpy",
176
+ label="Give a timbre for your assistant",
177
+ )
178
+ sys_text_input = gr.Textbox(
179
+ label="What is your assistant's role?",
180
+ value="You are a voice assistant created by Fish Audio, offering end-to-end voice interaction for a seamless user experience. You are required to first transcribe the user's speech, then answer it in the following format: 'Question: [USER_SPEECH]\n\nAnswer: [YOUR_RESPONSE]\n'. You are required to use the following voice in this conversation.",
181
+ type="text",
182
+ )
183
+ audio_input = gr.Audio(
184
+ sources=["microphone"], type="numpy", label="Speak your message"
185
+ )
186
+
187
+ text_input = gr.Textbox(label="Or type your message", type="text")
188
+
189
+ output_audio = gr.Audio(
190
+ label="Assistant's Voice",
191
+ streaming=True,
192
+ autoplay=True,
193
+ interactive=False,
194
+ )
195
+
196
+ send_button = gr.Button("Send", variant="primary")
197
+ clear_button = gr.Button("Clear")
198
+
199
+ # Event handlers
200
+ audio_input.stop_recording(
201
+ process_audio_input,
202
+ inputs=[sys_audio_input, sys_text_input, audio_input, state, text_input],
203
+ outputs=[chatbot, output_audio, audio_input, text_input],
204
+ show_progress=True,
205
+ )
206
+
207
+ send_button.click(
208
+ process_text_input,
209
+ inputs=[sys_audio_input, sys_text_input, state, text_input],
210
+ outputs=[chatbot, output_audio, audio_input, text_input],
211
+ show_progress=True,
212
+ )
213
+
214
+ text_input.submit(
215
+ process_text_input,
216
+ inputs=[sys_audio_input, sys_text_input, state, text_input],
217
+ outputs=[chatbot, output_audio, audio_input, text_input],
218
+ show_progress=True,
219
+ )
220
+
221
+ clear_button.click(
222
+ clear_fn,
223
+ inputs=[],
224
+ outputs=[chatbot, state, audio_input, output_audio, text_input],
225
+ )
226
+
227
+ return demo
228
+
229
+
230
+ if __name__ == "__main__":
231
+ demo = create_demo()
232
+ demo.launch(server_name="127.0.0.1", server_port=7860, share=True)
tools/extract_model.py ADDED
@@ -0,0 +1,21 @@
1
+ import click
2
+ import torch
3
+ from loguru import logger
4
+
5
+
6
+ @click.command()
7
+ @click.argument("model_path")
8
+ @click.argument("output_path")
9
+ def main(model_path, output_path):
10
+ if model_path == output_path:
11
+ logger.error("Model path and output path are the same")
12
+ return
13
+
14
+ logger.info(f"Loading model from {model_path}")
15
+ state_dict = torch.load(model_path, map_location="cpu")["state_dict"]
16
+ torch.save(state_dict, output_path)
17
+ logger.info(f"Model saved to {output_path}")
18
+
19
+
20
+ if __name__ == "__main__":
21
+ main()
tools/file.py ADDED
@@ -0,0 +1,125 @@
1
+ import base64
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+ from loguru import logger
6
+ from natsort import natsorted
7
+
8
+ AUDIO_EXTENSIONS = {
9
+ ".mp3",
10
+ ".wav",
11
+ ".flac",
12
+ ".ogg",
13
+ ".m4a",
14
+ ".wma",
15
+ ".aac",
16
+ ".aiff",
17
+ ".aif",
18
+ ".aifc",
19
+ }
20
+
21
+ VIDEO_EXTENSIONS = {
22
+ ".mp4",
23
+ ".avi",
24
+ }
25
+
26
+
27
+ def audio_to_bytes(file_path):
28
+ if not file_path or not Path(file_path).exists():
29
+ return None
30
+ with open(file_path, "rb") as wav_file:
31
+ wav = wav_file.read()
32
+ return wav
33
+
34
+
35
+ def read_ref_text(ref_text):
36
+ path = Path(ref_text)
37
+ if path.exists() and path.is_file():
38
+ with path.open("r", encoding="utf-8") as file:
39
+ return file.read()
40
+ return ref_text
41
+
42
+
43
+ def list_files(
44
+ path: Union[Path, str],
45
+ extensions: set[str] = None,
46
+ recursive: bool = False,
47
+ sort: bool = True,
48
+ ) -> list[Path]:
49
+ """List files in a directory.
50
+
51
+ Args:
52
+ path (Path): Path to the directory.
53
+ extensions (set, optional): Extensions to filter. Defaults to None.
54
+ recursive (bool, optional): Whether to search recursively. Defaults to False.
55
+ sort (bool, optional): Whether to sort the files. Defaults to True.
56
+
57
+ Returns:
58
+ list: List of files.
59
+ """
60
+
61
+ if isinstance(path, str):
62
+ path = Path(path)
63
+
64
+ if not path.exists():
65
+ raise FileNotFoundError(f"Directory {path} does not exist.")
66
+
67
+ files = [file for ext in extensions for file in path.rglob(f"*{ext}")]
68
+
69
+ if sort:
70
+ files = natsorted(files)
71
+
72
+ return files
73
+
74
+
75
+ def load_filelist(path: Path | str) -> list[tuple[Path, str, str, str]]:
76
+ """
77
+ Load a Bert-VITS2 style filelist.
78
+ """
79
+
80
+ files = set()
81
+ results = []
82
+ count_duplicated, count_not_found = 0, 0
83
+
84
+ LANGUAGE_TO_LANGUAGES = {
85
+ "zh": ["zh", "en"],
86
+ "jp": ["jp", "en"],
87
+ "en": ["en"],
88
+ }
89
+
90
+ with open(path, "r", encoding="utf-8") as f:
91
+ for line in f.readlines():
92
+ splits = line.strip().split("|", maxsplit=3)
93
+ if len(splits) != 4:
94
+ logger.warning(f"Invalid line: {line}")
95
+ continue
96
+
97
+ filename, speaker, language, text = splits
98
+ file = Path(filename)
99
+ language = language.strip().lower()
100
+
101
+ if language == "ja":
102
+ language = "jp"
103
+
104
+ assert language in ["zh", "jp", "en"], f"Invalid language {language}"
105
+ languages = LANGUAGE_TO_LANGUAGES[language]
106
+
107
+ if file in files:
108
+ logger.warning(f"Duplicated file: {file}")
109
+ count_duplicated += 1
110
+ continue
111
+
112
+ if not file.exists():
113
+ logger.warning(f"File not found: {file}")
114
+ count_not_found += 1
115
+ continue
116
+
117
+ results.append((file, speaker, languages, text))
118
+
119
+ if count_duplicated > 0:
120
+ logger.warning(f"Total duplicated files: {count_duplicated}")
121
+
122
+ if count_not_found > 0:
123
+ logger.warning(f"Total files not found: {count_not_found}")
124
+
125
+ return results
tools/fish_e2e.py ADDED
@@ -0,0 +1,298 @@
1
+ import base64
2
+ import ctypes
3
+ import io
4
+ import json
5
+ import os
6
+ import struct
7
+ from dataclasses import dataclass
8
+ from enum import Enum
9
+ from typing import AsyncGenerator, Union
10
+
11
+ import httpx
12
+ import numpy as np
13
+ import ormsgpack
14
+ import soundfile as sf
15
+
16
+ from .schema import (
17
+ ServeMessage,
18
+ ServeRequest,
19
+ ServeTextPart,
20
+ ServeVQGANDecodeRequest,
21
+ ServeVQGANEncodeRequest,
22
+ ServeVQPart,
23
+ )
24
+
25
+
26
+ class CustomAudioFrame:
27
+ def __init__(self, data, sample_rate, num_channels, samples_per_channel):
28
+ if len(data) < num_channels * samples_per_channel * ctypes.sizeof(
29
+ ctypes.c_int16
30
+ ):
31
+ raise ValueError(
32
+ "data length must be >= num_channels * samples_per_channel * sizeof(int16)"
33
+ )
34
+
35
+ self._data = bytearray(data)
36
+ self._sample_rate = sample_rate
37
+ self._num_channels = num_channels
38
+ self._samples_per_channel = samples_per_channel
39
+
40
+ @property
41
+ def data(self):
42
+ return memoryview(self._data).cast("h")
43
+
44
+ @property
45
+ def sample_rate(self):
46
+ return self._sample_rate
47
+
48
+ @property
49
+ def num_channels(self):
50
+ return self._num_channels
51
+
52
+ @property
53
+ def samples_per_channel(self):
54
+ return self._samples_per_channel
55
+
56
+ @property
57
+ def duration(self):
58
+ return self.samples_per_channel / self.sample_rate
59
+
60
+ def __repr__(self):
61
+ return (
62
+ f"CustomAudioFrame(sample_rate={self.sample_rate}, "
63
+ f"num_channels={self.num_channels}, "
64
+ f"samples_per_channel={self.samples_per_channel}, "
65
+ f"duration={self.duration:.3f})"
66
+ )
67
+
68
+
69
+ class FishE2EEventType(Enum):
70
+ SPEECH_SEGMENT = 1
71
+ TEXT_SEGMENT = 2
72
+ END_OF_TEXT = 3
73
+ END_OF_SPEECH = 4
74
+ ASR_RESULT = 5
75
+ USER_CODES = 6
76
+
77
+
78
+ @dataclass
79
+ class FishE2EEvent:
80
+ type: FishE2EEventType
81
+ frame: np.ndarray = None
82
+ text: str = None
83
+ vq_codes: list[list[int]] = None
84
+
85
+
86
+ client = httpx.AsyncClient(
87
+ timeout=None,
88
+ limits=httpx.Limits(
89
+ max_connections=None,
90
+ max_keepalive_connections=None,
91
+ keepalive_expiry=None,
92
+ ),
93
+ )
94
+
95
+
96
+ class FishE2EAgent:
97
+ def __init__(self):
98
+ self.llm_url = "http://localhost:8080/v1/chat"
99
+ self.vqgan_url = "http://localhost:8080"
100
+ self.client = httpx.AsyncClient(timeout=None)
101
+
102
+ async def get_codes(self, audio_data, sample_rate):
103
+ audio_buffer = io.BytesIO()
104
+ sf.write(audio_buffer, audio_data, sample_rate, format="WAV")
105
+ audio_buffer.seek(0)
106
+ # Step 1: Encode audio using VQGAN
107
+ encode_request = ServeVQGANEncodeRequest(audios=[audio_buffer.read()])
108
+ encode_request_bytes = ormsgpack.packb(
109
+ encode_request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC
110
+ )
111
+ encode_response = await self.client.post(
112
+ f"{self.vqgan_url}/v1/vqgan/encode",
113
+ data=encode_request_bytes,
114
+ headers={"Content-Type": "application/msgpack"},
115
+ )
116
+ encode_response_data = ormsgpack.unpackb(encode_response.content)
117
+ codes = encode_response_data["tokens"][0]
118
+ return codes
119
+
120
+ async def stream(
121
+ self,
122
+ system_audio_data: np.ndarray | None,
123
+ user_audio_data: np.ndarray | None,
124
+ sample_rate: int,
125
+ num_channels: int,
126
+ chat_ctx: dict | None = None,
127
+ ) -> AsyncGenerator[bytes, None]:
128
+
129
+ if system_audio_data is not None:
130
+ sys_codes = await self.get_codes(system_audio_data, sample_rate)
131
+ else:
132
+ sys_codes = None
133
+ if user_audio_data is not None:
134
+ user_codes = await self.get_codes(user_audio_data, sample_rate)
135
+ # Step 2: Prepare LLM request
136
+ if chat_ctx is None:
137
+ sys_parts = [
138
+ ServeTextPart(
139
+ text='您是由 Fish Audio 设计的语音助手,提供端到端的语音交互,实现无缝用户体验。首先转录用户的语音,然后使用以下格式回答:"Question: [用户语音]\n\nAnswer: [你的回答]\n"。'
140
+ ),
141
+ ]
142
+ if system_audio_data is not None:
143
+ sys_parts.append(ServeVQPart(codes=sys_codes))
144
+ chat_ctx = {
145
+ "messages": [
146
+ ServeMessage(
147
+ role="system",
148
+ parts=sys_parts,
149
+ ),
150
+ ],
151
+ }
152
+ else:
153
+ if chat_ctx["added_sysaudio"] is False and sys_codes:
154
+ chat_ctx["added_sysaudio"] = True
155
+ chat_ctx["messages"][0].parts.append(ServeVQPart(codes=sys_codes))
156
+
157
+ prev_messages = chat_ctx["messages"].copy()
158
+ if user_audio_data is not None:
159
+ yield FishE2EEvent(
160
+ type=FishE2EEventType.USER_CODES,
161
+ vq_codes=user_codes,
162
+ )
163
+ else:
164
+ user_codes = None
165
+
166
+ request = ServeRequest(
167
+ messages=prev_messages
168
+ + (
169
+ [
170
+ ServeMessage(
171
+ role="user",
172
+ parts=[ServeVQPart(codes=user_codes)],
173
+ )
174
+ ]
175
+ if user_codes
176
+ else []
177
+ ),
178
+ streaming=True,
179
+ num_samples=1,
180
+ )
181
+
182
+ # Step 3: Stream LLM response and decode audio
183
+ buffer = b""
184
+ vq_codes = []
185
+ current_vq = False
186
+
187
+ async def decode_send():
188
+ nonlocal current_vq
189
+ nonlocal vq_codes
190
+
191
+ data = np.concatenate(vq_codes, axis=1).tolist()
192
+ # Decode VQ codes to audio
193
+ decode_request = ServeVQGANDecodeRequest(tokens=[data])
194
+ decode_response = await self.client.post(
195
+ f"{self.vqgan_url}/v1/vqgan/decode",
196
+ data=ormsgpack.packb(
197
+ decode_request,
198
+ option=ormsgpack.OPT_SERIALIZE_PYDANTIC,
199
+ ),
200
+ headers={"Content-Type": "application/msgpack"},
201
+ )
202
+ decode_data = ormsgpack.unpackb(decode_response.content)
203
+
204
+ # Convert float16 audio data to int16
205
+ audio_data = np.frombuffer(decode_data["audios"][0], dtype=np.float16)
206
+ audio_data = (audio_data * 32768).astype(np.int16).tobytes()
207
+
208
+ audio_frame = CustomAudioFrame(
209
+ data=audio_data,
210
+ samples_per_channel=len(audio_data) // 2,
211
+ sample_rate=44100,
212
+ num_channels=1,
213
+ )
214
+ yield FishE2EEvent(
215
+ type=FishE2EEventType.SPEECH_SEGMENT,
216
+ frame=audio_frame,
217
+ vq_codes=data,
218
+ )
219
+
220
+ current_vq = False
221
+ vq_codes = []
222
+
223
+ async with self.client.stream(
224
+ "POST",
225
+ self.llm_url,
226
+ data=ormsgpack.packb(request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
227
+ headers={"Content-Type": "application/msgpack"},
228
+ ) as response:
229
+
230
+ async for chunk in response.aiter_bytes():
231
+ buffer += chunk
232
+
233
+ while len(buffer) >= 4:
234
+ read_length = struct.unpack("I", buffer[:4])[0]
235
+ if len(buffer) < 4 + read_length:
236
+ break
237
+
238
+ body = buffer[4 : 4 + read_length]
239
+ buffer = buffer[4 + read_length :]
240
+ data = ormsgpack.unpackb(body)
241
+
242
+ if data["delta"] and data["delta"]["part"]:
243
+ if current_vq and data["delta"]["part"]["type"] == "text":
244
+ async for event in decode_send():
245
+ yield event
246
+ if data["delta"]["part"]["type"] == "text":
247
+ yield FishE2EEvent(
248
+ type=FishE2EEventType.TEXT_SEGMENT,
249
+ text=data["delta"]["part"]["text"],
250
+ )
251
+ elif data["delta"]["part"]["type"] == "vq":
252
+ vq_codes.append(np.array(data["delta"]["part"]["codes"]))
253
+ current_vq = True
254
+
255
+ if current_vq and vq_codes:
256
+ async for event in decode_send():
257
+ yield event
258
+
259
+ yield FishE2EEvent(type=FishE2EEventType.END_OF_TEXT)
260
+ yield FishE2EEvent(type=FishE2EEventType.END_OF_SPEECH)
261
+
262
+
263
+ # Example usage:
264
+ async def main():
265
+ import torchaudio
266
+
267
+ agent = FishE2EAgent()
268
+
269
+ # Replace this with actual audio data loading
270
+ with open("uz_story_en.m4a", "rb") as f:
271
+ audio_data = f.read()
272
+
273
+ audio_data, sample_rate = torchaudio.load("uz_story_en.m4a")
274
+ audio_data = (audio_data.numpy() * 32768).astype(np.int16)
275
+
276
+ stream = agent.stream(None, audio_data, sample_rate, 1)
277
+ if os.path.exists("audio_segment.wav"):
278
+ os.remove("audio_segment.wav")
279
+
280
+ async for event in stream:
281
+ if event.type == FishE2EEventType.SPEECH_SEGMENT:
282
+ # Handle speech segment (e.g., play audio or save to file)
283
+ with open("audio_segment.wav", "ab+") as f:
284
+ f.write(event.frame.data)
285
+ elif event.type == FishE2EEventType.ASR_RESULT:
286
+ print(event.text, flush=True)
287
+ elif event.type == FishE2EEventType.TEXT_SEGMENT:
288
+ print(event.text, flush=True, end="")
289
+ elif event.type == FishE2EEventType.END_OF_TEXT:
290
+ print("\nEnd of text reached.")
291
+ elif event.type == FishE2EEventType.END_OF_SPEECH:
292
+ print("End of speech reached.")
293
+
294
+
295
+ if __name__ == "__main__":
296
+ import asyncio
297
+
298
+ asyncio.run(main())
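Note on the streaming protocol used by FishE2EAgent.stream above: the chat endpoint is assumed to frame every msgpack message with a 4-byte length prefix, which is what the struct.unpack("I", buffer[:4]) loop decodes. A minimal standalone sketch of that framing follows; pack_frame and iter_frames are illustrative helper names, not part of this repository.

import struct
import ormsgpack

def pack_frame(obj) -> bytes:
    # Prefix the msgpack body with its byte length, matching the reader loop above
    body = ormsgpack.packb(obj)
    return struct.pack("I", len(body)) + body

def iter_frames(buffer: bytes):
    # Yield every complete msgpack message currently held in the buffer
    while len(buffer) >= 4:
        (length,) = struct.unpack("I", buffer[:4])
        if len(buffer) < 4 + length:
            break
        yield ormsgpack.unpackb(buffer[4 : 4 + length])
        buffer = buffer[4 + length :]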
tools/llama/__pycache__/generate.cpython-310.pyc ADDED
Binary file (21.1 kB). View file
 
tools/llama/build_dataset.py ADDED
@@ -0,0 +1,169 @@
1
+ import itertools
2
+ import os
3
+ import re
4
+ from collections import defaultdict
5
+ from functools import partial
6
+ from multiprocessing import Pool
7
+ from pathlib import Path
8
+
9
+ import click
10
+ import numpy as np
11
+ from loguru import logger
12
+ from tqdm import tqdm
13
+
14
+ from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData
15
+ from fish_speech.datasets.protos.text_data_stream import pack_pb_stream
16
+ from tools.file import load_filelist
17
+
18
+ # To avoid CPU overload
19
+ os.environ["MKL_NUM_THREADS"] = "1"
20
+ os.environ["OMP_NUM_THREADS"] = "1"
21
+
22
+
23
+ def task_generator_folder(root: Path, text_extension: str):
24
+ files = list(tqdm(Path(root).rglob("*.npy"), desc=f"Loading {root}"))
25
+ files = sorted(files)
26
+
27
+ grouped_files = defaultdict(list)
28
+ for file in tqdm(files, desc=f"Grouping {root}"):
29
+ p = str(file.parent)
30
+ speaker = file.parent.name
31
+
32
+ try:
33
+ if isinstance(text_extension, str):
34
+ texts = [file.with_suffix(text_extension).read_text(encoding="utf-8")]
35
+ else:
36
+ texts = [
37
+ file.with_suffix(ext).read_text(encoding="utf-8")
38
+ for ext in text_extension
39
+ ]
40
+ except Exception as e:
41
+ logger.error(f"Failed to read text {file}: {e}")
42
+ continue
43
+
44
+ grouped_files[p].append((speaker, file, texts))
45
+
46
+ logger.info(
47
+ f"Found {len(grouped_files)} groups in {root}, {list(grouped_files.keys())[:5]}..."
48
+ )
49
+
50
+ for i in grouped_files.values():
51
+ subset = [(f, t) for _, f, t in i]
52
+ yield i[0][0], subset, "folder"
53
+
54
+
55
+ def task_generator_filelist(filelist):
56
+ grouped_files = defaultdict(list)
57
+ for filename, speaker, _, text in load_filelist(filelist):
58
+ grouped_files[speaker].append((Path(filename), [text]))
59
+
60
+ logger.info(f"Found {len(grouped_files)} groups in {filelist}")
61
+ for speaker, values in grouped_files.items():
62
+ yield speaker, values, "filelist"
63
+
64
+
65
+ def run_task(task):
66
+ name, subset, source = task
67
+
68
+ # Parse the files
69
+ sentences = []
70
+ for file, texts in subset:
71
+ np_file = file.with_suffix(".npy")
72
+ if np_file.exists() is False:
73
+ logger.warning(f"Can't find {np_file}")
74
+ continue
75
+
76
+ new_texts = []
77
+
78
+ for text in texts:
79
+ # Simple cleaning: replace { xxx } and < xxx > with space
80
+ text = re.sub(r"\{.*?\}", " ", text)
81
+ text = re.sub(r"<.*?>", " ", text)
82
+ text = re.sub(r"\s+", " ", text)
83
+ new_texts.append(text)
84
+
85
+ try:
86
+ semantics = np.load(np_file)
87
+ except Exception as e:
88
+ logger.error(f"Failed to parse {file}: {e}")
89
+ continue
90
+
91
+ if isinstance(semantics, np.ndarray):
92
+ semantics = semantics.tolist()
93
+
94
+ sentences.append(
95
+ Sentence(
96
+ texts=new_texts,
97
+ semantics=[Semantics(values=s) for s in semantics],
98
+ )
99
+ )
100
+
101
+ # Pack the sentences
102
+ return pack_pb_stream(
103
+ TextData(
104
+ source=source,
105
+ name=name,
106
+ sentences=sentences,
107
+ )
108
+ )
109
+
110
+
111
+ @click.command()
112
+ @click.option(
113
+ "--input",
114
+ type=click.Path(path_type=Path),
115
+ required=True,
116
+ help="A folder containing the dataset or a filelist",
117
+ multiple=True,
118
+ )
119
+ @click.option(
120
+ "--output", type=click.Path(path_type=Path), default="data/quantized-dataset-ft"
121
+ )
122
+ @click.option("--num-workers", type=int, default=16)
123
+ @click.option("--text-extension", type=str, default=[".txt"], multiple=True)
124
+ @click.option(
125
+ "--shard-size", type=int, default=10, help="The maximum size of each shard in mb"
126
+ )
127
+ def main(input, output, num_workers, text_extension, shard_size):
128
+ generator_fns = []
129
+
130
+ for f in input:
131
+ assert f.exists(), f"{f} not found"
132
+
133
+ if f.is_dir():
134
+ generator_fn = task_generator_folder(f, text_extension)
135
+ else:
136
+ generator_fn = task_generator_filelist(f)
137
+
138
+ generator_fns.append(generator_fn)
139
+
140
+ generator_fn = itertools.chain(*generator_fns)
141
+ output.mkdir(parents=True, exist_ok=True)
142
+
143
+ dataset_fp = None
144
+ tar_idx = 0
145
+ written_size = 0
146
+
147
+ with Pool(num_workers) as p:
148
+ for result in tqdm(p.imap_unordered(run_task, generator_fn)):
149
+ if dataset_fp is None:
150
+ dataset_fp = open(Path(output) / f"{tar_idx:08d}.protos", "wb")
151
+
152
+ dataset_fp.write(result)
153
+ written_size += len(result)
154
+
155
+ if written_size > shard_size * 1024 * 1024:
156
+ logger.info(f"Finished writing {tar_idx} shards to {output}")
157
+ dataset_fp.close()
158
+ dataset_fp = None
159
+ written_size = 0
160
+ tar_idx += 1
161
+
162
+ if dataset_fp is not None:
163
+ dataset_fp.close()
164
+
165
+ logger.info(f"Finished writing {tar_idx + 1} shards to {output}")
166
+
167
+
168
+ if __name__ == "__main__":
169
+ main()
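For reference, the cleaning applied in run_task only strips {...} and <...> spans and collapses whitespace before packing sentences. A standalone sketch of the same regexes for quick verification; clean_label is an illustrative name, not part of the repository.

import re

def clean_label(text: str) -> str:
    # Mirrors the cleaning in run_task: drop {...} / <...> spans, collapse whitespace
    text = re.sub(r"\{.*?\}", " ", text)
    text = re.sub(r"<.*?>", " ", text)
    return re.sub(r"\s+", " ", text)

print(clean_label("Hello {laugh} and <noise> world"))  # -> "Hello and world"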
tools/llama/eval_in_context.py ADDED
@@ -0,0 +1,171 @@
1
+ import pyrootutils
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from matplotlib import pyplot as plt
5
+ from transformers import AutoTokenizer
6
+
7
+ # register eval resolver and root
8
+ pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
9
+
10
+ from torch.utils.data import DataLoader
11
+
12
+ from fish_speech.datasets.semantic import AutoAugTextDataset, TextDataCollator
13
+ from tools.llama.generate import load_model
14
+
15
+
16
+ def smooth(
17
+ scalars: list[float], weight: float
18
+ ) -> list[float]: # Weight between 0 and 1
19
+ last = scalars[0] # First value in the plot (first timestep)
20
+ smoothed = list()
21
+ for point in scalars:
22
+ smoothed_val = last * weight + (1 - weight) * point # Calculate smoothed value
23
+ smoothed.append(smoothed_val) # Save it
24
+ last = smoothed_val # Anchor the last smoothed value
25
+
26
+ return smoothed
27
+
28
+
29
+ @torch.inference_mode()
30
+ def analyze_one_model(loader, config, weight, max_length):
31
+ device = "cuda" if torch.cuda.is_available() else "cpu"
32
+ model = load_model(
33
+ config,
34
+ weight,
35
+ device,
36
+ torch.bfloat16,
37
+ max_length,
38
+ compile=False,
39
+ )[0]
40
+
41
+ current_step = 0
42
+ model.eval()
43
+
44
+ semantic_loss_sum = torch.zeros(
45
+ max_length,
46
+ dtype=torch.float32,
47
+ device=device,
48
+ )
49
+ counter = torch.zeros(
50
+ max_length,
51
+ dtype=torch.long,
52
+ device=device,
53
+ )
54
+
55
+ for batch in loader:
56
+ batch = {k: v.to(device) for k, v in batch.items()}
57
+
58
+ labels = batch["labels"]
59
+ outputs = model(
60
+ inp=batch["inputs"],
61
+ key_padding_mask=batch["attention_masks"],
62
+ )
63
+
64
+ token_logits = outputs.token_logits
65
+ codebook_logits = outputs.codebook_logits
66
+
67
+ # Generate labels
68
+ base_loss = F.cross_entropy(
69
+ token_logits.reshape(-1, token_logits.size(-1)),
70
+ labels[:, 0].reshape(-1),
71
+ ignore_index=-100,
72
+ reduction="none",
73
+ )
74
+
75
+ codebook_labels = labels[:, 1 : 1 + model.config.num_codebooks].mT
76
+ semantic_loss = F.cross_entropy(
77
+ codebook_logits.reshape(-1, codebook_logits.size(-1)),
78
+ codebook_labels.reshape(-1),
79
+ ignore_index=-100,
80
+ reduction="none",
81
+ )
82
+
83
+ base_loss = base_loss.reshape(labels[:, 0].shape)
84
+ semantic_loss = semantic_loss.reshape(codebook_labels.shape)
85
+
86
+ semantic_loss_frame = semantic_loss.mean(-1)
87
+ pad_pos = codebook_labels.sum(-1) == -100 * model.config.num_codebooks
88
+
89
+ for loss_sample, pad in zip(semantic_loss_frame, pad_pos):
90
+ semantic_loss_sum[~pad] += loss_sample[~pad]
91
+ counter[~pad] += 1
92
+
93
+ current_step += 1
94
+ if current_step == 10:
95
+ break
96
+
97
+ semantic_loss = semantic_loss.cpu()
98
+ counter = counter.cpu()
99
+ xs, ys = [], []
100
+
101
+ for i, (loss, count) in enumerate(zip(semantic_loss_sum, counter)):
102
+ if count > 0:
103
+ xs.append(i)
104
+ ys.append((loss / count).item()) # for better loss visualization
105
+
106
+ smoothed_ys = smooth(ys, 0.95)
107
+
108
+ # Unload model
109
+ del model
110
+ torch.cuda.empty_cache()
111
+
112
+ return xs, ys, smoothed_ys
113
+
114
+
115
+ def main():
116
+ tokenizer = AutoTokenizer.from_pretrained("fishaudio/fish-speech-1")
117
+ max_length = 4096
118
+
119
+ ds = AutoAugTextDataset(
120
+ ["data/protos/sft/云天河"],
121
+ tokenizer=tokenizer,
122
+ use_speaker=False,
123
+ interactive_prob=1.0,
124
+ max_length=max_length,
125
+ )
126
+
127
+ loader = DataLoader(
128
+ ds,
129
+ batch_size=8,
130
+ collate_fn=TextDataCollator(tokenizer, max_length=max_length),
131
+ num_workers=0,
132
+ shuffle=False,
133
+ )
134
+
135
+ plt.figure(figsize=(10, 5), dpi=200)
136
+
137
+ plt.xlabel("Frame")
138
+ plt.ylabel("Loss")
139
+ plt.yscale("log")
140
+ plt.title("Semantic Loss")
141
+ plt.grid(which="both", axis="both")
142
+ plt.xlim(0, max_length)
143
+
144
+ tests = [
145
+ (
146
+ "pertrain-medium",
147
+ "dual_ar_2_codebook_medium",
148
+ "checkpoints/text2semantic-pretrain-medium-2k-v1.pth",
149
+ ),
150
+ (
151
+ "sft-medium",
152
+ "dual_ar_2_codebook_medium",
153
+ "checkpoints/text2semantic-sft-medium-v1.1-4k.pth",
154
+ ),
155
+ (
156
+ "sft-large",
157
+ "dual_ar_2_codebook_large",
158
+ "checkpoints/text2semantic-sft-large-v1.1-4k.pth",
159
+ ),
160
+ ]
161
+
162
+ for name, config, weight in tests:
163
+ xs, _, smoothed_ys = analyze_one_model(loader, config, weight, max_length)
164
+ plt.plot(xs, smoothed_ys, label=name)
165
+
166
+ plt.legend()
167
+ plt.savefig("semantic_loss.png")
168
+
169
+
170
+ if __name__ == "__main__":
171
+ main()
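The smooth() helper above is a TensorBoard-style exponential moving average, s_t = w * s_(t-1) + (1 - w) * x_t, seeded with the first value. A tiny worked example (the input values are chosen purely for illustration):

values = [1.0, 3.0, 5.0]
weight = 0.5
last = values[0]
smoothed = []
for x in values:
    # Each step blends the previous smoothed value with the new observation
    last = weight * last + (1 - weight) * x
    smoothed.append(last)
print(smoothed)  # [1.0, 2.0, 3.5]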
tools/llama/generate.py ADDED
@@ -0,0 +1,1087 @@
1
+ import os
2
+ import queue
3
+ import threading
4
+ import time
5
+ from contextlib import nullcontext
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Literal, Optional, Tuple, Union
9
+
10
+ import click
11
+ import hydra
12
+ import numpy as np
13
+ import torch
14
+ import torch._dynamo.config
15
+ import torch._inductor.config
16
+ from loguru import logger
17
+ from tqdm import tqdm
18
+ from transformers import AutoTokenizer
19
+
20
+ from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
21
+ from fish_speech.models.text2semantic.llama import BaseModelArgs
22
+ from fish_speech.text import clean_text, split_text
23
+
24
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
25
+ torch._inductor.config.coordinate_descent_tuning = True
26
+ torch._inductor.config.triton.unique_kernel_names = True
27
+
28
+ if hasattr(torch._inductor.config, "fx_graph_cache"):
29
+ # Experimental feature to reduce compilation times, will be on by default in future
30
+ torch._inductor.config.fx_graph_cache = True
31
+
32
+
33
+ from torch.nn.attention import SDPBackend, sdpa_kernel
34
+
35
+ from fish_speech.models.text2semantic.llama import (
36
+ BaseTransformer,
37
+ DualARTransformer,
38
+ NaiveTransformer,
39
+ )
40
+
41
+
42
+ def multinomial_sample_one_no_sync(
43
+ probs_sort,
44
+ ): # Does multinomial sampling without a cuda synchronization
45
+ q = torch.empty_like(probs_sort).exponential_(1)
46
+ return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
47
+
48
+
49
+ def logits_to_probs(
50
+ logits,
51
+ previous_tokens: Optional[torch.Tensor] = None,
52
+ temperature: torch.Tensor = 1.0,
53
+ top_p: torch.Tensor = 1.0,
54
+ repetition_penalty: torch.Tensor = 1.0,
55
+ ) -> torch.Tensor:
56
+ # Apply repetition penalty
57
+ if previous_tokens is not None:
58
+ previous_tokens = previous_tokens.long()
59
+ score = torch.gather(logits, dim=0, index=previous_tokens)
60
+ score = torch.where(
61
+ score < 0, score * repetition_penalty, score / repetition_penalty
62
+ )
63
+ logits.scatter_(dim=0, index=previous_tokens, src=score)
64
+
65
+ # Apply top-p sampling
66
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
67
+ cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
68
+ sorted_indices_to_remove = cum_probs > top_p
69
+ sorted_indices_to_remove[0] = False # keep at least one option
70
+ indices_to_remove = sorted_indices_to_remove.scatter(
71
+ dim=0, index=sorted_indices, src=sorted_indices_to_remove
72
+ )
73
+ logits = logits.masked_fill(indices_to_remove, -float("Inf"))
74
+
75
+ logits = logits / max(temperature, 1e-5)
76
+
77
+ probs = torch.nn.functional.softmax(logits, dim=-1)
78
+ return probs
79
+
80
+
81
+ def multinomial_sample_one_no_sync_agent(
82
+ probs_sort,
83
+ ): # Does multinomial sampling without a cuda synchronization
84
+ q = torch.empty_like(probs_sort).exponential_(1)
85
+ return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
86
+
87
+
88
+ def logits_to_probs_agent(
89
+ logits,
90
+ previous_tokens: Optional[torch.Tensor] = None,
91
+ temperature: torch.Tensor = 1.0,
92
+ top_p: torch.Tensor = 1.0,
93
+ repetition_penalty: torch.Tensor = 1.0,
94
+ ) -> torch.Tensor:
95
+ # Apply repetition penalty
96
+ if previous_tokens is not None:
97
+ previous_tokens = previous_tokens.long()
98
+ score = torch.gather(logits, dim=-1, index=previous_tokens)
99
+ score = torch.where(
100
+ score < 0, score * repetition_penalty, score / repetition_penalty
101
+ )
102
+ logits.scatter_(dim=-1, index=previous_tokens, src=score)
103
+
104
+ # Apply top-p sampling
105
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
106
+ cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
107
+ sorted_indices_to_remove = cum_probs > top_p
108
+ sorted_indices_to_remove[..., 0] = False # keep at least one option
109
+ indices_to_remove = sorted_indices_to_remove.scatter(
110
+ dim=-1, index=sorted_indices, src=sorted_indices_to_remove
111
+ )
112
+ logits = logits.masked_fill(indices_to_remove, -float("Inf"))
113
+
114
+ logits = logits / max(temperature, 1e-5)
115
+
116
+ probs = torch.nn.functional.softmax(logits, dim=-1)
117
+ return probs
118
+
119
+
120
+ def sample(
121
+ logits,
122
+ previous_tokens: Optional[torch.Tensor] = None,
123
+ **sampling_kwargs,
124
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
125
+ probs = logits_to_probs(
126
+ logits=logits[0, -1], previous_tokens=previous_tokens, **sampling_kwargs
127
+ )
128
+ idx_next = multinomial_sample_one_no_sync(probs)
129
+ return idx_next, probs
130
+
131
+
132
+ def sample_agent(
133
+ logits,
134
+ previous_tokens: Optional[torch.Tensor] = None,
135
+ **sampling_kwargs,
136
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
137
+ probs = logits_to_probs_agent(
138
+ logits=logits[:, -1], previous_tokens=previous_tokens, **sampling_kwargs
139
+ )
140
+ idx_next = multinomial_sample_one_no_sync_agent(probs)
141
+ return idx_next, probs
142
+
143
+
144
+ def decode_one_token_ar_agent(
145
+ model: DualARTransformer,
146
+ x: torch.Tensor,
147
+ input_pos: torch.Tensor,
148
+ previous_tokens: torch.Tensor = None,
149
+ semantic_id: int = 32003,
150
+ **sampling_kwargs,
151
+ ) -> torch.Tensor:
152
+ # print(x, input_pos)
153
+ x = model.forward_generate(x, input_pos)
154
+ logits = x.logits # [:, -1:]
155
+ hidden_states = x.hidden_states # [:, -1:]
156
+
157
+ sampling_kwargs_main = sampling_kwargs.copy()
158
+ sampling_kwargs_main["temperature"] = 0.1
159
+ sampling_kwargs_main["top_p"] = 0.1
160
+ sampling_kwargs_main["repetition_penalty"] = 1.0
161
+
162
+ codebooks = [
163
+ sample_agent(
164
+ logits,
165
+ previous_tokens=None, # Disable repetition penalty for the token codebook
166
+ **sampling_kwargs_main,
167
+ )[0]
168
+ ]
169
+
170
+ # Cleanup the cache
171
+ for layer in model.fast_layers:
172
+ layer.attention.kv_cache.k_cache.fill_(0)
173
+ layer.attention.kv_cache.v_cache.fill_(0)
174
+
175
+ for codebook_idx in range(model.config.num_codebooks):
176
+ input_pos = torch.tensor(
177
+ [codebook_idx], device=hidden_states.device, dtype=torch.long
178
+ )
179
+ logits = model.forward_generate_fast(hidden_states, input_pos)
180
+ a = sample_agent(
181
+ logits,
182
+ previous_tokens=(
183
+ previous_tokens[:, codebook_idx + 1]
184
+ if previous_tokens is not None
185
+ else None
186
+ ),
187
+ **sampling_kwargs,
188
+ )[0]
189
+ hidden_states = model.fast_embeddings(a)
190
+ codebooks.append(a)
191
+
192
+ codebooks = torch.stack(codebooks, dim=1)
193
+ codebooks[:, 1:, :] = torch.masked_fill(
194
+ codebooks[:, 1:, :], codebooks[:, :1, :] != semantic_id, CODEBOOK_PAD_TOKEN_ID
195
+ )
196
+
197
+ # for i in range(codebooks.size(1) - 1):
198
+ # codebooks[:, i + 1, :] = torch.masked_fill(
199
+ # codebooks[:, i + 1, :],
200
+ # codebooks[:, :1, :] != semantic_id,
201
+ # CODEBOOK_PAD_TOKEN_ID + i * 1024,
202
+ # )
203
+
204
+ # print(codebooks)
205
+
206
+ return codebooks
207
+
208
+
209
+ def decode_one_token_naive_agent(
210
+ model: NaiveTransformer,
211
+ x: torch.Tensor,
212
+ input_pos: torch.Tensor,
213
+ previous_tokens: torch.Tensor = None,
214
+ semantic_id: int = 32003,
215
+ **sampling_kwargs,
216
+ ) -> torch.Tensor:
217
+ x = model.forward_generate(x, input_pos)
218
+
219
+ codebooks = [
220
+ sample(
221
+ x.token_logits,
222
+ previous_tokens=None, # Disable repetition penalty for the token codebook
223
+ **sampling_kwargs,
224
+ )[0]
225
+ ]
226
+
227
+ for i in range(model.config.num_codebooks):
228
+ codebooks.append(
229
+ sample_agent(
230
+ x.codebook_logits[:, :, i],
231
+ previous_tokens=(
232
+ previous_tokens[:, i + 1] if previous_tokens is not None else None
233
+ ),
234
+ **sampling_kwargs,
235
+ )[0]
236
+ )
237
+
238
+ codebooks = torch.stack(codebooks, dim=1)
239
+ codebooks[:, 1:, :] = torch.masked_fill(
240
+ codebooks[:, 1:, :], codebooks[:, :1, :] != semantic_id, CODEBOOK_PAD_TOKEN_ID
241
+ )
242
+
243
+ return codebooks
244
+
245
+
246
+ def decode_one_token_ar(
247
+ model: DualARTransformer,
248
+ x: torch.Tensor,
249
+ input_pos: torch.Tensor,
250
+ previous_tokens: torch.Tensor = None,
251
+ semantic_id: int = 0,
252
+ **sampling_kwargs,
253
+ ) -> torch.Tensor:
254
+ x = model.forward_generate(x, input_pos)
255
+
256
+ sampling_kwargs_main = sampling_kwargs.copy()
257
+ # sampling_kwargs_main["temperature"] = 0.1
258
+ # sampling_kwargs_main["top_p"] = 0.1
259
+ # sampling_kwargs_main["repetition_penalty"] = 1.0
260
+
261
+ codebooks = [
262
+ sample(
263
+ x.logits,
264
+ previous_tokens=None, # Disable repetition penalty for the token codebook
265
+ **sampling_kwargs_main,
266
+ )[0]
267
+ ]
268
+
269
+ x = x.hidden_states
270
+
271
+ # Cleanup the cache
272
+ for layer in model.fast_layers:
273
+ layer.attention.kv_cache.k_cache.fill_(0)
274
+ layer.attention.kv_cache.v_cache.fill_(0)
275
+
276
+ for codebook_idx in range(model.config.num_codebooks):
277
+ input_pos = torch.tensor([codebook_idx], device=x.device, dtype=torch.long)
278
+ logits = model.forward_generate_fast(x, input_pos)
279
+ a = sample(
280
+ logits,
281
+ previous_tokens=(
282
+ previous_tokens[codebook_idx + 1]
283
+ if previous_tokens is not None
284
+ else None
285
+ ),
286
+ **sampling_kwargs,
287
+ )[0]
288
+ x = model.fast_embeddings(a)
289
+ codebooks.append(a)
290
+
291
+ codebooks = torch.stack(codebooks, dim=0)
292
+ codebooks[1:, :] = torch.masked_fill(
293
+ codebooks[1:, :], codebooks[:1, :] != semantic_id, CODEBOOK_PAD_TOKEN_ID
294
+ )
295
+
296
+ return codebooks
297
+
298
+
299
+ def decode_one_token_naive(
300
+ model: NaiveTransformer,
301
+ x: torch.Tensor,
302
+ input_pos: torch.Tensor,
303
+ previous_tokens: torch.Tensor = None,
304
+ **sampling_kwargs,
305
+ ) -> torch.Tensor:
306
+ x = model.forward_generate(x, input_pos)
307
+
308
+ sampling_kwargs_main = sampling_kwargs.copy()
309
+ sampling_kwargs_main["temperature"] = 0.1
310
+ sampling_kwargs_main["top_p"] = 0.1
311
+ sampling_kwargs_main["repetition_penalty"] = 1.0
312
+
313
+ codebooks = [
314
+ sample(
315
+ x.logits,
316
+ previous_tokens=None, # Disable repetition penalty for the token codebook
317
+ **sampling_kwargs_main,
318
+ )[0]
319
+ ]
320
+
321
+ for i in range(model.config.num_codebooks):
322
+ codebooks.append(
323
+ sample(
324
+ x.codebook_logits[:, :, i],
325
+ previous_tokens=(
326
+ previous_tokens[i + 1] if previous_tokens is not None else None
327
+ ),
328
+ **sampling_kwargs,
329
+ )[0]
330
+ )
331
+
332
+ return torch.stack(codebooks, dim=0)
333
+
334
+
335
+ def decode_n_tokens(
336
+ model: NaiveTransformer,
337
+ cur_token: torch.Tensor,
338
+ input_pos: torch.Tensor,
339
+ num_new_tokens: int,
340
+ im_end_id: int = 4,
341
+ decode_one_token=decode_one_token_naive,
342
+ semantic_id: int = 0,
343
+ **sampling_kwargs,
344
+ ):
345
+ previous_tokens = torch.zeros(
346
+ (model.config.num_codebooks + 1, model.config.max_seq_len),
347
+ dtype=torch.int,
348
+ device=cur_token.device,
349
+ )
350
+
351
+ for i in tqdm(range(num_new_tokens)):
352
+ # We need to get windowed repeat penalty
353
+ win_size = 16
354
+ if i < win_size:
355
+ window = previous_tokens[:, :win_size]
356
+ else:
357
+ window = previous_tokens[:, i - win_size : i]
358
+
359
+ with (
360
+ torch.backends.cuda.sdp_kernel(
361
+ enable_flash=False, enable_mem_efficient=False, enable_math=True
362
+ )
363
+ if torch.cuda.is_available()
364
+ else nullcontext()
365
+ ): # Actually better for Inductor to codegen attention here
366
+ next_token = decode_one_token(
367
+ model=model,
368
+ x=cur_token,
369
+ input_pos=input_pos,
370
+ previous_tokens=window,
371
+ semantic_id=semantic_id,
372
+ **sampling_kwargs,
373
+ )
374
+
375
+ input_pos += 1
376
+ cur_token = next_token.view(1, model.config.num_codebooks + 1, -1)
377
+ previous_tokens[:, i : i + 1] = next_token.view(
378
+ model.config.num_codebooks + 1, -1
379
+ )
380
+
381
+ if cur_token[0, 0, -1] == im_end_id:
382
+ break
383
+
384
+ return previous_tokens[:, : i + 1]
385
+
386
+
387
+ @torch.no_grad()
388
+ @torch.inference_mode()
389
+ def generate(
390
+ *,
391
+ model: NaiveTransformer,
392
+ prompt: torch.Tensor,
393
+ max_new_tokens: int =600,
394
+ im_end_id: int = 4,
395
+ decode_one_token=decode_one_token_naive,
396
+ **sampling_kwargs,
397
+ ) -> torch.Tensor:
398
+ """
399
+ Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
400
+ """
401
+
402
+ # create an empty tensor of the expected final shape and fill in the current tokens
403
+ T = prompt.size(1)
404
+ semantic_id = model.tokenizer.convert_tokens_to_ids("<|semantic|>")
405
+
406
+ if max_new_tokens:
407
+ if T + max_new_tokens > model.config.max_seq_len:
408
+ max_new_tokens = model.config.max_seq_len - T
409
+ logger.info(f"Truncating max_new_tokens to {max_new_tokens}")
410
+
411
+ T_new = T + max_new_tokens
412
+ else:
413
+ T_new = model.config.max_seq_len
414
+ max_new_tokens = T_new - T
415
+
416
+ device, dtype = prompt.device, prompt.dtype
417
+
418
+ codebook_dim = 1 + model.config.num_codebooks
419
+ # create an empty tensor of the expected final shape and fill in the current tokens
420
+ empty = torch.empty(
421
+ (codebook_dim, model.config.max_seq_len), dtype=dtype, device=device
422
+ )
423
+ empty[:, :T] = prompt
424
+ seq = empty
425
+ input_pos = torch.arange(0, T, device=device)
426
+
427
+ # Use non-accelerated version for now, to avoid compilation overhead
428
+ prefill_decode = (
429
+ decode_one_token_naive
430
+ if isinstance(model, NaiveTransformer)
431
+ else decode_one_token_ar
432
+ )
433
+
434
+ next_token = prefill_decode(
435
+ model,
436
+ prompt.view(1, codebook_dim, -1),
437
+ input_pos,
438
+ semantic_id=semantic_id,
439
+ **sampling_kwargs,
440
+ )
441
+ seq[:, T : T + 1] = next_token
442
+
443
+ input_pos = torch.tensor([T], device=device, dtype=torch.int)
444
+ x = decode_n_tokens(
445
+ model,
446
+ next_token.view(1, codebook_dim, -1),
447
+ input_pos,
448
+ max_new_tokens - 1,
449
+ im_end_id=im_end_id,
450
+ decode_one_token=decode_one_token,
451
+ semantic_id=semantic_id,
452
+ **sampling_kwargs,
453
+ )
454
+ # x = torch.cat(generated_tokens, dim=1)
455
+ seq = seq[:, : T + 1 + x.size(1)]
456
+ seq[:, T + 1 :] = x
457
+
458
+ return seq
459
+
460
+
461
+ def decode_n_tokens_agent(
462
+ model: NaiveTransformer,
463
+ cur_token: torch.Tensor,
464
+ input_pos: torch.Tensor,
465
+ num_new_tokens: int,
466
+ im_end_id: int = 4,
467
+ semantic_id: int = 32003,
468
+ decode_one_token=decode_one_token_naive_agent,
469
+ early_stop_threshold: float = 0.6,
470
+ **sampling_kwargs,
471
+ ):
472
+ batch_size = cur_token.size(0)
473
+ previous_tokens = torch.zeros(
474
+ (batch_size, model.config.num_codebooks + 1, model.config.max_seq_len),
475
+ dtype=torch.int,
476
+ device=cur_token.device,
477
+ )
478
+ finished = torch.zeros(batch_size, dtype=torch.bool, device=cur_token.device)
479
+ finished = finished | (cur_token[:, 0, -1] == im_end_id)
480
+ start_time = time.time()
481
+
482
+ for i in tqdm(range(num_new_tokens), desc="Decoding: ", total=num_new_tokens):
483
+ # We need to get windowed repeat penalty
484
+ win_size = 16
485
+ if i < win_size:
486
+ window = previous_tokens[:, :, :win_size]
487
+ else:
488
+ window = previous_tokens[:, :, i - win_size : i]
489
+
490
+ with sdpa_kernel(
491
+ SDPBackend.MATH
492
+ ): # Actually better for Inductor to codegen attention here
493
+ next_token = decode_one_token(
494
+ model=model,
495
+ x=cur_token,
496
+ input_pos=input_pos,
497
+ previous_tokens=window,
498
+ semantic_id=semantic_id,
499
+ **sampling_kwargs,
500
+ )
501
+
502
+ input_pos += 1
503
+ cur_token = next_token.view(batch_size, model.config.num_codebooks + 1, -1)
504
+ previous_tokens[:, :, i : i + 1] = next_token.view(
505
+ batch_size, model.config.num_codebooks + 1, -1
506
+ )
507
+
508
+ yield cur_token.cpu()
509
+
510
+ finished = finished | (cur_token[:, 0, -1] == im_end_id)
511
+ if finished.all() or (
512
+ 0 < early_stop_threshold < 1
513
+ and finished.sum() >= round(batch_size * early_stop_threshold)
514
+ ):
515
+ break
516
+
517
+ total_time = time.time() - start_time
518
+ generated_tokens = i + 1
519
+ tokens_per_second = (generated_tokens / total_time) * batch_size
520
+ logger.info(
521
+ f"Decoded {generated_tokens} x {batch_size} tokens in {total_time:.2f}s ({tokens_per_second:.2f} tokens/s)"
522
+ )
523
+
524
+
525
+ @torch.no_grad()
526
+ @torch.inference_mode()
527
+ def generate_agent(
528
+ *,
529
+ model: BaseTransformer,
530
+ prompt: torch.Tensor,
531
+ max_new_tokens: int =500,
532
+ im_end_id: int = 4,
533
+ semantic_id: int = 32003,
534
+ decode_one_token=decode_one_token_naive_agent,
535
+ num_samples: int = 1,
536
+ early_stop_threshold: float = 0.6,
537
+ **sampling_kwargs,
538
+ ):
539
+ """
540
+ Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
541
+ """
542
+
543
+ # create an empty tensor of the expected final shape and fill in the current tokens
544
+ T = prompt.size(1)
545
+ prompt = prompt[None].repeat(num_samples, 1, 1)
546
+
547
+ if T >= model.config.max_seq_len:
548
+ raise ValueError(
549
+ f"Input sequence length {T} exceeds max_seq_len {model.config.max_seq_len}"
550
+ )
551
+
552
+ if max_new_tokens:
553
+ if T + max_new_tokens > model.config.max_seq_len:
554
+ max_new_tokens = model.config.max_seq_len - T
555
+ logger.info(f"Truncating max_new_tokens to {max_new_tokens}")
556
+
557
+ T_new = T + max_new_tokens
558
+ else:
559
+ T_new = model.config.max_seq_len
560
+ max_new_tokens = T_new - T
561
+
562
+ device, dtype = prompt.device, prompt.dtype
563
+
564
+ codebook_dim = 1 + model.config.num_codebooks
565
+ input_pos = torch.arange(0, T, device=device)
566
+
567
+ # Use non-accelerated version for now, to avoid compilation overhead
568
+ prefill_decode = (
569
+ decode_one_token_naive_agent
570
+ if isinstance(model, NaiveTransformer)
571
+ else decode_one_token_ar_agent
572
+ )
573
+ next_token = prefill_decode(
574
+ model,
575
+ prompt,
576
+ input_pos,
577
+ semantic_id=semantic_id,
578
+ **sampling_kwargs,
579
+ ).view(num_samples, codebook_dim, -1)
580
+ yield next_token.cpu()
581
+
582
+ input_pos = torch.tensor([T], device=device, dtype=torch.int)
583
+
584
+ yield from decode_n_tokens_agent(
585
+ model,
586
+ next_token,
587
+ input_pos,
588
+ max_new_tokens - 1,
589
+ im_end_id=im_end_id,
590
+ semantic_id=semantic_id,
591
+ decode_one_token=decode_one_token,
592
+ early_stop_threshold=early_stop_threshold,
593
+ **sampling_kwargs,
594
+ )
595
+
596
+
597
+ def encode_tokens(
598
+ tokenizer,
599
+ string,
600
+ device="cuda",
601
+ prompt_tokens=None,
602
+ num_codebooks=4,
603
+ ):
604
+ string = clean_text(string)
605
+ string = f"<|im_start|>user\n{string}<|im_end|><|im_start|>assistant\n"
606
+
607
+ new_tokens = tokenizer.encode(
608
+ string,
609
+ add_special_tokens=False,
610
+ max_length=10**6,
611
+ truncation=False,
612
+ )
613
+ tokens = torch.tensor([new_tokens], dtype=torch.int, device=device)
614
+
615
+ # Codebooks
616
+ zeros = (
617
+ torch.ones((num_codebooks, tokens.size(1)), dtype=torch.int, device=device)
618
+ * CODEBOOK_PAD_TOKEN_ID
619
+ )
620
+ prompt = torch.cat((tokens, zeros), dim=0)
621
+
622
+ if prompt_tokens is None:
623
+ return prompt
624
+
625
+ # Get prompt tokens
626
+ if prompt_tokens.ndim == 3:
627
+ assert (
628
+ prompt_tokens.shape[0] == 1
629
+ ), f"3 dim prompt tokens should have shape (1, num_codebooks, seq_len)"
630
+ prompt_tokens = prompt_tokens[0]
631
+
632
+ assert prompt_tokens.ndim == 2
633
+ data = prompt_tokens + 1
634
+
635
+ if prompt_tokens.shape[0] > num_codebooks:
636
+ logger.warning(
637
+ f"Prompt tokens shape {prompt_tokens.shape} is larger than num_codebooks {num_codebooks}, getting first {num_codebooks} codebooks"
638
+ )
639
+ data = data[:num_codebooks]
640
+
641
+ # Add pad token for each codebook
642
+ data = torch.cat(
643
+ (data, torch.zeros((data.size(0), 1), dtype=torch.int, device=device)),
644
+ dim=1,
645
+ )
646
+
647
+ # Since 1.0, we use <|semantic|>
648
+ s0_token_id = tokenizer.convert_tokens_to_ids("<|semantic|>")
649
+ end_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
650
+ main_token_ids = (
651
+ torch.ones((1, data.size(1)), dtype=torch.int, device=device) * s0_token_id
652
+ )
653
+ main_token_ids[0, -1] = end_token_id
654
+
655
+ data = torch.cat((main_token_ids, data), dim=0)
656
+ prompt = torch.cat((prompt, data), dim=1)
657
+
658
+ return prompt
659
+
660
+
661
+ def load_model(checkpoint_path, device, precision, compile=False, is_agent=False):
662
+ model: Union[NaiveTransformer, DualARTransformer] = BaseTransformer.from_pretrained(
663
+ checkpoint_path, load_weights=True
664
+ )
665
+
666
+ model = model.to(device=device, dtype=precision)
667
+ logger.info(f"Restored model from checkpoint")
668
+
669
+ if isinstance(model, DualARTransformer):
670
+ decode_one_token = (
671
+ decode_one_token_ar_agent if is_agent else decode_one_token_ar
672
+ )
673
+ logger.info("Using DualARTransformer")
674
+ else:
675
+ decode_one_token = (
676
+ decode_one_token_naive_agent if is_agent else decode_one_token_naive
677
+ )
678
+ logger.info("Using NaiveTransformer")
679
+
680
+ if compile:
681
+ logger.info("Compiling function...")
682
+ decode_one_token = torch.compile(
683
+ decode_one_token,
684
+ fullgraph=True,
685
+ backend="inductor" if torch.cuda.is_available() else "aot_eager",
686
+ mode="reduce-overhead" if torch.cuda.is_available() else None,
687
+ )
688
+
689
+ return model.eval(), decode_one_token
690
+
691
+
692
+ @dataclass
693
+ class GenerateResponse:
694
+ action: Literal["sample", "next"]
695
+ codes: Optional[torch.Tensor] = None
696
+ text: Optional[str] = None
697
+
698
+
699
+ def generate_long(
700
+ *,
701
+ model,
702
+ device: str | torch.device,
703
+ decode_one_token: callable,
704
+ text: str,
705
+ num_samples: int = 1,
706
+ max_new_tokens: int = 600,
707
+ top_p: int = 0.7,
708
+ repetition_penalty: float = 1.5,
709
+ temperature: float = 0.7,
710
+ compile: bool = False,
711
+ iterative_prompt: bool = True,
712
+ max_length: int = 2048,
713
+ chunk_length: int = 150,
714
+ prompt_text: Optional[str | list[str]] = None,
715
+ prompt_tokens: Optional[torch.Tensor | list[torch.Tensor]] = None,
716
+ ):
717
+ assert 0 < top_p <= 1, "top_p must be in (0, 1]"
718
+ assert 0 < repetition_penalty < 2, "repetition_penalty must be in (0, 2)"
719
+ assert 0 < temperature < 2, "temperature must be in (0, 2)"
720
+
721
+ use_prompt = prompt_text is not None and prompt_tokens is not None
722
+ if use_prompt and isinstance(prompt_text, str):
723
+ prompt_text = [prompt_text]
724
+ prompt_tokens = [prompt_tokens]
725
+
726
+ assert use_prompt is False or len(prompt_text) == len(
727
+ prompt_tokens
728
+ ), "Prompt text and tokens must have the same length"
729
+
730
+ model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
731
+ tokenizer = model.tokenizer
732
+ im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
733
+
734
+ encoded = []
735
+ texts = split_text(text, chunk_length) if iterative_prompt else [text]
736
+ encoded_prompts = []
737
+
738
+ if use_prompt:
739
+ for idx, (t, c) in enumerate(zip(prompt_text, prompt_tokens)):
740
+ encoded_prompts.append(
741
+ encode_tokens(
742
+ tokenizer,
743
+ string=t,
744
+ device=device,
745
+ prompt_tokens=c,
746
+ num_codebooks=model.config.num_codebooks,
747
+ )
748
+ )
749
+
750
+ for idx, text in enumerate(texts):
751
+ encoded.append(
752
+ encode_tokens(
753
+ tokenizer,
754
+ string=text,
755
+ device=device,
756
+ num_codebooks=model.config.num_codebooks,
757
+ )
758
+ )
759
+ logger.info(f"Encoded text: {text}")
760
+
761
+ # Move temperature, top_p, repetition_penalty to device
762
+ # This is important so that changing params doesn't trigger recompile
763
+ temperature = torch.tensor(temperature, device=device, dtype=torch.float)
764
+ top_p = torch.tensor(top_p, device=device, dtype=torch.float)
765
+ repetition_penalty = torch.tensor(
766
+ repetition_penalty, device=device, dtype=torch.float
767
+ )
768
+
769
+ for sample_idx in range(num_samples):
770
+ if torch.cuda.is_available():
771
+ torch.cuda.synchronize()
772
+
773
+ global_encoded = []
774
+ seg_idx = 0
775
+
776
+ while seg_idx < len(encoded):
777
+ logger.info(
778
+ f"Generating sentence {seg_idx + 1}/{len(encoded)} of sample {sample_idx + 1}/{num_samples}"
779
+ )
780
+
781
+ seg = encoded[seg_idx]
782
+ global_encoded.append(seg)
783
+
784
+ lengths = reversed([seg.size(1) for seg in global_encoded])
785
+
786
+ # Pick last 2000 tokens
787
+ count = 0
788
+ for i, length in enumerate(lengths):
789
+ count += length
790
+ if count + length > max_length - 1024 - sum(
791
+ t.shape[1] for t in encoded_prompts
792
+ ):
793
+ break
794
+
795
+ if i != 0 and i % 2 == 0:
796
+ i -= 1
797
+
798
+ # Rotate the list, always make sure first segment is included to avoid drift
799
+ if i < len(global_encoded) - 2:
800
+ partial_encoded = global_encoded[:2] + global_encoded[-i:]
801
+ else:
802
+ partial_encoded = global_encoded
803
+
804
+ if use_prompt:
805
+ partial_encoded = encoded_prompts + partial_encoded
806
+
807
+ cat_encoded = torch.cat(partial_encoded, dim=1)
808
+ prompt_length = cat_encoded.size(1)
809
+
810
+ t0 = time.perf_counter()
811
+ y = generate(
812
+ model=model,
813
+ prompt=cat_encoded,
814
+ max_new_tokens=max_new_tokens,
815
+ im_end_id=im_end_id,
816
+ decode_one_token=decode_one_token,
817
+ temperature=temperature,
818
+ top_p=top_p,
819
+ repetition_penalty=repetition_penalty,
820
+ )
821
+
822
+ if sample_idx == 0 and seg_idx == 0 and compile:
823
+ logger.info(f"Compilation time: {time.perf_counter() - t0:.2f} seconds")
824
+
825
+ if torch.cuda.is_available():
826
+ torch.cuda.synchronize()
827
+
828
+ t = time.perf_counter() - t0
829
+
830
+ tokens_generated = y.size(1) - prompt_length
831
+ tokens_sec = tokens_generated / t
832
+ logger.info(
833
+ f"Generated {tokens_generated} tokens in {t:.02f} seconds, {tokens_sec:.02f} tokens/sec"
834
+ )
835
+ logger.info(
836
+ f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s"
837
+ )
838
+
839
+ if torch.cuda.is_available():
840
+ logger.info(
841
+ f"GPU Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB"
842
+ )
843
+
844
+ # Put the generated tokens
845
+ # since there is <im_end> and <eos> tokens, we remove last 2 tokens
846
+ codes = y[1:, prompt_length:-1].clone()
847
+ codes = codes - 1
848
+ assert (codes >= 0).all(), f"Negative code found"
849
+
850
+ decoded = y[:, prompt_length:-1].clone()
851
+ # But for global encoding, we should keep the <im_end> token
852
+
853
+ global_encoded.append(decoded)
854
+ assert (codes >= 0).all(), f"Negative code found: {codes}"
855
+ yield GenerateResponse(action="sample", codes=codes, text=texts[seg_idx])
856
+ seg_idx += 1
857
+
858
+ # This indicates the end of the current sample
859
+ yield GenerateResponse(action="next")
860
+
861
+
862
+ @dataclass
863
+ class WrappedGenerateResponse:
864
+ status: Literal["success", "error"]
865
+ response: Optional[GenerateResponse | Exception] = None
866
+
867
+
868
+ @dataclass
869
+ class GenerateRequest:
870
+ request: dict
871
+ response_queue: queue.Queue
872
+
873
+
874
+ def launch_thread_safe_queue(
875
+ checkpoint_path,
876
+ device,
877
+ precision,
878
+ compile: bool = False,
879
+ ):
880
+ input_queue = queue.Queue()
881
+ init_event = threading.Event()
882
+
883
+ def worker():
884
+ model, decode_one_token = load_model(
885
+ checkpoint_path, device, precision, compile=compile
886
+ )
887
+ with torch.device(device):
888
+ model.setup_caches(
889
+ max_batch_size=1,
890
+ max_seq_len=model.config.max_seq_len,
891
+ dtype=next(model.parameters()).dtype,
892
+ )
893
+ init_event.set()
894
+
895
+ while True:
896
+ item: GenerateRequest | None = input_queue.get()
897
+ if item is None:
898
+ break
899
+
900
+ kwargs = item.request
901
+ response_queue = item.response_queue
902
+
903
+ try:
904
+ for chunk in generate_long(
905
+ model=model, decode_one_token=decode_one_token, **kwargs
906
+ ):
907
+ response_queue.put(
908
+ WrappedGenerateResponse(status="success", response=chunk)
909
+ )
910
+ except Exception as e:
911
+ response_queue.put(WrappedGenerateResponse(status="error", response=e))
912
+
913
+ threading.Thread(target=worker, daemon=True).start()
914
+ init_event.wait()
915
+
916
+ return input_queue
917
+
918
+
919
+ def launch_thread_safe_queue_agent(
920
+ checkpoint_path,
921
+ device,
922
+ precision,
923
+ compile: bool = False,
924
+ ):
925
+ input_queue = queue.Queue()
926
+ init_event = threading.Event()
927
+
928
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
929
+ config = BaseModelArgs.from_pretrained(checkpoint_path)
930
+
931
+ def worker():
932
+ model, decode_one_token = load_model(
933
+ checkpoint_path, device, precision, compile=compile, is_agent=True
934
+ )
935
+
936
+ with torch.device(device):
937
+ model.setup_caches(
938
+ max_batch_size=1,
939
+ max_seq_len=model.config.max_seq_len,
940
+ dtype=next(model.parameters()).dtype,
941
+ )
942
+ init_event.set()
943
+
944
+ while True:
945
+ item: GenerateRequest | None = input_queue.get()
946
+ if item is None:
947
+ break
948
+
949
+ kwargs = item.request
950
+ response_queue = item.response_queue
951
+
952
+ try:
953
+ for token in generate_agent(
954
+ model=model,
955
+ decode_one_token=decode_one_token,
956
+ **kwargs,
957
+ ):
958
+ response_queue.put(token)
959
+
960
+ response_queue.put("stop")
961
+ except Exception as e:
962
+ import traceback
963
+
964
+ logger.exception(f"Error in worker: {traceback.format_exc()}")
965
+ response_queue.put("error")
966
+
967
+ threading.Thread(target=worker, daemon=True).start()
968
+ init_event.wait()
969
+
970
+ return input_queue, tokenizer, config
971
+
972
+
973
+ @click.command()
974
+ @click.option(
975
+ "--text",
976
+ type=str,
977
+ default="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.",
978
+ )
979
+ @click.option("--prompt-text", type=str, default=None, multiple=True)
980
+ @click.option(
981
+ "--prompt-tokens",
982
+ type=click.Path(path_type=Path, exists=True),
983
+ default=None,
984
+ multiple=True,
985
+ )
986
+ @click.option("--num-samples", type=int, default=1)
987
+ @click.option("--max-new-tokens", type=int, default=0)
988
+ @click.option("--top-p", type=float, default=0.7)
989
+ @click.option("--repetition-penalty", type=float, default=1.2)
990
+ @click.option("--temperature", type=float, default=0.7)
991
+ @click.option(
992
+ "--checkpoint-path",
993
+ type=click.Path(path_type=Path, exists=True),
994
+ default="checkpoints/fish-speech-1.4",
995
+ )
996
+ @click.option("--device", type=str, default="cuda")
997
+ @click.option("--compile/--no-compile", default=False)
998
+ @click.option("--seed", type=int, default=42)
999
+ @click.option("--half/--no-half", default=False)
1000
+ @click.option("--iterative-prompt/--no-iterative-prompt", default=True)
1001
+ @click.option("--chunk-length", type=int, default=100)
1002
+ def main(
1003
+ text: str,
1004
+ prompt_text: Optional[list[str]],
1005
+ prompt_tokens: Optional[list[Path]],
1006
+ num_samples: int,
1007
+ max_new_tokens: int,
1008
+ top_p: int,
1009
+ repetition_penalty: float,
1010
+ temperature: float,
1011
+ checkpoint_path: Path,
1012
+ device: str,
1013
+ compile: bool,
1014
+ seed: int,
1015
+ half: bool,
1016
+ iterative_prompt: bool,
1017
+ chunk_length: int,
1018
+ ) -> None:
1019
+
1020
+ precision = torch.half if half else torch.bfloat16
1021
+
1022
+ if prompt_text is not None and len(prompt_text) != len(prompt_tokens):
1023
+ raise ValueError(
1024
+ f"Number of prompt text ({len(prompt_text)}) and prompt tokens ({len(prompt_tokens)}) should be the same"
1025
+ )
1026
+
1027
+ logger.info("Loading model ...")
1028
+ t0 = time.time()
1029
+ model, decode_one_token = load_model(
1030
+ checkpoint_path, device, precision, compile=compile
1031
+ )
1032
+ with torch.device(device):
1033
+ model.setup_caches(
1034
+ max_batch_size=1,
1035
+ max_seq_len=model.config.max_seq_len,
1036
+ dtype=next(model.parameters()).dtype,
1037
+ )
1038
+ if torch.cuda.is_available():
1039
+ torch.cuda.synchronize()
1040
+
1041
+ logger.info(f"Time to load model: {time.time() - t0:.02f} seconds")
1042
+
1043
+ if prompt_tokens is not None:
1044
+ prompt_tokens = [torch.from_numpy(np.load(p)).to(device) for p in prompt_tokens]
1045
+
1046
+ torch.manual_seed(seed)
1047
+
1048
+ if torch.cuda.is_available():
1049
+ torch.cuda.manual_seed(seed)
1050
+
1051
+ generator = generate_long(
1052
+ model=model,
1053
+ device=device,
1054
+ decode_one_token=decode_one_token,
1055
+ text=text,
1056
+ num_samples=num_samples,
1057
+ max_new_tokens=max_new_tokens,
1058
+ top_p=top_p,
1059
+ repetition_penalty=repetition_penalty,
1060
+ temperature=temperature,
1061
+ compile=compile,
1062
+ iterative_prompt=iterative_prompt,
1063
+ chunk_length=chunk_length,
1064
+ prompt_text=prompt_text,
1065
+ prompt_tokens=prompt_tokens,
1066
+ )
1067
+
1068
+ idx = 0
1069
+ codes = []
1070
+
1071
+ for response in generator:
1072
+ if response.action == "sample":
1073
+ codes.append(response.codes)
1074
+ logger.info(f"Sampled text: {response.text}")
1075
+ elif response.action == "next":
1076
+ if codes:
1077
+ np.save(f"codes_{idx}.npy", torch.cat(codes, dim=1).cpu().numpy())
1078
+ logger.info(f"Saved codes to codes_{idx}.npy")
1079
+ logger.info(f"Next sample")
1080
+ codes = []
1081
+ idx += 1
1082
+ else:
1083
+ logger.error(f"Error: {response}")
1084
+
1085
+
1086
+ if __name__ == "__main__":
1087
+ main()
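A minimal sketch of how the thread-safe queue above is intended to be consumed. It assumes launch_thread_safe_queue, GenerateRequest, and WrappedGenerateResponse from this file are in scope; the checkpoint path and input text are placeholders.

import queue
import torch

# Start the background worker that owns the model
input_queue = launch_thread_safe_queue(
    checkpoint_path="checkpoints/fish-speech-1.4",
    device="cuda",
    precision=torch.bfloat16,
    compile=False,
)

# Submit one request; results come back on a per-request queue
response_queue = queue.Queue()
input_queue.put(
    GenerateRequest(
        request=dict(device="cuda", text="Hello, world."),
        response_queue=response_queue,
    )
)

while True:
    wrapped = response_queue.get()
    if wrapped.status == "error":
        raise wrapped.response
    if wrapped.response.action == "next":
        break  # end of the current sample
    codes = wrapped.response.codes  # [num_codebooks, T] semantic codes for this chunk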
tools/llama/merge_lora.py ADDED
@@ -0,0 +1,95 @@
1
+ import shutil
2
+ from copy import deepcopy
3
+ from pathlib import Path
4
+
5
+ import click
6
+ import hydra
7
+ import torch
8
+ from hydra import compose, initialize
9
+ from hydra.utils import instantiate
10
+ from loguru import logger
11
+
12
+ from fish_speech.models.text2semantic.llama import BaseTransformer
13
+ from fish_speech.models.text2semantic.lora import get_merged_state_dict
14
+
15
+
16
+ @click.command()
17
+ @click.option("--lora-config", type=str, default="r_8_alpha_16")
18
+ @click.option("--base-weight", type=str, default="checkpoints/fish-speech-1.4")
19
+ @click.option("--lora-weight", type=str, required=True)
20
+ @click.option("--output", type=str, required=True)
21
+ def merge(lora_config, base_weight, lora_weight, output):
22
+ output = Path(output)
23
+ logger.info(
24
+ f"Merging {base_weight} and {lora_weight} into {output} with {lora_config}"
25
+ )
26
+
27
+ with initialize(version_base="1.3", config_path="../../fish_speech/configs/lora"):
28
+ cfg = compose(config_name=lora_config)
29
+
30
+ lora_config = instantiate(cfg)
31
+ logger.info(f"Loaded lora model with config {lora_config}")
32
+
33
+ llama_model = BaseTransformer.from_pretrained(
34
+ path=base_weight,
35
+ load_weights=True,
36
+ lora_config=lora_config,
37
+ )
38
+ logger.info(f"Loaded llama model")
39
+
40
+ llama_state_dict = llama_model.state_dict()
41
+ llama_state_dict = {k: v for k, v in llama_state_dict.items() if "lora" not in k}
42
+ llama_state_dict_copy = deepcopy(llama_state_dict)
43
+ lora_state_dict = torch.load(lora_weight, map_location="cpu")
44
+
45
+ if "state_dict" in llama_state_dict:
46
+ llama_state_dict = llama_state_dict["state_dict"]
47
+
48
+ if "state_dict" in lora_state_dict:
49
+ lora_state_dict = lora_state_dict["state_dict"]
50
+
51
+ # remove prefix model.
52
+ if any(k.startswith("model.") for k in llama_state_dict.keys()):
53
+ llama_state_dict = {
54
+ k.replace("model.", ""): v
55
+ for k, v in llama_state_dict.items()
56
+ if k.startswith("model.")
57
+ }
58
+ if any(k.startswith("model.") for k in lora_state_dict.keys()):
59
+ lora_state_dict = {
60
+ k.replace("model.", ""): v
61
+ for k, v in lora_state_dict.items()
62
+ if k.startswith("model.")
63
+ }
64
+
65
+ logger.info(f"Found {len(llama_state_dict)} keys in llama model")
66
+ logger.info(f"Found {len(lora_state_dict)} keys in lora model")
67
+
68
+ merged_state_dict = llama_state_dict | lora_state_dict
69
+ llama_model.load_state_dict(merged_state_dict, strict=True)
70
+ logger.info(f"Merged model loaded")
71
+
72
+ # Trigger eval mode to merge lora
73
+ llama_model.eval()
74
+ llama_model.save_pretrained(output, drop_lora=True)
75
+ logger.info(f"Saved merged model to {output}, validating")
76
+
77
+ new_state_dict = torch.load(output / "model.pth", map_location="cpu")
78
+ original_keys = set(llama_state_dict_copy.keys())
79
+ merged_keys = set(new_state_dict.keys())
80
+
81
+ assert original_keys == merged_keys, "Keys should be same"
82
+
83
+ for key in original_keys:
84
+ diff_l1 = (new_state_dict[key] - llama_state_dict_copy[key]).abs().sum().item()
85
+ if diff_l1 != 0:
86
+ break
87
+ else:
88
+ logger.error("Merged model is same as the original model")
89
+ exit(1)
90
+
91
+ logger.info("Merged model is different from the original model, check passed")
92
+
93
+
94
+ if __name__ == "__main__":
95
+ merge()
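A sketch of invoking the merge step programmatically through the click command's callback instead of the CLI. The LoRA checkpoint and output paths below are placeholders for illustration.

from tools.llama.merge_lora import merge

# merge is a click.Command; .callback is the undecorated function it wraps.
merge.callback(
    lora_config="r_8_alpha_16",
    base_weight="checkpoints/fish-speech-1.4",
    lora_weight="results/lora_checkpoint.ckpt",   # placeholder path
    output="checkpoints/fish-speech-1.4-merged",  # placeholder path
)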
tools/llama/quantize.py ADDED
@@ -0,0 +1,497 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ import datetime
4
+ import shutil
5
+
6
+ # This source code is licensed under the license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ import time
9
+ from pathlib import Path
10
+
11
+ import click
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+
16
+ from fish_speech.models.text2semantic.llama import find_multiple
17
+ from tools.llama.generate import load_model
18
+
19
+ ##### Quantization Primitives ######
20
+
21
+
22
+ def dynamically_quantize_per_channel(x, quant_min, quant_max, target_dtype):
23
+ # assumes symmetric quantization
24
+ # assumes axis == 0
25
+ # assumes dense memory format
26
+ # TODO(future): relax ^ as needed
27
+
28
+ # default setup for affine quantization of activations
29
+ eps = torch.finfo(torch.float32).eps
30
+
31
+ # get min and max
32
+ min_val, max_val = torch.aminmax(x, dim=1)
33
+
34
+ # calculate scales and zero_points based on min and max
35
+ # reference: https://fburl.com/code/srbiybme
36
+ min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
37
+ max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
38
+ device = min_val_neg.device
39
+
40
+ # reference: https://fburl.com/code/4wll53rk
41
+ max_val_pos = torch.max(-min_val_neg, max_val_pos)
42
+ scales = max_val_pos / (float(quant_max - quant_min) / 2)
43
+ # ensure scales is the same dtype as the original tensor
44
+ scales = torch.clamp(scales, min=eps).to(x.dtype)
45
+ zero_points = torch.zeros(min_val_neg.size(), dtype=torch.int64, device=device)
46
+
47
+ # quantize based on qmin/qmax/scales/zp
48
+ # reference: https://www.internalfb.com/code/fbsource/[8edc275012b1]/fbcode/caffe2/torch/ao/quantization/fx/_decomposed.py?lines=63
49
+ x_div = x / scales.unsqueeze(-1)
50
+ x_round = torch.round(x_div)
51
+ x_zp = x_round + zero_points.unsqueeze(-1)
52
+ quant = torch.clamp(x_zp, quant_min, quant_max).to(target_dtype)
53
+
54
+ return quant, scales, zero_points
55
+
56
+
57
+ def get_group_qparams(w, n_bit=4, groupsize=128):
58
+ # needed for GPTQ with padding
59
+ if groupsize > w.shape[-1]:
60
+ groupsize = w.shape[-1]
61
+ assert groupsize > 1
62
+ assert w.shape[-1] % groupsize == 0
63
+ assert w.dim() == 2
64
+
65
+ to_quant = w.reshape(-1, groupsize)
66
+ assert torch.isnan(to_quant).sum() == 0
67
+
68
+ max_val = to_quant.amax(dim=1, keepdim=True)
69
+ min_val = to_quant.amin(dim=1, keepdim=True)
70
+ max_int = 2**n_bit - 1
71
+ scales = (max_val - min_val).clamp(min=1e-6) / max_int
72
+ zeros = min_val + scales * (2 ** (n_bit - 1))
73
+ return scales.to(torch.bfloat16).reshape(w.shape[0], -1), zeros.to(
74
+ torch.bfloat16
75
+ ).reshape(w.shape[0], -1)
76
+
77
+
78
+ def pack_scales_and_zeros(scales, zeros):
79
+ assert scales.shape == zeros.shape
80
+ assert scales.dtype == torch.bfloat16
81
+ assert zeros.dtype == torch.bfloat16
82
+ return (
83
+ torch.cat(
84
+ [
85
+ scales.reshape(scales.size(0), scales.size(1), 1),
86
+ zeros.reshape(zeros.size(0), zeros.size(1), 1),
87
+ ],
88
+ 2,
89
+ )
90
+ .transpose(0, 1)
91
+ .contiguous()
92
+ )
93
+
94
+
95
+ def unpack_scales_and_zeros(scales_and_zeros):
96
+ assert len(scales_and_zeros.shape) == 3 and scales_and_zeros.shape[2] == 2
97
+ assert scales_and_zeros.dtype == torch.float
98
+ return torch.split(scales_and_zeros.transpose(0, 1), 1, 2)
99
+
100
+
101
+ def group_quantize_tensor_from_qparams(w, scales, zeros, n_bit=4, groupsize=128):
102
+ assert groupsize > 1
103
+ # needed for GPTQ single column quantize
104
+ if groupsize > w.shape[-1] and scales.shape[-1] == 1:
105
+ groupsize = w.shape[-1]
106
+
107
+ assert w.shape[-1] % groupsize == 0
108
+ assert w.dim() == 2
109
+
110
+ to_quant = w.reshape(-1, groupsize)
111
+ assert torch.isnan(to_quant).sum() == 0
112
+
113
+ scales = scales.reshape(-1, 1)
114
+ zeros = zeros.reshape(-1, 1)
115
+ min_val = zeros - scales * (2 ** (n_bit - 1))
116
+ max_int = 2**n_bit - 1
117
+ min_int = 0
118
+ w_int32 = (
119
+ to_quant.sub(min_val)
120
+ .div(scales)
121
+ .round()
122
+ .clamp_(min_int, max_int)
123
+ .to(torch.int32)
124
+ .reshape_as(w)
125
+ )
126
+
127
+ return w_int32
128
+
129
+
130
+ def group_quantize_tensor(w, n_bit=4, groupsize=128):
131
+ scales, zeros = get_group_qparams(w, n_bit, groupsize)
132
+ w_int32 = group_quantize_tensor_from_qparams(w, scales, zeros, n_bit, groupsize)
133
+ scales_and_zeros = pack_scales_and_zeros(scales, zeros)
134
+ return w_int32, scales_and_zeros
135
+
136
+
137
+ def group_dequantize_tensor_from_qparams(
138
+ w_int32, scales, zeros, n_bit=4, groupsize=128
139
+ ):
140
+ assert groupsize > 1
141
+ # needed for GPTQ single column dequantize
142
+ if groupsize > w_int32.shape[-1] and scales.shape[-1] == 1:
143
+ groupsize = w_int32.shape[-1]
144
+ assert w_int32.shape[-1] % groupsize == 0
145
+ assert w_int32.dim() == 2
146
+
147
+ w_int32_grouped = w_int32.reshape(-1, groupsize)
148
+ scales = scales.reshape(-1, 1)
149
+ zeros = zeros.reshape(-1, 1)
150
+
151
+ w_dq = (
152
+ w_int32_grouped.sub(2 ** (n_bit - 1)).mul(scales).add(zeros).reshape_as(w_int32)
153
+ )
154
+ return w_dq
155
+
156
+
157
+ def group_dequantize_tensor(w_int32, scales_and_zeros, n_bit=4, groupsize=128):
158
+ scales, zeros = unpack_scales_and_zeros(scales_and_zeros)
159
+ return group_dequantize_tensor_from_qparams(
160
+ w_int32, scales, zeros, n_bit, groupsize
161
+ )
162
+
163
+
164
+ class QuantHandler:
165
+ def __init__(self, mod):
166
+ self.mod = mod
167
+
168
+ def create_quantized_state_dict(self) -> "StateDict":
169
+ pass
170
+
171
+ def convert_for_runtime(self) -> "nn.Module":
172
+ pass
173
+
174
+
175
+ ##### Weight-only int8 per-channel quantized code ######
176
+
177
+
178
+ def replace_linear_weight_only_int8_per_channel(module):
179
+ for name, child in module.named_children():
180
+ if isinstance(child, nn.Linear):
181
+ setattr(
182
+ module,
183
+ name,
184
+ WeightOnlyInt8Linear(child.in_features, child.out_features),
185
+ )
186
+ else:
187
+ replace_linear_weight_only_int8_per_channel(child)
188
+
189
+
190
+ class WeightOnlyInt8QuantHandler:
191
+ def __init__(self, mod):
192
+ self.mod = mod
193
+
194
+ @torch.no_grad()
195
+ def create_quantized_state_dict(self):
196
+ cur_state_dict = self.mod.state_dict()
197
+ for fqn, mod in self.mod.named_modules():
198
+ if isinstance(mod, torch.nn.Linear):
199
+ int8_weight, scales, _ = dynamically_quantize_per_channel(
200
+ mod.weight.float(), -128, 127, torch.int8
201
+ )
202
+ cur_state_dict[f"{fqn}.weight"] = int8_weight
203
+ cur_state_dict[f"{fqn}.scales"] = scales.to(mod.weight.dtype)
204
+
205
+ return cur_state_dict
206
+
207
+ def convert_for_runtime(self):
208
+ replace_linear_weight_only_int8_per_channel(self.mod)
209
+ return self.mod
210
+
211
+
212
+ class WeightOnlyInt8Linear(torch.nn.Module):
213
+ __constants__ = ["in_features", "out_features"]
214
+ in_features: int
215
+ out_features: int
216
+ weight: torch.Tensor
217
+
218
+ def __init__(
219
+ self,
220
+ in_features: int,
221
+ out_features: int,
222
+ bias: bool = True,
223
+ device=None,
224
+ dtype=None,
225
+ ) -> None:
226
+ factory_kwargs = {"device": device, "dtype": dtype}
227
+ super().__init__()
228
+ self.in_features = in_features
229
+ self.out_features = out_features
230
+ self.register_buffer(
231
+ "weight", torch.empty((out_features, in_features), dtype=torch.int8)
232
+ )
233
+ self.register_buffer("scales", torch.ones(out_features, dtype=torch.bfloat16))
234
+
235
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
236
+ return F.linear(input, self.weight.to(dtype=input.dtype)) * self.scales
237
+
238
+
239
+ ##### weight only int4 per channel groupwise quantized code ######
240
+
241
+
242
+ def prepare_int4_weight_and_scales_and_zeros(weight_bf16, groupsize, inner_k_tiles):
243
+ weight_int32, scales_and_zeros = group_quantize_tensor(
244
+ weight_bf16, n_bit=4, groupsize=groupsize
245
+ )
246
+ weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
247
+ weight_int32, inner_k_tiles
248
+ )
249
+ return weight_int4pack, scales_and_zeros
250
+
251
+
252
+ def linear_forward_int4(x, weight_int4pack, scales_and_zeros, out_features, groupsize):
253
+ origin_x_size = x.size()
254
+ x = x.reshape(-1, origin_x_size[-1])
255
+ c = torch.ops.aten._weight_int4pack_mm(
256
+ x, weight_int4pack, groupsize, scales_and_zeros
257
+ )
258
+ new_shape = origin_x_size[:-1] + (out_features,)
259
+ c = c.reshape(new_shape)
260
+ return c
261
+
262
+
263
+ def _check_linear_int4_k(k, groupsize=1, inner_k_tiles=1):
264
+ return k % groupsize == 0 and k % (inner_k_tiles * 16) == 0
265
+
266
+
267
+ def replace_linear_int4(module, groupsize, inner_k_tiles, padding):
268
+ for name, child in module.named_children():
269
+ if isinstance(child, nn.Linear):
270
+ if _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles):
271
+ setattr(
272
+ module,
273
+ name,
274
+ WeightOnlyInt4Linear(
275
+ child.in_features,
276
+ child.out_features,
277
+ bias=False,
278
+ groupsize=groupsize,
279
+ inner_k_tiles=inner_k_tiles,
280
+ padding=False,
281
+ ),
282
+ )
283
+ elif padding:
284
+ setattr(
285
+ module,
286
+ name,
287
+ WeightOnlyInt4Linear(
288
+ child.in_features,
289
+ child.out_features,
290
+ bias=False,
291
+ groupsize=groupsize,
292
+ inner_k_tiles=inner_k_tiles,
293
+ padding=True,
294
+ ),
295
+ )
296
+ else:
297
+ replace_linear_int4(child, groupsize, inner_k_tiles, padding)
298
+
299
+
300
+ class WeightOnlyInt4QuantHandler:
301
+ def __init__(self, mod, groupsize=128, inner_k_tiles=8, padding=True):
302
+ self.mod = mod
303
+ self.groupsize = groupsize
304
+ self.inner_k_tiles = inner_k_tiles
305
+ self.padding = padding
306
+ assert groupsize in [32, 64, 128, 256]
307
+ assert inner_k_tiles in [2, 4, 8]
308
+
309
+ @torch.no_grad()
310
+ def create_quantized_state_dict(self):
311
+ cur_state_dict = self.mod.state_dict()
312
+ for fqn, mod in self.mod.named_modules():
313
+ if isinstance(mod, torch.nn.Linear):
314
+ assert not mod.bias
315
+ out_features = mod.out_features
316
+ in_features = mod.in_features
317
+ assert out_features % 8 == 0, "require out_features % 8 == 0"
318
+ print(f"linear: {fqn}, in={in_features}, out={out_features}")
319
+
320
+ weight = mod.weight.data
321
+ if not _check_linear_int4_k(
322
+ in_features, self.groupsize, self.inner_k_tiles
323
+ ):
324
+ if self.padding:
325
+ import torch.nn.functional as F
326
+
327
+ print(
328
+ f"warning: {fqn} is padded to satisfy in_features % 1024 == 0"
329
+ )
330
+ padded_in_features = find_multiple(in_features, 1024)
331
+ weight = F.pad(
332
+ weight, pad=(0, padded_in_features - in_features)
333
+ )
334
+ else:
335
+ print(
336
+ f"warning: {fqn} is skipped, int4 requires that in_features is 32, 64, or is divisible by 1024, "
337
+ + "and that groupsize and inner_k_tiles*16 evenly divide into it"
338
+ )
339
+ continue
340
+ (
341
+ weight_int4pack,
342
+ scales_and_zeros,
343
+ ) = prepare_int4_weight_and_scales_and_zeros(
344
+ weight.to(torch.bfloat16).to("cuda"),
345
+ self.groupsize,
346
+ self.inner_k_tiles,
347
+ )
348
+ cur_state_dict[f"{fqn}.weight"] = weight_int4pack.to("cpu")
349
+ cur_state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros.to("cpu")
350
+
351
+ return cur_state_dict
352
+
353
+ def convert_for_runtime(self):
354
+ replace_linear_int4(self.mod, self.groupsize, self.inner_k_tiles, self.padding)
355
+ return self.mod
356
+
357
+
358
+ class WeightOnlyInt4Linear(torch.nn.Module):
359
+ __constants__ = ["in_features", "out_features"]
360
+ in_features: int
361
+ out_features: int
362
+ weight: torch.Tensor
363
+
364
+ def __init__(
365
+ self,
366
+ in_features: int,
367
+ out_features: int,
368
+ bias=True,
369
+ device=None,
370
+ dtype=None,
371
+ groupsize: int = 128,
372
+ inner_k_tiles: int = 8,
373
+ padding: bool = True,
374
+ ) -> None:
375
+ super().__init__()
376
+ self.padding = padding
377
+ if padding:
378
+ self.origin_in_features = in_features
379
+ in_features = find_multiple(in_features, 1024)
380
+
381
+ self.in_features = in_features
382
+ self.out_features = out_features
383
+ assert not bias, "require bias=False"
384
+ self.groupsize = groupsize
385
+ self.inner_k_tiles = inner_k_tiles
386
+
387
+ assert out_features % 8 == 0, "require out_features % 8 == 0"
388
+ assert (
389
+ in_features % (inner_k_tiles * 16) == 0
390
+ ), "require in_features % (innerKTiles * 16) == 0"
391
+ self.register_buffer(
392
+ "weight",
393
+ torch.empty(
394
+ (
395
+ out_features // 8,
396
+ in_features // (inner_k_tiles * 16),
397
+ 32,
398
+ inner_k_tiles // 2,
399
+ ),
400
+ dtype=torch.int32,
401
+ ),
402
+ )
403
+ self.register_buffer(
404
+ "scales_and_zeros",
405
+ torch.empty(
406
+ (in_features // groupsize, out_features, 2), dtype=torch.bfloat16
407
+ ),
408
+ )
409
+
410
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
411
+ input = input.to(torch.bfloat16)
412
+ if self.padding:
413
+ import torch.nn.functional as F
414
+
415
+ input = F.pad(input, pad=(0, self.in_features - self.origin_in_features))
416
+ return linear_forward_int4(
417
+ input, self.weight, self.scales_and_zeros, self.out_features, self.groupsize
418
+ )
419
+
420
+
421
+ def generate_folder_name():
422
+ now = datetime.datetime.now()
423
+ folder_name = now.strftime("%Y%m%d_%H%M%S")
424
+ return folder_name
425
+
426
+
427
+ @click.command()
428
+ @click.option(
429
+ "--checkpoint-path",
430
+ type=click.Path(path_type=Path, exists=True),
431
+ default="checkpoints/fish-speech-1.4",
432
+ )
433
+ @click.option(
434
+ "--mode", type=str, default="int8", help="type of quantization to perform"
435
+ )
436
+ @click.option(
437
+ "--groupsize", type=int, default=128, help="Group size for int4 quantization."
438
+ )
439
+ @click.option("--timestamp", type=str, default="None", help="When to do quantization")
440
+ def quantize(checkpoint_path: Path, mode: str, groupsize: int, timestamp: str) -> None:
441
+
442
+ device = "cpu"
443
+ precision = torch.bfloat16
444
+
445
+ print("Loading model ...")
446
+ t0 = time.time()
447
+
448
+ model, _ = load_model(
449
+ checkpoint_path=checkpoint_path,
450
+ device=device,
451
+ precision=precision,
452
+ compile=False,
453
+ )
454
+ vq_model = "firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
455
+ now = timestamp if timestamp != "None" else generate_folder_name()
456
+
457
+ if mode == "int8":
458
+ print(
459
+ "Quantizing model weights for int8 weight-only symmetric per-channel quantization"
460
+ )
461
+ quant_handler = WeightOnlyInt8QuantHandler(model)
462
+ quantized_state_dict = quant_handler.create_quantized_state_dict()
463
+
464
+ dir_name = checkpoint_path
465
+ dst_name = Path(f"checkpoints/fs-1.2-int8-{now}")
466
+ shutil.copytree(str(dir_name.resolve()), str(dst_name.resolve()))
467
+ if (dst_name / vq_model).exists():
468
+ (dst_name / vq_model).unlink()
469
+ quantize_path = dst_name / "model.pth"
470
+
471
+ elif mode == "int4":
472
+ print(
473
+ "Quantizing model weights for int4 weight-only affine per-channel groupwise quantization"
474
+ )
475
+ quant_handler = WeightOnlyInt4QuantHandler(model, groupsize)
476
+ quantized_state_dict = quant_handler.create_quantized_state_dict()
477
+
478
+ dir_name = checkpoint_path
479
+ dst_name = Path(f"checkpoints/fs-1.2-int4-g{groupsize}-{now}")
480
+ shutil.copytree(str(dir_name.resolve()), str(dst_name.resolve()))
481
+ if (dst_name / vq_model).exists():
482
+ (dst_name / vq_model).unlink()
483
+ quantize_path = dst_name / "model.pth"
484
+
485
+ else:
486
+ raise ValueError(
487
+ f"Invalid quantization mode {mode} needs to be one of [int8, int4, int4-gpptq]"
488
+ )
489
+
490
+ print(f"Writing quantized weights to {quantize_path}")
491
+ quantize_path.unlink(missing_ok=True) # remove the existing file if one is already there
492
+ torch.save(quantized_state_dict, quantize_path)
493
+ print(f"Quantization complete took {time.time() - t0:.02f} seconds")
494
+
495
+
496
+ if __name__ == "__main__":
497
+ quantize()
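
For context on the handler API defined in this file, here is a minimal sketch of int8 weight-only quantization applied to a toy module. It assumes the classes above (e.g. `WeightOnlyInt8QuantHandler`) are in scope or importable from this module; the toy model and shapes are purely illustrative, not the fish-speech checkpoint, and the real entry point remains the `quantize` command above.

```python
import torch
import torch.nn as nn

# Toy model (illustrative only); bias is disabled because WeightOnlyInt8Linear
# carries no bias term after conversion.
model = nn.Sequential(
    nn.Linear(64, 128, bias=False),
    nn.ReLU(),
    nn.Linear(128, 16, bias=False),
)

handler = WeightOnlyInt8QuantHandler(model)
int8_state_dict = handler.create_quantized_state_dict()  # int8 weights + per-channel scales
model = handler.convert_for_runtime()  # swaps nn.Linear for WeightOnlyInt8Linear
model.load_state_dict(int8_state_dict)

with torch.no_grad():
    out = model(torch.randn(2, 64))
print(out.shape)  # torch.Size([2, 16])
```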
tools/llama/rebuild_tokenizer.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
2
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
3
+
4
+ # Initialize a tokenizer
5
+ tokenizer = Tokenizer(models.BPE())
6
+
7
+ # Customize pre-tokenization and decoding
8
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
9
+ tokenizer.decoder = decoders.ByteLevel()
10
+ tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
11
+
12
+ # Don't train the tokenizer
13
+ trainer = trainers.BpeTrainer(
14
+ vocab_size=0,
15
+ min_frequency=2,
16
+ initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
17
+ special_tokens=[
18
+ "<|begin_of_sequence|>",
19
+ "<|end_of_sequence|>",
20
+ "<|im_start|>",
21
+ "<|im_sep|>", # system, user, assistant, etc.
22
+ "<|im_end|>",
23
+ "<|semantic|>", # audio features
24
+ "<|pad|>",
25
+ ],
26
+ )
27
+
28
+ # <|im_start|>user<|im_sep|>...<|im_end|>
29
+ # <|im_start|>assistant<|im_sep|><|semantic|><|semantic|><|semantic|><|semantic|><|semantic|><|im_end|>
30
+ tokenizer.train_from_iterator([], trainer=trainer)
31
+
32
+ print(len(tokenizer.get_vocab()))
33
+ x = tokenizer.encode(
34
+ "Hello, how are you? dfgnviadfjoiviouajeiodfjv 你好世界 🈶<|semantic|>"
35
+ ).ids
36
+ print(x, len(x))
37
+ print(tokenizer.decode(x, skip_special_tokens=True))
38
+
39
+
40
+ tokenizer = PreTrainedTokenizerFast(
41
+ tokenizer_object=tokenizer,
42
+ pad_token="<|pad|>",
43
+ bos_token="<|begin_of_sequence|>",
44
+ eos_token="<|end_of_sequence|>",
45
+ )
46
+
47
+ # Try tokenizing a new sequence
48
+ sequence = "All around, too, lay vast quantities of the costliest merchandise, and treasures were heaped in every cranny of the rocks, but all these things only added to the desolation of the scene. 测试中文, 你好世界 🈶<|semantic|>"
49
+ encoded = tokenizer(sequence).input_ids
50
+
51
+ print("Test encoding....")
52
+ print(f"\tSentence: {sequence}")
53
+ print(f"\tEncoded: {encoded}")
54
+ print(f"\tDecoded: {tokenizer.batch_decode(encoded)}")
55
+ print(f"\tDecoded: {tokenizer.decode(encoded)}")
56
+
57
+ tokenizer.push_to_hub("fishaudio/fish-speech-1", private=True)
tools/msgpack_api.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from argparse import ArgumentParser
3
+ from pathlib import Path
4
+
5
+ import httpx
6
+ import ormsgpack
7
+
8
+ from tools.schema import ServeReferenceAudio, ServeTTSRequest
9
+
10
+ api_key = os.environ.get("FISH_API_KEY", "YOUR_API_KEY")
11
+
12
+
13
+ def audio_request():
14
+ # priority: ref_id > references
15
+ request = ServeTTSRequest(
16
+ text="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.",
17
+ # reference_id="114514",
18
+ references=[
19
+ ServeReferenceAudio(
20
+ audio=open("lengyue.wav", "rb").read(),
21
+ text=open("lengyue.lab", "r", encoding="utf-8").read(),
22
+ )
23
+ ],
24
+ streaming=True,
25
+ )
26
+
27
+ api_key = os.environ.get("FISH_API_KEY", "YOUR_API_KEY")
28
+
29
+ with (
30
+ httpx.Client() as client,
31
+ open("hello.wav", "wb") as f,
32
+ ):
33
+ with client.stream(
34
+ "POST",
35
+ "http://127.0.0.1:8080/v1/tts",
36
+ content=ormsgpack.packb(request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
37
+ headers={
38
+ "authorization": f"Bearer {api_key}",
39
+ "content-type": "application/msgpack",
40
+ },
41
+ timeout=None,
42
+ ) as response:
43
+ for chunk in response.iter_bytes():
44
+ f.write(chunk)
45
+
46
+
47
+ def asr_request(audio_path: Path):
48
+
49
+ # Read the audio file
50
+ with open(
51
+ str(audio_path),
52
+ "rb",
53
+ ) as audio_file:
54
+ audio_data = audio_file.read()
55
+
56
+ # Prepare the request data
57
+ request_data = {
58
+ "audio": audio_data,
59
+ "language": "en", # Optional: specify the language
60
+ "ignore_timestamps": False, # Optional: set to True to ignore precise timestamps
61
+ }
62
+
63
+ # Send the request
64
+ with httpx.Client() as client:
65
+ response = client.post(
66
+ "https://api.fish.audio/v1/asr",
67
+ headers={
68
+ "Authorization": f"Bearer {api_key}",
69
+ "Content-Type": "application/msgpack",
70
+ },
71
+ content=ormsgpack.packb(request_data),
72
+ )
73
+
74
+ # Parse the response
75
+ result = response.json()
76
+
77
+ print(f"Transcribed text: {result['text']}")
78
+ print(f"Audio duration: {result['duration']} seconds")
79
+
80
+ for segment in result["segments"]:
81
+ print(f"Segment: {segment['text']}")
82
+ print(f"Start time: {segment['start']}, End time: {segment['end']}")
83
+
84
+
85
+ def parse_args():
86
+ parser = ArgumentParser()
87
+ parser.add_argument("--audio_path", type=Path, default="audio/ref/trump.mp3")
88
+
89
+ return parser.parse_args()
90
+
91
+
92
+ if __name__ == "__main__":
93
+ args = parse_args()
94
+
95
+ asr_request(args.audio_path)
tools/post_api.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import base64
3
+ import wave
4
+
5
+ import ormsgpack
6
+ import pyaudio
7
+ import requests
8
+ from pydub import AudioSegment
9
+ from pydub.playback import play
10
+
11
+ from tools.file import audio_to_bytes, read_ref_text
12
+ from tools.schema import ServeReferenceAudio, ServeTTSRequest
13
+
14
+
15
+ def parse_args():
16
+
17
+ parser = argparse.ArgumentParser(
18
+ description="Send a WAV file and text to a server and receive synthesized audio.",
19
+ formatter_class=argparse.RawTextHelpFormatter,
20
+ )
21
+
22
+ parser.add_argument(
23
+ "--url",
24
+ "-u",
25
+ type=str,
26
+ default="http://127.0.0.1:8080/v1/tts",
27
+ help="URL of the server",
28
+ )
29
+ parser.add_argument(
30
+ "--text", "-t", type=str, required=True, help="Text to be synthesized"
31
+ )
32
+ parser.add_argument(
33
+ "--reference_id",
34
+ "-id",
35
+ type=str,
36
+ default=None,
37
+ help="ID of the reference model to be used for the speech\n(Local: name of folder containing audios and files)",
38
+ )
39
+ parser.add_argument(
40
+ "--reference_audio",
41
+ "-ra",
42
+ type=str,
43
+ nargs="+",
44
+ default=None,
45
+ help="Path to the audio file",
46
+ )
47
+ parser.add_argument(
48
+ "--reference_text",
49
+ "-rt",
50
+ type=str,
51
+ nargs="+",
52
+ default=None,
53
+ help="Reference text for voice synthesis",
54
+ )
55
+ parser.add_argument(
56
+ "--output",
57
+ "-o",
58
+ type=str,
59
+ default="generated_audio",
60
+ help="Output audio file name",
61
+ )
62
+ parser.add_argument(
63
+ "--play",
64
+ type=bool,
65
+ default=True,
66
+ help="Whether to play audio after receiving data",
67
+ )
68
+ parser.add_argument("--normalize", type=bool, default=True)
69
+ parser.add_argument(
70
+ "--format", type=str, choices=["wav", "mp3", "flac"], default="wav"
71
+ )
72
+ parser.add_argument(
73
+ "--mp3_bitrate", type=int, choices=[64, 128, 192], default=64, help="kHz"
74
+ )
75
+ parser.add_argument("--opus_bitrate", type=int, default=-1000)
76
+ parser.add_argument(
77
+ "--latency",
78
+ type=str,
79
+ default="normal",
80
+ choices=["normal", "balanced"],
81
+ help="Used in api.fish.audio/v1/tts",
82
+ )
83
+ parser.add_argument(
84
+ "--max_new_tokens",
85
+ type=int,
86
+ default=0,
87
+ help="Maximum new tokens to generate. \n0 means no limit.",
88
+ )
89
+ parser.add_argument(
90
+ "--chunk_length", type=int, default=200, help="Chunk length for synthesis"
91
+ )
92
+ parser.add_argument(
93
+ "--top_p", type=float, default=0.7, help="Top-p sampling for synthesis"
94
+ )
95
+ parser.add_argument(
96
+ "--repetition_penalty",
97
+ type=float,
98
+ default=1.2,
99
+ help="Repetition penalty for synthesis",
100
+ )
101
+ parser.add_argument(
102
+ "--temperature", type=float, default=0.7, help="Temperature for sampling"
103
+ )
104
+
105
+ parser.add_argument(
106
+ "--streaming", type=bool, default=False, help="Enable streaming response"
107
+ )
108
+ parser.add_argument(
109
+ "--channels", type=int, default=1, help="Number of audio channels"
110
+ )
111
+ parser.add_argument("--rate", type=int, default=44100, help="Sample rate for audio")
112
+ parser.add_argument(
113
+ "--use_memory_cache",
114
+ type=str,
115
+ default="never",
116
+ choices=["on-demand", "never"],
117
+ help="Cache encoded references codes in memory.\n"
118
+ "If `on-demand`, the server will use cached encodings\n "
119
+ "instead of encoding reference audio again.",
120
+ )
121
+ parser.add_argument(
122
+ "--seed",
123
+ type=int,
124
+ default=None,
125
+ help="`None` means randomized inference, otherwise deterministic.\n"
126
+ "It can't be used for fixing a timbre.",
127
+ )
128
+
129
+ return parser.parse_args()
130
+
131
+
132
+ if __name__ == "__main__":
133
+
134
+ args = parse_args()
135
+
136
+ idstr: str | None = args.reference_id
137
+ # priority: ref_id > [{text, audio},...]
138
+ if idstr is None:
139
+ ref_audios = args.reference_audio
140
+ ref_texts = args.reference_text
141
+ if ref_audios is None:
142
+ byte_audios = []
143
+ else:
144
+ byte_audios = [audio_to_bytes(ref_audio) for ref_audio in ref_audios]
145
+ if ref_texts is None:
146
+ ref_texts = []
147
+ else:
148
+ ref_texts = [read_ref_text(ref_text) for ref_text in ref_texts]
149
+ else:
150
+ byte_audios = []
151
+ ref_texts = []
152
+ pass # in api.py
153
+
154
+ data = {
155
+ "text": args.text,
156
+ "references": [
157
+ ServeReferenceAudio(audio=ref_audio, text=ref_text)
158
+ for ref_text, ref_audio in zip(ref_texts, byte_audios)
159
+ ],
160
+ "reference_id": idstr,
161
+ "normalize": args.normalize,
162
+ "format": args.format,
163
+ "mp3_bitrate": args.mp3_bitrate,
164
+ "opus_bitrate": args.opus_bitrate,
165
+ "max_new_tokens": args.max_new_tokens,
166
+ "chunk_length": args.chunk_length,
167
+ "top_p": args.top_p,
168
+ "repetition_penalty": args.repetition_penalty,
169
+ "temperature": args.temperature,
170
+ "streaming": args.streaming,
171
+ "use_memory_cache": args.use_memory_cache,
172
+ "seed": args.seed,
173
+ }
174
+
175
+ pydantic_data = ServeTTSRequest(**data)
176
+
177
+ response = requests.post(
178
+ args.url,
179
+ data=ormsgpack.packb(pydantic_data, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
180
+ stream=args.streaming,
181
+ headers={
182
+ "authorization": "Bearer YOUR_API_KEY",
183
+ "content-type": "application/msgpack",
184
+ },
185
+ )
186
+
187
+ if response.status_code == 200:
188
+ if args.streaming:
189
+ p = pyaudio.PyAudio()
190
+ audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format
191
+ stream = p.open(
192
+ format=audio_format, channels=args.channels, rate=args.rate, output=True
193
+ )
194
+
195
+ wf = wave.open(f"{args.output}.wav", "wb")
196
+ wf.setnchannels(args.channels)
197
+ wf.setsampwidth(p.get_sample_size(audio_format))
198
+ wf.setframerate(args.rate)
199
+
200
+ stream_stopped_flag = False
201
+
202
+ try:
203
+ for chunk in response.iter_content(chunk_size=1024):
204
+ if chunk:
205
+ stream.write(chunk)
206
+ wf.writeframesraw(chunk)
207
+ else:
208
+ if not stream_stopped_flag:
209
+ stream.stop_stream()
210
+ stream_stopped_flag = True
211
+ finally:
212
+ stream.close()
213
+ p.terminate()
214
+ wf.close()
215
+ else:
216
+ audio_content = response.content
217
+ audio_path = f"{args.output}.{args.format}"
218
+ with open(audio_path, "wb") as audio_file:
219
+ audio_file.write(audio_content)
220
+
221
+ audio = AudioSegment.from_file(audio_path, format=args.format)
222
+ if args.play:
223
+ play(audio)
224
+ print(f"Audio has been saved to '{audio_path}'.")
225
+ else:
226
+ print(f"Request failed with status code {response.status_code}")
227
+ print(response.json())
tools/schema.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import queue
3
+ from dataclasses import dataclass
4
+ from typing import Annotated, Literal, Optional
5
+
6
+ import torch
7
+ from pydantic import AfterValidator, BaseModel, Field, confloat, conint, conlist
8
+ from pydantic.functional_validators import SkipValidation
9
+
10
+ from fish_speech.conversation import Message, TextPart, VQPart
11
+
12
+ GLOBAL_NUM_SAMPLES = int(os.getenv("GLOBAL_NUM_SAMPLES", 1))
13
+
14
+
15
+ class ServeVQPart(BaseModel):
16
+ type: Literal["vq"] = "vq"
17
+ codes: SkipValidation[list[list[int]]]
18
+
19
+
20
+ class ServeTextPart(BaseModel):
21
+ type: Literal["text"] = "text"
22
+ text: str
23
+
24
+
25
+ class ServeAudioPart(BaseModel):
26
+ type: Literal["audio"] = "audio"
27
+ audio: bytes
28
+
29
+
30
+ @dataclass
31
+ class ASRPackRequest:
32
+ audio: torch.Tensor
33
+ result_queue: queue.Queue
34
+ language: str
35
+
36
+
37
+ class ServeASRRequest(BaseModel):
38
+ # The audio should be uncompressed PCM float16 audio
39
+ audios: list[bytes]
40
+ sample_rate: int = 44100
41
+ language: Literal["zh", "en", "ja", "auto"] = "auto"
42
+
43
+
44
+ class ServeASRTranscription(BaseModel):
45
+ text: str
46
+ duration: float
47
+ huge_gap: bool
48
+
49
+
50
+ class ServeASRSegment(BaseModel):
51
+ text: str
52
+ start: float
53
+ end: float
54
+
55
+
56
+ class ServeTimedASRResponse(BaseModel):
57
+ text: str
58
+ segments: list[ServeASRSegment]
59
+ duration: float
60
+
61
+
62
+ class ServeASRResponse(BaseModel):
63
+ transcriptions: list[ServeASRTranscription]
64
+
65
+
66
+ class ServeMessage(BaseModel):
67
+ role: Literal["system", "assistant", "user"]
68
+ parts: list[ServeVQPart | ServeTextPart]
69
+
70
+ def to_conversation_message(self):
71
+ new_message = Message(role=self.role, parts=[])
72
+ for part in self.parts:
73
+ if isinstance(part, ServeTextPart):
74
+ new_message.parts.append(TextPart(text=part.text))
75
+ elif isinstance(part, ServeVQPart):
76
+ new_message.parts.append(
77
+ VQPart(codes=torch.tensor(part.codes, dtype=torch.int))
78
+ )
79
+ else:
80
+ raise ValueError(f"Unsupported part type: {part}")
81
+
82
+ return new_message
83
+
84
+
85
+ class ServeRequest(BaseModel):
86
+ messages: Annotated[list[ServeMessage], conlist(ServeMessage, min_length=1)]
87
+ max_new_tokens: int = 1024
88
+ top_p: float = 0.7
89
+ repetition_penalty: float = 1.2
90
+ temperature: float = 0.7
91
+ streaming: bool = False
92
+ num_samples: int = 1
93
+ early_stop_threshold: float = 1.0
94
+
95
+
96
+ class ServeVQGANEncodeRequest(BaseModel):
97
+ # The audio here should be in wav, mp3, etc
98
+ audios: list[bytes]
99
+
100
+
101
+ class ServeVQGANEncodeResponse(BaseModel):
102
+ tokens: SkipValidation[list[list[list[int]]]]
103
+
104
+
105
+ class ServeVQGANDecodeRequest(BaseModel):
106
+ tokens: SkipValidation[list[list[list[int]]]]
107
+
108
+
109
+ class ServeVQGANDecodeResponse(BaseModel):
110
+ # The audio here should be in PCM float16 format
111
+ audios: list[bytes]
112
+
113
+
114
+ class ServeReferenceAudio(BaseModel):
115
+ audio: bytes
116
+ text: str
117
+
118
+
119
+ class ServeForwardMessage(BaseModel):
120
+ role: str
121
+ content: str
122
+
123
+
124
+ class ServeResponse(BaseModel):
125
+ messages: list[ServeMessage]
126
+ finish_reason: Literal["stop", "error"] | None = None
127
+ stats: dict[str, int | float | str] = {}
128
+
129
+
130
+ class ServeStreamDelta(BaseModel):
131
+ role: Literal["system", "assistant", "user"] | None = None
132
+ part: ServeVQPart | ServeTextPart | None = None
133
+
134
+
135
+ class ServeStreamResponse(BaseModel):
136
+ sample_id: int = 0
137
+ delta: ServeStreamDelta | None = None
138
+ finish_reason: Literal["stop", "error"] | None = None
139
+ stats: dict[str, int | float | str] | None = None
140
+
141
+
142
+ class ServeReferenceAudio(BaseModel):
143
+ audio: bytes
144
+ text: str
145
+
146
+ def __repr__(self) -> str:
147
+ return f"ServeReferenceAudio(text={self.text!r}, audio_size={len(self.audio)})"
148
+
149
+
150
+ class ServeChatRequestV1(BaseModel):
151
+ model: str = "llama3-8b"
152
+ messages: list[ServeForwardMessage] = []
153
+ audio: bytes | None = None
154
+ temperature: float = 1.0
155
+ top_p: float = 1.0
156
+ max_tokens: int = 256
157
+ voice: str = "jessica"
158
+ tts_audio_format: Literal["mp3", "pcm", "opus"] = "mp3"
159
+ tts_audio_bitrate: Literal[16, 24, 32, 48, 64, 96, 128, 192] = 128
160
+
161
+
162
+ class ServeTTSRequest(BaseModel):
163
+ text: str
164
+ chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
165
+ # Audio format
166
+ format: Literal["wav", "pcm", "mp3"] = "wav"
167
+ mp3_bitrate: Literal[64, 128, 192] = 128
168
+ # References audios for in-context learning
169
+ references: list[ServeReferenceAudio] = []
170
+ # Reference id
171
+ # For example, if you want to use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/
172
+ # Just pass 7f92f8afb8ec43bf81429cc1c9199cb1
173
+ reference_id: str | None = None
174
+ seed: int | None = None
175
+ use_memory_cache: Literal["on-demand", "never"] = "never"
176
+ # Normalize text for en & zh, this increase stability for numbers
177
+ normalize: bool = True
178
+ mp3_bitrate: Optional[int] = 64
179
+ opus_bitrate: Optional[int] = -1000
180
+ # Balance mode will reduce latency to 300ms, but may decrease stability
181
+ latency: Literal["normal", "balanced"] = "normal"
182
+ # not usually used below
183
+ streaming: bool = False
184
+ max_new_tokens: int = 1024
185
+ top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
186
+ repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
187
+ temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
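
As a quick, hedged illustration of how the constrained fields above behave (assuming `ServeTTSRequest` is importable as `tools.schema.ServeTTSRequest`; the text values are arbitrary), pydantic enforces the `Annotated` bounds at construction time:

```python
from pydantic import ValidationError

from tools.schema import ServeTTSRequest  # import path assumed from this upload

req = ServeTTSRequest(text="Hello there.", chunk_length=200, top_p=0.8)
print(req.format, req.latency)  # wav normal (defaults)

try:
    ServeTTSRequest(text="Hello there.", chunk_length=50)  # violates the ge=100 bound
except ValidationError as e:
    print(e.errors()[0]["loc"])  # ('chunk_length',)
```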
tools/sensevoice/README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FunASR Command Line Interface
2
+
3
+ This tool provides a command-line interface for separating vocals from instrumental tracks, converting videos to audio, and performing speech-to-text transcription on the resulting audio files.
4
+
5
+ ## Requirements
6
+
7
+ - Python >= 3.10
8
+ - PyTorch <= 2.3.1
9
+ - ffmpeg, pydub, audio-separator[gpu].
10
+
11
+ ## Installation
12
+
13
+ Install the required packages:
14
+
15
+ ```bash
16
+ pip install -e .[stable]
17
+ ```
18
+
19
+ Make sure you have `ffmpeg` installed and available in your `PATH`.
20
+
21
+ ## Usage
22
+
23
+ ### Basic Usage
24
+
25
+ To run the tool with default settings:
26
+
27
+ ```bash
28
+ python tools/sensevoice/fun_asr.py --audio-dir <audio_directory> --save-dir <output_directory>
29
+ ```
30
+
31
+ ## Options
32
+
33
+ | Option | Description |
34
+ | :-----------------------: | :---------------------------------------------------------------------------: |
35
+ | --audio-dir | Directory containing audio or video files. |
36
+ | --save-dir | Directory to save processed audio files. |
37
+ | --device | Device to use for processing. Options: cuda (default) or cpu. |
38
+ | --language | Language of the transcription. Default is auto. |
39
+ | --max_single_segment_time | Maximum duration of a single audio segment in milliseconds. Default is 20000. |
40
+ | --punc | Enable punctuation prediction. |
41
+ | --denoise | Enable noise reduction (vocal separation). |
42
+
43
+ ## Example
44
+
45
+ To process audio files in the directory `path/to/audio` and save the output to `path/to/output`, with punctuation and noise reduction enabled:
46
+
47
+ ```bash
48
+ python tools/sensevoice/fun_asr.py --audio-dir path/to/audio --save-dir path/to/output --punc --denoise
49
+ ```
50
+
51
+ ## Additional Notes
52
+
53
+ - The tool supports both audio and video files. Videos will be converted to audio automatically.
54
+ - If the `--denoise` option is used, the tool will perform vocal separation to isolate the vocals from the instrumental tracks.
55
+ - The script will automatically create the necessary directories under `--save-dir`.
56
+
57
+ ## Troubleshooting
58
+
59
+ If you encounter any issues, make sure all dependencies are correctly installed and configured. For more detailed troubleshooting, refer to the documentation of each dependency.
tools/sensevoice/__init__.py ADDED
File without changes
tools/sensevoice/auto_model.py ADDED
@@ -0,0 +1,573 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import copy
7
+ import json
8
+ import logging
9
+ import os.path
10
+ import random
11
+ import re
12
+ import string
13
+ import time
14
+
15
+ import numpy as np
16
+ import torch
17
+ from funasr.download.download_model_from_hub import download_model
18
+ from funasr.download.file import download_from_url
19
+ from funasr.register import tables
20
+ from funasr.train_utils.load_pretrained_model import load_pretrained_model
21
+ from funasr.train_utils.set_all_random_seed import set_all_random_seed
22
+ from funasr.utils import export_utils, misc
23
+ from funasr.utils.load_utils import load_audio_text_image_video, load_bytes
24
+ from funasr.utils.misc import deep_update
25
+ from funasr.utils.timestamp_tools import timestamp_sentence, timestamp_sentence_en
26
+ from tqdm import tqdm
27
+
28
+ from .vad_utils import merge_vad, slice_padding_audio_samples
29
+
30
+ try:
31
+ from funasr.models.campplus.cluster_backend import ClusterBackend
32
+ from funasr.models.campplus.utils import distribute_spk, postprocess, sv_chunk
33
+ except:
34
+ pass
35
+
36
+
37
+ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
38
+ """ """
39
+ data_list = []
40
+ key_list = []
41
+ filelist = [".scp", ".txt", ".json", ".jsonl", ".text"]
42
+
43
+ chars = string.ascii_letters + string.digits
44
+ if isinstance(data_in, str):
45
+ if data_in.startswith("http://") or data_in.startswith("https://"): # url
46
+ data_in = download_from_url(data_in)
47
+
48
+ if isinstance(data_in, str) and os.path.exists(
49
+ data_in
50
+ ): # wav_path; filelist: wav.scp, file.jsonl;text.txt;
51
+ _, file_extension = os.path.splitext(data_in)
52
+ file_extension = file_extension.lower()
53
+ if file_extension in filelist: # filelist: wav.scp, file.jsonl;text.txt;
54
+ with open(data_in, encoding="utf-8") as fin:
55
+ for line in fin:
56
+ key = "rand_key_" + "".join(random.choice(chars) for _ in range(13))
57
+ if data_in.endswith(
58
+ ".jsonl"
59
+ ): # file.jsonl: json.dumps({"source": data})
60
+ lines = json.loads(line.strip())
61
+ data = lines["source"]
62
+ key = data["key"] if "key" in data else key
63
+ else: # filelist, wav.scp, text.txt: id \t data or data
64
+ lines = line.strip().split(maxsplit=1)
65
+ data = lines[1] if len(lines) > 1 else lines[0]
66
+ key = lines[0] if len(lines) > 1 else key
67
+
68
+ data_list.append(data)
69
+ key_list.append(key)
70
+ else:
71
+ if key is None:
72
+ # key = "rand_key_" + "".join(random.choice(chars) for _ in range(13))
73
+ key = misc.extract_filename_without_extension(data_in)
74
+ data_list = [data_in]
75
+ key_list = [key]
76
+ elif isinstance(data_in, (list, tuple)):
77
+ if data_type is not None and isinstance(
78
+ data_type, (list, tuple)
79
+ ): # mutiple inputs
80
+ data_list_tmp = []
81
+ for data_in_i, data_type_i in zip(data_in, data_type):
82
+ key_list, data_list_i = prepare_data_iterator(
83
+ data_in=data_in_i, data_type=data_type_i
84
+ )
85
+ data_list_tmp.append(data_list_i)
86
+ data_list = []
87
+ for item in zip(*data_list_tmp):
88
+ data_list.append(item)
89
+ else:
90
+ # [audio sample point, fbank, text]
91
+ data_list = data_in
92
+ key_list = []
93
+ for data_i in data_in:
94
+ if isinstance(data_i, str) and os.path.exists(data_i):
95
+ key = misc.extract_filename_without_extension(data_i)
96
+ else:
97
+ if key is None:
98
+ key = "rand_key_" + "".join(
99
+ random.choice(chars) for _ in range(13)
100
+ )
101
+ key_list.append(key)
102
+
103
+ else: # raw text; audio sample point, fbank; bytes
104
+ if isinstance(data_in, bytes): # audio bytes
105
+ data_in = load_bytes(data_in)
106
+ if key is None:
107
+ key = "rand_key_" + "".join(random.choice(chars) for _ in range(13))
108
+ data_list = [data_in]
109
+ key_list = [key]
110
+
111
+ return key_list, data_list
112
+
113
+
114
+ class AutoModel:
115
+
116
+ def __init__(self, **kwargs):
117
+
118
+ try:
119
+ from funasr.utils.version_checker import check_for_update
120
+
121
+ print(
122
+ "Check update of funasr, and it would cost few times. You may disable it by set `disable_update=True` in AutoModel"
123
+ )
124
+ check_for_update(disable=kwargs.get("disable_update", False))
125
+ except:
126
+ pass
127
+
128
+ log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())
129
+ logging.basicConfig(level=log_level)
130
+
131
+ model, kwargs = self.build_model(**kwargs)
132
+
133
+ # if vad_model is not None, build vad model else None
134
+ vad_model = kwargs.get("vad_model", None)
135
+ vad_kwargs = (
136
+ {} if kwargs.get("vad_kwargs", {}) is None else kwargs.get("vad_kwargs", {})
137
+ )
138
+ if vad_model is not None:
139
+ logging.info("Building VAD model.")
140
+ vad_kwargs["model"] = vad_model
141
+ vad_kwargs["model_revision"] = kwargs.get("vad_model_revision", "master")
142
+ vad_kwargs["device"] = kwargs["device"]
143
+ vad_model, vad_kwargs = self.build_model(**vad_kwargs)
144
+
145
+ # if punc_model is not None, build punc model else None
146
+ punc_model = kwargs.get("punc_model", None)
147
+ punc_kwargs = (
148
+ {}
149
+ if kwargs.get("punc_kwargs", {}) is None
150
+ else kwargs.get("punc_kwargs", {})
151
+ )
152
+ if punc_model is not None:
153
+ logging.info("Building punc model.")
154
+ punc_kwargs["model"] = punc_model
155
+ punc_kwargs["model_revision"] = kwargs.get("punc_model_revision", "master")
156
+ punc_kwargs["device"] = kwargs["device"]
157
+ punc_model, punc_kwargs = self.build_model(**punc_kwargs)
158
+
159
+ # if spk_model is not None, build spk model else None
160
+ spk_model = kwargs.get("spk_model", None)
161
+ spk_kwargs = (
162
+ {} if kwargs.get("spk_kwargs", {}) is None else kwargs.get("spk_kwargs", {})
163
+ )
164
+ if spk_model is not None:
165
+ logging.info("Building SPK model.")
166
+ spk_kwargs["model"] = spk_model
167
+ spk_kwargs["model_revision"] = kwargs.get("spk_model_revision", "master")
168
+ spk_kwargs["device"] = kwargs["device"]
169
+ spk_model, spk_kwargs = self.build_model(**spk_kwargs)
170
+ self.cb_model = ClusterBackend().to(kwargs["device"])
171
+ spk_mode = kwargs.get("spk_mode", "punc_segment")
172
+ if spk_mode not in ["default", "vad_segment", "punc_segment"]:
173
+ logging.error(
174
+ "spk_mode should be one of default, vad_segment and punc_segment."
175
+ )
176
+ self.spk_mode = spk_mode
177
+
178
+ self.kwargs = kwargs
179
+ self.model = model
180
+ self.vad_model = vad_model
181
+ self.vad_kwargs = vad_kwargs
182
+ self.punc_model = punc_model
183
+ self.punc_kwargs = punc_kwargs
184
+ self.spk_model = spk_model
185
+ self.spk_kwargs = spk_kwargs
186
+ self.model_path = kwargs.get("model_path")
187
+
188
+ @staticmethod
189
+ def build_model(**kwargs):
190
+ assert "model" in kwargs
191
+ if "model_conf" not in kwargs:
192
+ logging.info(
193
+ "download models from model hub: {}".format(kwargs.get("hub", "ms"))
194
+ )
195
+ kwargs = download_model(**kwargs)
196
+
197
+ set_all_random_seed(kwargs.get("seed", 0))
198
+
199
+ device = kwargs.get("device", "cuda")
200
+ if not torch.cuda.is_available() or kwargs.get("ngpu", 1) == 0:
201
+ device = "cpu"
202
+ kwargs["batch_size"] = 1
203
+ kwargs["device"] = device
204
+
205
+ torch.set_num_threads(kwargs.get("ncpu", 4))
206
+
207
+ # build tokenizer
208
+ tokenizer = kwargs.get("tokenizer", None)
209
+ if tokenizer is not None:
210
+ tokenizer_class = tables.tokenizer_classes.get(tokenizer)
211
+ tokenizer = tokenizer_class(**kwargs.get("tokenizer_conf", {}))
212
+ kwargs["token_list"] = (
213
+ tokenizer.token_list if hasattr(tokenizer, "token_list") else None
214
+ )
215
+ kwargs["token_list"] = (
216
+ tokenizer.get_vocab()
217
+ if hasattr(tokenizer, "get_vocab")
218
+ else kwargs["token_list"]
219
+ )
220
+ vocab_size = (
221
+ len(kwargs["token_list"]) if kwargs["token_list"] is not None else -1
222
+ )
223
+ if vocab_size == -1 and hasattr(tokenizer, "get_vocab_size"):
224
+ vocab_size = tokenizer.get_vocab_size()
225
+ else:
226
+ vocab_size = -1
227
+ kwargs["tokenizer"] = tokenizer
228
+
229
+ # build frontend
230
+ frontend = kwargs.get("frontend", None)
231
+ kwargs["input_size"] = None
232
+ if frontend is not None:
233
+ frontend_class = tables.frontend_classes.get(frontend)
234
+ frontend = frontend_class(**kwargs.get("frontend_conf", {}))
235
+ kwargs["input_size"] = (
236
+ frontend.output_size() if hasattr(frontend, "output_size") else None
237
+ )
238
+ kwargs["frontend"] = frontend
239
+ # build model
240
+ model_class = tables.model_classes.get(kwargs["model"])
241
+ assert model_class is not None, f'{kwargs["model"]} is not registered'
242
+ model_conf = {}
243
+ deep_update(model_conf, kwargs.get("model_conf", {}))
244
+ deep_update(model_conf, kwargs)
245
+ model = model_class(**model_conf, vocab_size=vocab_size)
246
+
247
+ # init_param
248
+ init_param = kwargs.get("init_param", None)
249
+ if init_param is not None:
250
+ if os.path.exists(init_param):
251
+ logging.info(f"Loading pretrained params from {init_param}")
252
+ load_pretrained_model(
253
+ model=model,
254
+ path=init_param,
255
+ ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True),
256
+ oss_bucket=kwargs.get("oss_bucket", None),
257
+ scope_map=kwargs.get("scope_map", []),
258
+ excludes=kwargs.get("excludes", None),
259
+ )
260
+ else:
261
+ print(f"error, init_param does not exist!: {init_param}")
262
+
263
+ # fp16
264
+ if kwargs.get("fp16", False):
265
+ model.to(torch.float16)
266
+ elif kwargs.get("bf16", False):
267
+ model.to(torch.bfloat16)
268
+ model.to(device)
269
+
270
+ if not kwargs.get("disable_log", True):
271
+ tables.print()
272
+
273
+ return model, kwargs
274
+
275
+ def __call__(self, *args, **cfg):
276
+ kwargs = self.kwargs
277
+ deep_update(kwargs, cfg)
278
+ res = self.model(*args, kwargs)
279
+ return res
280
+
281
+ def generate(self, input, input_len=None, **cfg):
282
+ if self.vad_model is None:
283
+ return self.inference(input, input_len=input_len, **cfg)
284
+
285
+ else:
286
+ return self.inference_with_vad(input, input_len=input_len, **cfg)
287
+
288
+ def inference(
289
+ self, input, input_len=None, model=None, kwargs=None, key=None, **cfg
290
+ ):
291
+ kwargs = self.kwargs if kwargs is None else kwargs
292
+ if "cache" in kwargs:
293
+ kwargs.pop("cache")
294
+ deep_update(kwargs, cfg)
295
+ model = self.model if model is None else model
296
+ model.eval()
297
+
298
+ batch_size = kwargs.get("batch_size", 1)
299
+ # if kwargs.get("device", "cpu") == "cpu":
300
+ # batch_size = 1
301
+
302
+ key_list, data_list = prepare_data_iterator(
303
+ input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key
304
+ )
305
+
306
+ speed_stats = {}
307
+ asr_result_list = []
308
+ num_samples = len(data_list)
309
+ disable_pbar = self.kwargs.get("disable_pbar", False)
310
+ pbar = (
311
+ tqdm(colour="blue", total=num_samples, dynamic_ncols=True)
312
+ if not disable_pbar
313
+ else None
314
+ )
315
+ time_speech_total = 0.0
316
+ time_escape_total = 0.0
317
+ for beg_idx in range(0, num_samples, batch_size):
318
+ end_idx = min(num_samples, beg_idx + batch_size)
319
+ data_batch = data_list[beg_idx:end_idx]
320
+ key_batch = key_list[beg_idx:end_idx]
321
+ batch = {"data_in": data_batch, "key": key_batch}
322
+
323
+ if (end_idx - beg_idx) == 1 and kwargs.get(
324
+ "data_type", None
325
+ ) == "fbank": # fbank
326
+ batch["data_in"] = data_batch[0]
327
+ batch["data_lengths"] = input_len
328
+
329
+ time1 = time.perf_counter()
330
+ with torch.no_grad():
331
+ res = model.inference(**batch, **kwargs)
332
+ if isinstance(res, (list, tuple)):
333
+ results = res[0] if len(res) > 0 else [{"text": ""}]
334
+ meta_data = res[1] if len(res) > 1 else {}
335
+ time2 = time.perf_counter()
336
+
337
+ asr_result_list.extend(results)
338
+
339
+ # batch_data_time = time_per_frame_s * data_batch_i["speech_lengths"].sum().item()
340
+ batch_data_time = meta_data.get("batch_data_time", -1)
341
+ time_escape = time2 - time1
342
+ speed_stats["load_data"] = meta_data.get("load_data", 0.0)
343
+ speed_stats["extract_feat"] = meta_data.get("extract_feat", 0.0)
344
+ speed_stats["forward"] = f"{time_escape:0.3f}"
345
+ speed_stats["batch_size"] = f"{len(results)}"
346
+ speed_stats["rtf"] = f"{(time_escape) / batch_data_time:0.3f}"
347
+ description = f"{speed_stats}, "
348
+ if pbar:
349
+ pbar.update(end_idx - beg_idx)
350
+ pbar.set_description(description)
351
+ time_speech_total += batch_data_time
352
+ time_escape_total += time_escape
353
+
354
+ if pbar:
355
+ # pbar.update(1)
356
+ pbar.set_description(f"rtf_avg: {time_escape_total/time_speech_total:0.3f}")
357
+ torch.cuda.empty_cache()
358
+ return asr_result_list
359
+
360
+ def vad(self, input, input_len=None, **cfg):
361
+ kwargs = self.kwargs
362
+ # step.1: compute the vad model
363
+ deep_update(self.vad_kwargs, cfg)
364
+ beg_vad = time.time()
365
+ res = self.inference(
366
+ input,
367
+ input_len=input_len,
368
+ model=self.vad_model,
369
+ kwargs=self.vad_kwargs,
370
+ **cfg,
371
+ )
372
+ end_vad = time.time()
373
+ # FIX(gcf): concat the vad clips for the sense voice model for better aed
374
+ if cfg.get("merge_vad", False):
375
+ for i in range(len(res)):
376
+ res[i]["value"] = merge_vad(
377
+ res[i]["value"], kwargs.get("merge_length_s", 15) * 1000
378
+ )
379
+ elapsed = end_vad - beg_vad
380
+ return elapsed, res
381
+
382
+ def inference_with_vadres(self, input, vad_res, input_len=None, **cfg):
383
+
384
+ kwargs = self.kwargs
385
+
386
+ # step.2 compute asr model
387
+ model = self.model
388
+ deep_update(kwargs, cfg)
389
+ batch_size = max(int(kwargs.get("batch_size_s", 300)) * 1000, 1)
390
+ batch_size_threshold_ms = int(kwargs.get("batch_size_threshold_s", 60)) * 1000
391
+ kwargs["batch_size"] = batch_size
392
+
393
+ key_list, data_list = prepare_data_iterator(
394
+ input, input_len=input_len, data_type=kwargs.get("data_type", None)
395
+ )
396
+ results_ret_list = []
397
+ time_speech_total_all_samples = 1e-6
398
+
399
+ beg_total = time.time()
400
+ pbar_total = (
401
+ tqdm(colour="red", total=len(vad_res), dynamic_ncols=True)
402
+ if not kwargs.get("disable_pbar", False)
403
+ else None
404
+ )
405
+
406
+ for i in range(len(vad_res)):
407
+ key = vad_res[i]["key"]
408
+ vadsegments = vad_res[i]["value"]
409
+ input_i = data_list[i]
410
+ fs = kwargs["frontend"].fs if hasattr(kwargs["frontend"], "fs") else 16000
411
+ speech = load_audio_text_image_video(
412
+ input_i, fs=fs, audio_fs=kwargs.get("fs", 16000)
413
+ )
414
+ speech_lengths = len(speech)
415
+ n = len(vadsegments)
416
+ data_with_index = [(vadsegments[i], i) for i in range(n)]
417
+ sorted_data = sorted(data_with_index, key=lambda x: x[0][1] - x[0][0])
418
+ results_sorted = []
419
+
420
+ if not len(sorted_data):
421
+ results_ret_list.append({"key": key, "text": "", "timestamp": []})
422
+ logging.info("decoding, utt: {}, empty speech".format(key))
423
+ continue
424
+
425
+ if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
426
+ batch_size = max(
427
+ batch_size, sorted_data[0][0][1] - sorted_data[0][0][0]
428
+ )
429
+
430
+ if kwargs["device"] == "cpu":
431
+ batch_size = 0
432
+
433
+ beg_idx = 0
434
+ beg_asr_total = time.time()
435
+ time_speech_total_per_sample = speech_lengths / 16000
436
+ time_speech_total_all_samples += time_speech_total_per_sample
437
+
438
+ # pbar_sample = tqdm(colour="blue", total=n, dynamic_ncols=True)
439
+
440
+ all_segments = []
441
+ max_len_in_batch = 0
442
+ end_idx = 1
443
+
444
+ for j, _ in enumerate(range(0, n)):
445
+ # pbar_sample.update(1)
446
+ sample_length = sorted_data[j][0][1] - sorted_data[j][0][0]
447
+ potential_batch_length = max(max_len_in_batch, sample_length) * (
448
+ j + 1 - beg_idx
449
+ )
450
+ # batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
451
+ if (
452
+ j < n - 1
453
+ and sample_length < batch_size_threshold_ms
454
+ and potential_batch_length < batch_size
455
+ ):
456
+ max_len_in_batch = max(max_len_in_batch, sample_length)
457
+ end_idx += 1
458
+ continue
459
+
460
+ speech_j, speech_lengths_j, intervals = slice_padding_audio_samples(
461
+ speech, speech_lengths, sorted_data[beg_idx:end_idx]
462
+ )
463
+ results = self.inference(
464
+ speech_j, input_len=None, model=model, kwargs=kwargs, **cfg
465
+ )
466
+
467
+ for _b in range(len(speech_j)):
468
+ results[_b]["interval"] = intervals[_b]
469
+
470
+ if self.spk_model is not None:
471
+ # compose vad segments: [[start_time_sec, end_time_sec, speech], [...]]
472
+ for _b in range(len(speech_j)):
473
+ vad_segments = [
474
+ [
475
+ sorted_data[beg_idx:end_idx][_b][0][0] / 1000.0,
476
+ sorted_data[beg_idx:end_idx][_b][0][1] / 1000.0,
477
+ np.array(speech_j[_b]),
478
+ ]
479
+ ]
480
+ segments = sv_chunk(vad_segments)
481
+ all_segments.extend(segments)
482
+ speech_b = [i[2] for i in segments]
483
+ spk_res = self.inference(
484
+ speech_b,
485
+ input_len=None,
486
+ model=self.spk_model,
487
+ kwargs=kwargs,
488
+ **cfg,
489
+ )
490
+ results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"]
491
+
492
+ beg_idx = end_idx
493
+ end_idx += 1
494
+ max_len_in_batch = sample_length
495
+ if len(results) < 1:
496
+ continue
497
+ results_sorted.extend(results)
498
+
499
+ # end_asr_total = time.time()
500
+ # time_escape_total_per_sample = end_asr_total - beg_asr_total
501
+ # pbar_sample.update(1)
502
+ # pbar_sample.set_description(f"rtf_avg_per_sample: {time_escape_total_per_sample / time_speech_total_per_sample:0.3f}, "
503
+ # f"time_speech_total_per_sample: {time_speech_total_per_sample: 0.3f}, "
504
+ # f"time_escape_total_per_sample: {time_escape_total_per_sample:0.3f}")
505
+
506
+ restored_data = [0] * n
507
+ for j in range(n):
508
+ index = sorted_data[j][1]
509
+ cur = results_sorted[j]
510
+ pattern = r"<\|([^|]+)\|>"
511
+ emotion_string = re.findall(pattern, cur["text"])
512
+ cur["text"] = re.sub(pattern, "", cur["text"])
513
+ cur["emo"] = "".join([f"<|{t}|>" for t in emotion_string])
514
+ if self.punc_model is not None and len(cur["text"].strip()) > 0:
515
+ deep_update(self.punc_kwargs, cfg)
516
+ punc_res = self.inference(
517
+ cur["text"],
518
+ model=self.punc_model,
519
+ kwargs=self.punc_kwargs,
520
+ **cfg,
521
+ )
522
+ cur["text"] = punc_res[0]["text"]
523
+
524
+ restored_data[index] = cur
525
+
526
+ end_asr_total = time.time()
527
+ time_escape_total_per_sample = end_asr_total - beg_asr_total
528
+ if pbar_total:
529
+ pbar_total.update(1)
530
+ pbar_total.set_description(
531
+ f"rtf_avg: {time_escape_total_per_sample / time_speech_total_per_sample:0.3f}, "
532
+ f"time_speech: {time_speech_total_per_sample: 0.3f}, "
533
+ f"time_escape: {time_escape_total_per_sample:0.3f}"
534
+ )
535
+
536
+ # end_total = time.time()
537
+ # time_escape_total_all_samples = end_total - beg_total
538
+ # print(f"rtf_avg_all: {time_escape_total_all_samples / time_speech_total_all_samples:0.3f}, "
539
+ # f"time_speech_all: {time_speech_total_all_samples: 0.3f}, "
540
+ # f"time_escape_all: {time_escape_total_all_samples:0.3f}")
541
+ return restored_data
542
+
543
+ def export(self, input=None, **cfg):
544
+ """
545
+
546
+ :param input:
547
+ :param type:
548
+ :param quantize:
549
+ :param fallback_num:
550
+ :param calib_num:
551
+ :param opset_version:
552
+ :param cfg:
553
+ :return:
554
+ """
555
+
556
+ device = cfg.get("device", "cpu")
557
+ model = self.model.to(device=device)
558
+ kwargs = self.kwargs
559
+ deep_update(kwargs, cfg)
560
+ kwargs["device"] = device
561
+ del kwargs["model"]
562
+ model.eval()
563
+
564
+ type = kwargs.get("type", "onnx")
565
+
566
+ key_list, data_list = prepare_data_iterator(
567
+ input, input_len=None, data_type=kwargs.get("data_type", None), key=None
568
+ )
569
+
570
+ with torch.no_grad():
571
+ export_dir = export_utils.export(model=model, data_in=data_list, **kwargs)
572
+
573
+ return export_dir
tools/sensevoice/fun_asr.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import os
3
+ import re
4
+
5
+ from audio_separator.separator import Separator
6
+
7
+ os.environ["MODELSCOPE_CACHE"] = "./.cache/funasr"
8
+ os.environ["UVR5_CACHE"] = "./.cache/uvr5-models"
9
+ import json
10
+ import subprocess
11
+ from pathlib import Path
12
+
13
+ import click
14
+ import torch
15
+ from loguru import logger
16
+ from pydub import AudioSegment
17
+ from silero_vad import get_speech_timestamps, load_silero_vad, read_audio
18
+ from tqdm import tqdm
19
+
20
+ from tools.file import AUDIO_EXTENSIONS, VIDEO_EXTENSIONS, list_files
21
+ from tools.sensevoice.auto_model import AutoModel
22
+
23
+
24
+ def uvr5_cli(
25
+ audio_dir: Path,
26
+ output_folder: Path,
27
+ audio_files: list[Path] | None = None,
28
+ output_format: str = "flac",
29
+ model: str = "BS-Roformer-Viperx-1297.ckpt",
30
+ ):
31
+ # ["BS-Roformer-Viperx-1297.ckpt", "BS-Roformer-Viperx-1296.ckpt", "BS-Roformer-Viperx-1053.ckpt", "Mel-Roformer-Viperx-1143.ckpt"]
32
+ sepr = Separator(
33
+ model_file_dir=os.environ["UVR5_CACHE"],
34
+ output_dir=output_folder,
35
+ output_format=output_format,
36
+ )
37
+ dictmodel = {
38
+ "BS-Roformer-Viperx-1297.ckpt": "model_bs_roformer_ep_317_sdr_12.9755.ckpt",
39
+ "BS-Roformer-Viperx-1296.ckpt": "model_bs_roformer_ep_368_sdr_12.9628.ckpt",
40
+ "BS-Roformer-Viperx-1053.ckpt": "model_bs_roformer_ep_937_sdr_10.5309.ckpt",
41
+ "Mel-Roformer-Viperx-1143.ckpt": "model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt",
42
+ }
43
+ roformer_model = dictmodel[model]
44
+ sepr.load_model(roformer_model)
45
+ if audio_files is None:
46
+ audio_files = list_files(
47
+ path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
48
+ )
49
+ total_files = len(audio_files)
50
+
51
+ print(f"{total_files} audio files found")
52
+
53
+ res = []
54
+ for audio in tqdm(audio_files, desc="Denoising: "):
55
+ file_path = str(audio_dir / audio)
56
+ sep_out = sepr.separate(file_path)
57
+ if isinstance(sep_out, str):
58
+ res.append(sep_out)
59
+ elif isinstance(sep_out, list):
60
+ res.extend(sep_out)
61
+ del sepr
62
+ gc.collect()
63
+ if torch.cuda.is_available():
64
+ torch.cuda.empty_cache()
65
+
66
+ return res, roformer_model
67
+
68
+
69
+ def get_sample_rate(media_path: Path):
70
+ result = subprocess.run(
71
+ [
72
+ "ffprobe",
73
+ "-v",
74
+ "quiet",
75
+ "-print_format",
76
+ "json",
77
+ "-show_streams",
78
+ str(media_path),
79
+ ],
80
+ capture_output=True,
81
+ text=True,
82
+ check=True,
83
+ )
84
+ media_info = json.loads(result.stdout)
85
+ for stream in media_info.get("streams", []):
86
+ if stream.get("codec_type") == "audio":
87
+ return stream.get("sample_rate")
88
+ return "44100" # Default sample rate if not found
89
+
90
+
91
+ def convert_to_mono(src_path: Path, out_path: Path, out_fmt: str = "wav"):
92
+ sr = get_sample_rate(src_path)
93
+ out_path.parent.mkdir(parents=True, exist_ok=True)
94
+ if src_path.resolve() == out_path.resolve():
95
+ output = str(out_path.with_stem(out_path.stem + f"_{sr}"))
96
+ else:
97
+ output = str(out_path)
98
+ subprocess.run(
99
+ [
100
+ "ffmpeg",
101
+ "-loglevel",
102
+ "error",
103
+ "-i",
104
+ str(src_path),
105
+ "-acodec",
106
+ "pcm_s16le" if out_fmt == "wav" else "flac",
107
+ "-ar",
108
+ sr,
109
+ "-ac",
110
+ "1",
111
+ "-y",
112
+ output,
113
+ ],
114
+ check=True,
115
+ )
116
+ return out_path
117
+
118
+
119
+ def convert_video_to_audio(video_path: Path, audio_dir: Path):
120
+ cur_dir = audio_dir / video_path.relative_to(audio_dir).parent
121
+ vocals = [
122
+ p
123
+ for p in cur_dir.glob(f"{video_path.stem}_(Vocals)*.*")
124
+ if p.suffix in AUDIO_EXTENSIONS
125
+ ]
126
+ if len(vocals) > 0:
127
+ return vocals[0]
128
+ audio_path = cur_dir / f"{video_path.stem}.wav"
129
+ convert_to_mono(video_path, audio_path)
130
+ return audio_path
131
+
132
+
133
+ @click.command()
134
+ @click.option("--audio-dir", required=True, help="Directory containing audio files")
135
+ @click.option(
136
+ "--save-dir", required=True, help="Directory to save processed audio files"
137
+ )
138
+ @click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
139
+ @click.option("--language", default="auto", help="Language of the transcription")
140
+ @click.option(
141
+ "--max_single_segment_time",
142
+ default=20000,
143
+ type=int,
144
+ help="Maximum of Output single audio duration(ms)",
145
+ )
146
+ @click.option("--fsmn-vad/--silero-vad", default=False)
147
+ @click.option("--punc/--no-punc", default=False)
148
+ @click.option("--denoise/--no-denoise", default=False)
149
+ @click.option("--save_emo/--no_save_emo", default=False)
150
+ def main(
151
+ audio_dir: str,
152
+ save_dir: str,
153
+ device: str,
154
+ language: str,
155
+ max_single_segment_time: int,
156
+ fsmn_vad: bool,
157
+ punc: bool,
158
+ denoise: bool,
159
+ save_emo: bool,
160
+ ):
161
+
162
+ audios_path = Path(audio_dir)
163
+ save_path = Path(save_dir)
164
+ save_path.mkdir(parents=True, exist_ok=True)
165
+
166
+ video_files = list_files(
167
+ path=audio_dir, extensions=VIDEO_EXTENSIONS, recursive=True
168
+ )
169
+ v2a_files = [convert_video_to_audio(p, audio_dir) for p in video_files]
170
+
171
+ if denoise:
172
+ VOCAL = "_(Vocals)"
173
+ original_files = [
174
+ p
175
+ for p in audios_path.glob("**/*")
176
+ if p.suffix in AUDIO_EXTENSIONS and VOCAL not in p.stem
177
+ ]
178
+
179
+ _, cur_model = uvr5_cli(
180
+ audio_dir=audio_dir, output_folder=audio_dir, audio_files=original_files
181
+ )
182
+ need_remove = [p for p in audios_path.glob("**/*(Instrumental)*")]
183
+ need_remove.extend(original_files)
184
+ for _ in need_remove:
185
+ _.unlink()
186
+ vocal_files = [
187
+ p
188
+ for p in audios_path.glob("**/*")
189
+ if p.suffix in AUDIO_EXTENSIONS and VOCAL in p.stem
190
+ ]
191
+ for f in vocal_files:
192
+ fn, ext = f.stem, f.suffix
193
+
194
+ v_pos = fn.find(VOCAL + "_" + cur_model.split(".")[0])
195
+ if v_pos != -1:
196
+ new_fn = fn[: v_pos + len(VOCAL)]
197
+ new_f = f.with_name(new_fn + ext)
198
+ f = f.rename(new_f)
199
+ convert_to_mono(f, f, "flac")
200
+ f.unlink()
201
+
202
+ audio_files = list_files(
203
+ path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
204
+ )
205
+
206
+ logger.info("Loading / Downloading Funasr model...")
207
+
208
+ model_dir = "iic/SenseVoiceSmall"
209
+
210
+ vad_model = "fsmn-vad" if fsmn_vad else None
211
+ vad_kwargs = {"max_single_segment_time": max_single_segment_time}
212
+ punc_model = "ct-punc" if punc else None
213
+
214
+ manager = AutoModel(
215
+ model=model_dir,
216
+ trust_remote_code=False,
217
+ vad_model=vad_model,
218
+ vad_kwargs=vad_kwargs,
219
+ punc_model=punc_model,
220
+ device=device,
221
+ )
222
+
223
+ if not fsmn_vad and vad_model is None:
224
+ vad_model = load_silero_vad()
225
+
226
+ logger.info("Model loaded.")
227
+
228
+ pattern = re.compile(r"_\d{3}\.")
229
+
230
+ for file_path in tqdm(audio_files, desc="Processing audio file"):
231
+
232
+ if pattern.search(file_path.name):
233
+ # logger.info(f"Skipping {file_path} as it has already been processed.")
234
+ continue
235
+
236
+ file_stem = file_path.stem
237
+ file_suffix = file_path.suffix
238
+
239
+ rel_path = Path(file_path).relative_to(audio_dir)
240
+ (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)
241
+
242
+ audio = AudioSegment.from_file(file_path)
243
+
244
+ cfg = dict(
245
+ cache={},
246
+ language=language, # "zh", "en", "yue", "ja", "ko", "nospeech"
247
+ use_itn=False,
248
+ batch_size_s=60,
249
+ )
250
+
251
+ if fsmn_vad:
252
+ elapsed, vad_res = manager.vad(input=str(file_path), **cfg)
253
+ else:
254
+ wav = read_audio(
255
+ str(file_path)
256
+ ) # backend (sox, soundfile, or ffmpeg) required!
257
+ audio_key = file_path.stem
258
+ audio_val = []
259
+ speech_timestamps = get_speech_timestamps(
260
+ wav,
261
+ vad_model,
262
+ max_speech_duration_s=max_single_segment_time // 1000,
263
+ return_seconds=True,
264
+ )
265
+
266
+ audio_val = [
267
+ [int(timestamp["start"] * 1000), int(timestamp["end"] * 1000)]
268
+ for timestamp in speech_timestamps
269
+ ]
270
+ vad_res = []
271
+ vad_res.append(dict(key=audio_key, value=audio_val))
272
+
273
+ res = manager.inference_with_vadres(
274
+ input=str(file_path), vad_res=vad_res, **cfg
275
+ )
276
+
277
+ for i, info in enumerate(res):
278
+ [start_ms, end_ms] = info["interval"]
279
+ text = info["text"]
280
+ emo = info["emo"]
281
+ sliced_audio = audio[start_ms:end_ms]
282
+ audio_save_path = (
283
+ save_path / rel_path.parent / f"{file_stem}_{i:03d}{file_suffix}"
284
+ )
285
+ sliced_audio.export(audio_save_path, format=file_suffix[1:])
286
+ print(f"Exported {audio_save_path}: {text}")
287
+
288
+ transcript_save_path = (
289
+ save_path / rel_path.parent / f"{file_stem}_{i:03d}.lab"
290
+ )
291
+ with open(
292
+ transcript_save_path,
293
+ "w",
294
+ encoding="utf-8",
295
+ ) as f:
296
+ f.write(text)
297
+
298
+ if save_emo:
299
+ emo_save_path = save_path / rel_path.parent / f"{file_stem}_{i:03d}.emo"
300
+ with open(
301
+ emo_save_path,
302
+ "w",
303
+ encoding="utf-8",
304
+ ) as f:
305
+ f.write(emo)
306
+
307
+ if audios_path.resolve() == save_path.resolve():
308
+ file_path.unlink()
309
+
310
+
311
+ if __name__ == "__main__":
312
+ main()
313
+ exit(0)
314
+ from funasr.utils.postprocess_utils import rich_transcription_postprocess
315
+
316
+ # Load the audio file
317
+ audio_path = Path(r"D:\PythonProject\ok\1_output_(Vocals).wav")
318
+ model_dir = "iic/SenseVoiceSmall"
319
+ m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir, device="cuda:0")
320
+ m.eval()
321
+
322
+ res = m.inference(
323
+ data_in=f"{kwargs['model_path']}/example/zh.mp3",
324
+ language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech"
325
+ use_itn=False,
326
+ ban_emo_unk=False,
327
+ **kwargs,
328
+ )
329
+
330
+ print(res)
331
+ text = rich_transcription_postprocess(res[0][0]["text"])
332
+ print(text)
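
Typical invocation of the script above; the paths are placeholders and only flags defined by its click options are used:

python tools/sensevoice/fun_asr.py \
    --audio-dir data/raw --save-dir data/clean \
    --denoise --punc --language auto
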
tools/sensevoice/vad_utils.py ADDED
@@ -0,0 +1,61 @@
1
+ import torch
2
+ from torch.nn.utils.rnn import pad_sequence
3
+
4
+
5
+ def slice_padding_fbank(speech, speech_lengths, vad_segments):
6
+ speech_list = []
7
+ speech_lengths_list = []
8
+ for i, segment in enumerate(vad_segments):
9
+
10
+ bed_idx = int(segment[0][0] * 16)
11
+ end_idx = min(int(segment[0][1] * 16), speech_lengths[0])
12
+ speech_i = speech[0, bed_idx:end_idx]
13
+ speech_lengths_i = end_idx - bed_idx
14
+ speech_list.append(speech_i)
15
+ speech_lengths_list.append(speech_lengths_i)
16
+ feats_pad = pad_sequence(speech_list, batch_first=True, padding_value=0.0)
17
+ speech_lengths_pad = torch.Tensor(speech_lengths_list).int()
18
+ return feats_pad, speech_lengths_pad
19
+
20
+
21
+ def slice_padding_audio_samples(speech, speech_lengths, vad_segments):
22
+ speech_list = []
23
+ speech_lengths_list = []
24
+ intervals = []
25
+ for i, segment in enumerate(vad_segments):
26
+ bed_idx = int(segment[0][0] * 16)
27
+ end_idx = min(int(segment[0][1] * 16), speech_lengths)
28
+ speech_i = speech[bed_idx:end_idx]
29
+ speech_lengths_i = end_idx - bed_idx
30
+ speech_list.append(speech_i)
31
+ speech_lengths_list.append(speech_lengths_i)
32
+ intervals.append([bed_idx // 16, end_idx // 16])
33
+
34
+ return speech_list, speech_lengths_list, intervals
35
+
36
+
37
+ def merge_vad(vad_result, max_length=15000, min_length=0):
38
+ new_result = []
39
+ if len(vad_result) <= 1:
40
+ return vad_result
41
+ time_step = [t[0] for t in vad_result] + [t[1] for t in vad_result]
42
+ time_step = sorted(list(set(time_step)))
43
+ if len(time_step) == 0:
44
+ return []
45
+ bg = 0
46
+ for i in range(len(time_step) - 1):
47
+ time = time_step[i]
48
+ if time_step[i + 1] - bg < max_length:
49
+ continue
50
+ if time - bg > min_length:
51
+ new_result.append([bg, time])
52
+ # if time - bg < max_length * 1.5:
53
+ # new_result.append([bg, time])
54
+ # else:
55
+ # split_num = int(time - bg) // max_length + 1
56
+ # spl_l = int(time - bg) // split_num
57
+ # for j in range(split_num):
58
+ # new_result.append([bg + j * spl_l, bg + (j + 1) * spl_l])
59
+ bg = time
60
+ new_result.append([bg, time_step[-1]])
61
+ return new_result
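
A quick sketch of merge_vad on millisecond [start, end] segments (the assumed input format, matching the VAD results built in fun_asr.py); the example segments are made up:

from tools.sensevoice.vad_utils import merge_vad

segments = [[0, 4000], [4500, 9000], [10000, 20000]]
# Merge adjacent segments until a chunk would exceed max_length ms.
print(merge_vad(segments, max_length=15000))  # -> [[0, 10000], [10000, 20000]]
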
tools/smart_pad.py ADDED
@@ -0,0 +1,60 @@
1
+ import random
2
+ from multiprocessing import Pool
3
+ from pathlib import Path
4
+
5
+ import click
6
+ import librosa
7
+ import torch.nn.functional as F
8
+ import torchaudio
9
+ from tqdm import tqdm
10
+
11
+ from tools.file import AUDIO_EXTENSIONS, list_files
12
+
13
+ threshold = 10 ** (-50 / 20.0)
14
+
15
+
16
+ def process(file):
17
+ waveform, sample_rate = torchaudio.load(str(file), backend="sox")
18
+ if waveform.size(0) > 1:
19
+ waveform = waveform.mean(dim=0, keepdim=True)
20
+
21
+ loudness = librosa.feature.rms(
22
+ y=waveform.numpy().squeeze(), frame_length=2048, hop_length=512, center=True
23
+ )[0]
24
+
25
+ for i in range(len(loudness) - 1, 0, -1):
26
+ if loudness[i] > threshold:
27
+ break
28
+
29
+ end_silent_time = (len(loudness) - i) * 512 / sample_rate
30
+
31
+ if end_silent_time <= 0.3:
32
+ random_time = random.uniform(0.3, 0.7) - end_silent_time
33
+ waveform = F.pad(
34
+ waveform, (0, int(random_time * sample_rate)), mode="constant", value=0
35
+ )
36
+
37
+ for i in range(len(loudness)):
38
+ if loudness[i] > threshold:
39
+ break
40
+
41
+ start_silent_time = i * 512 / sample_rate
42
+
43
+ if start_silent_time > 0.02:
44
+ waveform = waveform[:, int((start_silent_time - 0.02) * sample_rate) :]
45
+
46
+ torchaudio.save(uri=str(file), src=waveform, sample_rate=sample_rate)
47
+
48
+
49
+ @click.command()
50
+ @click.argument("source", type=Path)
51
+ @click.option("--num-workers", type=int, default=12)
52
+ def main(source, num_workers):
53
+ files = list(list_files(source, AUDIO_EXTENSIONS, recursive=True))
54
+
55
+ with Pool(num_workers) as p:
56
+ list(tqdm(p.imap_unordered(process, files), total=len(files)))
57
+
58
+
59
+ if __name__ == "__main__":
60
+ main()
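
Example run of the in-place padding/trimming script above (the path is a placeholder):

python tools/smart_pad.py data/clean --num-workers 8
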
tools/vqgan/__pycache__/inference.cpython-310.pyc ADDED
Binary file (3.53 kB). View file
 
tools/vqgan/create_train_split.py ADDED
@@ -0,0 +1,83 @@
1
+ import math
2
+ from pathlib import Path
3
+ from random import Random
4
+
5
+ import click
6
+ from loguru import logger
7
+ from pydub import AudioSegment
8
+ from tqdm import tqdm
9
+
10
+ from tools.file import AUDIO_EXTENSIONS, list_files, load_filelist
11
+
12
+
13
+ @click.command()
14
+ @click.argument("root", type=click.Path(exists=True, path_type=Path))
15
+ @click.option("--val-ratio", type=float, default=None)
16
+ @click.option("--val-count", type=int, default=None)
17
+ @click.option("--filelist", default=None, type=Path)
18
+ @click.option("--min-duration", default=None, type=float)
19
+ @click.option("--max-duration", default=None, type=float)
20
+ def main(root, val_ratio, val_count, filelist, min_duration, max_duration):
21
+ if filelist:
22
+ files = [i[0] for i in load_filelist(filelist)]
23
+ else:
24
+ files = list_files(root, AUDIO_EXTENSIONS, recursive=True, sort=True)
25
+
26
+ if min_duration is None and max_duration is None:
27
+ filtered_files = list(map(str, [file.relative_to(root) for file in files]))
28
+ else:
29
+ filtered_files = []
30
+ for file in tqdm(files):
31
+ try:
32
+ audio = AudioSegment.from_file(str(file))
33
+ duration = len(audio) / 1000.0
34
+
35
+ if min_duration is not None and duration < min_duration:
36
+ logger.info(
37
+ f"Skipping {file} due to duration {duration:.2f} < {min_duration:.2f}"
38
+ )
39
+ continue
40
+
41
+ if max_duration is not None and duration > max_duration:
42
+ logger.info(
43
+ f"Skipping {file} due to duration {duration:.2f} > {max_duration:.2f}"
44
+ )
45
+ continue
46
+
47
+ filtered_files.append(str(file.relative_to(root)))
48
+ except Exception as e:
49
+ logger.info(f"Error processing {file}: {e}")
50
+
51
+ logger.info(
52
+ f"Found {len(files)} files, remaining {len(filtered_files)} files after filtering"
53
+ )
54
+
55
+ Random(42).shuffle(filtered_files)
56
+
57
+ if val_count is None and val_ratio is None:
58
+ logger.info("Validation ratio and count not specified, using min(20%, 100)")
59
+ val_size = min(100, math.ceil(len(filtered_files) * 0.2))
60
+ elif val_count is not None and val_ratio is not None:
61
+ logger.error("Cannot specify both val_count and val_ratio")
62
+ return
63
+ elif val_count is not None:
64
+ if val_count < 1 or val_count > len(filtered_files):
65
+ logger.error("val_count must be between 1 and number of files")
66
+ return
67
+ val_size = val_count
68
+ else:
69
+ val_size = math.ceil(len(filtered_files) * val_ratio)
70
+
71
+ logger.info(f"Using {val_size} files for validation")
72
+
73
+ with open(root / "vq_train_filelist.txt", "w", encoding="utf-8") as f:
74
+ f.write("\n".join(filtered_files[val_size:]))
75
+
76
+ with open(root / "vq_val_filelist.txt", "w", encoding="utf-8") as f:
77
+ f.write("\n".join(filtered_files[:val_size]))
78
+
79
+ logger.info("Done")
80
+
81
+
82
+ if __name__ == "__main__":
83
+ main()
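
Example run of the split script above; it writes vq_train_filelist.txt and vq_val_filelist.txt under the given root (the path and values are placeholders):

python tools/vqgan/create_train_split.py data/clean --val-count 100 --min-duration 1.0
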
tools/vqgan/extract_vq.py ADDED
@@ -0,0 +1,233 @@
1
+ import os
2
+ import subprocess as sp
3
+ import sys
4
+ import time
5
+ from datetime import timedelta
6
+ from functools import lru_cache
7
+ from pathlib import Path
8
+ from random import Random
9
+
10
+ import click
11
+ import numpy as np
12
+ import torch
13
+ import torchaudio
14
+ from hydra import compose, initialize
15
+ from hydra.utils import instantiate
16
+ from lightning import LightningModule
17
+ from loguru import logger
18
+ from omegaconf import OmegaConf
19
+
20
+ from tools.file import AUDIO_EXTENSIONS, list_files, load_filelist
21
+
22
+ # register eval resolver
23
+ OmegaConf.register_new_resolver("eval", eval)
24
+ # This file is used to extract VQ token indices (.npy) from audio files with the VQ-GAN encoder.
25
+ # It's mainly used to generate the training data for the VQ model.
26
+
27
+ backends = torchaudio.list_audio_backends()
28
+
29
+ if "ffmpeg" in backends:
30
+ backend = "ffmpeg"
31
+ else:
32
+ backend = "soundfile"
33
+
34
+ RANK = int(os.environ.get("SLURM_PROCID", 0))
35
+ WORLD_SIZE = int(os.environ.get("SLURM_NTASKS", 1))
36
+
37
+ logger_format = (
38
+ "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
39
+ "<level>{level: <8}</level> | "
40
+ "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | "
41
+ "{extra[rank]} - <level>{message}</level>"
42
+ )
43
+ logger.configure(extra={"rank": f"RANK: {RANK} / {WORLD_SIZE}"})
44
+ logger.remove()
45
+ logger.add(sys.stderr, format=logger_format)
46
+
47
+
48
+ @lru_cache(maxsize=1)
49
+ def get_model(
50
+ config_name: str = "firefly_gan_vq",
51
+ checkpoint_path: str = "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
52
+ device: str | torch.device = "cuda",
53
+ ):
54
+ with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
55
+ cfg = compose(config_name=config_name)
56
+
57
+ model = instantiate(cfg)
58
+ state_dict = torch.load(
59
+ checkpoint_path,
60
+ map_location=device,
61
+ )
62
+ if "state_dict" in state_dict:
63
+ state_dict = state_dict["state_dict"]
64
+
65
+ if any("generator" in k for k in state_dict):
66
+ state_dict = {
67
+ k.replace("generator.", ""): v
68
+ for k, v in state_dict.items()
69
+ if "generator." in k
70
+ }
71
+
72
+ model.load_state_dict(state_dict, strict=False)
73
+ model.eval()
74
+ model.to(device)
75
+
76
+ logger.info(f"Loaded model")
77
+ return model
78
+
79
+
80
+ @torch.inference_mode()
81
+ def process_batch(files: list[Path], model) -> float:
82
+ wavs = []
83
+ audio_lengths = []
84
+ new_files = []
85
+ max_length = total_time = 0
86
+
87
+ for file in files:
88
+ try:
89
+ wav, sr = torchaudio.load(
90
+ str(file), backend=backend
91
+ ) # Need to install libsox-dev
92
+ except Exception as e:
93
+ logger.error(f"Error reading {file}: {e}")
94
+ continue
95
+
96
+ if wav.shape[0] > 1:
97
+ wav = wav.mean(dim=0, keepdim=True)
98
+
99
+ wav = torchaudio.functional.resample(
100
+ wav.cuda(), sr, model.spec_transform.sample_rate
101
+ )[0]
102
+ total_time += len(wav) / model.spec_transform.sample_rate
103
+ max_length = max(max_length, len(wav))
104
+
105
+ wavs.append(wav)
106
+ audio_lengths.append(len(wav))
107
+ new_files.append(file)
108
+
109
+ files = new_files
110
+
111
+ # Pad to max length
112
+ for i, wav in enumerate(wavs):
113
+ wavs[i] = torch.nn.functional.pad(wav, (0, max_length - len(wav)), "constant")
114
+
115
+ audios = torch.stack(wavs, dim=0)[:, None]
116
+ audio_lengths = torch.tensor(audio_lengths, device=model.device, dtype=torch.long)
117
+
118
+ # Calculate lengths
119
+ indices, feature_lengths = model.encode(audios, audio_lengths)
120
+
121
+ # Save to disk
122
+ outputs = indices.cpu().numpy()
123
+
124
+ for file, length, feature, audio_length in zip(
125
+ files, feature_lengths, outputs, audio_lengths
126
+ ):
127
+ feature = feature[:, :length]
128
+
129
+ # (T,)
130
+ with open(file.with_suffix(".npy"), "wb") as f:
131
+ np.save(f, feature)
132
+
133
+ return total_time
134
+
135
+
136
+ @click.command()
137
+ @click.argument("folder")
138
+ @click.option("--num-workers", default=1)
139
+ @click.option("--config-name", default="firefly_gan_vq")
140
+ @click.option(
141
+ "--checkpoint-path",
142
+ default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
143
+ )
144
+ @click.option("--batch-size", default=64)
145
+ @click.option("--filelist", default=None, type=Path)
146
+ def main(
147
+ folder: str,
148
+ num_workers: int,
149
+ config_name: str,
150
+ checkpoint_path: str,
151
+ batch_size: int,
152
+ filelist: Path,
153
+ ):
154
+ if num_workers > 1 and WORLD_SIZE != num_workers:
155
+ assert WORLD_SIZE == 1, "You should either use SLURM or this launcher, not both"
156
+
157
+ logger.info(f"Spawning {num_workers} workers")
158
+
159
+ if torch.cuda.is_available():
160
+ visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
161
+ if visible_devices is None:
162
+ visible_devices = list(range(torch.cuda.device_count()))
163
+ else:
164
+ visible_devices = visible_devices.split(",")
165
+ else:
166
+ # Set to empty string to avoid using GPU
167
+ visible_devices = [""]
168
+
169
+ processes = []
170
+ for i in range(num_workers):
171
+ env = os.environ.copy()
172
+ env["CUDA_VISIBLE_DEVICES"] = str(visible_devices[i % len(visible_devices)])
173
+ env["SLURM_PROCID"] = str(i)
174
+ env["SLURM_NTASKS"] = str(num_workers)
175
+
176
+ processes.append(
177
+ sp.Popen(
178
+ [sys.executable] + sys.argv.copy(),
179
+ env=env,
180
+ )
181
+ )
182
+
183
+ for p in processes:
184
+ p.wait()
185
+
186
+ logger.info(f"All workers finished")
187
+ return
188
+
189
+ # This is a worker
190
+ logger.info(f"Starting worker")
191
+ if filelist:
192
+ files = [i[0] for i in load_filelist(filelist)]
193
+ else:
194
+ files = list_files(folder, AUDIO_EXTENSIONS, recursive=True, sort=False)
195
+
196
+ print(f"Found {len(files)} files")
197
+ files = [Path(f) for f in files if not Path(f).with_suffix(".npy").exists()]
198
+
199
+ total_files = len(files)
200
+ files = files[RANK::WORLD_SIZE]
201
+ logger.info(f"Processing {len(files)}/{total_files} files")
202
+
203
+ # Batch processing
204
+ total_time = 0
205
+ begin_time = time.time()
206
+ processed_files = 0
207
+ model = get_model(config_name, checkpoint_path)
208
+
209
+ for n_batch, idx in enumerate(range(0, len(files), batch_size)):
210
+ batch = files[idx : idx + batch_size]
211
+ batch_time = process_batch(batch, model)
212
+
213
+ total_time += batch_time
214
+ processed_files += len(batch)
215
+
216
+ if (n_batch + 1) % 10 == 0:
217
+ eta = (
218
+ (time.time() - begin_time)
219
+ / processed_files
220
+ * (len(files) - processed_files)
221
+ )
222
+ logger.info(
223
+ f"Processed {processed_files} files, {total_time / 3600:.2f} hours of audio, "
224
+ + f"ETA: {timedelta(seconds=round(eta))}s"
225
+ )
226
+
227
+ logger.info(
228
+ f"Finished processing {len(files)} files, {total_time / 3600:.2f} hours of audio"
229
+ )
230
+
231
+
232
+ if __name__ == "__main__":
233
+ main()
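
Example run of the VQ extraction script above; it saves one .npy of codebook indices next to each audio file (the folder is a placeholder):

python tools/vqgan/extract_vq.py data/clean --num-workers 2 --batch-size 32
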
tools/vqgan/inference.py ADDED
@@ -0,0 +1,121 @@
1
+ from pathlib import Path
2
+
3
+ import click
4
+ import hydra
5
+ import numpy as np
6
+ import soundfile as sf
7
+ import torch
8
+ import torchaudio
9
+ from hydra import compose, initialize
10
+ from hydra.utils import instantiate
11
+ from loguru import logger
12
+ from omegaconf import OmegaConf
13
+
14
+ from tools.file import AUDIO_EXTENSIONS
15
+
16
+ # register eval resolver
17
+ OmegaConf.register_new_resolver("eval", eval)
18
+
19
+
20
+ def load_model(config_name, checkpoint_path, device="cuda"):
21
+ hydra.core.global_hydra.GlobalHydra.instance().clear()
22
+ with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
23
+ cfg = compose(config_name=config_name)
24
+
25
+ model = instantiate(cfg)
26
+ state_dict = torch.load(
27
+ checkpoint_path, map_location=device, mmap=True, weights_only=True
28
+ )
29
+ if "state_dict" in state_dict:
30
+ state_dict = state_dict["state_dict"]
31
+
32
+ if any("generator" in k for k in state_dict):
33
+ state_dict = {
34
+ k.replace("generator.", ""): v
35
+ for k, v in state_dict.items()
36
+ if "generator." in k
37
+ }
38
+
39
+ result = model.load_state_dict(state_dict, strict=False, assign=True)
40
+ model.eval()
41
+ model.to(device)
42
+
43
+ logger.info(f"Loaded model: {result}")
44
+ return model
45
+
46
+
47
+ @torch.no_grad()
48
+ @click.command()
49
+ @click.option(
50
+ "--input-path",
51
+ "-i",
52
+ default="test.wav",
53
+ type=click.Path(exists=True, path_type=Path),
54
+ )
55
+ @click.option(
56
+ "--output-path", "-o", default="fake.wav", type=click.Path(path_type=Path)
57
+ )
58
+ @click.option("--config-name", default="firefly_gan_vq")
59
+ @click.option(
60
+ "--checkpoint-path",
61
+ default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
62
+ )
63
+ @click.option(
64
+ "--device",
65
+ "-d",
66
+ default="cuda",
67
+ )
68
+ def main(input_path, output_path, config_name, checkpoint_path, device):
69
+ model = load_model(config_name, checkpoint_path, device=device)
70
+
71
+ if input_path.suffix in AUDIO_EXTENSIONS:
72
+ logger.info(f"Processing in-place reconstruction of {input_path}")
73
+
74
+ # Load audio
75
+ audio, sr = torchaudio.load(str(input_path))
76
+ if audio.shape[0] > 1:
77
+ audio = audio.mean(0, keepdim=True)
78
+ audio = torchaudio.functional.resample(
79
+ audio, sr, model.spec_transform.sample_rate
80
+ )
81
+
82
+ audios = audio[None].to(device)
83
+ logger.info(
84
+ f"Loaded audio with {audios.shape[2] / model.spec_transform.sample_rate:.2f} seconds"
85
+ )
86
+
87
+ # VQ Encoder
88
+ audio_lengths = torch.tensor([audios.shape[2]], device=device, dtype=torch.long)
89
+ indices = model.encode(audios, audio_lengths)[0][0]
90
+
91
+ logger.info(f"Generated indices of shape {indices.shape}")
92
+
93
+ # Save indices
94
+ np.save(output_path.with_suffix(".npy"), indices.cpu().numpy())
95
+ elif input_path.suffix == ".npy":
96
+ logger.info(f"Processing precomputed indices from {input_path}")
97
+ indices = np.load(input_path)
98
+ indices = torch.from_numpy(indices).to(device).long()
99
+ assert indices.ndim == 2, f"Expected 2D indices, got {indices.ndim}"
100
+ else:
101
+ raise ValueError(f"Unknown input type: {input_path}")
102
+
103
+ # Restore
104
+ feature_lengths = torch.tensor([indices.shape[1]], device=device)
105
+ fake_audios, _ = model.decode(
106
+ indices=indices[None], feature_lengths=feature_lengths
107
+ )
108
+ audio_time = fake_audios.shape[-1] / model.spec_transform.sample_rate
109
+
110
+ logger.info(
111
+ f"Generated audio of shape {fake_audios.shape}, equivalent to {audio_time:.2f} seconds from {indices.shape[1]} features, features/second: {indices.shape[1] / audio_time:.2f}"
112
+ )
113
+
114
+ # Save audio
115
+ fake_audio = fake_audios[0, 0].float().cpu().numpy()
116
+ sf.write(output_path, fake_audio, model.spec_transform.sample_rate)
117
+ logger.info(f"Saved audio to {output_path}")
118
+
119
+
120
+ if __name__ == "__main__":
121
+ main()
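
Example use of the script above with its default config and checkpoint (file names are placeholders): the first command encodes test.wav, saves fake.npy, and reconstructs fake.wav; the second decodes precomputed indices:

python tools/vqgan/inference.py -i test.wav -o fake.wav
python tools/vqgan/inference.py -i fake.npy -o restored.wav
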
tools/webui.py ADDED
@@ -0,0 +1,570 @@
1
+ import gc
2
+ import html
3
+ import io
4
+ import os
5
+ import queue
6
+ import wave
7
+ from argparse import ArgumentParser
8
+ from functools import partial
9
+ from pathlib import Path
10
+
11
+ import gradio as gr
12
+ import librosa
13
+ import numpy as np
14
+ import pyrootutils
15
+ import torch
16
+ from loguru import logger
17
+ from transformers import AutoTokenizer
18
+
19
+ pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
20
+
21
+
22
+ from fish_speech.i18n import i18n
23
+ from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
24
+ from fish_speech.utils import autocast_exclude_mps, set_seed
25
+ from tools.api import decode_vq_tokens, encode_reference
26
+ from tools.file import AUDIO_EXTENSIONS, list_files
27
+ from tools.llama.generate import (
28
+ GenerateRequest,
29
+ GenerateResponse,
30
+ WrappedGenerateResponse,
31
+ launch_thread_safe_queue,
32
+ )
33
+ from tools.vqgan.inference import load_model as load_decoder_model
34
+
35
+
36
+
37
+ # Make einx happy
38
+ os.environ["EINX_FILTER_TRACEBACK"] = "false"
39
+
40
+
41
+ HEADER_MD = f"""# 泰雅爾語TTS
42
+
43
+ #泰雅爾語測試範例
44
+
45
+ {i18n("Miyan qaniy qu binkgan bbinkesan na Yesu:Yesu Kristo ga kinbahan na Tabite, Tabite ga kinbahan na Aburaham.")}
46
+
47
+ {i18n("Aburaham ga yaba na Isak; Isak ga yaba na Yakob; Yakob ga yaba na Yuta ki mmtswe nya mlikuy.")}
48
+
49
+ {i18n("Babaw nqu kyapun rasun squ qalang Babilon lga, plqyun ni Yehoyacin qu Seltiyel; Seltiyel ga yaba na Zerubabel;")}
50
+
51
+ #若要使用自己的聲音合成請按以下步驟(Streaming Generate)
52
+
53
+ # <span style="color: red;">Streaming Generate 此功能維護中</span>
54
+
55
+ {i18n("1.在Reference Audio找到Enable Reference Audio打勾")}
56
+
57
+ {i18n("2.在左下方將錄音檔案上傳,並在Reference Text輸入上傳音檔的文字")}
58
+
59
+ {i18n("3.在Input Text輸入文字")}
60
+
61
+ {i18n("4.按下Streaming Generate即可")}
62
+
63
+
64
+ """
65
+
66
+ TEXTBOX_PLACEHOLDER = i18n("Put your text here.")
67
+ SPACE_IMPORTED = False
68
+
69
+
70
+ def build_html_error_message(error):
71
+ return f"""
72
+ <div style="color: red;
73
+ font-weight: bold;">
74
+ {html.escape(str(error))}
75
+ </div>
76
+ """
77
+
78
+
79
+ @torch.inference_mode()
80
+ def inference(
81
+ text,
82
+ enable_reference_audio,
83
+ reference_audio,
84
+ reference_text,
85
+ max_new_tokens,
86
+ chunk_length,
87
+ top_p,
88
+ repetition_penalty,
89
+ temperature,
90
+ seed="0",
91
+ streaming=False,
92
+ ):
93
+ if args.max_gradio_length > 0 and len(text) > args.max_gradio_length:
94
+ return (
95
+ None,
96
+ None,
97
+ i18n("Text is too long, please keep it under {} characters.").format(
98
+ args.max_gradio_length
99
+ ),
100
+ )
101
+
102
+ seed = int(seed)
103
+ if seed != 0:
104
+ set_seed(seed)
105
+ logger.warning(f"set seed: {seed}")
106
+
107
+ # Parse reference audio aka prompt
108
+ prompt_tokens = encode_reference(
109
+ decoder_model=decoder_model,
110
+ reference_audio=reference_audio,
111
+ enable_reference_audio=enable_reference_audio,
112
+ )
113
+
114
+ # LLAMA Inference
115
+ request = dict(
116
+ device=decoder_model.device,
117
+ max_new_tokens=600,
118
+ text=text,
119
+ top_p=top_p,
120
+ repetition_penalty=repetition_penalty,
121
+ temperature=temperature,
122
+ compile=args.compile,
123
+ iterative_prompt=chunk_length > 0,
124
+ chunk_length=chunk_length,
125
+ max_length=2048,
126
+ prompt_tokens=prompt_tokens if enable_reference_audio else None,
127
+ prompt_text=reference_text if enable_reference_audio else None,
128
+ )
129
+
130
+ response_queue = queue.Queue()
131
+ llama_queue.put(
132
+ GenerateRequest(
133
+ request=request,
134
+ response_queue=response_queue,
135
+ )
136
+ )
137
+
138
+ if streaming:
139
+ yield wav_chunk_header(), None, None
140
+
141
+ segments = []
142
+
143
+ while True:
144
+ result: WrappedGenerateResponse = response_queue.get()
145
+ if result.status == "error":
146
+ yield None, None, build_html_error_message(result.response)
147
+ break
148
+
149
+ result: GenerateResponse = result.response
150
+ if result.action == "next":
151
+ break
152
+
153
+ with autocast_exclude_mps(
154
+ device_type=decoder_model.device.type, dtype=args.precision
155
+ ):
156
+ fake_audios = decode_vq_tokens(
157
+ decoder_model=decoder_model,
158
+ codes=result.codes,
159
+ )
160
+
161
+ fake_audios = fake_audios.float().cpu().numpy()
162
+ segments.append(fake_audios)
163
+
164
+ if streaming:
165
+ wav_header = wav_chunk_header()
166
+ audio_data = (fake_audios * 32768).astype(np.int16).tobytes()
167
+ yield wav_header + audio_data, None, None
168
+
169
+ if len(segments) == 0:
170
+ return (
171
+ None,
172
+ None,
173
+ build_html_error_message(
174
+ i18n("No audio generated, please check the input text.")
175
+ ),
176
+ )
177
+
178
+ # No matter streaming or not, we need to return the final audio
179
+ audio = np.concatenate(segments, axis=0)
180
+ yield None, (decoder_model.spec_transform.sample_rate, audio), None
181
+
182
+ if torch.cuda.is_available():
183
+ torch.cuda.empty_cache()
184
+ gc.collect()
185
+
186
+
187
+ inference_stream = partial(inference, streaming=True)
188
+
189
+ n_audios = 4
190
+
191
+ global_audio_list = []
192
+ global_error_list = []
193
+
194
+
195
+ def inference_wrapper(
196
+ text,
197
+ enable_reference_audio,
198
+ reference_audio,
199
+ reference_text,
200
+ max_new_tokens,
201
+ chunk_length,
202
+ top_p,
203
+ repetition_penalty,
204
+ temperature,
205
+ seed,
206
+ batch_infer_num,
207
+ ):
208
+ audios = []
209
+ errors = []
210
+
211
+ for _ in range(batch_infer_num):
212
+ result = inference(
213
+ text,
214
+ enable_reference_audio,
215
+ reference_audio,
216
+ reference_text,
217
+ max_new_tokens,
218
+ chunk_length,
219
+ top_p,
220
+ repetition_penalty,
221
+ temperature,
222
+ seed,
223
+ )
224
+
225
+ _, audio_data, error_message = next(result)
226
+
227
+ audios.append(
228
+ gr.Audio(value=audio_data if audio_data else None, visible=True),
229
+ )
230
+ errors.append(
231
+ gr.HTML(value=error_message if error_message else None, visible=True),
232
+ )
233
+
234
+ for _ in range(batch_infer_num, n_audios):
235
+ audios.append(
236
+ gr.Audio(value=None, visible=False),
237
+ )
238
+ errors.append(
239
+ gr.HTML(value=None, visible=False),
240
+ )
241
+
242
+ return None, *audios, *errors
243
+
244
+
245
+ def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
246
+ buffer = io.BytesIO()
247
+
248
+ with wave.open(buffer, "wb") as wav_file:
249
+ wav_file.setnchannels(channels)
250
+ wav_file.setsampwidth(bit_depth // 8)
251
+ wav_file.setframerate(sample_rate)
252
+
253
+ wav_header_bytes = buffer.getvalue()
254
+ buffer.close()
255
+ return wav_header_bytes
256
+
257
+
258
+ def normalize_text(user_input, use_normalization):
259
+ if use_normalization:
260
+ return ChnNormedText(raw_text=user_input).normalize()
261
+ else:
262
+ return user_input
263
+
264
+
265
+ def update_examples():
266
+ examples_dir = Path("references")
267
+ examples_dir.mkdir(parents=True, exist_ok=True)
268
+ example_audios = list_files(examples_dir, AUDIO_EXTENSIONS, recursive=True)
269
+ return gr.Dropdown(choices=example_audios + [""])
270
+
271
+
272
+ def build_app():
273
+ with gr.Blocks(theme=gr.themes.Base()) as app:
274
+ gr.Markdown(HEADER_MD)
275
+
276
+ # Use light theme by default
277
+ app.load(
278
+ None,
279
+ None,
280
+ js="() => {const params = new URLSearchParams(window.location.search);if (!params.has('__theme')) {params.set('__theme', '%s');window.location.search = params.toString();}}"
281
+ % args.theme,
282
+ )
283
+
284
+ # Inference
285
+ with gr.Row():
286
+ with gr.Column(scale=3):
287
+ text = gr.Textbox(
288
+ label=i18n("Input Text"), placeholder=TEXTBOX_PLACEHOLDER, lines=10
289
+ )
290
+ refined_text = gr.Textbox(
291
+ label=i18n("Realtime Transform Text"),
292
+ placeholder=i18n(
293
+ "Normalization Result Preview (Currently Only Chinese)"
294
+ ),
295
+ lines=5,
296
+ interactive=False,
297
+ )
298
+
299
+ with gr.Row():
300
+ if_refine_text = gr.Checkbox(
301
+ label=i18n("Text Normalization"),
302
+ value=False,
303
+ scale=1,
304
+ )
305
+
306
+ with gr.Row():
307
+ with gr.Column():
308
+ with gr.Tab(label=i18n("Advanced Config")):
309
+ with gr.Row():
310
+ chunk_length = gr.Slider(
311
+ label=i18n("Iterative Prompt Length, 0 means off"),
312
+ minimum=50,
313
+ maximum=300,
314
+ value=200,
315
+ step=8,
316
+ )
317
+
318
+ max_new_tokens = gr.Slider(
319
+ label=i18n(
320
+ "Maximum tokens per batch, 0 means no limit"
321
+ ),
322
+ minimum=0,
323
+ maximum=2048,
324
+ value=0, # 0 means no limit
325
+ step=8,
326
+ )
327
+
328
+ with gr.Row():
329
+ top_p = gr.Slider(
330
+ label="Top-P",
331
+ minimum=0.6,
332
+ maximum=0.9,
333
+ value=0.7,
334
+ step=0.01,
335
+ )
336
+
337
+ repetition_penalty = gr.Slider(
338
+ label=i18n("Repetition Penalty"),
339
+ minimum=1,
340
+ maximum=1.5,
341
+ value=1.2,
342
+ step=0.01,
343
+ )
344
+
345
+ with gr.Row():
346
+ temperature = gr.Slider(
347
+ label="Temperature",
348
+ minimum=0.6,
349
+ maximum=0.9,
350
+ value=0.7,
351
+ step=0.01,
352
+ )
353
+ seed = gr.Textbox(
354
+ label="Seed",
355
+ info="0 means randomized inference, otherwise deterministic",
356
+ placeholder="any 32-bit-integer",
357
+ value="0",
358
+ )
359
+
360
+ with gr.Tab(label=i18n("Reference Audio")):
361
+ with gr.Row():
362
+ gr.Markdown(
363
+ i18n(
364
+ "5 to 10 seconds of reference audio, useful for specifying speaker."
365
+ )
366
+ )
367
+ with gr.Row():
368
+ enable_reference_audio = gr.Checkbox(
369
+ label=i18n("Enable Reference Audio"),
370
+ )
371
+
372
+ with gr.Row():
373
+ example_audio_dropdown = gr.Dropdown(
374
+ label=i18n("Select Example Audio"),
375
+ choices=[""],
376
+ value="",
377
+ interactive=True,
378
+ allow_custom_value=True,
379
+ )
380
+ with gr.Row():
381
+ reference_audio = gr.Audio(
382
+ label=i18n("Reference Audio"),
383
+ type="filepath",
384
+ )
385
+ with gr.Row():
386
+ reference_text = gr.Textbox(
387
+ label=i18n("Reference Text"),
388
+ lines=1,
389
+ placeholder="在一无所知中,梦里的一天结束了,一个新的「轮回」便会开始。",
390
+ value="",
391
+ )
392
+
393
+ with gr.Tab(label=i18n("Batch Inference")):
394
+ with gr.Row():
395
+ batch_infer_num = gr.Slider(
396
+ label="Batch infer nums",
397
+ minimum=1,
398
+ maximum=n_audios,
399
+ step=1,
400
+ value=1,
401
+ )
402
+
403
+ with gr.Column(scale=3):
404
+ for _ in range(n_audios):
405
+ with gr.Row():
406
+ error = gr.HTML(
407
+ label=i18n("Error Message"),
408
+ visible=True if _ == 0 else False,
409
+ )
410
+ global_error_list.append(error)
411
+ with gr.Row():
412
+ audio = gr.Audio(
413
+ label=i18n("Generated Audio"),
414
+ type="numpy",
415
+ interactive=False,
416
+ visible=True if _ == 0 else False,
417
+ )
418
+ global_audio_list.append(audio)
419
+
420
+ with gr.Row():
421
+ stream_audio = gr.Audio(
422
+ label=i18n("Streaming Audio"),
423
+ streaming=True,
424
+ autoplay=True,
425
+ interactive=False,
426
+ show_download_button=True,
427
+ )
428
+ with gr.Row():
429
+ with gr.Column(scale=3):
430
+ generate = gr.Button(
431
+ value="\U0001F3A7 " + i18n("Generate"), variant="primary"
432
+ )
433
+
434
+ generate_stream = gr.Button(
435
+ value="\U0001F3A7 " + i18n("Streaming Generate"),
436
+ variant="primary",
437
+ visible=False,  # hide the button (Streaming Generate is under maintenance)
438
+ )
439
+
440
+ text.input(
441
+ fn=normalize_text, inputs=[text, if_refine_text], outputs=[refined_text]
442
+ )
443
+
444
+ def select_example_audio(audio_path):
445
+ audio_path = Path(audio_path)
446
+ if audio_path.is_file():
447
+ lab_file = Path(audio_path.with_suffix(".lab"))
448
+
449
+ if lab_file.exists():
450
+ lab_content = lab_file.read_text(encoding="utf-8").strip()
451
+ else:
452
+ lab_content = ""
453
+
454
+ return str(audio_path), lab_content, True
455
+ return None, "", False
456
+
457
+ # Connect the dropdown to update reference audio and text
458
+
459
+ example_audio_dropdown.change(
460
+ fn=update_examples, inputs=[], outputs=[example_audio_dropdown]
461
+ ).then(
462
+ fn=select_example_audio,
463
+ inputs=[example_audio_dropdown],
464
+ outputs=[reference_audio, reference_text, enable_reference_audio],
465
+ )
466
+
467
+ # # Submit
468
+ generate.click(
469
+ inference_wrapper,
470
+ [
471
+ refined_text,
472
+ enable_reference_audio,
473
+ reference_audio,
474
+ reference_text,
475
+ max_new_tokens,
476
+ chunk_length,
477
+ top_p,
478
+ repetition_penalty,
479
+ temperature,
480
+ seed,
481
+ batch_infer_num,
482
+ ],
483
+ [stream_audio, *global_audio_list, *global_error_list],
484
+ concurrency_limit=1,
485
+ )
486
+
487
+ generate_stream.click(
488
+ inference_stream,
489
+ [
490
+ refined_text,
491
+ enable_reference_audio,
492
+ reference_audio,
493
+ reference_text,
494
+ max_new_tokens,
495
+ chunk_length,
496
+ top_p,
497
+ repetition_penalty,
498
+ temperature,
499
+ seed,
500
+ ],
501
+ [stream_audio, global_audio_list[0], global_error_list[0]],
502
+ concurrency_limit=1,
503
+ )
504
+
505
+ return app
506
+
507
+
508
+ def parse_args():
509
+ parser = ArgumentParser()
510
+ parser.add_argument(
511
+ "--llama-checkpoint-path",
512
+ type=Path,
513
+ default="checkpoints/fish-speech-1.2",
514
+ )
515
+ parser.add_argument(
516
+ "--decoder-checkpoint-path",
517
+ type=Path,
518
+ default="checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
519
+ )
520
+ parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
521
+ parser.add_argument("--device", type=str, default="cuda")
522
+ parser.add_argument("--half", action="store_true")
523
+ parser.add_argument("--compile", action="store_true")
524
+ parser.add_argument("--max-gradio-length", type=int, default=0)
525
+ parser.add_argument("--theme", type=str, default="light")
526
+
527
+ return parser.parse_args()
528
+
529
+
530
+ if __name__ == "__main__":
531
+ args = parse_args()
532
+ args.precision = torch.half if args.half else torch.bfloat16
533
+
534
+ logger.info("Loading Llama model...")
535
+ llama_queue = launch_thread_safe_queue(
536
+ checkpoint_path=args.llama_checkpoint_path,
537
+ device=args.device,
538
+ precision=args.precision,
539
+ compile=args.compile,
540
+ )
541
+ logger.info("Llama model loaded, loading VQ-GAN model...")
542
+
543
+ decoder_model = load_decoder_model(
544
+ config_name=args.decoder_config_name,
545
+ checkpoint_path=args.decoder_checkpoint_path,
546
+ device=args.device,
547
+ )
548
+
549
+ logger.info("Decoder model loaded, warming up...")
550
+
551
+ # Dry run to check if the model is loaded correctly and avoid the first-time latency
552
+ list(
553
+ inference(
554
+ text="Hello, world!",
555
+ enable_reference_audio=False,
556
+ reference_audio=None,
557
+ reference_text="",
558
+ max_new_tokens=500,
559
+ chunk_length=200,
560
+ top_p=0.7,
561
+ repetition_penalty=1.2,
562
+ temperature=0.7,
563
+ )
564
+ )
565
+
566
+ logger.info("Warming up done, launching the web UI...")
567
+
568
+ app = build_app()
569
+ app.launch(show_api=True, server_name="0.0.0.0", share=True)
570
+
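
Example launch of the web UI above; the checkpoint paths follow the argparse defaults and are assumptions about the local directory layout:

python tools/webui.py \
    --llama-checkpoint-path checkpoints/fish-speech-1.2 \
    --decoder-checkpoint-path checkpoints/fish-speech-1.2/firefly-gan-vq-fsq-8x1024-21hz-generator.pth \
    --compile
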
tools/whisper_asr.py ADDED
@@ -0,0 +1,176 @@
1
+ """
2
+ Used to transcribe all audio files in one folder into another folder.
3
+ e.g.
4
+ Directory structure:
5
+ --pre_data_root
6
+ ----SP_1
7
+ ------01.wav
8
+ ------02.wav
9
+ ------......
10
+ ----SP_2
11
+ ------01.wav
12
+ ------02.wav
13
+ ------......
14
+ Use
15
+ python tools/whisper_asr.py --audio-dir pre_data_root/SP_1 --save-dir data/SP_1
16
+ to transcribe the first speaker.
17
+
18
+ Use
19
+ python tools/whisper_asr.py --audio-dir pre_data_root/SP_2 --save-dir data/SP_2
20
+ to transcribe the second speaker.
21
+
22
+ Note: Be aware of your audio sample rate, which defaults to 44.1kHz.
23
+ """
24
+
25
+ import re
26
+ from pathlib import Path
27
+
28
+ import click
29
+ import soundfile as sf
30
+ from faster_whisper import WhisperModel
31
+ from loguru import logger
32
+ from pydub import AudioSegment
33
+ from tqdm import tqdm
34
+
35
+ from tools.file import AUDIO_EXTENSIONS, list_files
36
+
37
+
38
+ @click.command()
39
+ @click.option("--model-size", default="large-v3", help="Size of the Whisper model")
40
+ @click.option(
41
+ "--compute-type",
42
+ default="float16",
43
+ help="Computation Precision of the Whisper model [float16 / int8_float16 / int8]",
44
+ )
45
+ @click.option("--audio-dir", required=True, help="Directory containing audio files")
46
+ @click.option(
47
+ "--save-dir", required=True, help="Directory to save processed audio files"
48
+ )
49
+ @click.option(
50
+ "--sample-rate",
51
+ default=44100,
52
+ type=int,
53
+ help="Output sample rate, default to input sample rate",
54
+ )
55
+ @click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
56
+ @click.option("--language", default="auto", help="Language of the transcription")
57
+ @click.option("--initial-prompt", default=None, help="Initial prompt for transcribing")
58
+ def main(
59
+ model_size,
60
+ compute_type,
61
+ audio_dir,
62
+ save_dir,
63
+ sample_rate,
64
+ device,
65
+ language,
66
+ initial_prompt,
67
+ ):
68
+ logger.info("Loading / Downloading Faster Whisper model...")
69
+
70
+ model = WhisperModel(
71
+ model_size,
72
+ device=device,
73
+ compute_type=compute_type,
74
+ download_root="faster_whisper",
75
+ )
76
+
77
+ logger.info("Model loaded.")
78
+
79
+ save_path = Path(save_dir)
80
+ save_path.mkdir(parents=True, exist_ok=True)
81
+
82
+ audio_files = list_files(
83
+ path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
84
+ )
85
+
86
+ for file_path in tqdm(audio_files, desc="Processing audio file"):
87
+ file_stem = file_path.stem
88
+ file_suffix = file_path.suffix
89
+
90
+ rel_path = Path(file_path).relative_to(audio_dir)
91
+ (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)
92
+
93
+ audio = AudioSegment.from_file(file_path)
94
+
95
+ segments, info = model.transcribe(
96
+ file_path,
97
+ beam_size=5,
98
+ language=None if language == "auto" else language,
99
+ initial_prompt=initial_prompt,
100
+ )
101
+
102
+ print(
103
+ "Detected language '%s' with probability %f"
104
+ % (info.language, info.language_probability)
105
+ )
106
+ print("Total len(ms): ", len(audio))
107
+
108
+ whole_text = None
109
+ for segment in segments:
110
+ id, start, end, text = (
111
+ segment.id,
112
+ segment.start,
113
+ segment.end,
114
+ segment.text,
115
+ )
116
+ print("Segment %03d [%.2fs -> %.2fs] %s" % (id, start, end, text))
117
+ if not whole_text:
118
+ whole_text = text
119
+ else:
120
+ whole_text += ", " + text
121
+
122
+ whole_text += "."
123
+
124
+ audio_save_path = save_path / rel_path.parent / f"{file_stem}{file_suffix}"
125
+ audio.export(audio_save_path, format=file_suffix[1:])
126
+ print(f"Exported {audio_save_path}")
127
+
128
+ transcript_save_path = save_path / rel_path.parent / f"{file_stem}.lab"
129
+ with open(
130
+ transcript_save_path,
131
+ "w",
132
+ encoding="utf-8",
133
+ ) as f:
134
+ f.write(whole_text)
135
+
136
+
137
+ if __name__ == "__main__":
138
+ main()
139
+ exit(0)
140
+
141
+ audio = AudioSegment.from_wav(
142
+ r"D:\PythonProject\原神语音中文\胡桃\vo_hutao_draw_appear.wav"
143
+ )
144
+
145
+ model_size = "large-v3"
146
+
147
+ model = WhisperModel(
148
+ model_size,
149
+ device="cuda",
150
+ compute_type="float16",
151
+ download_root="faster_whisper",
152
+ )
153
+
154
+ segments, info = model.transcribe(
155
+ r"D:\PythonProject\原神语音中文\胡桃\vo_hutao_draw_appear.wav",
156
+ beam_size=5,
157
+ )
158
+
159
+ print(
160
+ "Detected language '%s' with probability %f"
161
+ % (info.language, info.language_probability)
162
+ )
163
+ print("Total len(ms): ", len(audio))
164
+
165
+ for i, segment in enumerate(segments):
166
+ print(
167
+ "Segment %03d [%.2fs -> %.2fs] %s"
168
+ % (i, segment.start, segment.end, segment.text)
169
+ )
170
+ start_ms = int(segment.start * 1000)
171
+ end_ms = int(segment.end * 1000)
172
+ segment_audio = audio[start_ms:end_ms]
173
+ segment_audio.export(f"segment_{i:03d}.wav", format="wav")
174
+ print(f"Exported segment_{i:03d}.wav")
175
+
176
+ print("All segments have been exported.")