Update app.py
app.py
CHANGED
@@ -2,6 +2,9 @@ import io
 import re
 import wave
 import struct
+import os
+import time
+import json

 import numpy as np
 import torch
@@ -12,71 +15,58 @@ from fastapi.middleware.gzip import GZipMiddleware

 from misaki import en

-import os
-import numpy as np
 from onnxruntime import InferenceSession
 from huggingface_hub import snapshot_download
-
-import json
 from scipy.io.wavfile import write as write_wav

-
-
-#
-config_file_path = 'config.json' # Update
-
+# ------------------------------------------------------------------------------
+# Load configuration and set up vocabulary
+# ------------------------------------------------------------------------------
+config_file_path = 'config.json'  # Update with your actual path
 with open(config_file_path, 'r') as f:
     config = json.load(f)
-
-# Extract the phoneme vocabulary
 phoneme_vocab = config['vocab']

-#
+# ------------------------------------------------------------------------------
+# Download the model and voice files from Hugging Face Hub
+# ------------------------------------------------------------------------------
 model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
 model_name = "onnx/model_q8f16.onnx"
 voice_file_pattern = "*.bin"
 local_dir = "."
-
-# Download the model and voice file
 snapshot_download(
     repo_id=model_repo,
     allow_patterns=[model_name, voice_file_pattern],
     local_dir=local_dir
 )

-#
+# ------------------------------------------------------------------------------
+# Load the ONNX model
+# ------------------------------------------------------------------------------
 model_path = os.path.join(local_dir, model_name)
 sess = InferenceSession(model_path)

+# ------------------------------------------------------------------------------
+# Create the FastAPI app with GZip compression
+# ------------------------------------------------------------------------------
 app = FastAPI(
     title="Kokoro TTS FastAPI",
-    middleware=[
-        Middleware(GZipMiddleware, compresslevel=9)  # Add GZip compression
-    ]
+    middleware=[Middleware(GZipMiddleware, compresslevel=9)]
 )

-# ------------------------------------------------------------------------------
-# Global Pipeline Instance
-# ------------------------------------------------------------------------------
-# Create one pipeline instance for the entire app.
-
-
 # ------------------------------------------------------------------------------
 # Helper Functions
 # ------------------------------------------------------------------------------

 def generate_wav_header(sample_rate: int, num_channels: int, sample_width: int, data_size: int = 0x7FFFFFFF) -> bytes:
     """
-    Generate a WAV header for streaming.
-
-    This header is sent only once at the start of the stream.
+    Generate a WAV header for streaming. Since we do not know the final audio size,
+    a large dummy value is used for the data chunk size.
     """
     bits_per_sample = sample_width * 8
     byte_rate = sample_rate * num_channels * sample_width
     block_align = num_channels * sample_width
-
-    total_size = 36 + data_size
+    total_size = 36 + data_size  # 36 + data_size (header is 44 bytes total)
     header = struct.pack('<4sI4s', b'RIFF', total_size, b'WAVE')
     fmt_chunk = struct.pack('<4sIHHIIHH', b'fmt ', 16, 1, num_channels, sample_rate, byte_rate, block_align, bits_per_sample)
     data_chunk_header = struct.pack('<4sI', b'data', data_size)
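A quick sanity check of the header layout (a sketch, not part of the commit; it assumes generate_wav_header returns the three packed chunks concatenated, and runs in the app's module context where struct is already imported):

    # The streaming WAV header should be exactly 44 bytes:
    # 12-byte RIFF chunk + 24-byte fmt chunk + 8-byte data chunk header.
    hdr = generate_wav_header(sample_rate=24000, num_channels=1, sample_width=2)
    assert len(hdr) == 44
    riff, _total, wave_id = struct.unpack('<4sI4s', hdr[:12])
    assert (riff, wave_id) == (b'RIFF', b'WAVE')
    # fmt chunk: PCM (1), mono, 24 kHz, 48000 B/s byte rate, block align 2, 16-bit.
    assert struct.unpack('<4sIHHIIHH', hdr[12:36]) == (b'fmt ', 16, 1, 1, 24000, 48000, 2, 16)
    # The data size carries the 0x7FFFFFFF dummy so players keep reading the open-ended stream.
    assert struct.unpack('<4sI', hdr[36:44]) == (b'data', 0x7FFFFFFF)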
@@ -85,13 +75,13 @@ def generate_wav_header(sample_rate: int, num_channels: int, sample_width: int,

 def custom_split_text(text: str) -> list:
     """
-    Custom splitting:
+    Custom splitting strategy:
     - Start with a chunk size of 2 words.
-    - For each chunk, if a period (".") is found in any word (except
-      then split
+    - For each chunk, if a period (".") is found in any word (except the very last word),
+      then split at that word (including it).
     - Otherwise, use the current chunk size.
-    -
-    - If there are fewer than the desired number of words
+    - Increase the chunk size by 2 for each subsequent chunk.
+    - If there are fewer than the desired number of words remaining, include all of them.
     """
     words = text.split()
     chunks = []
@@ -102,7 +92,6 @@ def custom_split_text(text: str) -> list:
         if candidate_end > len(words):
             candidate_end = len(words)
         chunk_words = words[start:candidate_end]
-        # Look for a period in any word except the last one.
         split_index = None
         for i in range(len(chunk_words) - 1):
             if '.' in chunk_words[i]:
@@ -113,26 +102,24 @@ def custom_split_text(text: str) -> list:
             chunk_words = words[start:candidate_end]
         chunks.append(" ".join(chunk_words))
         start = candidate_end
-        chunk_size += 2
+        chunk_size += 2
     return chunks


 def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
     """
-    Convert a torch.FloatTensor (
+    Convert a torch.FloatTensor (values in [-1, 1]) to raw 16-bit PCM bytes.
     """
-    # Ensure tensor is on CPU and flatten if necessary.
     audio_np = audio_tensor.cpu().numpy()
     if audio_np.ndim > 1:
         audio_np = audio_np.flatten()
-    # Scale to int16 range.
     audio_int16 = np.int16(audio_np * 32767)
     return audio_int16.tobytes()


 def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24000, bitrate: int = 32000) -> bytes:
     """
-    Convert a torch.FloatTensor to Opus
+    Convert a torch.FloatTensor to Opus-encoded bytes.
     Requires the 'opuslib' package: pip install opuslib
     """
     try:
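To see the growing-chunk split concretely (illustrative only; it assumes the initialization not shown in the hunks is chunk_size = 2 and start = 0, as the docstring implies):

    # With no periods in the text, chunks come out in sizes 2, 4, 6, ...
    # until the words run out.
    print(custom_split_text("one two three four five six seven eight nine"))
    # -> ['one two', 'three four five six', 'seven eight nine']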
@@ -143,154 +130,175 @@ def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24
     audio_np = audio_tensor.cpu().numpy()
     if audio_np.ndim > 1:
         audio_np = audio_np.flatten()
-    # Scale to int16 range. Important for opus.
     audio_int16 = np.int16(audio_np * 32767)

-    encoder = opuslib.Encoder(sample_rate, 1, opuslib.APPLICATION_VOIP)
-
-    # Calculate the number of frames to encode. Opus frames are 2.5, 5, 10, or 20 ms long.
-    frame_size = int(sample_rate * 0.020) # 20ms frame size
-
+    encoder = opuslib.Encoder(sample_rate, 1, opuslib.APPLICATION_VOIP)
+    frame_size = int(sample_rate * 0.020)  # 20 ms frame
     encoded_data = b''
     for i in range(0, len(audio_int16), frame_size):
         frame = audio_int16[i:i + frame_size]
         if len(frame) < frame_size:
-            # Pad the last frame with zeros if needed.
             frame = np.pad(frame, (0, frame_size - len(frame)), 'constant')
-        encoded_frame = encoder.encode(frame.tobytes(), frame_size)
+        encoded_frame = encoder.encode(frame.tobytes(), frame_size)
         encoded_data += encoded_frame
-
     return encoded_data

-g2p = en.G2P(trf=False, british=False, fallback=None) # no transformer, American English

-
+# Initialize G2P for English (American)
+g2p = en.G2P(trf=False, british=False, fallback=None)
+
+def tokenizer(text: str):
+    """
+    Converts text to a list of phoneme tokens using the global vocabulary.
+    """
     print("Text: " + text)
     phonemes_string, _ = g2p(text)
-    phonemes = []
-    for i in phonemes_string:
-        phonemes.append(i)
+    phonemes = [ph for ph in phonemes_string]
     tokens = [phoneme_vocab[phoneme] for phoneme in phonemes if phoneme in phoneme_vocab]
-    print(tokens)
+    print("Tokens:", tokens)
     return tokens
-
-


 # ------------------------------------------------------------------------------
 # Endpoints
 # ------------------------------------------------------------------------------

-#
-#
-#
-#
-#
-#
+@app.get("/tts/streaming", summary="Streaming TTS")
+def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
+    """
+    Streaming TTS endpoint.
+
+    This endpoint splits the input text into chunks (using the growing-chunk strategy),
+    then for each chunk:
+      - For the first chunk, a 0 is prepended.
+      - For subsequent chunks, the first token is set to the last token from the previous chunk.
+      - For the final chunk, a 0 is appended.
+
+    The audio for each chunk is generated immediately and streamed to the client.
+    """
+    chunks = custom_split_text(text)
+    sample_rate = 24000
+    num_channels = 1
+    sample_width = 2
+
+    # Load the voice/style file (must be present in voices/{voice}.bin)
+    voice_path = os.path.join(local_dir, f"voices/{voice}.bin")
+    if not os.path.exists(voice_path):
+        raise HTTPException(status_code=404, detail="Voice file not found")
+    voices = np.fromfile(voice_path, dtype=np.float32).reshape(-1, 1, 256)
+
+    def audio_generator():
+        # If outputting a WAV stream, yield a WAV header once.
+        if format.lower() == "wav":
+            header = generate_wav_header(sample_rate, num_channels, sample_width)
+            yield header
+
+        prev_last_token = None
+        for i, chunk in enumerate(chunks):
+            print(f"Processing chunk {i}: {chunk}")
+            # Convert the chunk text to tokens.
+            chunk_tokens = tokenizer(chunk)
+
+            # For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
+            if i == 0:
+                tokens_to_send = [0] + chunk_tokens
+            else:
+                tokens_to_send = [prev_last_token] + chunk_tokens
+
+            # If this is the final chunk, append 0.
+            if i == len(chunks) - 1:
+                tokens_to_send = tokens_to_send + [0]
+
+            # Save the last token of this chunk for the next iteration.
+            prev_last_token = tokens_to_send[-1]
+
+            # Prepare the model input (a batch of one sequence).
+            final_token = [tokens_to_send]
+
+            # Use the number of tokens to select the appropriate style vector.
+            style_index = len(tokens_to_send)
+            if style_index >= len(voices):
+                style_index = len(voices) - 1  # Fallback if index is out-of-bounds.
+            ref_s = voices[style_index]
+
+            # Prepare the speed parameter.
+            speed_param = np.ones(1, dtype=np.float32) * speed
+
+            # Run the model (ONNX inference) for this chunk.
+            try:
+                start_time = time.time()
+                audio_output = sess.run(None, {
+                    "input_ids": final_token,
+                    "style": ref_s,
+                    "speed": speed_param,
+                })[0]
+                print(f"Chunk {i} inference time: {time.time() - start_time:.3f}s")
+            except Exception as e:
+                print(f"Error processing chunk {i}: {e}")
+                # In case of error, generate a short silent chunk.
+                audio_output = np.zeros((sample_rate,), dtype=np.float32)
+
+            # Convert the model output (assumed to be float32 in [-1, 1]) to int16 PCM.
+            audio_int16 = (audio_output * 32767).astype(np.int16).flatten()
+
+            # Convert to a torch tensor (back into float range) for our helper functions.
+            audio_tensor = torch.from_numpy(audio_int16.astype(np.float32) / 32767)
+
+            # Yield the encoded audio chunk.
+            if format.lower() == "wav":
+                yield audio_tensor_to_pcm_bytes(audio_tensor)
+            elif format.lower() == "opus":
+                yield audio_tensor_to_opus_bytes(audio_tensor, sample_rate=sample_rate)
+            else:
+                raise HTTPException(status_code=400, detail=f"Unsupported audio format: {format}")
+
+    media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
+    return StreamingResponse(
+        audio_generator(),
+        media_type=media_type,
+        headers={"Cache-Control": "no-cache"},
+    )


 @app.get("/tts/full", summary="Full TTS")
 def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "wav"):
     """
-    Full TTS endpoint that synthesizes the entire text
-    and returns a complete WAV or Opus file.
+    Full TTS endpoint that synthesizes the entire text and returns a complete WAV or Opus file.
     """
     voice_path = os.path.join(local_dir, f"voices/{voice}.bin")
     voices = np.fromfile(voice_path, dtype=np.float32).reshape(-1, 1, 256)

     tokens = tokenizer(text)
-
     ref_s = voices[len(tokens)]
-
     final_token = [[0, *tokens, 0]]

     start_time = time.time()
-    ))
-
-    print(time.time()-start_time)
+    audio = sess.run(None, {
+        "input_ids": final_token,
+        "style": ref_s,
+        "speed": np.ones(1, dtype=np.float32) * speed,
+    })[0]
+    print(f"Full TTS inference time: {time.time()-start_time:.3f}s")

-    #
-
-    # audio = np.array(audio, dtype=np.float32) # Ensure it's float32 first
-    audio = (audio * 32767).astype(np.int16) # Scale to int16 range
-
-    # Flatten the array if it's 2D
-    audio = audio.flatten()
+    # Convert to int16 PCM.
+    audio = (audio * 32767).astype(np.int16).flatten()

     if format.lower() == "wav":
-
-        # Create an in-memory buffer
         wav_io = io.BytesIO()
-
-        # Write the audio data to the buffer in WAV format
-        write_wav(wav_io, sample_rate, audio)
-
-        # Seek to the beginning of the buffer
+        write_wav(wav_io, 24000, audio)
         wav_io.seek(0)
-
         return Response(content=wav_io.read(), media_type="audio/wav")
     elif format.lower() == "opus":
-        opus_data = audio_tensor_to_opus_bytes(torch.from_numpy(audio), sample_rate=
+        opus_data = audio_tensor_to_opus_bytes(torch.from_numpy(audio.astype(np.float32)/32767), sample_rate=24000)
         return Response(content=opus_data, media_type="audio/opus")
     else:
         raise HTTPException(status_code=400, detail=f"Unsupported audio format: {format}")


-
 @app.get("/", response_class=HTMLResponse)
 def index():
     """
     HTML demo page for Kokoro TTS.
-
-    This page provides a simple UI to enter text, choose a voice and speed,
-    and play synthesized audio from both the streaming and full endpoints.
     """
     return """
     <!DOCTYPE html>
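The chunk-to-chunk token chaining in the new streaming endpoint is the subtle part; stripped of the model calls, the bookkeeping looks like this (hypothetical token IDs, not real phoneme tokens):

    # 0 opens the stream, each later chunk re-sends the previous chunk's last
    # token for continuity, and 0 closes the final chunk.
    chunks_tokens = [[5, 8, 3], [7, 2], [9]]
    prev_last = None
    for i, chunk_tokens in enumerate(chunks_tokens):
        tokens_to_send = ([0] if i == 0 else [prev_last]) + chunk_tokens
        if i == len(chunks_tokens) - 1:
            tokens_to_send = tokens_to_send + [0]
        prev_last = tokens_to_send[-1]
        print(tokens_to_send)  # [0, 5, 8, 3] then [3, 7, 2] then [2, 9, 0]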
@@ -321,7 +329,6 @@ def index():
       const speed = document.getElementById('speed').value;
       const format = document.getElementById('format').value;
       const audio = document.getElementById('audio');
-      // Set the audio element's source to the streaming endpoint.
       audio.src = `/tts/streaming?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}&format=${format}`;
       audio.type = format === 'wav' ? 'audio/wav' : 'audio/opus';
       audio.play();
@@ -332,7 +339,6 @@ def index():
       const speed = document.getElementById('speed').value;
       const format = document.getElementById('format').value;
       const audio = document.getElementById('audio');
-      // Set the audio element's source to the full TTS endpoint.
       audio.src = `/tts/full?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}&format=${format}`;
       audio.type = format === 'wav' ? 'audio/wav' : 'audio/opus';
       audio.play();
@@ -344,9 +350,8 @@ def index():


 # ------------------------------------------------------------------------------
-# Run with: uvicorn app:app --reload
+# Run the app with: uvicorn app:app --reload
 # ------------------------------------------------------------------------------
 if __name__ == "__main__":
     import uvicorn
-
-    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
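With the Space running (uvicorn on port 7860 as above), both endpoints can be exercised with a minimal client. The endpoint paths, query parameters, and defaults come from this commit; the rest is an illustrative sketch:

    import requests

    base = "http://localhost:7860"
    params = {"text": "Hello world. This is Kokoro.", "voice": "af_heart", "speed": 1.0}

    # /tts/full returns one complete file in the response body.
    r = requests.get(f"{base}/tts/full", params={**params, "format": "wav"})
    open("full.wav", "wb").write(r.content)

    # /tts/streaming yields the WAV header first, then PCM chunk by chunk.
    with requests.get(f"{base}/tts/streaming", params={**params, "format": "wav"}, stream=True) as r:
        with open("stream.wav", "wb") as f:
            for piece in r.iter_content(chunk_size=4096):
                f.write(piece)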