kokoro-onnx-api-test

Sleeping

App Files Files Community

bcci committed on Feb 8

Commit

7fd0353

verified ·

1 Parent(s): 58c1307

Create app.py

Browse files

Files changed (1) hide show

app.py +210 -0

app.py ADDED Viewed

	@@ -0,0 +1,210 @@

+import io
+import re
+import wave
+import struct
+import numpy as np
+import torch
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse, Response, HTMLResponse
+from kokoro import KPipeline
+# FastAPI application object; the route decorators below register handlers on it.
+app = FastAPI(title="Kokoro TTS FastAPI")
+# ------------------------------------------------------------------------------
+# Global Pipeline Instance
+# ------------------------------------------------------------------------------
+# Create one pipeline instance for the entire app. It is constructed once at
+# import time and shared by both TTS endpoints defined below.
+# NOTE(review): lang_code="a" presumably selects American English -- confirm
+# against the kokoro documentation.
+pipeline = KPipeline(lang_code="a")
+# ------------------------------------------------------------------------------
+# Helper Functions
+# ------------------------------------------------------------------------------
+def generate_wav_header(sample_rate: int, num_channels: int, sample_width: int, data_size: int = 0x7FFFFFFF) -> bytes:
+    """
+    Build the 44-byte RIFF/WAVE header that precedes a PCM stream.
+
+    The final audio length is unknown while streaming, so ``data_size``
+    defaults to a large placeholder; the header is meant to be emitted once,
+    before any sample data.
+
+    sample_rate:   frames per second.
+    num_channels:  number of interleaved channels.
+    sample_width:  bytes per sample (2 means 16-bit PCM).
+    data_size:     byte count advertised for the ``data`` chunk.
+    """
+    # Bytes occupied by one frame (one sample for every channel).
+    frame_bytes = num_channels * sample_width
+    # "<" means little-endian with no padding, so packing the three chunks in
+    # a single call yields the same bytes as packing them separately.
+    return struct.pack(
+        '<4sI4s4sIHHIIHH4sI',
+        # RIFF descriptor: the size field counts everything after itself.
+        b'RIFF', 36 + data_size, b'WAVE',
+        # fmt chunk: 16-byte body, format tag 1 (PCM).
+        b'fmt ', 16, 1, num_channels, sample_rate, sample_rate * frame_bytes, frame_bytes, sample_width * 8,
+        # data chunk header; the PCM payload follows it on the wire.
+        b'data', data_size,
+    )
+def custom_split_text(text: str) -> list:
+    """
+    Break ``text`` into whitespace-delimited word groups of growing size.
+
+    The first group holds one word, the next two, then four, and so on; the
+    final group takes whatever words are left. Returns the groups as a list
+    of space-joined strings (an empty list when ``text`` has no words).
+    """
+    tokens = text.split()
+    total = len(tokens)
+    pieces = []
+    pos = 0
+    width = 1
+    while pos < total:
+        # Slicing past the end is safe: the last group is simply shorter.
+        pieces.append(" ".join(tokens[pos:pos + width]))
+        pos += width
+        width *= 2  # the next group is twice as large
+    return pieces
+def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
+    """
+    Convert a float audio tensor to raw little-endian 16-bit PCM bytes.
+
+    Samples are expected in [-1, 1]; anything outside that range is clipped
+    so it saturates instead of wrapping around when cast to int16.
+
+    audio_tensor: float tensor of samples; any shape, flattened in row-major
+                  order. May live on any device and may require grad.
+    Returns the samples as consecutive 2-byte signed little-endian integers.
+    """
+    # detach() lets tensors that track gradients be converted; cpu() moves
+    # the data to host memory so NumPy can read it.
+    audio_np = audio_tensor.detach().cpu().numpy().reshape(-1)
+    # Clip before scaling: casting an out-of-range float to int16 does not
+    # saturate and would turn an overshoot into a loud click.
+    scaled = np.clip(audio_np, -1.0, 1.0) * 32767
+    # "<i2" pins little-endian order, which is what a WAV data chunk holds,
+    # regardless of the host's native byte order.
+    return scaled.astype('<i2').tobytes()
+# ------------------------------------------------------------------------------
+# Endpoints
+# ------------------------------------------------------------------------------
+@app.get("/tts/streaming", summary="Streaming TTS")
+def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0):
+    """
+    Streaming TTS endpoint that returns a continuous WAV stream.
+
+    The response starts with a WAV header carrying a placeholder length and
+    then delivers raw 16-bit PCM for each piece of audio as soon as the
+    pipeline produces it.
+
+    text:  text to synthesize; split into word groups of doubling size.
+    voice: voice identifier forwarded to the pipeline.
+    speed: speed factor forwarded to the pipeline.
+
+    Raises HTTPException (400) when ``text`` contains no words, because the
+    stream would otherwise consist of a header announcing audio that never
+    arrives.
+    """
+    # Split the input text using the custom doubling strategy.
+    chunks = custom_split_text(text)
+    if not chunks:
+        raise HTTPException(status_code=400, detail="No text to synthesize.")
+    # NOTE(review): assumes the pipeline emits 24 kHz mono audio -- confirm.
+    sample_rate = 24000
+    num_channels = 1
+    sample_width = 2  # 16-bit PCM
+    def audio_generator():
+        # The header goes out once, ahead of any sample data.
+        yield generate_wav_header(sample_rate, num_channels, sample_width)
+        for i, chunk in enumerate(chunks):
+            print(f"Processing chunk {i}: {chunk}")  # Debugging
+            try:
+                # Iterate the pipeline lazily so each result is sent as soon
+                # as it exists rather than after the whole chunk is done.
+                for result in pipeline(chunk, voice=voice, speed=speed, split_pattern=None):
+                    if result.audio is None:
+                        print(f"Chunk {i}: No audio generated")
+                        continue
+                    print(f"Chunk {i}: Audio generated")  # Debugging
+                    yield audio_tensor_to_pcm_bytes(result.audio)
+            except Exception as e:
+                # Best effort: the status line has already been sent, so a
+                # failing chunk is reported and skipped instead of aborting
+                # the rest of the stream.
+                print(f"Error processing chunk {i}: {e}")
+    return StreamingResponse(
+        audio_generator(),
+        media_type="audio/wav",
+        headers={"Cache-Control": "no-cache"},
+    )
+@app.get("/tts/full", summary="Full TTS")
+def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0):
+    """
+    Full TTS endpoint that synthesizes the entire text, concatenates the
+    audio, and returns a complete WAV file.
+
+    text:  text to synthesize; the pipeline splits it on runs of newlines.
+    voice: voice identifier forwarded to the pipeline.
+    speed: speed factor forwarded to the pipeline.
+
+    Raises HTTPException (500) when the pipeline yields no audio at all.
+    """
+    # Use newline-based splitting via the pipeline's split_pattern and keep
+    # only the results that actually carry audio, flattened to 1-D.
+    audio_segments = [
+        result.audio.detach().cpu().numpy().reshape(-1)
+        for result in pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+")
+        if result.audio is not None
+    ]
+    if not audio_segments:
+        raise HTTPException(status_code=500, detail="No audio generated.")
+    # Concatenate all audio segments.
+    full_audio = np.concatenate(audio_segments)
+    # Clip before scaling: an out-of-range float does not saturate when cast
+    # to int16 and would come out as a click.
+    # Native byte order is intentional here, because wave.writeframes does
+    # the byte swap itself on big-endian hosts.
+    full_audio_int16 = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)
+    # Write the concatenated audio to an in-memory WAV file.
+    # NOTE(review): assumes the pipeline emits 24 kHz mono audio -- confirm.
+    sample_rate = 24000
+    num_channels = 1
+    sample_width = 2  # 16-bit PCM -> 2 bytes per sample
+    wav_io = io.BytesIO()
+    with wave.open(wav_io, "wb") as wav_file:
+        wav_file.setnchannels(num_channels)
+        wav_file.setsampwidth(sample_width)
+        wav_file.setframerate(sample_rate)
+        wav_file.writeframes(full_audio_int16.tobytes())
+    # Leaving the with-block finalizes the header sizes; getvalue() returns
+    # the whole buffer without needing a seek.
+    return Response(content=wav_io.getvalue(), media_type="audio/wav")
+@app.get("/", response_class=HTMLResponse)
+def index():
+    """
+    Serve the single-page HTML demo for Kokoro TTS.
+
+    The page has a text area, voice and speed inputs, and two buttons that
+    point an <audio> element at the streaming or the full endpoint.
+    """
+    # The markup is static; the inline script builds the endpoint URLs.
+    page_html = """
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <title>Kokoro TTS Demo</title>
+    </head>
+    <body>
+        <h1>Kokoro TTS Demo</h1>
+        <textarea id="text" rows="4" cols="50" placeholder="Enter text here"></textarea><br>
+        <label for="voice">Voice:</label>
+        <input type="text" id="voice" value="af_heart"><br>
+        <label for="speed">Speed:</label>
+        <input type="number" step="0.1" id="speed" value="1.0"><br><br>
+        <button onclick="playStreaming()">Play Streaming TTS</button>
+        <button onclick="playFull()">Play Full TTS</button>
+        <br><br>
+        <audio id="audio" controls autoplay></audio>
+        <script>
+            function playStreaming() {
+                const text = document.getElementById('text').value;
+                const voice = document.getElementById('voice').value;
+                const speed = document.getElementById('speed').value;
+                const audio = document.getElementById('audio');
+                // Set the audio element's source to the streaming endpoint.
+                audio.src = `/tts/streaming?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}`;
+                audio.play();
+            }
+            function playFull() {
+                const text = document.getElementById('text').value;
+                const voice = document.getElementById('voice').value;
+                const speed = document.getElementById('speed').value;
+                const audio = document.getElementById('audio');
+                // Set the audio element's source to the full TTS endpoint.
+                audio.src = `/tts/full?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}`;
+                audio.play();
+            }
+        </script>
+    </body>
+    </html>
+    """
+    return page_html
+# ------------------------------------------------------------------------------
+# Run with: uvicorn app:app --reload
+# ------------------------------------------------------------------------------
+if __name__ == "__main__":
+    # uvicorn is imported only when this module is executed directly.
+    import uvicorn
+    # Serve the app on all interfaces at port 7860 with auto-reload enabled.
+    # NOTE(review): reload=True is normally a development setting -- confirm
+    # it is wanted where this script is deployed.
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)