kokoro-onnx-api-test

Running

App Files Community

bcci committed on Feb 8

Commit

e7655ad

verified ·

1 Parent(s): 65e1914

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -29

app.py CHANGED Viewed

@@ -7,10 +7,17 @@ import numpy as np
 import torch
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse, Response, HTMLResponse
 from kokoro import KPipeline
-app = FastAPI(title="Kokoro TTS FastAPI")
 # ------------------------------------------------------------------------------
 # Global Pipeline Instance
@@ -87,17 +94,51 @@ def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
     return audio_int16.tobytes()
 # ------------------------------------------------------------------------------
 # Endpoints
 # ------------------------------------------------------------------------------
 @app.get("/tts/streaming", summary="Streaming TTS")
-def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0):
     """
-    Streaming TTS endpoint that returns a continuous WAV stream.
-    The endpoint first yields a WAV header (with a dummy length) then yields raw PCM data
-    for each text chunk as soon as it is generated.
     """
     # Split the input text using the custom doubling strategy.
     chunks = custom_split_text(text)
@@ -106,34 +147,43 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0):
     sample_width = 2  # 16-bit PCM
     def audio_generator():
-        # Yield the WAV header first.
-        header = generate_wav_header(sample_rate, num_channels, sample_width)
-        yield header
-        # Process and yield each chunk's PCM data.
         for i, chunk in enumerate(chunks):
             print(f"Processing chunk {i}: {chunk}")  # Debugging
             try:
                 results = list(pipeline(chunk, voice=voice, speed=speed, split_pattern=None))
                 for result in results:
                     if result.audio is not None:
-                        yield audio_tensor_to_pcm_bytes(result.audio)
                     else:
                         print(f"Chunk {i}: No audio generated")
             except Exception as e:
                 print(f"Error processing chunk {i}: {e}")
     return StreamingResponse(
         audio_generator(),
-        media_type="audio/wav",
         headers={"Cache-Control": "no-cache"},
     )
 @app.get("/tts/full", summary="Full TTS")
-def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0):
     """
     Full TTS endpoint that synthesizes the entire text, concatenates the audio,
-    and returns a complete WAV file.
     """
     # Use newline-based splitting via the pipeline's split_pattern.
     results = list(pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+"))
@@ -151,26 +201,33 @@ def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0):
     # Concatenate all audio segments.
     full_audio = np.concatenate(audio_segments)
-    # Write the concatenated audio to an in-memory WAV file.
     sample_rate = 24000
     num_channels = 1
     sample_width = 2  # 16-bit PCM -> 2 bytes per sample
-    wav_io = io.BytesIO()
-    with wave.open(wav_io, "wb") as wav_file:
-        wav_file.setnchannels(num_channels)
-        wav_file.setsampwidth(sample_width)
-        wav_file.setframerate(sample_rate)
-        full_audio_int16 = np.int16(full_audio * 32767)
-        wav_file.writeframes(full_audio_int16.tobytes())
-    wav_io.seek(0)
-    return Response(content=wav_io.read(), media_type="audio/wav")
 @app.get("/", response_class=HTMLResponse)
 def index():
     """
     HTML demo page for Kokoro TTS.
     This page provides a simple UI to enter text, choose a voice and speed,
     and play synthesized audio from both the streaming and full endpoints.
     """
@@ -186,7 +243,12 @@ def index():
         <label for="voice">Voice:</label>
         <input type="text" id="voice" value="af_heart"><br>
         <label for="speed">Speed:</label>
-        <input type="number" step="0.1" id="speed" value="1.0"><br><br>
         <button onclick="playStreaming()">Play Streaming TTS</button>
         <button onclick="playFull()">Play Full TTS</button>
         <br><br>
@@ -196,18 +258,22 @@ def index():
                 const text = document.getElementById('text').value;
                 const voice = document.getElementById('voice').value;
                 const speed = document.getElementById('speed').value;
                 const audio = document.getElementById('audio');
                 // Set the audio element's source to the streaming endpoint.
-                audio.src = `/tts/streaming?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}`;
                 audio.play();
             }
             function playFull() {
                 const text = document.getElementById('text').value;
                 const voice = document.getElementById('voice').value;
                 const speed = document.getElementById('speed').value;
                 const audio = document.getElementById('audio');
                 // Set the audio element's source to the full TTS endpoint.
-                audio.src = `/tts/full?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}`;
                 audio.play();
             }
         </script>
@@ -222,4 +288,4 @@ def index():
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)

 import torch
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse, Response, HTMLResponse
+from fastapi.middleware import Middleware
+from fastapi.middleware.gzip import GZipMiddleware
 from kokoro import KPipeline
+app = FastAPI(
+    title="Kokoro TTS FastAPI",
+    middleware=[
+        Middleware(GZipMiddleware, compresslevel=9)  # Add GZip compression
+    ]
+)
 # ------------------------------------------------------------------------------
 # Global Pipeline Instance
     return audio_int16.tobytes()
+def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24000, bitrate: int = 32000) -> bytes:
+    """
+    Convert a torch.FloatTensor to Opus encoded bytes.
+    Requires the 'opuslib' package: pip install opuslib
+    """
+    try:
+        import opuslib
+    except ImportError:
+        raise ImportError("opuslib is not installed. Please install it with: pip install opuslib")
+    audio_np = audio_tensor.cpu().numpy()
+    if audio_np.ndim > 1:
+        audio_np = audio_np.flatten()
+    # Scale to int16 range.  Important for opus.
+    audio_int16 = np.int16(audio_np * 32767)
+    encoder = opuslib.Encoder(sample_rate, 1, opuslib.APPLICATION_VOIP) # 1 channel for mono.
+    # Calculate the number of frames to encode. Opus frames are 2.5, 5, 10, or 20 ms long.
+    frame_size = int(sample_rate * 0.020)  # 20ms frame size
+    encoded_data = b''
+    for i in range(0, len(audio_int16), frame_size):
+        frame = audio_int16[i:i + frame_size]
+        if len(frame) < frame_size:
+            # Pad the last frame with zeros if needed.
+            frame = np.pad(frame, (0, frame_size - len(frame)), 'constant')
+        encoded_frame = encoder.encode(frame.tobytes(), frame_size) # Encode the frame.
+        encoded_data += encoded_frame
+    return encoded_data
 # ------------------------------------------------------------------------------
 # Endpoints
 # ------------------------------------------------------------------------------
 @app.get("/tts/streaming", summary="Streaming TTS")
+def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
     """
+    Streaming TTS endpoint that returns a continuous audio stream.
+    Supports WAV (PCM) and Opus formats.  Opus offers significantly better compression.
+    The endpoint first yields a WAV header (with a dummy length) for WAV,
+    then yields encoded audio data for each text chunk as soon as it is generated.
     """
     # Split the input text using the custom doubling strategy.
     chunks = custom_split_text(text)
     sample_width = 2  # 16-bit PCM
     def audio_generator():
+        if format.lower() == "wav":
+            # Yield the WAV header first.
+            header = generate_wav_header(sample_rate, num_channels, sample_width)
+            yield header
+        # Process and yield each chunk's audio data.
         for i, chunk in enumerate(chunks):
             print(f"Processing chunk {i}: {chunk}")  # Debugging
             try:
                 results = list(pipeline(chunk, voice=voice, speed=speed, split_pattern=None))
                 for result in results:
                     if result.audio is not None:
+                        if format.lower() == "wav":
+                            yield audio_tensor_to_pcm_bytes(result.audio)
+                        elif format.lower() == "opus":
+                            yield audio_tensor_to_opus_bytes(result.audio, sample_rate=sample_rate)
+                        else:
+                            raise ValueError(f"Unsupported audio format: {format}")
                     else:
                         print(f"Chunk {i}: No audio generated")
             except Exception as e:
                 print(f"Error processing chunk {i}: {e}")
+                yield b'' # important so that streaming continues.  Consider returning an error sound.
+    media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
     return StreamingResponse(
         audio_generator(),
+        media_type=media_type,
         headers={"Cache-Control": "no-cache"},
     )
 @app.get("/tts/full", summary="Full TTS")
+def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "wav"):
     """
     Full TTS endpoint that synthesizes the entire text, concatenates the audio,
+    and returns a complete WAV or Opus file.
     """
     # Use newline-based splitting via the pipeline's split_pattern.
     results = list(pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+"))
     # Concatenate all audio segments.
     full_audio = np.concatenate(audio_segments)
+    # Write the concatenated audio to an in-memory WAV or Opus file.
     sample_rate = 24000
     num_channels = 1
     sample_width = 2  # 16-bit PCM -> 2 bytes per sample
+    if format.lower() == "wav":
+        wav_io = io.BytesIO()
+        with wave.open(wav_io, "wb") as wav_file:
+            wav_file.setnchannels(num_channels)
+            wav_file.setsampwidth(sample_width)
+            wav_file.setframerate(sample_rate)
+            full_audio_int16 = np.int16(full_audio * 32767)
+            wav_file.writeframes(full_audio_int16.tobytes())
+        wav_io.seek(0)
+        return Response(content=wav_io.read(), media_type="audio/wav")
+    elif format.lower() == "opus":
+        opus_data = audio_tensor_to_opus_bytes(torch.from_numpy(full_audio), sample_rate=sample_rate)
+        return Response(content=opus_data, media_type="audio/opus")
+    else:
+        raise HTTPException(status_code=400, detail=f"Unsupported audio format: {format}")
 @app.get("/", response_class=HTMLResponse)
 def index():
     """
     HTML demo page for Kokoro TTS.
     This page provides a simple UI to enter text, choose a voice and speed,
     and play synthesized audio from both the streaming and full endpoints.
     """
         <label for="voice">Voice:</label>
         <input type="text" id="voice" value="af_heart"><br>
         <label for="speed">Speed:</label>
+        <input type="number" step="0.1" id="speed" value="1.0"><br>
+        <label for="format">Format:</label>
+        <select id="format">
+            <option value="wav">WAV</option>
+            <option value="opus" selected>Opus</option>
+        </select><br><br>
         <button onclick="playStreaming()">Play Streaming TTS</button>
         <button onclick="playFull()">Play Full TTS</button>
         <br><br>
                 const text = document.getElementById('text').value;
                 const voice = document.getElementById('voice').value;
                 const speed = document.getElementById('speed').value;
+                const format = document.getElementById('format').value;
                 const audio = document.getElementById('audio');
                 // Set the audio element's source to the streaming endpoint.
+                audio.src = `/tts/streaming?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}&format=${format}`;
+                audio.type = format === 'wav' ? 'audio/wav' : 'audio/opus';
                 audio.play();
             }
             function playFull() {
                 const text = document.getElementById('text').value;
                 const voice = document.getElementById('voice').value;
                 const speed = document.getElementById('speed').value;
+                const format = document.getElementById('format').value;
                 const audio = document.getElementById('audio');
                 // Set the audio element's source to the full TTS endpoint.
+                audio.src = `/tts/full?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}&format=${format}`;
+                audio.type = format === 'wav' ? 'audio/wav' : 'audio/opus';
                 audio.play();
             }
         </script>
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)