Spaces:

bcci
/

kokoro-api-test

Runtime error

App Files Community

bcci committed on Feb 10

Commit

80ce7b7

verified ·

1 Parent(s): 9c48211

Update app.py

Browse files

Files changed (1) hide show

app.py +136 -154

app.py CHANGED Viewed

@@ -10,8 +10,7 @@ from fastapi.responses import StreamingResponse, Response, HTMLResponse
 from fastapi.middleware import Middleware
 from fastapi.middleware.gzip import GZipMiddleware
-# --- IMPORTANT:  Use the AutoregressiveStreamKPipeline ---
-from kokoro.pipeline import AutoregressiveStreamKPipeline  # Or wherever your pipeline is.
 app = FastAPI(
     title="Kokoro TTS FastAPI",
@@ -24,8 +23,9 @@ app = FastAPI(
 # Global Pipeline Instance
 # ------------------------------------------------------------------------------
 # Create one pipeline instance for the entire app.
-pipeline = AutoregressiveStreamKPipeline(lang_code="a")  # Use the autoregressive pipeline
 # ------------------------------------------------------------------------------
 # Helper Functions
@@ -48,6 +48,40 @@ def generate_wav_header(sample_rate: int, num_channels: int, sample_width: int,
     return header + fmt_chunk + data_chunk_header
 def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
     """
     Convert a torch.FloatTensor (with values in [-1, 1]) to raw 16-bit PCM bytes.
@@ -60,78 +94,42 @@ def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
     audio_int16 = np.int16(audio_np * 32767)
     return audio_int16.tobytes()
-def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24000, bitrate: int = 32000) -> bytes:
-    """
-    Convert a torch.FloatTensor to Opus encoded bytes.
-    Requires the 'opuslib' package: pip install opuslib
-    """
-    try:
-        import opuslib
-    except ImportError:
-        raise ImportError("opuslib is not installed. Please install it with: pip install opuslib")
-    audio_np = audio_tensor.cpu().numpy()
-    if audio_np.ndim > 1:
-        audio_np = audio_np.flatten()
-    # Scale to int16 range.  Important for opus.
-    audio_int16 = np.int16(audio_np * 32767)
-    encoder = opuslib.Encoder(sample_rate, 1, opuslib.APPLICATION_VOIP) # 1 channel for mono.
-    # Calculate the number of frames to encode. Opus frames are 2.5, 5, 10, or 20 ms long.
-    frame_size = int(sample_rate * 0.020)  # 20ms frame size
-    encoded_data = b''
-    for i in range(0, len(audio_int16), frame_size):
-        frame = audio_int16[i:i + frame_size]
-        if len(frame) < frame_size:
-            # Pad the last frame with zeros if needed.
-            frame = np.pad(frame, (0, frame_size - len(frame)), 'constant')
-        encoded_frame = encoder.encode(frame.tobytes(), frame_size) # Encode the frame.
-        encoded_data += encoded_frame
-    return encoded_data
 # ------------------------------------------------------------------------------
 # Endpoints
 # ------------------------------------------------------------------------------
-@app.get("/tts/streaming", summary="Streaming TTS (Autoregressive)")
-def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
     """
-    Streaming TTS endpoint that attempts autoregressive, near sample-by-sample output.
-    IMPORTANT: This is EXPERIMENTAL and may have reduced quality compared to
-    the full or chunking methods.  It's also likely to be slower due to the
-    per-phoneme processing overhead.
     """
     sample_rate = 24000
     num_channels = 1
     sample_width = 2  # 16-bit PCM
     def audio_generator():
-        if format.lower() == "wav":
-            # Yield the WAV header first.
-            header = generate_wav_header(sample_rate, num_channels, sample_width)
-            yield header
         try:
-            # Use the AUTOREGRESSIVE pipeline
-            for audio_chunk in pipeline(text, voice=voice, speed=speed):
-                print(audio_chunk)
-                if audio_chunk.numel() > 0:  # Ensure we have audio data
-                    if format.lower() == "wav":
-                        yield audio_tensor_to_pcm_bytes(audio_chunk)
-                    elif format.lower() == "opus":
-                        yield audio_tensor_to_opus_bytes(audio_chunk, sample_rate=sample_rate)
-                    else:
-                        raise ValueError(f"Unsupported audio format: {format}")
         except Exception as e:
-            print(f"Error during streaming: {e}")
-            yield b''  # Yield empty bytes to avoid breaking the stream
-    media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
     return StreamingResponse(
         audio_generator(),
         media_type=media_type,
@@ -139,54 +137,41 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
     )
-# @app.get("/tts/full", summary="Full TTS")
-# def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "wav"):
-#     """
-#     Full TTS endpoint (no streaming).  Synthesizes the entire text and returns
-#     a complete WAV or Opus file.
-#     """
-#     # Use newline-based splitting.  This is the *original* KPipeline,
-#     # which is better for full synthesis.  It's important to use
-#     # the right pipeline for the right task.
-#     from kokoro.pipeline import KPipeline  # Import here to avoid circular import
-#     full_pipeline = KPipeline(lang_code="a")
-#     results = list(full_pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+"))
-#     audio_segments = []
-#     for result in results:
-#         if result.audio is not None:
-#             audio_np = result.audio.cpu().numpy()
-#             if audio_np.ndim > 1:
-#                 audio_np = audio_np.flatten()
-#             audio_segments.append(audio_np)
-#     if not audio_segments:
-#         raise HTTPException(status_code=500, detail="No audio generated.")
-#     # Concatenate all audio segments.
-#     full_audio = np.concatenate(audio_segments)
-#     # Write the concatenated audio to an in-memory WAV or Opus file.
-#     sample_rate = 24000
-#     num_channels = 1
-#     sample_width = 2  # 16-bit PCM -> 2 bytes per sample
-#     if format.lower() == "wav":
-#         wav_io = io.BytesIO()
-#         with wave.open(wav_io, "wb") as wav_file:
-#             wav_file.setnchannels(num_channels)
-#             wav_file.setsampwidth(sample_width)
-#             wav_file.setframerate(sample_rate)
-#             full_audio_int16 = np.int16(full_audio * 32767)
-#             wav_file.writeframes(full_audio_int16.tobytes())
-#         wav_io.seek(0)
-#         return Response(content=wav_io.read(), media_type="audio/wav")
-#     elif format.lower() == "opus":
-#         opus_data = audio_tensor_to_opus_bytes(torch.from_numpy(full_audio), sample_rate=sample_rate)
-#         return Response(content=opus_data, media_type="audio/opus")
-#     else:
-#         raise HTTPException(status_code=400, detail=f"Unsupported audio format: {format}")
@@ -194,61 +179,58 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
 def index():
     """
     HTML demo page for Kokoro TTS.
     """
     return """
-<!DOCTYPE html>
-<html>
-<head>
-    <title>Kokoro TTS Demo</title>
-</head>
-<body>
-    <h1>Kokoro TTS Demo</h1>
-    <textarea id="text" rows="4" cols="50" placeholder="Enter text here"></textarea><br>
-    <label for="voice">Voice:</label>
-    <input type="text" id="voice" value="af_heart"><br>
-    <label for="speed">Speed:</label>
-    <input type="number" step="0.1" id="speed" value="1.0"><br>
-    <label for="format">Format:</label>
-    <select id="format">
-        <option value="wav">WAV</option>
-        <option value="opus" selected>Opus</option>
-    </select><br><br>
-    <button onclick="playStreaming()">Play Streaming TTS</button>
-    <button onclick="playFull()">Play Full TTS</button>
-    <br><br>
-    <audio id="audio" controls autoplay></audio>
-    <script>
-        function playStreaming() {
-            const text = document.getElementById('text').value;
-            const voice = document.getElementById('voice').value;
-            const speed = document.getElementById('speed').value;
-            const format = document.getElementById('format').value;
-            const audio = document.getElementById('audio');
-            // Set the audio element's source to the streaming endpoint.
-            audio.src = `/tts/streaming?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}&format=${format}`;
-            audio.type = format === 'wav' ? 'audio/wav' : 'audio/opus';
-            audio.play();
-        }
-        function playFull() {
-            const text = document.getElementById('text').value;
-            const voice = document.getElementById('voice').value;
-            const speed = document.getElementById('speed').value;
-            const format = document.getElementById('format').value;
-            const audio = document.getElementById('audio');
-            // Set the audio element's source to the full TTS endpoint.
-            audio.src = `/tts/full?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}&format=${format}`;
-            audio.type = format === 'wav' ? 'audio/wav' : 'audio/opus';
-            audio.play();
-        }
-    </script>
-</body>
-</html>
-"""
 # ------------------------------------------------------------------------------
 # Run with: uvicorn app:app --reload
 # ------------------------------------------------------------------------------
 if __name__ == "__main__":
     import uvicorn

 from fastapi.middleware import Middleware
 from fastapi.middleware.gzip import GZipMiddleware
+from kokoro import StreamKPipeline, KPipeline # Import StreamKPipeline and KPipeline
 app = FastAPI(
     title="Kokoro TTS FastAPI",
 # Global Pipeline Instance
 # ------------------------------------------------------------------------------
 # Create one pipeline instance for the entire app.
+stream_pipeline = StreamKPipeline(lang_code="a") # Use StreamKPipeline for streaming
+full_pipeline = KPipeline(lang_code="a") # Keep KPipeline for full TTS
 # ------------------------------------------------------------------------------
 # Helper Functions
     return header + fmt_chunk + data_chunk_header
+def custom_split_text(text: str) -> list:
+    """
+    Custom splitting:
+      - Start with a chunk size of 2 words.
+      - For each chunk, if a period (".") is found in any word (except if it’s the very last word),
+        then split the chunk at that word (include words up to that word).
+      - Otherwise, use the current chunk size.
+      - For subsequent chunks, increase the chunk size by 2.
+      - If there are fewer than the desired number of words for a full chunk, add all remaining words.
+    """
+    words = text.split()
+    chunks = []
+    chunk_size = 2
+    start = 0
+    while start < len(words):
+        candidate_end = start + chunk_size
+        if candidate_end > len(words):
+            candidate_end = len(words)
+        chunk_words = words[start:candidate_end]
+        # Look for a period in any word except the last one.
+        split_index = None
+        for i in range(len(chunk_words) - 1):
+            if '.' in chunk_words[i]:
+                split_index = i
+                break
+        if split_index is not None:
+            candidate_end = start + split_index + 1
+            chunk_words = words[start:candidate_end]
+        chunks.append(" ".join(chunk_words))
+        start = candidate_end
+        chunk_size += 2  # Increase the chunk size by 2 for the next iteration.
+    return chunks
 def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
     """
     Convert a torch.FloatTensor (with values in [-1, 1]) to raw 16-bit PCM bytes.
     audio_int16 = np.int16(audio_np * 32767)
     return audio_int16.tobytes()
 # ------------------------------------------------------------------------------
 # Endpoints
 # ------------------------------------------------------------------------------
+@app.get("/tts/streaming", summary="Streaming TTS")
+def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0):
     """
+    Streaming TTS endpoint that returns a continuous audio stream in WAV format (PCM).
+    The endpoint yields a WAV header (with a dummy length) only once at the start of the stream,
+    then yields PCM audio data chunks as they are generated in real-time.
     """
     sample_rate = 24000
     num_channels = 1
     sample_width = 2  # 16-bit PCM
     def audio_generator():
+        # Yield the WAV header first.
+        header = generate_wav_header(sample_rate, num_channels, sample_width)
+        yield header
+        # Stream audio chunks from StreamKPipeline
         try:
+            for stream_result in stream_pipeline(text, voice=voice, speed=speed, split_pattern=r'([.!?…])\s+'): # Split at sentence ends
+                if stream_result.audio_chunk is not None:
+                    pcm_bytes = audio_tensor_to_pcm_bytes(stream_result.audio_chunk)
+                    yield pcm_bytes
         except Exception as e:
+            print(f"Streaming error: {e}")
+            yield b'' # Keep stream alive on error
+    media_type = "audio/wav"
     return StreamingResponse(
         audio_generator(),
         media_type=media_type,
     )
+@app.get("/tts/full", summary="Full TTS")
+def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0):
+    """
+    Full TTS endpoint that synthesizes the entire text using KPipeline,
+    concatenates the audio, and returns a complete WAV file.
+    """
+    # Use newline-based splitting via the pipeline's split_pattern.
+    results = list(full_pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+"))
+    audio_segments = []
+    for result in results:
+        if result.audio is not None:
+            audio_np = result.audio.cpu().numpy()
+            if audio_np.ndim > 1:
+                audio_np = audio_np.flatten()
+            audio_segments.append(audio_np)
+    if not audio_segments:
+        raise HTTPException(status_code=500, detail="No audio generated.")
+    # Concatenate all audio segments.
+    full_audio = np.concatenate(audio_segments)
+    # Write the concatenated audio to an in-memory WAV file.
+    sample_rate = 24000
+    num_channels = 1
+    sample_width = 2  # 16-bit PCM -> 2 bytes per sample
+    wav_io = io.BytesIO()
+    with wave.open(wav_io, "wb") as wav_file:
+        wav_file.setnchannels(num_channels)
+        wav_file.setsampwidth(sample_width)
+        wav_file.setframerate(sample_rate)
+        full_audio_int16 = np.int16(full_audio * 32767)
+        wav_file.writeframes(full_audio_int16.tobytes())
+    wav_io.seek(0)
+    return Response(content=wav_io.read(), media_type="audio/wav")
 def index():
     """
     HTML demo page for Kokoro TTS.
+    This page provides a simple UI to enter text, choose a voice and speed,
+    and play synthesized audio from both the streaming and full endpoints.
     """
     return """
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <title>Kokoro TTS Demo</title>
+    </head>
+    <body>
+        <h1>Kokoro TTS Demo</h1>
+        <textarea id="text" rows="4" cols="50" placeholder="Enter text here"></textarea><br>
+        <label for="voice">Voice:</label>
+        <input type="text" id="voice" value="af_heart"><br>
+        <label for="speed">Speed:</label>
+        <input type="number" step="0.1" id="speed" value="1.0"><br>
+        <br><br>
+        <button onclick="playStreaming()">Play Streaming TTS</button>
+        <button onclick="playFull()">Play Full TTS (Download WAV)</button>
+        <br><br>
+        <audio id="audio" controls autoplay></audio>
+        <script>
+            function playStreaming() {
+                const text = document.getElementById('text').value;
+                const voice = document.getElementById('voice').value;
+                const speed = document.getElementById('speed').value;
+                const audio = document.getElementById('audio');
+                // Set the audio element's source to the streaming endpoint.
+                audio.src = `/tts/streaming?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}`;
+                audio.type = 'audio/wav';
+                audio.play();
+            }
+            function playFull() {
+                const text = document.getElementById('text').value;
+                const voice = document.getElementById('voice').value;
+                const speed = document.getElementById('speed').value;
+                const audio = document.getElementById('audio');
+                // Set the audio element's source to the full TTS endpoint.
+                audio.src = `/tts/full?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}`;
+                audio.type = 'audio/wav';
+                audio.play();
+            }
+        </script>
+    </body>
+    </html>
+    """
 # ------------------------------------------------------------------------------
 # Run with: uvicorn app:app --reload
 # ------------------------------------------------------------------------------
 if __name__ == "__main__":
     import uvicorn