Spaces:

bcci
/

kokoro-api-test

Runtime error

App Files Files Community

bcci commited on Feb 10

Commit

bc93f92

verified ·

1 Parent(s): f023a07

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -204

app.py CHANGED Viewed

@@ -10,229 +10,191 @@ from fastapi.responses import StreamingResponse, Response, HTMLResponse
 from fastapi.middleware import Middleware
 from fastapi.middleware.gzip import GZipMiddleware
-from kokoro import KPipeline
 app = FastAPI(
-title="Kokoro TTS FastAPI",
-middleware=[
-Middleware(GZipMiddleware, compresslevel=9)  # Add GZip compression
-]
 )
-------------------------------------------------------------------------------
-Global Pipeline Instance
-------------------------------------------------------------------------------
-Create one pipeline instance for the entire app.
-pipeline = KPipeline(lang_code="a")
-------------------------------------------------------------------------------
-Helper Functions
-------------------------------------------------------------------------------
 def generate_wav_header(sample_rate: int, num_channels: int, sample_width: int, data_size: int = 0x7FFFFFFF) -> bytes:
-"""
-Generate a WAV header for streaming.
-Since we don't know the final audio size, we set the data chunk size to a large dummy value.
-This header is sent only once at the start of the stream.
-"""
-bits_per_sample = sample_width * 8
-byte_rate = sample_rate * num_channels * sample_width
-block_align = num_channels * sample_width
-# total file size = 36 + data_size (header is 44 bytes total)
-total_size = 36 + data_size
-header = struct.pack('<4sI4s', b'RIFF', total_size, b'WAVE')
-fmt_chunk = struct.pack('<4sIHHIIHH', b'fmt ', 16, 1, num_channels, sample_rate, byte_rate, block_align, bits_per_sample)
-data_chunk_header = struct.pack('<4sI', b'data', data_size)
-return header + fmt_chunk + data_chunk_header
-def custom_split_text(text: str) -> list:
-"""
-Custom splitting:
-- Start with a chunk size of 2 words.
-- For each chunk, if a period (".") is found in any word (except if it’s the very last word),
-then split the chunk at that word (include words up to that word).
-- Otherwise, use the current chunk size.
-- For subsequent chunks, increase the chunk size by 2.
-- If there are fewer than the desired number of words for a full chunk, add all remaining words.
-"""
-words = text.split()
-chunks = []
-chunk_size = 2
-start = 0
-while start < len(words):
-candidate_end = start + chunk_size
-if candidate_end > len(words):
-candidate_end = len(words)
-chunk_words = words[start:candidate_end]
-# Look for a period in any word except the last one.
-split_index = None
-for i in range(len(chunk_words) - 1):
-if '.' in chunk_words[i]:
-split_index = i
-break
-if split_index is not None:
-candidate_end = start + split_index + 1
-chunk_words = words[start:candidate_end]
-chunks.append(" ".join(chunk_words))
-start = candidate_end
-chunk_size += 2  # Increase the chunk size by 2 for the next iteration.
-return chunks
 def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
-"""
-Convert a torch.FloatTensor (with values in [-1, 1]) to raw 16-bit PCM bytes.
-"""
-# Ensure tensor is on CPU and flatten if necessary.
-audio_np = audio_tensor.cpu().numpy()
-if audio_np.ndim > 1:
-audio_np = audio_np.flatten()
-# Scale to int16 range.
-audio_int16 = np.int16(audio_np * 32767)
-return audio_int16.tobytes()
 def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24000, bitrate: int = 32000) -> bytes:
-"""
-Convert a torch.FloatTensor to Opus encoded bytes.
-Requires the 'opuslib' package: pip install opuslib
-"""
-try:
-import opuslib
-except ImportError:
-raise ImportError("opuslib is not installed. Please install it with: pip install opuslib")
-audio_np = audio_tensor.cpu().numpy()
-if audio_np.ndim > 1:
-    audio_np = audio_np.flatten()
-# Scale to int16 range.  Important for opus.
-audio_int16 = np.int16(audio_np * 32767)
-encoder = opuslib.Encoder(sample_rate, 1, opuslib.APPLICATION_VOIP) # 1 channel for mono.
-# Calculate the number of frames to encode. Opus frames are 2.5, 5, 10, or 20 ms long.
-frame_size = int(sample_rate * 0.020)  # 20ms frame size
-encoded_data = b''
-for i in range(0, len(audio_int16), frame_size):
-    frame = audio_int16[i:i + frame_size]
-    if len(frame) < frame_size:
-        # Pad the last frame with zeros if needed.
-        frame = np.pad(frame, (0, frame_size - len(frame)), 'constant')
-    encoded_frame = encoder.encode(frame.tobytes(), frame_size) # Encode the frame.
-    encoded_data += encoded_frame
-return encoded_data
-content_copy
-download
- Use code with caution.
-------------------------------------------------------------------------------
-Endpoints
-------------------------------------------------------------------------------
-@app.get("/tts/streaming", summary="Streaming TTS")
 def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
-"""
-Streaming TTS endpoint that returns a continuous audio stream.
-Supports WAV (PCM) and Opus formats.  Opus offers significantly better compression.
-The endpoint first yields a WAV header (with a dummy length) for WAV,
-then yields encoded audio data for each text chunk as soon as it is generated.
-"""
-# Split the input text using the custom doubling strategy.
-chunks = custom_split_text(text)
-sample_rate = 24000
-num_channels = 1
-sample_width = 2  # 16-bit PCM
-def audio_generator():
-    if format.lower() == "wav":
-        # Yield the WAV header first.
-        header = generate_wav_header(sample_rate, num_channels, sample_width)
-        yield header
-    # Process and yield each chunk's audio data.
-    for i, chunk in enumerate(chunks):
-        print(f"Processing chunk {i}: {chunk}")  # Debugging
         try:
-            results = list(pipeline(chunk, voice=voice, speed=speed, split_pattern=None))
-            for result in results:
-                if result.audio is not None:
                     if format.lower() == "wav":
-                        yield audio_tensor_to_pcm_bytes(result.audio)
                     elif format.lower() == "opus":
-                        yield audio_tensor_to_opus_bytes(result.audio, sample_rate=sample_rate)
                     else:
                         raise ValueError(f"Unsupported audio format: {format}")
-                else:
-                    print(f"Chunk {i}: No audio generated")
         except Exception as e:
-            print(f"Error processing chunk {i}: {e}")
-            yield b'' # important so that streaming continues.  Consider returning an error sound.
-media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
-return StreamingResponse(
-    audio_generator(),
-    media_type=media_type,
-    headers={"Cache-Control": "no-cache"},
-)
-content_copy
-download
- Use code with caution.
 @app.get("/tts/full", summary="Full TTS")
 def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "wav"):
-"""
-Full TTS endpoint that synthesizes the entire text, concatenates the audio,
-and returns a complete WAV or Opus file.
-"""
-# Use newline-based splitting via the pipeline's split_pattern.
-results = list(pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+"))
-audio_segments = []
-for result in results:
-if result.audio is not None:
-audio_np = result.audio.cpu().numpy()
-if audio_np.ndim > 1:
-audio_np = audio_np.flatten()
-audio_segments.append(audio_np)
-if not audio_segments:
-    raise HTTPException(status_code=500, detail="No audio generated.")
-# Concatenate all audio segments.
-full_audio = np.concatenate(audio_segments)
-# Write the concatenated audio to an in-memory WAV or Opus file.
-sample_rate = 24000
-num_channels = 1
-sample_width = 2  # 16-bit PCM -> 2 bytes per sample
-if format.lower() == "wav":
-    wav_io = io.BytesIO()
-    with wave.open(wav_io, "wb") as wav_file:
-        wav_file.setnchannels(num_channels)
-        wav_file.setsampwidth(sample_width)
-        wav_file.setframerate(sample_rate)
-        full_audio_int16 = np.int16(full_audio * 32767)
-        wav_file.writeframes(full_audio_int16.tobytes())
-    wav_io.seek(0)
-    return Response(content=wav_io.read(), media_type="audio/wav")
-elif format.lower() == "opus":
-    opus_data = audio_tensor_to_opus_bytes(torch.from_numpy(full_audio), sample_rate=sample_rate)
-    return Response(content=opus_data, media_type="audio/opus")
-else:
-    raise HTTPException(status_code=400, detail=f"Unsupported audio format: {format}")
-content_copy
-download
- Use code with caution.
 @app.get("/", response_class=HTMLResponse)
 def index():
-"""
-HTML demo page for Kokoro TTS.
-This page provides a simple UI to enter text, choose a voice and speed,
-and play synthesized audio from both the streaming and full endpoints.
-"""
-return """
 <!DOCTYPE html>
 <html>
 <head>
@@ -281,14 +243,12 @@ return """
 </body>
 </html>
 """
-content_copy
-download
- Use code with caution.
-------------------------------------------------------------------------------
-Run with: uvicorn app:app --reload
-------------------------------------------------------------------------------
-if name == "main":
-import uvicorn
-uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)

 from fastapi.middleware import Middleware
 from fastapi.middleware.gzip import GZipMiddleware
+# --- IMPORTANT:  Use the AutoregressiveStreamKPipeline ---
+from kokoro.pipeline import AutoregressiveStreamKPipeline  # Or wherever your pipeline is.
 app = FastAPI(
+    title="Kokoro TTS FastAPI",
+    middleware=[
+        Middleware(GZipMiddleware, compresslevel=9)  # Add GZip compression
+    ]
 )
+# ------------------------------------------------------------------------------
+# Global Pipeline Instance
+# ------------------------------------------------------------------------------
+# Create one pipeline instance for the entire app.
+pipeline = AutoregressiveStreamKPipeline(lang_code="a")  # Use the autoregressive pipeline
+# ------------------------------------------------------------------------------
+# Helper Functions
+# ------------------------------------------------------------------------------
 def generate_wav_header(sample_rate: int, num_channels: int, sample_width: int, data_size: int = 0x7FFFFFFF) -> bytes:
+    """
+    Generate a WAV header for streaming.
+    Since we don't know the final audio size, we set the data chunk size to a large dummy value.
+    This header is sent only once at the start of the stream.
+    """
+    bits_per_sample = sample_width * 8
+    byte_rate = sample_rate * num_channels * sample_width
+    block_align = num_channels * sample_width
+    # total file size = 36 + data_size (header is 44 bytes total)
+    total_size = 36 + data_size
+    header = struct.pack('<4sI4s', b'RIFF', total_size, b'WAVE')
+    fmt_chunk = struct.pack('<4sIHHIIHH', b'fmt ', 16, 1, num_channels, sample_rate, byte_rate, block_align, bits_per_sample)
+    data_chunk_header = struct.pack('<4sI', b'data', data_size)
+    return header + fmt_chunk + data_chunk_header
 def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
+    """
+    Convert a torch.FloatTensor (with values in [-1, 1]) to raw 16-bit PCM bytes.
+    """
+    # Ensure tensor is on CPU and flatten if necessary.
+    audio_np = audio_tensor.cpu().numpy()
+    if audio_np.ndim > 1:
+        audio_np = audio_np.flatten()
+    # Scale to int16 range.
+    audio_int16 = np.int16(audio_np * 32767)
+    return audio_int16.tobytes()
 def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24000, bitrate: int = 32000) -> bytes:
+    """
+    Convert a torch.FloatTensor to Opus encoded bytes.
+    Requires the 'opuslib' package: pip install opuslib
+    """
+    try:
+        import opuslib
+    except ImportError:
+        raise ImportError("opuslib is not installed. Please install it with: pip install opuslib")
+    audio_np = audio_tensor.cpu().numpy()
+    if audio_np.ndim > 1:
+        audio_np = audio_np.flatten()
+    # Scale to int16 range.  Important for opus.
+    audio_int16 = np.int16(audio_np * 32767)
+    encoder = opuslib.Encoder(sample_rate, 1, opuslib.APPLICATION_VOIP) # 1 channel for mono.
+    # Calculate the number of frames to encode. Opus frames are 2.5, 5, 10, or 20 ms long.
+    frame_size = int(sample_rate * 0.020)  # 20ms frame size
+    encoded_data = b''
+    for i in range(0, len(audio_int16), frame_size):
+        frame = audio_int16[i:i + frame_size]
+        if len(frame) < frame_size:
+            # Pad the last frame with zeros if needed.
+            frame = np.pad(frame, (0, frame_size - len(frame)), 'constant')
+        encoded_frame = encoder.encode(frame.tobytes(), frame_size) # Encode the frame.
+        encoded_data += encoded_frame
+    return encoded_data
+# ------------------------------------------------------------------------------
+# Endpoints
+# ------------------------------------------------------------------------------
+@app.get("/tts/streaming", summary="Streaming TTS (Autoregressive)")
 def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
+    """
+    Streaming TTS endpoint that attempts autoregressive, near sample-by-sample output.
+    IMPORTANT: This is EXPERIMENTAL and may have reduced quality compared to
+    the full or chunking methods.  It's also likely to be slower due to the
+    per-phoneme processing overhead.
+    """
+    sample_rate = 24000
+    num_channels = 1
+    sample_width = 2  # 16-bit PCM
+    def audio_generator():
+        if format.lower() == "wav":
+            # Yield the WAV header first.
+            header = generate_wav_header(sample_rate, num_channels, sample_width)
+            yield header
         try:
+            # Use the AUTOREGRESSIVE pipeline
+            for audio_chunk in pipeline(text, voice=voice, speed=speed):
+                if audio_chunk.numel() > 0:  # Ensure we have audio data
                     if format.lower() == "wav":
+                        yield audio_tensor_to_pcm_bytes(audio_chunk)
                     elif format.lower() == "opus":
+                        yield audio_tensor_to_opus_bytes(audio_chunk, sample_rate=sample_rate)
                     else:
                         raise ValueError(f"Unsupported audio format: {format}")
         except Exception as e:
+            print(f"Error during streaming: {e}")
+            yield b''  # Yield empty bytes to avoid breaking the stream
+    media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
+    return StreamingResponse(
+        audio_generator(),
+        media_type=media_type,
+        headers={"Cache-Control": "no-cache"},
+    )
 @app.get("/tts/full", summary="Full TTS")
 def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "wav"):
+    """
+    Full TTS endpoint (no streaming).  Synthesizes the entire text and returns
+    a complete WAV or Opus file.
+    """
+    # Use newline-based splitting.  This is the *original* KPipeline,
+    # which is better for full synthesis.  It's important to use
+    # the right pipeline for the right task.
+    from kokoro.pipeline import KPipeline  # Import here to avoid circular import
+    full_pipeline = KPipeline(lang_code="a")
+    results = list(full_pipeline(text, voice=voice, speed=speed, split_pattern=r"\n+"))
+    audio_segments = []
+    for result in results:
+        if result.audio is not None:
+            audio_np = result.audio.cpu().numpy()
+            if audio_np.ndim > 1:
+                audio_np = audio_np.flatten()
+            audio_segments.append(audio_np)
+    if not audio_segments:
+        raise HTTPException(status_code=500, detail="No audio generated.")
+    # Concatenate all audio segments.
+    full_audio = np.concatenate(audio_segments)
+    # Write the concatenated audio to an in-memory WAV or Opus file.
+    sample_rate = 24000
+    num_channels = 1
+    sample_width = 2  # 16-bit PCM -> 2 bytes per sample
+    if format.lower() == "wav":
+        wav_io = io.BytesIO()
+        with wave.open(wav_io, "wb") as wav_file:
+            wav_file.setnchannels(num_channels)
+            wav_file.setsampwidth(sample_width)
+            wav_file.setframerate(sample_rate)
+            full_audio_int16 = np.int16(full_audio * 32767)
+            wav_file.writeframes(full_audio_int16.tobytes())
+        wav_io.seek(0)
+        return Response(content=wav_io.read(), media_type="audio/wav")
+    elif format.lower() == "opus":
+        opus_data = audio_tensor_to_opus_bytes(torch.from_numpy(full_audio), sample_rate=sample_rate)
+        return Response(content=opus_data, media_type="audio/opus")
+    else:
+        raise HTTPException(status_code=400, detail=f"Unsupported audio format: {format}")
 @app.get("/", response_class=HTMLResponse)
 def index():
+    """
+    HTML demo page for Kokoro TTS.
+    """
+    return """
 <!DOCTYPE html>
 <html>
 <head>
 </body>
 </html>
 """
+# ------------------------------------------------------------------------------
+# Run with: uvicorn app:app --reload
+# ------------------------------------------------------------------------------
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)