Spaces:

bcci
/

kokoro-api-test

Runtime error

App Files Files Community

bcci committed on Feb 10

Commit

ec5398b

verified ·

1 Parent(s): 795d4dd

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -57

app.py CHANGED Viewed

@@ -47,40 +47,6 @@ def generate_wav_header(sample_rate: int, num_channels: int, sample_width: int,
     return header + fmt_chunk + data_chunk_header
-def custom_split_text(text: str) -> list:
-    """
-    Custom splitting:
-      - Start with a chunk size of 2 words.
-      - For each chunk, if a period (".") is found in any word (except if it’s the very last word),
-        then split the chunk at that word (include words up to that word).
-      - Otherwise, use the current chunk size.
-      - For subsequent chunks, increase the chunk size by 2.
-      - If there are fewer than the desired number of words for a full chunk, add all remaining words.
-    """
-    words = text.split()
-    chunks = []
-    chunk_size = 2
-    start = 0
-    while start < len(words):
-        candidate_end = start + chunk_size
-        if candidate_end > len(words):
-            candidate_end = len(words)
-        chunk_words = words[start:candidate_end]
-        # Look for a period in any word except the last one.
-        split_index = None
-        for i in range(len(chunk_words) - 1):
-            if '.' in chunk_words[i]:
-                split_index = i
-                break
-        if split_index is not None:
-            candidate_end = start + split_index + 1
-            chunk_words = words[start:candidate_end]
-        chunks.append(" ".join(chunk_words))
-        start = candidate_end
-        chunk_size += 2  # Increase the chunk size by 2 for the next iteration.
-    return chunks
 def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
     """
     Convert a torch.FloatTensor (with values in [-1, 1]) to raw 16-bit PCM bytes.
@@ -131,17 +97,17 @@ def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24
 # Endpoints
 # ------------------------------------------------------------------------------
-@app.get("/tts/streaming", summary="Streaming TTS")
 def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
     """
-    Streaming TTS endpoint that returns a continuous audio stream.
-    Supports WAV (PCM) and Opus formats.  Opus offers significantly better compression.
     The endpoint first yields a WAV header (with a dummy length) for WAV,
-    then yields encoded audio data for each text chunk as soon as it is generated.
     """
-    # Split the input text using the custom doubling strategy.
-    chunks = custom_split_text(text)
     sample_rate = 24000
     num_channels = 1
     sample_width = 2  # 16-bit PCM
@@ -151,24 +117,22 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
             # Yield the WAV header first.
             header = generate_wav_header(sample_rate, num_channels, sample_width)
             yield header
-        # Process and yield each chunk's audio data.
-        for i, chunk in enumerate(chunks):
-            print(f"Processing chunk {i}: {chunk}")  # Debugging
-            try:
-                results = list(pipeline(chunk, voice=voice, speed=speed, split_pattern=None))
-                for result in results:
-                    if result.audio is not None:
-                        if format.lower() == "wav":
-                            yield audio_tensor_to_pcm_bytes(result.audio)
-                        elif format.lower() == "opus":
-                            yield audio_tensor_to_opus_bytes(result.audio, sample_rate=sample_rate)
-                        else:
-                            raise ValueError(f"Unsupported audio format: {format}")
                     else:
-                        print(f"Chunk {i}: No audio generated")
-            except Exception as e:
-                print(f"Error processing chunk {i}: {e}")
-                yield b'' # important so that streaming continues.  Consider returning an error sound.
     media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"

     return header + fmt_chunk + data_chunk_header
 def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
     """
     Convert a torch.FloatTensor (with values in [-1, 1]) to raw 16-bit PCM bytes.
 # Endpoints
 # ------------------------------------------------------------------------------
+@app.get("/tts/streaming", summary="True Streaming TTS")
 def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
     """
+    True Streaming TTS endpoint that returns a continuous audio stream.
+    It processes text and generates audio token by token (or small chunks as KPipeline yields),
+    providing a more responsive streaming experience.
+    Supports WAV (PCM) and Opus formats. Opus offers significantly better compression.
     The endpoint first yields a WAV header (with a dummy length) for WAV,
+    then yields encoded audio data for each token's audio as soon as it is generated.
     """
     sample_rate = 24000
     num_channels = 1
     sample_width = 2  # 16-bit PCM
             # Yield the WAV header first.
             header = generate_wav_header(sample_rate, num_channels, sample_width)
             yield header
+        try:
+            results = pipeline(text, voice=voice, speed=speed, split_pattern=None) # split_pattern=None to avoid splitting here, let KPipeline handle
+            for result in results:
+                if result.audio is not None:
+                    if format.lower() == "wav":
+                        yield audio_tensor_to_pcm_bytes(result.audio)
+                    elif format.lower() == "opus":
+                        yield audio_tensor_to_opus_bytes(result.audio, sample_rate=sample_rate)
                     else:
+                        raise ValueError(f"Unsupported audio format: {format}")
+                else:
+                    print("No audio generated for a token/chunk") # Debugging, remove in production if not needed
+        except Exception as e:
+            print(f"Error during TTS processing: {e}")
+            yield b''  # Important: yield empty bytes to keep stream alive, or handle error sound
     media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"