kokoro-onnx-api-test

Running

App Files Community

bcci committed on Feb 8

Commit

e76bbe1

verified ·

1 Parent(s): 01f0881

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -14

app.py CHANGED Viewed

@@ -31,7 +31,7 @@ phoneme_vocab = config['vocab']
 # Download the model and voice files from Hugging Face Hub
 # ------------------------------------------------------------------------------
 model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
-model_name = "onnx/model_q8f16.onnx"
 voice_file_pattern = "*.bin"
 local_dir = "."
 snapshot_download(
@@ -72,6 +72,7 @@ def generate_wav_header(sample_rate: int, num_channels: int, sample_width: int,
     data_chunk_header = struct.pack('<4sI', b'data', data_size)
     return header + fmt_chunk + data_chunk_header
 def custom_split_text(text: str) -> list:
     """
@@ -102,7 +103,7 @@ def custom_split_text(text: str) -> list:
             chunk_words = words[start:candidate_end]
         chunks.append(" ".join(chunk_words))
         start = candidate_end
-        chunk_size += 1
     return chunks
@@ -151,11 +152,9 @@ def tokenizer(text: str):
     """
     Converts text to a list of phoneme tokens using the global vocabulary.
     """
-    print("Text: " + text)
     phonemes_string, _ = g2p(text)
     phonemes = [ph for ph in phonemes_string]
     tokens = [phoneme_vocab[phoneme] for phoneme in phonemes if phoneme in phoneme_vocab]
-    print("Tokens:", tokens)
     return tokens
@@ -164,7 +163,7 @@ def tokenizer(text: str):
 # ------------------------------------------------------------------------------
 @app.get("/tts/streaming", summary="Streaming TTS")
-def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
     """
     Streaming TTS endpoint.
@@ -177,9 +176,6 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
     The audio for each chunk is generated immediately and streamed to the client.
     """
     chunks = custom_split_text(text)
-    sample_rate = 24000
-    num_channels = 1
-    sample_width = 2
     # Load the voice/style file (must be present in voices/{voice}.bin)
     voice_path = os.path.join(local_dir, f"voices/{voice}.bin")
@@ -190,23 +186,21 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
     def audio_generator():
         # If outputting a WAV stream, yield a WAV header once.
         if format.lower() == "wav":
-            header = generate_wav_header(sample_rate, num_channels, sample_width)
-            yield header
         prev_last_token = None
         for i, chunk in enumerate(chunks):
-            print(f"Processing chunk {i}: {chunk}")
             # Convert the chunk text to tokens.
             chunk_tokens = tokenizer(chunk)
             # For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
             if i == 0:
-                tokens_to_send = [0] + chunk_tokens + [0]
             else:
-                tokens_to_send = [prev_last_token] + chunk_tokens + [0]
             # Save the last token of this chunk for the next iteration.
-            prev_last_token = tokens_to_send[-2]
             # Prepare the model input (a batch of one sequence).
             final_token = [tokens_to_send]

 # Download the model and voice files from Hugging Face Hub
 # ------------------------------------------------------------------------------
 model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
+model_name = "onnx/model_q4f16.onnx"
 voice_file_pattern = "*.bin"
 local_dir = "."
 snapshot_download(
     data_chunk_header = struct.pack('<4sI', b'data', data_size)
     return header + fmt_chunk + data_chunk_header
+stream_header = generate_wav_header(24000, 1, 2)
 def custom_split_text(text: str) -> list:
     """
             chunk_words = words[start:candidate_end]
         chunks.append(" ".join(chunk_words))
         start = candidate_end
+        chunk_size += 2
     return chunks
     """
     Converts text to a list of phoneme tokens using the global vocabulary.
     """
     phonemes_string, _ = g2p(text)
     phonemes = [ph for ph in phonemes_string]
     tokens = [phoneme_vocab[phoneme] for phoneme in phonemes if phoneme in phoneme_vocab]
     return tokens
 # ------------------------------------------------------------------------------
 @app.get("/tts/streaming", summary="Streaming TTS")
+def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "wav"):
     """
     Streaming TTS endpoint.
     The audio for each chunk is generated immediately and streamed to the client.
     """
     chunks = custom_split_text(text)
     # Load the voice/style file (must be present in voices/{voice}.bin)
     voice_path = os.path.join(local_dir, f"voices/{voice}.bin")
     def audio_generator():
         # If outputting a WAV stream, yield a WAV header once.
         if format.lower() == "wav":
+            yield stream_header
         prev_last_token = None
         for i, chunk in enumerate(chunks):
             # Convert the chunk text to tokens.
             chunk_tokens = tokenizer(chunk)
             # For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
             if i == 0:
+                tokens_to_send = [0] + chunk_tokens
             else:
+                tokens_to_send = [prev_last_token] + chunk_tokens
             # Save the last token of this chunk for the next iteration.
+            prev_last_token = tokens_to_send[-1]
             # Prepare the model input (a batch of one sequence).
             final_token = [tokens_to_send]