bcci committed
Commit 9a74dea · verified · 1 Parent(s): 25bd1c6

Update app.py

Files changed (1):
  1. app.py +7 -11
app.py CHANGED
@@ -31,7 +31,7 @@ phoneme_vocab = config['vocab']
 # Download the model and voice files from Hugging Face Hub
 # ------------------------------------------------------------------------------
 model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
-model_name = "onnx/model_q8f16.onnx"
+model_name = "onnx/model_quantized.onnx"
 voice_file_pattern = "*.bin"
 local_dir = "."
 snapshot_download(
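This hunk swaps the q8f16 ONNX weights for the plain quantized variant. The snapshot_download call itself is truncated in the diff; the sketch below shows one plausible way these variables feed into it, assuming huggingface_hub's snapshot_download. The allow_patterns wiring is an assumption, not the app's exact code.

# A minimal sketch (assumed wiring, not the exact call from app.py):
# fetch only the quantized model and the voice files into local_dir.
from huggingface_hub import snapshot_download

model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
model_name = "onnx/model_quantized.onnx"
voice_file_pattern = "*.bin"
local_dir = "."

snapshot_download(
    repo_id=model_repo,
    allow_patterns=[model_name, voice_file_pattern],
    local_dir=local_dir,
)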
@@ -102,7 +102,7 @@ def custom_split_text(text: str) -> list:
         chunk_words = words[start:candidate_end]
         chunks.append(" ".join(chunk_words))
         start = candidate_end
-        chunk_size += 2
+        chunk_size += 1
     return chunks
 
 
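custom_split_text emits progressively larger chunks so the first audio arrives quickly; this commit slows the growth from two extra words per chunk to one. A minimal sketch of the splitter under that reading, with an assumed starting chunk_size and surrounding loop (neither is shown in this hunk):

# A minimal sketch of the growing-chunk splitter; the initial chunk_size
# and the while-loop framing are assumptions, only the loop body appears
# in the diff.
def custom_split_text(text: str) -> list:
    words = text.split()
    chunks = []
    chunk_size = 2  # assumed starting size
    start = 0
    while start < len(words):
        candidate_end = min(start + chunk_size, len(words))
        chunk_words = words[start:candidate_end]
        chunks.append(" ".join(chunk_words))
        start = candidate_end
        chunk_size += 1  # grow each successive chunk by one word (was += 2)
    return chunks

# e.g. "one two three four five six" -> ["one two", "three four five", "six"]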
@@ -201,16 +201,12 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
 
         # For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
         if i == 0:
-            tokens_to_send = [0] + chunk_tokens
+            tokens_to_send = [0] + chunk_tokens + [0]
         else:
-            tokens_to_send = [prev_last_token] + chunk_tokens
-
-        # If this is the final chunk, append 0.
-        if i == len(chunks) - 1:
-            tokens_to_send = tokens_to_send + [0]
+            tokens_to_send = [prev_last_token] + chunk_tokens + [0]
 
         # Save the last token of this chunk for the next iteration.
-        prev_last_token = tokens_to_send[-1]
+        prev_last_token = tokens_to_send[-2]
 
         # Prepare the model input (a batch of one sequence).
         final_token = [tokens_to_send]
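After this change, every chunk ends with the terminator token 0 rather than only the final one, and the first chunk is padded on both ends. Because the trailing element is now always 0, the carried-over prev_last_token must come from index -2 to pick up the last real phoneme token. An illustrative walk-through (the token ids are made up):

# Illustrative values only; 17, 43, 5, 12, 9 stand in for phoneme token ids.
chunk_tokens_0 = [17, 43, 5]
chunk_tokens_1 = [12, 9]

# First chunk (i == 0): pad on both ends with 0.
tokens_to_send = [0] + chunk_tokens_0 + [0]    # [0, 17, 43, 5, 0]
prev_last_token = tokens_to_send[-2]           # 5, the last real token (index -1 is the pad)

# Later chunks: lead with the previous chunk's last real token, end with 0.
tokens_to_send = [prev_last_token] + chunk_tokens_1 + [0]   # [5, 12, 9, 0]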
@@ -242,11 +238,11 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
         audio_int16 = (audio_output * 32767).astype(np.int16).flatten()
 
         # Convert to a torch tensor (back into float range) for our helper functions.
-        audio_tensor = torch.from_numpy(audio_int16.astype(np.float32) / 32767)
+        # audio_tensor = torch.from_numpy(audio_int16.astype(np.float32) / 32767)
 
         # Yield the encoded audio chunk.
         if format.lower() == "wav":
-            yield audio_tensor_to_pcm_bytes(audio_tensor)
+            yield audio_int16
         elif format.lower() == "opus":
             yield audio_tensor_to_opus_bytes(audio_tensor, sample_rate=sample_rate)
         else:
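The WAV branch now yields the raw int16 NumPy samples directly instead of bytes from audio_tensor_to_pcm_bytes. Note that the opus branch still references audio_tensor, which this commit comments out, so that path would raise a NameError if selected. A hypothetical consumer-side helper (int16_to_wav_bytes is not part of app.py) that wraps the yielded samples into an in-memory WAV using only the standard library:

# Hypothetical helper, not from app.py: wrap raw int16 samples in a WAV
# container. The 24000 Hz default is an assumption and must match the
# sample rate the model actually produces.
import io
import wave

import numpy as np

def int16_to_wav_bytes(samples: np.ndarray, sample_rate: int = 24000) -> bytes:
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(1)                 # mono output
        wf.setsampwidth(2)                 # int16 -> 2 bytes per sample
        wf.setframerate(sample_rate)
        wf.writeframes(samples.tobytes())
    return buf.getvalue()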