kokoro-onnx-api-test

Running

App Files Community

bcci committed on Feb 8

Commit

565857a

verified ·

1 Parent(s): d5853ad

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -8

app.py CHANGED Viewed

@@ -94,10 +94,10 @@ def custom_split_text(text: str) -> list:
             candidate_end = len(words)
         chunk_words = words[start:candidate_end]
         split_index = None
-        # for i in range(len(chunk_words) - 1):
-        #     if '.' in chunk_words[i]:
-        #         split_index = i
-        #         break
         if split_index is not None:
             candidate_end = start + split_index + 1
             chunk_words = words[start:candidate_end]
@@ -194,16 +194,18 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
             chunk_tokens = tokenizer(chunk)
             # For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
-            if i == 0:
-                tokens_to_send = [0] + chunk_tokens + [0]
-            else:
-                tokens_to_send =  [0] + prev_last_token + [16] + chunk_tokens + [0]
                 # token_to_send = [0] + chunk_tokens
             # Save the last token of this chunk for the next iteration.
             prev_last_token = chunk_tokens[-1:]
             # Prepare the model input (a batch of one sequence).
             final_token = [tokens_to_send]
             print(final_token)
@@ -232,6 +234,7 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
             # Convert the model output (assumed to be float32 in [-1, 1]) to int16 PCM.
             audio_int16 = (audio_output * 32767).astype(np.int16).flatten()
             # Convert to a torch tensor (back into float range) for our helper functions.
             # audio_tensor = torch.from_numpy(audio_int16.astype(np.float32) / 32767)

             candidate_end = len(words)
         chunk_words = words[start:candidate_end]
         split_index = None
+        for i in range(len(chunk_words) - 1):
+            if '.' in chunk_words[i]:
+                split_index = i
+                break
         if split_index is not None:
             candidate_end = start + split_index + 1
             chunk_words = words[start:candidate_end]
             chunk_tokens = tokenizer(chunk)
             # For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
+            # if i == 0:
+            #     tokens_to_send = [0] + chunk_tokens + [0]
+            # else:
+                # tokens_to_send =  [0] + chunk_tokens + [0]
                 # token_to_send = [0] + chunk_tokens
             # Save the last token of this chunk for the next iteration.
             prev_last_token = chunk_tokens[-1:]
             # Prepare the model input (a batch of one sequence).
+            tokens_to_send =  [0] + chunk_tokens + [0]
             final_token = [tokens_to_send]
             print(final_token)
             # Convert the model output (assumed to be float32 in [-1, 1]) to int16 PCM.
             audio_int16 = (audio_output * 32767).astype(np.int16).flatten()
+            print(audio_int16)
             # Convert to a torch tensor (back into float range) for our helper functions.
             # audio_tensor = torch.from_numpy(audio_int16.astype(np.float32) / 32767)