Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -31,7 +31,7 @@ phoneme_vocab = config['vocab']
|
|
31 |
# Download the model and voice files from Hugging Face Hub
|
32 |
# ------------------------------------------------------------------------------
|
33 |
model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
|
34 |
-
model_name = "onnx/
|
35 |
voice_file_pattern = "*.bin"
|
36 |
local_dir = "."
|
37 |
snapshot_download(
|
@@ -102,7 +102,7 @@ def custom_split_text(text: str) -> list:
|
|
102 |
chunk_words = words[start:candidate_end]
|
103 |
chunks.append(" ".join(chunk_words))
|
104 |
start = candidate_end
|
105 |
-
chunk_size +=
|
106 |
return chunks
|
107 |
|
108 |
|
@@ -201,16 +201,12 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
|
|
201 |
|
202 |
# For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
|
203 |
if i == 0:
|
204 |
-
tokens_to_send = [0] + chunk_tokens
|
205 |
else:
|
206 |
-
tokens_to_send = [prev_last_token] + chunk_tokens
|
207 |
-
|
208 |
-
# If this is the final chunk, append 0.
|
209 |
-
if i == len(chunks) - 1:
|
210 |
-
tokens_to_send = tokens_to_send + [0]
|
211 |
|
212 |
# Save the last token of this chunk for the next iteration.
|
213 |
-
prev_last_token = tokens_to_send[-
|
214 |
|
215 |
# Prepare the model input (a batch of one sequence).
|
216 |
final_token = [tokens_to_send]
|
@@ -242,11 +238,11 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
|
|
242 |
audio_int16 = (audio_output * 32767).astype(np.int16).flatten()
|
243 |
|
244 |
# Convert to a torch tensor (back into float range) for our helper functions.
|
245 |
-
audio_tensor = torch.from_numpy(audio_int16.astype(np.float32) / 32767)
|
246 |
|
247 |
# Yield the encoded audio chunk.
|
248 |
if format.lower() == "wav":
|
249 |
-
yield
|
250 |
elif format.lower() == "opus":
|
251 |
yield audio_tensor_to_opus_bytes(audio_tensor, sample_rate=sample_rate)
|
252 |
else:
|
|
|
31 |
# Download the model and voice files from Hugging Face Hub
|
32 |
# ------------------------------------------------------------------------------
|
33 |
model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
|
34 |
+
model_name = "onnx/model_quantized.onnx"
|
35 |
voice_file_pattern = "*.bin"
|
36 |
local_dir = "."
|
37 |
snapshot_download(
|
|
|
102 |
chunk_words = words[start:candidate_end]
|
103 |
chunks.append(" ".join(chunk_words))
|
104 |
start = candidate_end
|
105 |
+
chunk_size += 1
|
106 |
return chunks
|
107 |
|
108 |
|
|
|
201 |
|
202 |
# For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
|
203 |
if i == 0:
|
204 |
+
tokens_to_send = [0] + chunk_tokens + [0]
|
205 |
else:
|
206 |
+
tokens_to_send = [prev_last_token] + chunk_tokens + [0]
|
|
|
|
|
|
|
|
|
207 |
|
208 |
# Save the last token of this chunk for the next iteration.
|
209 |
+
prev_last_token = tokens_to_send[-2]
|
210 |
|
211 |
# Prepare the model input (a batch of one sequence).
|
212 |
final_token = [tokens_to_send]
|
|
|
238 |
audio_int16 = (audio_output * 32767).astype(np.int16).flatten()
|
239 |
|
240 |
# Convert to a torch tensor (back into float range) for our helper functions.
|
241 |
+
# audio_tensor = torch.from_numpy(audio_int16.astype(np.float32) / 32767)
|
242 |
|
243 |
# Yield the encoded audio chunk.
|
244 |
if format.lower() == "wav":
|
245 |
+
yield audio_int16
|
246 |
elif format.lower() == "opus":
|
247 |
yield audio_tensor_to_opus_bytes(audio_tensor, sample_rate=sample_rate)
|
248 |
else:
|