bcci committed
Commit 9a74dea · verified · 1 Parent(s): 25bd1c6

Update app.py

Files changed (1):
  1. app.py +7 -11
app.py CHANGED
@@ -31,7 +31,7 @@ phoneme_vocab = config['vocab']
 # Download the model and voice files from Hugging Face Hub
 # ------------------------------------------------------------------------------
 model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
-model_name = "onnx/model_q8f16.onnx"
+model_name = "onnx/model_quantized.onnx"
 voice_file_pattern = "*.bin"
 local_dir = "."
 snapshot_download(
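This hunk swaps the q8f16 ONNX weights for the plain quantized variant. The snapshot_download call itself is truncated in the diff; the sketch below shows one plausible way these variables feed into it, assuming huggingface_hub's snapshot_download. The allow_patterns wiring is an assumption, not the app's exact code.

# A minimal sketch (assumed wiring, not the exact call from app.py):
# fetch only the quantized model and the voice files into local_dir.
from huggingface_hub import snapshot_download

model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
model_name = "onnx/model_quantized.onnx"
voice_file_pattern = "*.bin"
local_dir = "."

snapshot_download(
    repo_id=model_repo,
    allow_patterns=[model_name, voice_file_pattern],
    local_dir=local_dir,
)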
@@ -102,7 +102,7 @@ def custom_split_text(text: str) -> list:
         chunk_words = words[start:candidate_end]
         chunks.append(" ".join(chunk_words))
         start = candidate_end
-        chunk_size += 2
+        chunk_size += 1
     return chunks
 
 
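custom_split_text emits progressively larger chunks so the first audio arrives quickly; this commit slows the growth from two extra words per chunk to one. A minimal sketch of the splitter under that reading, with an assumed starting chunk_size and surrounding loop (neither is shown in this hunk):

# A minimal sketch of the growing-chunk splitter; the initial chunk_size
# and the while-loop framing are assumptions, only the loop body appears
# in the diff.
def custom_split_text(text: str) -> list:
    words = text.split()
    chunks = []
    chunk_size = 2  # assumed starting size
    start = 0
    while start < len(words):
        candidate_end = min(start + chunk_size, len(words))
        chunk_words = words[start:candidate_end]
        chunks.append(" ".join(chunk_words))
        start = candidate_end
        chunk_size += 1  # grow each successive chunk by one word (was += 2)
    return chunks

# e.g. "one two three four five six" -> ["one two", "three four five", "six"]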
@@ -201,16 +201,12 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
 
         # For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
         if i == 0:
-            tokens_to_send = [0] + chunk_tokens
+            tokens_to_send = [0] + chunk_tokens + [0]
         else:
-            tokens_to_send = [prev_last_token] + chunk_tokens
-
-        # If this is the final chunk, append 0.
-        if i == len(chunks) - 1:
-            tokens_to_send = tokens_to_send + [0]
+            tokens_to_send = [prev_last_token] + chunk_tokens + [0]
 
         # Save the last token of this chunk for the next iteration.
-        prev_last_token = tokens_to_send[-1]
+        prev_last_token = tokens_to_send[-2]
 
         # Prepare the model input (a batch of one sequence).
         final_token = [tokens_to_send]
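After this change, every chunk ends with the terminator token 0 rather than only the final one, and the first chunk is padded on both ends. Because the trailing element is now always 0, the carried-over prev_last_token must come from index -2 to pick up the last real phoneme token. An illustrative walk-through (the token ids are made up):

# Illustrative values only; 17, 43, 5, 12, 9 stand in for phoneme token ids.
chunk_tokens_0 = [17, 43, 5]
chunk_tokens_1 = [12, 9]

# First chunk (i == 0): pad on both ends with 0.
tokens_to_send = [0] + chunk_tokens_0 + [0]    # [0, 17, 43, 5, 0]
prev_last_token = tokens_to_send[-2]           # 5, the last real token (index -1 is the pad)

# Later chunks: lead with the previous chunk's last real token, end with 0.
tokens_to_send = [prev_last_token] + chunk_tokens_1 + [0]   # [5, 12, 9, 0]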
@@ -242,11 +238,11 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
         audio_int16 = (audio_output * 32767).astype(np.int16).flatten()
 
         # Convert to a torch tensor (back into float range) for our helper functions.
-        audio_tensor = torch.from_numpy(audio_int16.astype(np.float32) / 32767)
+        # audio_tensor = torch.from_numpy(audio_int16.astype(np.float32) / 32767)
 
         # Yield the encoded audio chunk.
         if format.lower() == "wav":
-            yield audio_tensor_to_pcm_bytes(audio_tensor)
+            yield audio_int16
         elif format.lower() == "opus":
             yield audio_tensor_to_opus_bytes(audio_tensor, sample_rate=sample_rate)
         else:
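The WAV branch now yields the raw int16 NumPy samples directly instead of bytes from audio_tensor_to_pcm_bytes. Note that the opus branch still references audio_tensor, which this commit comments out, so that path would raise a NameError if selected. A hypothetical consumer-side helper (int16_to_wav_bytes is not part of app.py) that wraps the yielded samples into an in-memory WAV using only the standard library:

# Hypothetical helper, not from app.py: wrap raw int16 samples in a WAV
# container. The 24000 Hz default is an assumption and must match the
# sample rate the model actually produces.
import io
import wave

import numpy as np

def int16_to_wav_bytes(samples: np.ndarray, sample_rate: int = 24000) -> bytes:
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(1)                 # mono output
        wf.setsampwidth(2)                 # int16 -> 2 bytes per sample
        wf.setframerate(sample_rate)
        wf.writeframes(samples.tobytes())
    return buf.getvalue()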