bcci committed on
Commit
e76bbe1
·
verified ·
1 Parent(s): 01f0881

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -14
app.py CHANGED
@@ -31,7 +31,7 @@ phoneme_vocab = config['vocab']
31
  # Download the model and voice files from Hugging Face Hub
32
  # ------------------------------------------------------------------------------
33
  model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
34
- model_name = "onnx/model_q8f16.onnx"
35
  voice_file_pattern = "*.bin"
36
  local_dir = "."
37
  snapshot_download(
@@ -72,6 +72,7 @@ def generate_wav_header(sample_rate: int, num_channels: int, sample_width: int,
72
  data_chunk_header = struct.pack('<4sI', b'data', data_size)
73
  return header + fmt_chunk + data_chunk_header
74
 
 
75
 
76
  def custom_split_text(text: str) -> list:
77
  """
@@ -102,7 +103,7 @@ def custom_split_text(text: str) -> list:
102
  chunk_words = words[start:candidate_end]
103
  chunks.append(" ".join(chunk_words))
104
  start = candidate_end
105
- chunk_size += 1
106
  return chunks
107
 
108
 
@@ -151,11 +152,9 @@ def tokenizer(text: str):
151
  """
152
  Converts text to a list of phoneme tokens using the global vocabulary.
153
  """
154
- print("Text: " + text)
155
  phonemes_string, _ = g2p(text)
156
  phonemes = [ph for ph in phonemes_string]
157
  tokens = [phoneme_vocab[phoneme] for phoneme in phonemes if phoneme in phoneme_vocab]
158
- print("Tokens:", tokens)
159
  return tokens
160
 
161
 
@@ -164,7 +163,7 @@ def tokenizer(text: str):
164
  # ------------------------------------------------------------------------------
165
 
166
  @app.get("/tts/streaming", summary="Streaming TTS")
167
- def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
168
  """
169
  Streaming TTS endpoint.
170
 
@@ -177,9 +176,6 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
177
  The audio for each chunk is generated immediately and streamed to the client.
178
  """
179
  chunks = custom_split_text(text)
180
- sample_rate = 24000
181
- num_channels = 1
182
- sample_width = 2
183
 
184
  # Load the voice/style file (must be present in voices/{voice}.bin)
185
  voice_path = os.path.join(local_dir, f"voices/{voice}.bin")
@@ -190,23 +186,21 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
190
  def audio_generator():
191
  # If outputting a WAV stream, yield a WAV header once.
192
  if format.lower() == "wav":
193
- header = generate_wav_header(sample_rate, num_channels, sample_width)
194
- yield header
195
 
196
  prev_last_token = None
197
  for i, chunk in enumerate(chunks):
198
- print(f"Processing chunk {i}: {chunk}")
199
  # Convert the chunk text to tokens.
200
  chunk_tokens = tokenizer(chunk)
201
 
202
  # For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
203
  if i == 0:
204
- tokens_to_send = [0] + chunk_tokens + [0]
205
  else:
206
- tokens_to_send = [prev_last_token] + chunk_tokens + [0]
207
 
208
  # Save the last token of this chunk for the next iteration.
209
- prev_last_token = tokens_to_send[-2]
210
 
211
  # Prepare the model input (a batch of one sequence).
212
  final_token = [tokens_to_send]
 
31
  # Download the model and voice files from Hugging Face Hub
32
  # ------------------------------------------------------------------------------
33
  model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
34
+ model_name = "onnx/model_q4f16.onnx"
35
  voice_file_pattern = "*.bin"
36
  local_dir = "."
37
  snapshot_download(
 
72
  data_chunk_header = struct.pack('<4sI', b'data', data_size)
73
  return header + fmt_chunk + data_chunk_header
74
 
75
+ stream_header = generate_wav_header(24000, 1, 2)
76
 
77
  def custom_split_text(text: str) -> list:
78
  """
 
103
  chunk_words = words[start:candidate_end]
104
  chunks.append(" ".join(chunk_words))
105
  start = candidate_end
106
+ chunk_size += 2
107
  return chunks
108
 
109
 
 
152
  """
153
  Converts text to a list of phoneme tokens using the global vocabulary.
154
  """
 
155
  phonemes_string, _ = g2p(text)
156
  phonemes = [ph for ph in phonemes_string]
157
  tokens = [phoneme_vocab[phoneme] for phoneme in phonemes if phoneme in phoneme_vocab]
 
158
  return tokens
159
 
160
 
 
163
  # ------------------------------------------------------------------------------
164
 
165
  @app.get("/tts/streaming", summary="Streaming TTS")
166
+ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "wav"):
167
  """
168
  Streaming TTS endpoint.
169
 
 
176
  The audio for each chunk is generated immediately and streamed to the client.
177
  """
178
  chunks = custom_split_text(text)
 
 
 
179
 
180
  # Load the voice/style file (must be present in voices/{voice}.bin)
181
  voice_path = os.path.join(local_dir, f"voices/{voice}.bin")
 
186
  def audio_generator():
187
  # If outputting a WAV stream, yield a WAV header once.
188
  if format.lower() == "wav":
189
+ yield stream_header
 
190
 
191
  prev_last_token = None
192
  for i, chunk in enumerate(chunks):
 
193
  # Convert the chunk text to tokens.
194
  chunk_tokens = tokenizer(chunk)
195
 
196
  # For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
197
  if i == 0:
198
+ tokens_to_send = [0] + chunk_tokens
199
  else:
200
+ tokens_to_send = [prev_last_token] + chunk_tokens
201
 
202
  # Save the last token of this chunk for the next iteration.
203
+ prev_last_token = tokens_to_send[-1]
204
 
205
  # Prepare the model input (a batch of one sequence).
206
  final_token = [tokens_to_send]