Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -31,7 +31,7 @@ phoneme_vocab = config['vocab']
|
|
31 |
# Download the model and voice files from Hugging Face Hub
|
32 |
# ------------------------------------------------------------------------------
|
33 |
model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
|
34 |
-
model_name = "onnx/
|
35 |
voice_file_pattern = "*.bin"
|
36 |
local_dir = "."
|
37 |
snapshot_download(
|
@@ -72,6 +72,7 @@ def generate_wav_header(sample_rate: int, num_channels: int, sample_width: int,
|
|
72 |
data_chunk_header = struct.pack('<4sI', b'data', data_size)
|
73 |
return header + fmt_chunk + data_chunk_header
|
74 |
|
|
|
75 |
|
76 |
def custom_split_text(text: str) -> list:
|
77 |
"""
|
@@ -102,7 +103,7 @@ def custom_split_text(text: str) -> list:
|
|
102 |
chunk_words = words[start:candidate_end]
|
103 |
chunks.append(" ".join(chunk_words))
|
104 |
start = candidate_end
|
105 |
-
chunk_size +=
|
106 |
return chunks
|
107 |
|
108 |
|
@@ -151,11 +152,9 @@ def tokenizer(text: str):
|
|
151 |
"""
|
152 |
Converts text to a list of phoneme tokens using the global vocabulary.
|
153 |
"""
|
154 |
-
print("Text: " + text)
|
155 |
phonemes_string, _ = g2p(text)
|
156 |
phonemes = [ph for ph in phonemes_string]
|
157 |
tokens = [phoneme_vocab[phoneme] for phoneme in phonemes if phoneme in phoneme_vocab]
|
158 |
-
print("Tokens:", tokens)
|
159 |
return tokens
|
160 |
|
161 |
|
@@ -164,7 +163,7 @@ def tokenizer(text: str):
|
|
164 |
# ------------------------------------------------------------------------------
|
165 |
|
166 |
@app.get("/tts/streaming", summary="Streaming TTS")
|
167 |
-
def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "
|
168 |
"""
|
169 |
Streaming TTS endpoint.
|
170 |
|
@@ -177,9 +176,6 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
|
|
177 |
The audio for each chunk is generated immediately and streamed to the client.
|
178 |
"""
|
179 |
chunks = custom_split_text(text)
|
180 |
-
sample_rate = 24000
|
181 |
-
num_channels = 1
|
182 |
-
sample_width = 2
|
183 |
|
184 |
# Load the voice/style file (must be present in voices/{voice}.bin)
|
185 |
voice_path = os.path.join(local_dir, f"voices/{voice}.bin")
|
@@ -190,23 +186,21 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
|
|
190 |
def audio_generator():
|
191 |
# If outputting a WAV stream, yield a WAV header once.
|
192 |
if format.lower() == "wav":
|
193 |
-
|
194 |
-
yield header
|
195 |
|
196 |
prev_last_token = None
|
197 |
for i, chunk in enumerate(chunks):
|
198 |
-
print(f"Processing chunk {i}: {chunk}")
|
199 |
# Convert the chunk text to tokens.
|
200 |
chunk_tokens = tokenizer(chunk)
|
201 |
|
202 |
# For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
|
203 |
if i == 0:
|
204 |
-
tokens_to_send = [0] + chunk_tokens
|
205 |
else:
|
206 |
-
tokens_to_send = [prev_last_token] + chunk_tokens
|
207 |
|
208 |
# Save the last token of this chunk for the next iteration.
|
209 |
-
prev_last_token = tokens_to_send[-
|
210 |
|
211 |
# Prepare the model input (a batch of one sequence).
|
212 |
final_token = [tokens_to_send]
|
|
|
31 |
# Download the model and voice files from Hugging Face Hub
|
32 |
# ------------------------------------------------------------------------------
|
33 |
model_repo = "onnx-community/Kokoro-82M-v1.0-ONNX"
|
34 |
+
model_name = "onnx/model_q4f16.onnx"
|
35 |
voice_file_pattern = "*.bin"
|
36 |
local_dir = "."
|
37 |
snapshot_download(
|
|
|
72 |
data_chunk_header = struct.pack('<4sI', b'data', data_size)
|
73 |
return header + fmt_chunk + data_chunk_header
|
74 |
|
75 |
+
stream_header = generate_wav_header(24000, 1, 2)
|
76 |
|
77 |
def custom_split_text(text: str) -> list:
|
78 |
"""
|
|
|
103 |
chunk_words = words[start:candidate_end]
|
104 |
chunks.append(" ".join(chunk_words))
|
105 |
start = candidate_end
|
106 |
+
chunk_size += 2
|
107 |
return chunks
|
108 |
|
109 |
|
|
|
152 |
"""
|
153 |
Converts text to a list of phoneme tokens using the global vocabulary.
|
154 |
"""
|
|
|
155 |
phonemes_string, _ = g2p(text)
|
156 |
phonemes = [ph for ph in phonemes_string]
|
157 |
tokens = [phoneme_vocab[phoneme] for phoneme in phonemes if phoneme in phoneme_vocab]
|
|
|
158 |
return tokens
|
159 |
|
160 |
|
|
|
163 |
# ------------------------------------------------------------------------------
|
164 |
|
165 |
@app.get("/tts/streaming", summary="Streaming TTS")
|
166 |
+
def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "wav"):
|
167 |
"""
|
168 |
Streaming TTS endpoint.
|
169 |
|
|
|
176 |
The audio for each chunk is generated immediately and streamed to the client.
|
177 |
"""
|
178 |
chunks = custom_split_text(text)
|
|
|
|
|
|
|
179 |
|
180 |
# Load the voice/style file (must be present in voices/{voice}.bin)
|
181 |
voice_path = os.path.join(local_dir, f"voices/{voice}.bin")
|
|
|
186 |
def audio_generator():
|
187 |
# If outputting a WAV stream, yield a WAV header once.
|
188 |
if format.lower() == "wav":
|
189 |
+
yield stream_header
|
|
|
190 |
|
191 |
prev_last_token = None
|
192 |
for i, chunk in enumerate(chunks):
|
|
|
193 |
# Convert the chunk text to tokens.
|
194 |
chunk_tokens = tokenizer(chunk)
|
195 |
|
196 |
# For the first chunk, prepend 0; for later chunks, start with the previous chunk's last token.
|
197 |
if i == 0:
|
198 |
+
tokens_to_send = [0] + chunk_tokens
|
199 |
else:
|
200 |
+
tokens_to_send = [prev_last_token] + chunk_tokens
|
201 |
|
202 |
# Save the last token of this chunk for the next iteration.
|
203 |
+
prev_last_token = tokens_to_send[-1]
|
204 |
|
205 |
# Prepare the model input (a batch of one sequence).
|
206 |
final_token = [tokens_to_send]
|