bcci committed on
Commit
ada1283
·
verified ·
1 Parent(s): 3ab2b52

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -116
app.py CHANGED
@@ -42,9 +42,9 @@ def custom_split_text(text: str) -> list:
42
  Custom splitting:
43
  - Start with a chunk size of 2 words.
44
  - For each chunk, if a period (".") is found in any word (except if it’s the very last word),
45
- then split the chunk at that word (i.e. include words up to and including that word).
46
  - Otherwise, use the current chunk size.
47
- - For subsequent chunks, increase the chunk size by 2 (i.e. 2, 4, 6, …).
48
  - If there are fewer than the desired number of words for a full chunk, add all remaining words.
49
  """
50
  words = text.split()
@@ -56,26 +56,24 @@ def custom_split_text(text: str) -> list:
56
  if candidate_end > len(words):
57
  candidate_end = len(words)
58
  chunk_words = words[start:candidate_end]
59
- # Look for a period in the chunk (from right to left)
60
  split_index = None
61
- for i in reversed(range(len(chunk_words))):
62
  if '.' in chunk_words[i]:
63
  split_index = i
64
  break
65
- if split_index is not None and split_index != len(chunk_words) - 1:
66
- # If a period is found and it’s not the last word in the chunk,
67
- # adjust the chunk so it ends at that word.
68
  candidate_end = start + split_index + 1
69
  chunk_words = words[start:candidate_end]
70
  chunks.append(" ".join(chunk_words))
71
  start = candidate_end
72
- chunk_size += 2 # Increase by 2 (added, not multiplied)
73
  return chunks
74
 
75
 
76
  def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
77
  """
78
- Convert a torch.FloatTensor (with values assumed in [-1, 1]) to raw 16-bit PCM bytes.
79
  """
80
  audio_np = audio_tensor.cpu().numpy()
81
  if audio_np.ndim > 1:
@@ -93,7 +91,7 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0):
93
  """
94
  Streaming TTS endpoint that returns a continuous WAV stream.
95
 
96
- The endpoint first yields a WAV header (with a dummy length) then yields raw PCM data
97
  for each text chunk as soon as it is generated.
98
  """
99
  chunks = custom_split_text(text)
@@ -118,6 +116,7 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0):
118
  print(f"Chunk {i}: No audio generated")
119
  except Exception as e:
120
  print(f"Error processing chunk {i}: {e}")
 
121
  return StreamingResponse(
122
  audio_generator(),
123
  media_type="audio/wav",
@@ -147,7 +146,7 @@ def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0):
147
 
148
  sample_rate = 24000
149
  num_channels = 1
150
- sample_width = 2 # 16-bit PCM
151
  wav_io = io.BytesIO()
152
  with wave.open(wav_io, "wb") as wav_file:
153
  wav_file.setnchannels(num_channels)
@@ -163,14 +162,13 @@ def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0):
163
  def index():
164
  """
165
  HTML demo page for Kokoro TTS.
166
-
167
  Two playback methods are provided:
168
- - "Play Full TTS" uses a standard <audio> element.
169
- - "Play Streaming TTS" uses the Web Audio API (via a ScriptProcessorNode) to stream
170
- the raw PCM data as it arrives. This method first reads the WAV header (44 bytes)
171
- then continuously pulls in PCM data, converts it to Float32, and plays it.
172
  """
173
- return r"""
174
  <!DOCTYPE html>
175
  <html>
176
  <head>
@@ -183,109 +181,29 @@ def index():
183
  <input type="text" id="voice" value="af_heart"><br>
184
  <label for="speed">Speed:</label>
185
  <input type="number" step="0.1" id="speed" value="1.0"><br><br>
186
- <button onclick="startStreaming()">Play Streaming TTS (Web Audio API)</button>
187
- <button onclick="playFull()">Play Full TTS (Standard Audio)</button>
188
  <br><br>
189
- <audio id="fullAudio" controls></audio>
190
  <script>
191
- // Function to play full TTS by simply setting the <audio> element's source.
192
- function playFull() {
193
- const text = document.getElementById('text').value;
194
- const voice = document.getElementById('voice').value;
195
- const speed = document.getElementById('speed').value;
196
- const audio = document.getElementById('fullAudio');
197
- audio.src = `/tts/full?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}`;
198
- audio.play();
199
- }
200
-
201
- // Function to stream audio using the Web Audio API.
202
- async function startStreaming() {
203
- const text = document.getElementById('text').value;
204
- const voice = document.getElementById('voice').value;
205
- const speed = document.getElementById('speed').value;
206
- const response = await fetch(`/tts/streaming?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}`);
207
- if (!response.body) {
208
- alert("Streaming not supported in this browser.");
209
- return;
210
- }
211
-
212
- const reader = response.body.getReader();
213
- const audioContext = new (window.AudioContext || window.webkitAudioContext)();
214
- // Create a ScriptProcessorNode (buffer size of 4096 samples)
215
- const scriptNode = audioContext.createScriptProcessor(4096, 1, 1);
216
- let bufferQueue = [];
217
- let currentBuffer = new Float32Array(0);
218
- let headerRead = false;
219
- let headerBytes = new Uint8Array(0);
220
-
221
- // Helper: Convert Int16 PCM (little-endian) to Float32.
222
- function int16ToFloat32(buffer) {
223
- const len = buffer.length;
224
- const floatBuffer = new Float32Array(len);
225
- for (let i = 0; i < len; i++) {
226
- floatBuffer[i] = buffer[i] / 32767;
227
- }
228
- return floatBuffer;
229
  }
230
-
231
- scriptNode.onaudioprocess = function(e) {
232
- const output = e.outputBuffer.getChannelData(0);
233
- let offset = 0;
234
- while (offset < output.length) {
235
- if (currentBuffer.length === 0) {
236
- if (bufferQueue.length > 0) {
237
- currentBuffer = bufferQueue.shift();
238
- } else {
239
- // If no data is available, output silence.
240
- for (let i = offset; i < output.length; i++) {
241
- output[i] = 0;
242
- }
243
- break;
244
- }
245
- }
246
- const needed = output.length - offset;
247
- const available = currentBuffer.length;
248
- const toCopy = Math.min(needed, available);
249
- output.set(currentBuffer.slice(0, toCopy), offset);
250
- offset += toCopy;
251
- if (toCopy < currentBuffer.length) {
252
- currentBuffer = currentBuffer.slice(toCopy);
253
- } else {
254
- currentBuffer = new Float32Array(0);
255
- }
256
- }
257
- };
258
- scriptNode.connect(audioContext.destination);
259
-
260
- // Read the response stream.
261
- while (true) {
262
- const { done, value } = await reader.read();
263
- if (done) break;
264
- let chunk = value;
265
- // First, accumulate the 44-byte WAV header.
266
- if (!headerRead) {
267
- let combined = new Uint8Array(headerBytes.length + chunk.length);
268
- combined.set(headerBytes);
269
- combined.set(chunk, headerBytes.length);
270
- if (combined.length >= 44) {
271
- headerBytes = combined.slice(0, 44);
272
- headerRead = true;
273
- // Remove the header bytes from the chunk.
274
- chunk = combined.slice(44);
275
- } else {
276
- headerBytes = combined;
277
- continue;
278
- }
279
- }
280
- // Make sure the chunk length is even (2 bytes per sample).
281
- if (chunk.length % 2 !== 0) {
282
- chunk = chunk.slice(0, chunk.length - 1);
283
- }
284
- const int16Buffer = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.byteLength / 2);
285
- const floatBuffer = int16ToFloat32(int16Buffer);
286
- bufferQueue.push(floatBuffer);
287
  }
288
- }
289
  </script>
290
  </body>
291
  </html>
 
42
  Custom splitting:
43
  - Start with a chunk size of 2 words.
44
  - For each chunk, if a period (".") is found in any word (except if it’s the very last word),
45
+ then split the chunk at that word (include words up to and including that word).
46
  - Otherwise, use the current chunk size.
47
+ - For subsequent chunks, increase the chunk size by 2.
48
  - If there are fewer than the desired number of words for a full chunk, add all remaining words.
49
  """
50
  words = text.split()
 
56
  if candidate_end > len(words):
57
  candidate_end = len(words)
58
  chunk_words = words[start:candidate_end]
59
+ # Look for a period in any word except the last one.
60
  split_index = None
61
+ for i in range(len(chunk_words) - 1):
62
  if '.' in chunk_words[i]:
63
  split_index = i
64
  break
65
+ if split_index is not None:
 
 
66
  candidate_end = start + split_index + 1
67
  chunk_words = words[start:candidate_end]
68
  chunks.append(" ".join(chunk_words))
69
  start = candidate_end
70
+ chunk_size += 2 # Increase the chunk size by 2 for the next iteration.
71
  return chunks
72
 
73
 
74
  def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
75
  """
76
+ Convert a torch.FloatTensor (with values in [-1, 1]) to raw 16-bit PCM bytes.
77
  """
78
  audio_np = audio_tensor.cpu().numpy()
79
  if audio_np.ndim > 1:
 
91
  """
92
  Streaming TTS endpoint that returns a continuous WAV stream.
93
 
94
+ This endpoint first yields a WAV header (with a dummy data length) and then yields raw PCM data
95
  for each text chunk as soon as it is generated.
96
  """
97
  chunks = custom_split_text(text)
 
116
  print(f"Chunk {i}: No audio generated")
117
  except Exception as e:
118
  print(f"Error processing chunk {i}: {e}")
119
+
120
  return StreamingResponse(
121
  audio_generator(),
122
  media_type="audio/wav",
 
146
 
147
  sample_rate = 24000
148
  num_channels = 1
149
+ sample_width = 2 # 16-bit PCM -> 2 bytes per sample
150
  wav_io = io.BytesIO()
151
  with wave.open(wav_io, "wb") as wav_file:
152
  wav_file.setnchannels(num_channels)
 
162
  def index():
163
  """
164
  HTML demo page for Kokoro TTS.
165
+
166
  Two playback methods are provided:
167
+ - "Play Streaming TTS" sets the <audio> element's src to the streaming endpoint.
168
+ - "Play Full TTS" sets the <audio> element's src to the full synthesis endpoint.
169
+ The browser’s native playback handles streaming (progressive download) of the WAV data.
 
170
  """
171
+ return """
172
  <!DOCTYPE html>
173
  <html>
174
  <head>
 
181
  <input type="text" id="voice" value="af_heart"><br>
182
  <label for="speed">Speed:</label>
183
  <input type="number" step="0.1" id="speed" value="1.0"><br><br>
184
+ <button onclick="playStreaming()">Play Streaming TTS</button>
185
+ <button onclick="playFull()">Play Full TTS</button>
186
  <br><br>
187
+ <audio id="audioPlayer" controls autoplay></audio>
188
  <script>
189
+ function playStreaming() {
190
+ const text = document.getElementById('text').value;
191
+ const voice = document.getElementById('voice').value;
192
+ const speed = document.getElementById('speed').value;
193
+ const audio = document.getElementById('audioPlayer');
194
+ // Simply point the audio element to the streaming endpoint.
195
+ audio.src = `/tts/streaming?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}`;
196
+ audio.play();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  }
198
+ function playFull() {
199
+ const text = document.getElementById('text').value;
200
+ const voice = document.getElementById('voice').value;
201
+ const speed = document.getElementById('speed').value;
202
+ const audio = document.getElementById('audioPlayer');
203
+ // Simply point the audio element to the full synthesis endpoint.
204
+ audio.src = `/tts/full?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}&speed=${speed}`;
205
+ audio.play();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  }
 
207
  </script>
208
  </body>
209
  </html>