bcci committed on
Commit
ec5398b
·
verified ·
1 Parent(s): 795d4dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -57
app.py CHANGED
@@ -47,40 +47,6 @@ def generate_wav_header(sample_rate: int, num_channels: int, sample_width: int,
47
  return header + fmt_chunk + data_chunk_header
48
 
49
 
50
- def custom_split_text(text: str) -> list:
51
- """
52
- Custom splitting:
53
- - Start with a chunk size of 2 words.
54
- - For each chunk, if a period (".") is found in any word (except if it’s the very last word),
55
- then split the chunk at that word (include words up to that word).
56
- - Otherwise, use the current chunk size.
57
- - For subsequent chunks, increase the chunk size by 2.
58
- - If there are fewer than the desired number of words for a full chunk, add all remaining words.
59
- """
60
- words = text.split()
61
- chunks = []
62
- chunk_size = 2
63
- start = 0
64
- while start < len(words):
65
- candidate_end = start + chunk_size
66
- if candidate_end > len(words):
67
- candidate_end = len(words)
68
- chunk_words = words[start:candidate_end]
69
- # Look for a period in any word except the last one.
70
- split_index = None
71
- for i in range(len(chunk_words) - 1):
72
- if '.' in chunk_words[i]:
73
- split_index = i
74
- break
75
- if split_index is not None:
76
- candidate_end = start + split_index + 1
77
- chunk_words = words[start:candidate_end]
78
- chunks.append(" ".join(chunk_words))
79
- start = candidate_end
80
- chunk_size += 2 # Increase the chunk size by 2 for the next iteration.
81
- return chunks
82
-
83
-
84
  def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
85
  """
86
  Convert a torch.FloatTensor (with values in [-1, 1]) to raw 16-bit PCM bytes.
@@ -131,17 +97,17 @@ def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24
131
  # Endpoints
132
  # ------------------------------------------------------------------------------
133
 
134
- @app.get("/tts/streaming", summary="Streaming TTS")
135
  def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
136
  """
137
- Streaming TTS endpoint that returns a continuous audio stream.
138
- Supports WAV (PCM) and Opus formats. Opus offers significantly better compression.
 
 
139
 
140
  The endpoint first yields a WAV header (with a dummy length) for WAV,
141
- then yields encoded audio data for each text chunk as soon as it is generated.
142
  """
143
- # Split the input text using the custom doubling strategy.
144
- chunks = custom_split_text(text)
145
  sample_rate = 24000
146
  num_channels = 1
147
  sample_width = 2 # 16-bit PCM
@@ -151,24 +117,22 @@ def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format
151
  # Yield the WAV header first.
152
  header = generate_wav_header(sample_rate, num_channels, sample_width)
153
  yield header
154
- # Process and yield each chunk's audio data.
155
- for i, chunk in enumerate(chunks):
156
- print(f"Processing chunk {i}: {chunk}") # Debugging
157
- try:
158
- results = list(pipeline(chunk, voice=voice, speed=speed, split_pattern=None))
159
- for result in results:
160
- if result.audio is not None:
161
- if format.lower() == "wav":
162
- yield audio_tensor_to_pcm_bytes(result.audio)
163
- elif format.lower() == "opus":
164
- yield audio_tensor_to_opus_bytes(result.audio, sample_rate=sample_rate)
165
- else:
166
- raise ValueError(f"Unsupported audio format: {format}")
167
  else:
168
- print(f"Chunk {i}: No audio generated")
169
- except Exception as e:
170
- print(f"Error processing chunk {i}: {e}")
171
- yield b'' # important so that streaming continues. Consider returning an error sound.
 
 
172
 
173
  media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
174
 
 
47
  return header + fmt_chunk + data_chunk_header
48
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def audio_tensor_to_pcm_bytes(audio_tensor: torch.Tensor) -> bytes:
51
  """
52
  Convert a torch.FloatTensor (with values in [-1, 1]) to raw 16-bit PCM bytes.
 
97
  # Endpoints
98
  # ------------------------------------------------------------------------------
99
 
100
+ @app.get("/tts/streaming", summary="True Streaming TTS")
101
  def tts_streaming(text: str, voice: str = "af_heart", speed: float = 1.0, format: str = "opus"):
102
  """
103
+ True Streaming TTS endpoint that returns a continuous audio stream.
104
+ It processes text and generates audio token by token (or small chunks as KPipeline yields),
105
+ providing a more responsive streaming experience.
106
+ Supports WAV (PCM) and Opus formats. Opus offers significantly better compression.
107
 
108
  The endpoint first yields a WAV header (with a dummy length) for WAV,
109
+ then yields encoded audio data for each token's audio as soon as it is generated.
110
  """
 
 
111
  sample_rate = 24000
112
  num_channels = 1
113
  sample_width = 2 # 16-bit PCM
 
117
  # Yield the WAV header first.
118
  header = generate_wav_header(sample_rate, num_channels, sample_width)
119
  yield header
120
+
121
+ try:
122
+ results = pipeline(text, voice=voice, speed=speed, split_pattern=None) # split_pattern=None to avoid splitting here, let KPipeline handle
123
+ for result in results:
124
+ if result.audio is not None:
125
+ if format.lower() == "wav":
126
+ yield audio_tensor_to_pcm_bytes(result.audio)
127
+ elif format.lower() == "opus":
128
+ yield audio_tensor_to_opus_bytes(result.audio, sample_rate=sample_rate)
 
 
 
 
129
  else:
130
+ raise ValueError(f"Unsupported audio format: {format}")
131
+ else:
132
+ print("No audio generated for a token/chunk") # Debugging, remove in production if not needed
133
+ except Exception as e:
134
+ print(f"Error during TTS processing: {e}")
135
+ yield b'' # Important: yield empty bytes to keep stream alive, or handle error sound
136
 
137
  media_type = "audio/wav" if format.lower() == "wav" else "audio/opus"
138