Steveeeeeeen HF staff committed on
Commit
15994b1
·
verified ·
1 Parent(s): 4f7f0f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -4
app.py CHANGED
@@ -186,6 +186,61 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
186
 
187
  return (16000, gen_wav[0, 0, :].cpu().numpy())
188
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  with gr.Blocks() as app_tts:
190
  gr.Markdown("# Zero Shot Voice Clone TTS")
191
 
@@ -229,17 +284,41 @@ with gr.Blocks() as app_credits:
229
  * [mrfakename](https://huggingface.co/mrfakename) for the [gradio demo code](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
230
  """)
231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  with gr.Blocks() as app:
233
  gr.Markdown(
234
  """
235
  # Llasa 1b Multilingual TTS
236
 
237
- This is a local web UI for Llasa 1b multilingual Zero Shot Voice Cloning and TTS model that supports English, Chinese, French, German, Dutch, Spanish, Italian, Portuguese, Polish, Japanese and Korean!
 
 
 
 
238
 
239
  If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
240
  """
241
  )
242
- gr.TabbedInterface([app_tts], ["TTS"])
243
-
 
 
244
 
245
- app.launch(ssr_mode=False, share=True)
 
186
 
187
  return (16000, gen_wav[0, 0, :].cpu().numpy())
188
 
189
def text_only_infer(target_text, progress=gr.Progress()):
    """Generate speech directly from text, without a reference voice.

    Args:
        target_text: Text to synthesise. Input longer than 300 characters is
            truncated to its first 300 characters after a UI warning.
        progress: Gradio progress tracker used to report stage updates.

    Returns:
        A ``(sample_rate, waveform)`` tuple with a sample rate of 16000 and
        the decoded waveform as a NumPy array, or ``None`` when there is no
        text to speak or the model produced no speech tokens.
    """
    # Treat None and whitespace-only input the same as an empty string so we
    # never call len() on None or spend a generation pass on blank text.
    if not target_text or not target_text.strip():
        return None
    if len(target_text) > 300:
        gr.Warning("Text is too long. Please keep it under 300 characters.")
        target_text = target_text[:300]

    progress(0.2, 'Generating speech...')

    with torch.no_grad():
        formatted_text = f"<|TEXT_UNDERSTANDING_START|>{target_text}<|TEXT_UNDERSTANDING_END|>"

        # Build the chat prompt; the assistant turn is left open so the model
        # continues it with speech tokens.
        chat = [
            {"role": "user", "content": "Convert the text to speech:" + formatted_text},
            {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
        ]

        input_ids = tokenizer.apply_chat_template(
            chat,
            tokenize=True,
            return_tensors='pt',
            continue_final_message=True
        )
        input_ids = input_ids.to('cuda')
        speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

        # Generate the speech autoregressively
        outputs = model.generate(
            input_ids,
            max_length=2048,
            eos_token_id=speech_end_id,
            do_sample=True,
            top_p=1,
            temperature=0.8
        )

        progress(0.6, 'Processing audio...')

        # Keep only the newly generated tokens (everything after the prompt).
        generated_ids = outputs[0][input_ids.shape[1]:]
        # Strip the end-of-speech marker only when it is actually present:
        # if generation stopped at max_length instead, the final token is a
        # real speech token and must not be discarded.
        if generated_ids.numel() > 0 and generated_ids[-1].item() == speech_end_id:
            generated_ids = generated_ids[:-1]
        speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # Convert token <|s_23456|> to int 23456
        speech_ids = extract_speech_ids(speech_tokens)
        if len(speech_ids) == 0:
            # Nothing to decode; surface it in the UI instead of handing an
            # empty tensor to the codec.
            gr.Warning("No speech was generated. Please try a different text.")
            return None
        speech_codes = torch.tensor(speech_ids).cuda().unsqueeze(0).unsqueeze(0)

        # Decode the speech tokens to speech waveform
        gen_wav = Codec_model.decode_code(speech_codes)

        progress(1, 'Done!')

        return (16000, gen_wav[0, 0, :].cpu().numpy())
243
+
244
  with gr.Blocks() as app_tts:
245
  gr.Markdown("# Zero Shot Voice Clone TTS")
246
 
 
284
  * [mrfakename](https://huggingface.co/mrfakename) for the [gradio demo code](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
285
  """)
286
 
287
# Tab for plain text-to-speech: one text box in, one audio player out.
with gr.Blocks() as app_direct_tts:
    gr.Markdown("# Direct Text-to-Speech")
    gr.Markdown("Generate speech directly from text without voice cloning")

    # Input and output widgets, declared in the order they are rendered.
    text_input = gr.Textbox(
        lines=10,
        label="Text to Generate",
        placeholder="Enter the text you want to convert to speech...",
    )
    generate_btn = gr.Button("Generate Speech", variant="primary")
    audio_output = gr.Audio(label="Generated Audio")

    # Clicking the button runs text_only_infer on the textbox contents and
    # sends its result to the audio component.
    generate_btn.click(
        fn=text_only_infer,
        inputs=[text_input],
        outputs=[audio_output],
    )
304
+
305
# Top-level application: an introduction followed by one tab per mode.
with gr.Blocks() as app:
    gr.Markdown(
        value="""
    # Llasa 1b Multilingual TTS

    This is a local web UI for Llasa 1b multilingual TTS that supports:
    - Zero Shot Voice Cloning
    - Direct Text-to-Speech

    Supports multiple languages including English, Chinese, French, German, Dutch, Spanish, Italian, Portuguese, Polish, Japanese and Korean!

    If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
    """
    )
    # Tab order follows the list order: direct TTS first, voice cloning second.
    gr.TabbedInterface(
        interface_list=[app_direct_tts, app_tts],
        tab_names=["Direct TTS", "Voice Cloning"],
    )

# Start the web server with server-side rendering disabled.
app.launch(ssr_mode=False)