Pijush2023 committed on
Commit
aeeb222
·
verified ·
1 Parent(s): 6624924

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -4
app.py CHANGED
@@ -506,6 +506,32 @@ def generate_audio_elevenlabs(text):
506
  logging.error(f"Error generating audio: {response.text}")
507
  return None
508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  def generate_audio_parler_tts(text):
510
  model_id = 'parler-tts/parler_tts_mini_v0.1'
511
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -520,18 +546,31 @@ def generate_audio_parler_tts(text):
520
  description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
521
 
522
  input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
523
- prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
524
 
525
- generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
526
- audio_arr = generation.cpu().numpy().squeeze()
 
 
 
 
 
 
 
527
 
 
 
528
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
529
- sf.write(f.name, audio_arr, model.config.sampling_rate)
530
  temp_audio_path = f.name
531
 
532
  logging.debug(f"Audio saved to {temp_audio_path}")
533
  return temp_audio_path
534
 
 
 
 
 
535
  # Stable Diffusion setup
536
  pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
537
  pipe = pipe.to("cuda")
 
506
  logging.error(f"Error generating audio: {response.text}")
507
  return None
508
 
509
+ # def generate_audio_parler_tts(text):
510
+ # model_id = 'parler-tts/parler_tts_mini_v0.1'
511
+ # device = "cuda:0" if torch.cuda.is_available() else "cpu"
512
+ # try:
513
+ # model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
514
+ # except torch.cuda.OutOfMemoryError:
515
+ # print("CUDA out of memory. Switching to CPU.")
516
+ # device = "cpu"
517
+ # model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
518
+ # tokenizer = AutoTokenizer.from_pretrained(model_id)
519
+
520
+ # description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
521
+
522
+ # input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
523
+ # prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
524
+
525
+ # generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
526
+ # audio_arr = generation.cpu().numpy().squeeze()
527
+
528
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
529
+ # sf.write(f.name, audio_arr, model.config.sampling_rate)
530
+ # temp_audio_path = f.name
531
+
532
+ # logging.debug(f"Audio saved to {temp_audio_path}")
533
+ # return temp_audio_path
534
+
535
  def generate_audio_parler_tts(text):
536
  model_id = 'parler-tts/parler_tts_mini_v0.1'
537
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
546
  description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
547
 
548
  input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
549
+ max_length = model.config.max_length
550
 
551
+ # Split the text into smaller chunks if it exceeds the max length
552
+ text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
553
+ audio_segments = []
554
+
555
+ for chunk in text_chunks:
556
+ prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
557
+ generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
558
+ audio_arr = generation.cpu().numpy().squeeze()
559
+ audio_segments.append(audio_arr)
560
 
561
+ combined_audio = np.concatenate(audio_segments)
562
+
563
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
564
+ sf.write(f.name, combined_audio, model.config.sampling_rate)
565
  temp_audio_path = f.name
566
 
567
  logging.debug(f"Audio saved to {temp_audio_path}")
568
  return temp_audio_path
569
 
570
+
571
+
572
+
573
+
574
  # Stable Diffusion setup
575
  pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
576
  pipe = pipe.to("cuda")