Pijush2023 committed
Commit cfcb1b1 · verified · 1 Parent(s): aeeb222

Update app.py

Files changed (1):
  1. app.py (+66 -11)
app.py CHANGED
@@ -532,7 +532,63 @@ def generate_audio_elevenlabs(text):
 # logging.debug(f"Audio saved to {temp_audio_path}")
 # return temp_audio_path
 
-def generate_audio_parler_tts(text):
+# def generate_audio_parler_tts(text):
+#     model_id = 'parler-tts/parler_tts_mini_v0.1'
+#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+#     try:
+#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
+#     except torch.cuda.OutOfMemoryError:
+#         print("CUDA out of memory. Switching to CPU.")
+#         device = "cpu"
+#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
+#     tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+#     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
+
+#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+#     max_length = model.config.max_length
+
+#     # Split the text into smaller chunks if it exceeds the max length
+#     text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
+#     audio_segments = []
+
+#     for chunk in text_chunks:
+#         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
+#         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+#         audio_arr = generation.cpu().numpy().squeeze()
+#         audio_segments.append(audio_arr)
+
+#     combined_audio = np.concatenate(audio_segments)
+
+#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+#         sf.write(f.name, combined_audio, model.config.sampling_rate)
+#         temp_audio_path = f.name
+
+#     logging.debug(f"Audio saved to {temp_audio_path}")
+#     return temp_audio_path
+
+def generate_audio_parler_tts(text, chunk_size=200):
+    def split_text(text, chunk_size):
+        # Split text into chunks of the specified size
+        words = text.split()
+        chunks = []
+        current_chunk = []
+        current_length = 0
+
+        for word in words:
+            if current_length + len(word) + 1 > chunk_size:
+                chunks.append(" ".join(current_chunk))
+                current_chunk = [word]
+                current_length = len(word) + 1
+            else:
+                current_chunk.append(word)
+                current_length += len(word) + 1
+
+        if current_chunk:
+            chunks.append(" ".join(current_chunk))
+
+        return chunks
+
     model_id = 'parler-tts/parler_tts_mini_v0.1'
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     try:
@@ -546,22 +602,20 @@ def generate_audio_parler_tts(text):
     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
 
     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-    max_length = model.config.max_length
+    chunks = split_text(text, chunk_size)
+    audio_arrs = []
 
-    # Split the text into smaller chunks if it exceeds the max length
-    text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
-    audio_segments = []
-
-    for chunk in text_chunks:
+    for chunk in chunks:
         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
         audio_arr = generation.cpu().numpy().squeeze()
-        audio_segments.append(audio_arr)
+        audio_arrs.append(audio_arr)
+
+    # Concatenate all audio arrays into a single array
+    concatenated_audio = np.concatenate(audio_arrs)
 
-    combined_audio = np.concatenate(audio_segments)
-
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-        sf.write(f.name, combined_audio, model.config.sampling_rate)
+        sf.write(f.name, concatenated_audio, model.config.sampling_rate)
         temp_audio_path = f.name
 
     logging.debug(f"Audio saved to {temp_audio_path}")
@@ -571,6 +625,7 @@ def generate_audio_parler_tts(text):
 
 
 
+
 # Stable Diffusion setup
 pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
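
Net effect of the commit: the old character-slice chunking (text[i:i+max_length], where model.config.max_length is a generation length in tokens rather than characters, so the slice width mismatched units and could cut words in half) is replaced by a word-boundary chunker with an explicit character budget, chunk_size. Below is a minimal, standalone sketch of that chunker for reference; the split_text body mirrors the helper in the diff, while the __main__ harness and the sample string are illustrative additions, not part of app.py.

def split_text(text, chunk_size=200):
    # Accumulate whole words until adding the next word (plus a separating
    # space) would push the chunk past chunk_size characters, then start a
    # new chunk. A single word longer than chunk_size ends up in its own
    # chunk rather than being cut in half.
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        if current_length + len(word) + 1 > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word) + 1
        else:
            current_chunk.append(word)
            current_length += len(word) + 1

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

if __name__ == "__main__":
    sample = " ".join(["word"] * 100)  # 100 five-character words incl. spaces
    chunks = split_text(sample, chunk_size=200)
    assert all(len(c) <= 200 for c in chunks)  # no chunk over budget
    assert " ".join(chunks) == sample          # no word lost or split
    print(f"{len(chunks)} chunks, longest {max(len(c) for c in chunks)} chars")

In the updated generate_audio_parler_tts, each chunk is then synthesized separately against the same speaker description and the per-chunk waveforms are joined with np.concatenate before being written out via soundfile. One caveat worth noting: np.concatenate raises ValueError on an empty list, so the function as committed would presumably fail if called with empty or whitespace-only text.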