Pijush2023 committed (verified)
Commit f291fd6
1 Parent(s): f061ade

Update app.py

Files changed (1)
  1. app.py +6 -160
app.py CHANGED
@@ -526,151 +526,7 @@ def generate_audio_elevenlabs(text):
         logging.error(f"Error generating audio: {response.text}")
         return None
 
-# def generate_audio_parler_tts(text):
-#     model_id = 'parler-tts/parler_tts_mini_v0.1'
-#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
-#     try:
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     except torch.cuda.OutOfMemoryError:
-#         print("CUDA out of memory. Switching to CPU.")
-#         device = "cpu"
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-#     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
-
-#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-#     prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
-
-#     generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-#     audio_arr = generation.cpu().numpy().squeeze()
-
-#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-#         sf.write(f.name, audio_arr, model.config.sampling_rate)
-#         temp_audio_path = f.name
-
-#     logging.debug(f"Audio saved to {temp_audio_path}")
-#     return temp_audio_path
-
-# def generate_audio_parler_tts(text):
-#     model_id = 'parler-tts/parler_tts_mini_v0.1'
-#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
-#     try:
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     except torch.cuda.OutOfMemoryError:
-#         print("CUDA out of memory. Switching to CPU.")
-#         device = "cpu"
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-#     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
-
-#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-#     max_length = model.config.max_length
-
-#     # Split the text into smaller chunks if it exceeds the max length
-#     text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
-#     audio_segments = []
-
-#     for chunk in text_chunks:
-#         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
-#         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-#         audio_arr = generation.cpu().numpy().squeeze()
-#         audio_segments.append(audio_arr)
-
-#     combined_audio = np.concatenate(audio_segments)
-
-#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-#         sf.write(f.name, combined_audio, model.config.sampling_rate)
-#         temp_audio_path = f.name
-
-#     logging.debug(f"Audio saved to {temp_audio_path}")
-#     return temp_audio_path
-
-# def generate_audio_parler_tts(text, chunk_size=200):
-#     def split_text(text, chunk_size):
-#         # Split text into chunks of the specified size
-#         words = text.split()
-#         chunks = []
-#         current_chunk = []
-#         current_length = 0
-
-#         for word in words:
-#             if current_length + len(word) + 1 > chunk_size:
-#                 chunks.append(" ".join(current_chunk))
-#                 current_chunk = [word]
-#                 current_length = len(word) + 1
-#             else:
-#                 current_chunk.append(word)
-#                 current_length += len(word) + 1
-
-#         if current_chunk:
-#             chunks.append(" ".join(current_chunk))
-
-#         return chunks
-
-#     model_id = 'parler-tts/parler_tts_mini_v0.1'
-#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
-#     try:
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     except torch.cuda.OutOfMemoryError:
-#         print("CUDA out of memory. Switching to CPU.")
-#         device = "cpu"
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-#     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
-
-#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-#     chunks = split_text(text, chunk_size)
-#     audio_arrs = []
-
-#     for chunk in chunks:
-#         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
-#         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-#         audio_arr = generation.cpu().numpy().squeeze()
-#         audio_arrs.append(audio_arr)
-
-#     # Concatenate all audio arrays into a single array
-#     concatenated_audio = np.concatenate(audio_arrs)
-
-#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-#         sf.write(f.name, concatenated_audio, model.config.sampling_rate)
-#         temp_audio_path = f.name
-
-#     logging.debug(f"Audio saved to {temp_audio_path}")
-#     return temp_audio_path
-
-
-import concurrent.futures
-
-def generate_audio_parler_tts(text, chunk_size=200):
-    def split_text(text, chunk_size):
-        words = text.split()
-        chunks = []
-        current_chunk = []
-        current_length = 0
-
-        for word in words:
-            if current_length + len(word) + 1 > chunk_size:
-                chunks.append(" ".join(current_chunk))
-                current_chunk = [word]
-                current_length = len(word) + 1
-            else:
-                current_chunk.append(word)
-                current_length += len(word) + 1
-
-        if current_chunk:
-            chunks.append(" ".join(current_chunk))
-
-        return chunks
-
-    def process_chunk(chunk):
-        prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
-        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-        audio_arr = generation.cpu().numpy().squeeze()
-        return audio_arr
-
+def generate_audio_parler_tts(text):
     model_id = 'parler-tts/parler_tts_mini_v0.1'
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     try:
@@ -684,28 +540,18 @@ def generate_audio_parler_tts(text, chunk_size=200):
     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
 
     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-    chunks = split_text(text, chunk_size)
-
-    # Process chunks in parallel
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
-        audio_arrs = [future.result() for future in concurrent.futures.as_completed(futures)]
-
-    # Concatenate all audio arrays into a single array
-    concatenated_audio = np.concatenate(audio_arrs)
+    prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
+
+    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+    audio_arr = generation.cpu().numpy().squeeze()
 
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-        sf.write(f.name, concatenated_audio, model.config.sampling_rate)
+        sf.write(f.name, audio_arr, model.config.sampling_rate)
        temp_audio_path = f.name
 
     logging.debug(f"Audio saved to {temp_audio_path}")
     return temp_audio_path
 
-
-
-
-
-
 # Stable Diffusion setup
 pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
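
For a quick sanity check of the simplified single-pass path this commit lands, here is a minimal sketch (illustrative, not part of the commit) that assumes it runs inside app.py, where generate_audio_parler_tts and its torch/parler_tts/soundfile imports are already defined:

# Hypothetical smoke test: call the simplified single-pass TTS function
# and confirm it writes a WAV file to a temporary path. Assumes app.py's
# definitions are already loaded in this process.
import os

if __name__ == "__main__":
    path = generate_audio_parler_tts("Hello from the simplified Parler-TTS path.")
    assert path is not None and os.path.exists(path), "no audio file produced"
    print(f"Generated audio at {path}")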
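
One behavioral note on the removed parallel variant: concurrent.futures.as_completed yields futures in completion order, not submission order, so its chunks could be concatenated out of text order. If chunked generation is ever reinstated, an order-preserving sketch (a hypothetical helper, not in this commit) could use executor.map instead:

# Hypothetical order-preserving replacement for the removed as_completed
# loop: executor.map returns results in submission order, so the audio
# chunks concatenate in the same order as the input text.
import concurrent.futures

import numpy as np

def synthesize_chunks_in_order(chunks, process_chunk):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        audio_arrs = list(executor.map(process_chunk, chunks))
    return np.concatenate(audio_arrs)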