Pijush2023 committed on
Commit
7b4aa5f
·
verified ·
1 Parent(s): 53bd549

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -5
app.py CHANGED
@@ -545,9 +545,92 @@ def generate_audio_parler_tts(text):
545
  logging.debug(f"Audio saved to {combined_audio_path}")
546
  return combined_audio_path
547
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
  # Load the MARS5 model
549
  mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
  def generate_audio_mars5(text):
552
  description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
553
  kwargs_dict = {
@@ -562,20 +645,24 @@ def generate_audio_mars5(text):
562
  'deep_clone': True,
563
  'nar_guidance_w': 3
564
  }
565
-
566
  chunks = chunk_text(preprocess(text))
567
  audio_segments = []
568
 
569
- for chunk in chunks:
570
  wav = torch.zeros(1, mars5.sr) # Use a placeholder silent audio for the reference
571
  cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
572
  ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
573
 
574
-
575
  temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
576
  torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
577
- audio_segments.append(AudioSegment.from_wav(temp_audio_path))
578
-
 
 
 
 
 
579
  combined_audio = sum(audio_segments)
580
  combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
581
  combined_audio.export(combined_audio_path, format="wav")
@@ -583,6 +670,7 @@ def generate_audio_mars5(text):
583
  logging.debug(f"Audio saved to {combined_audio_path}")
584
  return combined_audio_path
585
 
 
586
  pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
587
  pipe.to(device)
588
 
 
545
  logging.debug(f"Audio saved to {combined_audio_path}")
546
  return combined_audio_path
547
 
548
+ # # Load the MARS5 model
549
+ # mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
550
+
551
+ # def generate_audio_mars5(text):
552
+ # description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
553
+ # kwargs_dict = {
554
+ # 'temperature': 0.8,
555
+ # 'top_k': -1,
556
+ # 'top_p': 0.2,
557
+ # 'typical_p': 1.0,
558
+ # 'freq_penalty': 2.6,
559
+ # 'presence_penalty': 0.4,
560
+ # 'rep_penalty_window': 100,
561
+ # 'max_prompt_phones': 360,
562
+ # 'deep_clone': True,
563
+ # 'nar_guidance_w': 3
564
+ # }
565
+
566
+ # chunks = chunk_text(preprocess(text))
567
+ # audio_segments = []
568
+
569
+ # for chunk in chunks:
570
+ # wav = torch.zeros(1, mars5.sr) # Use a placeholder silent audio for the reference
571
+ # cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
572
+ # ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
573
+
574
+
575
+ # temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
576
+ # torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
577
+ # audio_segments.append(AudioSegment.from_wav(temp_audio_path))
578
+
579
+ # combined_audio = sum(audio_segments)
580
+ # combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
581
+ # combined_audio.export(combined_audio_path, format="wav")
582
+
583
+ # logging.debug(f"Audio saved to {combined_audio_path}")
584
+ # return combined_audio_path
585
+
586
# Load the MARS5 TTS model (and its config dataclass) from torch hub.
mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)

# Pick device/precision from CUDA availability and move the model there.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
mars5.to(device)

# Synthesis constants. NOTE(review): SAMPLE_RATE/SEED are defined here but
# not used in the visible code — presumably consumed elsewhere in app.py.
SAMPLE_RATE = 22050
SEED = 42
597
def preprocess(text):
    """Normalize text for TTS.

    Expands numbers to words, ensures the text ends with punctuation, and
    spaces out all-caps abbreviations (e.g. "U.S.A." -> "U S A") so the TTS
    model pronounces the letters individually.

    NOTE(review): relies on module-level names defined elsewhere in app.py:
    EnglishNumberNormalizer, `re`, and `punctuation` (presumably
    string.punctuation) — confirm against the file's imports.
    """
    number_normalizer = EnglishNumberNormalizer()
    text = number_normalizer(text).strip()
    # BUG FIX: guard against empty text — text[-1] raised IndexError when the
    # normalized/stripped text was empty.
    if text and text[-1] not in punctuation:
        text = f"{text}."

    abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'

    def separate_abb(chunk):
        # "U.S.A." -> "USA" -> "U S A": drop the dots, then space the letters.
        chunk = chunk.replace(".", "")
        return " ".join(chunk)

    abbreviations = re.findall(abbreviations_pattern, text)
    for abv in abbreviations:
        if abv in text:
            text = text.replace(abv, separate_abb(abv))
    return text
613
+
614
def chunk_text(text, max_length=250):
    """Split *text* into whitespace-delimited word chunks.

    Each chunk stays within *max_length* characters, counting one joining
    space per word. Returns a list of chunk strings ([] for empty input).
    """
    chunks = []
    buffer = []
    used = 0

    for word in text.split():
        cost = len(word) + 1  # the word plus the space that would join it
        if used + cost > max_length:
            # Flush the buffer and start a fresh chunk with this word.
            chunks.append(' '.join(buffer))
            buffer = [word]
            used = cost
        else:
            buffer.append(word)
            used += cost

    if buffer:
        chunks.append(' '.join(buffer))

    return chunks
633
+
634
def generate_audio_mars5(text):
    """Synthesize *text* with the MARS5 TTS model and return the path of a
    single combined WAV file in the temp directory.

    The text is preprocessed and split into chunks; chunks are synthesized in
    parallel worker threads and the resulting segments are concatenated in
    original chunk order (executor.map preserves input order).

    NOTE(review): relies on module-level names defined elsewhere in app.py:
    mars5, config_class (torch.hub), preprocess, chunk_text, torchaudio,
    AudioSegment (pydub), and concurrent.futures.
    """
    # (Unused in the visible code — kept from the original implementation.)
    description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
    kwargs_dict = {
        'temperature': 0.8,
        'top_k': -1,
        'top_p': 0.2,
        'typical_p': 1.0,
        'freq_penalty': 2.6,
        'presence_penalty': 0.4,
        'rep_penalty_window': 100,
        'max_prompt_phones': 360,
        'deep_clone': True,
        'nar_guidance_w': 3
    }

    chunks = chunk_text(preprocess(text))

    def process_chunk(indexed_chunk):
        # Synthesize one chunk and return it as a pydub AudioSegment.
        idx, chunk = indexed_chunk
        wav = torch.zeros(1, mars5.sr)  # placeholder silent reference audio
        # Keep only the kwargs the installed config dataclass actually accepts.
        cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
        ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)

        # BUG FIX: the temp filename previously used len(audio_segments),
        # which is 0 for every worker while the pool is still running, so all
        # threads clobbered the same file. Use the chunk index instead so each
        # chunk gets a unique path.
        temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{idx}.wav")
        torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
        return AudioSegment.from_wav(temp_audio_path)

    # Parallel synthesis of the chunks; map() yields results in input order.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        audio_segments = list(executor.map(process_chunk, enumerate(chunks)))

    combined_audio = sum(audio_segments)
    combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
    combined_audio.export(combined_audio_path, format="wav")

    logging.debug(f"Audio saved to {combined_audio_path}")
    return combined_audio_path
672
 
673
+
674
  pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
675
  pipe.to(device)
676