Pijush2023 committed on
Commit
a7747d2
·
verified ·
1 Parent(s): 7b4aa5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -92
app.py CHANGED
@@ -545,92 +545,9 @@ def generate_audio_parler_tts(text):
545
  logging.debug(f"Audio saved to {combined_audio_path}")
546
  return combined_audio_path
547
 
548
- # # Load the MARS5 model
549
- # mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
550
-
551
- # def generate_audio_mars5(text):
552
- # description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
553
- # kwargs_dict = {
554
- # 'temperature': 0.8,
555
- # 'top_k': -1,
556
- # 'top_p': 0.2,
557
- # 'typical_p': 1.0,
558
- # 'freq_penalty': 2.6,
559
- # 'presence_penalty': 0.4,
560
- # 'rep_penalty_window': 100,
561
- # 'max_prompt_phones': 360,
562
- # 'deep_clone': True,
563
- # 'nar_guidance_w': 3
564
- # }
565
-
566
- # chunks = chunk_text(preprocess(text))
567
- # audio_segments = []
568
-
569
- # for chunk in chunks:
570
- # wav = torch.zeros(1, mars5.sr) # Use a placeholder silent audio for the reference
571
- # cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
572
- # ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
573
-
574
-
575
- # temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
576
- # torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
577
- # audio_segments.append(AudioSegment.from_wav(temp_audio_path))
578
-
579
- # combined_audio = sum(audio_segments)
580
- # combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
581
- # combined_audio.export(combined_audio_path, format="wav")
582
-
583
- # logging.debug(f"Audio saved to {combined_audio_path}")
584
- # return combined_audio_path
585
-
586
  # Load the MARS5 model
587
  mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
588
 
589
- # Setting device and precision
590
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
591
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
592
- mars5.to(device)
593
-
594
- SAMPLE_RATE = 22050
595
- SEED = 42
596
-
597
- def preprocess(text):
598
- number_normalizer = EnglishNumberNormalizer()
599
- text = number_normalizer(text).strip()
600
- if text[-1] not in punctuation:
601
- text = f"{text}."
602
-
603
- abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'
604
- def separate_abb(chunk):
605
- chunk = chunk.replace(".", "")
606
- return " ".join(chunk)
607
-
608
- abbreviations = re.findall(abbreviations_pattern, text)
609
- for abv in abbreviations:
610
- if abv in text:
611
- text = text.replace(abv, separate_abb(abv))
612
- return text
613
-
614
- def chunk_text(text, max_length=250):
615
- words = text.split()
616
- chunks = []
617
- current_chunk = []
618
- current_length = 0
619
-
620
- for word in words:
621
- if current_length + len(word) + 1 <= max_length:
622
- current_chunk.append(word)
623
- current_length += len(word) + 1
624
- else:
625
- chunks.append(' '.join(current_chunk))
626
- current_chunk = [word]
627
- current_length = len(word) + 1
628
-
629
- if current_chunk:
630
- chunks.append(' '.join(current_chunk))
631
-
632
- return chunks
633
-
634
  def generate_audio_mars5(text):
635
  description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
636
  kwargs_dict = {
@@ -645,24 +562,20 @@ def generate_audio_mars5(text):
645
  'deep_clone': True,
646
  'nar_guidance_w': 3
647
  }
648
-
649
  chunks = chunk_text(preprocess(text))
650
  audio_segments = []
651
 
652
- def process_chunk(chunk):
653
  wav = torch.zeros(1, mars5.sr) # Use a placeholder silent audio for the reference
654
  cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
655
  ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
656
 
 
657
  temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
658
  torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
659
- return AudioSegment.from_wav(temp_audio_path)
660
-
661
- # Use concurrent futures for parallel processing
662
- with concurrent.futures.ThreadPoolExecutor() as executor:
663
- results = list(executor.map(process_chunk, chunks))
664
- audio_segments.extend(results)
665
-
666
  combined_audio = sum(audio_segments)
667
  combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
668
  combined_audio.export(combined_audio_path, format="wav")
@@ -671,6 +584,7 @@ def generate_audio_mars5(text):
671
  return combined_audio_path
672
 
673
 
 
674
  pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
675
  pipe.to(device)
676
 
 
545
  logging.debug(f"Audio saved to {combined_audio_path}")
546
  return combined_audio_path
547
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
  # Load the MARS5 model
549
  mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
  def generate_audio_mars5(text):
552
  description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
553
  kwargs_dict = {
 
562
  'deep_clone': True,
563
  'nar_guidance_w': 3
564
  }
565
+
566
  chunks = chunk_text(preprocess(text))
567
  audio_segments = []
568
 
569
+ for chunk in chunks:
570
  wav = torch.zeros(1, mars5.sr) # Use a placeholder silent audio for the reference
571
  cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
572
  ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
573
 
574
+
575
  temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
576
  torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
577
+ audio_segments.append(AudioSegment.from_wav(temp_audio_path))
578
+
 
 
 
 
 
579
  combined_audio = sum(audio_segments)
580
  combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
581
  combined_audio.export(combined_audio_path, format="wav")
 
584
  return combined_audio_path
585
 
586
 
587
+
588
  pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
589
  pipe.to(device)
590