Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -545,9 +545,92 @@ def generate_audio_parler_tts(text):
|
|
545 |
logging.debug(f"Audio saved to {combined_audio_path}")
|
546 |
return combined_audio_path
|
547 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
548 |
# Load the MARS5 model
|
549 |
mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
|
550 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
551 |
def generate_audio_mars5(text):
|
552 |
description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
|
553 |
kwargs_dict = {
|
@@ -562,20 +645,24 @@ def generate_audio_mars5(text):
|
|
562 |
'deep_clone': True,
|
563 |
'nar_guidance_w': 3
|
564 |
}
|
565 |
-
|
566 |
chunks = chunk_text(preprocess(text))
|
567 |
audio_segments = []
|
568 |
|
569 |
-
|
570 |
wav = torch.zeros(1, mars5.sr) # Use a placeholder silent audio for the reference
|
571 |
cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
|
572 |
ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
|
573 |
|
574 |
-
|
575 |
temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
|
576 |
torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
|
577 |
-
|
578 |
-
|
|
|
|
|
|
|
|
|
|
|
579 |
combined_audio = sum(audio_segments)
|
580 |
combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
|
581 |
combined_audio.export(combined_audio_path, format="wav")
|
@@ -583,6 +670,7 @@ def generate_audio_mars5(text):
|
|
583 |
logging.debug(f"Audio saved to {combined_audio_path}")
|
584 |
return combined_audio_path
|
585 |
|
|
|
586 |
pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
|
587 |
pipe.to(device)
|
588 |
|
|
|
545 |
logging.debug(f"Audio saved to {combined_audio_path}")
|
546 |
return combined_audio_path
|
547 |
|
548 |
+
# # Load the MARS5 model
|
549 |
+
# mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
|
550 |
+
|
551 |
+
# def generate_audio_mars5(text):
|
552 |
+
# description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
|
553 |
+
# kwargs_dict = {
|
554 |
+
# 'temperature': 0.8,
|
555 |
+
# 'top_k': -1,
|
556 |
+
# 'top_p': 0.2,
|
557 |
+
# 'typical_p': 1.0,
|
558 |
+
# 'freq_penalty': 2.6,
|
559 |
+
# 'presence_penalty': 0.4,
|
560 |
+
# 'rep_penalty_window': 100,
|
561 |
+
# 'max_prompt_phones': 360,
|
562 |
+
# 'deep_clone': True,
|
563 |
+
# 'nar_guidance_w': 3
|
564 |
+
# }
|
565 |
+
|
566 |
+
# chunks = chunk_text(preprocess(text))
|
567 |
+
# audio_segments = []
|
568 |
+
|
569 |
+
# for chunk in chunks:
|
570 |
+
# wav = torch.zeros(1, mars5.sr) # Use a placeholder silent audio for the reference
|
571 |
+
# cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
|
572 |
+
# ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
|
573 |
+
|
574 |
+
|
575 |
+
# temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
|
576 |
+
# torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
|
577 |
+
# audio_segments.append(AudioSegment.from_wav(temp_audio_path))
|
578 |
+
|
579 |
+
# combined_audio = sum(audio_segments)
|
580 |
+
# combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
|
581 |
+
# combined_audio.export(combined_audio_path, format="wav")
|
582 |
+
|
583 |
+
# logging.debug(f"Audio saved to {combined_audio_path}")
|
584 |
+
# return combined_audio_path
|
585 |
+
|
586 |
# Load the MARS5 model
|
587 |
mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
|
588 |
|
589 |
+
# Setting device and precision
|
590 |
+
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
591 |
+
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
592 |
+
mars5.to(device)
|
593 |
+
|
594 |
+
SAMPLE_RATE = 22050
|
595 |
+
SEED = 42
|
596 |
+
|
597 |
+
def preprocess(text):
    """Prepare raw text for speech synthesis.

    Steps, in order:
      1. Run the text through ``EnglishNumberNormalizer`` and strip
         surrounding whitespace.
      2. Append a trailing "." when the last character is not in
         ``punctuation`` (presumably ``string.punctuation`` -- confirm
         against the file's imports).
      3. Spell out upper-case abbreviations letter by letter, dropping any
         dots inside them (e.g. "U.S.A" -> "U S A").

    Args:
        text: The input string.

    Returns:
        The normalized string. An input that is empty after normalization is
        returned as an empty string instead of raising ``IndexError``.
    """
    number_normalizer = EnglishNumberNormalizer()
    text = number_normalizer(text).strip()

    # Guard: text[-1] below would raise IndexError on an empty string.
    if not text:
        return text

    if text[-1] not in punctuation:
        text = f"{text}."

    abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'

    def separate_abb(match):
        # "U.S.A" -> "USA" -> "U S A"
        return " ".join(match.group(0).replace(".", ""))

    # Substitute each match where it occurs. The previous findall + str.replace
    # loop rewrote every textual occurrence of an abbreviation, so a short
    # abbreviation could break a longer one that contains it ("AB ABC" ->
    # "A B A BC") or alter unrelated words that embed it.
    return re.sub(abbreviations_pattern, separate_abb, text)
|
613 |
+
|
614 |
+
def chunk_text(text, max_length=250):
    """Split text into whitespace-delimited chunks of bounded size.

    Words are accumulated greedily. Each word is budgeted as ``len(word) + 1``
    (the word plus one separator), and a chunk is closed as soon as adding the
    next word would push that budget past ``max_length``. Words are never
    split, so a single word longer than ``max_length`` becomes a chunk of its
    own.

    Args:
        text: The string to split; it is tokenized with ``str.split()``.
        max_length: Upper bound on the per-chunk budget described above.

    Returns:
        A list of non-empty chunk strings, in original word order. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        cost = len(word) + 1  # the word plus one joining space
        if current_length + cost <= max_length:
            current_chunk.append(word)
            current_length += cost
            continue

        # The word does not fit: close the running chunk and start a new one.
        # Only flush when something was accumulated -- otherwise an oversized
        # first word would emit an empty "" chunk ahead of itself.
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        current_chunk = [word]
        current_length = cost

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
|
633 |
+
|
634 |
def generate_audio_mars5(text):
|
635 |
description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
|
636 |
kwargs_dict = {
|
|
|
645 |
'deep_clone': True,
|
646 |
'nar_guidance_w': 3
|
647 |
}
|
648 |
+
|
649 |
chunks = chunk_text(preprocess(text))
|
650 |
audio_segments = []
|
651 |
|
652 |
+
def process_chunk(chunk):
    """Synthesize one text chunk with MARS5 and return it as an AudioSegment.

    Nested helper of ``generate_audio_mars5``: it reads ``kwargs_dict`` from
    the enclosing function and the module-level ``mars5`` / ``config_class``.
    It is mapped over the chunks by a ``ThreadPoolExecutor``, so several
    calls may run at the same time.

    Args:
        chunk: The text to synthesize.

    Returns:
        The ``AudioSegment`` loaded from the WAV file written for this chunk.
    """
    wav = torch.zeros(1, mars5.sr)  # Use a placeholder silent audio for the reference
    cfg = config_class(**{k: v for k, v in kwargs_dict.items() if k in config_class.__dataclass_fields__})
    _, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)

    # Each call needs its own file. The previous name was derived from
    # len(audio_segments), but that list is only extended after executor.map
    # has finished, so every worker computed index 0 and they all wrote to
    # and read from the same "mars5_audio_0.wav" concurrently.
    fd, temp_audio_path = tempfile.mkstemp(prefix="mars5_audio_", suffix=".wav")
    os.close(fd)  # torchaudio reopens the file by path
    try:
        torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
        return AudioSegment.from_wav(temp_audio_path)
    finally:
        # Remove the per-call file so uniquely named temp files do not pile up.
        os.remove(temp_audio_path)
|
660 |
+
|
661 |
+
# Use concurrent futures for parallel processing
|
662 |
+
with concurrent.futures.ThreadPoolExecutor() as executor:
|
663 |
+
results = list(executor.map(process_chunk, chunks))
|
664 |
+
audio_segments.extend(results)
|
665 |
+
|
666 |
combined_audio = sum(audio_segments)
|
667 |
combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
|
668 |
combined_audio.export(combined_audio_path, format="wav")
|
|
|
670 |
logging.debug(f"Audio saved to {combined_audio_path}")
|
671 |
return combined_audio_path
|
672 |
|
673 |
+
|
674 |
pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
|
675 |
pipe.to(device)
|
676 |
|