Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -545,92 +545,9 @@ def generate_audio_parler_tts(text):
|
|
545 |
logging.debug(f"Audio saved to {combined_audio_path}")
|
546 |
return combined_audio_path
|
547 |
|
548 |
-
# # Load the MARS5 model
|
549 |
-
# mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
|
550 |
-
|
551 |
-
# def generate_audio_mars5(text):
|
552 |
-
# description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
|
553 |
-
# kwargs_dict = {
|
554 |
-
# 'temperature': 0.8,
|
555 |
-
# 'top_k': -1,
|
556 |
-
# 'top_p': 0.2,
|
557 |
-
# 'typical_p': 1.0,
|
558 |
-
# 'freq_penalty': 2.6,
|
559 |
-
# 'presence_penalty': 0.4,
|
560 |
-
# 'rep_penalty_window': 100,
|
561 |
-
# 'max_prompt_phones': 360,
|
562 |
-
# 'deep_clone': True,
|
563 |
-
# 'nar_guidance_w': 3
|
564 |
-
# }
|
565 |
-
|
566 |
-
# chunks = chunk_text(preprocess(text))
|
567 |
-
# audio_segments = []
|
568 |
-
|
569 |
-
# for chunk in chunks:
|
570 |
-
# wav = torch.zeros(1, mars5.sr) # Use a placeholder silent audio for the reference
|
571 |
-
# cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
|
572 |
-
# ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
|
573 |
-
|
574 |
-
|
575 |
-
# temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
|
576 |
-
# torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
|
577 |
-
# audio_segments.append(AudioSegment.from_wav(temp_audio_path))
|
578 |
-
|
579 |
-
# combined_audio = sum(audio_segments)
|
580 |
-
# combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
|
581 |
-
# combined_audio.export(combined_audio_path, format="wav")
|
582 |
-
|
583 |
-
# logging.debug(f"Audio saved to {combined_audio_path}")
|
584 |
-
# return combined_audio_path
|
585 |
-
|
586 |
# Load the MARS5 model
|
587 |
mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
|
588 |
|
589 |
-
# Setting device and precision
|
590 |
-
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
591 |
-
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
592 |
-
mars5.to(device)
|
593 |
-
|
594 |
-
SAMPLE_RATE = 22050
|
595 |
-
SEED = 42
|
596 |
-
|
597 |
-
def preprocess(text):
|
598 |
-
number_normalizer = EnglishNumberNormalizer()
|
599 |
-
text = number_normalizer(text).strip()
|
600 |
-
if text[-1] not in punctuation:
|
601 |
-
text = f"{text}."
|
602 |
-
|
603 |
-
abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'
|
604 |
-
def separate_abb(chunk):
|
605 |
-
chunk = chunk.replace(".", "")
|
606 |
-
return " ".join(chunk)
|
607 |
-
|
608 |
-
abbreviations = re.findall(abbreviations_pattern, text)
|
609 |
-
for abv in abbreviations:
|
610 |
-
if abv in text:
|
611 |
-
text = text.replace(abv, separate_abb(abv))
|
612 |
-
return text
|
613 |
-
|
614 |
-
def chunk_text(text, max_length=250):
|
615 |
-
words = text.split()
|
616 |
-
chunks = []
|
617 |
-
current_chunk = []
|
618 |
-
current_length = 0
|
619 |
-
|
620 |
-
for word in words:
|
621 |
-
if current_length + len(word) + 1 <= max_length:
|
622 |
-
current_chunk.append(word)
|
623 |
-
current_length += len(word) + 1
|
624 |
-
else:
|
625 |
-
chunks.append(' '.join(current_chunk))
|
626 |
-
current_chunk = [word]
|
627 |
-
current_length = len(word) + 1
|
628 |
-
|
629 |
-
if current_chunk:
|
630 |
-
chunks.append(' '.join(current_chunk))
|
631 |
-
|
632 |
-
return chunks
|
633 |
-
|
634 |
def generate_audio_mars5(text):
|
635 |
description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
|
636 |
kwargs_dict = {
|
@@ -645,24 +562,20 @@ def generate_audio_mars5(text):
|
|
645 |
'deep_clone': True,
|
646 |
'nar_guidance_w': 3
|
647 |
}
|
648 |
-
|
649 |
chunks = chunk_text(preprocess(text))
|
650 |
audio_segments = []
|
651 |
|
652 |
-
|
653 |
wav = torch.zeros(1, mars5.sr) # Use a placeholder silent audio for the reference
|
654 |
cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
|
655 |
ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
|
656 |
|
|
|
657 |
temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
|
658 |
torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
|
659 |
-
|
660 |
-
|
661 |
-
# Use concurrent futures for parallel processing
|
662 |
-
with concurrent.futures.ThreadPoolExecutor() as executor:
|
663 |
-
results = list(executor.map(process_chunk, chunks))
|
664 |
-
audio_segments.extend(results)
|
665 |
-
|
666 |
combined_audio = sum(audio_segments)
|
667 |
combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
|
668 |
combined_audio.export(combined_audio_path, format="wav")
|
@@ -671,6 +584,7 @@ def generate_audio_mars5(text):
|
|
671 |
return combined_audio_path
|
672 |
|
673 |
|
|
|
674 |
pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
|
675 |
pipe.to(device)
|
676 |
|
|
|
545 |
logging.debug(f"Audio saved to {combined_audio_path}")
|
546 |
return combined_audio_path
|
547 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
548 |
# Load the MARS5 model
|
549 |
mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
|
550 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
551 |
def generate_audio_mars5(text):
|
552 |
description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
|
553 |
kwargs_dict = {
|
|
|
562 |
'deep_clone': True,
|
563 |
'nar_guidance_w': 3
|
564 |
}
|
565 |
+
|
566 |
chunks = chunk_text(preprocess(text))
|
567 |
audio_segments = []
|
568 |
|
569 |
+
for chunk in chunks:
|
570 |
wav = torch.zeros(1, mars5.sr) # Use a placeholder silent audio for the reference
|
571 |
cfg = config_class(**{k: kwargs_dict[k] for k in kwargs_dict if k in config_class.__dataclass_fields__})
|
572 |
ar_codes, wav_out = mars5.tts(chunk, wav, "", cfg=cfg)
|
573 |
|
574 |
+
|
575 |
temp_audio_path = os.path.join(tempfile.gettempdir(), f"mars5_audio_{len(audio_segments)}.wav")
|
576 |
torchaudio.save(temp_audio_path, wav_out.unsqueeze(0), mars5.sr)
|
577 |
+
audio_segments.append(AudioSegment.from_wav(temp_audio_path))
|
578 |
+
|
|
|
|
|
|
|
|
|
|
|
579 |
combined_audio = sum(audio_segments)
|
580 |
combined_audio_path = os.path.join(tempfile.gettempdir(), "mars5_combined_audio.wav")
|
581 |
combined_audio.export(combined_audio_path, format="wav")
|
|
|
584 |
return combined_audio_path
|
585 |
|
586 |
|
587 |
+
|
588 |
pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16)
|
589 |
pipe.to(device)
|
590 |
|