Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -584,157 +584,79 @@ def generate_audio_elevenlabs(text):
|
|
584 |
return None
|
585 |
|
586 |
|
587 |
-
|
588 |
|
589 |
-
|
590 |
-
|
591 |
-
|
592 |
|
593 |
-
|
594 |
|
595 |
-
|
596 |
-
|
597 |
-
|
598 |
-
|
599 |
-
|
600 |
|
601 |
-
|
602 |
|
603 |
-
|
604 |
-
|
605 |
-
|
606 |
|
607 |
-
|
608 |
-
|
609 |
-
|
610 |
-
|
611 |
-
|
612 |
-
|
613 |
-
# def chunk_text(text, max_length=250):
|
614 |
-
# words = text.split()
|
615 |
-
# chunks = []
|
616 |
-
# current_chunk = []
|
617 |
-
# current_length = 0
|
618 |
-
|
619 |
-
# for word in words:
|
620 |
-
# if current_length + len(word) + 1 <= max_length:
|
621 |
-
# current_chunk.append(word)
|
622 |
-
# current_length += len(word) + 1
|
623 |
-
# else:
|
624 |
-
# chunks.append(' '.join(current_chunk))
|
625 |
-
# current_chunk = [word]
|
626 |
-
# current_length = len(word) + 1
|
627 |
-
|
628 |
-
# if current_chunk:
|
629 |
-
# chunks.append(' '.join(current_chunk))
|
630 |
-
|
631 |
-
# return chunks
|
632 |
|
633 |
-
|
634 |
-
|
635 |
-
|
636 |
-
|
|
|
637 |
|
638 |
-
|
639 |
-
|
640 |
-
|
|
|
|
|
|
|
|
|
|
|
641 |
|
642 |
-
|
643 |
-
|
644 |
|
645 |
-
|
646 |
-
# sf.write(temp_audio_path, audio_arr, parler_model.config.sampling_rate)
|
647 |
-
# audio_segments.append(AudioSegment.from_wav(temp_audio_path))
|
648 |
|
649 |
-
|
650 |
-
|
651 |
-
|
|
|
652 |
|
653 |
-
|
654 |
-
|
|
|
655 |
|
|
|
|
|
656 |
|
|
|
|
|
|
|
657 |
|
|
|
|
|
|
|
658 |
|
659 |
-
|
660 |
-
|
661 |
-
from transformers import AutoTokenizer
|
662 |
-
from threading import Thread
|
663 |
-
import tempfile
|
664 |
-
import soundfile as sf
|
665 |
-
import numpy as np
|
666 |
-
import os
|
667 |
|
668 |
-
# Parler TTS configuration
|
669 |
-
torch_device = "cuda:0" # Use "mps" for Mac
|
670 |
-
torch_dtype = torch.bfloat16
|
671 |
-
model_name = "parler-tts/parler-tts-mini-v1"
|
672 |
-
|
673 |
-
# Load model and tokenizer
|
674 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
675 |
-
parler_model = ParlerTTSForConditionalGeneration.from_pretrained(
|
676 |
-
model_name,
|
677 |
-
).to(torch_device, dtype=torch_dtype)
|
678 |
-
|
679 |
-
# Define frame_rate from the model's audio encoder configuration
|
680 |
-
frame_rate = parler_model.audio_encoder.config.frame_rate
|
681 |
-
sampling_rate = parler_model.audio_encoder.config.sampling_rate
|
682 |
-
|
683 |
-
def preprocess_text_for_tts(text):
|
684 |
-
# Add a period at the end if not present
|
685 |
-
if not text.endswith('.'):
|
686 |
-
text += '.'
|
687 |
-
|
688 |
-
# Normalize abbreviations and numbers
|
689 |
-
text = re.sub(r'\b[A-Z]{2,}\b', lambda m: ' '.join(list(m.group())), text)
|
690 |
-
return text
|
691 |
|
692 |
-
def generate_audio_parler_tts(text, description="A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up.", play_steps_in_s=0.5):
|
693 |
-
text = preprocess_text_for_tts(text) # Preprocess the text
|
694 |
-
play_steps = int(frame_rate * play_steps_in_s)
|
695 |
-
streamer = ParlerTTSStreamer(parler_model, device=torch_device, play_steps=play_steps)
|
696 |
-
|
697 |
-
# Tokenization
|
698 |
-
inputs = tokenizer(description, return_tensors="pt").to(torch_device)
|
699 |
-
prompt = tokenizer(text, return_tensors="pt").to(torch_device)
|
700 |
-
|
701 |
-
# Create generation kwargs
|
702 |
-
generation_kwargs = dict(
|
703 |
-
input_ids=inputs.input_ids,
|
704 |
-
attention_mask=inputs.attention_mask,
|
705 |
-
prompt_input_ids=prompt.input_ids,
|
706 |
-
streamer=streamer,
|
707 |
-
do_sample=True,
|
708 |
-
temperature=0.8, # Adjusting temperature for clearer pronunciation
|
709 |
-
min_new_tokens=10,
|
710 |
-
)
|
711 |
-
|
712 |
-
# Initialize Thread
|
713 |
-
thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
|
714 |
-
thread.start()
|
715 |
-
|
716 |
-
# Prepare for audio concatenation
|
717 |
-
audio_segments = []
|
718 |
|
719 |
-
# Iterate over chunks of audio
|
720 |
-
for new_audio in streamer:
|
721 |
-
if new_audio.shape[0] == 0:
|
722 |
-
break
|
723 |
-
# Ensure the audio chunk is a tensor
|
724 |
-
if isinstance(new_audio, torch.Tensor):
|
725 |
-
audio_segments.append(new_audio)
|
726 |
-
else:
|
727 |
-
audio_segments.append(torch.tensor(new_audio))
|
728 |
|
729 |
-
# Combine all audio segments into a single tensor
|
730 |
-
combined_audio = torch.cat(audio_segments, dim=0).cpu().numpy()
|
731 |
-
|
732 |
-
# Save the combined audio to a file
|
733 |
-
combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio.wav")
|
734 |
-
sf.write(combined_audio_path, combined_audio, sampling_rate)
|
735 |
|
736 |
-
logging.debug(f"Audio saved to {combined_audio_path}")
|
737 |
-
return combined_audio_path
|
738 |
|
739 |
|
740 |
|
|
|
584 |
return None
|
585 |
|
586 |
|
587 |
+
repo_id = "parler-tts/parler-tts-mini-v1"
|
588 |
|
589 |
+
parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
|
590 |
+
parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
591 |
+
parler_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
592 |
|
593 |
+
SAMPLE_RATE = parler_feature_extractor.sampling_rate
|
594 |
|
595 |
+
def preprocess(text):
|
596 |
+
number_normalizer = EnglishNumberNormalizer()
|
597 |
+
text = number_normalizer(text).strip()
|
598 |
+
if text[-1] not in punctuation:
|
599 |
+
text = f"{text}."
|
600 |
|
601 |
+
abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'
|
602 |
|
603 |
+
def separate_abb(chunk):
|
604 |
+
chunk = chunk.replace(".", "")
|
605 |
+
return " ".join(chunk)
|
606 |
|
607 |
+
abbreviations = re.findall(abbreviations_pattern, text)
|
608 |
+
for abv in abbreviations:
|
609 |
+
if abv in text:
|
610 |
+
text = text.replace(abv, separate_abb(abv))
|
611 |
+
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
612 |
|
613 |
+
def chunk_text(text, max_length=250):
|
614 |
+
words = text.split()
|
615 |
+
chunks = []
|
616 |
+
current_chunk = []
|
617 |
+
current_length = 0
|
618 |
|
619 |
+
for word in words:
|
620 |
+
if current_length + len(word) + 1 <= max_length:
|
621 |
+
current_chunk.append(word)
|
622 |
+
current_length += len(word) + 1
|
623 |
+
else:
|
624 |
+
chunks.append(' '.join(current_chunk))
|
625 |
+
current_chunk = [word]
|
626 |
+
current_length = len(word) + 1
|
627 |
|
628 |
+
if current_chunk:
|
629 |
+
chunks.append(' '.join(current_chunk))
|
630 |
|
631 |
+
return chunks
|
|
|
|
|
632 |
|
633 |
+
def generate_audio_parler_tts(text):
|
634 |
+
description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
|
635 |
+
chunks = chunk_text(preprocess(text))
|
636 |
+
audio_segments = []
|
637 |
|
638 |
+
for chunk in chunks:
|
639 |
+
input_ids = parler_tokenizer(description, return_tensors="pt").input_ids.to(device)
|
640 |
+
prompt_input_ids = parler_tokenizer(chunk, return_tensors="pt").input_ids.to(device)
|
641 |
|
642 |
+
generation = parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
643 |
+
audio_arr = generation.cpu().numpy().squeeze()
|
644 |
|
645 |
+
temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_{len(audio_segments)}.wav")
|
646 |
+
sf.write(temp_audio_path, audio_arr, parler_model.config.sampling_rate)
|
647 |
+
audio_segments.append(AudioSegment.from_wav(temp_audio_path))
|
648 |
|
649 |
+
combined_audio = sum(audio_segments)
|
650 |
+
combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio.wav")
|
651 |
+
combined_audio.export(combined_audio_path, format="wav")
|
652 |
|
653 |
+
logging.debug(f"Audio saved to {combined_audio_path}")
|
654 |
+
return combined_audio_path
|
|
|
|
|
|
|
|
|
|
|
|
|
655 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
656 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
657 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
658 |
|
|
|
|
|
|
|
|
|
|
|
|
|
659 |
|
|
|
|
|
660 |
|
661 |
|
662 |
|