IT2091024v2

Paused

App Files Files Community

Pijush2023 committed on Aug 25, 2024

Commit

6b4d9f8

verified ·

1 Parent(s): 8969c48

Update app.py

Browse files

Files changed (1) hide show

app.py +134 -50

app.py CHANGED Viewed

@@ -584,75 +584,159 @@ def generate_audio_elevenlabs(text):
         return None
-repo_id = "parler-tts/parler-tts-mini-v1"
-parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
-parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
-parler_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
-SAMPLE_RATE = parler_feature_extractor.sampling_rate
-def preprocess(text):
-    number_normalizer = EnglishNumberNormalizer()
-    text = number_normalizer(text).strip()
-    if text[-1] not in punctuation:
-        text = f"{text}."
-    abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'
-    def separate_abb(chunk):
-        chunk = chunk.replace(".", "")
-        return " ".join(chunk)
-    abbreviations = re.findall(abbreviations_pattern, text)
-    for abv in abbreviations:
-        if abv in text:
-            text = text.replace(abv, separate_abb(abv))
-    return text
-def chunk_text(text, max_length=250):
-    words = text.split()
-    chunks = []
-    current_chunk = []
-    current_length = 0
-    for word in words:
-        if current_length + len(word) + 1 <= max_length:
-            current_chunk.append(word)
-            current_length += len(word) + 1
-        else:
-            chunks.append(' '.join(current_chunk))
-            current_chunk = [word]
-            current_length = len(word) + 1
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
-    return chunks
-def generate_audio_parler_tts(text):
-    description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-    chunks = chunk_text(preprocess(text))
-    audio_segments = []
-    for chunk in chunks:
-        input_ids = parler_tokenizer(description, return_tensors="pt").input_ids.to(device)
-        prompt_input_ids = parler_tokenizer(chunk, return_tensors="pt").input_ids.to(device)
-        generation = parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-        audio_arr = generation.cpu().numpy().squeeze()
-        temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_{len(audio_segments)}.wav")
-        sf.write(temp_audio_path, audio_arr, parler_model.config.sampling_rate)
-        audio_segments.append(AudioSegment.from_wav(temp_audio_path))
-    combined_audio = sum(audio_segments)
     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio.wav")
-    combined_audio.export(combined_audio_path, format="wav")
     logging.debug(f"Audio saved to {combined_audio_path}")
     return combined_audio_path
 # Load the MARS5 model
 mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)

         return None
+# repo_id = "parler-tts/parler-tts-mini-v1"
+# parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+# parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+# parler_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
+# SAMPLE_RATE = parler_feature_extractor.sampling_rate
+# def preprocess(text):
+#     number_normalizer = EnglishNumberNormalizer()
+#     text = number_normalizer(text).strip()
+#     if text[-1] not in punctuation:
+#         text = f"{text}."
+#     abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'
+#     def separate_abb(chunk):
+#         chunk = chunk.replace(".", "")
+#         return " ".join(chunk)
+#     abbreviations = re.findall(abbreviations_pattern, text)
+#     for abv in abbreviations:
+#         if abv in text:
+#             text = text.replace(abv, separate_abb(abv))
+#     return text
+# def chunk_text(text, max_length=250):
+#     words = text.split()
+#     chunks = []
+#     current_chunk = []
+#     current_length = 0
+#     for word in words:
+#         if current_length + len(word) + 1 <= max_length:
+#             current_chunk.append(word)
+#             current_length += len(word) + 1
+#         else:
+#             chunks.append(' '.join(current_chunk))
+#             current_chunk = [word]
+#             current_length = len(word) + 1
+#     if current_chunk:
+#         chunks.append(' '.join(current_chunk))
+#     return chunks
+# def generate_audio_parler_tts(text):
+#     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+#     chunks = chunk_text(preprocess(text))
+#     audio_segments = []
+#     for chunk in chunks:
+#         input_ids = parler_tokenizer(description, return_tensors="pt").input_ids.to(device)
+#         prompt_input_ids = parler_tokenizer(chunk, return_tensors="pt").input_ids.to(device)
+#         generation = parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+#         audio_arr = generation.cpu().numpy().squeeze()
+#         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_{len(audio_segments)}.wav")
+#         sf.write(temp_audio_path, audio_arr, parler_model.config.sampling_rate)
+#         audio_segments.append(AudioSegment.from_wav(temp_audio_path))
+#     combined_audio = sum(audio_segments)
+#     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio.wav")
+#     combined_audio.export(combined_audio_path, format="wav")
+#     logging.debug(f"Audio saved to {combined_audio_path}")
+#     return combined_audio_path
+import torch
+from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+from transformers import AutoTokenizer
+from threading import Thread
+import tempfile
+import soundfile as sf
+import numpy as np
+import os
# Parler TTS configuration.
# Pick the best available backend instead of hard-coding "cuda:0": moving the
# model to a CUDA device raises at import time on hosts without a GPU.
if torch.cuda.is_available():
    torch_device = "cuda:0"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    torch_device = "mps"  # Apple Silicon
else:
    torch_device = "cpu"
# bfloat16 is kept for accelerators (as before); float32 is the safe default on CPU.
torch_dtype = torch.float32 if torch_device == "cpu" else torch.bfloat16
model_name = "parler-tts/parler-tts-mini-v1"

# Load model and tokenizer (downloads the checkpoint on first use).
tokenizer = AutoTokenizer.from_pretrained(model_name)
parler_model = ParlerTTSForConditionalGeneration.from_pretrained(
    model_name,
).to(torch_device, dtype=torch_dtype)

# Read frame_rate and sampling_rate from the model's audio encoder configuration.
# frame_rate is used to size streaming chunks; sampling_rate is used when the
# generated audio is written to disk.
frame_rate = parler_model.audio_encoder.config.frame_rate
sampling_rate = parler_model.audio_encoder.config.sampling_rate
def preprocess_text_for_tts(text):
    """Prepare *text* for speech synthesis.

    Steps:
      1. Strip surrounding whitespace.
      2. Make sure the text ends with sentence-final punctuation; a period is
         appended only when the text does not already end in ".", "!" or "?".
      3. Spell out all-caps abbreviations letter by letter
         ("NASA" -> "N A S A").

    Numbers are left untouched.

    Args:
        text: The raw text to be spoken.

    Returns:
        The cleaned-up text, or an empty string when *text* is empty or
        contains only whitespace (there is nothing to speak).
    """
    text = text.strip()
    if not text:
        return text

    # Do not turn "Really?" into "Really?." or "Done. " into "Done. ."
    if not text.endswith((".", "!", "?")):
        text += "."

    # Separate the letters of abbreviations (two or more capitals in a row)
    # so that they are pronounced one letter at a time.
    return re.sub(r'\b[A-Z]{2,}\b', lambda m: ' '.join(m.group()), text)
def generate_audio_parler_tts(text, description="A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up.", play_steps_in_s=0.5):
    """Synthesize *text* with Parler-TTS and save the result as a WAV file.

    Generation runs in a background thread and pushes audio chunks into a
    ParlerTTSStreamer; this function collects every chunk, concatenates them
    and writes a single file.

    Args:
        text: The text to speak. It is passed through
            preprocess_text_for_tts before tokenization.
        description: Natural-language description of the desired voice,
            tokenized and passed to the model as ``input_ids``.
        play_steps_in_s: Approximate length, in seconds, of each streamed
            chunk; converted to decoder steps via the module-level frame_rate.

    Returns:
        Path of the written WAV file, or None when *text* is blank or the
        model produced no audio. The file name is fixed, so each call
        overwrites the output of the previous one.
    """
    if not text or not text.strip():
        logging.error("Parler-TTS: no text to synthesize")
        return None

    text = preprocess_text_for_tts(text)  # Preprocess the text
    play_steps = int(frame_rate * play_steps_in_s)
    streamer = ParlerTTSStreamer(parler_model, device=torch_device, play_steps=play_steps)

    # Tokenization
    inputs = tokenizer(description, return_tensors="pt").to(torch_device)
    prompt = tokenizer(text, return_tensors="pt").to(torch_device)

    # Create generation kwargs. Both attention masks are passed, matching the
    # library's streaming example.
    generation_kwargs = dict(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        prompt_input_ids=prompt.input_ids,
        prompt_attention_mask=prompt.attention_mask,
        streamer=streamer,
        do_sample=True,
        temperature=0.8,  # Adjusting temperature for clearer pronunciation
        min_new_tokens=10,
    )

    # Run generation in a worker thread so the streamer can be consumed here.
    thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
    thread.start()

    audio_segments = []
    try:
        # Iterate over chunks of audio
        for new_audio in streamer:
            if new_audio.shape[0] == 0:
                break
            # Chunks may arrive as tensors or array-likes; normalize to CPU
            # float32 tensors (bfloat16 cannot be converted to numpy).
            if not isinstance(new_audio, torch.Tensor):
                new_audio = torch.as_tensor(new_audio)
            audio_segments.append(new_audio.detach().cpu().float())
    finally:
        # Never leave the generation thread running after we return.
        thread.join()

    if not audio_segments:
        # torch.cat() raises on an empty list; report the failure instead.
        logging.error("Parler-TTS produced no audio")
        return None

    # Combine all audio segments into a single array
    combined_audio = torch.cat(audio_segments, dim=0).numpy()

    # Save the combined audio to a file
    combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio.wav")
    sf.write(combined_audio_path, combined_audio, sampling_rate)
    logging.debug(f"Audio saved to {combined_audio_path}")
    return combined_audio_path
 # Load the MARS5 model
 mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)