IT2091024v2

Paused

App Files Files Community

Pijush2023 committed on Jul 6, 2024

Commit

cfcb1b1

verified ·

1 Parent(s): aeeb222

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -11

app.py CHANGED Viewed

@@ -532,7 +532,63 @@ def generate_audio_elevenlabs(text):
 #     logging.debug(f"Audio saved to {temp_audio_path}")
 #     return temp_audio_path
-def generate_audio_parler_tts(text):
     model_id = 'parler-tts/parler_tts_mini_v0.1'
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     try:
@@ -546,22 +602,20 @@ def generate_audio_parler_tts(text):
     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-    max_length = model.config.max_length
-    # Split the text into smaller chunks if it exceeds the max length
-    text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
-    audio_segments = []
-    for chunk in text_chunks:
         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
         audio_arr = generation.cpu().numpy().squeeze()
-        audio_segments.append(audio_arr)
-    combined_audio = np.concatenate(audio_segments)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-        sf.write(f.name, combined_audio, model.config.sampling_rate)
         temp_audio_path = f.name
     logging.debug(f"Audio saved to {temp_audio_path}")
@@ -571,6 +625,7 @@ def generate_audio_parler_tts(text):
 # Stable Diffusion setup
 pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
 pipe = pipe.to("cuda")

 #     logging.debug(f"Audio saved to {temp_audio_path}")
 #     return temp_audio_path
+# def generate_audio_parler_tts(text):
+#     model_id = 'parler-tts/parler_tts_mini_v0.1'
+#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+#     try:
+#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
+#     except torch.cuda.OutOfMemoryError:
+#         print("CUDA out of memory. Switching to CPU.")
+#         device = "cpu"
+#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
+#     tokenizer = AutoTokenizer.from_pretrained(model_id)
+#     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
+#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+#     max_length = model.config.max_length
+#     # Split the text into smaller chunks if it exceeds the max length
+#     text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
+#     audio_segments = []
+#     for chunk in text_chunks:
+#         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
+#         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+#         audio_arr = generation.cpu().numpy().squeeze()
+#         audio_segments.append(audio_arr)
+#     combined_audio = np.concatenate(audio_segments)
+#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+#         sf.write(f.name, combined_audio, model.config.sampling_rate)
+#         temp_audio_path = f.name
+#     logging.debug(f"Audio saved to {temp_audio_path}")
+#     return temp_audio_path
+def generate_audio_parler_tts(text, chunk_size=200):
+    def split_text(text, chunk_size):
+        # Split text into chunks of the specified size
+        words = text.split()
+        chunks = []
+        current_chunk = []
+        current_length = 0
+        for word in words:
+            if current_length + len(word) + 1 > chunk_size:
+                chunks.append(" ".join(current_chunk))
+                current_chunk = [word]
+                current_length = len(word) + 1
+            else:
+                current_chunk.append(word)
+                current_length += len(word) + 1
+        if current_chunk:
+            chunks.append(" ".join(current_chunk))
+        return chunks
     model_id = 'parler-tts/parler_tts_mini_v0.1'
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     try:
     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+    chunks = split_text(text, chunk_size)
+    audio_arrs = []
+    for chunk in chunks:
         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
         audio_arr = generation.cpu().numpy().squeeze()
+        audio_arrs.append(audio_arr)
+    # Concatenate all audio arrays into a single array
+    concatenated_audio = np.concatenate(audio_arrs)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+        sf.write(f.name, concatenated_audio, model.config.sampling_rate)
         temp_audio_path = f.name
     logging.debug(f"Audio saved to {temp_audio_path}")
 # Stable Diffusion setup
 pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
 pipe = pipe.to("cuda")