Update app.py
app.py CHANGED
@@ -526,151 +526,7 @@ def generate_audio_elevenlabs(text):
         logging.error(f"Error generating audio: {response.text}")
         return None
 
-
-# model_id = 'parler-tts/parler_tts_mini_v0.1'
-# device = "cuda:0" if torch.cuda.is_available() else "cpu"
-# try:
-#     model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-# except torch.cuda.OutOfMemoryError:
-#     print("CUDA out of memory. Switching to CPU.")
-#     device = "cpu"
-#     model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-# tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-# description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
-
-# input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-# prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
-
-# generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-# audio_arr = generation.cpu().numpy().squeeze()
-
-# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-#     sf.write(f.name, audio_arr, model.config.sampling_rate)
-#     temp_audio_path = f.name
-
-# logging.debug(f"Audio saved to {temp_audio_path}")
-# return temp_audio_path
-
-# def generate_audio_parler_tts(text):
-#     model_id = 'parler-tts/parler_tts_mini_v0.1'
-#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
-#     try:
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     except torch.cuda.OutOfMemoryError:
-#         print("CUDA out of memory. Switching to CPU.")
-#         device = "cpu"
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-#     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
-
-#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-#     max_length = model.config.max_length
-
-#     # Split the text into smaller chunks if it exceeds the max length
-#     text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
-#     audio_segments = []
-
-#     for chunk in text_chunks:
-#         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
-#         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-#         audio_arr = generation.cpu().numpy().squeeze()
-#         audio_segments.append(audio_arr)
-
-#     combined_audio = np.concatenate(audio_segments)
-
-#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-#         sf.write(f.name, combined_audio, model.config.sampling_rate)
-#         temp_audio_path = f.name
-
-#     logging.debug(f"Audio saved to {temp_audio_path}")
-#     return temp_audio_path
-
-# def generate_audio_parler_tts(text, chunk_size=200):
-#     def split_text(text, chunk_size):
-#         # Split text into chunks of the specified size
-#         words = text.split()
-#         chunks = []
-#         current_chunk = []
-#         current_length = 0
-
-#         for word in words:
-#             if current_length + len(word) + 1 > chunk_size:
-#                 chunks.append(" ".join(current_chunk))
-#                 current_chunk = [word]
-#                 current_length = len(word) + 1
-#             else:
-#                 current_chunk.append(word)
-#                 current_length += len(word) + 1
-
-#         if current_chunk:
-#             chunks.append(" ".join(current_chunk))
-
-#         return chunks
-
-#     model_id = 'parler-tts/parler_tts_mini_v0.1'
-#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
-#     try:
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     except torch.cuda.OutOfMemoryError:
-#         print("CUDA out of memory. Switching to CPU.")
-#         device = "cpu"
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-#     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
-
-#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-#     chunks = split_text(text, chunk_size)
-#     audio_arrs = []
-
-#     for chunk in chunks:
-#         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
-#         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-#         audio_arr = generation.cpu().numpy().squeeze()
-#         audio_arrs.append(audio_arr)
-
-#     # Concatenate all audio arrays into a single array
-#     concatenated_audio = np.concatenate(audio_arrs)
-
-#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-#         sf.write(f.name, concatenated_audio, model.config.sampling_rate)
-#         temp_audio_path = f.name
-
-#     logging.debug(f"Audio saved to {temp_audio_path}")
-#     return temp_audio_path
-
-
-import concurrent.futures
-
-def generate_audio_parler_tts(text, chunk_size=200):
-    def split_text(text, chunk_size):
-        words = text.split()
-        chunks = []
-        current_chunk = []
-        current_length = 0
-
-        for word in words:
-            if current_length + len(word) + 1 > chunk_size:
-                chunks.append(" ".join(current_chunk))
-                current_chunk = [word]
-                current_length = len(word) + 1
-            else:
-                current_chunk.append(word)
-                current_length += len(word) + 1
-
-        if current_chunk:
-            chunks.append(" ".join(current_chunk))
-
-        return chunks
-
-    def process_chunk(chunk):
-        prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
-        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-        audio_arr = generation.cpu().numpy().squeeze()
-        return audio_arr
-
+def generate_audio_parler_tts(text):
     model_id = 'parler-tts/parler_tts_mini_v0.1'
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     try:
@@ -684,28 +540,18 @@ def generate_audio_parler_tts(text, chunk_size=200):
     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
 
     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-
-    chunks = split_text(text, chunk_size)
-
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
-        audio_arrs = [future.result() for future in concurrent.futures.as_completed(futures)]
-
-    # Concatenate all audio arrays into a single array
-    concatenated_audio = np.concatenate(audio_arrs)
+    prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
+
+    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+    audio_arr = generation.cpu().numpy().squeeze()
 
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-        sf.write(f.name, concatenated_audio, model.config.sampling_rate)
+        sf.write(f.name, audio_arr, model.config.sampling_rate)
         temp_audio_path = f.name
 
     logging.debug(f"Audio saved to {temp_audio_path}")
     return temp_audio_path
 
-
-
-
-
-
 # Stable Diffusion setup
 pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
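
For reference, here is the function this commit settles on, assembled from the two hunks above into one self-contained sketch. The imports are assumptions inferred from the calls visible in the diff (parler_tts, transformers, soundfile, torch, plus the standard library), not confirmed lines from the rest of app.py:

import logging
import tempfile

import soundfile as sf
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer


def generate_audio_parler_tts(text):
    model_id = 'parler-tts/parler_tts_mini_v0.1'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    try:
        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
    except torch.cuda.OutOfMemoryError:
        # Fall back to CPU if the model does not fit on the GPU.
        print("CUDA out of memory. Switching to CPU.")
        device = "cpu"
        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # The description conditions the voice; the input text becomes the prompt.
    description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."

    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_arr = generation.cpu().numpy().squeeze()

    # Write the waveform to a temporary .wav file and return its path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        sf.write(f.name, audio_arr, model.config.sampling_rate)
        temp_audio_path = f.name

    logging.debug(f"Audio saved to {temp_audio_path}")
    return temp_audio_path

A call such as generate_audio_parler_tts("Hello there.") would then return a filesystem path to the synthesized .wav file.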
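
One subtlety in the removed thread-pooled variant: concurrent.futures.as_completed yields futures in completion order, not submission order, so audio_arrs could be concatenated out of sequence and the spoken chunks shuffled. If parallel chunk synthesis is ever reinstated, executor.map preserves input order. A minimal sketch, where process_chunk stands in for the removed helper of the same name and the wrapper's name is hypothetical:

import concurrent.futures

import numpy as np


def synthesize_chunks_in_order(chunks, process_chunk):
    # executor.map yields results in the order of `chunks`, unlike
    # as_completed, which yields whichever future finishes first.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        audio_arrs = list(executor.map(process_chunk, chunks))
    return np.concatenate(audio_arrs)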