Spaces: Paused
Update app.py
Browse files
app.py
CHANGED
@@ -506,6 +506,32 @@ def generate_audio_elevenlabs(text):
|
|
506 |
logging.error(f"Error generating audio: {response.text}")
|
507 |
return None
|
508 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
509 |
def generate_audio_parler_tts(text):
|
510 |
model_id = 'parler-tts/parler_tts_mini_v0.1'
|
511 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
@@ -520,18 +546,31 @@ def generate_audio_parler_tts(text):
|
|
520 |
description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
|
521 |
|
522 |
input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
|
523 |
-
|
524 |
|
525 |
-
|
526 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
527 |
|
|
|
|
|
528 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
529 |
-
sf.write(f.name,
|
530 |
temp_audio_path = f.name
|
531 |
|
532 |
logging.debug(f"Audio saved to {temp_audio_path}")
|
533 |
return temp_audio_path
|
534 |
|
|
|
|
|
|
|
|
|
535 |
# Stable Diffusion setup
|
536 |
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
|
537 |
pipe = pipe.to("cuda")
|
|
|
506 |
logging.error(f"Error generating audio: {response.text}")
|
507 |
return None
|
508 |
|
509 |
+
# def generate_audio_parler_tts(text):
|
510 |
+
# model_id = 'parler-tts/parler_tts_mini_v0.1'
|
511 |
+
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
512 |
+
# try:
|
513 |
+
# model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
|
514 |
+
# except torch.cuda.OutOfMemoryError:
|
515 |
+
# print("CUDA out of memory. Switching to CPU.")
|
516 |
+
# device = "cpu"
|
517 |
+
# model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
|
518 |
+
# tokenizer = AutoTokenizer.from_pretrained(model_id)
|
519 |
+
|
520 |
+
# description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
|
521 |
+
|
522 |
+
# input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
|
523 |
+
# prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
|
524 |
+
|
525 |
+
# generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
526 |
+
# audio_arr = generation.cpu().numpy().squeeze()
|
527 |
+
|
528 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
529 |
+
# sf.write(f.name, audio_arr, model.config.sampling_rate)
|
530 |
+
# temp_audio_path = f.name
|
531 |
+
|
532 |
+
# logging.debug(f"Audio saved to {temp_audio_path}")
|
533 |
+
# return temp_audio_path
|
534 |
+
|
535 |
def generate_audio_parler_tts(text):
|
536 |
model_id = 'parler-tts/parler_tts_mini_v0.1'
|
537 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
|
546 |
description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
|
547 |
|
548 |
input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
|
549 |
+
max_length = model.config.max_length
|
550 |
|
551 |
+
# Split the text into smaller chunks if it exceeds the max length
|
552 |
+
text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
|
553 |
+
audio_segments = []
|
554 |
+
|
555 |
+
for chunk in text_chunks:
|
556 |
+
prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
|
557 |
+
generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
558 |
+
audio_arr = generation.cpu().numpy().squeeze()
|
559 |
+
audio_segments.append(audio_arr)
|
560 |
|
561 |
+
combined_audio = np.concatenate(audio_segments)
|
562 |
+
|
563 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
564 |
+
sf.write(f.name, combined_audio, model.config.sampling_rate)
|
565 |
temp_audio_path = f.name
|
566 |
|
567 |
logging.debug(f"Audio saved to {temp_audio_path}")
|
568 |
return temp_audio_path
|
569 |
|
570 |
+
|
571 |
+
|
572 |
+
|
573 |
+
|
574 |
# Stable Diffusion setup
|
575 |
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
|
576 |
pipe = pipe.to("cuda")
|