Update app.py
app.py CHANGED
@@ -526,151 +526,7 @@ def generate_audio_elevenlabs(text):
         logging.error(f"Error generating audio: {response.text}")
         return None
 
-
-# model_id = 'parler-tts/parler_tts_mini_v0.1'
-# device = "cuda:0" if torch.cuda.is_available() else "cpu"
-# try:
-#     model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-# except torch.cuda.OutOfMemoryError:
-#     print("CUDA out of memory. Switching to CPU.")
-#     device = "cpu"
-#     model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-# tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-# description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
-
-# input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-# prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
-
-# generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-# audio_arr = generation.cpu().numpy().squeeze()
-
-# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-#     sf.write(f.name, audio_arr, model.config.sampling_rate)
-#     temp_audio_path = f.name
-
-# logging.debug(f"Audio saved to {temp_audio_path}")
-# return temp_audio_path
-
-# def generate_audio_parler_tts(text):
-#     model_id = 'parler-tts/parler_tts_mini_v0.1'
-#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
-#     try:
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     except torch.cuda.OutOfMemoryError:
-#         print("CUDA out of memory. Switching to CPU.")
-#         device = "cpu"
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-#     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
-
-#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-#     max_length = model.config.max_length
-
-#     # Split the text into smaller chunks if it exceeds the max length
-#     text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
-#     audio_segments = []
-
-#     for chunk in text_chunks:
-#         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
-#         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-#         audio_arr = generation.cpu().numpy().squeeze()
-#         audio_segments.append(audio_arr)
-
-#     combined_audio = np.concatenate(audio_segments)
-
-#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-#         sf.write(f.name, combined_audio, model.config.sampling_rate)
-#         temp_audio_path = f.name
-
-#     logging.debug(f"Audio saved to {temp_audio_path}")
-#     return temp_audio_path
-
-# def generate_audio_parler_tts(text, chunk_size=200):
-#     def split_text(text, chunk_size):
-#         # Split text into chunks of the specified size
-#         words = text.split()
-#         chunks = []
-#         current_chunk = []
-#         current_length = 0
-
-#         for word in words:
-#             if current_length + len(word) + 1 > chunk_size:
-#                 chunks.append(" ".join(current_chunk))
-#                 current_chunk = [word]
-#                 current_length = len(word) + 1
-#             else:
-#                 current_chunk.append(word)
-#                 current_length += len(word) + 1
-
-#         if current_chunk:
-#             chunks.append(" ".join(current_chunk))
-
-#         return chunks
-
-#     model_id = 'parler-tts/parler_tts_mini_v0.1'
-#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
-#     try:
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     except torch.cuda.OutOfMemoryError:
-#         print("CUDA out of memory. Switching to CPU.")
-#         device = "cpu"
-#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-#     tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-#     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
-
-#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-#     chunks = split_text(text, chunk_size)
-#     audio_arrs = []
-
-#     for chunk in chunks:
-#         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
-#         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-#         audio_arr = generation.cpu().numpy().squeeze()
-#         audio_arrs.append(audio_arr)
-
-#     # Concatenate all audio arrays into a single array
-#     concatenated_audio = np.concatenate(audio_arrs)
-
-#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-#         sf.write(f.name, concatenated_audio, model.config.sampling_rate)
-#         temp_audio_path = f.name
-
-#     logging.debug(f"Audio saved to {temp_audio_path}")
-#     return temp_audio_path
-
-
-import concurrent.futures
-
-def generate_audio_parler_tts(text, chunk_size=200):
-    def split_text(text, chunk_size):
-        words = text.split()
-        chunks = []
-        current_chunk = []
-        current_length = 0
-
-        for word in words:
-            if current_length + len(word) + 1 > chunk_size:
-                chunks.append(" ".join(current_chunk))
-                current_chunk = [word]
-                current_length = len(word) + 1
-            else:
-                current_chunk.append(word)
-                current_length += len(word) + 1
-
-        if current_chunk:
-            chunks.append(" ".join(current_chunk))
-
-        return chunks
-
-    def process_chunk(chunk):
-        prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
-        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-        audio_arr = generation.cpu().numpy().squeeze()
-        return audio_arr
-
+def generate_audio_parler_tts(text):
     model_id = 'parler-tts/parler_tts_mini_v0.1'
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     try:
@@ -684,28 +540,18 @@ def generate_audio_parler_tts(text, chunk_size=200):
     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
 
     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-
-    chunks = split_text(text, chunk_size)
-
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
-        audio_arrs = [future.result() for future in concurrent.futures.as_completed(futures)]
-
-    # Concatenate all audio arrays into a single array
-    concatenated_audio = np.concatenate(audio_arrs)
+    prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
+
+    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+    audio_arr = generation.cpu().numpy().squeeze()
 
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-        sf.write(f.name, concatenated_audio, model.config.sampling_rate)
+        sf.write(f.name, audio_arr, model.config.sampling_rate)
         temp_audio_path = f.name
 
     logging.debug(f"Audio saved to {temp_audio_path}")
     return temp_audio_path
 
-
-
-
-
-
 # Stable Diffusion setup
 pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
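
For reference, here is the function this commit settles on, assembled from the two hunks above into one self-contained sketch. The imports are assumptions inferred from the calls visible in the diff (parler_tts, transformers, soundfile, torch, plus the standard library), not confirmed lines from the rest of app.py:

import logging
import tempfile

import soundfile as sf
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer


def generate_audio_parler_tts(text):
    model_id = 'parler-tts/parler_tts_mini_v0.1'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    try:
        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
    except torch.cuda.OutOfMemoryError:
        # Fall back to CPU if the model does not fit on the GPU.
        print("CUDA out of memory. Switching to CPU.")
        device = "cpu"
        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # The description conditions the voice; the input text becomes the prompt.
    description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."

    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_arr = generation.cpu().numpy().squeeze()

    # Write the waveform to a temporary .wav file and return its path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        sf.write(f.name, audio_arr, model.config.sampling_rate)
        temp_audio_path = f.name

    logging.debug(f"Audio saved to {temp_audio_path}")
    return temp_audio_path

A call such as generate_audio_parler_tts("Hello there.") would then return a filesystem path to the synthesized .wav file.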
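
One subtlety in the removed thread-pooled variant: concurrent.futures.as_completed yields futures in completion order, not submission order, so audio_arrs could be concatenated out of sequence and the spoken chunks shuffled. If parallel chunk synthesis is ever reinstated, executor.map preserves input order. A minimal sketch, where process_chunk stands in for the removed helper of the same name and the wrapper's name is hypothetical:

import concurrent.futures

import numpy as np


def synthesize_chunks_in_order(chunks, process_chunk):
    # executor.map yields results in the order of `chunks`, unlike
    # as_completed, which yields whichever future finishes first.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        audio_arrs = list(executor.map(process_chunk, chunks))
    return np.concatenate(audio_arrs)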