Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -532,7 +532,63 @@ def generate_audio_elevenlabs(text):
|
|
| 532 |
# logging.debug(f"Audio saved to {temp_audio_path}")
|
| 533 |
# return temp_audio_path
|
| 534 |
|
| 535 |
-
def generate_audio_parler_tts(text):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 536 |
model_id = 'parler-tts/parler_tts_mini_v0.1'
|
| 537 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 538 |
try:
|
|
@@ -546,22 +602,20 @@ def generate_audio_parler_tts(text):
|
|
| 546 |
description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
|
| 547 |
|
| 548 |
input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
|
| 549 |
-
|
|
|
|
| 550 |
|
| 551 |
-
|
| 552 |
-
text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
|
| 553 |
-
audio_segments = []
|
| 554 |
-
|
| 555 |
-
for chunk in text_chunks:
|
| 556 |
prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
|
| 557 |
generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
| 558 |
audio_arr = generation.cpu().numpy().squeeze()
|
| 559 |
-
|
|
|
|
|
|
|
|
|
|
| 560 |
|
| 561 |
-
combined_audio = np.concatenate(audio_segments)
|
| 562 |
-
|
| 563 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
| 564 |
-
sf.write(f.name,
|
| 565 |
temp_audio_path = f.name
|
| 566 |
|
| 567 |
logging.debug(f"Audio saved to {temp_audio_path}")
|
|
@@ -571,6 +625,7 @@ def generate_audio_parler_tts(text):
|
|
| 571 |
|
| 572 |
|
| 573 |
|
|
|
|
| 574 |
# Stable Diffusion setup
|
| 575 |
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
|
| 576 |
pipe = pipe.to("cuda")
|
|
|
|
| 532 |
# logging.debug(f"Audio saved to {temp_audio_path}")
|
| 533 |
# return temp_audio_path
|
| 534 |
|
| 535 |
+
# def generate_audio_parler_tts(text):
|
| 536 |
+
# model_id = 'parler-tts/parler_tts_mini_v0.1'
|
| 537 |
+
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 538 |
+
# try:
|
| 539 |
+
# model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
|
| 540 |
+
# except torch.cuda.OutOfMemoryError:
|
| 541 |
+
# print("CUDA out of memory. Switching to CPU.")
|
| 542 |
+
# device = "cpu"
|
| 543 |
+
# model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
|
| 544 |
+
# tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 545 |
+
|
| 546 |
+
# description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
|
| 547 |
+
|
| 548 |
+
# input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
|
| 549 |
+
# max_length = model.config.max_length
|
| 550 |
+
|
| 551 |
+
# # Split the text into smaller chunks if it exceeds the max length
|
| 552 |
+
# text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
|
| 553 |
+
# audio_segments = []
|
| 554 |
+
|
| 555 |
+
# for chunk in text_chunks:
|
| 556 |
+
# prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
|
| 557 |
+
# generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
| 558 |
+
# audio_arr = generation.cpu().numpy().squeeze()
|
| 559 |
+
# audio_segments.append(audio_arr)
|
| 560 |
+
|
| 561 |
+
# combined_audio = np.concatenate(audio_segments)
|
| 562 |
+
|
| 563 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
| 564 |
+
# sf.write(f.name, combined_audio, model.config.sampling_rate)
|
| 565 |
+
# temp_audio_path = f.name
|
| 566 |
+
|
| 567 |
+
# logging.debug(f"Audio saved to {temp_audio_path}")
|
| 568 |
+
# return temp_audio_path
|
| 569 |
+
|
| 570 |
+
def generate_audio_parler_tts(text, chunk_size=200):
|
| 571 |
+
def split_text(text, chunk_size):
    """Split *text* into chunks of whole words bounded by *chunk_size*.

    Words are obtained with ``text.split()`` (any run of whitespace is a
    separator) and re-joined with single spaces. Each word is charged
    ``len(word) + 1`` against the budget, so a chunk holding several words
    is at most ``chunk_size - 1`` characters long once joined.

    A single word that does not fit the budget on its own is never split;
    it is emitted as a chunk by itself. No empty chunk is ever produced.

    Args:
        text: The input string to split.
        chunk_size: Character budget per chunk (one separator counted
            per word).

    Returns:
        A list of non-empty strings, in original word order. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        cost = len(word) + 1  # the word plus one joining space
        # Only flush when there is something to flush: without the
        # `current_chunk` guard an over-long first word would emit "".
        if current_chunk and current_length + cost > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = cost
        else:
            current_chunk.append(word)
            current_length += cost

    # Emit whatever is left after the last word.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
|
| 591 |
+
|
| 592 |
model_id = 'parler-tts/parler_tts_mini_v0.1'
|
| 593 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 594 |
try:
|
|
|
|
| 602 |
description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
|
| 603 |
|
| 604 |
input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
|
| 605 |
+
chunks = split_text(text, chunk_size)
|
| 606 |
+
audio_arrs = []
|
| 607 |
|
| 608 |
+
for chunk in chunks:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 609 |
prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
|
| 610 |
generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
| 611 |
audio_arr = generation.cpu().numpy().squeeze()
|
| 612 |
+
audio_arrs.append(audio_arr)
|
| 613 |
+
|
| 614 |
+
# Concatenate all audio arrays into a single array
|
| 615 |
+
concatenated_audio = np.concatenate(audio_arrs)
|
| 616 |
|
|
|
|
|
|
|
| 617 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
| 618 |
+
sf.write(f.name, concatenated_audio, model.config.sampling_rate)
|
| 619 |
temp_audio_path = f.name
|
| 620 |
|
| 621 |
logging.debug(f"Audio saved to {temp_audio_path}")
|
|
|
|
| 625 |
|
| 626 |
|
| 627 |
|
| 628 |
+
|
| 629 |
# Stable Diffusion setup
|
| 630 |
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
|
| 631 |
pipe = pipe.to("cuda")
|