Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -444,31 +444,30 @@ def generate_tts_response(response, tts_choice):
|
|
| 444 |
|
| 445 |
import concurrent.futures
|
| 446 |
|
| 447 |
-
|
| 448 |
-
|
|
|
|
| 449 |
# Initialize an empty response
|
| 450 |
response = ""
|
| 451 |
|
| 452 |
-
#
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
bot_future = executor.submit(generate_bot_response, history, choice, retrieval_mode, model_choice)
|
| 456 |
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
|
|
|
|
|
|
| 461 |
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
parler_tts_future.result()
|
| 466 |
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
audio_path = tts_future.result()
|
| 471 |
-
yield history, audio_path
|
| 472 |
|
| 473 |
def yield_audio(audio_chunk):
|
| 474 |
""" Stream audio in chunks to the output """
|
|
@@ -476,6 +475,11 @@ def yield_audio(audio_chunk):
|
|
| 476 |
write_wav(temp_audio_path, 16000, audio_chunk.astype(np.float32))
|
| 477 |
return temp_audio_path
|
| 478 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
|
| 480 |
|
| 481 |
|
|
@@ -1028,6 +1032,7 @@ def generate_audio_elevenlabs(text):
|
|
| 1028 |
# return combined_audio_path
|
| 1029 |
|
| 1030 |
|
|
|
|
| 1031 |
import concurrent.futures
|
| 1032 |
import tempfile
|
| 1033 |
import os
|
|
@@ -1044,7 +1049,8 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
| 1044 |
|
| 1045 |
repo_id = "parler-tts/parler-tts-mini-v1"
|
| 1046 |
|
| 1047 |
-
|
|
|
|
| 1048 |
description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
|
| 1049 |
chunk_size_in_s = 3.0 # Set to 3-second chunks
|
| 1050 |
|
|
@@ -1083,6 +1089,7 @@ def generate_audio_parler_tts(text, callback=None):
|
|
| 1083 |
audio_segments = []
|
| 1084 |
for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
|
| 1085 |
audio_segments.append(audio_chunk)
|
|
|
|
| 1086 |
|
| 1087 |
# Combine all the audio chunks into one audio file after streaming
|
| 1088 |
combined_audio = np.concatenate(audio_segments)
|
|
|
|
| 444 |
|
| 445 |
import concurrent.futures
|
| 446 |
|
| 447 |
+
import asyncio
|
| 448 |
+
|
| 449 |
+
async def bot(history, choice, tts_choice, retrieval_mode, model_choice):
    """Stream the bot's text reply into ``history``, then run Parler TTS if selected.

    This is an async generator. After every text chunk it yields
    ``(history, None)`` so the caller can display the partially built reply.

    Args:
        history: Conversation as a list of ``[user_message, bot_message]``
            pairs; the last pair's bot message is extended in place.
        choice: Forwarded unchanged to ``generate_text``.
        tts_choice: Text-to-speech selector; ``"Beta"`` triggers Parler TTS.
        retrieval_mode: Forwarded unchanged to ``generate_text``.
        model_choice: Forwarded unchanged to ``generate_text``.

    Yields:
        ``(history, None)`` after each text chunk has been appended.
    """
    response = ""

    # Tolerate a reply slot that was initialised to None instead of "",
    # otherwise the first ``+=`` below would raise TypeError.
    if history[-1][1] is None:
        history[-1][1] = ""

    # generate_text is an async generator, so it has to be consumed with
    # ``async for``. Wrapping it in asyncio.create_task() raises TypeError
    # (create_task only accepts coroutines), and awaiting a task would hand
    # back a single result rather than a stream of chunks anyway.
    async for chunk in generate_text(history, choice, retrieval_mode, model_choice):
        response += chunk
        history[-1][1] += chunk
        yield history, None  # Stream the text output as it's generated

    # Start Parler TTS only once the whole reply is known, so the audio covers
    # the full response rather than just the first chunk.
    if tts_choice == "Beta":
        # NOTE(review): the return value is discarded, as before; audio chunks
        # are handed to yield_audio through the callback. Confirm whether the
        # resulting path should also be yielded to the UI.
        await generate_audio_parler_tts(response, callback=yield_audio)
|
|
|
|
|
|
|
| 471 |
|
| 472 |
def yield_audio(audio_chunk):
|
| 473 |
""" Stream audio in chunks to the output """
|
|
|
|
| 475 |
write_wav(temp_audio_path, 16000, audio_chunk.astype(np.float32))
|
| 476 |
return temp_audio_path
|
| 477 |
|
| 478 |
+
async def generate_text(history, choice, retrieval_mode, model_choice):
    """Placeholder text generator that emits a fixed message one character at a time.

    The arguments are accepted for interface compatibility but are not used
    by this simulated implementation.

    Yields:
        Successive single characters of the placeholder message, each
        preceded by a 0.05 second pause.
    """
    placeholder = "Generating text response..."
    position = 0
    while position < len(placeholder):
        # Simulate the latency between generated characters.
        await asyncio.sleep(0.05)
        yield placeholder[position]
        position += 1
|
| 483 |
|
| 484 |
|
| 485 |
|
|
|
|
| 1032 |
# return combined_audio_path
|
| 1033 |
|
| 1034 |
|
| 1035 |
+
import asyncio
|
| 1036 |
import concurrent.futures
|
| 1037 |
import tempfile
|
| 1038 |
import os
|
|
|
|
| 1049 |
|
| 1050 |
repo_id = "parler-tts/parler-tts-mini-v1"
|
| 1051 |
|
| 1052 |
+
# Async function to stream Parler TTS in chunks
|
| 1053 |
+
async def generate_audio_parler_tts(text, callback=None):
|
| 1054 |
description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
|
| 1055 |
chunk_size_in_s = 3.0 # Set to 3-second chunks
|
| 1056 |
|
|
|
|
| 1089 |
audio_segments = []
|
| 1090 |
for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
|
| 1091 |
audio_segments.append(audio_chunk)
|
| 1092 |
+
await asyncio.sleep(0) # Allow other tasks to run
|
| 1093 |
|
| 1094 |
# Combine all the audio chunks into one audio file after streaming
|
| 1095 |
combined_audio = np.concatenate(audio_segments)
|