Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -444,31 +444,30 @@ def generate_tts_response(response, tts_choice):
|
|
444 |
|
445 |
import concurrent.futures
|
446 |
|
447 |
-
|
448 |
-
|
|
|
449 |
# Initialize an empty response
|
450 |
response = ""
|
451 |
|
452 |
-
#
|
453 |
-
|
454 |
-
|
455 |
-
bot_future = executor.submit(generate_bot_response, history, choice, retrieval_mode, model_choice)
|
456 |
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
|
|
|
|
461 |
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
parler_tts_future.result()
|
466 |
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
audio_path = tts_future.result()
|
471 |
-
yield history, audio_path
|
472 |
|
473 |
def yield_audio(audio_chunk):
|
474 |
""" Stream audio in chunks to the output """
|
@@ -476,6 +475,11 @@ def yield_audio(audio_chunk):
|
|
476 |
write_wav(temp_audio_path, 16000, audio_chunk.astype(np.float32))
|
477 |
return temp_audio_path
|
478 |
|
|
|
|
|
|
|
|
|
|
|
479 |
|
480 |
|
481 |
|
@@ -1028,6 +1032,7 @@ def generate_audio_elevenlabs(text):
|
|
1028 |
# return combined_audio_path
|
1029 |
|
1030 |
|
|
|
1031 |
import concurrent.futures
|
1032 |
import tempfile
|
1033 |
import os
|
@@ -1044,7 +1049,8 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
1044 |
|
1045 |
repo_id = "parler-tts/parler-tts-mini-v1"
|
1046 |
|
1047 |
-
|
|
|
1048 |
description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
|
1049 |
chunk_size_in_s = 3.0 # Set to 3-second chunks
|
1050 |
|
@@ -1083,6 +1089,7 @@ def generate_audio_parler_tts(text, callback=None):
|
|
1083 |
audio_segments = []
|
1084 |
for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
|
1085 |
audio_segments.append(audio_chunk)
|
|
|
1086 |
|
1087 |
# Combine all the audio chunks into one audio file after streaming
|
1088 |
combined_audio = np.concatenate(audio_segments)
|
|
|
444 |
|
445 |
import concurrent.futures
|
446 |
|
447 |
+
import asyncio
|
448 |
+
|
449 |
+
async def bot(history, choice, tts_choice, retrieval_mode, model_choice):
    """Stream the text reply into the chat history, then run Parler TTS.

    This is an async generator. Each chunk produced by ``generate_text`` is
    appended to the last history entry and ``(history, None)`` is yielded so
    the caller can refresh the text output as it grows.

    Args:
        history: Chat history; ``history[-1][1]`` is extended in place with
            each streamed chunk.
        choice: Forwarded unchanged to ``generate_text``.
        tts_choice: ``"Beta"`` selects Parler TTS for the finished reply.
        retrieval_mode: Forwarded unchanged to ``generate_text``.
        model_choice: Forwarded unchanged to ``generate_text``.

    Yields:
        ``(history, None)`` after every text chunk.
    """
    response = ""

    # Presumably the pending reply slot can be None before the first chunk
    # arrives (verify against the caller); ``None += str`` would raise.
    if history[-1][1] is None:
        history[-1][1] = ""

    # generate_text is an async generator, so it has to be consumed with
    # ``async for``. asyncio.create_task() only accepts coroutines and raises
    # TypeError when handed an async generator.
    async for chunk in generate_text(history, choice, retrieval_mode, model_choice):
        response += chunk
        history[-1][1] += chunk
        yield history, None  # Stream the text output as it's generated

    # Synthesize speech only once the complete reply is known. Starting the
    # TTS call after the first chunk would voice just that first fragment.
    if tts_choice == "Beta" and response:
        # NOTE(review): the return value is discarded, as in the original;
        # audio is expected to reach the output through the callback.
        await generate_audio_parler_tts(response, callback=yield_audio)
|
|
|
|
|
471 |
|
472 |
def yield_audio(audio_chunk):
|
473 |
""" Stream audio in chunks to the output """
|
|
|
475 |
write_wav(temp_audio_path, 16000, audio_chunk.astype(np.float32))
|
476 |
return temp_audio_path
|
477 |
|
478 |
+
async def generate_text(history, choice, retrieval_mode, model_choice):
    """Yield a fixed placeholder reply one character at a time.

    Stand-in for real text generation: none of the arguments are read. Each
    character is preceded by a short pause to imitate generation latency.

    Yields:
        Successive single characters of the placeholder message.
    """
    placeholder = "Generating text response..."
    pause_seconds = 0.05  # Simulated delay before each character
    position = 0
    while position < len(placeholder):
        await asyncio.sleep(pause_seconds)
        yield placeholder[position]
        position += 1
|
483 |
|
484 |
|
485 |
|
|
|
1032 |
# return combined_audio_path
|
1033 |
|
1034 |
|
1035 |
+
import asyncio
|
1036 |
import concurrent.futures
|
1037 |
import tempfile
|
1038 |
import os
|
|
|
1049 |
|
1050 |
repo_id = "parler-tts/parler-tts-mini-v1"
|
1051 |
|
1052 |
+
# Async function to stream Parler TTS in chunks
|
1053 |
+
async def generate_audio_parler_tts(text, callback=None):
|
1054 |
description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
|
1055 |
chunk_size_in_s = 3.0 # Set to 3-second chunks
|
1056 |
|
|
|
1089 |
audio_segments = []
|
1090 |
for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
|
1091 |
audio_segments.append(audio_chunk)
|
1092 |
+
await asyncio.sleep(0) # Allow other tasks to run
|
1093 |
|
1094 |
# Combine all the audio chunks into one audio file after streaming
|
1095 |
combined_audio = np.concatenate(audio_segments)
|