Update app.py
app.py CHANGED
@@ -356,34 +356,15 @@ Sure! Here's the information you requested:
 """


-# def generate_bot_response(history, choice, retrieval_mode, model_choice):
-#     if not history:
-#         return
-
-#     # Select the model
-#     # selected_model = chat_model if model_choice == "LM-1" else phi_pipe
-#     selected_model = chat_model if model_choice == "LM-1" else (chat_model1 if model_choice == "LM-3" else phi_pipe)
-
-
-#     response, addresses = generate_answer(history[-1][0], choice, retrieval_mode, selected_model)
-#     history[-1][1] = ""
-
-#     for character in response:
-#         history[-1][1] += character
-#         yield history  # Stream each character as it is generated
-#         time.sleep(0.05)  # Add a slight delay to simulate streaming
-
-#     yield history  # Final yield with the complete response
-
-
-# Modified bot function to separate chatbot response and TTS generation
 def generate_bot_response(history, choice, retrieval_mode, model_choice):
     if not history:
         return

     # Select the model
+    # selected_model = chat_model if model_choice == "LM-1" else phi_pipe
     selected_model = chat_model if model_choice == "LM-1" else (chat_model1 if model_choice == "LM-3" else phi_pipe)

+
     response, addresses = generate_answer(history[-1][0], choice, retrieval_mode, selected_model)
     history[-1][1] = ""

@@ -416,70 +397,34 @@ def generate_tts_response(response, tts_choice):



-# import concurrent.futures
-# # Existing bot function with concurrent futures for parallel processing
-# def bot(history, choice, tts_choice, retrieval_mode, model_choice):
-#     # Initialize an empty response
-#     response = ""
-
-#     # Create a thread pool to handle both text generation and TTS conversion in parallel
-#     with concurrent.futures.ThreadPoolExecutor() as executor:
-#         # Start the bot response generation in parallel
-#         bot_future = executor.submit(generate_bot_response, history, choice, retrieval_mode, model_choice)
-
-#         # Wait for the text generation to start
-#         for history_chunk in bot_future.result():
-#             response = history_chunk[-1][1]  # Update the response with the current state
-#             yield history_chunk, None  # Stream the text output as it's generated
-
-#         # Once text is fully generated, start the TTS conversion
-#         tts_future = executor.submit(generate_tts_response, response, tts_choice)
-
-#         # Get the audio output after TTS is done
-#         audio_path = tts_future.result()
-
-#         # Stream the final text and audio output
-#         yield history, audio_path
-
-
 import concurrent.futures
+# Existing bot function with concurrent futures for parallel processing
+def bot(history, choice, tts_choice, retrieval_mode, model_choice):
+    # Initialize an empty response
+    response = ""

-
+    # Create a thread pool to handle both text generation and TTS conversion in parallel
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # Start the bot response generation in parallel
+        bot_future = executor.submit(generate_bot_response, history, choice, retrieval_mode, model_choice)

-
-
+        # Wait for the text generation to start
+        for history_chunk in bot_future.result():
+            response = history_chunk[-1][1]  # Update the response with the current state
+            yield history_chunk, None  # Stream the text output as it's generated
+
+        # Once text is fully generated, start the TTS conversion
+        tts_future = executor.submit(generate_tts_response, response, tts_choice)

-
-
-    audio_future = None
+        # Get the audio output after TTS is done
+        audio_path = tts_future.result()

-
-
-        response += chunk
-        history[-1][1] += chunk
-        yield history, None  # Stream the text output as it's generated
+        # Stream the final text and audio output
+        yield history, audio_path

-        # Start generating Parler TTS if selected and not started already
-        if tts_choice == "Beta" and audio_future is None:
-            audio_future = asyncio.create_task(generate_audio_parler_tts(response, callback=lambda audio_chunk: yield_audio(audio_chunk)))

-    # Wait for the audio to finish streaming if it was started
-    if audio_future is not None:
-        await audio_future

-def yield_audio(audio_chunk):
-    """ Stream audio in chunks to the output """
-    temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_chunk_{int(time.time())}.wav")
-    write_wav(temp_audio_path, 16000, audio_chunk.astype(np.float32))
-    return temp_audio_path

-# Text generator as an async generator
-async def generate_text(history, choice, retrieval_mode, model_choice):
-    # Simulate text generation chunk by chunk
-    text_to_generate = "Generating text response..."
-    for char in text_to_generate:
-        await asyncio.sleep(0.05)  # Simulate time delay between character generation
-        yield char  # Yield each character as it's generated



@@ -507,21 +452,11 @@ def generate_bot_response(history, choice, retrieval_mode, model_choice):



-# def generate_audio_after_text(response, tts_choice):
-#     # Generate TTS audio after text response is completed
-#     with concurrent.futures.ThreadPoolExecutor() as executor:
-#         tts_future = executor.submit(generate_tts_response, response, tts_choice)
-#         audio_path = tts_future.result()
-#         return audio_path
-
 def generate_audio_after_text(response, tts_choice):
     # Generate TTS audio after text response is completed
     with concurrent.futures.ThreadPoolExecutor() as executor:
-
-
-        elif tts_choice == "Beta":
-            audio_future = executor.submit(generate_audio_parler_tts, response)  # Use the updated Parler TTS generator
-            audio_path = audio_future.result()
+        tts_future = executor.submit(generate_tts_response, response, tts_choice)
+        audio_path = tts_future.result()
         return audio_path

 import re
@@ -766,9 +701,9 @@ def generate_image(prompt):
     ).images[0]
     return image

-hardcoded_prompt_1 = "
-hardcoded_prompt_2 = "A
-hardcoded_prompt_3 = "
+hardcoded_prompt_1 = "A high quality cinematic image for Toyota Truck in Birmingham skyline shot in th style of Michael Mann"
+hardcoded_prompt_2 = "A high quality cinematic image for Alabama Quarterback close up emotional shot in th style of Michael Mann"
+hardcoded_prompt_3 = "A high quality cinematic image for Taylor Swift concert in Birmingham skyline style of Michael Mann"

 def update_images():
     image_1 = generate_image(hardcoded_prompt_1)
@@ -960,79 +895,6 @@ def generate_audio_elevenlabs(text):

 # chunking audio and then Process

-# import concurrent.futures
-# import tempfile
-# import os
-# import numpy as np
-# import logging
-# from queue import Queue
-# from threading import Thread
-# from scipy.io.wavfile import write as write_wav
-# from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
-# from transformers import AutoTokenizer
-
-# # Ensure your device is set to CUDA
-# device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-# repo_id = "parler-tts/parler-tts-mini-v1"
-
-# def generate_audio_parler_tts(text):
-#     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-#     chunk_size_in_s = 0.5
-
-#     # Initialize the tokenizer and model
-#     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
-#     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
-#     sampling_rate = parler_model.audio_encoder.config.sampling_rate
-#     frame_rate = parler_model.audio_encoder.config.frame_rate
-
-#     def generate(text, description, play_steps_in_s=0.5):
-#         play_steps = int(frame_rate * play_steps_in_s)
-#         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
-
-#         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
-#         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
-
-#         generation_kwargs = dict(
-#             input_ids=inputs.input_ids,
-#             prompt_input_ids=prompt.input_ids,
-#             attention_mask=inputs.attention_mask,
-#             prompt_attention_mask=prompt.attention_mask,
-#             streamer=streamer,
-#             do_sample=True,
-#             temperature=1.0,
-#             min_new_tokens=10,
-#         )
-
-#         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
-#         thread.start()
-
-#         for new_audio in streamer:
-#             if new_audio.shape[0] == 0:
-#                 break
-#             # Save or process each audio chunk as it is generated
-#             yield sampling_rate, new_audio
-
-#     audio_segments = []
-#     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
-#         audio_segments.append(audio_chunk)
-
-#         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
-#         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
-#         logging.debug(f"Saved chunk to {temp_audio_path}")
-
-
-#     # Combine all the audio chunks into one audio file
-#     combined_audio = np.concatenate(audio_segments)
-#     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
-
-#     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
-
-#     logging.debug(f"Combined audio saved to {combined_audio_path}")
-#     return combined_audio_path
-
-
-import asyncio
 import concurrent.futures
 import tempfile
 import os
@@ -1049,10 +911,9 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"

 repo_id = "parler-tts/parler-tts-mini-v1"

-
-async def generate_audio_parler_tts(text, callback=None):
+def generate_audio_parler_tts(text):
     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-    chunk_size_in_s =
+    chunk_size_in_s = 0.5

     # Initialize the tokenizer and model
     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -1060,7 +921,7 @@ async def generate_audio_parler_tts(text, callback=None):
     sampling_rate = parler_model.audio_encoder.config.sampling_rate
     frame_rate = parler_model.audio_encoder.config.frame_rate

-    def generate(text, description, play_steps_in_s=
+    def generate(text, description, play_steps_in_s=0.5):
         play_steps = int(frame_rate * play_steps_in_s)
         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)

@@ -1070,6 +931,8 @@ async def generate_audio_parler_tts(text, callback=None):
         generation_kwargs = dict(
             input_ids=inputs.input_ids,
             prompt_input_ids=prompt.input_ids,
+            attention_mask=inputs.attention_mask,
+            prompt_attention_mask=prompt.attention_mask,
             streamer=streamer,
             do_sample=True,
             temperature=1.0,
@@ -1082,26 +945,28 @@ async def generate_audio_parler_tts(text, callback=None):
         for new_audio in streamer:
             if new_audio.shape[0] == 0:
                 break
-
-            callback(new_audio)  # Send the chunk to the callback function for streaming
+            # Save or process each audio chunk as it is generated
             yield sampling_rate, new_audio

     audio_segments = []
     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
         audio_segments.append(audio_chunk)
-        await asyncio.sleep(0)  # Allow other tasks to run

-
+        temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+        write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+        logging.debug(f"Saved chunk to {temp_audio_path}")
+
+
+    # Combine all the audio chunks into one audio file
     combined_audio = np.concatenate(audio_segments)
     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
+
     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))

     logging.debug(f"Combined audio saved to {combined_audio_path}")
     return combined_audio_path


-
-
 def fetch_local_events():
     api_key = os.environ['SERP_API']
     url = f'https://serpapi.com/search.json?engine=google_events&q=Events+in+Birmingham&hl=en&gl=us&api_key={api_key}'
@@ -1527,25 +1392,13 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
     # .then(fn=clear_textbox, inputs=[], outputs=[chat_input],api_name="api_clear_textbox")
     # )

-    # retriever_sequence = (
-    #     retriever_button.click(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording")
-    #     .then(fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory")
-    #     # First, generate the bot response
-    #     .then(fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response")
-    #     # Then, generate the TTS response based on the bot's response
-    #     .then(fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response")
-    #     .then(fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details")
-    #     .then(fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox")
-    # )
-
-    # Gradio bot interaction with audio streaming
     retriever_sequence = (
         retriever_button.click(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording")
         .then(fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory")
         # First, generate the bot response
         .then(fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response")
-        #
-        .then(fn=
+        # Then, generate the TTS response based on the bot's response
+        .then(fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response")
         .then(fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details")
         .then(fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox")
     )
@@ -1564,25 +1417,14 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
     #     fn=clear_textbox, inputs=[], outputs=[chat_input],api_name="api_clear_textbox"
     # )

-    # chat_input.submit(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording").then(
-    #     fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory"
-    # ).then(
-    #     # First, generate the bot response
-    #     fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response"
-    # ).then(
-    #     # Then, generate the TTS response based on the bot's response
-    #     fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response"
-    # ).then(
-    #     fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details"
-    # ).then(
-    #     fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox"
-    # )
-
-    # The same logic for chat_input submission
     chat_input.submit(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording").then(
         fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory"
     ).then(
-
+        # First, generate the bot response
+        fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response"
+    ).then(
+        # Then, generate the TTS response based on the bot's response
+        fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response"
     ).then(
         fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details"
     ).then(
@@ -1594,6 +1436,7 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:



+
     audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy', every=0.1)
     audio_input.stream(transcribe_function, inputs=[state, audio_input], outputs=[state, chat_input], api_name="api_voice_to_text")

@@ -1614,11 +1457,4 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
     refresh_button.click(fn=update_images, inputs=None, outputs=[image_output_1, image_output_2, image_output_3], api_name="update_image")

 demo.queue()
-demo.launch(show_error=True)
-
-
-
-
-
-
-
+demo.launch(show_error=True)