Update app.py

app.py CHANGED
@@ -275,26 +275,46 @@ def generate_answer(message, choice):
     addresses = extract_addresses(response['output'])
     return response['output'], addresses
 
+# def bot(history, choice, tts_model):
+#     if not history:
+#         return history
+#     response, addresses = generate_answer(history[-1][0], choice)
+#     history[-1][1] = ""
+
+#     # Generate audio for the entire response in a separate thread
+#     with concurrent.futures.ThreadPoolExecutor() as executor:
+#         if tts_model == "ElevenLabs":
+#             audio_future = executor.submit(generate_audio_elevenlabs, response)
+#         else:
+#             audio_future = executor.submit(generate_audio_parler_tts, response)
+
+#     for character in response:
+#         history[-1][1] += character
+#         time.sleep(0.05)  # Adjust the speed of text appearance
+#         yield history, None
+
+#     audio_path = audio_future.result()
+#     yield history, audio_path
+
 def bot(history, choice, tts_model):
     if not history:
         return history
     response, addresses = generate_answer(history[-1][0], choice)
     history[-1][1] = ""
-
-    # Generate audio for the entire response in a separate thread
+
     with concurrent.futures.ThreadPoolExecutor() as executor:
         if tts_model == "ElevenLabs":
             audio_future = executor.submit(generate_audio_elevenlabs, response)
         else:
             audio_future = executor.submit(generate_audio_parler_tts, response)
 
     for character in response:
         history[-1][1] += character
         time.sleep(0.05)
-        yield history, None
+        yield history, None, gr.update(visible=True, value=history[-1][1])
 
     audio_path = audio_future.result()
-    yield history, audio_path
+    yield history, audio_path, gr.update(visible=True, value=history[-1][1])
 
 def add_message(history, message):
     history.append((message, None))
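The reworked bot handler streams the reply character by character while the selected TTS engine renders audio on a background thread, and each yield now carries a third value (a gr.update) so a follow-up control can be revealed alongside the text. A minimal, self-contained sketch of that generator pattern follows; fake_tts and the plain-string outputs are illustrative stand-ins, not part of app.py.

import concurrent.futures
import time

def fake_tts(text):
    # Stand-in for generate_audio_elevenlabs / generate_audio_parler_tts:
    # pretend to synthesize speech and hand back an audio file path.
    time.sleep(0.2)
    return "/tmp/answer.wav"

def stream_with_audio(response):
    # Start audio synthesis in the background so text streaming is not blocked.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        audio_future = executor.submit(fake_tts, response)

        shown = ""
        for character in response:
            shown += character
            time.sleep(0.01)      # pace the typing effect
            yield shown, None     # text so far, audio not ready yet

        # Final yield: full text plus the path produced by the background thread.
        yield shown, audio_future.result()

for text, audio in stream_with_audio("Hello from RADAR"):
    print(repr(text), audio)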
@@ -567,9 +587,65 @@ def generate_audio_elevenlabs(text):
     # logging.debug(f"Audio saved to {temp_audio_path}")
     # return temp_audio_path
 
+# def generate_audio_parler_tts(text, chunk_size=200):
+#     def split_text(text, chunk_size):
+#         # Split text into chunks of the specified size
+#         words = text.split()
+#         chunks = []
+#         current_chunk = []
+#         current_length = 0
+
+#         for word in words:
+#             if current_length + len(word) + 1 > chunk_size:
+#                 chunks.append(" ".join(current_chunk))
+#                 current_chunk = [word]
+#                 current_length = len(word) + 1
+#             else:
+#                 current_chunk.append(word)
+#                 current_length += len(word) + 1
+
+#         if current_chunk:
+#             chunks.append(" ".join(current_chunk))
+
+#         return chunks
+
+#     model_id = 'parler-tts/parler_tts_mini_v0.1'
+#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+#     try:
+#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
+#     except torch.cuda.OutOfMemoryError:
+#         print("CUDA out of memory. Switching to CPU.")
+#         device = "cpu"
+#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
+#     tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+#     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
+
+#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+#     chunks = split_text(text, chunk_size)
+#     audio_arrs = []
+
+#     for chunk in chunks:
+#         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
+#         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+#         audio_arr = generation.cpu().numpy().squeeze()
+#         audio_arrs.append(audio_arr)
+
+#     # Concatenate all audio arrays into a single array
+#     concatenated_audio = np.concatenate(audio_arrs)
+
+#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+#         sf.write(f.name, concatenated_audio, model.config.sampling_rate)
+#         temp_audio_path = f.name
+
+#     logging.debug(f"Audio saved to {temp_audio_path}")
+#     return temp_audio_path
+
+
+import concurrent.futures
+
 def generate_audio_parler_tts(text, chunk_size=200):
     def split_text(text, chunk_size):
-        # Split text into chunks of the specified size
         words = text.split()
         chunks = []
         current_chunk = []
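For reference, split_text packs whole words greedily and starts a new chunk as soon as adding the next word (plus a space) would push the current chunk past chunk_size characters. Below is a standalone copy with a tiny example run; chunk_size=12 is chosen purely for illustration.

def split_text(text, chunk_size):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        if current_length + len(word) + 1 > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word) + 1
        else:
            current_chunk.append(word)
            current_length += len(word) + 1

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

print(split_text("talk to RADAR channel ninety four", 12))
# -> ['talk to', 'RADAR', 'channel', 'ninety four']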
@@ -588,7 +664,13 @@ def generate_audio_parler_tts(text, chunk_size=200):
             chunks.append(" ".join(current_chunk))
 
         return chunks
 
+    def process_chunk(chunk):
+        prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
+        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+        audio_arr = generation.cpu().numpy().squeeze()
+        return audio_arr
+
     model_id = 'parler-tts/parler_tts_mini_v0.1'
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     try:
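The try: that closes the hunk above guards the Parler-TTS model load; the commented-out copy earlier in the diff shows the full pattern, which tries the GPU first and drops to CPU on torch.cuda.OutOfMemoryError. A generic sketch of that fallback, where load_model is a hypothetical stand-in for ParlerTTSForConditionalGeneration.from_pretrained (torch.cuda.OutOfMemoryError requires a reasonably recent PyTorch):

import torch

def load_with_fallback(load_model, model_id):
    # Prefer the GPU; fall back to CPU if the model does not fit in VRAM.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    try:
        model = load_model(model_id).to(device)
    except torch.cuda.OutOfMemoryError:
        print("CUDA out of memory. Switching to CPU.")
        device = "cpu"
        model = load_model(model_id).to(device)
    return model, device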
@@ -603,13 +685,11 @@ def generate_audio_parler_tts(text, chunk_size=200):
 
     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
     chunks = split_text(text, chunk_size)
-    audio_arrs = []
-
-    for chunk in chunks:
-        prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
-        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-        audio_arr = generation.cpu().numpy().squeeze()
-        audio_arrs.append(audio_arr)
+
+    # Process chunks in parallel
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
+        audio_arrs = [future.result() for future in concurrent.futures.as_completed(futures)]
 
     # Concatenate all audio arrays into a single array
     concatenated_audio = np.concatenate(audio_arrs)
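One caveat in the parallel version above: concurrent.futures.as_completed yields futures in the order they finish, not the order the chunks were submitted, so the concatenated audio can come back with chunks reordered when generation times vary. An order-preserving sketch using executor.map; process_chunk here is a trivial stand-in for the real model call.

import concurrent.futures

import numpy as np

def process_chunk(chunk):
    # Stand-in for the Parler-TTS generation step: return a dummy waveform per chunk.
    return np.full(4, float(len(chunk)))

chunks = ["first chunk of text", "second chunk", "third"]

# executor.map returns results in submission order, so chunk order is preserved.
with concurrent.futures.ThreadPoolExecutor() as executor:
    audio_arrs = list(executor.map(process_chunk, chunks))

concatenated_audio = np.concatenate(audio_arrs)
print(concatenated_audio.shape)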
@@ -650,47 +730,97 @@ def update_images():
     image_3 = generate_image(hardcoded_prompt_3)
     return image_1, image_2, image_3
 
-with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
-
+# with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
+
+#     with gr.Row():
+#         with gr.Column():
+#             state = gr.State()
+
+#             chatbot = gr.Chatbot([], elem_id="RADAR:Channel 94.1", bubble_full_width=False)
+#             choice = gr.Radio(label="Select Style", choices=["Details", "Conversational"], value="Conversational")
+#             tts_choice = gr.Radio(label="Select TTS Model", choices=["ElevenLabs", "Parler TTS"], value="Parler TTS")
+
+#             gr.Markdown("<h1 style='color: red;'>Talk to RADAR</h1>", elem_id="voice-markdown")
+#             chat_input = gr.Textbox(show_copy_button=True, interactive=True, show_label=False, label="ASK Radar !!!")
+#             chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
+#             bot_msg = chat_msg.then(bot, [chatbot, choice, tts_choice], [chatbot, gr.Audio(interactive=False, autoplay=True)])
+#             bot_msg.then(lambda: gr.Textbox(value="", interactive=True, placeholder="Ask Radar!!!...", show_label=False), None, [chat_input])
+#             chatbot.like(print_like_dislike, None, None)
+#             clear_button = gr.Button("Clear")
+#             clear_button.click(fn=clear_textbox, inputs=None, outputs=chat_input)
+
+
+#             audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy')
+#             audio_input.stream(transcribe_function, inputs=[state, audio_input], outputs=[state, chat_input], api_name="SAMLOne_real_time")
+
+#             # gr.Markdown("<h1 style='color: red;'>Map</h1>", elem_id="location-markdown")
+#             # location_output = gr.HTML()
+#             # bot_msg.then(show_map_if_details, [chatbot, choice], [location_output, location_output])
+
+#         # with gr.Column():
+#         #     weather_output = gr.HTML(value=fetch_local_weather())
+#         #     news_output = gr.HTML(value=fetch_local_news())
+#         #     news_output = gr.HTML(value=fetch_local_events())
+
+#         with gr.Column():
+
+#             image_output_1 = gr.Image(value=generate_image(hardcoded_prompt_1), width=400, height=400)
+#             image_output_2 = gr.Image(value=generate_image(hardcoded_prompt_2), width=400, height=400)
+#             image_output_3 = gr.Image(value=generate_image(hardcoded_prompt_3), width=400, height=400)
+
+
+#             refresh_button = gr.Button("Refresh Images")
+#             refresh_button.click(fn=update_images, inputs=None, outputs=[image_output_1, image_output_2, image_output_3])
+
+# demo.queue()
+# demo.launch(share=True)
+
+def generate_follow_up_buttons(response):
+    return gr.update(visible=True), gr.update(value=response)
+
+def handle_follow_up_choice(choice, history):
+    follow_up_responses = {
+        "Question 1": "This is the response to follow-up question 1.",
+        "Question 2": "This is the response to follow-up question 2."
+    }
+    follow_up_response = follow_up_responses.get(choice, "Sorry, I didn't understand that choice.")
+    history.append((choice, follow_up_response))
+    return history, gr.update(visible=False)
+
+
+
+with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
+
     with gr.Row():
         with gr.Column():
             state = gr.State()
-
             chatbot = gr.Chatbot([], elem_id="RADAR:Channel 94.1", bubble_full_width=False)
             choice = gr.Radio(label="Select Style", choices=["Details", "Conversational"], value="Conversational")
             tts_choice = gr.Radio(label="Select TTS Model", choices=["ElevenLabs", "Parler TTS"], value="Parler TTS")
-
+
             gr.Markdown("<h1 style='color: red;'>Talk to RADAR</h1>", elem_id="voice-markdown")
             chat_input = gr.Textbox(show_copy_button=True, interactive=True, show_label=False, label="ASK Radar !!!")
             chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
-            bot_msg = chat_msg.then(bot, [chatbot, choice, tts_choice], [chatbot, gr.Audio(interactive=False, autoplay=True)])
+            bot_msg = chat_msg.then(bot, [chatbot, choice, tts_choice], [chatbot, gr.Audio(interactive=False, autoplay=True), gr.Button(value="Follow-up Questions")])
             bot_msg.then(lambda: gr.Textbox(value="", interactive=True, placeholder="Ask Radar!!!...", show_label=False), None, [chat_input])
+
+            follow_up_buttons = gr.ButtonGroup(choices=["Question 1", "Question 2"], visible=False)
+            follow_up_buttons.click(handle_follow_up_choice, inputs=[follow_up_buttons, chatbot], outputs=[chatbot, follow_up_buttons])
+
             chatbot.like(print_like_dislike, None, None)
             clear_button = gr.Button("Clear")
             clear_button.click(fn=clear_textbox, inputs=None, outputs=chat_input)
-
-
+
             audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy')
             audio_input.stream(transcribe_function, inputs=[state, audio_input], outputs=[state, chat_input], api_name="SAMLOne_real_time")
 
-            # gr.Markdown("<h1 style='color: red;'>Map</h1>", elem_id="location-markdown")
-            # location_output = gr.HTML()
-            # bot_msg.then(show_map_if_details, [chatbot, choice], [location_output, location_output])
-
-        # with gr.Column():
-        #     weather_output = gr.HTML(value=fetch_local_weather())
-        #     news_output = gr.HTML(value=fetch_local_news())
-        #     news_output = gr.HTML(value=fetch_local_events())
-
         with gr.Column():
-
             image_output_1 = gr.Image(value=generate_image(hardcoded_prompt_1), width=400, height=400)
             image_output_2 = gr.Image(value=generate_image(hardcoded_prompt_2), width=400, height=400)
             image_output_3 = gr.Image(value=generate_image(hardcoded_prompt_3), width=400, height=400)
 
-
             refresh_button = gr.Button("Refresh Images")
             refresh_button.click(fn=update_images, inputs=None, outputs=[image_output_1, image_output_2, image_output_3])
-
+
 demo.queue()
 demo.launch(share=True)
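A note on the follow-up wiring: gr.ButtonGroup is not a component the stock Gradio API is known to ship (gr.Button, gr.Radio and gr.CheckboxGroup are the usual choices), so the two follow_up_buttons lines may fail when the Blocks graph is built. A hedged sketch of one way to get a similar effect with plain gr.Button components and a simplified single-output handler (answer_follow_up is an illustrative name, not from app.py):

import gradio as gr

def answer_follow_up(choice, history):
    follow_up_responses = {
        "Question 1": "This is the response to follow-up question 1.",
        "Question 2": "This is the response to follow-up question 2."
    }
    reply = follow_up_responses.get(choice, "Sorry, I didn't understand that choice.")
    return history + [(choice, reply)]

with gr.Blocks() as demo:
    chatbot = gr.Chatbot([])
    with gr.Row():
        # One plain button per canned follow-up question.
        q1 = gr.Button("Question 1")
        q2 = gr.Button("Question 2")

    # Each button closes over its own question text and appends the canned answer.
    q1.click(lambda history: answer_follow_up("Question 1", history), inputs=[chatbot], outputs=[chatbot])
    q2.click(lambda history: answer_follow_up("Question 2", history), inputs=[chatbot], outputs=[chatbot])

if __name__ == "__main__":
    demo.launch()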