Pijush2023 committed on
Commit 16f0b32 · verified · 1 Parent(s): cfcb1b1

Update app.py

Files changed (1)
  1. app.py +164 -34
app.py CHANGED
@@ -275,26 +275,46 @@ def generate_answer(message, choice):
    addresses = extract_addresses(response['output'])
    return response['output'], addresses

def bot(history, choice, tts_model):
    if not history:
        return history
    response, addresses = generate_answer(history[-1][0], choice)
    history[-1][1] = ""
-
-    # Generate audio for the entire response in a separate thread
    with concurrent.futures.ThreadPoolExecutor() as executor:
        if tts_model == "ElevenLabs":
            audio_future = executor.submit(generate_audio_elevenlabs, response)
        else:
            audio_future = executor.submit(generate_audio_parler_tts, response)
-
    for character in response:
        history[-1][1] += character
-        time.sleep(0.05)  # Adjust the speed of text appearance
-        yield history, None
-
    audio_path = audio_future.result()
-    yield history, audio_path

def add_message(history, message):
    history.append((message, None))
@@ -567,9 +587,65 @@ def generate_audio_elevenlabs(text):
    # logging.debug(f"Audio saved to {temp_audio_path}")
    # return temp_audio_path

def generate_audio_parler_tts(text, chunk_size=200):
    def split_text(text, chunk_size):
-        # Split text into chunks of the specified size
        words = text.split()
        chunks = []
        current_chunk = []
@@ -588,7 +664,13 @@ def generate_audio_parler_tts(text, chunk_size=200):
        chunks.append(" ".join(current_chunk))

        return chunks
-
    model_id = 'parler-tts/parler_tts_mini_v0.1'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    try:
@@ -603,13 +685,11 @@ def generate_audio_parler_tts(text, chunk_size=200):

    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    chunks = split_text(text, chunk_size)
-    audio_arrs = []
-
-    for chunk in chunks:
-        prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
-        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-        audio_arr = generation.cpu().numpy().squeeze()
-        audio_arrs.append(audio_arr)

    # Concatenate all audio arrays into a single array
    concatenated_audio = np.concatenate(audio_arrs)
@@ -650,47 +730,97 @@ def update_images():
    image_3 = generate_image(hardcoded_prompt_3)
    return image_1, image_2, image_3

-with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:

    with gr.Row():
        with gr.Column():
            state = gr.State()
-
            chatbot = gr.Chatbot([], elem_id="RADAR:Channel 94.1", bubble_full_width=False)
            choice = gr.Radio(label="Select Style", choices=["Details", "Conversational"], value="Conversational")
            tts_choice = gr.Radio(label="Select TTS Model", choices=["ElevenLabs", "Parler TTS"], value="Parler TTS")
-
            gr.Markdown("<h1 style='color: red;'>Talk to RADAR</h1>", elem_id="voice-markdown")
            chat_input = gr.Textbox(show_copy_button=True, interactive=True, show_label=False, label="ASK Radar !!!")
            chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
-            bot_msg = chat_msg.then(bot, [chatbot, choice, tts_choice], [chatbot, gr.Audio(interactive=False, autoplay=True)])
            bot_msg.then(lambda: gr.Textbox(value="", interactive=True, placeholder="Ask Radar!!!...", show_label=False), None, [chat_input])
            chatbot.like(print_like_dislike, None, None)
            clear_button = gr.Button("Clear")
            clear_button.click(fn=clear_textbox, inputs=None, outputs=chat_input)
-
-
            audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy')
            audio_input.stream(transcribe_function, inputs=[state, audio_input], outputs=[state, chat_input], api_name="SAMLOne_real_time")

-            # gr.Markdown("<h1 style='color: red;'>Map</h1>", elem_id="location-markdown")
-            # location_output = gr.HTML()
-            # bot_msg.then(show_map_if_details, [chatbot, choice], [location_output, location_output])
-
-        # with gr.Column():
-            # weather_output = gr.HTML(value=fetch_local_weather())
-            # news_output = gr.HTML(value=fetch_local_news())
-            # news_output = gr.HTML(value=fetch_local_events())
-
        with gr.Column():
-
            image_output_1 = gr.Image(value=generate_image(hardcoded_prompt_1), width=400, height=400)
            image_output_2 = gr.Image(value=generate_image(hardcoded_prompt_2), width=400, height=400)
            image_output_3 = gr.Image(value=generate_image(hardcoded_prompt_3), width=400, height=400)

-
            refresh_button = gr.Button("Refresh Images")
            refresh_button.click(fn=update_images, inputs=None, outputs=[image_output_1, image_output_2, image_output_3])
-
demo.queue()
demo.launch(share=True)
 
@@ -275,26 +275,46 @@ def generate_answer(message, choice):
    addresses = extract_addresses(response['output'])
    return response['output'], addresses

+# def bot(history, choice, tts_model):
+#     if not history:
+#         return history
+#     response, addresses = generate_answer(history[-1][0], choice)
+#     history[-1][1] = ""
+
+#     # Generate audio for the entire response in a separate thread
+#     with concurrent.futures.ThreadPoolExecutor() as executor:
+#         if tts_model == "ElevenLabs":
+#             audio_future = executor.submit(generate_audio_elevenlabs, response)
+#         else:
+#             audio_future = executor.submit(generate_audio_parler_tts, response)
+
+#     for character in response:
+#         history[-1][1] += character
+#         time.sleep(0.05)  # Adjust the speed of text appearance
+#         yield history, None
+
+#     audio_path = audio_future.result()
+#     yield history, audio_path
+
def bot(history, choice, tts_model):
    if not history:
        return history
    response, addresses = generate_answer(history[-1][0], choice)
    history[-1][1] = ""
+
    with concurrent.futures.ThreadPoolExecutor() as executor:
        if tts_model == "ElevenLabs":
            audio_future = executor.submit(generate_audio_elevenlabs, response)
        else:
            audio_future = executor.submit(generate_audio_parler_tts, response)
+
    for character in response:
        history[-1][1] += character
+        time.sleep(0.05)
+        yield history, None, gr.update(visible=True, value=history[-1][1])
+
    audio_path = audio_future.result()
+    yield history, audio_path, gr.update(visible=True, value=history[-1][1])

def add_message(history, message):
    history.append((message, None))
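The reworked bot() above streams the reply one character at a time while the selected TTS backend renders the audio on a worker thread, and every yield now carries a third value (a gr.update(...)), so the Gradio event that calls bot has to list three output components (see the wiring in the last hunk). For illustration only (not part of this commit), a minimal self-contained sketch of the same pattern, with fake_synth standing in for generate_audio_elevenlabs / generate_audio_parler_tts:

import concurrent.futures
import time

def fake_synth(text):
    # placeholder for the real TTS call; returns a path to the rendered audio
    time.sleep(0.5)
    return "/tmp/audio.wav"

def stream_reply(history, response, synth=fake_synth):
    history[-1][1] = ""
    with concurrent.futures.ThreadPoolExecutor() as executor:
        audio_future = executor.submit(synth, response)   # audio renders in the background
        for ch in response:
            history[-1][1] += ch
            time.sleep(0.05)                              # pacing of the typing effect
            yield history, None                           # stream text while audio is still pending
        yield history, audio_future.result()              # final yield attaches the audio path

for updated_history, audio in stream_reply([["hi", None]], "Hello from RADAR"):
    print(updated_history[-1][1], audio)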
 
@@ -567,9 +587,65 @@ def generate_audio_elevenlabs(text):
    # logging.debug(f"Audio saved to {temp_audio_path}")
    # return temp_audio_path

+# def generate_audio_parler_tts(text, chunk_size=200):
+#     def split_text(text, chunk_size):
+#         # Split text into chunks of the specified size
+#         words = text.split()
+#         chunks = []
+#         current_chunk = []
+#         current_length = 0
+
+#         for word in words:
+#             if current_length + len(word) + 1 > chunk_size:
+#                 chunks.append(" ".join(current_chunk))
+#                 current_chunk = [word]
+#                 current_length = len(word) + 1
+#             else:
+#                 current_chunk.append(word)
+#                 current_length += len(word) + 1
+
+#         if current_chunk:
+#             chunks.append(" ".join(current_chunk))
+
+#         return chunks
+
+#     model_id = 'parler-tts/parler_tts_mini_v0.1'
+#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+#     try:
+#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
+#     except torch.cuda.OutOfMemoryError:
+#         print("CUDA out of memory. Switching to CPU.")
+#         device = "cpu"
+#         model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
+#     tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+#     description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
+
+#     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+#     chunks = split_text(text, chunk_size)
+#     audio_arrs = []
+
+#     for chunk in chunks:
+#         prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
+#         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+#         audio_arr = generation.cpu().numpy().squeeze()
+#         audio_arrs.append(audio_arr)
+
+#     # Concatenate all audio arrays into a single array
+#     concatenated_audio = np.concatenate(audio_arrs)
+
+#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+#         sf.write(f.name, concatenated_audio, model.config.sampling_rate)
+#         temp_audio_path = f.name
+
+#     logging.debug(f"Audio saved to {temp_audio_path}")
+#     return temp_audio_path
+
+
+import concurrent.futures
+
def generate_audio_parler_tts(text, chunk_size=200):
    def split_text(text, chunk_size):
        words = text.split()
        chunks = []
        current_chunk = []
@@ -588,7 +664,13 @@ def generate_audio_parler_tts(text, chunk_size=200):
        chunks.append(" ".join(current_chunk))

        return chunks
+
+    def process_chunk(chunk):
+        prompt_input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(device)
+        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+        audio_arr = generation.cpu().numpy().squeeze()
+        return audio_arr
+
    model_id = 'parler-tts/parler_tts_mini_v0.1'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    try:
@@ -603,13 +685,11 @@ def generate_audio_parler_tts(text, chunk_size=200):

    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    chunks = split_text(text, chunk_size)
+
+    # Process chunks in parallel
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
+        audio_arrs = [future.result() for future in concurrent.futures.as_completed(futures)]

    # Concatenate all audio arrays into a single array
    concatenated_audio = np.concatenate(audio_arrs)
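The parallel version above fans the text chunks out to a thread pool and gathers the results with concurrent.futures.as_completed, which yields futures in completion order rather than submission order, so the concatenated audio may not follow the order of the original text. For illustration only (not part of this commit), an order-preserving variant that reuses the process_chunk and chunks names from the function above:

    # keeps audio_arrs aligned with the order of `chunks`
    with concurrent.futures.ThreadPoolExecutor() as executor:
        audio_arrs = list(executor.map(process_chunk, chunks))

    # equivalent, keeping the explicit futures list:
    #     futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
    #     audio_arrs = [future.result() for future in futures]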
 
@@ -650,47 +730,97 @@ def update_images():
    image_3 = generate_image(hardcoded_prompt_3)
    return image_1, image_2, image_3

+# with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:

+#     with gr.Row():
+#         with gr.Column():
+#             state = gr.State()
+
+#             chatbot = gr.Chatbot([], elem_id="RADAR:Channel 94.1", bubble_full_width=False)
+#             choice = gr.Radio(label="Select Style", choices=["Details", "Conversational"], value="Conversational")
+#             tts_choice = gr.Radio(label="Select TTS Model", choices=["ElevenLabs", "Parler TTS"], value="Parler TTS")
+
+#             gr.Markdown("<h1 style='color: red;'>Talk to RADAR</h1>", elem_id="voice-markdown")
+#             chat_input = gr.Textbox(show_copy_button=True, interactive=True, show_label=False, label="ASK Radar !!!")
+#             chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
+#             bot_msg = chat_msg.then(bot, [chatbot, choice, tts_choice], [chatbot, gr.Audio(interactive=False, autoplay=True)])
+#             bot_msg.then(lambda: gr.Textbox(value="", interactive=True, placeholder="Ask Radar!!!...", show_label=False), None, [chat_input])
+#             chatbot.like(print_like_dislike, None, None)
+#             clear_button = gr.Button("Clear")
+#             clear_button.click(fn=clear_textbox, inputs=None, outputs=chat_input)
+
+
+#             audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy')
+#             audio_input.stream(transcribe_function, inputs=[state, audio_input], outputs=[state, chat_input], api_name="SAMLOne_real_time")
+
+#             # gr.Markdown("<h1 style='color: red;'>Map</h1>", elem_id="location-markdown")
+#             # location_output = gr.HTML()
+#             # bot_msg.then(show_map_if_details, [chatbot, choice], [location_output, location_output])
+
+#         # with gr.Column():
+#             # weather_output = gr.HTML(value=fetch_local_weather())
+#             # news_output = gr.HTML(value=fetch_local_news())
+#             # news_output = gr.HTML(value=fetch_local_events())
+
+#         with gr.Column():
+
+#             image_output_1 = gr.Image(value=generate_image(hardcoded_prompt_1), width=400, height=400)
+#             image_output_2 = gr.Image(value=generate_image(hardcoded_prompt_2), width=400, height=400)
+#             image_output_3 = gr.Image(value=generate_image(hardcoded_prompt_3), width=400, height=400)
+
+
+#             refresh_button = gr.Button("Refresh Images")
+#             refresh_button.click(fn=update_images, inputs=None, outputs=[image_output_1, image_output_2, image_output_3])
+
+# demo.queue()
+# demo.launch(share=True)
+
+def generate_follow_up_buttons(response):
+    return gr.update(visible=True), gr.update(value=response)
+
+def handle_follow_up_choice(choice, history):
+    follow_up_responses = {
+        "Question 1": "This is the response to follow-up question 1.",
+        "Question 2": "This is the response to follow-up question 2."
+    }
+    follow_up_response = follow_up_responses.get(choice, "Sorry, I didn't understand that choice.")
+    history.append((choice, follow_up_response))
+    return history, gr.update(visible=False)
+
+
+
+with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
+
    with gr.Row():
        with gr.Column():
            state = gr.State()
            chatbot = gr.Chatbot([], elem_id="RADAR:Channel 94.1", bubble_full_width=False)
            choice = gr.Radio(label="Select Style", choices=["Details", "Conversational"], value="Conversational")
            tts_choice = gr.Radio(label="Select TTS Model", choices=["ElevenLabs", "Parler TTS"], value="Parler TTS")
+
            gr.Markdown("<h1 style='color: red;'>Talk to RADAR</h1>", elem_id="voice-markdown")
            chat_input = gr.Textbox(show_copy_button=True, interactive=True, show_label=False, label="ASK Radar !!!")
            chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
+            bot_msg = chat_msg.then(bot, [chatbot, choice, tts_choice], [chatbot, gr.Audio(interactive=False, autoplay=True), gr.Button(value="Follow-up Questions")])
            bot_msg.then(lambda: gr.Textbox(value="", interactive=True, placeholder="Ask Radar!!!...", show_label=False), None, [chat_input])
+
+            follow_up_buttons = gr.ButtonGroup(choices=["Question 1", "Question 2"], visible=False)
+            follow_up_buttons.click(handle_follow_up_choice, inputs=[follow_up_buttons, chatbot], outputs=[chatbot, follow_up_buttons])
+
            chatbot.like(print_like_dislike, None, None)
            clear_button = gr.Button("Clear")
            clear_button.click(fn=clear_textbox, inputs=None, outputs=chat_input)
+
            audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy')
            audio_input.stream(transcribe_function, inputs=[state, audio_input], outputs=[state, chat_input], api_name="SAMLOne_real_time")

        with gr.Column():
            image_output_1 = gr.Image(value=generate_image(hardcoded_prompt_1), width=400, height=400)
            image_output_2 = gr.Image(value=generate_image(hardcoded_prompt_2), width=400, height=400)
            image_output_3 = gr.Image(value=generate_image(hardcoded_prompt_3), width=400, height=400)

            refresh_button = gr.Button("Refresh Images")
            refresh_button.click(fn=update_images, inputs=None, outputs=[image_output_1, image_output_2, image_output_3])
+
demo.queue()
demo.launch(share=True)
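One caveat with the new follow-up wiring: gr.ButtonGroup does not appear in Gradio's documented component set (at least through the 4.x releases), so the two follow_up_buttons lines above would likely raise an AttributeError as written. For illustration only (not part of this commit), an equivalent sketch using gr.Radio, which the app already uses for the style and TTS selectors, placed at the same spot inside the with gr.Blocks block:

            # drop-in replacement for the two follow_up_buttons lines above
            follow_up_buttons = gr.Radio(choices=["Question 1", "Question 2"],
                                         label="Follow-up Questions", visible=False)
            follow_up_buttons.change(handle_follow_up_choice,
                                     inputs=[follow_up_buttons, chatbot],
                                     outputs=[chatbot, follow_up_buttons])

handle_follow_up_choice already returns (history, gr.update(visible=False)), which works unchanged with a Radio as the second output: picking a question appends it to the chat and hides the selector again.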