Pijush2023 committed (verified) · commit cc994a5 · 1 parent: 7db69ce

Update app.py

Files changed (1): app.py (+162, -11)
app.py CHANGED
@@ -356,15 +356,34 @@ Sure! Here's the information you requested:
 """
 
 
+# def generate_bot_response(history, choice, retrieval_mode, model_choice):
+#     if not history:
+#         return
+
+#     # Select the model
+#     # selected_model = chat_model if model_choice == "LM-1" else phi_pipe
+#     selected_model = chat_model if model_choice == "LM-1" else (chat_model1 if model_choice == "LM-3" else phi_pipe)
+
+
+#     response, addresses = generate_answer(history[-1][0], choice, retrieval_mode, selected_model)
+#     history[-1][1] = ""
+
+#     for character in response:
+#         history[-1][1] += character
+#         yield history  # Stream each character as it is generated
+#         time.sleep(0.05)  # Add a slight delay to simulate streaming
+
+#     yield history  # Final yield with the complete response
+
+
+# Modified bot function to separate chatbot response and TTS generation
 def generate_bot_response(history, choice, retrieval_mode, model_choice):
     if not history:
         return
 
     # Select the model
-    # selected_model = chat_model if model_choice == "LM-1" else phi_pipe
     selected_model = chat_model if model_choice == "LM-1" else (chat_model1 if model_choice == "LM-3" else phi_pipe)
 
-
     response, addresses = generate_answer(history[-1][0], choice, retrieval_mode, selected_model)
     history[-1][1] = ""
 
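Note on the hunk above: the kept generate_bot_response streams the answer character by character, exactly as the now-commented variant did. The pattern can be exercised on its own; in this minimal sketch the canned response string and the stream_response name stand in for generate_answer and the real function:

import time

def stream_response(history, response):
    # history is a list of [user_message, bot_message] pairs, as in the Gradio Chatbot
    history[-1][1] = ""
    for character in response:
        history[-1][1] += character
        yield history        # stream each partial state to the UI
        time.sleep(0.05)     # slight delay to simulate streaming
    yield history            # final yield with the complete response

history = [["What events are on in Birmingham?", ""]]
for state in stream_response(history, "Here are this week's events..."):
    print(state[-1][1])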
@@ -397,8 +416,34 @@ def generate_tts_response(response, tts_choice):
 
 
 
+# import concurrent.futures
+# # Existing bot function with concurrent futures for parallel processing
+# def bot(history, choice, tts_choice, retrieval_mode, model_choice):
+#     # Initialize an empty response
+#     response = ""
+
+#     # Create a thread pool to handle both text generation and TTS conversion in parallel
+#     with concurrent.futures.ThreadPoolExecutor() as executor:
+#         # Start the bot response generation in parallel
+#         bot_future = executor.submit(generate_bot_response, history, choice, retrieval_mode, model_choice)
+
+#         # Wait for the text generation to start
+#         for history_chunk in bot_future.result():
+#             response = history_chunk[-1][1]  # Update the response with the current state
+#             yield history_chunk, None  # Stream the text output as it's generated
+
+#         # Once text is fully generated, start the TTS conversion
+#         tts_future = executor.submit(generate_tts_response, response, tts_choice)
+
+#         # Get the audio output after TTS is done
+#         audio_path = tts_future.result()
+
+#         # Stream the final text and audio output
+#         yield history, audio_path
+
+
 import concurrent.futures
-# Existing bot function with concurrent futures for parallel processing
+
 def bot(history, choice, tts_choice, retrieval_mode, model_choice):
     # Initialize an empty response
     response = ""
@@ -452,11 +497,21 @@ def generate_bot_response(history, choice, retrieval_mode, model_choice):
 
 
 
+# def generate_audio_after_text(response, tts_choice):
+#     # Generate TTS audio after text response is completed
+#     with concurrent.futures.ThreadPoolExecutor() as executor:
+#         tts_future = executor.submit(generate_tts_response, response, tts_choice)
+#         audio_path = tts_future.result()
+#     return audio_path
+
 def generate_audio_after_text(response, tts_choice):
     # Generate TTS audio after text response is completed
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        tts_future = executor.submit(generate_tts_response, response, tts_choice)
-        audio_path = tts_future.result()
+        if tts_choice == "Alpha":
+            audio_future = executor.submit(generate_audio_elevenlabs, response)
+        elif tts_choice == "Beta":
+            audio_future = executor.submit(generate_audio_parler_tts, response)  # Use the updated Parler TTS generator
+        audio_path = audio_future.result()
     return audio_path
 
 import re
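Note on the hunk above: generate_audio_after_text now dispatches on tts_choice instead of always calling generate_tts_response. A self-contained sketch of the same dispatch, with the two backends stubbed (in app.py they are the real ElevenLabs and Parler TTS generators) and an else branch added that the diff does not have, since an unrecognized tts_choice would otherwise leave audio_future unbound:

import concurrent.futures

def generate_audio_elevenlabs(text):   # stub for the real backend
    return "/tmp/elevenlabs.wav"

def generate_audio_parler_tts(text):   # stub for the real backend
    return "/tmp/parler.wav"

def generate_audio_after_text(response, tts_choice):
    # Run the selected TTS backend on a worker thread and block on its result
    with concurrent.futures.ThreadPoolExecutor() as executor:
        if tts_choice == "Alpha":
            audio_future = executor.submit(generate_audio_elevenlabs, response)
        elif tts_choice == "Beta":
            audio_future = executor.submit(generate_audio_parler_tts, response)
        else:
            raise ValueError(f"unknown tts_choice: {tts_choice}")
        return audio_future.result()

print(generate_audio_after_text("Hello Birmingham", "Beta"))  # /tmp/parler.wav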
@@ -895,6 +950,78 @@ def generate_audio_elevenlabs(text):
 
 # chunking audio and then Process
 
+# import concurrent.futures
+# import tempfile
+# import os
+# import numpy as np
+# import logging
+# from queue import Queue
+# from threading import Thread
+# from scipy.io.wavfile import write as write_wav
+# from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+# from transformers import AutoTokenizer
+
+# # Ensure your device is set to CUDA
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# repo_id = "parler-tts/parler-tts-mini-v1"
+
+# def generate_audio_parler_tts(text):
+#     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+#     chunk_size_in_s = 0.5
+
+#     # Initialize the tokenizer and model
+#     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+#     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+#     sampling_rate = parler_model.audio_encoder.config.sampling_rate
+#     frame_rate = parler_model.audio_encoder.config.frame_rate
+
+#     def generate(text, description, play_steps_in_s=0.5):
+#         play_steps = int(frame_rate * play_steps_in_s)
+#         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
+
+#         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
+#         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
+
+#         generation_kwargs = dict(
+#             input_ids=inputs.input_ids,
+#             prompt_input_ids=prompt.input_ids,
+#             attention_mask=inputs.attention_mask,
+#             prompt_attention_mask=prompt.attention_mask,
+#             streamer=streamer,
+#             do_sample=True,
+#             temperature=1.0,
+#             min_new_tokens=10,
+#         )
+
+#         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
+#         thread.start()
+
+#         for new_audio in streamer:
+#             if new_audio.shape[0] == 0:
+#                 break
+#             # Save or process each audio chunk as it is generated
+#             yield sampling_rate, new_audio
+
+#     audio_segments = []
+#     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
+#         audio_segments.append(audio_chunk)
+
+#         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+#         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+#         logging.debug(f"Saved chunk to {temp_audio_path}")
+
+
+#     # Combine all the audio chunks into one audio file
+#     combined_audio = np.concatenate(audio_segments)
+#     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
+
+#     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
+
+#     logging.debug(f"Combined audio saved to {combined_audio_path}")
+#     return combined_audio_path
+
+
 import concurrent.futures
 import tempfile
 import os
@@ -913,7 +1040,7 @@ repo_id = "parler-tts/parler-tts-mini-v1"
 
 def generate_audio_parler_tts(text):
     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-    chunk_size_in_s = 0.5
+    chunk_size_in_s = 3.0  # Setting buffer size to 3 seconds
 
     # Initialize the tokenizer and model
     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -921,7 +1048,7 @@ def generate_audio_parler_tts(text):
     sampling_rate = parler_model.audio_encoder.config.sampling_rate
     frame_rate = parler_model.audio_encoder.config.frame_rate
 
-    def generate(text, description, play_steps_in_s=0.5):
+    def generate(text, description, play_steps_in_s=3.0):
         play_steps = int(frame_rate * play_steps_in_s)
         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
 
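Note on the two hunks above: raising chunk_size_in_s from 0.5 to 3.0 makes the streamer emit fewer, longer chunks, because play_steps = int(frame_rate * play_steps_in_s) sets how many decoder steps accumulate before each chunk is released. A quick check, assuming a frame rate of 86 Hz (the real value comes from parler_model.audio_encoder.config.frame_rate):

frame_rate = 86  # assumed here; app.py reads it from the model config

for play_steps_in_s in (0.5, 3.0):
    play_steps = int(frame_rate * play_steps_in_s)
    print(f"{play_steps_in_s} s buffer -> {play_steps} decoder steps per chunk")
# 0.5 s buffer -> 43 decoder steps per chunk
# 3.0 s buffer -> 258 decoder steps per chunk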
@@ -956,7 +1083,6 @@ def generate_audio_parler_tts(text):
         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
         logging.debug(f"Saved chunk to {temp_audio_path}")
 
-
     # Combine all the audio chunks into one audio file
     combined_audio = np.concatenate(audio_segments)
     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
@@ -967,6 +1093,7 @@ def generate_audio_parler_tts(text):
     return combined_audio_path
 
 
+
 def fetch_local_events():
     api_key = os.environ['SERP_API']
     url = f'https://serpapi.com/search.json?engine=google_events&q=Events+in+Birmingham&hl=en&gl=us&api_key={api_key}'
@@ -1392,6 +1519,18 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
     # .then(fn=clear_textbox, inputs=[], outputs=[chat_input],api_name="api_clear_textbox")
     # )
 
+    # retriever_sequence = (
+    #     retriever_button.click(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording")
+    #     .then(fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory")
+    #     # First, generate the bot response
+    #     .then(fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response")
+    #     # Then, generate the TTS response based on the bot's response
+    #     .then(fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response")
+    #     .then(fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details")
+    #     .then(fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox")
+    # )
+
+    # Gradio bot interaction with audio streaming
     retriever_sequence = (
         retriever_button.click(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording")
         .then(fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory")
@@ -1417,13 +1556,26 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
     #     fn=clear_textbox, inputs=[], outputs=[chat_input],api_name="api_clear_textbox"
     # )
 
+    # chat_input.submit(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording").then(
+    #     fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory"
+    # ).then(
+    #     # First, generate the bot response
+    #     fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response"
+    # ).then(
+    #     # Then, generate the TTS response based on the bot's response
+    #     fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response"
+    # ).then(
+    #     fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details"
+    # ).then(
+    #     fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox"
+    # )
+
+    # The same logic for chat_input submission
     chat_input.submit(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording").then(
         fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory"
     ).then(
-        # First, generate the bot response
         fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response"
     ).then(
-        # Then, generate the TTS response based on the bot's response
        fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response"
     ).then(
         fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details"
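Note on the two hunks above: both triggers rely on Gradio event chaining, where each .then() step starts only after the previous one finishes, so the TTS step always sees the completed bot response. A minimal sketch of the same pattern, with illustrative handlers rather than the ones in app.py:

import gradio as gr

def add_message(history, message):
    return history + [[message, ""]], ""

def respond(history):
    history[-1][1] = f"Echo: {history[-1][0]}"
    return history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    chat_input = gr.Textbox()
    chat_input.submit(
        fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input]
    ).then(
        fn=respond, inputs=[chatbot], outputs=[chatbot]
    )

demo.launch()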
@@ -1436,7 +1588,6 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
 
 
 
-
     audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy', every=0.1)
     audio_input.stream(transcribe_function, inputs=[state, audio_input], outputs=[state, chat_input], api_name="api_voice_to_text")
 
 