Update app.py
app.py CHANGED
@@ -356,15 +356,34 @@ Sure! Here's the information you requested:
 """
 
 
+# def generate_bot_response(history, choice, retrieval_mode, model_choice):
+#     if not history:
+#         return
+
+#     # Select the model
+#     # selected_model = chat_model if model_choice == "LM-1" else phi_pipe
+#     selected_model = chat_model if model_choice == "LM-1" else (chat_model1 if model_choice == "LM-3" else phi_pipe)
+
+
+#     response, addresses = generate_answer(history[-1][0], choice, retrieval_mode, selected_model)
+#     history[-1][1] = ""
+
+#     for character in response:
+#         history[-1][1] += character
+#         yield history  # Stream each character as it is generated
+#         time.sleep(0.05)  # Add a slight delay to simulate streaming
+
+#     yield history  # Final yield with the complete response
+
+
+# Modified bot function to separate chatbot response and TTS generation
 def generate_bot_response(history, choice, retrieval_mode, model_choice):
     if not history:
         return
 
     # Select the model
-    # selected_model = chat_model if model_choice == "LM-1" else phi_pipe
     selected_model = chat_model if model_choice == "LM-1" else (chat_model1 if model_choice == "LM-3" else phi_pipe)
 
-
     response, addresses = generate_answer(history[-1][0], choice, retrieval_mode, selected_model)
     history[-1][1] = ""
 
@@ -397,8 +416,34 @@ def generate_tts_response(response, tts_choice):
 
 
 
+# import concurrent.futures
+# # Existing bot function with concurrent futures for parallel processing
+# def bot(history, choice, tts_choice, retrieval_mode, model_choice):
+#     # Initialize an empty response
+#     response = ""
+
+#     # Create a thread pool to handle both text generation and TTS conversion in parallel
+#     with concurrent.futures.ThreadPoolExecutor() as executor:
+#         # Start the bot response generation in parallel
+#         bot_future = executor.submit(generate_bot_response, history, choice, retrieval_mode, model_choice)
+
+#         # Wait for the text generation to start
+#         for history_chunk in bot_future.result():
+#             response = history_chunk[-1][1]  # Update the response with the current state
+#             yield history_chunk, None  # Stream the text output as it's generated
+
+#         # Once text is fully generated, start the TTS conversion
+#         tts_future = executor.submit(generate_tts_response, response, tts_choice)
+
+#         # Get the audio output after TTS is done
+#         audio_path = tts_future.result()
+
+#         # Stream the final text and audio output
+#         yield history, audio_path
+
+
 import concurrent.futures
-
+
 def bot(history, choice, tts_choice, retrieval_mode, model_choice):
     # Initialize an empty response
     response = ""
@@ -452,11 +497,21 @@ def generate_bot_response(history, choice, retrieval_mode, model_choice):
 
 
 
+# def generate_audio_after_text(response, tts_choice):
+#     # Generate TTS audio after text response is completed
+#     with concurrent.futures.ThreadPoolExecutor() as executor:
+#         tts_future = executor.submit(generate_tts_response, response, tts_choice)
+#         audio_path = tts_future.result()
+#     return audio_path
+
 def generate_audio_after_text(response, tts_choice):
     # Generate TTS audio after text response is completed
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        tts_future = executor.submit(generate_tts_response, response, tts_choice)
-        audio_path = tts_future.result()
+        if tts_choice == "Alpha":
+            audio_future = executor.submit(generate_audio_elevenlabs, response)
+        elif tts_choice == "Beta":
+            audio_future = executor.submit(generate_audio_parler_tts, response)  # Use the updated Parler TTS generator
+        audio_path = audio_future.result()
     return audio_path
 
 import re
@@ -895,6 +950,78 @@ def generate_audio_elevenlabs(text):
 
 # chunking audio and then Process
 
+# import concurrent.futures
+# import tempfile
+# import os
+# import numpy as np
+# import logging
+# from queue import Queue
+# from threading import Thread
+# from scipy.io.wavfile import write as write_wav
+# from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+# from transformers import AutoTokenizer
+
+# # Ensure your device is set to CUDA
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# repo_id = "parler-tts/parler-tts-mini-v1"
+
+# def generate_audio_parler_tts(text):
+#     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+#     chunk_size_in_s = 0.5
+
+#     # Initialize the tokenizer and model
+#     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+#     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+#     sampling_rate = parler_model.audio_encoder.config.sampling_rate
+#     frame_rate = parler_model.audio_encoder.config.frame_rate
+
+#     def generate(text, description, play_steps_in_s=0.5):
+#         play_steps = int(frame_rate * play_steps_in_s)
+#         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
+
+#         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
+#         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
+
+#         generation_kwargs = dict(
+#             input_ids=inputs.input_ids,
+#             prompt_input_ids=prompt.input_ids,
+#             attention_mask=inputs.attention_mask,
+#             prompt_attention_mask=prompt.attention_mask,
+#             streamer=streamer,
+#             do_sample=True,
+#             temperature=1.0,
+#             min_new_tokens=10,
+#         )
+
+#         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
+#         thread.start()
+
+#         for new_audio in streamer:
+#             if new_audio.shape[0] == 0:
+#                 break
+#             # Save or process each audio chunk as it is generated
+#             yield sampling_rate, new_audio
+
+#     audio_segments = []
+#     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
+#         audio_segments.append(audio_chunk)
+
+#         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+#         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+#         logging.debug(f"Saved chunk to {temp_audio_path}")
+
+
+#     # Combine all the audio chunks into one audio file
+#     combined_audio = np.concatenate(audio_segments)
+#     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
+
+#     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
+
+#     logging.debug(f"Combined audio saved to {combined_audio_path}")
+#     return combined_audio_path
+
+
 import concurrent.futures
 import tempfile
 import os
@@ -913,7 +1040,7 @@ repo_id = "parler-tts/parler-tts-mini-v1"
 
 def generate_audio_parler_tts(text):
     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-    chunk_size_in_s = 0.5
+    chunk_size_in_s = 3.0  # Setting buffer size to 3 seconds
 
     # Initialize the tokenizer and model
     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -921,7 +1048,7 @@ def generate_audio_parler_tts(text):
     sampling_rate = parler_model.audio_encoder.config.sampling_rate
     frame_rate = parler_model.audio_encoder.config.frame_rate
 
-    def generate(text, description, play_steps_in_s=0.5):
+    def generate(text, description, play_steps_in_s=3.0):
         play_steps = int(frame_rate * play_steps_in_s)
         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
 
@@ -956,7 +1083,6 @@ def generate_audio_parler_tts(text):
         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
         logging.debug(f"Saved chunk to {temp_audio_path}")
 
-
     # Combine all the audio chunks into one audio file
     combined_audio = np.concatenate(audio_segments)
     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
@@ -967,6 +1093,7 @@ def generate_audio_parler_tts(text):
     return combined_audio_path
 
 
+
 def fetch_local_events():
     api_key = os.environ['SERP_API']
     url = f'https://serpapi.com/search.json?engine=google_events&q=Events+in+Birmingham&hl=en&gl=us&api_key={api_key}'
@@ -1392,6 +1519,18 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
     # .then(fn=clear_textbox, inputs=[], outputs=[chat_input],api_name="api_clear_textbox")
     # )
 
+    # retriever_sequence = (
+    #     retriever_button.click(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording")
+    #     .then(fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory")
+    #     # First, generate the bot response
+    #     .then(fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response")
+    #     # Then, generate the TTS response based on the bot's response
+    #     .then(fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response")
+    #     .then(fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details")
+    #     .then(fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox")
+    # )
+
+    # Gradio bot interaction with audio streaming
     retriever_sequence = (
         retriever_button.click(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording")
         .then(fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory")
@@ -1417,13 +1556,26 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
     #     fn=clear_textbox, inputs=[], outputs=[chat_input],api_name="api_clear_textbox"
     # )
 
+    # chat_input.submit(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording").then(
+    #     fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory"
+    # ).then(
+    #     # First, generate the bot response
+    #     fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response"
+    # ).then(
+    #     # Then, generate the TTS response based on the bot's response
+    #     fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response"
+    # ).then(
+    #     fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details"
+    # ).then(
+    #     fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox"
+    # )
+
+    # The same logic for chat_input submission
     chat_input.submit(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording").then(
         fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory"
     ).then(
-        # First, generate the bot response
         fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response"
     ).then(
-        # Then, generate the TTS response based on the bot's response
        fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response"
     ).then(
         fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details"
@@ -1436,7 +1588,6 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
 
 
 
-
     audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy', every=0.1)
     audio_input.stream(transcribe_function, inputs=[state, audio_input], outputs=[state, chat_input], api_name="api_voice_to_text")
 
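Taken together, this change replaces the old concurrent bot pipeline with two chained steps: generate_bot_response streams the text answer into the chat history, and only afterwards does generate_audio_after_text dispatch to a TTS backend keyed by tts_choice ("Alpha" for ElevenLabs, "Beta" for Parler). Below is a minimal, self-contained sketch of that pattern, not the app's own code: fake_llm_answer and fake_tts are hypothetical stand-ins for generate_answer and the real audio generators defined elsewhere in app.py.

# Illustrative sketch only (assumed helpers, not part of the commit).
import concurrent.futures
import time


def fake_llm_answer(prompt):
    # Hypothetical stand-in for generate_answer(...)
    return f"Echo: {prompt}"


def fake_tts(text, backend):
    # Hypothetical stand-in for generate_audio_elevenlabs / generate_audio_parler_tts
    return f"/tmp/{backend}_{abs(hash(text))}.wav"


def generate_bot_response(history):
    # Step 1: stream the answer character by character, as the diff's function does.
    if not history:
        return
    response = fake_llm_answer(history[-1][0])
    history[-1][1] = ""
    for character in response:
        history[-1][1] += character
        yield history  # each yield lets a UI re-render the partial text
        time.sleep(0.01)
    yield history  # final yield with the complete response


def generate_audio_after_text(response, tts_choice):
    # Step 2: only after the text is complete, run the selected TTS backend.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        if tts_choice == "Alpha":
            audio_future = executor.submit(fake_tts, response, "elevenlabs")
        elif tts_choice == "Beta":
            audio_future = executor.submit(fake_tts, response, "parler")
        else:
            raise ValueError(f"unknown tts_choice: {tts_choice}")
        return audio_future.result()


if __name__ == "__main__":
    history = [["hello", ""]]
    for _state in generate_bot_response(history):
        pass  # a UI layer such as Gradio would stream each state to the chatbot here
    print(generate_audio_after_text(history[-1][1], "Alpha"))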