Update app.py
app.py
CHANGED
@@ -356,34 +356,15 @@ Sure! Here's the information you requested:
 """
 
 
-# def generate_bot_response(history, choice, retrieval_mode, model_choice):
-#     if not history:
-#         return
-
-#     # Select the model
-#     # selected_model = chat_model if model_choice == "LM-1" else phi_pipe
-#     selected_model = chat_model if model_choice == "LM-1" else (chat_model1 if model_choice == "LM-3" else phi_pipe)
-
-
-#     response, addresses = generate_answer(history[-1][0], choice, retrieval_mode, selected_model)
-#     history[-1][1] = ""
-
-#     for character in response:
-#         history[-1][1] += character
-#         yield history  # Stream each character as it is generated
-#         time.sleep(0.05)  # Add a slight delay to simulate streaming
-
-#     yield history  # Final yield with the complete response
-
-
-# Modified bot function to separate chatbot response and TTS generation
 def generate_bot_response(history, choice, retrieval_mode, model_choice):
     if not history:
         return
 
     # Select the model
+    # selected_model = chat_model if model_choice == "LM-1" else phi_pipe
     selected_model = chat_model if model_choice == "LM-1" else (chat_model1 if model_choice == "LM-3" else phi_pipe)
 
+
     response, addresses = generate_answer(history[-1][0], choice, retrieval_mode, selected_model)
     history[-1][1] = ""
 
@@ -416,70 +397,34 @@ def generate_tts_response(response, tts_choice):
 
 
 
-# import concurrent.futures
-# # Existing bot function with concurrent futures for parallel processing
-# def bot(history, choice, tts_choice, retrieval_mode, model_choice):
-#     # Initialize an empty response
-#     response = ""
-
-#     # Create a thread pool to handle both text generation and TTS conversion in parallel
-#     with concurrent.futures.ThreadPoolExecutor() as executor:
-#         # Start the bot response generation in parallel
-#         bot_future = executor.submit(generate_bot_response, history, choice, retrieval_mode, model_choice)
-
-#         # Wait for the text generation to start
-#         for history_chunk in bot_future.result():
-#             response = history_chunk[-1][1]  # Update the response with the current state
-#             yield history_chunk, None  # Stream the text output as it's generated
-
-#         # Once text is fully generated, start the TTS conversion
-#         tts_future = executor.submit(generate_tts_response, response, tts_choice)
-
-#         # Get the audio output after TTS is done
-#         audio_path = tts_future.result()
-
-#         # Stream the final text and audio output
-#         yield history, audio_path
-
-
 import concurrent.futures
+# Existing bot function with concurrent futures for parallel processing
+def bot(history, choice, tts_choice, retrieval_mode, model_choice):
+    # Initialize an empty response
+    response = ""
 
-
+    # Create a thread pool to handle both text generation and TTS conversion in parallel
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # Start the bot response generation in parallel
+        bot_future = executor.submit(generate_bot_response, history, choice, retrieval_mode, model_choice)
 
-
-
+        # Wait for the text generation to start
+        for history_chunk in bot_future.result():
+            response = history_chunk[-1][1]  # Update the response with the current state
+            yield history_chunk, None  # Stream the text output as it's generated
+
+        # Once text is fully generated, start the TTS conversion
+        tts_future = executor.submit(generate_tts_response, response, tts_choice)
 
-
-
-    audio_future = None
+        # Get the audio output after TTS is done
+        audio_path = tts_future.result()
 
-
-
-        response += chunk
-        history[-1][1] += chunk
-        yield history, None  # Stream the text output as it's generated
+        # Stream the final text and audio output
+        yield history, audio_path
 
-        # Start generating Parler TTS if selected and not started already
-        if tts_choice == "Beta" and audio_future is None:
-            audio_future = asyncio.create_task(generate_audio_parler_tts(response, callback=lambda audio_chunk: yield_audio(audio_chunk)))
 
-        # Wait for the audio to finish streaming if it was started
-        if audio_future is not None:
-            await audio_future
 
-def yield_audio(audio_chunk):
-    """ Stream audio in chunks to the output """
-    temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_chunk_{int(time.time())}.wav")
-    write_wav(temp_audio_path, 16000, audio_chunk.astype(np.float32))
-    return temp_audio_path
 
-# Text generator as an async generator
-async def generate_text(history, choice, retrieval_mode, model_choice):
-    # Simulate text generation chunk by chunk
-    text_to_generate = "Generating text response..."
-    for char in text_to_generate:
-        await asyncio.sleep(0.05)  # Simulate time delay between character generation
-        yield char  # Yield each character as it's generated
 
 
 
@@ -507,21 +452,11 @@ def generate_bot_response(history, choice, retrieval_mode, model_choice):
 
 
 
-# def generate_audio_after_text(response, tts_choice):
-#     # Generate TTS audio after text response is completed
-#     with concurrent.futures.ThreadPoolExecutor() as executor:
-#         tts_future = executor.submit(generate_tts_response, response, tts_choice)
-#         audio_path = tts_future.result()
-#         return audio_path
-
 def generate_audio_after_text(response, tts_choice):
     # Generate TTS audio after text response is completed
     with concurrent.futures.ThreadPoolExecutor() as executor:
-
-
-        elif tts_choice == "Beta":
-            audio_future = executor.submit(generate_audio_parler_tts, response)  # Use the updated Parler TTS generator
-            audio_path = audio_future.result()
+        tts_future = executor.submit(generate_tts_response, response, tts_choice)
+        audio_path = tts_future.result()
         return audio_path
 
 import re
@@ -766,9 +701,9 @@ def generate_image(prompt):
     ).images[0]
     return image
 
-hardcoded_prompt_1 = "
-hardcoded_prompt_2 = "A
-hardcoded_prompt_3 = "
+hardcoded_prompt_1 = "A high quality cinematic image for Toyota Truck in Birmingham skyline shot in th style of Michael Mann"
+hardcoded_prompt_2 = "A high quality cinematic image for Alabama Quarterback close up emotional shot in th style of Michael Mann"
+hardcoded_prompt_3 = "A high quality cinematic image for Taylor Swift concert in Birmingham skyline style of Michael Mann"
 
 def update_images():
     image_1 = generate_image(hardcoded_prompt_1)
@@ -960,79 +895,6 @@ def generate_audio_elevenlabs(text):
 
 # chunking audio and then Process
 
-# import concurrent.futures
-# import tempfile
-# import os
-# import numpy as np
-# import logging
-# from queue import Queue
-# from threading import Thread
-# from scipy.io.wavfile import write as write_wav
-# from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
-# from transformers import AutoTokenizer
-
-# # Ensure your device is set to CUDA
-# device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-# repo_id = "parler-tts/parler-tts-mini-v1"
-
-# def generate_audio_parler_tts(text):
-#     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-#     chunk_size_in_s = 0.5
-
-#     # Initialize the tokenizer and model
-#     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
-#     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
-#     sampling_rate = parler_model.audio_encoder.config.sampling_rate
-#     frame_rate = parler_model.audio_encoder.config.frame_rate
-
-#     def generate(text, description, play_steps_in_s=0.5):
-#         play_steps = int(frame_rate * play_steps_in_s)
-#         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
-
-#         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
-#         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
-
-#         generation_kwargs = dict(
-#             input_ids=inputs.input_ids,
-#             prompt_input_ids=prompt.input_ids,
-#             attention_mask=inputs.attention_mask,
-#             prompt_attention_mask=prompt.attention_mask,
-#             streamer=streamer,
-#             do_sample=True,
-#             temperature=1.0,
-#             min_new_tokens=10,
-#         )
-
-#         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
-#         thread.start()
-
-#         for new_audio in streamer:
-#             if new_audio.shape[0] == 0:
-#                 break
-#             # Save or process each audio chunk as it is generated
-#             yield sampling_rate, new_audio
-
-#     audio_segments = []
-#     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
-#         audio_segments.append(audio_chunk)
-
-#         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
-#         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
-#         logging.debug(f"Saved chunk to {temp_audio_path}")
-
-
-#     # Combine all the audio chunks into one audio file
-#     combined_audio = np.concatenate(audio_segments)
-#     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
-
-#     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
-
-#     logging.debug(f"Combined audio saved to {combined_audio_path}")
-#     return combined_audio_path
-
-
-import asyncio
 import concurrent.futures
 import tempfile
 import os
@@ -1049,10 +911,9 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 repo_id = "parler-tts/parler-tts-mini-v1"
 
-
-async def generate_audio_parler_tts(text, callback=None):
+def generate_audio_parler_tts(text):
     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-    chunk_size_in_s =
+    chunk_size_in_s = 0.5
 
     # Initialize the tokenizer and model
     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -1060,7 +921,7 @@ async def generate_audio_parler_tts(text, callback=None):
     sampling_rate = parler_model.audio_encoder.config.sampling_rate
     frame_rate = parler_model.audio_encoder.config.frame_rate
 
-    def generate(text, description, play_steps_in_s=
+    def generate(text, description, play_steps_in_s=0.5):
         play_steps = int(frame_rate * play_steps_in_s)
         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
 
@@ -1070,6 +931,8 @@ async def generate_audio_parler_tts(text, callback=None):
         generation_kwargs = dict(
             input_ids=inputs.input_ids,
             prompt_input_ids=prompt.input_ids,
+            attention_mask=inputs.attention_mask,
+            prompt_attention_mask=prompt.attention_mask,
             streamer=streamer,
             do_sample=True,
             temperature=1.0,
@@ -1082,26 +945,28 @@ async def generate_audio_parler_tts(text, callback=None):
         for new_audio in streamer:
             if new_audio.shape[0] == 0:
                 break
-
-            callback(new_audio)  # Send the chunk to the callback function for streaming
+            # Save or process each audio chunk as it is generated
             yield sampling_rate, new_audio
 
     audio_segments = []
     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
         audio_segments.append(audio_chunk)
-        await asyncio.sleep(0)  # Allow other tasks to run
 
-
+        temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+        write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+        logging.debug(f"Saved chunk to {temp_audio_path}")
+
+
+    # Combine all the audio chunks into one audio file
     combined_audio = np.concatenate(audio_segments)
     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
+
     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
 
     logging.debug(f"Combined audio saved to {combined_audio_path}")
     return combined_audio_path
 
 
-
-
 def fetch_local_events():
     api_key = os.environ['SERP_API']
     url = f'https://serpapi.com/search.json?engine=google_events&q=Events+in+Birmingham&hl=en&gl=us&api_key={api_key}'
@@ -1527,25 +1392,13 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
    #     .then(fn=clear_textbox, inputs=[], outputs=[chat_input],api_name="api_clear_textbox")
    # )
 
-    # retriever_sequence = (
-    #     retriever_button.click(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording")
-    #     .then(fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory")
-    #     # First, generate the bot response
-    #     .then(fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response")
-    #     # Then, generate the TTS response based on the bot's response
-    #     .then(fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response")
-    #     .then(fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details")
-    #     .then(fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox")
-    # )
-
-    # Gradio bot interaction with audio streaming
     retriever_sequence = (
         retriever_button.click(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording")
         .then(fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory")
         # First, generate the bot response
         .then(fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response")
-        #
-        .then(fn=
+        # Then, generate the TTS response based on the bot's response
+        .then(fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response")
         .then(fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details")
         .then(fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox")
     )
@@ -1564,25 +1417,14 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
    #     fn=clear_textbox, inputs=[], outputs=[chat_input],api_name="api_clear_textbox"
    # )
 
-    # chat_input.submit(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording").then(
-    #     fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory"
-    # ).then(
-    #     # First, generate the bot response
-    #     fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response"
-    # ).then(
-    #     # Then, generate the TTS response based on the bot's response
-    #     fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response"
-    # ).then(
-    #     fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details"
-    # ).then(
-    #     fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox"
-    # )
-
-    # The same logic for chat_input submission
     chat_input.submit(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording").then(
         fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory"
     ).then(
-
+        # First, generate the bot response
+        fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response"
+    ).then(
+        # Then, generate the TTS response based on the bot's response
+        fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response"
     ).then(
         fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details"
     ).then(
@@ -1594,6 +1436,7 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
 
 
 
+
         audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy', every=0.1)
         audio_input.stream(transcribe_function, inputs=[state, audio_input], outputs=[state, chat_input], api_name="api_voice_to_text")
 
@@ -1614,11 +1457,4 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
     refresh_button.click(fn=update_images, inputs=None, outputs=[image_output_1, image_output_2, image_output_3], api_name="update_image")
 
 demo.queue()
-demo.launch(show_error=True)
-
-
-
-
-
-
-
+demo.launch(show_error=True)