Update app.py
app.py
CHANGED
@@ -444,6 +444,7 @@ def generate_tts_response(response, tts_choice):
 
 import concurrent.futures
 
+# Modified bot function to handle text and audio concurrently
 def bot(history, choice, tts_choice, retrieval_mode, model_choice):
     # Initialize an empty response
     response = ""
@@ -458,17 +459,22 @@ def bot(history, choice, tts_choice, retrieval_mode, model_choice):
             response = history_chunk[-1][1]  # Update the response with the current state
             yield history_chunk, None  # Stream the text output as it's generated
 
-
-
-
-
-        audio_path = tts_future.result()
-
-        # Stream the final text and audio output
-        yield history, audio_path
-
+        # Start streaming Parler TTS as text is being generated
+        if tts_choice == "Beta":  # Parler TTS
+            parler_tts_future = executor.submit(generate_audio_parler_tts, response, callback=lambda audio_chunk: yield_audio(audio_chunk))
+            parler_tts_future.result()
 
+        # Once text is fully generated, start the Eleven Labs TTS if chosen
+        if tts_choice == "Alpha":  # Eleven Labs
+            tts_future = executor.submit(generate_tts_response, response, tts_choice)
+            audio_path = tts_future.result()
+            yield history, audio_path
 
+def yield_audio(audio_chunk):
+    """ Stream audio in chunks to the output """
+    temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_chunk_{int(time.time())}.wav")
+    write_wav(temp_audio_path, 16000, audio_chunk.astype(np.float32))
+    return temp_audio_path
 
 
 
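Note: the new Beta/Alpha branches assume a concurrent.futures executor is already in scope inside bot() (the removed lines that created it are not readable in this hunk), and they rely on generate_tts_response and the new yield_audio helper. A minimal, self-contained sketch of the same stream-text-first, attach-audio-later pattern, with illustrative names that are not from app.py:

import concurrent.futures

def stream_text_then_audio(text_chunks, synthesize):
    # Stream partial text immediately; run TTS for the full text in a worker thread.
    response = ""
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for chunk in text_chunks:
            response += chunk
            yield response, None                   # text streams as it arrives
        audio_future = executor.submit(synthesize, response)
        yield response, audio_future.result()      # audio is attached once synthesis finishes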
@@ -1038,9 +1044,9 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 repo_id = "parler-tts/parler-tts-mini-v1"
 
-def generate_audio_parler_tts(text):
+def generate_audio_parler_tts(text, callback=None):
     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-    chunk_size_in_s = 3.0 #
+    chunk_size_in_s = 3.0  # Set to 3-second chunks
 
     # Initialize the tokenizer and model
     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -1058,8 +1064,6 @@ def generate_audio_parler_tts(text):
         generation_kwargs = dict(
             input_ids=inputs.input_ids,
             prompt_input_ids=prompt.input_ids,
-            attention_mask=inputs.attention_mask,
-            prompt_attention_mask=prompt.attention_mask,
             streamer=streamer,
             do_sample=True,
             temperature=1.0,
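Dropping attention_mask and prompt_attention_mask should be harmless for a single, unpadded prompt: the tokenizer still produces the masks, and generate() can fall back to an all-ones mask; explicit masks mainly matter for padded batches. A quick check (the tokenizer name comes from the hunks above, the rest is illustrative):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
enc = tok("Hello from Birmingham", return_tensors="pt")
print(list(enc.keys()))  # ['input_ids', 'attention_mask']; the mask still exists,
                         # it is simply no longer passed to model.generate()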
@@ -1072,21 +1076,17 @@ def generate_audio_parler_tts(text):
         for new_audio in streamer:
             if new_audio.shape[0] == 0:
                 break
-
+            if callback:
+                callback(new_audio)  # Send the chunk to the callback function for streaming
             yield sampling_rate, new_audio
 
     audio_segments = []
     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
         audio_segments.append(audio_chunk)
 
-
-        write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
-        logging.debug(f"Saved chunk to {temp_audio_path}")
-
-    # Combine all the audio chunks into one audio file
+    # Combine all the audio chunks into one audio file after streaming
     combined_audio = np.concatenate(audio_segments)
     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
-
     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
 
     logging.debug(f"Combined audio saved to {combined_audio_path}")
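For context, generate() above is a nested helper that drives ParlerTTSStreamer from a background thread; its pieces are spread across several hunks, so here is a condensed sketch of that streaming pattern, modelled on the parler_tts streaming example. stream_parler_chunks and its defaults are illustrative names, not code from app.py:

from threading import Thread
from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
from transformers import AutoTokenizer

def stream_parler_chunks(text, description, chunk_size_in_s=3.0,
                         repo_id="parler-tts/parler-tts-mini-v1", device="cpu"):
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
    sampling_rate = model.audio_encoder.config.sampling_rate
    frame_rate = model.audio_encoder.config.frame_rate
    # play_steps controls how much audio is buffered before the streamer yields a chunk
    streamer = ParlerTTSStreamer(model, device=device, play_steps=int(frame_rate * chunk_size_in_s))
    inputs = tokenizer(description, return_tensors="pt").to(device)
    prompt = tokenizer(text, return_tensors="pt").to(device)
    generation_kwargs = dict(
        input_ids=inputs.input_ids,
        prompt_input_ids=prompt.input_ids,
        streamer=streamer,
        do_sample=True,
        temperature=1.0,
    )
    # Run generation in a background thread so chunks can be consumed as they are produced
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    for new_audio in streamer:
        if new_audio.shape[0] == 0:
            break
        yield sampling_rate, new_audio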
@@ -1094,6 +1094,7 @@ def generate_audio_parler_tts(text):
 
 
 
+
 def fetch_local_events():
     api_key = os.environ['SERP_API']
     url = f'https://serpapi.com/search.json?engine=google_events&q=Events+in+Birmingham&hl=en&gl=us&api_key={api_key}'
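Unrelated to the TTS change, the context above shows fetch_local_events building a SerpAPI google_events query. A minimal sketch of consuming that endpoint (the requests usage and result parsing are illustrative; the real function's body is not shown in this diff):

import os
import requests

def fetch_birmingham_events(api_key=None):
    api_key = api_key or os.environ['SERP_API']
    url = ('https://serpapi.com/search.json?engine=google_events'
           f'&q=Events+in+Birmingham&hl=en&gl=us&api_key={api_key}')
    data = requests.get(url, timeout=30).json()
    # SerpAPI's google_events engine returns its hits under "events_results"
    return [event.get("title") for event in data.get("events_results", [])]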
@@ -1536,8 +1537,8 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
         .then(fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory")
         # First, generate the bot response
         .then(fn=generate_bot_response, inputs=[chatbot, choice, retrieval_mode, model_choice], outputs=[chatbot], api_name="api_generate_bot_response")
-        #
-        .then(fn=
+        # Generate the TTS response based on the bot's response concurrently
+        .then(fn=bot, inputs=[chatbot, choice, tts_choice, retrieval_mode, model_choice], outputs=[chatbot, audio_output], api_name="api_generate_tts_response")
         .then(fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details")
         .then(fn=clear_textbox, inputs=[], outputs=[chat_input], api_name="api_clear_textbox")
     )
@@ -1574,9 +1575,7 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
     chat_input.submit(fn=stop_audio, inputs=[], outputs=[audio_output], api_name="api_stop_audio_recording").then(
         fn=add_message, inputs=[chatbot, chat_input], outputs=[chatbot, chat_input], api_name="api_addprompt_chathistory"
     ).then(
-        fn=
-    ).then(
-        fn=generate_tts_response, inputs=[chatbot, tts_choice], outputs=[audio_output], api_name="api_generate_tts_response"
+        fn=bot, inputs=[chatbot, choice, tts_choice, retrieval_mode, model_choice], outputs=[chatbot, audio_output], api_name="api_generate_tts_response"
     ).then(
         fn=show_map_if_details, inputs=[chatbot, choice], outputs=[location_output, location_output], api_name="api_show_map_details"
     ).then(
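Both event chains now route the same bot generator into [chatbot, audio_output]; in Gradio, a generator passed to an event listener can yield one value per output component on every step. A stripped-down sketch of that wiring, with placeholder component names rather than the ones in app.py:

import numpy as np
import gradio as gr

def fake_bot(history):
    # Each yield updates both outputs: the chatbot and the audio player.
    history = history + [["hi", ""]]
    for token in ["He", "llo", "!"]:
        history[-1][1] += token
        yield history, None                        # stream text first
    sr = 16000
    tone = 0.1 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)
    yield history, (sr, tone)                      # attach audio once it is ready

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    audio_out = gr.Audio()
    box = gr.Textbox()
    box.submit(fn=fake_bot, inputs=[chatbot], outputs=[chatbot, audio_out]) \
       .then(fn=lambda: "", inputs=[], outputs=[box])

demo.launch()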