Update app.py
app.py
CHANGED
@@ -2,7 +2,6 @@
 # ruff: noqa: E402

 import json
-import re
 import tempfile
 import os

@@ -17,7 +16,7 @@ from groq import Groq
 from cached_path import cached_path
 from transformers import AutoModelForCausalLM, AutoTokenizer

-# Try to import spaces; if available,
+# Try to import spaces; if available, wrap functions for GPU support.
 try:
     import spaces

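The completed comment names a common Hugging Face Spaces pattern: import `spaces` when it is available and fall back to a no-op otherwise. A minimal sketch of what such a guard plausibly looks like — the `USING_SPACES` flag and the fallback body are assumptions, since only the try/import and the `@gpu_decorator` usages appear in this diff:

# Hypothetical reconstruction of the guarded import the comment describes.
try:
    import spaces

    USING_SPACES = True
except ImportError:
    USING_SPACES = False


def gpu_decorator(func):
    # On ZeroGPU Spaces, request a GPU while the wrapped function runs;
    # everywhere else, return the function unchanged.
    if USING_SPACES:
        return spaces.GPU(func)
    return func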
@@ -70,7 +69,6 @@ def load_f5tts(
 F5TTS_ema_model = load_f5tts()


-
 @gpu_decorator
 def generate_response(messages, apikey):
     """
@@ -88,14 +86,13 @@ def generate_response(messages, apikey):
         model="deepseek-r1-distill-llama-70b",
         stream=False,
     )
-    # Check that we got a valid response.
     if chat_completion.choices and hasattr(chat_completion.choices[0].message, "content"):
         return chat_completion.choices[0].message.content
     return ""


 @gpu_decorator
-def process_audio_input(audio_path, text, history, conv_state):
+def process_audio_input(audio_path, text, apikey, history, conv_state):
     """
     Process audio and/or text input from the user:
     - If an audio file is provided, its transcript is obtained.
@@ -105,7 +102,7 @@ def process_audio_input(audio_path, text, history, conv_state):
         return history, conv_state, ""

     if audio_path:
-        # preprocess_ref_audio_text returns a tuple (audio, transcript)
+        # preprocess_ref_audio_text returns a tuple (audio, transcript)
         _, text = preprocess_ref_audio_text(audio_path, text)

     if not text.strip():
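The transcript fallback here matches the new UI hint "Optional: Leave blank to auto-transcribe". A hedged usage sketch — the empty-string behavior is inferred from that hint, not shown in this hunk:

# Assumed behavior: given an empty transcript, preprocess_ref_audio_text
# transcribes the clip itself and returns (processed_audio, transcript).
_, transcript = preprocess_ref_audio_text(audio_path, "")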
@@ -113,7 +110,7 @@ def process_audio_input(audio_path, text, history, conv_state):

     conv_state.append({"role": "user", "content": text})
     history.append((text, None))
-    response = generate_response(conv_state)
+    response = generate_response(conv_state, apikey)
     conv_state.append({"role": "assistant", "content": response})
     history[-1] = (text, response)
     return history, conv_state, ""
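The `apikey` threaded through `process_audio_input` ends up in `generate_response`, whose body (in the earlier hunk) reads `chat_completion.choices[0].message.content`. A minimal sketch of the per-request pattern this enables; constructing the `Groq` client inside the function is an assumption, since the client setup sits outside these hunks:

from groq import Groq


def generate_response(messages, apikey):
    # Hypothetical client setup: build the client from the key the UI
    # collects, rather than from a server-side environment variable.
    client = Groq(api_key=apikey)
    chat_completion = client.chat.completions.create(
        messages=messages,
        model="deepseek-r1-distill-llama-70b",
        stream=False,
    )
    if chat_completion.choices and hasattr(chat_completion.choices[0].message, "content"):
        return chat_completion.choices[0].message.content
    return ""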
@@ -177,7 +174,7 @@ def infer(
     return (final_sample_rate, final_wave), spectrogram_path, ref_text


-with gr.Blocks() as
+with gr.Blocks() as app:
     gr.Markdown(
         """
# Voice Chat
@@ -189,141 +186,133 @@ Have a conversation with an AI using your reference voice!
"""
     )

-[… 14 removed lines lost in page extraction (start of the old control layout) …]
-                    lines=2,
-                )
-                system_prompt_chat = gr.Textbox(
-                    label="System Prompt",
-                    value=(
-                        "You are not an AI assistant, you are whoever the user says you are. "
-                        "You must stay in character. Keep your responses concise since they will be spoken out loud."
-                    ),
-                    lines=2,
-                )
-
-    chatbot_interface = gr.Chatbot(label="Conversation")
-    with gr.Row():
-        with gr.Column():
-            audio_input_chat = gr.Microphone(label="Speak your message", type="filepath")
-            audio_output_chat = gr.Audio(autoplay=True)
-        with gr.Column():
-            with gr.Row():
-                groq_apikey = gr.Textbox(label="Your Groq API Key")
-                text_input_chat = gr.Textbox(label="Type your message", lines=1)
-            send_btn_chat = gr.Button("Send Message")
-            clear_btn_chat = gr.Button("Clear Conversation")
-
-    # Initialize the conversation state with the system prompt.
-    conversation_state = gr.State(
-        value=[
-            {
-                "role": "system",
-                "content": (
-                    "You are not an AI assistant, you are whoever the user says you are. "
-                    "You must stay in character. Keep your responses concise since they will be spoken out loud."
-                ),
-            }
-        ]
-    )
-
-[… 20 removed lines lost in page extraction (old audio-response code, ending below) …]
-        )
-        return audio_result, ref_text_out
-
-    def clear_conversation():
-        """
-        Clear the chat conversation and reset the conversation state.
-        """
-        initial_state = [
-            {
-                "role": "system",
-                "content": (
-                    "You are not an AI assistant, you are whoever the user says you are. "
-                    "You must stay in character. Keep your responses concise since they will be spoken out loud."
-                ),
-            }
-        ]
-        return [], initial_state
-
-    def update_system_prompt(new_prompt):
-        """
-        Update the system prompt and reset the conversation.
-        """
-        initial_state = [{"role": "system", "content": new_prompt}]
-        return [], initial_state
-
-    # Set up callbacks so that when recording stops, or text is submitted, the chain of processing is run.
-    audio_input_chat.stop_recording(
-        process_audio_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state],
-    ).then(
-        generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
-        outputs=[audio_output_chat, ref_text_chat],
-    ).then(lambda: None, None, audio_input_chat)
-
-    text_input_chat.submit(
-        process_audio_input,
-        inputs=[audio_input_chat, text_input_chat, groq_apikey, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state],
-    ).then(
-        generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
-        outputs=[audio_output_chat, ref_text_chat],
-    ).then(lambda: None, None, text_input_chat)
-
-    send_btn_chat.click(
-        process_audio_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state],
-    ).then(
-        generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
-        outputs=[audio_output_chat, ref_text_chat],
-    ).then(lambda: None, None, text_input_chat)
-
-    clear_btn_chat.click(clear_conversation, outputs=[chatbot_interface, conversation_state])
-    system_prompt_chat.change(
-        update_system_prompt,
-        inputs=system_prompt_chat,
-        outputs=[chatbot_interface, conversation_state],
-    )
-
-[… 2 removed lines lost in page extraction …]
+    with gr.Row():
+        with gr.Column():
+            ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
+        with gr.Column():
+            with gr.Accordion("Advanced Settings", open=False):
+                remove_silence_chat = gr.Checkbox(label="Remove Silences", value=True)
+                ref_text_chat = gr.Textbox(
+                    label="Reference Text",
+                    info="Optional: Leave blank to auto-transcribe",
+                    lines=2,
+                )
+                system_prompt_chat = gr.Textbox(
+                    label="System Prompt",
+                    value=(
+                        "You are not an AI assistant, you are whoever the user says you are. "
+                        "You must stay in character. Keep your responses concise since they will be spoken out loud."
+                    ),
+                    lines=2,
+                )
+
+    chatbot_interface = gr.Chatbot(label="Conversation")
+
+    with gr.Row():
+        with gr.Column():
+            audio_input_chat = gr.Microphone(label="Speak your message", type="filepath")
+            audio_output_chat = gr.Audio(autoplay=True)
+        with gr.Column():
+            groq_apikey = gr.Textbox(label="Your Groq API Key")
+            text_input_chat = gr.Textbox(label="Type your message", lines=1)
+            send_btn_chat = gr.Button("Send Message")
+            clear_btn_chat = gr.Button("Clear Conversation")
+
+    # Initialize the conversation state with the system prompt.
+    conversation_state = gr.State(
+        value=[
+            {
+                "role": "system",
+                "content": (
+                    "You are not an AI assistant, you are whoever the user says you are. "
+                    "You must stay in character. Keep your responses concise since they will be spoken out loud."
+                ),
+            }
+        ]
+    )
+
+    @gpu_decorator
+    def generate_audio_response(history, ref_audio, ref_text, remove_silence):
+        """
+        Generate an audio response from the last AI message in the conversation.
+        """
+        if not history or not ref_audio:
+            return None, ref_text
+
+        last_user_message, last_ai_response = history[-1]
+        if not last_ai_response:
+            return None, ref_text
+
+        audio_result, _, ref_text_out = infer(
+            ref_audio,
+            ref_text,
+            last_ai_response,
+            remove_silence,
+            cross_fade_duration=0.15,
+            speed=1.0,
+            show_info=print,
+        )
+        return audio_result, ref_text_out
+
+    def clear_conversation():
+        """
+        Clear the chat conversation and reset the conversation state.
+        """
+        initial_state = [
+            {
+                "role": "system",
+                "content": (
+                    "You are not an AI assistant, you are whoever the user says you are. "
+                    "You must stay in character. Keep your responses concise since they will be spoken out loud."
+                ),
+            }
+        ]
+        return [], initial_state
+
+    def update_system_prompt(new_prompt):
+        """
+        Update the system prompt and reset the conversation.
+        """
+        initial_state = [{"role": "system", "content": new_prompt}]
+        return [], initial_state
+
+    # Set up callbacks so that when recording stops or text is submitted, the processing chain is run.
+    audio_input_chat.stop_recording(
+        process_audio_input,
+        inputs=[audio_input_chat, text_input_chat, groq_apikey, chatbot_interface, conversation_state],
+        outputs=[chatbot_interface, conversation_state, None],
+    ).then(
+        generate_audio_response,
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
+        outputs=[audio_output_chat, ref_text_chat],
+    ).then(lambda: None, None, audio_input_chat)
+
+    text_input_chat.submit(
+        process_audio_input,
+        inputs=[audio_input_chat, text_input_chat, groq_apikey, chatbot_interface, conversation_state],
+        outputs=[chatbot_interface, conversation_state, None],
+    ).then(
+        generate_audio_response,
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
+        outputs=[audio_output_chat, ref_text_chat],
+    ).then(lambda: None, None, text_input_chat)
+
+    send_btn_chat.click(
+        process_audio_input,
+        inputs=[audio_input_chat, text_input_chat, groq_apikey, chatbot_interface, conversation_state],
+        outputs=[chatbot_interface, conversation_state, None],
+    ).then(
+        generate_audio_response,
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
+        outputs=[audio_output_chat, ref_text_chat],
+    ).then(lambda: None, None, text_input_chat)
+
+    clear_btn_chat.click(clear_conversation, outputs=[chatbot_interface, conversation_state])
+    system_prompt_chat.change(
+        update_system_prompt,
+        inputs=system_prompt_chat,
+        outputs=[chatbot_interface, conversation_state],
+    )


 @click.command()
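For readers unfamiliar with the Gradio patterns this commit leans on, here is a self-contained toy sketch (a hypothetical demo, not this Space's code) of the two ideas the rebuilt block combines: gr.State for per-session conversation history, and .then() chaining to run a follow-up step and then reset an input. Note the commit passes None as the third entry in the outputs lists and clears the textbox in a separate .then(); the sketch keeps that chained-clear style.

import gradio as gr

with gr.Blocks() as demo:
    # gr.State is copied per browser session, so each visitor
    # gets an independent conversation history.
    conv_state = gr.State([{"role": "system", "content": "You are concise."}])
    chatbot = gr.Chatbot(label="Conversation")
    msg_box = gr.Textbox(label="Type your message")

    def respond(message, history, conv):
        conv.append({"role": "user", "content": message})
        reply = f"echo: {message}"  # stand-in for the real LLM call
        conv.append({"role": "assistant", "content": reply})
        return history + [(message, reply)], conv

    def speak(history):
        # Stand-in for the follow-up step; the real app runs
        # generate_audio_response here to synthesize the reply.
        print("would synthesize:", history[-1][1])

    # Chain: update the chat, then run the follow-up step, then clear the box.
    msg_box.submit(
        respond,
        inputs=[msg_box, chatbot, conv_state],
        outputs=[chatbot, conv_state],
    ).then(
        speak,
        inputs=chatbot,
        outputs=None,
    ).then(lambda: None, None, msg_box)

demo.launch()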