Twelve2five committed on
Commit
f558bc0
·
verified ·
1 Parent(s): 227326d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -30
app.py CHANGED
@@ -17,13 +17,23 @@ import io
17
  import soundfile as sf
18
  from gtts import gTTS
19
  import re
 
 
 
 
 
20
 
21
  # Load environment variables
22
  load_dotenv()
23
 
 
 
 
24
  # Initialize clients
 
25
  elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
26
  stt_model = get_stt_model()
 
27
 
28
  class DeepSeekAPI:
29
  def __init__(self, api_key):
@@ -45,7 +55,7 @@ class DeepSeekAPI:
45
 
46
  # Check for error response
47
  if response.status_code != 200:
48
- print(f"DeepSeek API error: {response.status_code} - {response.text}")
49
  return {"choices": [{"message": {"content": "I'm sorry, I encountered an error processing your request."}}]}
50
 
51
  return response.json()
@@ -60,8 +70,9 @@ def response(
60
  messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
61
 
62
  # Convert speech to text
 
63
  text = stt_model.stt(audio)
64
- print("prompt:", text)
65
 
66
  # Add user message to chat
67
  chatbot.append({"role": "user", "content": text})
@@ -71,8 +82,10 @@ def response(
71
  messages.append({"role": "user", "content": text})
72
 
73
  # Call DeepSeek API
 
74
  response_data = deepseek_client.chat_completion(messages)
75
  response_text = response_data["choices"][0]["message"]["content"]
 
76
 
77
  # Add AI response to chat
78
  chatbot.append({"role": "assistant", "content": response_text})
@@ -80,7 +93,7 @@ def response(
80
  # Convert response to speech
81
  if os.getenv("ELEVENLABS_API_KEY"):
82
  try:
83
- print(f"Generating ElevenLabs speech for response")
84
 
85
  # Use the streaming API for better experience
86
  for chunk in elevenlabs_client.text_to_speech.convert_as_stream(
@@ -93,12 +106,12 @@ def response(
93
  yield (24000, audio_array)
94
 
95
  except Exception as e:
96
- print(f"ElevenLabs error: {e}, falling back to gTTS")
97
  # Fall back to gTTS
98
  yield from use_gtts_for_text(response_text)
99
  else:
100
  # Fall back to gTTS
101
- print("ElevenLabs API key not found, using gTTS...")
102
  yield from use_gtts_for_text(response_text)
103
 
104
  yield AdditionalOutputs(chatbot)
@@ -114,7 +127,7 @@ def use_gtts_for_text(text):
114
  continue
115
 
116
  mp3_fp = io.BytesIO()
117
- print(f"Using gTTS for sentence: {sentence[:30]}...")
118
  tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
119
  tts.write_to_fp(mp3_fp)
120
  mp3_fp.seek(0)
@@ -147,48 +160,103 @@ def use_gtts_for_text(text):
147
  chunk = chunk.reshape(1, -1)
148
  yield (24000, chunk)
149
  except Exception as e:
150
- print(f"gTTS error: {e}")
151
  yield None
152
 
153
- # Enhanced WebRTC configuration
154
  rtc_configuration = {
155
  "iceServers": [
156
- {"urls": ["stun:stun.l.google.com:19302", "stun:stun1.l.google.com:19302"]},
 
 
 
 
 
 
 
157
  {
158
  "urls": ["turn:openrelay.metered.ca:80"],
159
  "username": "openrelayproject",
160
  "credential": "openrelayproject"
161
  },
 
 
 
 
 
162
  {
163
  "urls": ["turn:openrelay.metered.ca:443?transport=tcp"],
164
  "username": "openrelayproject",
165
  "credential": "openrelayproject"
166
- }
 
 
 
 
 
167
  ],
168
- "iceCandidatePoolSize": 10
 
 
 
169
  }
170
 
171
- # Create the Stream component outside of any Blocks context
172
- chatbot = gr.Chatbot(type="messages", visible=False) # Will be used for state only
173
-
174
- stream = Stream(
175
- modality="audio",
176
- mode="send-receive",
177
- handler=ReplyOnPause(response, input_sample_rate=16000),
178
- additional_outputs_handler=lambda a, b: b,
179
- additional_inputs=[chatbot],
180
- additional_outputs=[chatbot],
181
- rtc_configuration=rtc_configuration,
182
- concurrency_limit=5 if get_space() else None,
183
- time_limit=90 if get_space() else None,
184
- ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"}
185
- )
186
-
187
- # Create a basic Gradio interface
188
- demo = stream.ui
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  # Launch the app
191
  if __name__ == "__main__":
192
  # Local development
 
193
  os.environ["GRADIO_SSR_MODE"] = "false"
194
- demo.launch(server_port=7860)
 
 
 
 
 
17
  import soundfile as sf
18
  from gtts import gTTS
19
  import re
20
+ import logging
21
+
22
+ # Set up logging for WebRTC debugging
23
+ logging.basicConfig(level=logging.DEBUG)
24
+ logger = logging.getLogger("fastrtc-voice-assistant")
25
 
26
  # Load environment variables
27
  load_dotenv()
28
 
29
+ # Enable WebRTC debug tracing
30
+ os.environ["WEBRTC_TRACE"] = "WEBRTC_TRACE_ALL"
31
+
32
  # Initialize clients
33
+ logger.info("Initializing clients...")
34
  elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
35
  stt_model = get_stt_model()
36
+ logger.info("Clients initialized")
37
 
38
  class DeepSeekAPI:
39
  def __init__(self, api_key):
 
55
 
56
  # Check for error response
57
  if response.status_code != 200:
58
+ logger.error(f"DeepSeek API error: {response.status_code} - {response.text}")
59
  return {"choices": [{"message": {"content": "I'm sorry, I encountered an error processing your request."}}]}
60
 
61
  return response.json()
 
70
  messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
71
 
72
  # Convert speech to text
73
+ logger.info("Converting speech to text...")
74
  text = stt_model.stt(audio)
75
+ logger.info(f"User said: {text}")
76
 
77
  # Add user message to chat
78
  chatbot.append({"role": "user", "content": text})
 
82
  messages.append({"role": "user", "content": text})
83
 
84
  # Call DeepSeek API
85
+ logger.info("Calling DeepSeek API...")
86
  response_data = deepseek_client.chat_completion(messages)
87
  response_text = response_data["choices"][0]["message"]["content"]
88
+ logger.info(f"DeepSeek response: {response_text[:50]}...")
89
 
90
  # Add AI response to chat
91
  chatbot.append({"role": "assistant", "content": response_text})
 
93
  # Convert response to speech
94
  if os.getenv("ELEVENLABS_API_KEY"):
95
  try:
96
+ logger.info("Using ElevenLabs for speech generation")
97
 
98
  # Use the streaming API for better experience
99
  for chunk in elevenlabs_client.text_to_speech.convert_as_stream(
 
106
  yield (24000, audio_array)
107
 
108
  except Exception as e:
109
+ logger.error(f"ElevenLabs error: {e}, falling back to gTTS")
110
  # Fall back to gTTS
111
  yield from use_gtts_for_text(response_text)
112
  else:
113
  # Fall back to gTTS
114
+ logger.info("ElevenLabs API key not found, using gTTS...")
115
  yield from use_gtts_for_text(response_text)
116
 
117
  yield AdditionalOutputs(chatbot)
 
127
  continue
128
 
129
  mp3_fp = io.BytesIO()
130
+ logger.info(f"Using gTTS for: {sentence[:30]}...")
131
  tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
132
  tts.write_to_fp(mp3_fp)
133
  mp3_fp.seek(0)
 
160
  chunk = chunk.reshape(1, -1)
161
  yield (24000, chunk)
162
  except Exception as e:
163
+ logger.error(f"gTTS error: {e}")
164
  yield None
165
 
166
# WebRTC peer-connection configuration: public STUN servers for NAT
# discovery, plus OpenRelay TURN relays as a fallback media path when a
# direct connection cannot be established.
# NOTE(review): stun.stunprotocol.org has reportedly been decommissioned —
# verify it still resolves before relying on it.
_GOOGLE_STUN_URLS = [f"stun:stun{n}.l.google.com:19302" for n in ("", "1", "2", "3", "4")]
_EXTRA_STUN_URLS = [
    "stun:stun.stunprotocol.org:3478",
    "stun:stun.voip.blackberry.com:3478",
    "stun:stun.nextcloud.com:443",
]
_TURN_URLS = [
    "turn:openrelay.metered.ca:80",
    "turn:openrelay.metered.ca:443",
    "turn:openrelay.metered.ca:443?transport=tcp",
]
_TURN_CREDS = {"username": "openrelayproject", "credential": "openrelayproject"}

rtc_configuration = {
    "iceServers": (
        [{"urls": [url]} for url in _GOOGLE_STUN_URLS]
        + [{"urls": [url], **_TURN_CREDS} for url in _TURN_URLS]
        + [{"urls": [url]} for url in _EXTRA_STUN_URLS]
    ),
    "iceCandidatePoolSize": 10,
    "bundlePolicy": "max-bundle",
    "rtcpMuxPolicy": "require",
    "iceTransportPolicy": "all",  # Try "relay" if "all" doesn't work
}
203
 
204
# Build the Gradio UI: chat transcript plus debug widgets, with the
# FastRTC audio stream rendered inside the same Blocks context.
with gr.Blocks(title="LLM Voice Chat (Powered by DeepSeek & ElevenLabs)") as demo:
    gr.Markdown("# LLM Voice Chat\nPowered by DeepSeek & ElevenLabs")

    with gr.Row():
        with gr.Column(scale=3):
            # Conversation transcript; also passed to/from the audio
            # handler as additional input/output state.
            chatbot = gr.Chatbot(type="messages")

            # Read-only widgets so the user can see connection state.
            connection_status = gr.Textbox(
                label="Connection Status",
                value="Ready to connect. Click the microphone button to start.",
                interactive=False,
            )
            debug_info = gr.Textbox(
                label="Debug Info",
                value="WebRTC debug information will appear here.",
                interactive=False,
            )

            # Manual "kick" for a stuck WebRTC connection.
            refresh_btn = gr.Button("Refresh Connection")

            def refresh_page():
                """Report a refresh attempt with a fresh timestamp.

                Returns the new (connection_status, debug_info) values;
                Gradio applies them through the `outputs` mapping on the
                click event below. (The previous version also assigned
                `debug_info.value` directly, which has no effect
                server-side in Gradio and has been removed.)
                """
                # `time` is not imported at the top of this file (the
                # update only added `import logging`), so import it here
                # to avoid a NameError on click.
                import time

                now = time.time()
                return "Refreshed", f"Connection refresh attempted at {now}"

            refresh_btn.click(
                refresh_page,
                outputs=[connection_status, debug_info],
            )

    logger.info("Creating Stream component...")
    # The Stream is created and rendered INSIDE the Blocks context so its
    # widgets mount into this page (the previous comment claiming it was
    # "outside of the blocks context" contradicted the code).
    stream = Stream(
        modality="audio",
        mode="send-receive",
        handler=ReplyOnPause(response, input_sample_rate=16000),
        additional_outputs_handler=lambda a, b: b,
        additional_inputs=[chatbot],
        additional_outputs=[chatbot],
        rtc_configuration=rtc_configuration,
        # Resource limits only apply on shared Hugging Face hardware.
        concurrency_limit=5 if get_space() else None,
        time_limit=90 if get_space() else None,
    )

    # Mount the stream's widgets into the Blocks layout.
    stream.render()
    logger.info("Stream component created and rendered")

# Launch the app
if __name__ == "__main__":
    # Local development. NOTE(review): share=True opens a public Gradio
    # tunnel in addition to the local port — confirm this is intended.
    logger.info("Running in development mode")
    os.environ["GRADIO_SSR_MODE"] = "false"
    demo.launch(server_port=7860, share=True)
else:
    # Module was imported (e.g. by a Hugging Face Spaces runner) rather
    # than executed directly.
    logger.info("Running in Hugging Face Spaces")
    demo.launch()