Twelve2five committed
Commit c0c2699 · verified · Parent(s): f558bc0

Update app.py

Files changed (1):
  app.py +84 -145
app.py CHANGED
@@ -1,39 +1,25 @@
 import os
-import time
-import gradio as gr
 import numpy as np
+import gradio as gr
 from dotenv import load_dotenv
 from elevenlabs import ElevenLabs
-from fastrtc import (
-    Stream,
-    get_stt_model,
-    ReplyOnPause,
-    AdditionalOutputs
-)
-from gradio.utils import get_space
-
+from fastrtc import ReplyOnPause
+import logging
 import requests
-import io
 import soundfile as sf
 from gtts import gTTS
+import io
 import re
-import logging
 
-# Set up logging for WebRTC debugging
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger("fastrtc-voice-assistant")
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("voice-assistant")
 
 # Load environment variables
 load_dotenv()
 
-# Enable WebRTC debug tracing
-os.environ["WEBRTC_TRACE"] = "WEBRTC_TRACE_ALL"
-
-# Initialize clients
-logger.info("Initializing clients...")
+# Initialize ElevenLabs client
 elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
-stt_model = get_stt_model()
-logger.info("Clients initialized")
 
 class DeepSeekAPI:
     def __init__(self, api_key):
@@ -60,62 +46,9 @@ class DeepSeekAPI:
 
         return response.json()
 
+# Initialize DeepSeek client
 deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
 
-def response(
-    audio: tuple[int, np.ndarray],
-    chatbot: list[dict] | None = None,
-):
-    chatbot = chatbot or []
-    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
-
-    # Convert speech to text
-    logger.info("Converting speech to text...")
-    text = stt_model.stt(audio)
-    logger.info(f"User said: {text}")
-
-    # Add user message to chat
-    chatbot.append({"role": "user", "content": text})
-    yield AdditionalOutputs(chatbot)
-
-    # Get AI response
-    messages.append({"role": "user", "content": text})
-
-    # Call DeepSeek API
-    logger.info("Calling DeepSeek API...")
-    response_data = deepseek_client.chat_completion(messages)
-    response_text = response_data["choices"][0]["message"]["content"]
-    logger.info(f"DeepSeek response: {response_text[:50]}...")
-
-    # Add AI response to chat
-    chatbot.append({"role": "assistant", "content": response_text})
-
-    # Convert response to speech
-    if os.getenv("ELEVENLABS_API_KEY"):
-        try:
-            logger.info("Using ElevenLabs for speech generation")
-
-            # Use the streaming API for better experience
-            for chunk in elevenlabs_client.text_to_speech.convert_as_stream(
-                text=response_text,
-                voice_id="Antoni",
-                model_id="eleven_monolingual_v1",
-                output_format="pcm_24000"
-            ):
-                audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
-                yield (24000, audio_array)
-
-        except Exception as e:
-            logger.error(f"ElevenLabs error: {e}, falling back to gTTS")
-            # Fall back to gTTS
-            yield from use_gtts_for_text(response_text)
-    else:
-        # Fall back to gTTS
-        logger.info("ElevenLabs API key not found, using gTTS...")
-        yield from use_gtts_for_text(response_text)
-
-    yield AdditionalOutputs(chatbot)
-
 def use_gtts_for_text(text):
     """Helper function to generate speech with gTTS for the entire text"""
     try:
@@ -163,100 +96,106 @@ def use_gtts_for_text(text):
         logger.error(f"gTTS error: {e}")
         yield None
 
-# Comprehensive WebRTC configuration with multiple STUN/TURN options
+# Comprehensive WebRTC configuration
 rtc_configuration = {
     "iceServers": [
-        # Google STUN servers
        {"urls": ["stun:stun.l.google.com:19302"]},
        {"urls": ["stun:stun1.l.google.com:19302"]},
-        {"urls": ["stun:stun2.l.google.com:19302"]},
-        {"urls": ["stun:stun3.l.google.com:19302"]},
-        {"urls": ["stun:stun4.l.google.com:19302"]},
-
-        # OpenRelay TURN servers
        {
            "urls": ["turn:openrelay.metered.ca:80"],
            "username": "openrelayproject",
            "credential": "openrelayproject"
        },
-        {
-            "urls": ["turn:openrelay.metered.ca:443"],
-            "username": "openrelayproject",
-            "credential": "openrelayproject"
-        },
        {
            "urls": ["turn:openrelay.metered.ca:443?transport=tcp"],
            "username": "openrelayproject",
            "credential": "openrelayproject"
-        },
-
-        # Additional public STUN servers
-        {"urls": ["stun:stun.stunprotocol.org:3478"]},
-        {"urls": ["stun:stun.voip.blackberry.com:3478"]},
-        {"urls": ["stun:stun.nextcloud.com:443"]}
+        }
    ],
-    "iceCandidatePoolSize": 10,
-    "bundlePolicy": "max-bundle",
-    "rtcpMuxPolicy": "require",
-    "iceTransportPolicy": "all"  # Try "relay" if "all" doesn't work
 }
 
-# Create a simple wrapper for the webchat UI
-with gr.Blocks(title="LLM Voice Chat (Powered by DeepSeek & ElevenLabs)") as demo:
-    gr.Markdown("# LLM Voice Chat\nPowered by DeepSeek & ElevenLabs")
-
-    with gr.Row():
-        with gr.Column(scale=3):
-            # Create the chatbot component
-            chatbot = gr.Chatbot(type="messages")
-
-            # For debugging, allow seeing connection status
-            connection_status = gr.Textbox(label="Connection Status",
-                                           value="Ready to connect. Click the microphone button to start.",
-                                           interactive=False)
-
-            # Display debugging information
-            debug_info = gr.Textbox(label="Debug Info",
-                                    value="WebRTC debug information will appear here.",
-                                    interactive=False)
-
-            # Button to manually refresh the page
-            refresh_btn = gr.Button("Refresh Connection")
-
-            def refresh_page():
-                debug_info.value = f"Attempting to refresh connection at {time.time()}"
-                return "Refreshed", f"Connection refresh attempted at {time.time()}"
-
-            refresh_btn.click(
-                refresh_page,
-                outputs=[connection_status, debug_info]
-            )
-
-logger.info("Creating Stream component...")
-# Initialize the stream (outside of the blocks context)
-stream = Stream(
-    modality="audio",
-    mode="send-receive",
-    handler=ReplyOnPause(response, input_sample_rate=16000),
-    additional_outputs_handler=lambda a, b: b,
-    additional_inputs=[chatbot],
-    additional_outputs=[chatbot],
-    rtc_configuration=rtc_configuration,
-    concurrency_limit=5 if get_space() else None,
-    time_limit=90 if get_space() else None
-)
-
-# Mount the stream to the blocks interface
-stream.render()
-logger.info("Stream component created and rendered")
+# Define the chat history function to handle messages
+def process_message(audio, history):
+    from fastrtc import get_stt_model
+
+    # Get the STT model instance
+    stt_model = get_stt_model()
+
+    # Convert speech to text
+    user_message = stt_model.stt(audio)
+    logger.info(f"User said: {user_message}")
+
+    # Add user message to history
+    history = history + [(user_message, None)]
+
+    # Prepare messages for DeepSeek
+    messages = []
+    for user, bot in history:
+        messages.append({"role": "user", "content": user})
+        if bot:
+            messages.append({"role": "assistant", "content": bot})
+
+    # Get AI response
+    response_data = deepseek_client.chat_completion(messages)
+    bot_message = response_data["choices"][0]["message"]["content"]
+    logger.info(f"DeepSeek response: {bot_message[:50]}...")
+
+    # Update history
+    history[-1] = (user_message, bot_message)
+
+    # Generate audio response
+    if os.getenv("ELEVENLABS_API_KEY"):
+        try:
+            logger.info("Using ElevenLabs for speech generation")
+            audio_bytes = elevenlabs_client.text_to_speech.convert(
+                text=bot_message,
+                voice_id="Antoni",
+                model_id="eleven_monolingual_v1"
+            )
+
+            # Save to temporary file and read back
+            with open("temp_response.mp3", "wb") as f:
+                f.write(audio_bytes)
+
+            data, sr = sf.read("temp_response.mp3")
+            os.remove("temp_response.mp3")
+
+            # Convert to the right format if needed
+            if len(data.shape) > 1:
+                data = data[:, 0]  # Take first channel if stereo
+
+            audio_out = (sr, data)
+
+        except Exception as e:
+            logger.error(f"ElevenLabs error: {e}, falling back to gTTS")
+            # TODO: Implement gTTS fallback for this function
+            audio_out = None
+    else:
+        logger.info("No ElevenLabs API key, audio response not available")
+        audio_out = None
+
+    return history, audio_out
+
+# Create the Gradio interface - much simpler than before
+demo = gr.Interface(
+    fn=process_message,
+    inputs=[
+        gr.Audio(sources=["microphone"], type="numpy"),
+        gr.State([])
+    ],
+    outputs=[
+        gr.Chatbot(),
+        gr.Audio(label="AI Voice Response")
+    ],
+    title="LLM Voice Chat (Powered by DeepSeek & ElevenLabs)",
+    description="Speak into the microphone and get AI responses in text and speech.",
+    examples=[],
+    cache_examples=False
+)
 
 # Launch the app
 if __name__ == "__main__":
-    # Local development
-    logger.info("Running in development mode")
-    os.environ["GRADIO_SSR_MODE"] = "false"
-    demo.launch(server_port=7860, share=True)
+    demo.launch(share=True)
 else:
-    # Hugging Face Spaces
-    logger.info("Running in Hugging Face Spaces")
+    # For Hugging Face Spaces
     demo.launch()
 
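The body of `DeepSeekAPI.chat_completion` is collapsed between the first two hunks, but the call sites (`response_data["choices"][0]["message"]["content"]`) imply an OpenAI-style chat-completions request. A minimal sketch of what such a wrapper typically looks like — the endpoint URL, model name, and timeout below are assumptions, not the committed code:

```python
import requests

class DeepSeekAPI:
    def __init__(self, api_key):
        self.api_key = api_key
        # Assumed endpoint: DeepSeek serves an OpenAI-compatible API.
        self.base_url = "https://api.deepseek.com/chat/completions"

    def chat_completion(self, messages, model="deepseek-chat"):
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        payload = {"model": model, "messages": messages}
        response = requests.post(self.base_url, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        # Shape expected by the callers: data["choices"][0]["message"]["content"]
        return response.json()
```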
 
 
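The `except` branch leaves a TODO where the gTTS fallback belongs: the old `use_gtts_for_text` was a generator that yielded streaming chunks, which no longer fits `process_message`'s single `(sample_rate, array)` return value. One possible non-streaming shape — hypothetical, not part of this commit:

```python
import io
import soundfile as sf
from gtts import gTTS

def gtts_to_numpy(text):
    # Render the whole reply to MP3 in memory, then decode to a numpy array.
    buf = io.BytesIO()
    gTTS(text=text, lang="en").write_to_fp(buf)
    buf.seek(0)
    # Again assumes an MP3-capable libsndfile build.
    data, sr = sf.read(buf)
    if data.ndim > 1:
        data = data[:, 0]
    return sr, data
```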
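With `type="numpy"`, `gr.Audio` hands the handler a `(sample_rate, np.ndarray)` tuple and accepts the same shape back as output. `soundfile` decodes to float64, which Gradio generally treats as samples in [-1.0, 1.0]; if playback sounds clipped or silent, converting explicitly to int16 is a safe normalization — a sketch:

```python
import numpy as np

def to_int16(sr, data):
    # Clamp to [-1, 1], then scale to the int16 PCM range Gradio expects.
    data = np.clip(data, -1.0, 1.0)
    return sr, (data * 32767).astype(np.int16)
```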
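For quick wiring checks without a browser, the handler can be exercised directly, since it is a plain function over `(sample_rate, array)` tuples. A sketch, assuming the module imports as `app`, both API keys are set, and the STT model tolerates silence:

```python
import numpy as np
from app import process_message  # hypothetical import path for this Space's app.py

# One second of silence at 16 kHz stands in for microphone input.
silence = (16000, np.zeros(16000, dtype=np.int16))
history, audio_out = process_message(silence, [])
print(history)    # [(transcript, reply)] after one turn
print(audio_out)  # (sample_rate, samples), or None if TTS was unavailable
```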