Twelve2five committed (verified)
Commit 797af4f · 1 Parent(s): c0c2699

Update app.py

Files changed (1):
  1. app.py +115 -80
app.py CHANGED
@@ -1,14 +1,20 @@
 import os
-import numpy as np
+import time
 import gradio as gr
+import numpy as np
 from dotenv import load_dotenv
 from elevenlabs import ElevenLabs
-from fastrtc import ReplyOnPause
+from fastrtc import (
+    Stream,
+    get_stt_model,
+    ReplyOnPause,
+    AdditionalOutputs
+)
 import logging
 import requests
+import io
 import soundfile as sf
 from gtts import gTTS
-import io
 import re
 
 # Configure logging
@@ -18,8 +24,9 @@ logger = logging.getLogger("voice-assistant")
 # Load environment variables
 load_dotenv()
 
-# Initialize ElevenLabs client
+# Initialize clients
 elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
+stt_model = get_stt_model()
 
 class DeepSeekAPI:
     def __init__(self, api_key):
@@ -46,9 +53,64 @@ class DeepSeekAPI:
 
         return response.json()
 
-# Initialize DeepSeek client
 deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
 
+# Define handler function for FastRTC Stream
+def response(
+    audio: tuple[int, np.ndarray],
+    chatbot=None,
+):
+    # Initialize chatbot if None
+    chatbot = chatbot or []
+    messages = [{"role": msg[0], "content": msg[1]} for msg in chatbot] if chatbot else []
+
+    # Convert speech to text
+    text = stt_model.stt(audio)
+    logger.info(f"User said: {text}")
+
+    # Add user message to chat
+    chatbot.append(("user", text))
+    yield AdditionalOutputs(chatbot)
+
+    # Get AI response
+    formatted_messages = []
+    for role, content in chatbot:
+        formatted_messages.append({"role": "user" if role == "user" else "assistant", "content": content})
+
+    # Call DeepSeek API
+    response_data = deepseek_client.chat_completion(formatted_messages)
+    response_text = response_data["choices"][0]["message"]["content"]
+    logger.info(f"DeepSeek response: {response_text[:50]}...")
+
+    # Add AI response to chat
+    chatbot.append(("assistant", response_text))
+
+    # Convert response to speech
+    if os.getenv("ELEVENLABS_API_KEY"):
+        try:
+            logger.info("Using ElevenLabs for speech generation")
+
+            # Use the streaming API for better experience
+            for chunk in elevenlabs_client.text_to_speech.convert_as_stream(
+                text=response_text,
+                voice_id="Antoni",
+                model_id="eleven_monolingual_v1",
+                output_format="pcm_24000"
+            ):
+                audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
+                yield (24000, audio_array)
+
+        except Exception as e:
+            logger.error(f"ElevenLabs error: {e}, falling back to gTTS")
+            # Fall back to gTTS
+            yield from use_gtts_for_text(response_text)
+    else:
+        # Fall back to gTTS
+        logger.info("ElevenLabs API key not found, using gTTS...")
+        yield from use_gtts_for_text(response_text)
+
+    yield AdditionalOutputs(chatbot)
+
 def use_gtts_for_text(text):
     """Helper function to generate speech with gTTS for the entire text"""
     try:
@@ -96,7 +158,7 @@ def use_gtts_for_text(text):
         logger.error(f"gTTS error: {e}")
         yield None
 
-# Comprehensive WebRTC configuration
+# Enhanced WebRTC configuration
 rtc_configuration = {
     "iceServers": [
         {"urls": ["stun:stun.l.google.com:19302"]},
@@ -112,90 +174,63 @@ rtc_configuration = {
             "credential": "openrelayproject"
         }
     ],
+    "iceCandidatePoolSize": 10
 }
 
-# Define the chat history function to handle messages
-def process_message(audio, history):
-    from fastrtc import get_stt_model
-
-    # Get the STT model instance
-    stt_model = get_stt_model()
-
-    # Convert speech to text
-    user_message = stt_model.stt(audio)
-    logger.info(f"User said: {user_message}")
-
-    # Add user message to history
-    history = history + [(user_message, None)]
-
-    # Prepare messages for DeepSeek
-    messages = []
-    for user, bot in history:
-        messages.append({"role": "user", "content": user})
-        if bot:
-            messages.append({"role": "assistant", "content": bot})
-
-    # Get AI response
-    response_data = deepseek_client.chat_completion(messages)
-    bot_message = response_data["choices"][0]["message"]["content"]
-    logger.info(f"DeepSeek response: {bot_message[:50]}...")
-
-    # Update history
-    history[-1] = (user_message, bot_message)
-
-    # Generate audio response
-    if os.getenv("ELEVENLABS_API_KEY"):
-        try:
-            logger.info("Using ElevenLabs for speech generation")
-            audio_bytes = elevenlabs_client.text_to_speech.convert(
-                text=bot_message,
-                voice_id="Antoni",
-                model_id="eleven_monolingual_v1"
-            )
-
-            # Save to temporary file and read back
-            with open("temp_response.mp3", "wb") as f:
-                f.write(audio_bytes)
-
-            data, sr = sf.read("temp_response.mp3")
-            os.remove("temp_response.mp3")
-
-            # Convert to the right format if needed
-            if len(data.shape) > 1:
-                data = data[:, 0]  # Take first channel if stereo
-
-            audio_out = (sr, data)
-
-        except Exception as e:
-            logger.error(f"ElevenLabs error: {e}, falling back to gTTS")
-            # TODO: Implement gTTS fallback for this function
-            audio_out = None
-    else:
-        logger.info("No ElevenLabs API key, audio response not available")
-        audio_out = None
-
-    return history, audio_out
-
-# Create the Gradio interface - much simpler than before
-demo = gr.Interface(
-    fn=process_message,
-    inputs=[
-        gr.Audio(sources=["microphone"], type="numpy"),
-        gr.State([])
-    ],
-    outputs=[
-        gr.Chatbot(),
-        gr.Audio(label="AI Voice Response")
-    ],
-    title="LLM Voice Chat (Powered by DeepSeek & ElevenLabs)",
-    description="Speak into the microphone and get AI responses in text and speech.",
-    examples=[],
-    cache_examples=False
-)
-
-# Launch the app
+# Build the interface - we need separate Blocks for chatbot and Stream
+with gr.Blocks(title="LLM Voice Assistant") as demo:
+    gr.Markdown("# LLM Voice Chat (Powered by DeepSeek & ElevenLabs)")
+    gr.Markdown("Click the microphone button to start speaking")
+
+    # Create the main chatbot display
+    chatbot = gr.Chatbot(label="Conversation")
+
+    # Create the Stream component outside of the Blocks context to avoid conflicts
+    # We'll insert it into the interface later
+    stream_container = gr.HTML("<div id='stream-placeholder'>Loading WebRTC component...</div>")
+
+# Create the FastRTC Stream separately
+stream = Stream(
+    modality="audio",
+    mode="send-receive",
+    handler=ReplyOnPause(response, input_sample_rate=16000),
+    additional_outputs_handler=lambda a, b: b,
+    additional_inputs=[chatbot],
+    additional_outputs=[chatbot],
+    rtc_configuration=rtc_configuration
+)
+
+# Custom mount function
+def mount_components():
+    import gradio as gr
+    import os
+
+    # Get the main interface
+    main_interface = demo
+
+    # Add the Stream interface to a custom Blocks
+    with gr.Blocks(analytics_enabled=False) as stream_interface:
+        stream.render()
+
+    # Create a custom app that hosts both interfaces on different routes
+    app = gr.routes.App()
+    app.add_route("/", main_interface)
+    app.add_route("/stream", stream_interface)
+
+    # Launch the combined app
+    app.launch()
+
+# Launch with the mount function
 if __name__ == "__main__":
+    # Local development
     demo.launch(share=True)
+
+    # Launch the Stream component separately for local development
+    stream.ui.launch(server_port=7861, share=True)
 else:
     # For Hugging Face Spaces
-    demo.launch()
+    # Initialize FastRTC in Spaces
+    app = gr.mount_gradio_app(stream.app, demo, path="/")
+
+    # Launch both components
+    gr.launch_app(app)
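
For reference, the new `response()` generator follows the handler contract that fastrtc's `ReplyOnPause` drives: it receives `(sample_rate, np.ndarray)` audio, yields `(sample_rate, np.ndarray)` chunks back to the browser, and yields `AdditionalOutputs(...)` to update extra Gradio components such as the chatbot. The sketch below is a minimal, self-contained illustration of that contract only; it assumes the `Stream`, `ReplyOnPause`, `AdditionalOutputs`, and `stream.ui` APIs exactly as they are used in the diff above, and the `echo` handler and its history entries are purely illustrative.

```python
# Minimal sketch of the fastrtc handler contract (assumptions: the Stream,
# ReplyOnPause, AdditionalOutputs and stream.ui APIs behave as used in the
# diff above; the echo handler is illustrative only).
import gradio as gr
import numpy as np
from fastrtc import Stream, ReplyOnPause, AdditionalOutputs

def echo(audio: tuple[int, np.ndarray], history=None):
    """Echo the caller's audio back and log its duration in the chat history."""
    history = history or []
    sample_rate, samples = audio
    history.append(("user", f"received {samples.shape[-1] / sample_rate:.1f}s of audio"))
    # Audio replies are yielded as (sample_rate, np.ndarray) tuples.
    yield (sample_rate, samples)
    # AdditionalOutputs routes values to the declared additional_outputs components.
    yield AdditionalOutputs(history)

chatbot = gr.Chatbot(label="Conversation")
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(echo),
    additional_outputs_handler=lambda old, new: new,  # keep the latest history
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
)

if __name__ == "__main__":
    stream.ui.launch()  # fastrtc's built-in Gradio UI, as used in the diff
```

The commit fills in the same shape with `stt_model.stt()` for transcription, DeepSeek for the text reply, and ElevenLabs or gTTS chunks for the spoken audio, interleaved with `AdditionalOutputs(chatbot)` updates.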