Twelve2five committed
Commit ccc0748 · verified · 1 Parent(s): 01f7ec4

Update app.py

Files changed (1):
  1. app.py +70 -130
app.py CHANGED
@@ -1,33 +1,25 @@
 import os
 import time
+import requests
 import gradio as gr
 import numpy as np
 from dotenv import load_dotenv
 from elevenlabs import ElevenLabs
+from fastapi import FastAPI
 from fastrtc import (
+    AdditionalOutputs,
+    ReplyOnPause,
     Stream,
     get_stt_model,
-    ReplyOnPause,
-    AdditionalOutputs
+    get_twilio_turn_credentials,
 )
-import logging
-import requests
-import io
-import soundfile as sf
-from gtts import gTTS
-import re
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger("voice-assistant")
+from gradio.utils import get_space
+from numpy.typing import NDArray
 
 # Load environment variables
 load_dotenv()
 
-# Initialize clients
-elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
-stt_model = get_stt_model()
-
+# Initialize DeepSeek client
 class DeepSeekAPI:
     def __init__(self, api_key):
         self.api_key = api_key
@@ -48,130 +40,66 @@ class DeepSeekAPI:
 
         # Check for error response
         if response.status_code != 200:
-            logger.error(f"DeepSeek API error: {response.status_code} - {response.text}")
+            print(f"DeepSeek API error: {response.status_code} - {response.text}")
             return {"choices": [{"message": {"content": "I'm sorry, I encountered an error processing your request."}}]}
 
         return response.json()
 
+# Initialize clients
 deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
+tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
+stt_model = get_stt_model()
+
+# Get Twilio TURN credentials
+twilio_credentials = get_twilio_turn_credentials(
+    account_sid=os.getenv("TWILIO_ACCOUNT_SID"),
+    auth_token=os.getenv("TWILIO_AUTH_TOKEN")
+)
+
+# Log Twilio status
+if twilio_credentials:
+    print("Twilio TURN credentials successfully configured")
+else:
+    print("No Twilio credentials found or invalid credentials")
+
 
+# Handler function for voice conversation
 def response(
-    audio: tuple[int, np.ndarray],
-    chatbot: list[tuple] | None = None,
+    audio: tuple[int, NDArray[np.int16 | np.float32]],
+    chatbot: list[dict] | None = None,
 ):
     chatbot = chatbot or []
-
-    # Convert speech to text
+    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
+    start = time.time()
     text = stt_model.stt(audio)
-    logger.info(f"User said: {text}")
-
-    # Add user message to chat
-    chatbot.append((text, None))
+    print("transcription", time.time() - start)
+    print("prompt", text)
+    chatbot.append({"role": "user", "content": text})
     yield AdditionalOutputs(chatbot)
+    messages.append({"role": "user", "content": text})
 
-    # Get AI response
-    messages = []
-    for user_text, assistant_text in chatbot:
-        messages.append({"role": "user", "content": user_text})
-        if assistant_text:
-            messages.append({"role": "assistant", "content": assistant_text})
-
-    # Call DeepSeek API
-    response_data = deepseek_client.chat_completion(messages)
+    # Replace Groq LLM with DeepSeek
+    response_data = deepseek_client.chat_completion(
+        messages=messages,
+        max_tokens=512
+    )
     response_text = response_data["choices"][0]["message"]["content"]
-    logger.info(f"DeepSeek response: {response_text[:50]}...")
-
-    # Update chatbot with AI response
-    chatbot[-1] = (text, response_text)
+
+    chatbot.append({"role": "assistant", "content": response_text})
+
+    for chunk in tts_client.text_to_speech.convert_as_stream(
+        text=response_text,
+        voice_id="Antoni",  # Changed to Antoni, a default voice
+        model_id="eleven_multilingual_v2",
+        output_format="pcm_24000",
+    ):
+        audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
+        yield (24000, audio_array)
     yield AdditionalOutputs(chatbot)
-
-    # Convert response to speech
-    if os.getenv("ELEVENLABS_API_KEY"):
-        try:
-            logger.info("Using ElevenLabs for speech generation")
-
-            # Use the streaming API for better experience
-            for chunk in elevenlabs_client.text_to_speech.convert_as_stream(
-                text=response_text,
-                voice_id="Antoni",
-                model_id="eleven_monolingual_v1",
-                output_format="pcm_24000"
-            ):
-                audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
-                yield (24000, audio_array)
-
-        except Exception as e:
-            logger.error(f"ElevenLabs error: {e}, falling back to gTTS")
-            # Fall back to gTTS
-            yield from use_gtts_for_text(response_text)
-    else:
-        # Fall back to gTTS
-        logger.info("ElevenLabs API key not found, using gTTS...")
-        yield from use_gtts_for_text(response_text)
-
-def use_gtts_for_text(text):
-    """Helper function to generate speech with gTTS for the entire text"""
-    try:
-        # Split text into sentences for better results
-        sentences = re.split(r'(?<=[.!?])\s+', text)
-
-        for sentence in sentences:
-            if not sentence.strip():
-                continue
-
-            mp3_fp = io.BytesIO()
-            logger.info(f"Using gTTS for: {sentence[:30]}...")
-            tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
-            tts.write_to_fp(mp3_fp)
-            mp3_fp.seek(0)
-
-            data, samplerate = sf.read(mp3_fp)
-
-            if len(data.shape) > 1 and data.shape[1] > 1:
-                data = data[:, 0]
-
-            if samplerate != 24000:
-                data = np.interp(
-                    np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
-                    np.arange(len(data)),
-                    data
-                )
-
-            data = (data * 32767).astype(np.int16)
-
-            # Ensure buffer size is even
-            if len(data) % 2 != 0:
-                data = np.append(data, [0])
-
-            # Reshape and yield in chunks
-            chunk_size = 4800
-            for i in range(0, len(data), chunk_size):
-                chunk = data[i:i+chunk_size]
-                if len(chunk) > 0:
-                    if len(chunk) % 2 != 0:
-                        chunk = np.append(chunk, [0])
-                    chunk = chunk.reshape(1, -1)
-                    yield (24000, chunk)
-    except Exception as e:
-        logger.error(f"gTTS error: {e}")
-        yield None
-
-# Basic WebRTC configuration - just the minimum needed
-rtc_configuration = {
-    "iceServers": [
-        {"urls": ["stun:stun.l.google.com:19302"]},
-        {
-            "urls": ["turn:openrelay.metered.ca:80"],
-            "username": "openrelayproject",
-            "credential": "openrelayproject"
-        }
-    ]
-}
 
-# Create chatbot component for tracking conversation
-chatbot = gr.Chatbot()
 
-# Create Stream outside of any blocks context
+# Create the chatbot and Stream components
+chatbot = gr.Chatbot(type="messages")
 stream = Stream(
     modality="audio",
     mode="send-receive",
@@ -179,13 +107,25 @@ stream = Stream(
     additional_outputs_handler=lambda a, b: b,
     additional_inputs=[chatbot],
     additional_outputs=[chatbot],
-    rtc_configuration=rtc_configuration,
-    ui_args={"title": "LLM Voice Chat (DeepSeek & ElevenLabs)"}
+    rtc_configuration=twilio_credentials,  # Always use Twilio credentials
+    concurrency_limit=5 if get_space() else None,
+    time_limit=90 if get_space() else None,
+    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek, ElevenLabs, and WebRTC ⚡️)"},
 )
 
-# Export the UI directly
-demo = stream.ui
+# Mount the Stream UI to the FastAPI app
+app = FastAPI()
+app = gr.mount_gradio_app(app, stream.ui, path="/")
+
 
-# Expose the demo for Hugging Face Spaces
 if __name__ == "__main__":
-    demo.launch()
+    import os
+
+    os.environ["GRADIO_SSR_MODE"] = "false"
+
+    if (mode := os.getenv("MODE")) == "UI":
+        stream.ui.launch(server_port=7860)
+    elif mode == "PHONE":
+        stream.fastphone(host="0.0.0.0", port=7860)
+    else:
+        stream.ui.launch(server_port=7860)
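
The rewritten entrypoint is configured entirely through the environment. A minimal sketch of how this revision might be exercised locally; the placeholder values and the uvicorn command are assumptions, not taken from the diff:

    # .env — read via load_dotenv(); fill in real keys
    DEEPSEEK_API_KEY=...
    ELEVENLABS_API_KEY=...
    TWILIO_ACCOUNT_SID=...   # optional; the app prints a warning when missing or invalid
    TWILIO_AUTH_TOKEN=...

    # MODE selects the launch branch in __main__; UI is also the fallback default
    MODE=UI python app.py       # Gradio UI on port 7860
    MODE=PHONE python app.py    # fastrtc fastphone endpoint

    # Or serve the FastAPI app that mounts the Stream UI at "/"
    uvicorn app:app --host 0.0.0.0 --port 7860

On Hugging Face Spaces, get_space() is truthy, so the Stream runs with concurrency_limit=5 and time_limit=90; locally both stay None.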