Upload 4 files

- app.py            +206  -14
- debug.py          +5    -0
- webrtc_client.js  +92   -0
- webrtc_handler.py +77   -0

app.py
CHANGED
@@ -1,14 +1,206 @@
import os
import time
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from fastrtc import (
    Stream,
    get_stt_model,
    ReplyOnPause,
    AdditionalOutputs
)

import requests
import io
import soundfile as sf
from gtts import gTTS
import re
import inspect

from deepseek import DeepSeekAPI

# Load environment variables
load_dotenv()

# Initialize clients
elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
stt_model = get_stt_model()
deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))

# Temporary debug output to see which methods the client exposes:
print(dir(deepseek_client))


def response(
    audio: tuple[int, np.ndarray],
    chatbot: list[dict] | None = None,
):
    chatbot = chatbot or []
    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]

    # Convert speech to text
    text = stt_model.stt(audio)
    print("prompt:", text)

    # Add user message to chat
    chatbot.append({"role": "user", "content": text})
    yield AdditionalOutputs(chatbot)

    # Get AI response
    messages.append({"role": "user", "content": text})
    response_text = get_deepseek_response(messages)

    # Add AI response to chat
    chatbot.append({"role": "assistant", "content": response_text})

    # Convert response to speech
    for audio_data in text_to_speech(response_text):
        if audio_data:
            yield audio_data

    yield AdditionalOutputs(chatbot)


# Create Gradio interface
chatbot = gr.Chatbot(type="messages")
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(response, input_sample_rate=16000),
    additional_outputs_handler=lambda a, b: b,
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"}
)

# Create FastAPI app and mount stream
from fastapi import FastAPI
app = FastAPI()
app = gr.mount_gradio_app(app, stream.ui, path="/")
stream.mount(app)  # Mount the stream for telephone/fastphone integration


# The deepseek package's method surface is unclear, so call the
# chat-completions endpoint directly over HTTP as a fallback:
def get_deepseek_response(messages):
    url = "https://api.deepseek.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}"
    }
    payload = {
        "model": "deepseek-chat",
        "messages": messages,
        "temperature": 0.7,
        "max_tokens": 512
    }
    response = requests.post(url, json=payload, headers=headers)

    # Check for an error response
    if response.status_code != 200:
        print(f"DeepSeek API error: {response.status_code} - {response.text}")
        return "I'm sorry, I encountered an error processing your request."

    response_json = response.json()
    return response_json["choices"][0]["message"]["content"]


# Text-to-speech via Google TTS, forced to US English and processed sentence by sentence
def text_to_speech(text):
    """Convert text to speech using Google TTS with sentence-by-sentence processing."""
    try:
        # Split text into sentences for faster perceived response
        sentences = re.split(r'(?<=[.!?])\s+', text)

        for sentence in sentences:
            if not sentence.strip():
                continue

            # Process each sentence separately
            mp3_fp = io.BytesIO()

            # Force US English: recent gTTS releases only accept the base code 'en',
            # and the 'com' TLD selects the US-accented voice ('en-us' raises ValueError)
            print(f"Using gTTS (lang='en', tld='com') for sentence: {sentence[:20]}...")
            tts = gTTS(text=sentence, lang='en', tld='com', slow=False)
            tts.write_to_fp(mp3_fp)
            mp3_fp.seek(0)

            # Decode the MP3 buffer (needs a libsndfile build with MP3 support,
            # as bundled with recent soundfile wheels)
            data, samplerate = sf.read(mp3_fp)

            # Convert to mono if stereo
            if len(data.shape) > 1 and data.shape[1] > 1:
                data = data[:, 0]

            # Resample to 24000 Hz if needed
            if samplerate != 24000:
                data = np.interp(
                    np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
                    np.arange(len(data)),
                    data
                )

            # Convert to 16-bit integers
            data = (data * 32767).astype(np.int16)

            # Ensure buffer size is even
            if len(data) % 2 != 0:
                data = np.append(data, [0])

            # Reshape and yield in chunks
            chunk_size = 4800
            for i in range(0, len(data), chunk_size):
                chunk = data[i:i + chunk_size]
                if len(chunk) > 0:
                    if len(chunk) % 2 != 0:
                        chunk = np.append(chunk, [0])
                    chunk = chunk.reshape(1, -1)
                    yield (24000, chunk)
    except Exception as e:
        print(f"Exception in text_to_speech: {e}")
        yield None


# Debug statement placed after the function definition
print("text_to_speech function:", inspect.getsource(text_to_speech))


if __name__ == "__main__":
    os.environ["GRADIO_SSR_MODE"] = "false"

    # Check FastRTC version
    import fastrtc
    print(f"FastRTC version: {fastrtc.__version__ if hasattr(fastrtc, '__version__') else 'unknown'}")

    # Try running fastphone with additional diagnostics
    print("Starting phone service - attempting to inspect fastphone method...")
    print(f"FastPhone signature: {inspect.signature(stream.fastphone) if hasattr(stream, 'fastphone') else 'Not available'}")

    try:
        # Use keyword arguments rather than positionals
        phone_service = stream.fastphone(
            token=os.getenv("HF_TOKEN"),
            host="127.0.0.1",
            port=8000,
            share_server_tls_certificate=True
        )
        print("Phone service started successfully")
    except Exception as e:
        print(f"Error starting phone service: {e}")
        print("Falling back to web interface...")
        # Launch with web interface as fallback
        stream.ui.launch(server_port=7860)

# Leftover Colab-style setup from an earlier iteration, kept commented out:
# !pip install -q torch==2.0.1 torchaudio==2.0.2 gradio requests soundfile huggingface_hub
# !wget -q https://github.com/seasalt-ai/csm/archive/refs/heads/main.zip
# !unzip -q main.zip
# !mv csm-main csm
# !cd csm && pip install -e .
#
# # Set up directories
# import os
# import sys
# sys.path.append("/content/csm")
# voice_samples_dir = "/content/csm_voice_samples"
# output_dir = "/content/csm_output"
# os.makedirs(voice_samples_dir, exist_ok=True)
# os.makedirs(output_dir, exist_ok=True)
#
# print("✅ Dependencies installed!")
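
Note: the get_deepseek_response helper above goes straight to the HTTP endpoint because, per its comment, the deepseek package's client methods were unclear. For reference only (not part of this upload), DeepSeek's chat endpoint is OpenAI-compatible, so the same call can be made through the openai client; a minimal sketch, assuming the openai package is installed and DEEPSEEK_API_KEY is set:

# Hypothetical alternative to get_deepseek_response(); not part of the uploaded files.
import os
from openai import OpenAI

client = OpenAI(
    api_key=os.getenv("DEEPSEEK_API_KEY"),
    base_url="https://api.deepseek.com",  # DeepSeek's OpenAI-compatible endpoint
)

def get_deepseek_response_openai(messages):
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        temperature=0.7,
        max_tokens=512,
    )
    return completion.choices[0].message.content

Either route yields the same choices[0].message.content payload that app.py already parses.
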
debug.py
ADDED
@@ -0,0 +1,5 @@
import os
import requests
headers = {"xi-api-key": os.getenv("ELEVENLABS_API_KEY")}
voices_response = requests.get("https://api.elevenlabs.io/v1/voices", headers=headers)
print("Available voices:", voices_response.json())
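
Note: debug.py only lists the voices available to the key (and assumes ELEVENLABS_API_KEY is already exported, since it does not call load_dotenv), while app.py initializes an ElevenLabs client but ends up synthesizing speech with gTTS. For context only (not part of this upload), a voice_id taken from that listing can be exercised against ElevenLabs' text-to-speech endpoint with the same requests/xi-api-key pattern; a minimal sketch in which VOICE_ID and the model_id are placeholders:

# Hypothetical follow-up to debug.py; VOICE_ID is a placeholder copied from the /v1/voices output.
import os
import requests

VOICE_ID = "your-voice-id-here"
url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
headers = {
    "xi-api-key": os.getenv("ELEVENLABS_API_KEY"),
    "Content-Type": "application/json",
}
payload = {
    "text": "Hello from the voice chat demo.",
    "model_id": "eleven_monolingual_v1",  # assumed model id; any TTS-capable model works
}

resp = requests.post(url, headers=headers, json=payload)
resp.raise_for_status()
with open("sample.mp3", "wb") as f:
    f.write(resp.content)  # the response body is MP3 audio
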
webrtc_client.js
ADDED
@@ -0,0 +1,92 @@
let pc = null;
let localStream = null;
let audioSender = null;

async function setupWebRTC() {
    // Create WebRTC peer connection
    pc = new RTCPeerConnection({
        iceServers: [
            { urls: 'stun:stun.l.google.com:19302' }
        ]
    });

    // Get local media stream
    localStream = await navigator.mediaDevices.getUserMedia({
        audio: true,
        video: false
    });

    // Add tracks to peer connection
    localStream.getTracks().forEach(track => {
        audioSender = pc.addTrack(track, localStream);
    });

    // Create offer
    const offer = await pc.createOffer();
    await pc.setLocalDescription(offer);

    // Send offer to server (server-side route must be implemented)
    sendOfferToServer(pc.localDescription);

    // Set up event listeners for ICE candidates
    pc.onicecandidate = event => {
        if (event.candidate) {
            sendIceCandidateToServer(event.candidate);
        }
    };

    // Handle incoming tracks (audio responses)
    pc.ontrack = event => {
        const audioElement = document.getElementById('ai-response-audio');
        if (audioElement) {
            audioElement.srcObject = new MediaStream([event.track]);
        }
    };
}

function sendOfferToServer(offer) {
    // Send the offer to the backend; the exact route depends on the server setup
    fetch('/webrtc/offer', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify(offer)
    })
    .then(response => response.json())
    .then(answer => {
        pc.setRemoteDescription(new RTCSessionDescription(answer));
    })
    .catch(error => console.error('Error sending offer:', error));
}

function sendIceCandidateToServer(candidate) {
    // Send ICE candidate to the server
    fetch('/webrtc/ice-candidate', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify(candidate)
    })
    .catch(error => console.error('Error sending ICE candidate:', error));
}

function startRecording() {
    // Unmute the audio track
    if (localStream) {
        localStream.getAudioTracks().forEach(track => {
            track.enabled = true;
        });
    }
}

function stopRecording() {
    // Mute the audio track
    if (localStream) {
        localStream.getAudioTracks().forEach(track => {
            track.enabled = false;
        });
    }
}
webrtc_handler.py
ADDED
@@ -0,0 +1,77 @@
import asyncio
import json
import logging
import os
import ssl
import uuid
from typing import Dict, Optional, Callable

import aiohttp
from aiortc import RTCPeerConnection, RTCSessionDescription, MediaStreamTrack, RTCIceCandidate
from aiortc.contrib.media import MediaBlackhole, MediaRelay
from aiortc.sdp import candidate_from_sdp

logger = logging.getLogger("webrtc_handler")
pcs = set()
relay = MediaRelay()


class AudioTransformTrack(MediaStreamTrack):
    """
    A track that processes audio and sends it to a callback function.
    """
    kind = "audio"

    def __init__(self, track, callback):
        super().__init__()
        self.track = track
        self.callback = callback

    async def recv(self):
        frame = await self.track.recv()
        # Hand each audio frame to the callback before passing it on
        if self.callback:
            self.callback(frame)
        return frame


async def handle_offer(offer, audio_callback=None):
    offer_data = RTCSessionDescription(sdp=offer["sdp"], type=offer["type"])

    pc = RTCPeerConnection()
    pcs.add(pc)

    @pc.on("connectionstatechange")
    async def on_connectionstatechange():
        logger.info(f"Connection state is {pc.connectionState}")
        if pc.connectionState == "failed":
            await pc.close()
            pcs.discard(pc)

    @pc.on("track")
    def on_track(track):
        logger.info(f"Track {track.kind} received")
        if track.kind == "audio":
            pc.addTrack(AudioTransformTrack(relay.subscribe(track), audio_callback))

        @track.on("ended")
        async def on_ended():
            logger.info(f"Track {track.kind} ended")

    # Handle the incoming offer
    await pc.setRemoteDescription(offer_data)

    # Create an answer
    answer = await pc.createAnswer()
    await pc.setLocalDescription(answer)

    return {
        "sdp": pc.localDescription.sdp,
        "type": pc.localDescription.type
    }


async def add_ice_candidate(candidate, pc):
    if candidate and pc and candidate.get("candidate"):
        # aiortc's RTCIceCandidate constructor does not accept the browser's SDP
        # candidate string, so parse it with candidate_from_sdp instead.
        candidate_data = candidate_from_sdp(candidate["candidate"].split(":", 1)[1])
        candidate_data.sdpMid = candidate.get("sdpMid")
        candidate_data.sdpMLineIndex = candidate.get("sdpMLineIndex")
        await pc.addIceCandidate(candidate_data)
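
Note: webrtc_client.js POSTs the offer to /webrtc/offer and ICE candidates to /webrtc/ice-candidate, but this upload does not include server routes for those paths. Below is a minimal sketch of how handle_offer and add_ice_candidate could be exposed, e.g. appended to app.py after the FastAPI `app` is created; the route paths match the client, while the single shared connection and function placement are illustrative assumptions, not part of the commit:

# Hypothetical glue between app.py and webrtc_handler.py; not part of the uploaded files.
from fastapi import Request
from webrtc_handler import handle_offer, add_ice_candidate, pcs


@app.post("/webrtc/offer")
async def webrtc_offer(request: Request):
    offer = await request.json()        # {"sdp": ..., "type": "offer"} from webrtc_client.js
    answer = await handle_offer(offer)  # creates an RTCPeerConnection and builds the answer
    return answer                       # {"sdp": ..., "type": "answer"}


@app.post("/webrtc/ice-candidate")
async def webrtc_ice_candidate(request: Request):
    candidate = await request.json()
    # Simplification: apply the candidate to every open connection tracked in `pcs`.
    for pc in list(pcs):
        await add_ice_candidate(candidate, pc)
    return {"ok": True}

Because handle_offer only returns the SDP answer and not the peer connection itself, this sketch applies incoming candidates to every connection in pcs; a production setup would key connections by a session id.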