Twelve2five committed
Commit dff5fe4 · verified · 1 Parent(s): 9e6481e

Upload 4 files

Files changed (4)
  1. app.py +206 -14
  2. debug.py +5 -0
  3. webrtc_client.js +92 -0
  4. webrtc_handler.py +77 -0
app.py CHANGED
@@ -1,14 +1,206 @@
- import streamlit as st
- from fastrtc import FastRTC
- import os
-
- st.title("FastRTC Voice Assistant")
- st.write("Talk to DeepSeek LLM with ElevenLabs voice!")
-
- # Initialize your FastRTC components
- # (You'll need to adapt your existing code for the web interface)
-
- fastrtc = FastRTC()
-
- # Create a simple UI
- st.button("Start Voice Chat")
+ import os
+ import time
+ import gradio as gr
+ import numpy as np
+ from dotenv import load_dotenv
+ from elevenlabs import ElevenLabs
+ from fastrtc import (
+     Stream,
+     get_stt_model,
+     ReplyOnPause,
+     AdditionalOutputs
+ )
+
+ import requests
+ import io
+ import soundfile as sf
+ from gtts import gTTS
+ import re
+ import inspect
+
+ from deepseek import DeepSeekAPI
+
+ # Load environment variables
+ load_dotenv()
+
+ # Initialize clients
+ elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
+ stt_model = get_stt_model()
+ deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
+
+ # Temporary debug: list the methods available on the DeepSeek client
+ print(dir(deepseek_client))
+
+ def response(
+     audio: tuple[int, np.ndarray],
+     chatbot: list[dict] | None = None,
+ ):
+     chatbot = chatbot or []
+     messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
+
+     # Convert speech to text
+     text = stt_model.stt(audio)
+     print("prompt:", text)
+
+     # Add user message to chat
+     chatbot.append({"role": "user", "content": text})
+     yield AdditionalOutputs(chatbot)
+
+     # Get AI response
+     messages.append({"role": "user", "content": text})
+     response_text = get_deepseek_response(messages)
+
+     # Add AI response to chat
+     chatbot.append({"role": "assistant", "content": response_text})
+
+     # Convert response to speech
+     for audio_data in text_to_speech(response_text):
+         if audio_data:
+             yield audio_data
+
+     yield AdditionalOutputs(chatbot)
+
+ # Create Gradio interface
+ chatbot = gr.Chatbot(type="messages")
+ stream = Stream(
+     modality="audio",
+     mode="send-receive",
+     handler=ReplyOnPause(response, input_sample_rate=16000),
+     additional_outputs_handler=lambda a, b: b,
+     additional_inputs=[chatbot],
+     additional_outputs=[chatbot],
+     ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"}
+ )
+
+ # Create FastAPI app and mount stream
+ from fastapi import FastAPI
+ app = FastAPI()
+ app = gr.mount_gradio_app(app, stream.ui, path="/")
+ stream.mount(app)  # Mount the stream for telephone/fastphone integration
+
+ # Chat completion via a direct HTTP request to the DeepSeek API,
+ # used as a fallback because the DeepSeekAPI client's method names are unclear.
+ def get_deepseek_response(messages):
+     url = "https://api.deepseek.com/v1/chat/completions"
+     headers = {
+         "Content-Type": "application/json",
+         "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}"
+     }
+     payload = {
+         "model": "deepseek-chat",
+         "messages": messages,
+         "temperature": 0.7,
+         "max_tokens": 512
+     }
+     response = requests.post(url, json=payload, headers=headers)
+
+     # Check for error response
+     if response.status_code != 200:
+         print(f"DeepSeek API error: {response.status_code} - {response.text}")
+         return "I'm sorry, I encountered an error processing your request."
+
+     response_json = response.json()
+     return response_json["choices"][0]["message"]["content"]
+
+ # Text-to-speech via Google TTS (gTTS), explicitly forced to US English
+ def text_to_speech(text):
+     """Convert text to speech using Google TTS with sentence-by-sentence processing"""
+     try:
+         # Split text into sentences for faster perceived response
+         sentences = re.split(r'(?<=[.!?])\s+', text)
+
+         for sentence in sentences:
+             if not sentence.strip():
+                 continue
+
+             # Process each sentence separately
+             mp3_fp = io.BytesIO()
+
+             # Force US English - be explicit about it
+             print(f"Using gTTS with en-us locale for sentence: {sentence[:20]}...")
+             tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
+             tts.write_to_fp(mp3_fp)
+             mp3_fp.seek(0)
+
+             # Process audio data
+             data, samplerate = sf.read(mp3_fp)
+
+             # Convert to mono if stereo
+             if len(data.shape) > 1 and data.shape[1] > 1:
+                 data = data[:, 0]
+
+             # Resample to 24000 Hz if needed
+             if samplerate != 24000:
+                 data = np.interp(
+                     np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
+                     np.arange(len(data)),
+                     data
+                 )
+
+             # Convert to 16-bit integers
+             data = (data * 32767).astype(np.int16)
+
+             # Ensure buffer size is even
+             if len(data) % 2 != 0:
+                 data = np.append(data, [0])
+
+             # Reshape and yield in chunks
+             chunk_size = 4800
+             for i in range(0, len(data), chunk_size):
+                 chunk = data[i:i+chunk_size]
+                 if len(chunk) > 0:
+                     if len(chunk) % 2 != 0:
+                         chunk = np.append(chunk, [0])
+                     chunk = chunk.reshape(1, -1)
+                     yield (24000, chunk)
+     except Exception as e:
+         print(f"Exception in text_to_speech: {e}")
+         yield None
+
+ # Debug: print the source of text_to_speech now that it is defined
+ print("text_to_speech function:", inspect.getsource(text_to_speech))
+
+ if __name__ == "__main__":
+     os.environ["GRADIO_SSR_MODE"] = "false"
+
+     # Check FastRTC version
+     import fastrtc
+     print(f"FastRTC version: {fastrtc.__version__ if hasattr(fastrtc, '__version__') else 'unknown'}")
+
+     # Run fastphone with additional diagnostics
+     print("Starting phone service - attempting to inspect fastphone method...")
+     import inspect
+     print(f"FastPhone signature: {inspect.signature(stream.fastphone) if hasattr(stream, 'fastphone') else 'Not available'}")
+
+     try:
+         # Pass share_server_tls_certificate as a keyword argument
+         phone_service = stream.fastphone(
+             token=os.getenv("HF_TOKEN"),
+             host="127.0.0.1",
+             port=8000,
+             share_server_tls_certificate=True
+         )
+         print("Phone service started successfully")
+     except Exception as e:
+         print(f"Error starting phone service: {e}")
+         print("Falling back to web interface...")
+         # Launch with web interface as fallback
+         stream.ui.launch(server_port=7860)
+
+ # Leftover Colab setup commands, kept commented out:
+ # !pip install -q torch==2.0.1 torchaudio==2.0.2 gradio requests soundfile huggingface_hub
+ # !wget -q https://github.com/seasalt-ai/csm/archive/refs/heads/main.zip
+ # !unzip -q main.zip
+ # !mv csm-main csm
+ # !cd csm && pip install -e .
+ #
+ # # Set up directories
+ # import os
+ # import sys
+ # sys.path.append("/content/csm")
+ # voice_samples_dir = "/content/csm_voice_samples"
+ # output_dir = "/content/csm_output"
+ # os.makedirs(voice_samples_dir, exist_ok=True)
+ # os.makedirs(output_dir, exist_ok=True)
+ #
+ # print("✅ Dependencies installed!")
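
Note: the new app.py builds a FastAPI application (app) and mounts both the Gradio UI and the stream on it, yet the __main__ block only starts the fastphone service and falls back to stream.ui.launch. If the mounted FastAPI app itself is to be served, a minimal sketch would be a hypothetical serve.py like the one below (uvicorn is assumed to be installed; neither the file nor the port choice is part of this commit):

    # Hypothetical serve.py sketch, not part of this commit: serve the mounted FastAPI app.
    import uvicorn
    from app import app  # the FastAPI app with the Gradio UI and stream mounted

    uvicorn.run(app, host="0.0.0.0", port=7860)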
debug.py ADDED
@@ -0,0 +1,5 @@
+ import os
+ import requests
+ headers = {"xi-api-key": os.getenv("ELEVENLABS_API_KEY")}
+ voices_response = requests.get("https://api.elevenlabs.io/v1/voices", headers=headers)
+ print("Available voices:", voices_response.json())
webrtc_client.js ADDED
@@ -0,0 +1,92 @@
+ let pc = null;
+ let localStream = null;
+ let audioSender = null;
+
+ async function setupWebRTC() {
+     // Create WebRTC peer connection
+     pc = new RTCPeerConnection({
+         iceServers: [
+             { urls: 'stun:stun.l.google.com:19302' }
+         ]
+     });
+
+     // Get local media stream
+     localStream = await navigator.mediaDevices.getUserMedia({
+         audio: true,
+         video: false
+     });
+
+     // Add tracks to peer connection
+     localStream.getTracks().forEach(track => {
+         audioSender = pc.addTrack(track, localStream);
+     });
+
+     // Create offer
+     const offer = await pc.createOffer();
+     await pc.setLocalDescription(offer);
+
+     // Send offer to the signaling server (the server endpoint must be implemented)
+     sendOfferToServer(pc.localDescription);
+
+     // Set up event listeners for ICE candidates
+     pc.onicecandidate = event => {
+         if (event.candidate) {
+             sendIceCandidateToServer(event.candidate);
+         }
+     };
+
+     // Handle incoming tracks (audio responses)
+     pc.ontrack = event => {
+         const audioElement = document.getElementById('ai-response-audio');
+         if (audioElement) {
+             audioElement.srcObject = new MediaStream([event.track]);
+         }
+     };
+ }
+
+ function sendOfferToServer(offer) {
+     // Send the offer to your backend
+     // (the endpoint path depends on your server setup)
+     fetch('/webrtc/offer', {
+         method: 'POST',
+         headers: {
+             'Content-Type': 'application/json'
+         },
+         body: JSON.stringify(offer)
+     })
+     .then(response => response.json())
+     .then(answer => {
+         pc.setRemoteDescription(new RTCSessionDescription(answer));
+     })
+     .catch(error => console.error('Error sending offer:', error));
+ }
+
+ function sendIceCandidateToServer(candidate) {
+     // Send ICE candidate to server
+     fetch('/webrtc/ice-candidate', {
+         method: 'POST',
+         headers: {
+             'Content-Type': 'application/json'
+         },
+         body: JSON.stringify(candidate)
+     })
+     .catch(error => console.error('Error sending ICE candidate:', error));
+ }
+
+ function startRecording() {
+     // Unmute the audio track
+     if (localStream) {
+         localStream.getAudioTracks().forEach(track => {
+             track.enabled = true;
+         });
+     }
+ }
+
+ function stopRecording() {
+     // Mute the audio track
+     if (localStream) {
+         localStream.getAudioTracks().forEach(track => {
+             track.enabled = false;
+         });
+     }
+ }
webrtc_handler.py ADDED
@@ -0,0 +1,77 @@
+ import asyncio
+ import json
+ import logging
+ import os
+ import ssl
+ import uuid
+ from typing import Dict, Optional, Callable
+
+ import aiohttp
+ from aiortc import RTCPeerConnection, RTCSessionDescription, MediaStreamTrack, RTCIceCandidate
+ from aiortc.contrib.media import MediaBlackhole, MediaRelay
+
+ logger = logging.getLogger("webrtc_handler")
+ pcs = set()
+ relay = MediaRelay()
+
+ class AudioTransformTrack(MediaStreamTrack):
+     """
+     A track that processes audio and sends it to a callback function
+     """
+     kind = "audio"
+
+     def __init__(self, track, callback):
+         super().__init__()
+         self.track = track
+         self.callback = callback
+
+     async def recv(self):
+         frame = await self.track.recv()
+         # Process audio frame
+         if self.callback:
+             self.callback(frame)
+         return frame
+
+ async def handle_offer(offer, audio_callback=None):
+     offer_data = RTCSessionDescription(sdp=offer["sdp"], type=offer["type"])
+
+     pc = RTCPeerConnection()
+     pcs.add(pc)
+
+     @pc.on("connectionstatechange")
+     async def on_connectionstatechange():
+         logger.info(f"Connection state is {pc.connectionState}")
+         if pc.connectionState == "failed":
+             await pc.close()
+             pcs.discard(pc)
+
+     @pc.on("track")
+     def on_track(track):
+         logger.info(f"Track {track.kind} received")
+         if track.kind == "audio":
+             pc.addTrack(AudioTransformTrack(relay.subscribe(track), audio_callback))
+
+         @track.on("ended")
+         async def on_ended():
+             logger.info(f"Track {track.kind} ended")
+
+     # Handle the incoming offer
+     await pc.setRemoteDescription(offer_data)
+
+     # Create an answer
+     answer = await pc.createAnswer()
+     await pc.setLocalDescription(answer)
+
+     return {
+         "sdp": pc.localDescription.sdp,
+         "type": pc.localDescription.type
+     }
+
+ async def add_ice_candidate(candidate, pc):
+     if candidate and pc:
+         candidate_data = RTCIceCandidate(
+             sdpMLineIndex=candidate.get("sdpMLineIndex"),
+             sdpMid=candidate.get("sdpMid"),
+             candidate=candidate.get("candidate")
+         )
+         await pc.addIceCandidate(candidate_data)
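
webrtc_handler.py provides the server-side coroutines for the endpoints that webrtc_client.js posts to (/webrtc/offer and /webrtc/ice-candidate), but this commit does not register routes for them. A minimal wiring sketch is shown below, assuming FastAPI as in app.py; the app name, route handlers, and the simplified single-connection lookup are illustrative assumptions, not part of the commit:

    # Hypothetical wiring sketch (not part of this commit): expose webrtc_handler's
    # coroutines on the endpoints that webrtc_client.js expects.
    from fastapi import FastAPI, Request
    from webrtc_handler import handle_offer, add_ice_candidate, pcs

    signaling_app = FastAPI()

    @signaling_app.post("/webrtc/offer")
    async def webrtc_offer(request: Request):
        offer = await request.json()
        # handle_offer sets up the peer connection and returns the SDP answer as a dict,
        # which the client passes to RTCSessionDescription.
        return await handle_offer(offer)

    @signaling_app.post("/webrtc/ice-candidate")
    async def webrtc_ice_candidate(request: Request):
        candidate = await request.json()
        # Simplification: apply the candidate to one tracked peer connection;
        # a real app would map each candidate to the connection it belongs to.
        if pcs:
            await add_ice_candidate(candidate, next(iter(pcs)))
        return {"ok": True}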