Upload 4 files

- app.py            +206  -14
- debug.py          +5    -0
- webrtc_client.js  +92   -0
- webrtc_handler.py +77   -0

app.py
CHANGED
@@ -1,14 +1,206 @@
import os
import time
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from fastrtc import (
    Stream,
    get_stt_model,
    ReplyOnPause,
    AdditionalOutputs
)

import requests
import io
import soundfile as sf
from gtts import gTTS
import re
import inspect

from deepseek import DeepSeekAPI

# Load environment variables
load_dotenv()

# Initialize clients
elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
stt_model = get_stt_model()
deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))

# Temporary debug output to see which methods the client exposes:
print(dir(deepseek_client))


def response(
    audio: tuple[int, np.ndarray],
    chatbot: list[dict] | None = None,
):
    chatbot = chatbot or []
    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]

    # Convert speech to text
    text = stt_model.stt(audio)
    print("prompt:", text)

    # Add user message to chat
    chatbot.append({"role": "user", "content": text})
    yield AdditionalOutputs(chatbot)

    # Get AI response
    messages.append({"role": "user", "content": text})
    response_text = get_deepseek_response(messages)

    # Add AI response to chat
    chatbot.append({"role": "assistant", "content": response_text})

    # Convert response to speech
    for audio_data in text_to_speech(response_text):
        if audio_data:
            yield audio_data

    yield AdditionalOutputs(chatbot)


# Create Gradio interface
chatbot = gr.Chatbot(type="messages")
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(response, input_sample_rate=16000),
    additional_outputs_handler=lambda a, b: b,
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"}
)

# Create FastAPI app and mount stream
from fastapi import FastAPI
app = FastAPI()
app = gr.mount_gradio_app(app, stream.ui, path="/")
stream.mount(app)  # Mount the stream for telephone/fastphone integration


# The deepseek package's method surface is unclear, so call the
# chat-completions endpoint directly over HTTP as a fallback:
def get_deepseek_response(messages):
    url = "https://api.deepseek.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}"
    }
    payload = {
        "model": "deepseek-chat",
        "messages": messages,
        "temperature": 0.7,
        "max_tokens": 512
    }
    response = requests.post(url, json=payload, headers=headers)

    # Check for an error response
    if response.status_code != 200:
        print(f"DeepSeek API error: {response.status_code} - {response.text}")
        return "I'm sorry, I encountered an error processing your request."

    response_json = response.json()
    return response_json["choices"][0]["message"]["content"]


# Text-to-speech via Google TTS, forced to US English and processed sentence by sentence
def text_to_speech(text):
    """Convert text to speech using Google TTS with sentence-by-sentence processing."""
    try:
        # Split text into sentences for faster perceived response
        sentences = re.split(r'(?<=[.!?])\s+', text)

        for sentence in sentences:
            if not sentence.strip():
                continue

            # Process each sentence separately
            mp3_fp = io.BytesIO()

            # Force US English: recent gTTS releases only accept the base code 'en',
            # and the 'com' TLD selects the US-accented voice ('en-us' raises ValueError)
            print(f"Using gTTS (lang='en', tld='com') for sentence: {sentence[:20]}...")
            tts = gTTS(text=sentence, lang='en', tld='com', slow=False)
            tts.write_to_fp(mp3_fp)
            mp3_fp.seek(0)

            # Decode the MP3 buffer (needs a libsndfile build with MP3 support,
            # as bundled with recent soundfile wheels)
            data, samplerate = sf.read(mp3_fp)

            # Convert to mono if stereo
            if len(data.shape) > 1 and data.shape[1] > 1:
                data = data[:, 0]

            # Resample to 24000 Hz if needed
            if samplerate != 24000:
                data = np.interp(
                    np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
                    np.arange(len(data)),
                    data
                )

            # Convert to 16-bit integers
            data = (data * 32767).astype(np.int16)

            # Ensure buffer size is even
            if len(data) % 2 != 0:
                data = np.append(data, [0])

            # Reshape and yield in chunks
            chunk_size = 4800
            for i in range(0, len(data), chunk_size):
                chunk = data[i:i + chunk_size]
                if len(chunk) > 0:
                    if len(chunk) % 2 != 0:
                        chunk = np.append(chunk, [0])
                    chunk = chunk.reshape(1, -1)
                    yield (24000, chunk)
    except Exception as e:
        print(f"Exception in text_to_speech: {e}")
        yield None


# Debug statement placed after the function definition
print("text_to_speech function:", inspect.getsource(text_to_speech))


if __name__ == "__main__":
    os.environ["GRADIO_SSR_MODE"] = "false"

    # Check FastRTC version
    import fastrtc
    print(f"FastRTC version: {fastrtc.__version__ if hasattr(fastrtc, '__version__') else 'unknown'}")

    # Try running fastphone with additional diagnostics
    print("Starting phone service - attempting to inspect fastphone method...")
    print(f"FastPhone signature: {inspect.signature(stream.fastphone) if hasattr(stream, 'fastphone') else 'Not available'}")

    try:
        # Use keyword arguments rather than positionals
        phone_service = stream.fastphone(
            token=os.getenv("HF_TOKEN"),
            host="127.0.0.1",
            port=8000,
            share_server_tls_certificate=True
        )
        print("Phone service started successfully")
    except Exception as e:
        print(f"Error starting phone service: {e}")
        print("Falling back to web interface...")
        # Launch with web interface as fallback
        stream.ui.launch(server_port=7860)

# Leftover Colab-style setup from an earlier iteration, kept commented out:
# !pip install -q torch==2.0.1 torchaudio==2.0.2 gradio requests soundfile huggingface_hub
# !wget -q https://github.com/seasalt-ai/csm/archive/refs/heads/main.zip
# !unzip -q main.zip
# !mv csm-main csm
# !cd csm && pip install -e .
#
# # Set up directories
# import os
# import sys
# sys.path.append("/content/csm")
# voice_samples_dir = "/content/csm_voice_samples"
# output_dir = "/content/csm_output"
# os.makedirs(voice_samples_dir, exist_ok=True)
# os.makedirs(output_dir, exist_ok=True)
#
# print("✅ Dependencies installed!")
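
Note: the get_deepseek_response helper above goes straight to the HTTP endpoint because, per its comment, the deepseek package's client methods were unclear. For reference only (not part of this upload), DeepSeek's chat endpoint is OpenAI-compatible, so the same call can be made through the openai client; a minimal sketch, assuming the openai package is installed and DEEPSEEK_API_KEY is set:

# Hypothetical alternative to get_deepseek_response(); not part of the uploaded files.
import os
from openai import OpenAI

client = OpenAI(
    api_key=os.getenv("DEEPSEEK_API_KEY"),
    base_url="https://api.deepseek.com",  # DeepSeek's OpenAI-compatible endpoint
)

def get_deepseek_response_openai(messages):
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        temperature=0.7,
        max_tokens=512,
    )
    return completion.choices[0].message.content

Either route yields the same choices[0].message.content payload that app.py already parses.
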
debug.py
ADDED
@@ -0,0 +1,5 @@
import os
import requests
headers = {"xi-api-key": os.getenv("ELEVENLABS_API_KEY")}
voices_response = requests.get("https://api.elevenlabs.io/v1/voices", headers=headers)
print("Available voices:", voices_response.json())
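
Note: debug.py only lists the voices available to the key (and assumes ELEVENLABS_API_KEY is already exported, since it does not call load_dotenv), while app.py initializes an ElevenLabs client but ends up synthesizing speech with gTTS. For context only (not part of this upload), a voice_id taken from that listing can be exercised against ElevenLabs' text-to-speech endpoint with the same requests/xi-api-key pattern; a minimal sketch in which VOICE_ID and the model_id are placeholders:

# Hypothetical follow-up to debug.py; VOICE_ID is a placeholder copied from the /v1/voices output.
import os
import requests

VOICE_ID = "your-voice-id-here"
url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
headers = {
    "xi-api-key": os.getenv("ELEVENLABS_API_KEY"),
    "Content-Type": "application/json",
}
payload = {
    "text": "Hello from the voice chat demo.",
    "model_id": "eleven_monolingual_v1",  # assumed model id; any TTS-capable model works
}

resp = requests.post(url, headers=headers, json=payload)
resp.raise_for_status()
with open("sample.mp3", "wb") as f:
    f.write(resp.content)  # the response body is MP3 audio
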
webrtc_client.js
ADDED
@@ -0,0 +1,92 @@
let pc = null;
let localStream = null;
let audioSender = null;

async function setupWebRTC() {
    // Create WebRTC peer connection
    pc = new RTCPeerConnection({
        iceServers: [
            { urls: 'stun:stun.l.google.com:19302' }
        ]
    });

    // Get local media stream
    localStream = await navigator.mediaDevices.getUserMedia({
        audio: true,
        video: false
    });

    // Add tracks to peer connection
    localStream.getTracks().forEach(track => {
        audioSender = pc.addTrack(track, localStream);
    });

    // Create offer
    const offer = await pc.createOffer();
    await pc.setLocalDescription(offer);

    // Send offer to server (server-side route must be implemented)
    sendOfferToServer(pc.localDescription);

    // Set up event listeners for ICE candidates
    pc.onicecandidate = event => {
        if (event.candidate) {
            sendIceCandidateToServer(event.candidate);
        }
    };

    // Handle incoming tracks (audio responses)
    pc.ontrack = event => {
        const audioElement = document.getElementById('ai-response-audio');
        if (audioElement) {
            audioElement.srcObject = new MediaStream([event.track]);
        }
    };
}

function sendOfferToServer(offer) {
    // Send the offer to the backend; the exact route depends on the server setup
    fetch('/webrtc/offer', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify(offer)
    })
    .then(response => response.json())
    .then(answer => {
        pc.setRemoteDescription(new RTCSessionDescription(answer));
    })
    .catch(error => console.error('Error sending offer:', error));
}

function sendIceCandidateToServer(candidate) {
    // Send ICE candidate to the server
    fetch('/webrtc/ice-candidate', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify(candidate)
    })
    .catch(error => console.error('Error sending ICE candidate:', error));
}

function startRecording() {
    // Unmute the audio track
    if (localStream) {
        localStream.getAudioTracks().forEach(track => {
            track.enabled = true;
        });
    }
}

function stopRecording() {
    // Mute the audio track
    if (localStream) {
        localStream.getAudioTracks().forEach(track => {
            track.enabled = false;
        });
    }
}
webrtc_handler.py
ADDED
@@ -0,0 +1,77 @@
import asyncio
import json
import logging
import os
import ssl
import uuid
from typing import Dict, Optional, Callable

import aiohttp
from aiortc import RTCPeerConnection, RTCSessionDescription, MediaStreamTrack, RTCIceCandidate
from aiortc.contrib.media import MediaBlackhole, MediaRelay
from aiortc.sdp import candidate_from_sdp

logger = logging.getLogger("webrtc_handler")
pcs = set()
relay = MediaRelay()


class AudioTransformTrack(MediaStreamTrack):
    """
    A track that processes audio and sends it to a callback function.
    """
    kind = "audio"

    def __init__(self, track, callback):
        super().__init__()
        self.track = track
        self.callback = callback

    async def recv(self):
        frame = await self.track.recv()
        # Hand each audio frame to the callback before passing it on
        if self.callback:
            self.callback(frame)
        return frame


async def handle_offer(offer, audio_callback=None):
    offer_data = RTCSessionDescription(sdp=offer["sdp"], type=offer["type"])

    pc = RTCPeerConnection()
    pcs.add(pc)

    @pc.on("connectionstatechange")
    async def on_connectionstatechange():
        logger.info(f"Connection state is {pc.connectionState}")
        if pc.connectionState == "failed":
            await pc.close()
            pcs.discard(pc)

    @pc.on("track")
    def on_track(track):
        logger.info(f"Track {track.kind} received")
        if track.kind == "audio":
            pc.addTrack(AudioTransformTrack(relay.subscribe(track), audio_callback))

        @track.on("ended")
        async def on_ended():
            logger.info(f"Track {track.kind} ended")

    # Handle the incoming offer
    await pc.setRemoteDescription(offer_data)

    # Create an answer
    answer = await pc.createAnswer()
    await pc.setLocalDescription(answer)

    return {
        "sdp": pc.localDescription.sdp,
        "type": pc.localDescription.type
    }


async def add_ice_candidate(candidate, pc):
    if candidate and pc and candidate.get("candidate"):
        # aiortc's RTCIceCandidate constructor does not accept the browser's SDP
        # candidate string, so parse it with candidate_from_sdp instead.
        candidate_data = candidate_from_sdp(candidate["candidate"].split(":", 1)[1])
        candidate_data.sdpMid = candidate.get("sdpMid")
        candidate_data.sdpMLineIndex = candidate.get("sdpMLineIndex")
        await pc.addIceCandidate(candidate_data)
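
Note: webrtc_client.js POSTs the offer to /webrtc/offer and ICE candidates to /webrtc/ice-candidate, but this upload does not include server routes for those paths. Below is a minimal sketch of how handle_offer and add_ice_candidate could be exposed, e.g. appended to app.py after the FastAPI `app` is created; the route paths match the client, while the single shared connection and function placement are illustrative assumptions, not part of the commit:

# Hypothetical glue between app.py and webrtc_handler.py; not part of the uploaded files.
from fastapi import Request
from webrtc_handler import handle_offer, add_ice_candidate, pcs


@app.post("/webrtc/offer")
async def webrtc_offer(request: Request):
    offer = await request.json()        # {"sdp": ..., "type": "offer"} from webrtc_client.js
    answer = await handle_offer(offer)  # creates an RTCPeerConnection and builds the answer
    return answer                       # {"sdp": ..., "type": "answer"}


@app.post("/webrtc/ice-candidate")
async def webrtc_ice_candidate(request: Request):
    candidate = await request.json()
    # Simplification: apply the candidate to every open connection tracked in `pcs`.
    for pc in list(pcs):
        await add_ice_candidate(candidate, pc)
    return {"ok": True}

Because handle_offer only returns the SDP answer and not the peer connection itself, this sketch applies incoming candidates to every connection in pcs; a production setup would key connections by a session id.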