import io
import logging
import os
import re
import time

import gradio as gr
import numpy as np
import requests
import soundfile as sf
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from fastrtc import (
    WebRTC,
    get_stt_model,
    ReplyOnPause,
    AdditionalOutputs,
)
from gradio.utils import get_space
from gtts import gTTS
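# Third-party packages used above: gradio, numpy, requests, soundfile,
# python-dotenv, elevenlabs, fastrtc, and gtts. On a Gradio Space these
# belong in requirements.txt.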
# Set up logging for WebRTC debugging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("fastrtc-voice-assistant")

# Load environment variables
load_dotenv()

# Request verbose WebRTC tracing (only honored by native WebRTC builds that
# read this variable; harmless otherwise)
os.environ["WEBRTC_TRACE"] = "WEBRTC_TRACE_ALL"

# Initialize clients
logger.info("Initializing clients...")
elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
stt_model = get_stt_model()
logger.info("Clients initialized")
class DeepSeekAPI:
    """Minimal client for DeepSeek's OpenAI-compatible chat completions API."""

    def __init__(self, api_key):
        self.api_key = api_key

    def chat_completion(self, messages, temperature=0.7, max_tokens=512):
        url = "https://api.deepseek.com/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }
        payload = {
            "model": "deepseek-chat",
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens
        }
        response = requests.post(url, json=payload, headers=headers, timeout=60)
        # On an error response, return a spoken-friendly fallback instead of raising
        if response.status_code != 200:
            logger.error(f"DeepSeek API error: {response.status_code} - {response.text}")
            return {"choices": [{"message": {"content": "I'm sorry, I encountered an error processing your request."}}]}
        return response.json()


deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
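# Quick sanity check for the client above (illustrative; assumes
# DEEPSEEK_API_KEY is set in the environment):
#
#   reply = deepseek_client.chat_completion(
#       [{"role": "user", "content": "Say hello in five words."}]
#   )
#   print(reply["choices"][0]["message"]["content"])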
def response(
    audio: tuple[int, np.ndarray],
    chatbot: list[dict] | None = None,
):
    """ReplyOnPause handler: fastrtc calls this generator each time the
    caller pauses, passing the finished utterance as (sample_rate, samples)
    along with the current chatbot history."""
    chatbot = chatbot or []
    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]

    # Convert speech to text
    logger.info("Converting speech to text...")
    text = stt_model.stt(audio)
    logger.info(f"User said: {text}")

    # Show the user's message in the chat immediately
    chatbot.append({"role": "user", "content": text})
    yield AdditionalOutputs(chatbot)

    # Get the AI response
    messages.append({"role": "user", "content": text})
    logger.info("Calling DeepSeek API...")
    response_data = deepseek_client.chat_completion(messages)
    response_text = response_data["choices"][0]["message"]["content"]
    logger.info(f"DeepSeek response: {response_text[:50]}...")

    # Add the AI response to the chat
    chatbot.append({"role": "assistant", "content": response_text})

    # Convert the response to speech
    if os.getenv("ELEVENLABS_API_KEY"):
        try:
            logger.info("Using ElevenLabs for speech generation")
            # Stream raw 16-bit mono PCM at 24 kHz for lower latency.
            # The API expects a voice ID rather than a display name; this is
            # the ID of the premade "Antoni" voice.
            for chunk in elevenlabs_client.text_to_speech.convert_as_stream(
                text=response_text,
                voice_id="ErXwobaYiN019PkySvjV",  # "Antoni"
                model_id="eleven_monolingual_v1",
                output_format="pcm_24000"
            ):
                # fastrtc expects (sample_rate, int16 array shaped (channels, samples))
                audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
                yield (24000, audio_array)
        except Exception as e:
            logger.error(f"ElevenLabs error: {e}, falling back to gTTS")
            yield from use_gtts_for_text(response_text)
    else:
        # Fall back to gTTS
        logger.info("ElevenLabs API key not found, using gTTS...")
        yield from use_gtts_for_text(response_text)

    yield AdditionalOutputs(chatbot)
def use_gtts_for_text(text):
    """Generate speech with gTTS, sentence by sentence, as 24 kHz PCM chunks."""
    try:
        # Split text into sentences for better results
        sentences = re.split(r'(?<=[.!?])\s+', text)
        for sentence in sentences:
            if not sentence.strip():
                continue
            mp3_fp = io.BytesIO()
            logger.info(f"Using gTTS for: {sentence[:30]}...")
            # gTTS only accepts base language codes ("en", not "en-us");
            # the accent is selected via the tld argument
            tts = gTTS(text=sentence, lang='en', tld='com', slow=False)
            tts.write_to_fp(mp3_fp)
            mp3_fp.seek(0)
            # Decoding MP3 with soundfile requires libsndfile >= 1.1
            data, samplerate = sf.read(mp3_fp)
            # Downmix to mono if needed
            if len(data.shape) > 1 and data.shape[1] > 1:
                data = data[:, 0]
            # Crude linear-interpolation resample to 24 kHz
            if samplerate != 24000:
                data = np.interp(
                    np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
                    np.arange(len(data)),
                    data
                )
            data = (data * 32767).astype(np.int16)
            # Ensure the sample count is even
            if len(data) % 2 != 0:
                data = np.append(data, [0])
            # Yield in ~200 ms chunks (4800 samples at 24 kHz)
            chunk_size = 4800
            for i in range(0, len(data), chunk_size):
                chunk = data[i:i+chunk_size]
                if len(chunk) > 0:
                    if len(chunk) % 2 != 0:
                        chunk = np.append(chunk, [0])
                    chunk = chunk.reshape(1, -1)
                    yield (24000, chunk)
    except Exception as e:
        # Stop cleanly rather than yielding a None frame the stream can't play
        logger.error(f"gTTS error: {e}")
        return
# Comprehensive WebRTC configuration with multiple STUN/TURN options
rtc_configuration = {
    "iceServers": [
        # Google STUN servers
        {"urls": ["stun:stun.l.google.com:19302"]},
        {"urls": ["stun:stun1.l.google.com:19302"]},
        {"urls": ["stun:stun2.l.google.com:19302"]},
        {"urls": ["stun:stun3.l.google.com:19302"]},
        {"urls": ["stun:stun4.l.google.com:19302"]},
        # OpenRelay TURN servers
        {
            "urls": ["turn:openrelay.metered.ca:80"],
            "username": "openrelayproject",
            "credential": "openrelayproject"
        },
        {
            "urls": ["turn:openrelay.metered.ca:443"],
            "username": "openrelayproject",
            "credential": "openrelayproject"
        },
        {
            "urls": ["turn:openrelay.metered.ca:443?transport=tcp"],
            "username": "openrelayproject",
            "credential": "openrelayproject"
        },
        # Additional public STUN servers
        {"urls": ["stun:stun.stunprotocol.org:3478"]},
        {"urls": ["stun:stun.voip.blackberry.com:3478"]},
        {"urls": ["stun:stun.nextcloud.com:443"]}
    ],
    "iceCandidatePoolSize": 10,
    "bundlePolicy": "max-bundle",
    "rtcpMuxPolicy": "require",
    "iceTransportPolicy": "all"  # Try "relay" if "all" doesn't work
}
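# STUN servers only help peers discover their public addresses; when an end
# sits behind a strict or symmetric NAT (common for cloud-hosted Spaces),
# media must be relayed through a TURN server, which is why the OpenRelay
# entries carry credentials. Free public TURN relays are best-effort and can
# be rate-limited or retired without notice.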
# Create a simple wrapper for the web chat UI
with gr.Blocks(title="LLM Voice Chat (Powered by DeepSeek & ElevenLabs)") as demo:
    gr.Markdown("# LLM Voice Chat\nPowered by DeepSeek & ElevenLabs")
    with gr.Row():
        with gr.Column(scale=3):
            # Create the chatbot component
            chatbot = gr.Chatbot(type="messages")
            # For debugging, show the connection status
            connection_status = gr.Textbox(
                label="Connection Status",
                value="Ready to connect. Click the microphone button to start.",
                interactive=False
            )
            # Display debugging information
            debug_info = gr.Textbox(
                label="Debug Info",
                value="WebRTC debug information will appear here.",
                interactive=False
            )
            # Button to manually refresh the connection
            refresh_btn = gr.Button("Refresh Connection")

            def refresh_page():
                # Return new values for both textboxes; assigning to a
                # component's .value at runtime has no effect in Gradio
                return "Refreshed", f"Connection refresh attempted at {time.time()}"

            refresh_btn.click(
                refresh_page,
                outputs=[connection_status, debug_info]
            )

    logger.info("Creating WebRTC component...")
    # fastrtc's Stream builds its own standalone page and is not a Gradio
    # component, so inside this custom gr.Blocks layout the WebRTC component
    # is used instead and wired to the same ReplyOnPause handler
    audio = WebRTC(
        label="Voice Chat",
        modality="audio",
        mode="send-receive",
        rtc_configuration=rtc_configuration
    )
    audio.stream(
        fn=ReplyOnPause(response, input_sample_rate=16000),
        inputs=[audio, chatbot],
        outputs=[audio],
        concurrency_limit=5 if get_space() else None,
        time_limit=90 if get_space() else None
    )
    # Route values yielded via AdditionalOutputs(chatbot) back into the chat UI
    audio.on_additional_outputs(lambda c: c, outputs=[chatbot])
    logger.info("WebRTC component created and wired up")
# Launch the app (Hugging Face Spaces also runs app.py as __main__,
# so the environment is detected via get_space() rather than __name__)
if __name__ == "__main__":
    if get_space():
        logger.info("Running in Hugging Face Spaces")
        demo.launch()
    else:
        # Local development
        logger.info("Running in development mode")
        os.environ["GRADIO_SSR_MODE"] = "false"
        demo.launch(server_port=7860, share=True)