"""Voice chat web app: FastRTC audio in -> STT -> DeepSeek LLM -> ElevenLabs TTS out."""

import os
import time

import gradio as gr
import numpy as np
import requests
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from fastapi import FastAPI
from fastrtc import (
    AdditionalOutputs,
    ReplyOnPause,
    Stream,
    get_stt_model,
    get_twilio_turn_credentials,
)
from gradio.utils import get_space
from numpy.typing import NDArray

# Load environment variables (DEEPSEEK_API_KEY, ELEVENLABS_API_KEY, Twilio vars, MODE)
load_dotenv()


class DeepSeekAPI:
    """Minimal REST client for the DeepSeek chat-completions endpoint."""

    def __init__(self, api_key):
        # api_key: bearer token for api.deepseek.com; may be None if the env var
        # is unset, in which case requests will fail and the fallback is returned.
        self.api_key = api_key

    def chat_completion(self, messages, temperature=0.7, max_tokens=512):
        """POST a chat-completion request to DeepSeek.

        Args:
            messages: list of {"role": ..., "content": ...} dicts.
            temperature: sampling temperature forwarded to the API.
            max_tokens: completion-length cap forwarded to the API.

        Returns:
            The decoded JSON response on success. On any HTTP error status or
            network failure, returns a response-shaped dict carrying an apology
            message, so callers can always index
            ["choices"][0]["message"]["content"] without extra error handling.
        """
        url = "https://api.deepseek.com/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        payload = {
            "model": "deepseek-chat",
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        # Response-shaped fallback so the caller's indexing never raises.
        fallback = {
            "choices": [
                {
                    "message": {
                        "content": "I'm sorry, I encountered an error processing your request."
                    }
                }
            ]
        }
        try:
            # timeout: without it a stalled connection would hang the voice
            # handler (and the user's call) indefinitely.
            response = requests.post(url, json=payload, headers=headers, timeout=30)
        except requests.RequestException as exc:
            # DNS failure, connection reset, timeout, etc. — degrade gracefully.
            print(f"DeepSeek API request failed: {exc}")
            return fallback
        if response.status_code != 200:
            print(f"DeepSeek API error: {response.status_code} - {response.text}")
            return fallback
        return response.json()


# Initialize clients
deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
stt_model = get_stt_model()

# Set up Twilio TURN credentials for WebRTC.
# The function doesn't accept keyword arguments; it reads from env vars directly.
twilio_credentials = get_twilio_turn_credentials()

# Log Twilio status
if twilio_credentials:
    print("Twilio TURN credentials successfully configured")
else:
    print("No Twilio credentials found or invalid credentials")


def response(
    audio: tuple[int, NDArray[np.int16 | np.float32]],
    chatbot: list[dict] | None = None,
):
    """Handle one voice turn: transcribe, query DeepSeek, stream TTS audio back.

    Args:
        audio: (sample_rate, samples) tuple from fastrtc.
        chatbot: running message history in Gradio "messages" format; may be None
            on the first turn.

    Yields:
        AdditionalOutputs(chatbot) after the user turn is appended, then
        (24000, int16 audio chunk) tuples while TTS streams, then a final
        AdditionalOutputs(chatbot) containing the assistant turn.
    """
    chatbot = chatbot or []
    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]

    start = time.time()
    text = stt_model.stt(audio)
    print("transcription", time.time() - start)
    print("prompt", text)

    # Show the user's transcribed turn in the UI immediately.
    chatbot.append({"role": "user", "content": text})
    yield AdditionalOutputs(chatbot)
    messages.append({"role": "user", "content": text})

    # DeepSeek LLM completion (chat_completion returns a fallback on error,
    # so indexing here is safe).
    response_data = deepseek_client.chat_completion(messages=messages, max_tokens=512)
    response_text = response_data["choices"][0]["message"]["content"]
    chatbot.append({"role": "assistant", "content": response_text})

    # Stream raw PCM from ElevenLabs; each chunk is little-endian int16 at 24 kHz.
    for chunk in tts_client.text_to_speech.convert_as_stream(
        text=response_text,
        voice_id="JBFqnCBsd6RMkjVDRZzb",
        model_id="eleven_multilingual_v2",
        output_format="pcm_24000",
    ):
        audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
        yield (24000, audio_array)

    yield AdditionalOutputs(chatbot)


# Create the chatbot and Stream components
chatbot = gr.Chatbot(type="messages")
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(response, input_sample_rate=16000),
    additional_outputs_handler=lambda a, b: b,
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    rtc_configuration=twilio_credentials,  # Always use Twilio credentials
    # Tighter limits when hosted on a Space; unlimited locally.
    concurrency_limit=5 if get_space() else None,
    time_limit=90 if get_space() else None,
    ui_args={
        "title": "LLM Voice Chat (Powered by DeepSeek, ElevenLabs, and WebRTC ⚡️)"
    },
)

# Mount the Stream UI onto a FastAPI app so it can be served by an ASGI server.
app = FastAPI()
app = gr.mount_gradio_app(app, stream.ui, path="/")

if __name__ == "__main__":
    # os is already imported at module level; no need to re-import here.
    os.environ["GRADIO_SSR_MODE"] = "false"
    if (mode := os.getenv("MODE")) == "UI":
        stream.ui.launch(server_port=7860)
    elif mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        stream.ui.launch(server_port=7860)