import os
import time
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from fastrtc import (
    Stream,
    get_stt_model,
    ReplyOnPause,
    AdditionalOutputs
)
import requests
import io
import soundfile as sf
from gtts import gTTS
import re
import torch
import torchaudio
from huggingface_hub import login, hf_hub_download
from deepseek import DeepSeekAPI
# Load environment variables
load_dotenv()
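# Expects ELEVENLABS_API_KEY and DEEPSEEK_API_KEY in the environment (or a local .env file)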
# Initialize clients
elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
stt_model = get_stt_model()
deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
# CSM speech generation is disabled; ElevenLabs (with a gTTS fallback) is used instead
csm_generator = None
def response(
    audio: tuple[int, np.ndarray],
    chatbot: list[dict] | None = None,
):
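    """fastrtc handler: transcribe the user's speech, query DeepSeek for a reply,
    and stream the synthesized answer back while updating the chat history."""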
    chatbot = chatbot or []
    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]

    # Convert speech to text
    text = stt_model.stt(audio)
    print("prompt:", text)

    # Add user message to chat
    chatbot.append({"role": "user", "content": text})
    yield AdditionalOutputs(chatbot)

    # Get AI response
    messages.append({"role": "user", "content": text})
    response_text = get_deepseek_response(messages)

    # Add AI response to chat
    chatbot.append({"role": "assistant", "content": response_text})

    # Convert response to speech
    for audio_data in text_to_speech(response_text):
        if audio_data:
            yield audio_data

    yield AdditionalOutputs(chatbot)
# Text-to-speech helper functions
def use_gtts_for_sentence(sentence):
"""Helper function to generate speech with gTTS"""
try:
# Process each sentence separately
mp3_fp = io.BytesIO()
# Force US English
print(f"Using gTTS with en-us locale for sentence: {sentence[:20]}...")
tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
tts.write_to_fp(mp3_fp)
mp3_fp.seek(0)
# Process audio data
data, samplerate = sf.read(mp3_fp)
# Convert to mono if stereo
if len(data.shape) > 1 and data.shape[1] > 1:
data = data[:, 0]
# Resample to 24000 Hz if needed
if samplerate != 24000:
data = np.interp(
np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
np.arange(len(data)),
data
)
# Convert to 16-bit integers
data = (data * 32767).astype(np.int16)
# Ensure buffer size is even
if len(data) % 2 != 0:
data = np.append(data, [0])
# Reshape and yield in chunks
chunk_size = 4800
for i in range(0, len(data), chunk_size):
chunk = data[i:i+chunk_size]
if len(chunk) > 0:
if len(chunk) % 2 != 0:
chunk = np.append(chunk, [0])
chunk = chunk.reshape(1, -1)
yield (24000, chunk)
except Exception as e:
print(f"gTTS error: {e}")
yield None
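# Both TTS paths below stream audio sentence by sentence as (24000, int16_chunk)
# tuples, so playback can start before the full reply has been synthesized.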
def text_to_speech(text):
"""Convert text to speech using ElevenLabs or gTTS as fallback"""
try:
# Split text into sentences for faster perceived response
sentences = re.split(r'(?<=[.!?])\s+', text)
# Try ElevenLabs first
if os.getenv("ELEVENLABS_API_KEY"):
print("Using ElevenLabs for text-to-speech...")
for sentence in sentences:
if not sentence.strip():
continue
try:
print(f"Generating ElevenLabs speech for: {sentence[:30]}...")
# Generate audio using ElevenLabs
audio_data = elevenlabs_client.generate(
text=sentence,
voice="Antoni", # You can change to any available voice
model="eleven_monolingual_v1"
)
# Convert to numpy array
mp3_fp = io.BytesIO(audio_data)
data, samplerate = sf.read(mp3_fp)
# Convert to mono if stereo
if len(data.shape) > 1 and data.shape[1] > 1:
data = data[:, 0]
# Resample to 24000 Hz if needed
if samplerate != 24000:
data = np.interp(
np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
np.arange(len(data)),
data
)
# Convert to 16-bit integers
data = (data * 32767).astype(np.int16)
# Ensure buffer size is even
if len(data) % 2 != 0:
data = np.append(data, [0])
# Reshape and yield in chunks
chunk_size = 4800
for i in range(0, len(data), chunk_size):
chunk = data[i:i+chunk_size]
if len(chunk) > 0:
if len(chunk) % 2 != 0:
chunk = np.append(chunk, [0])
chunk = chunk.reshape(1, -1)
yield (24000, chunk)
except Exception as e:
print(f"ElevenLabs error: {e}, falling back to gTTS")
# Fall through to gTTS for this sentence
for audio_chunk in use_gtts_for_sentence(sentence):
if audio_chunk:
yield audio_chunk
else:
# Fall back to gTTS
print("ElevenLabs API key not found, using gTTS...")
for sentence in sentences:
if sentence.strip():
for audio_chunk in use_gtts_for_sentence(sentence):
if audio_chunk:
yield audio_chunk
except Exception as e:
print(f"Exception in text_to_speech: {e}")
yield None
def get_deepseek_response(messages):
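    """Send the conversation to DeepSeek's chat completions endpoint and return the reply text."""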
url = "https://api.deepseek.com/v1/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}"
}
payload = {
"model": "deepseek-chat",
"messages": messages,
"temperature": 0.7,
"max_tokens": 512
}
response = requests.post(url, json=payload, headers=headers)
# Check for error response
if response.status_code != 200:
print(f"DeepSeek API error: {response.status_code} - {response.text}")
return "I'm sorry, I encountered an error processing your request."
response_json = response.json()
return response_json["choices"][0]["message"]["content"]
# WebRTC configuration required for Hugging Face Spaces
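# One public STUN server plus Open Relay TURN servers (ports 80 and 443, with a
# TCP-transport fallback) so peers behind restrictive NATs can still connect.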
rtc_config = {
    "iceServers": [
        {"urls": ["stun:stun.l.google.com:19302"]},
        {
            "urls": ["turn:openrelay.metered.ca:80"],
            "username": "openrelayproject",
            "credential": "openrelayproject"
        },
        {
            "urls": ["turn:openrelay.metered.ca:443"],
            "username": "openrelayproject",
            "credential": "openrelayproject"
        },
        {
            "urls": ["turn:openrelay.metered.ca:443?transport=tcp"],
            "username": "openrelayproject",
            "credential": "openrelayproject"
        }
    ]
}
# Create Gradio interface with the required rtc_configuration
chatbot = gr.Chatbot(type="messages")
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(response, input_sample_rate=16000),
    additional_outputs_handler=lambda a, b: b,
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"},
    rtc_configuration=rtc_config  # Add the WebRTC configuration
)
# FastAPI app with Gradio interface
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()
# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Add the Stream's WebRTC routes first: mounting Gradio at "/" creates a
# catch-all mount, so routes added after it would be shadowed
stream.mount(app)

# Mount the Gradio app at the root path
app = gr.mount_gradio_app(app, stream.ui, path="/")
# No launch code here - let Hugging Face Spaces handle the server launch
# Only if running locally would you use this:
if __name__ == "__main__" and not os.getenv("HF_SPACE"):
    import uvicorn

    PORT = int(os.getenv("PORT", 7860))
    print(f"Using port: {PORT}")
    uvicorn.run(app, host="0.0.0.0", port=PORT)