Spaces:

Twelve2five
/

fastrtc-voice-assistant

Runtime error

App Files Files Community

fastrtc-voice-assistant / app.py

Twelve2five

Upload 4 files

dff5fe4 verified 5 months ago

raw

history blame

7.14 kB

	import os
	import time
	import gradio as gr
	import numpy as np
	from dotenv import load_dotenv
	from elevenlabs import ElevenLabs
	from fastrtc import (
	Stream,
	get_stt_model,
	ReplyOnPause,
	AdditionalOutputs
	)

	import requests
	import io
	import soundfile as sf
	from gtts import gTTS
	import re
	import inspect

	from deepseek import DeepSeekAPI

	# Load environment variables (python-dotenv) before the os.getenv() calls
	# below read the API keys.
	load_dotenv()

	# Initialize clients. All three are module-level singletons created at import
	# time. In the code visible in this file, `elevenlabs_client` is never
	# referenced again (speech synthesis goes through gTTS) and `deepseek_client`
	# is only used by the debug print below (chat requests are sent with
	# `requests` in get_deepseek_response).
	elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
	stt_model = get_stt_model()
	deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))

	# Temporary debug aid: list the attributes of the DeepSeek client object so
	# its available methods show up in the startup log.
	print(dir(deepseek_client))

	def response(
	    audio: tuple[int, np.ndarray],
	    chatbot: list[dict] | None = None,
	):
	    """Handle one spoken user turn: transcribe it, ask the LLM, speak the reply.

	    This is a generator. It yields, in order:

	    1. ``AdditionalOutputs(chatbot)`` once the user's transcribed message has
	       been appended to the history,
	    2. every audio item produced by ``text_to_speech`` for the reply
	       (``None`` items, which ``text_to_speech`` emits on failure, are
	       dropped),
	    3. ``AdditionalOutputs(chatbot)`` again, now including the assistant reply.

	    Args:
	        audio: ``(sample_rate, samples)`` pair passed straight to
	            ``stt_model.stt``.
	        chatbot: Existing chat history as a list of dicts with ``"role"`` and
	            ``"content"`` keys, or ``None`` / empty for a fresh conversation.
	            A non-empty list is appended to in place.
	    """
	    chatbot = chatbot or []
	    # Only role/content are forwarded to the LLM; any other keys on the
	    # history entries are dropped.
	    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]

	    # Convert speech to text
	    text = stt_model.stt(audio)
	    print("prompt:", text)

	    # Show the user's message in the chat before the (slower) LLM call.
	    chatbot.append({"role": "user", "content": text})
	    yield AdditionalOutputs(chatbot)

	    # Get AI response
	    messages.append({"role": "user", "content": text})
	    response_text = get_deepseek_response(messages)

	    # Add AI response to chat
	    chatbot.append({"role": "assistant", "content": response_text})

	    # Convert response to speech, skipping the None failure sentinel.
	    for audio_data in text_to_speech(response_text):
	        if audio_data is not None:
	            yield audio_data

	    yield AdditionalOutputs(chatbot)

	# Build the Gradio/FastRTC voice-chat UI. `chatbot` is the message-history
	# widget; it is passed to the stream both as an additional input and as an
	# additional output.
	chatbot = gr.Chatbot(type="messages")
	stream = Stream(
	modality="audio",
	mode="send-receive",
	# The `response` generator is wrapped in ReplyOnPause.
	# NOTE(review): presumably it is invoked once the speaker pauses - confirm
	# against the FastRTC documentation.
	handler=ReplyOnPause(response, input_sample_rate=16000),
	# Returns its second argument unchanged.
	# NOTE(review): presumably called as (current value, new value), so the
	# newest chat history replaces the old one - confirm.
	additional_outputs_handler=lambda a, b: b,
	additional_inputs=[chatbot],
	additional_outputs=[chatbot],
	# NOTE(review): the title mentions ElevenLabs, but text_to_speech in this
	# file synthesizes audio with gTTS.
	ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"}
	)

	# Create a FastAPI app, serve the stream's Gradio UI at "/", and mount the
	# stream itself on the same app.
	from fastapi import FastAPI
	app = FastAPI()
	app = gr.mount_gradio_app(app, stream.ui, path="/")
	stream.mount(app) # Mount the stream for telephone/fastphone integration

	# Chat completions are requested over plain HTTP with `requests` rather than
	# through the DeepSeekAPI client object.
	def get_deepseek_response(messages, timeout=30):
	    """Return the assistant's reply to *messages* from the DeepSeek chat API.

	    Args:
	        messages: Conversation so far, as a list of
	            ``{"role": ..., "content": ...}`` dicts; sent as the request's
	            ``messages`` field.
	        timeout: Seconds to wait for the HTTP request before giving up.
	            Without a timeout a stalled connection would block the voice
	            handler indefinitely.

	    Returns:
	        The ``content`` of the first choice in the response. If the request
	        fails, the API answers with a non-200 status, or the body does not
	        have the expected shape, a fixed apology sentence is returned instead
	        so the caller always has something to speak.
	    """
	    fallback = "I'm sorry, I encountered an error processing your request."

	    url = "https://api.deepseek.com/v1/chat/completions"
	    headers = {
	        "Content-Type": "application/json",
	        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}",
	    }
	    payload = {
	        "model": "deepseek-chat",
	        "messages": messages,
	        "temperature": 0.7,
	        "max_tokens": 512,
	    }

	    # Connection errors and timeouts are reported like any other API failure
	    # instead of propagating into the streaming handler.
	    try:
	        resp = requests.post(url, json=payload, headers=headers, timeout=timeout)
	    except requests.RequestException as exc:
	        print(f"DeepSeek API request failed: {exc}")
	        return fallback

	    # Check for error response
	    if resp.status_code != 200:
	        print(f"DeepSeek API error: {resp.status_code} - {resp.text}")
	        return fallback

	    # A 200 with a non-JSON body or a missing field must not crash the caller.
	    try:
	        return resp.json()["choices"][0]["message"]["content"]
	    except (ValueError, KeyError, IndexError, TypeError) as exc:
	        print(f"DeepSeek API returned an unexpected payload: {exc}")
	        return fallback

	# Speech synthesis goes through gTTS, explicitly pinned to US English.
	# Sentence boundary: whitespace that follows ., ! or ?
	_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')
	_TTS_SAMPLE_RATE = 24000  # Hz; sample rate of every chunk that is yielded
	_TTS_CHUNK_SIZE = 4800    # samples per yielded chunk


	def _to_pcm16_mono(data, samplerate):
	    """Return decoded audio as mono int16 PCM at 24 kHz with an even length."""
	    # Keep only the first channel of multi-dimensional (frames, channels) data.
	    if data.ndim > 1:
	        data = data[:, 0]

	    # Linear-interpolation resample. endpoint=False keeps the sampling grid
	    # inside the source signal instead of stretching it one sample past the
	    # end; empty input is skipped because np.interp rejects it.
	    if samplerate != _TTS_SAMPLE_RATE and len(data) > 0:
	        n_out = int(len(data) * _TTS_SAMPLE_RATE / samplerate)
	        positions = np.linspace(0, len(data), n_out, endpoint=False)
	        data = np.interp(positions, np.arange(len(data)), data)

	    # Clip before scaling so decoder overshoot beyond [-1, 1] cannot wrap
	    # around when cast to 16-bit integers.
	    data = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)

	    # Pad to an even number of samples. np.pad keeps the int16 dtype, whereas
	    # np.append(data, [0]) would promote the whole buffer to int64.
	    if len(data) % 2 != 0:
	        data = np.pad(data, (0, 1))
	    return data


	def text_to_speech(text):
	    """Convert text to speech using Google TTS with sentence-by-sentence processing.

	    The text is split into sentences so audio for the first sentence can be
	    played while later ones are still being synthesized.

	    Args:
	        text: The text to speak. Empty or whitespace-only text yields nothing.

	    Yields:
	        ``(24000, chunk)`` tuples, where ``chunk`` is an int16 array of shape
	        ``(1, n)`` holding at most 4800 samples. If anything fails (including
	        a non-string *text*), the error is printed, a single ``None`` is
	        yielded and the generator stops; callers are expected to skip
	        ``None``.
	    """
	    try:
	        for sentence in _SENTENCE_BOUNDARY.split(text):
	            if not sentence.strip():
	                continue

	            # Force US English: language 'en' served from the .com domain.
	            # ('en-us' is a deprecated gTTS language code.)
	            print(f"Using gTTS with en-us locale for sentence: {sentence[:20]}...")
	            with io.BytesIO() as mp3_fp:
	                tts = gTTS(text=sentence, lang='en', tld='com', slow=False)
	                tts.write_to_fp(mp3_fp)
	                mp3_fp.seek(0)
	                data, samplerate = sf.read(mp3_fp)

	            pcm = _to_pcm16_mono(data, samplerate)

	            # The buffer length and the chunk size are both even, so every
	            # chunk (including the last) has an even number of samples.
	            for start in range(0, len(pcm), _TTS_CHUNK_SIZE):
	                chunk = pcm[start:start + _TTS_CHUNK_SIZE]
	                yield (_TTS_SAMPLE_RATE, chunk.reshape(1, -1))
	    except Exception as e:
	        # Best effort: report the problem and signal failure with None rather
	        # than crashing the streaming handler.
	        print(f"Exception in text_to_speech: {e}")
	        yield None

	# Debug aid: at import time, print the source code of text_to_speech as
	# `inspect` resolves it. This must stay after the function definition,
	# because the name has to exist when this line runs.
	print("text_to_speech function:", inspect.getsource(text_to_speech))

	if __name__ == "__main__":
	    # Script entry point: try the phone service first and fall back to the
	    # plain web interface if starting it raises.
	    os.environ["GRADIO_SSR_MODE"] = "false"

	    # Report the FastRTC version when the package exposes one.
	    import fastrtc
	    fastrtc_version = getattr(fastrtc, '__version__', 'unknown')
	    print(f"FastRTC version: {fastrtc_version}")

	    # Diagnostic: show the signature of stream.fastphone, if it exists.
	    print("Starting phone service - attempting to inspect fastphone method...")
	    import inspect
	    if hasattr(stream, 'fastphone'):
	        fastphone_signature = inspect.signature(stream.fastphone)
	    else:
	        fastphone_signature = 'Not available'
	    print(f"FastPhone signature: {fastphone_signature}")

	    try:
	        # Every option is passed by keyword rather than by position.
	        phone_options = {
	            "token": os.getenv("HF_TOKEN"),
	            "host": "127.0.0.1",
	            "port": 8000,
	            "share_server_tls_certificate": True,
	        }
	        phone_service = stream.fastphone(**phone_options)
	        print("Phone service started successfully")
	    except Exception as exc:
	        print(f"Error starting phone service: {exc}")
	        print("Falling back to web interface...")
	        # The Gradio UI on port 7860 is the fallback.
	        stream.ui.launch(server_port=7860)

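	# Notebook-only setup, kept commented out for reference. Lines starting with
	# `!` are notebook shell commands and are not valid Python in this script: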
	# !pip install -q torch==2.0.1 torchaudio==2.0.2 gradio requests soundfile huggingface_hub
	# !wget -q https://github.com/seasalt-ai/csm/archive/refs/heads/main.zip
	# !unzip -q main.zip
	# !mv csm-main csm
	# !cd csm && pip install -e .
	#
	# # Set up directories
	# import os
	# import sys
	# sys.path.append("/content/csm")
	# voice_samples_dir = "/content/csm_voice_samples"
	# output_dir = "/content/csm_output"
	# os.makedirs(voice_samples_dir, exist_ok=True)
	# os.makedirs(output_dir, exist_ok=True)
	#
	# print("✅ Dependencies installed!")