Spaces: Runtime error
import os
import time
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from fastrtc import (
    Stream,
    get_stt_model,
    ReplyOnPause,
    AdditionalOutputs
)
import requests
import io
import soundfile as sf
from gtts import gTTS
import re
import inspect
import torch
import torchaudio
import sys
from huggingface_hub import login, hf_hub_download
from deepseek import DeepSeekAPI
# Load environment variables
load_dotenv()

# Add this RTC configuration for Hugging Face Spaces.
# This is critical for WebRTC to work properly in Spaces.
rtc_config = {
    "iceServers": [
        {"urls": ["stun:stun.l.google.com:19302", "stun:stun1.l.google.com:19302"]}
    ]
}
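# Hypothetical extension (not part of the original app): STUN alone can fail
# behind symmetric NATs or strict firewalls. If TURN credentials are available,
# they can be appended to the same config. TURN_URL, TURN_USERNAME, and
# TURN_PASSWORD are assumed environment variable names, not ones this app defines.
if os.getenv("TURN_URL"):
    rtc_config["iceServers"].append({
        "urls": [os.getenv("TURN_URL")],
        "username": os.getenv("TURN_USERNAME", ""),
        "credential": os.getenv("TURN_PASSWORD", ""),
    })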
# Initialize clients
elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
stt_model = get_stt_model()
deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))

# Temporary debug output to see which methods the DeepSeek client exposes
print(dir(deepseek_client))

# Set CSM to None to skip that option
csm_generator = None
def response(
    audio: tuple[int, np.ndarray],
    chatbot: list[dict] | None = None,
):
    chatbot = chatbot or []
    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]

    # Convert speech to text
    text = stt_model.stt(audio)
    print("prompt:", text)

    # Add user message to chat
    chatbot.append({"role": "user", "content": text})
    yield AdditionalOutputs(chatbot)

    # Get AI response
    messages.append({"role": "user", "content": text})
    response_text = get_deepseek_response(messages)

    # Add AI response to chat
    chatbot.append({"role": "assistant", "content": response_text})

    # Convert response to speech
    for audio_data in text_to_speech(response_text):
        if audio_data:
            yield audio_data

    yield AdditionalOutputs(chatbot)
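# Note on the generator protocol above (added comment): fastrtc plays any
# yielded (sample_rate, np.ndarray) tuple back to the caller as audio, while
# yielded AdditionalOutputs objects are routed to additional_outputs_handler
# below so the chatbot component refreshes mid-response.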
# Create Gradio interface
chatbot = gr.Chatbot(type="messages", height=500, label="Conversation")

# Define enhanced UI arguments
enhanced_ui_args = {
    "title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)",
    "description": "Speak after clicking the microphone button below. Your conversation will appear in the chat.",
    "theme": gr.themes.Soft(),
    "css": """
    .gradio-container {
        min-height: 600px;
    }
    #chatbot {
        min-height: 400px;
    }
    """
}
# Create Stream with enhanced UI args and the RTC configuration
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(response, input_sample_rate=16000),
    additional_outputs_handler=lambda a, b: b,
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    ui_args=enhanced_ui_args,
    rtc_configuration=rtc_config
)
# Create FastAPI app and mount stream
from fastapi import FastAPI

app = FastAPI()
app = gr.mount_gradio_app(app, stream.ui, path="/")
stream.mount(app)  # Mount the stream's endpoints for telephone/fastphone integration
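# Usage note (added, not in the original app): when running locally rather than
# on Spaces, fastrtc can also attach a temporary phone number to the same
# handler via stream.fastphone(); it is not needed for the Spaces deployment.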
# The DeepSeekAPI client's interface is unclear (see the dir() debug output
# above), so fall back to direct HTTP requests against the chat completions
# endpoint:
def get_deepseek_response(messages):
    url = "https://api.deepseek.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}"
    }
    payload = {
        "model": "deepseek-chat",
        "messages": messages,
        "temperature": 0.7,
        "max_tokens": 512
    }
    # timeout added so a stalled request cannot hang the audio handler forever
    response = requests.post(url, json=payload, headers=headers, timeout=30)

    # Check for an error response
    if response.status_code != 200:
        print(f"DeepSeek API error: {response.status_code} - {response.text}")
        return "I'm sorry, I encountered an error processing your request."

    response_json = response.json()
    return response_json["choices"][0]["message"]["content"]
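# Latency sketch (my addition, untested against the live API): DeepSeek's
# endpoint is OpenAI-compatible, so it should also accept "stream": true and
# return SSE chunks; that would let text_to_speech start on the first sentence
# before the full reply arrives. The SSE line format here is assumed from the
# OpenAI convention.
import json

def stream_deepseek_response(messages):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}"
    }
    payload = {"model": "deepseek-chat", "messages": messages,
               "temperature": 0.7, "max_tokens": 512, "stream": True}
    with requests.post("https://api.deepseek.com/v1/chat/completions",
                       json=payload, headers=headers, stream=True, timeout=60) as r:
        for line in r.iter_lines():
            if not line or not line.startswith(b"data: "):
                continue
            data = line[len(b"data: "):]
            if data == b"[DONE]":
                break
            delta = json.loads(data)["choices"][0]["delta"]
            if delta.get("content"):
                yield delta["content"]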
# Helper function for gTTS
def use_gtts_for_sentence(sentence):
    """Helper function to generate speech with gTTS"""
    try:
        # Process each sentence separately
        mp3_fp = io.BytesIO()

        # Force US English. Note: recent gTTS versions reject the deprecated
        # 'en-us' tag; lang='en' with tld='com' selects the US voice instead.
        print(f"Using gTTS (US English) for sentence: {sentence[:20]}...")
        tts = gTTS(text=sentence, lang='en', tld='com', slow=False)
        tts.write_to_fp(mp3_fp)
        mp3_fp.seek(0)

        # Decode the MP3 data (requires libsndfile with MP3 support; see the
        # note after text_to_speech below)
        data, samplerate = sf.read(mp3_fp)

        # Convert to mono if stereo
        if data.ndim > 1:
            data = data[:, 0]

        # Resample to 24000 Hz if needed (simple linear interpolation)
        if samplerate != 24000:
            data = np.interp(
                np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
                np.arange(len(data)),
                data
            )

        # Convert to 16-bit integers (clip first so values at the rails
        # cannot overflow int16)
        data = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)

        # Ensure buffer size is even
        if len(data) % 2 != 0:
            data = np.append(data, [0])

        # Reshape and yield in chunks
        chunk_size = 4800
        for i in range(0, len(data), chunk_size):
            chunk = data[i:i+chunk_size]
            if len(chunk) > 0:
                if len(chunk) % 2 != 0:
                    chunk = np.append(chunk, [0])
                chunk = chunk.reshape(1, -1)
                yield (24000, chunk)
    except Exception as e:
        print(f"gTTS error: {e}")
        yield None
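# Optional refactor sketch (my addition): the mono/resample/int16/chunk
# pipeline above is duplicated in text_to_speech below. A shared helper could
# replace both copies; scipy is an assumed extra dependency, used here because
# resample_poly applies an anti-aliasing filter that plain np.interp lacks.
def pcm_chunks_24k(data, samplerate, chunk_size=4800):
    from scipy.signal import resample_poly
    if data.ndim > 1:
        data = data[:, 0]                               # stereo -> mono
    if samplerate != 24000:
        data = resample_poly(data, 24000, samplerate)   # filtered resample
    data = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
    for i in range(0, len(data), chunk_size):
        chunk = data[i:i + chunk_size]
        if len(chunk) % 2 != 0:
            chunk = np.append(chunk, [0])               # keep buffer length even
        yield (24000, chunk.reshape(1, -1))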
# Replace the text_to_speech function with this version
def text_to_speech(text):
    """Convert text to speech using ElevenLabs or gTTS as fallback"""
    try:
        # Split text into sentences for faster perceived response
        sentences = re.split(r'(?<=[.!?])\s+', text)

        # Try ElevenLabs first
        if os.getenv("ELEVENLABS_API_KEY"):
            print("Using ElevenLabs for text-to-speech...")
            for sentence in sentences:
                if not sentence.strip():
                    continue
                try:
                    print(f"Generating ElevenLabs speech for: {sentence[:30]}...")

                    # Generate audio using ElevenLabs
                    audio_data = elevenlabs_client.generate(
                        text=sentence,
                        voice="Antoni",  # You can change to any available voice
                        model="eleven_monolingual_v1"
                    )

                    # Recent ElevenLabs SDKs return an iterator of byte chunks
                    # rather than a single bytes object, so join before wrapping
                    # in BytesIO
                    if not isinstance(audio_data, (bytes, bytearray)):
                        audio_data = b"".join(audio_data)
                    mp3_fp = io.BytesIO(audio_data)
                    data, samplerate = sf.read(mp3_fp)

                    # Convert to mono if stereo
                    if data.ndim > 1:
                        data = data[:, 0]

                    # Resample to 24000 Hz if needed
                    if samplerate != 24000:
                        data = np.interp(
                            np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
                            np.arange(len(data)),
                            data
                        )

                    # Convert to 16-bit integers
                    data = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)

                    # Ensure buffer size is even
                    if len(data) % 2 != 0:
                        data = np.append(data, [0])

                    # Reshape and yield in chunks
                    chunk_size = 4800
                    for i in range(0, len(data), chunk_size):
                        chunk = data[i:i+chunk_size]
                        if len(chunk) > 0:
                            if len(chunk) % 2 != 0:
                                chunk = np.append(chunk, [0])
                            chunk = chunk.reshape(1, -1)
                            yield (24000, chunk)
                except Exception as e:
                    print(f"ElevenLabs error: {e}, falling back to gTTS")
                    # Fall through to gTTS for this sentence
                    for audio_chunk in use_gtts_for_sentence(sentence):
                        if audio_chunk:
                            yield audio_chunk
        else:
            # Fall back to gTTS
            print("ElevenLabs API key not found, using gTTS...")
            for sentence in sentences:
                if sentence.strip():
                    for audio_chunk in use_gtts_for_sentence(sentence):
                        if audio_chunk:
                            yield audio_chunk
    except Exception as e:
        print(f"Exception in text_to_speech: {e}")
        yield None
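# Caveat (added note): sf.read can only decode the MP3 bytes produced above if
# the underlying libsndfile is version 1.1.0 or newer. On images without MP3
# support, a pydub/ffmpeg fallback like this hypothetical helper works; pydub
# and ffmpeg are assumed extra dependencies the app does not currently list.
def decode_mp3_with_pydub(mp3_fp):
    from pydub import AudioSegment
    seg = AudioSegment.from_file(mp3_fp, format="mp3")
    # Normalize integer samples to float in [-1, 1]
    data = np.array(seg.get_array_of_samples(), dtype=np.float32) / (1 << (8 * seg.sample_width - 1))
    if seg.channels > 1:
        data = data.reshape(-1, seg.channels)[:, 0]  # interleaved -> mono
    return data, seg.frame_rate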
# Add this debug statement AFTER the function definition
print("text_to_speech function:", inspect.getsource(text_to_speech))

if __name__ == "__main__":
    os.environ["GRADIO_SSR_MODE"] = "false"

    # Check FastRTC version
    import fastrtc
    print(f"FastRTC version: {fastrtc.__version__ if hasattr(fastrtc, '__version__') else 'unknown'}")

    # Use a simpler startup method compatible with Hugging Face Spaces
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)