Spaces:

Twelve2five
/

fastrtc-voice-assistant

Runtime error

App Files Files Community

Twelve2five committed on Mar 16

Commit

013f6a1

verified ·

1 Parent(s): 3ee34e5

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -75

app.py CHANGED Viewed

@@ -16,11 +16,6 @@ import io
 import soundfile as sf
 from gtts import gTTS
 import re
-import torch
-import torchaudio
-from huggingface_hub import login, hf_hub_download
-from deepseek import DeepSeekAPI
 # Load environment variables
 load_dotenv()
@@ -28,13 +23,33 @@ load_dotenv()
 # Initialize clients
 elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
 stt_model = get_stt_model()
-deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
-# Add this debug code temporarily to see what methods are available:
-print(dir(deepseek_client))
-# Set CSM to None to skip that option
-csm_generator = None
 def response(
     audio: tuple[int, np.ndarray],
@@ -53,7 +68,10 @@ def response(
     # Get AI response
     messages.append({"role": "user", "content": text})
-    response_text = get_deepseek_response(messages)
     # Add AI response to chat
     chatbot.append({"role": "assistant", "content": response_text})
@@ -65,27 +83,21 @@ def response(
     yield AdditionalOutputs(chatbot)
-# Your existing helper functions remain unchanged
 def use_gtts_for_sentence(sentence):
     """Helper function to generate speech with gTTS"""
     try:
-        # Process each sentence separately
         mp3_fp = io.BytesIO()
-        # Force US English
         print(f"Using gTTS with en-us locale for sentence: {sentence[:20]}...")
         tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
         tts.write_to_fp(mp3_fp)
         mp3_fp.seek(0)
-        # Process audio data
         data, samplerate = sf.read(mp3_fp)
-        # Convert to mono if stereo
         if len(data.shape) > 1 and data.shape[1] > 1:
             data = data[:, 0]
-        # Resample to 24000 Hz if needed
         if samplerate != 24000:
             data = np.interp(
                 np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
@@ -93,14 +105,11 @@ def use_gtts_for_sentence(sentence):
                 data
             )
-        # Convert to 16-bit integers
         data = (data * 32767).astype(np.int16)
-        # Ensure buffer size is even
         if len(data) % 2 != 0:
             data = np.append(data, [0])
-        # Reshape and yield in chunks
         chunk_size = 4800
         for i in range(0, len(data), chunk_size):
             chunk = data[i:i+chunk_size]
@@ -116,10 +125,8 @@ def use_gtts_for_sentence(sentence):
 def text_to_speech(text):
     """Convert text to speech using ElevenLabs or gTTS as fallback"""
     try:
-        # Split text into sentences for faster perceived response
         sentences = re.split(r'(?<=[.!?])\s+', text)
-        # Try ElevenLabs first
         if os.getenv("ELEVENLABS_API_KEY"):
             print("Using ElevenLabs for text-to-speech...")
@@ -130,22 +137,18 @@ def text_to_speech(text):
                 try:
                     print(f"Generating ElevenLabs speech for: {sentence[:30]}...")
-                    # Generate audio using ElevenLabs
                     audio_data = elevenlabs_client.generate(
                         text=sentence,
-                        voice="Antoni",  # You can change to any available voice
                         model="eleven_monolingual_v1"
                     )
-                    # Convert to numpy array
                     mp3_fp = io.BytesIO(audio_data)
                     data, samplerate = sf.read(mp3_fp)
-                    # Convert to mono if stereo
                     if len(data.shape) > 1 and data.shape[1] > 1:
                         data = data[:, 0]
-                    # Resample to 24000 Hz if needed
                     if samplerate != 24000:
                         data = np.interp(
                             np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
@@ -153,14 +156,11 @@ def text_to_speech(text):
                             data
                         )
-                    # Convert to 16-bit integers
                     data = (data * 32767).astype(np.int16)
-                    # Ensure buffer size is even
                     if len(data) % 2 != 0:
                         data = np.append(data, [0])
-                    # Reshape and yield in chunks
                     chunk_size = 4800
                     for i in range(0, len(data), chunk_size):
                         chunk = data[i:i+chunk_size]
@@ -172,12 +172,10 @@ def text_to_speech(text):
                 except Exception as e:
                     print(f"ElevenLabs error: {e}, falling back to gTTS")
-                    # Fall through to gTTS for this sentence
                     for audio_chunk in use_gtts_for_sentence(sentence):
                         if audio_chunk:
                             yield audio_chunk
         else:
-            # Fall back to gTTS
             print("ElevenLabs API key not found, using gTTS...")
             for sentence in sentences:
                 if sentence.strip():
@@ -188,28 +186,6 @@ def text_to_speech(text):
         print(f"Exception in text_to_speech: {e}")
         yield None
-def get_deepseek_response(messages):
-    url = "https://api.deepseek.com/v1/chat/completions"
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}"
-    }
-    payload = {
-        "model": "deepseek-chat",
-        "messages": messages,
-        "temperature": 0.7,
-        "max_tokens": 512
-    }
-    response = requests.post(url, json=payload, headers=headers)
-    # Check for error response
-    if response.status_code != 200:
-        print(f"DeepSeek API error: {response.status_code} - {response.text}")
-        return "I'm sorry, I encountered an error processing your request."
-    response_json = response.json()
-    return response_json["choices"][0]["message"]["content"]
 # WebRTC configuration required for Hugging Face Spaces
 rtc_config = {
     "iceServers": [
@@ -232,24 +208,25 @@ rtc_config = {
     ]
 }
-# Create Gradio interface with the required rtc_configuration
-chatbot = gr.Chatbot(type="messages")
-stream = Stream(
-    modality="audio",
-    mode="send-receive",
-    handler=ReplyOnPause(response, input_sample_rate=16000),
-    additional_outputs_handler=lambda a, b: b,
-    additional_inputs=[chatbot],
-    additional_outputs=[chatbot],
-    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"},
-    rtc_configuration=rtc_config  # Add the WebRTC configuration
-)
-# Create the Gradio interface without serving it
-ui = stream.ui
-# Export the Gradio app for Hugging Face Spaces to find it
-# In Hugging Face Spaces, this will be automatically served
-demo = ui
-# Do not include any server initialization code here - just export the Gradio app

 import soundfile as sf
 from gtts import gTTS
 import re
 # Load environment variables
 load_dotenv()
 # Initialize clients
 elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
 stt_model = get_stt_model()
+class DeepSeekAPI:
+    def __init__(self, api_key):
+        self.api_key = api_key
+    def chat_completion(self, messages, temperature=0.7, max_tokens=512):
+        url = "https://api.deepseek.com/v1/chat/completions"
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+        payload = {
+            "model": "deepseek-chat",
+            "messages": messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens
+        }
+        response = requests.post(url, json=payload, headers=headers)
+        # Check for error response
+        if response.status_code != 200:
+            print(f"DeepSeek API error: {response.status_code} - {response.text}")
+            return {"choices": [{"message": {"content": "I'm sorry, I encountered an error processing your request."}}]}
+        return response.json()
+deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
 def response(
     audio: tuple[int, np.ndarray],
     # Get AI response
     messages.append({"role": "user", "content": text})
+    # Call DeepSeek API
+    response_data = deepseek_client.chat_completion(messages)
+    response_text = response_data["choices"][0]["message"]["content"]
     # Add AI response to chat
     chatbot.append({"role": "assistant", "content": response_text})
     yield AdditionalOutputs(chatbot)
+# Your existing helper functions
 def use_gtts_for_sentence(sentence):
     """Helper function to generate speech with gTTS"""
     try:
         mp3_fp = io.BytesIO()
         print(f"Using gTTS with en-us locale for sentence: {sentence[:20]}...")
         tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
         tts.write_to_fp(mp3_fp)
         mp3_fp.seek(0)
         data, samplerate = sf.read(mp3_fp)
         if len(data.shape) > 1 and data.shape[1] > 1:
             data = data[:, 0]
         if samplerate != 24000:
             data = np.interp(
                 np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
                 data
             )
         data = (data * 32767).astype(np.int16)
         if len(data) % 2 != 0:
             data = np.append(data, [0])
         chunk_size = 4800
         for i in range(0, len(data), chunk_size):
             chunk = data[i:i+chunk_size]
 def text_to_speech(text):
     """Convert text to speech using ElevenLabs or gTTS as fallback"""
     try:
         sentences = re.split(r'(?<=[.!?])\s+', text)
         if os.getenv("ELEVENLABS_API_KEY"):
             print("Using ElevenLabs for text-to-speech...")
                 try:
                     print(f"Generating ElevenLabs speech for: {sentence[:30]}...")
                     audio_data = elevenlabs_client.generate(
                         text=sentence,
+                        voice="Antoni",
                         model="eleven_monolingual_v1"
                     )
                     mp3_fp = io.BytesIO(audio_data)
                     data, samplerate = sf.read(mp3_fp)
                     if len(data.shape) > 1 and data.shape[1] > 1:
                         data = data[:, 0]
                     if samplerate != 24000:
                         data = np.interp(
                             np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
                             data
                         )
                     data = (data * 32767).astype(np.int16)
                     if len(data) % 2 != 0:
                         data = np.append(data, [0])
                     chunk_size = 4800
                     for i in range(0, len(data), chunk_size):
                         chunk = data[i:i+chunk_size]
                 except Exception as e:
                     print(f"ElevenLabs error: {e}, falling back to gTTS")
                     for audio_chunk in use_gtts_for_sentence(sentence):
                         if audio_chunk:
                             yield audio_chunk
         else:
             print("ElevenLabs API key not found, using gTTS...")
             for sentence in sentences:
                 if sentence.strip():
         print(f"Exception in text_to_speech: {e}")
         yield None
 # WebRTC configuration required for Hugging Face Spaces
 rtc_config = {
     "iceServers": [
     ]
 }
+# Initialize Gradio app with a standard pattern that Hugging Face recognizes
+with gr.Blocks(title="LLM Voice Chat") as demo:
+    gr.Markdown("# LLM Voice Chat (Powered by DeepSeek & ElevenLabs)")
+    # Create a custom Stream component that Gradio can render
+    chatbot = gr.Chatbot(type="messages")
+    # This is the key part - use Stream as a component inside the Gradio app
+    stream_component = Stream(
+        modality="audio",
+        mode="send-receive",
+        handler=ReplyOnPause(response, input_sample_rate=16000),
+        additional_outputs_handler=lambda a, b: b,
+        additional_inputs=[chatbot],
+        additional_outputs=[chatbot],
+        rtc_configuration=rtc_config
+    )
+    # Make the stream component appear in the Gradio UI
+    stream_component.render()
+# The variable 'demo' will be picked up by Hugging Face Spaces