Spaces:

Twelve2five
/

fastrtc-voice-assistant

Runtime error

App Files Files Community

Twelve2five committed on Mar 16

Commit

40785f3

verified ·

1 Parent(s): ee0d47e

Update app.py

Browse files

Files changed (1) hide show

app.py +170 -2

app.py CHANGED Viewed

@@ -65,7 +65,174 @@ def response(
     yield AdditionalOutputs(chatbot)
-# Create Gradio interface
 chatbot = gr.Chatbot(type="messages")
 stream = Stream(
     modality="audio",
@@ -74,7 +241,8 @@ stream = Stream(
     additional_outputs_handler=lambda a, b: b,
     additional_inputs=[chatbot],
     additional_outputs=[chatbot],
-    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"}
 )
 # FastAPI app with Gradio interface

     yield AdditionalOutputs(chatbot)
+# Helper functions: text-to-speech (ElevenLabs with gTTS fallback) and DeepSeek chat completion
+def _pcm_chunks(data, samplerate):
+    """Yield ``(24000, chunk)`` tuples of 16-bit PCM from decoded audio samples.
+    Shared by the gTTS and ElevenLabs paths so both produce identical output.
+    Args:
+        data: NumPy array of decoded samples, 1-D or ``(frames, channels)``;
+            assumed to be floats nominally in [-1.0, 1.0] (as read via ``sf.read``).
+        samplerate: Sample rate of ``data`` in Hz.
+    Yields:
+        ``(24000, chunk)`` where ``chunk`` is an ``int16`` array of shape ``(1, n)``
+        holding at most 4800 samples.
+    """
+    target_rate = 24000
+    chunk_size = 4800
+    # Keep only the first channel of multi-channel audio
+    if data.ndim > 1:
+        data = data[:, 0]
+    # Linear-interpolation resample to the target rate
+    if samplerate != target_rate:
+        n_out = int(len(data) * target_rate / samplerate)
+        data = np.interp(
+            # The last valid source index is len(data) - 1; sampling up to len(data)
+            # would read past the end and shift every output sample slightly
+            np.linspace(0, len(data) - 1, n_out),
+            np.arange(len(data)),
+            data,
+        )
+    # Clip first so out-of-range samples saturate instead of wrapping in the int16 cast
+    data = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
+    # Pad to an even sample count; with an even chunk_size every chunk is then even too
+    if len(data) % 2 != 0:
+        data = np.append(data, np.int16(0))
+    for start in range(0, len(data), chunk_size):
+        yield (target_rate, data[start:start + chunk_size].reshape(1, -1))
+def use_gtts_for_sentence(sentence):
+    """Generate speech for one sentence with gTTS.
+    Args:
+        sentence: Text to synthesize.
+    Yields:
+        ``(24000, chunk)`` tuples of 16-bit PCM audio, or a single ``None`` if
+        synthesis or decoding raises (the error is printed, not propagated).
+    """
+    try:
+        mp3_fp = io.BytesIO()
+        # Force US English
+        print(f"Using gTTS with en-us locale for sentence: {sentence[:20]}...")
+        tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
+        tts.write_to_fp(mp3_fp)
+        mp3_fp.seek(0)
+        data, samplerate = sf.read(mp3_fp)
+        yield from _pcm_chunks(data, samplerate)
+    except Exception as e:
+        print(f"gTTS error: {e}")
+        yield None
+def text_to_speech(text):
+    """Convert text to speech using ElevenLabs, with gTTS as the fallback.
+    The text is split into sentences so audio for the first sentence can be
+    yielded before later ones are synthesized. ElevenLabs is used when the
+    ``ELEVENLABS_API_KEY`` environment variable is set; any sentence that fails
+    there is synthesized with gTTS instead.
+    Args:
+        text: Text to synthesize.
+    Yields:
+        ``(24000, chunk)`` tuples of 16-bit PCM audio, or ``None`` if an
+        unexpected error escapes the per-sentence handling.
+    """
+    try:
+        # Split text into sentences for faster perceived response
+        sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
+        use_elevenlabs = bool(os.getenv("ELEVENLABS_API_KEY"))
+        if use_elevenlabs:
+            print("Using ElevenLabs for text-to-speech...")
+        else:
+            print("ElevenLabs API key not found, using gTTS...")
+        for sentence in sentences:
+            chunks = None
+            if use_elevenlabs:
+                try:
+                    print(f"Generating ElevenLabs speech for: {sentence[:30]}...")
+                    audio_data = elevenlabs_client.generate(
+                        text=sentence,
+                        voice="Antoni",  # You can change to any available voice
+                        model="eleven_monolingual_v1"
+                    )
+                    # The client may hand back an iterator of byte chunks rather than
+                    # bytes; io.BytesIO needs a single bytes-like object
+                    if not isinstance(audio_data, (bytes, bytearray)):
+                        audio_data = b"".join(audio_data)
+                    data, samplerate = sf.read(io.BytesIO(audio_data))
+                    # Convert the whole sentence before yielding anything, so a failure
+                    # never leaves a half-played sentence to be repeated by the fallback
+                    chunks = list(_pcm_chunks(data, samplerate))
+                except Exception as e:
+                    print(f"ElevenLabs error: {e}, falling back to gTTS")
+            if chunks is None:
+                # gTTS signals failure with None; drop it so only audio is yielded
+                chunks = (
+                    audio_chunk
+                    for audio_chunk in use_gtts_for_sentence(sentence)
+                    if audio_chunk is not None
+                )
+            yield from chunks
+    except Exception as e:
+        print(f"Exception in text_to_speech: {e}")
+        yield None
+def get_deepseek_response(messages):
+    """Request a chat completion from the DeepSeek API and return the reply text.
+    Args:
+        messages: Chat messages, sent unchanged as the ``messages`` field of the payload.
+    Returns:
+        The ``content`` of the first choice's message. A fixed apology string is
+        returned instead when the request fails, the status is not 200, or the
+        response body does not have the expected shape.
+    """
+    fallback = "I'm sorry, I encountered an error processing your request."
+    url = "https://api.deepseek.com/v1/chat/completions"
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}"
+    }
+    payload = {
+        "model": "deepseek-chat",
+        "messages": messages,
+        "temperature": 0.7,
+        "max_tokens": 512
+    }
+    try:
+        # requests waits indefinitely by default; bound it so a stalled call
+        # cannot hang the caller
+        response = requests.post(url, json=payload, headers=headers, timeout=30)
+    except requests.RequestException as e:
+        print(f"DeepSeek API request failed: {e}")
+        return fallback
+    # Check for error response
+    if response.status_code != 200:
+        print(f"DeepSeek API error: {response.status_code} - {response.text}")
+        return fallback
+    try:
+        return response.json()["choices"][0]["message"]["content"]
+    except (ValueError, KeyError, IndexError, TypeError) as e:
+        # Invalid JSON or a body without choices[0].message.content
+        print(f"DeepSeek API returned an unexpected response: {e}")
+        return fallback
+# ICE server list (one STUN server plus three TURN relay endpoints) passed to
+# Stream as rtc_configuration; required for WebRTC on Hugging Face Spaces
+rtc_config = {
+    "iceServers": [
+        {"urls": ["stun:stun.l.google.com:19302"]},
+        # The TURN endpoints differ only in URL and share one set of credentials
+        *(
+            {
+                "urls": [turn_url],
+                "username": "openrelayproject",
+                "credential": "openrelayproject",
+            }
+            for turn_url in (
+                "turn:openrelay.metered.ca:80",
+                "turn:openrelay.metered.ca:443",
+                "turn:openrelay.metered.ca:443?transport=tcp",
+            )
+        ),
+    ]
+}
+# Create Gradio interface with the required rtc_configuration
 chatbot = gr.Chatbot(type="messages")
 stream = Stream(
     modality="audio",
     additional_outputs_handler=lambda a, b: b,
     additional_inputs=[chatbot],
     additional_outputs=[chatbot],
+    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"},
+    rtc_configuration=rtc_config  # Add the WebRTC configuration
 )
 # FastAPI app with Gradio interface