Twelve2five committed
Commit c4620f8 · verified · Parent: 013f6a1

Update app.py

Files changed (1):
  app.py (+93 -141)
app.py CHANGED
@@ -4,12 +4,15 @@ import gradio as gr
 import numpy as np
 from dotenv import load_dotenv
 from elevenlabs import ElevenLabs
+from fastapi import FastAPI
 from fastrtc import (
     Stream,
     get_stt_model,
+    get_twilio_turn_credentials,
     ReplyOnPause,
     AdditionalOutputs
 )
+from gradio.utils import get_space
 
 import requests
 import io
@@ -77,156 +80,105 @@ def response(
     chatbot.append({"role": "assistant", "content": response_text})
 
     # Convert response to speech
-    for audio_data in text_to_speech(response_text):
-        if audio_data:
-            yield audio_data
+    if os.getenv("ELEVENLABS_API_KEY"):
+        try:
+            print(f"Generating ElevenLabs speech for response")
+
+            # Use the streaming API for better experience
+            for chunk in elevenlabs_client.text_to_speech.convert_as_stream(
+                text=response_text,
+                voice_id="Antoni",
+                model_id="eleven_monolingual_v1",
+                output_format="pcm_24000"
+            ):
+                audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
+                yield (24000, audio_array)
+
+        except Exception as e:
+            print(f"ElevenLabs error: {e}, falling back to gTTS")
+            # Fall back to gTTS
+            yield from use_gtts_for_text(response_text)
+    else:
+        # Fall back to gTTS
+        print("ElevenLabs API key not found, using gTTS...")
+        yield from use_gtts_for_text(response_text)
 
     yield AdditionalOutputs(chatbot)
 
-# Your existing helper functions
-def use_gtts_for_sentence(sentence):
-    """Helper function to generate speech with gTTS"""
-    try:
-        mp3_fp = io.BytesIO()
-        print(f"Using gTTS with en-us locale for sentence: {sentence[:20]}...")
-        tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
-        tts.write_to_fp(mp3_fp)
-        mp3_fp.seek(0)
-
-        data, samplerate = sf.read(mp3_fp)
-
-        if len(data.shape) > 1 and data.shape[1] > 1:
-            data = data[:, 0]
-
-        if samplerate != 24000:
-            data = np.interp(
-                np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
-                np.arange(len(data)),
-                data
-            )
-
-        data = (data * 32767).astype(np.int16)
-
-        if len(data) % 2 != 0:
-            data = np.append(data, [0])
-
-        chunk_size = 4800
-        for i in range(0, len(data), chunk_size):
-            chunk = data[i:i+chunk_size]
-            if len(chunk) > 0:
-                if len(chunk) % 2 != 0:
-                    chunk = np.append(chunk, [0])
-                chunk = chunk.reshape(1, -1)
-                yield (24000, chunk)
-    except Exception as e:
-        print(f"gTTS error: {e}")
-        yield None
-
-def text_to_speech(text):
-    """Convert text to speech using ElevenLabs or gTTS as fallback"""
+def use_gtts_for_text(text):
+    """Helper function to generate speech with gTTS for the entire text"""
     try:
+        # Split text into sentences for better results
        sentences = re.split(r'(?<=[.!?])\s+', text)
 
-        if os.getenv("ELEVENLABS_API_KEY"):
-            print("Using ElevenLabs for text-to-speech...")
-
-            for sentence in sentences:
-                if not sentence.strip():
-                    continue
+        for sentence in sentences:
+            if not sentence.strip():
+                continue
 
-                try:
-                    print(f"Generating ElevenLabs speech for: {sentence[:30]}...")
-
-                    audio_data = elevenlabs_client.generate(
-                        text=sentence,
-                        voice="Antoni",
-                        model="eleven_monolingual_v1"
-                    )
-
-                    mp3_fp = io.BytesIO(audio_data)
-                    data, samplerate = sf.read(mp3_fp)
-
-                    if len(data.shape) > 1 and data.shape[1] > 1:
-                        data = data[:, 0]
-
-                    if samplerate != 24000:
-                        data = np.interp(
-                            np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
-                            np.arange(len(data)),
-                            data
-                        )
-
-                    data = (data * 32767).astype(np.int16)
-
-                    if len(data) % 2 != 0:
-                        data = np.append(data, [0])
-
-                    chunk_size = 4800
-                    for i in range(0, len(data), chunk_size):
-                        chunk = data[i:i+chunk_size]
-                        if len(chunk) > 0:
-                            if len(chunk) % 2 != 0:
-                                chunk = np.append(chunk, [0])
-                            chunk = chunk.reshape(1, -1)
-                            yield (24000, chunk)
-
-                except Exception as e:
-                    print(f"ElevenLabs error: {e}, falling back to gTTS")
-                    for audio_chunk in use_gtts_for_sentence(sentence):
-                        if audio_chunk:
-                            yield audio_chunk
-        else:
-            print("ElevenLabs API key not found, using gTTS...")
-            for sentence in sentences:
-                if sentence.strip():
-                    for audio_chunk in use_gtts_for_sentence(sentence):
-                        if audio_chunk:
-                            yield audio_chunk
+            mp3_fp = io.BytesIO()
+            print(f"Using gTTS for sentence: {sentence[:30]}...")
+            tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
+            tts.write_to_fp(mp3_fp)
+            mp3_fp.seek(0)
+
+            data, samplerate = sf.read(mp3_fp)
+
+            if len(data.shape) > 1 and data.shape[1] > 1:
+                data = data[:, 0]
+
+            if samplerate != 24000:
+                data = np.interp(
+                    np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
+                    np.arange(len(data)),
+                    data
+                )
+
+            data = (data * 32767).astype(np.int16)
+
+            # Ensure buffer size is even
+            if len(data) % 2 != 0:
+                data = np.append(data, [0])
+
+            # Reshape and yield in chunks
+            chunk_size = 4800
+            for i in range(0, len(data), chunk_size):
+                chunk = data[i:i+chunk_size]
+                if len(chunk) > 0:
+                    if len(chunk) % 2 != 0:
+                        chunk = np.append(chunk, [0])
+                    chunk = chunk.reshape(1, -1)
+                    yield (24000, chunk)
     except Exception as e:
-        print(f"Exception in text_to_speech: {e}")
+        print(f"gTTS error: {e}")
        yield None
 
-# WebRTC configuration required for Hugging Face Spaces
-rtc_config = {
-    "iceServers": [
-        {"urls": ["stun:stun.l.google.com:19302"]},
-        {
-            "urls": ["turn:openrelay.metered.ca:80"],
-            "username": "openrelayproject",
-            "credential": "openrelayproject"
-        },
-        {
-            "urls": ["turn:openrelay.metered.ca:443"],
-            "username": "openrelayproject",
-            "credential": "openrelayproject"
-        },
-        {
-            "urls": ["turn:openrelay.metered.ca:443?transport=tcp"],
-            "username": "openrelayproject",
-            "credential": "openrelayproject"
-        }
-    ]
-}
+# Create Gradio chatbot and stream
+chatbot = gr.Chatbot(type="messages")
+stream = Stream(
+    modality="audio",
+    mode="send-receive",
+    handler=ReplyOnPause(response, input_sample_rate=16000),
+    additional_outputs_handler=lambda a, b: b,
+    additional_inputs=[chatbot],
+    additional_outputs=[chatbot],
+    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
+    concurrency_limit=5 if get_space() else None,
+    time_limit=90 if get_space() else None,
+    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"}
+)
 
-# Initialize Gradio app with a standard pattern that Hugging Face recognizes
-with gr.Blocks(title="LLM Voice Chat") as demo:
-    gr.Markdown("# LLM Voice Chat (Powered by DeepSeek & ElevenLabs)")
-
-    # Create a custom Stream component that Gradio can render
-    chatbot = gr.Chatbot(type="messages")
-
-    # This is the key part - use Stream as a component inside the Gradio app
-    stream_component = Stream(
-        modality="audio",
-        mode="send-receive",
-        handler=ReplyOnPause(response, input_sample_rate=16000),
-        additional_outputs_handler=lambda a, b: b,
-        additional_inputs=[chatbot],
-        additional_outputs=[chatbot],
-        rtc_configuration=rtc_config
-    )
-
-    # Make the stream component appear in the Gradio UI
-    stream_component.render()
+# Mount the Stream UI to the FastAPI app
+app = FastAPI()
+app = gr.mount_gradio_app(app, stream.ui, path="/")
 
-# The variable 'demo' will be picked up by Hugging Face Spaces
+# Only for local development
+if __name__ == "__main__":
+    os.environ["GRADIO_SSR_MODE"] = "false"
+
+    # Different launch modes based on environment
+    if (mode := os.getenv("MODE")) == "UI":
+        stream.ui.launch(server_port=7860)
+    elif mode == "PHONE":
+        stream.fastphone(host="0.0.0.0", port=7860)
+    else:
+        stream.ui.launch(server_port=7860)
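Note on the streaming TTS path above: with output_format="pcm_24000", convert_as_stream yields raw 16-bit little-endian PCM at 24 kHz, so each chunk can be framed for fastrtc with a plain np.frombuffer instead of round-tripping MP3 through soundfile as the removed code did. The committed code does assume every chunk carries a whole number of 16-bit samples; np.frombuffer raises on an odd-length buffer. A minimal defensive framing helper, as a sketch (the helper name and the synthetic chunk source are illustrative, not part of the commit):

import numpy as np

def pcm_chunks_to_frames(chunks, sample_rate=24000):
    """Frame raw 16-bit little-endian PCM chunks as (rate, ndarray) tuples,
    buffering any trailing odd byte so np.frombuffer never sees half a sample."""
    leftover = b""
    for chunk in chunks:
        data = leftover + chunk
        usable = len(data) - (len(data) % 2)  # whole 2-byte samples only
        leftover = data[usable:]
        if usable:
            frame = np.frombuffer(data[:usable], dtype=np.int16).reshape(1, -1)
            yield (sample_rate, frame)

# Synthetic chunks split at an odd byte boundary: 6 bytes -> 3 samples total.
fake_stream = [b"\x00\x01\x02", b"\x03\x04\x05"]
for rate, frame in pcm_chunks_to_frames(fake_stream):
    print(rate, frame.shape)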
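Note on the TURN change in the Stream constructor: the hard-coded openrelay.metered.ca relays are gone, and rtc_configuration now comes from fastrtc's get_twilio_turn_credentials(), applied only when running on a Space (get_space()); locally it stays None and default STUN behavior applies. get_twilio_turn_credentials() expects Twilio credentials, typically the TWILIO_ACCOUNT_SID and TWILIO_AUTH_TOKEN environment variables, so those need to be set as Space secrets. A guarded variant, as a sketch (the helper name is illustrative; the STUN fallback entry is taken from the removed rtc_config):

import os
from fastrtc import get_twilio_turn_credentials

def rtc_configuration_or_stun():
    """Use Twilio TURN when credentials are configured; otherwise fall back
    to the public STUN server the old rtc_config already relied on."""
    if os.getenv("TWILIO_ACCOUNT_SID") and os.getenv("TWILIO_AUTH_TOKEN"):
        return get_twilio_turn_credentials()
    return {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}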
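Note on serving: because the Stream UI is now mounted on a FastAPI instance, the module exposes a standard ASGI app instead of relying on a top-level gr.Blocks named demo, and the MODE environment variable picks between the plain Gradio UI and fastrtc's fastphone telephone interface when the file is run directly. Outside the __main__ branch, the mounted app can be served by any ASGI server, for example (a sketch assuming uvicorn is installed):

# Serves the mounted Gradio UI at "/" on port 7860.
import uvicorn

uvicorn.run("app:app", host="0.0.0.0", port=7860)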