Twelve2five committed (verified)
Commit 797af4f · 1 Parent(s): c0c2699

Update app.py

Files changed (1):
  1. app.py +115 -80
app.py CHANGED
@@ -1,14 +1,20 @@
 import os
-import numpy as np
+import time
 import gradio as gr
+import numpy as np
 from dotenv import load_dotenv
 from elevenlabs import ElevenLabs
-from fastrtc import ReplyOnPause
+from fastrtc import (
+    Stream,
+    get_stt_model,
+    ReplyOnPause,
+    AdditionalOutputs
+)
 import logging
 import requests
+import io
 import soundfile as sf
 from gtts import gTTS
-import io
 import re
 
 # Configure logging
@@ -18,8 +24,9 @@ logger = logging.getLogger("voice-assistant")
 # Load environment variables
 load_dotenv()
 
-# Initialize ElevenLabs client
+# Initialize clients
 elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
+stt_model = get_stt_model()
 
 class DeepSeekAPI:
     def __init__(self, api_key):
@@ -46,9 +53,64 @@ class DeepSeekAPI:
 
         return response.json()
 
-# Initialize DeepSeek client
 deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
 
+# Define handler function for FastRTC Stream
+def response(
+    audio: tuple[int, np.ndarray],
+    chatbot=None,
+):
+    # Initialize chatbot if None
+    chatbot = chatbot or []
+    messages = [{"role": msg[0], "content": msg[1]} for msg in chatbot] if chatbot else []
+
+    # Convert speech to text
+    text = stt_model.stt(audio)
+    logger.info(f"User said: {text}")
+
+    # Add user message to chat
+    chatbot.append(("user", text))
+    yield AdditionalOutputs(chatbot)
+
+    # Get AI response
+    formatted_messages = []
+    for role, content in chatbot:
+        formatted_messages.append({"role": "user" if role == "user" else "assistant", "content": content})
+
+    # Call DeepSeek API
+    response_data = deepseek_client.chat_completion(formatted_messages)
+    response_text = response_data["choices"][0]["message"]["content"]
+    logger.info(f"DeepSeek response: {response_text[:50]}...")
+
+    # Add AI response to chat
+    chatbot.append(("assistant", response_text))
+
+    # Convert response to speech
+    if os.getenv("ELEVENLABS_API_KEY"):
+        try:
+            logger.info("Using ElevenLabs for speech generation")
+
+            # Use the streaming API for better experience
+            for chunk in elevenlabs_client.text_to_speech.convert_as_stream(
+                text=response_text,
+                voice_id="Antoni",
+                model_id="eleven_monolingual_v1",
+                output_format="pcm_24000"
+            ):
+                audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
+                yield (24000, audio_array)
+
+        except Exception as e:
+            logger.error(f"ElevenLabs error: {e}, falling back to gTTS")
+            # Fall back to gTTS
+            yield from use_gtts_for_text(response_text)
+    else:
+        # Fall back to gTTS
+        logger.info("ElevenLabs API key not found, using gTTS...")
+        yield from use_gtts_for_text(response_text)
+
+    yield AdditionalOutputs(chatbot)
+
 def use_gtts_for_text(text):
     """Helper function to generate speech with gTTS for the entire text"""
     try:
@@ -96,7 +158,7 @@ def use_gtts_for_text(text):
         logger.error(f"gTTS error: {e}")
         yield None
 
-# Comprehensive WebRTC configuration
+# Enhanced WebRTC configuration
 rtc_configuration = {
     "iceServers": [
         {"urls": ["stun:stun.l.google.com:19302"]},
@@ -112,90 +174,63 @@ rtc_configuration = {
             "credential": "openrelayproject"
         }
     ],
+    "iceCandidatePoolSize": 10
 }
 
-# Define the chat history function to handle messages
-def process_message(audio, history):
-    from fastrtc import get_stt_model
-
-    # Get the STT model instance
-    stt_model = get_stt_model()
-
-    # Convert speech to text
-    user_message = stt_model.stt(audio)
-    logger.info(f"User said: {user_message}")
-
-    # Add user message to history
-    history = history + [(user_message, None)]
-
-    # Prepare messages for DeepSeek
-    messages = []
-    for user, bot in history:
-        messages.append({"role": "user", "content": user})
-        if bot:
-            messages.append({"role": "assistant", "content": bot})
-
-    # Get AI response
-    response_data = deepseek_client.chat_completion(messages)
-    bot_message = response_data["choices"][0]["message"]["content"]
-    logger.info(f"DeepSeek response: {bot_message[:50]}...")
-
-    # Update history
-    history[-1] = (user_message, bot_message)
-
-    # Generate audio response
-    if os.getenv("ELEVENLABS_API_KEY"):
-        try:
-            logger.info("Using ElevenLabs for speech generation")
-            audio_bytes = elevenlabs_client.text_to_speech.convert(
-                text=bot_message,
-                voice_id="Antoni",
-                model_id="eleven_monolingual_v1"
-            )
-
-            # Save to temporary file and read back
-            with open("temp_response.mp3", "wb") as f:
-                f.write(audio_bytes)
-
-            data, sr = sf.read("temp_response.mp3")
-            os.remove("temp_response.mp3")
-
-            # Convert to the right format if needed
-            if len(data.shape) > 1:
-                data = data[:, 0]  # Take first channel if stereo
-
-            audio_out = (sr, data)
-
-        except Exception as e:
-            logger.error(f"ElevenLabs error: {e}, falling back to gTTS")
-            # TODO: Implement gTTS fallback for this function
-            audio_out = None
-    else:
-        logger.info("No ElevenLabs API key, audio response not available")
-        audio_out = None
-
-    return history, audio_out
-
-# Create the Gradio interface - much simpler than before
-demo = gr.Interface(
-    fn=process_message,
-    inputs=[
-        gr.Audio(sources=["microphone"], type="numpy"),
-        gr.State([])
-    ],
-    outputs=[
-        gr.Chatbot(),
-        gr.Audio(label="AI Voice Response")
-    ],
-    title="LLM Voice Chat (Powered by DeepSeek & ElevenLabs)",
-    description="Speak into the microphone and get AI responses in text and speech.",
-    examples=[],
-    cache_examples=False
-)
-
-# Launch the app
+# Build the interface - we need separate Blocks for chatbot and Stream
+with gr.Blocks(title="LLM Voice Assistant") as demo:
+    gr.Markdown("# LLM Voice Chat (Powered by DeepSeek & ElevenLabs)")
+    gr.Markdown("Click the microphone button to start speaking")
+
+    # Create the main chatbot display
+    chatbot = gr.Chatbot(label="Conversation")
+
+    # Create the Stream component outside of the Blocks context to avoid conflicts
+    # We'll insert it into the interface later
+    stream_container = gr.HTML("<div id='stream-placeholder'>Loading WebRTC component...</div>")
+
+# Create the FastRTC Stream separately
+stream = Stream(
+    modality="audio",
+    mode="send-receive",
+    handler=ReplyOnPause(response, input_sample_rate=16000),
+    additional_outputs_handler=lambda a, b: b,
+    additional_inputs=[chatbot],
+    additional_outputs=[chatbot],
+    rtc_configuration=rtc_configuration
+)
+
+# Custom mount function
+def mount_components():
+    import gradio as gr
+    import os
+
+    # Get the main interface
+    main_interface = demo
+
+    # Add the Stream interface to a custom Blocks
+    with gr.Blocks(analytics_enabled=False) as stream_interface:
+        stream.render()
+
+    # Create a custom app that hosts both interfaces on different routes
+    app = gr.routes.App()
+    app.add_route("/", main_interface)
+    app.add_route("/stream", stream_interface)
+
+    # Launch the combined app
+    app.launch()
+
+# Launch with the mount function
 if __name__ == "__main__":
+    # Local development
     demo.launch(share=True)
+
+    # Launch the Stream component separately for local development
+    stream.ui.launch(server_port=7861, share=True)
 else:
     # For Hugging Face Spaces
-    demo.launch()
+    # Initialize FastRTC in Spaces
+    app = gr.mount_gradio_app(stream.app, demo, path="/")
+
+    # Launch both components
+    gr.launch_app(app)
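
For reference, the new `response()` generator follows the handler contract that fastrtc's `ReplyOnPause` drives: it receives `(sample_rate, np.ndarray)` audio, yields `(sample_rate, np.ndarray)` chunks back to the browser, and yields `AdditionalOutputs(...)` to update extra Gradio components such as the chatbot. The sketch below is a minimal, self-contained illustration of that contract only; it assumes the `Stream`, `ReplyOnPause`, `AdditionalOutputs`, and `stream.ui` APIs exactly as they are used in the diff above, and the `echo` handler and its history entries are purely illustrative.

```python
# Minimal sketch of the fastrtc handler contract (assumptions: the Stream,
# ReplyOnPause, AdditionalOutputs and stream.ui APIs behave as used in the
# diff above; the echo handler is illustrative only).
import gradio as gr
import numpy as np
from fastrtc import Stream, ReplyOnPause, AdditionalOutputs

def echo(audio: tuple[int, np.ndarray], history=None):
    """Echo the caller's audio back and log its duration in the chat history."""
    history = history or []
    sample_rate, samples = audio
    history.append(("user", f"received {samples.shape[-1] / sample_rate:.1f}s of audio"))
    # Audio replies are yielded as (sample_rate, np.ndarray) tuples.
    yield (sample_rate, samples)
    # AdditionalOutputs routes values to the declared additional_outputs components.
    yield AdditionalOutputs(history)

chatbot = gr.Chatbot(label="Conversation")
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(echo),
    additional_outputs_handler=lambda old, new: new,  # keep the latest history
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
)

if __name__ == "__main__":
    stream.ui.launch()  # fastrtc's built-in Gradio UI, as used in the diff
```

The commit fills in the same shape with `stt_model.stt()` for transcription, DeepSeek for the text reply, and ElevenLabs or gTTS chunks for the spoken audio, interleaved with `AdditionalOutputs(chatbot)` updates.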