Twelve2five committed
Commit c0c2699 · verified · Parent(s): f558bc0

Update app.py

Files changed (1):
  app.py +84 -145
app.py CHANGED
@@ -1,39 +1,25 @@
 import os
-import time
-import gradio as gr
 import numpy as np
+import gradio as gr
 from dotenv import load_dotenv
 from elevenlabs import ElevenLabs
-from fastrtc import (
-    Stream,
-    get_stt_model,
-    ReplyOnPause,
-    AdditionalOutputs
-)
-from gradio.utils import get_space
-
+from fastrtc import ReplyOnPause
+import logging
 import requests
-import io
 import soundfile as sf
 from gtts import gTTS
+import io
 import re
-import logging
 
-# Set up logging for WebRTC debugging
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger("fastrtc-voice-assistant")
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("voice-assistant")
 
 # Load environment variables
 load_dotenv()
 
-# Enable WebRTC debug tracing
-os.environ["WEBRTC_TRACE"] = "WEBRTC_TRACE_ALL"
-
-# Initialize clients
-logger.info("Initializing clients...")
+# Initialize ElevenLabs client
 elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
-stt_model = get_stt_model()
-logger.info("Clients initialized")
 
 class DeepSeekAPI:
     def __init__(self, api_key):
@@ -60,62 +46,9 @@ class DeepSeekAPI:
 
         return response.json()
 
+# Initialize DeepSeek client
 deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
 
-def response(
-    audio: tuple[int, np.ndarray],
-    chatbot: list[dict] | None = None,
-):
-    chatbot = chatbot or []
-    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
-
-    # Convert speech to text
-    logger.info("Converting speech to text...")
-    text = stt_model.stt(audio)
-    logger.info(f"User said: {text}")
-
-    # Add user message to chat
-    chatbot.append({"role": "user", "content": text})
-    yield AdditionalOutputs(chatbot)
-
-    # Get AI response
-    messages.append({"role": "user", "content": text})
-
-    # Call DeepSeek API
-    logger.info("Calling DeepSeek API...")
-    response_data = deepseek_client.chat_completion(messages)
-    response_text = response_data["choices"][0]["message"]["content"]
-    logger.info(f"DeepSeek response: {response_text[:50]}...")
-
-    # Add AI response to chat
-    chatbot.append({"role": "assistant", "content": response_text})
-
-    # Convert response to speech
-    if os.getenv("ELEVENLABS_API_KEY"):
-        try:
-            logger.info("Using ElevenLabs for speech generation")
-
-            # Use the streaming API for better experience
-            for chunk in elevenlabs_client.text_to_speech.convert_as_stream(
-                text=response_text,
-                voice_id="Antoni",
-                model_id="eleven_monolingual_v1",
-                output_format="pcm_24000"
-            ):
-                audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
-                yield (24000, audio_array)
-
-        except Exception as e:
-            logger.error(f"ElevenLabs error: {e}, falling back to gTTS")
-            # Fall back to gTTS
-            yield from use_gtts_for_text(response_text)
-    else:
-        # Fall back to gTTS
-        logger.info("ElevenLabs API key not found, using gTTS...")
-        yield from use_gtts_for_text(response_text)
-
-    yield AdditionalOutputs(chatbot)
-
 def use_gtts_for_text(text):
     """Helper function to generate speech with gTTS for the entire text"""
     try:
@@ -163,100 +96,106 @@ def use_gtts_for_text(text):
         logger.error(f"gTTS error: {e}")
         yield None
 
-# Comprehensive WebRTC configuration with multiple STUN/TURN options
+# Comprehensive WebRTC configuration
 rtc_configuration = {
     "iceServers": [
-        # Google STUN servers
        {"urls": ["stun:stun.l.google.com:19302"]},
        {"urls": ["stun:stun1.l.google.com:19302"]},
-        {"urls": ["stun:stun2.l.google.com:19302"]},
-        {"urls": ["stun:stun3.l.google.com:19302"]},
-        {"urls": ["stun:stun4.l.google.com:19302"]},
-
-        # OpenRelay TURN servers
        {
            "urls": ["turn:openrelay.metered.ca:80"],
            "username": "openrelayproject",
            "credential": "openrelayproject"
        },
-        {
-            "urls": ["turn:openrelay.metered.ca:443"],
-            "username": "openrelayproject",
-            "credential": "openrelayproject"
-        },
        {
            "urls": ["turn:openrelay.metered.ca:443?transport=tcp"],
            "username": "openrelayproject",
            "credential": "openrelayproject"
-        },
-
-        # Additional public STUN servers
-        {"urls": ["stun:stun.stunprotocol.org:3478"]},
-        {"urls": ["stun:stun.voip.blackberry.com:3478"]},
-        {"urls": ["stun:stun.nextcloud.com:443"]}
+        }
    ],
-    "iceCandidatePoolSize": 10,
-    "bundlePolicy": "max-bundle",
-    "rtcpMuxPolicy": "require",
-    "iceTransportPolicy": "all"  # Try "relay" if "all" doesn't work
 }
 
-# Create a simple wrapper for the webchat UI
-with gr.Blocks(title="LLM Voice Chat (Powered by DeepSeek & ElevenLabs)") as demo:
-    gr.Markdown("# LLM Voice Chat\nPowered by DeepSeek & ElevenLabs")
-
-    with gr.Row():
-        with gr.Column(scale=3):
-            # Create the chatbot component
-            chatbot = gr.Chatbot(type="messages")
-
-            # For debugging, allow seeing connection status
-            connection_status = gr.Textbox(label="Connection Status",
-                                           value="Ready to connect. Click the microphone button to start.",
-                                           interactive=False)
-
-            # Display debugging information
-            debug_info = gr.Textbox(label="Debug Info",
-                                    value="WebRTC debug information will appear here.",
-                                    interactive=False)
-
-            # Button to manually refresh the page
-            refresh_btn = gr.Button("Refresh Connection")
-
-            def refresh_page():
-                debug_info.value = f"Attempting to refresh connection at {time.time()}"
-                return "Refreshed", f"Connection refresh attempted at {time.time()}"
-
-            refresh_btn.click(
-                refresh_page,
-                outputs=[connection_status, debug_info]
-            )
-
-logger.info("Creating Stream component...")
-# Initialize the stream (outside of the blocks context)
-stream = Stream(
-    modality="audio",
-    mode="send-receive",
-    handler=ReplyOnPause(response, input_sample_rate=16000),
-    additional_outputs_handler=lambda a, b: b,
-    additional_inputs=[chatbot],
-    additional_outputs=[chatbot],
-    rtc_configuration=rtc_configuration,
-    concurrency_limit=5 if get_space() else None,
-    time_limit=90 if get_space() else None
-)
-
-# Mount the stream to the blocks interface
-stream.render()
-logger.info("Stream component created and rendered")
+# Define the chat history function to handle messages
+def process_message(audio, history):
+    from fastrtc import get_stt_model
+
+    # Get the STT model instance
+    stt_model = get_stt_model()
+
+    # Convert speech to text
+    user_message = stt_model.stt(audio)
+    logger.info(f"User said: {user_message}")
+
+    # Add user message to history
+    history = history + [(user_message, None)]
+
+    # Prepare messages for DeepSeek
+    messages = []
+    for user, bot in history:
+        messages.append({"role": "user", "content": user})
+        if bot:
+            messages.append({"role": "assistant", "content": bot})
+
+    # Get AI response
+    response_data = deepseek_client.chat_completion(messages)
+    bot_message = response_data["choices"][0]["message"]["content"]
+    logger.info(f"DeepSeek response: {bot_message[:50]}...")
+
+    # Update history
+    history[-1] = (user_message, bot_message)
+
+    # Generate audio response
+    if os.getenv("ELEVENLABS_API_KEY"):
+        try:
+            logger.info("Using ElevenLabs for speech generation")
+            audio_bytes = elevenlabs_client.text_to_speech.convert(
+                text=bot_message,
+                voice_id="Antoni",
+                model_id="eleven_monolingual_v1"
+            )
+
+            # Save to temporary file and read back
+            with open("temp_response.mp3", "wb") as f:
+                f.write(audio_bytes)
+
+            data, sr = sf.read("temp_response.mp3")
+            os.remove("temp_response.mp3")
+
+            # Convert to the right format if needed
+            if len(data.shape) > 1:
+                data = data[:, 0]  # Take first channel if stereo
+
+            audio_out = (sr, data)
+
+        except Exception as e:
+            logger.error(f"ElevenLabs error: {e}, falling back to gTTS")
+            # TODO: Implement gTTS fallback for this function
+            audio_out = None
+    else:
+        logger.info("No ElevenLabs API key, audio response not available")
+        audio_out = None
+
+    return history, audio_out
+
+# Create the Gradio interface - much simpler than before
+demo = gr.Interface(
+    fn=process_message,
+    inputs=[
+        gr.Audio(sources=["microphone"], type="numpy"),
+        gr.State([])
+    ],
+    outputs=[
+        gr.Chatbot(),
+        gr.Audio(label="AI Voice Response")
+    ],
+    title="LLM Voice Chat (Powered by DeepSeek & ElevenLabs)",
+    description="Speak into the microphone and get AI responses in text and speech.",
+    examples=[],
+    cache_examples=False
+)
 
 # Launch the app
 if __name__ == "__main__":
-    # Local development
-    logger.info("Running in development mode")
-    os.environ["GRADIO_SSR_MODE"] = "false"
-    demo.launch(server_port=7860, share=True)
+    demo.launch(share=True)
 else:
-    # Hugging Face Spaces
-    logger.info("Running in Hugging Face Spaces")
+    # For Hugging Face Spaces
     demo.launch()
 
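The body of `DeepSeekAPI.chat_completion` is collapsed between the first two hunks, but the call sites (`response_data["choices"][0]["message"]["content"]`) imply an OpenAI-style chat-completions request. A minimal sketch of what such a wrapper typically looks like — the endpoint URL, model name, and timeout below are assumptions, not the committed code:

```python
import requests

class DeepSeekAPI:
    def __init__(self, api_key):
        self.api_key = api_key
        # Assumed endpoint: DeepSeek serves an OpenAI-compatible API.
        self.base_url = "https://api.deepseek.com/chat/completions"

    def chat_completion(self, messages, model="deepseek-chat"):
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        payload = {"model": model, "messages": messages}
        response = requests.post(self.base_url, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        # Shape expected by the callers: data["choices"][0]["message"]["content"]
        return response.json()
```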
 
 
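The `except` branch leaves a TODO where the gTTS fallback belongs: the old `use_gtts_for_text` was a generator that yielded streaming chunks, which no longer fits `process_message`'s single `(sample_rate, array)` return value. One possible non-streaming shape — hypothetical, not part of this commit:

```python
import io
import soundfile as sf
from gtts import gTTS

def gtts_to_numpy(text):
    # Render the whole reply to MP3 in memory, then decode to a numpy array.
    buf = io.BytesIO()
    gTTS(text=text, lang="en").write_to_fp(buf)
    buf.seek(0)
    # Again assumes an MP3-capable libsndfile build.
    data, sr = sf.read(buf)
    if data.ndim > 1:
        data = data[:, 0]
    return sr, data
```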
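With `type="numpy"`, `gr.Audio` hands the handler a `(sample_rate, np.ndarray)` tuple and accepts the same shape back as output. `soundfile` decodes to float64, which Gradio generally treats as samples in [-1.0, 1.0]; if playback sounds clipped or silent, converting explicitly to int16 is a safe normalization — a sketch:

```python
import numpy as np

def to_int16(sr, data):
    # Clamp to [-1, 1], then scale to the int16 PCM range Gradio expects.
    data = np.clip(data, -1.0, 1.0)
    return sr, (data * 32767).astype(np.int16)
```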
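For quick wiring checks without a browser, the handler can be exercised directly, since it is a plain function over `(sample_rate, array)` tuples. A sketch, assuming the module imports as `app`, both API keys are set, and the STT model tolerates silence:

```python
import numpy as np
from app import process_message  # hypothetical import path for this Space's app.py

# One second of silence at 16 kHz stands in for microphone input.
silence = (16000, np.zeros(16000, dtype=np.int16))
history, audio_out = process_message(silence, [])
print(history)    # [(transcript, reply)] after one turn
print(audio_out)  # (sample_rate, samples), or None if TTS was unavailable
```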