Twelve2five committed on
Commit
40785f3
·
verified ·
1 Parent(s): ee0d47e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -2
app.py CHANGED
@@ -65,7 +65,174 @@ def response(
65
 
66
  yield AdditionalOutputs(chatbot)
67
 
68
- # Create Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  chatbot = gr.Chatbot(type="messages")
70
  stream = Stream(
71
  modality="audio",
@@ -74,7 +241,8 @@ stream = Stream(
74
  additional_outputs_handler=lambda a, b: b,
75
  additional_inputs=[chatbot],
76
  additional_outputs=[chatbot],
77
- ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"}
 
78
  )
79
 
80
  # FastAPI app with Gradio interface
 
65
 
66
  yield AdditionalOutputs(chatbot)
67
 
68
+ # Your existing helper functions remain unchanged
69
+ def use_gtts_for_sentence(sentence):
70
+ """Helper function to generate speech with gTTS"""
71
+ try:
72
+ # Process each sentence separately
73
+ mp3_fp = io.BytesIO()
74
+
75
+ # Force US English
76
+ print(f"Using gTTS with en-us locale for sentence: {sentence[:20]}...")
77
+ tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
78
+ tts.write_to_fp(mp3_fp)
79
+ mp3_fp.seek(0)
80
+
81
+ # Process audio data
82
+ data, samplerate = sf.read(mp3_fp)
83
+
84
+ # Convert to mono if stereo
85
+ if len(data.shape) > 1 and data.shape[1] > 1:
86
+ data = data[:, 0]
87
+
88
+ # Resample to 24000 Hz if needed
89
+ if samplerate != 24000:
90
+ data = np.interp(
91
+ np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
92
+ np.arange(len(data)),
93
+ data
94
+ )
95
+
96
+ # Convert to 16-bit integers
97
+ data = (data * 32767).astype(np.int16)
98
+
99
+ # Ensure buffer size is even
100
+ if len(data) % 2 != 0:
101
+ data = np.append(data, [0])
102
+
103
+ # Reshape and yield in chunks
104
+ chunk_size = 4800
105
+ for i in range(0, len(data), chunk_size):
106
+ chunk = data[i:i+chunk_size]
107
+ if len(chunk) > 0:
108
+ if len(chunk) % 2 != 0:
109
+ chunk = np.append(chunk, [0])
110
+ chunk = chunk.reshape(1, -1)
111
+ yield (24000, chunk)
112
+ except Exception as e:
113
+ print(f"gTTS error: {e}")
114
+ yield None
115
+
116
+ def text_to_speech(text):
117
+ """Convert text to speech using ElevenLabs or gTTS as fallback"""
118
+ try:
119
+ # Split text into sentences for faster perceived response
120
+ sentences = re.split(r'(?<=[.!?])\s+', text)
121
+
122
+ # Try ElevenLabs first
123
+ if os.getenv("ELEVENLABS_API_KEY"):
124
+ print("Using ElevenLabs for text-to-speech...")
125
+
126
+ for sentence in sentences:
127
+ if not sentence.strip():
128
+ continue
129
+
130
+ try:
131
+ print(f"Generating ElevenLabs speech for: {sentence[:30]}...")
132
+
133
+ # Generate audio using ElevenLabs
134
+ audio_data = elevenlabs_client.generate(
135
+ text=sentence,
136
+ voice="Antoni", # You can change to any available voice
137
+ model="eleven_monolingual_v1"
138
+ )
139
+
140
+ # Convert to numpy array
141
+ mp3_fp = io.BytesIO(audio_data)
142
+ data, samplerate = sf.read(mp3_fp)
143
+
144
+ # Convert to mono if stereo
145
+ if len(data.shape) > 1 and data.shape[1] > 1:
146
+ data = data[:, 0]
147
+
148
+ # Resample to 24000 Hz if needed
149
+ if samplerate != 24000:
150
+ data = np.interp(
151
+ np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
152
+ np.arange(len(data)),
153
+ data
154
+ )
155
+
156
+ # Convert to 16-bit integers
157
+ data = (data * 32767).astype(np.int16)
158
+
159
+ # Ensure buffer size is even
160
+ if len(data) % 2 != 0:
161
+ data = np.append(data, [0])
162
+
163
+ # Reshape and yield in chunks
164
+ chunk_size = 4800
165
+ for i in range(0, len(data), chunk_size):
166
+ chunk = data[i:i+chunk_size]
167
+ if len(chunk) > 0:
168
+ if len(chunk) % 2 != 0:
169
+ chunk = np.append(chunk, [0])
170
+ chunk = chunk.reshape(1, -1)
171
+ yield (24000, chunk)
172
+
173
+ except Exception as e:
174
+ print(f"ElevenLabs error: {e}, falling back to gTTS")
175
+ # Fall through to gTTS for this sentence
176
+ for audio_chunk in use_gtts_for_sentence(sentence):
177
+ if audio_chunk:
178
+ yield audio_chunk
179
+ else:
180
+ # Fall back to gTTS
181
+ print("ElevenLabs API key not found, using gTTS...")
182
+ for sentence in sentences:
183
+ if sentence.strip():
184
+ for audio_chunk in use_gtts_for_sentence(sentence):
185
+ if audio_chunk:
186
+ yield audio_chunk
187
+ except Exception as e:
188
+ print(f"Exception in text_to_speech: {e}")
189
+ yield None
190
+
191
+ def get_deepseek_response(messages):
192
+ url = "https://api.deepseek.com/v1/chat/completions"
193
+ headers = {
194
+ "Content-Type": "application/json",
195
+ "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}"
196
+ }
197
+ payload = {
198
+ "model": "deepseek-chat",
199
+ "messages": messages,
200
+ "temperature": 0.7,
201
+ "max_tokens": 512
202
+ }
203
+ response = requests.post(url, json=payload, headers=headers)
204
+
205
+ # Check for error response
206
+ if response.status_code != 200:
207
+ print(f"DeepSeek API error: {response.status_code} - {response.text}")
208
+ return "I'm sorry, I encountered an error processing your request."
209
+
210
+ response_json = response.json()
211
+ return response_json["choices"][0]["message"]["content"]
212
+
213
+ # WebRTC configuration required for Hugging Face Spaces
214
# WebRTC ICE configuration required for Hugging Face Spaces: one public STUN
# server plus the openrelay TURN relay reachable over UDP port 80, port 443,
# and TURN-over-TCP on 443 for restrictive networks.
rtc_config = {
    "iceServers": [
        {"urls": ["stun:stun.l.google.com:19302"]},
        *(
            {
                "urls": [turn_url],
                "username": "openrelayproject",
                "credential": "openrelayproject",
            }
            for turn_url in (
                "turn:openrelay.metered.ca:80",
                "turn:openrelay.metered.ca:443",
                "turn:openrelay.metered.ca:443?transport=tcp",
            )
        ),
    ]
}
234
+
235
+ # Create Gradio interface with the required rtc_configuration
236
  chatbot = gr.Chatbot(type="messages")
237
  stream = Stream(
238
  modality="audio",
 
241
  additional_outputs_handler=lambda a, b: b,
242
  additional_inputs=[chatbot],
243
  additional_outputs=[chatbot],
244
+ ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"},
245
+ rtc_configuration=rtc_config # Add the WebRTC configuration
246
  )
247
 
248
  # FastAPI app with Gradio interface