Twelve2five committed
Commit c4620f8 · verified · Parent: 013f6a1

Update app.py

Files changed (1):
  app.py (+93 -141)
app.py CHANGED
@@ -4,12 +4,15 @@ import gradio as gr
 import numpy as np
 from dotenv import load_dotenv
 from elevenlabs import ElevenLabs
+from fastapi import FastAPI
 from fastrtc import (
     Stream,
     get_stt_model,
+    get_twilio_turn_credentials,
     ReplyOnPause,
     AdditionalOutputs
 )
+from gradio.utils import get_space
 
 import requests
 import io
@@ -77,156 +80,105 @@ def response(
     chatbot.append({"role": "assistant", "content": response_text})
 
     # Convert response to speech
-    for audio_data in text_to_speech(response_text):
-        if audio_data:
-            yield audio_data
+    if os.getenv("ELEVENLABS_API_KEY"):
+        try:
+            print(f"Generating ElevenLabs speech for response")
+
+            # Use the streaming API for better experience
+            for chunk in elevenlabs_client.text_to_speech.convert_as_stream(
+                text=response_text,
+                voice_id="Antoni",
+                model_id="eleven_monolingual_v1",
+                output_format="pcm_24000"
+            ):
+                audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
+                yield (24000, audio_array)
+
+        except Exception as e:
+            print(f"ElevenLabs error: {e}, falling back to gTTS")
+            # Fall back to gTTS
+            yield from use_gtts_for_text(response_text)
+    else:
+        # Fall back to gTTS
+        print("ElevenLabs API key not found, using gTTS...")
+        yield from use_gtts_for_text(response_text)
 
     yield AdditionalOutputs(chatbot)
 
-# Your existing helper functions
-def use_gtts_for_sentence(sentence):
-    """Helper function to generate speech with gTTS"""
-    try:
-        mp3_fp = io.BytesIO()
-        print(f"Using gTTS with en-us locale for sentence: {sentence[:20]}...")
-        tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
-        tts.write_to_fp(mp3_fp)
-        mp3_fp.seek(0)
-
-        data, samplerate = sf.read(mp3_fp)
-
-        if len(data.shape) > 1 and data.shape[1] > 1:
-            data = data[:, 0]
-
-        if samplerate != 24000:
-            data = np.interp(
-                np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
-                np.arange(len(data)),
-                data
-            )
-
-        data = (data * 32767).astype(np.int16)
-
-        if len(data) % 2 != 0:
-            data = np.append(data, [0])
-
-        chunk_size = 4800
-        for i in range(0, len(data), chunk_size):
-            chunk = data[i:i+chunk_size]
-            if len(chunk) > 0:
-                if len(chunk) % 2 != 0:
-                    chunk = np.append(chunk, [0])
-                chunk = chunk.reshape(1, -1)
-                yield (24000, chunk)
-    except Exception as e:
-        print(f"gTTS error: {e}")
-        yield None
-
-def text_to_speech(text):
-    """Convert text to speech using ElevenLabs or gTTS as fallback"""
+def use_gtts_for_text(text):
+    """Helper function to generate speech with gTTS for the entire text"""
     try:
+        # Split text into sentences for better results
        sentences = re.split(r'(?<=[.!?])\s+', text)
 
-        if os.getenv("ELEVENLABS_API_KEY"):
-            print("Using ElevenLabs for text-to-speech...")
-
-            for sentence in sentences:
-                if not sentence.strip():
-                    continue
+        for sentence in sentences:
+            if not sentence.strip():
+                continue
 
-                try:
-                    print(f"Generating ElevenLabs speech for: {sentence[:30]}...")
-
-                    audio_data = elevenlabs_client.generate(
-                        text=sentence,
-                        voice="Antoni",
-                        model="eleven_monolingual_v1"
-                    )
-
-                    mp3_fp = io.BytesIO(audio_data)
-                    data, samplerate = sf.read(mp3_fp)
-
-                    if len(data.shape) > 1 and data.shape[1] > 1:
-                        data = data[:, 0]
-
-                    if samplerate != 24000:
-                        data = np.interp(
-                            np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
-                            np.arange(len(data)),
-                            data
-                        )
-
-                    data = (data * 32767).astype(np.int16)
-
-                    if len(data) % 2 != 0:
-                        data = np.append(data, [0])
-
-                    chunk_size = 4800
-                    for i in range(0, len(data), chunk_size):
-                        chunk = data[i:i+chunk_size]
-                        if len(chunk) > 0:
-                            if len(chunk) % 2 != 0:
-                                chunk = np.append(chunk, [0])
-                            chunk = chunk.reshape(1, -1)
-                            yield (24000, chunk)
-
-                except Exception as e:
-                    print(f"ElevenLabs error: {e}, falling back to gTTS")
-                    for audio_chunk in use_gtts_for_sentence(sentence):
-                        if audio_chunk:
-                            yield audio_chunk
-        else:
-            print("ElevenLabs API key not found, using gTTS...")
-            for sentence in sentences:
-                if sentence.strip():
-                    for audio_chunk in use_gtts_for_sentence(sentence):
-                        if audio_chunk:
-                            yield audio_chunk
+            mp3_fp = io.BytesIO()
+            print(f"Using gTTS for sentence: {sentence[:30]}...")
+            tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
+            tts.write_to_fp(mp3_fp)
+            mp3_fp.seek(0)
+
+            data, samplerate = sf.read(mp3_fp)
+
+            if len(data.shape) > 1 and data.shape[1] > 1:
+                data = data[:, 0]
+
+            if samplerate != 24000:
+                data = np.interp(
+                    np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
+                    np.arange(len(data)),
+                    data
+                )
+
+            data = (data * 32767).astype(np.int16)
+
+            # Ensure buffer size is even
+            if len(data) % 2 != 0:
+                data = np.append(data, [0])
+
+            # Reshape and yield in chunks
+            chunk_size = 4800
+            for i in range(0, len(data), chunk_size):
+                chunk = data[i:i+chunk_size]
+                if len(chunk) > 0:
+                    if len(chunk) % 2 != 0:
+                        chunk = np.append(chunk, [0])
+                    chunk = chunk.reshape(1, -1)
+                    yield (24000, chunk)
     except Exception as e:
-        print(f"Exception in text_to_speech: {e}")
+        print(f"gTTS error: {e}")
        yield None
 
-# WebRTC configuration required for Hugging Face Spaces
-rtc_config = {
-    "iceServers": [
-        {"urls": ["stun:stun.l.google.com:19302"]},
-        {
-            "urls": ["turn:openrelay.metered.ca:80"],
-            "username": "openrelayproject",
-            "credential": "openrelayproject"
-        },
-        {
-            "urls": ["turn:openrelay.metered.ca:443"],
-            "username": "openrelayproject",
-            "credential": "openrelayproject"
-        },
-        {
-            "urls": ["turn:openrelay.metered.ca:443?transport=tcp"],
-            "username": "openrelayproject",
-            "credential": "openrelayproject"
-        }
-    ]
-}
+# Create Gradio chatbot and stream
+chatbot = gr.Chatbot(type="messages")
+stream = Stream(
+    modality="audio",
+    mode="send-receive",
+    handler=ReplyOnPause(response, input_sample_rate=16000),
+    additional_outputs_handler=lambda a, b: b,
+    additional_inputs=[chatbot],
+    additional_outputs=[chatbot],
+    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
+    concurrency_limit=5 if get_space() else None,
+    time_limit=90 if get_space() else None,
+    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"}
+)
 
-# Initialize Gradio app with a standard pattern that Hugging Face recognizes
-with gr.Blocks(title="LLM Voice Chat") as demo:
-    gr.Markdown("# LLM Voice Chat (Powered by DeepSeek & ElevenLabs)")
-
-    # Create a custom Stream component that Gradio can render
-    chatbot = gr.Chatbot(type="messages")
-
-    # This is the key part - use Stream as a component inside the Gradio app
-    stream_component = Stream(
-        modality="audio",
-        mode="send-receive",
-        handler=ReplyOnPause(response, input_sample_rate=16000),
-        additional_outputs_handler=lambda a, b: b,
-        additional_inputs=[chatbot],
-        additional_outputs=[chatbot],
-        rtc_configuration=rtc_config
-    )
-
-    # Make the stream component appear in the Gradio UI
-    stream_component.render()
+# Mount the Stream UI to the FastAPI app
+app = FastAPI()
+app = gr.mount_gradio_app(app, stream.ui, path="/")
 
-# The variable 'demo' will be picked up by Hugging Face Spaces
+# Only for local development
+if __name__ == "__main__":
+    os.environ["GRADIO_SSR_MODE"] = "false"
+
+    # Different launch modes based on environment
+    if (mode := os.getenv("MODE")) == "UI":
+        stream.ui.launch(server_port=7860)
+    elif mode == "PHONE":
+        stream.fastphone(host="0.0.0.0", port=7860)
+    else:
+        stream.ui.launch(server_port=7860)
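Note on the streaming TTS path above: with output_format="pcm_24000", convert_as_stream yields raw 16-bit little-endian PCM at 24 kHz, so each chunk can be framed for fastrtc with a plain np.frombuffer instead of round-tripping MP3 through soundfile as the removed code did. The committed code does assume every chunk carries a whole number of 16-bit samples; np.frombuffer raises on an odd-length buffer. A minimal defensive framing helper, as a sketch (the helper name and the synthetic chunk source are illustrative, not part of the commit):

import numpy as np

def pcm_chunks_to_frames(chunks, sample_rate=24000):
    """Frame raw 16-bit little-endian PCM chunks as (rate, ndarray) tuples,
    buffering any trailing odd byte so np.frombuffer never sees half a sample."""
    leftover = b""
    for chunk in chunks:
        data = leftover + chunk
        usable = len(data) - (len(data) % 2)  # whole 2-byte samples only
        leftover = data[usable:]
        if usable:
            frame = np.frombuffer(data[:usable], dtype=np.int16).reshape(1, -1)
            yield (sample_rate, frame)

# Synthetic chunks split at an odd byte boundary: 6 bytes -> 3 samples total.
fake_stream = [b"\x00\x01\x02", b"\x03\x04\x05"]
for rate, frame in pcm_chunks_to_frames(fake_stream):
    print(rate, frame.shape)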
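Note on the TURN change in the Stream constructor: the hard-coded openrelay.metered.ca relays are gone, and rtc_configuration now comes from fastrtc's get_twilio_turn_credentials(), applied only when running on a Space (get_space()); locally it stays None and default STUN behavior applies. get_twilio_turn_credentials() expects Twilio credentials, typically the TWILIO_ACCOUNT_SID and TWILIO_AUTH_TOKEN environment variables, so those need to be set as Space secrets. A guarded variant, as a sketch (the helper name is illustrative; the STUN fallback entry is taken from the removed rtc_config):

import os
from fastrtc import get_twilio_turn_credentials

def rtc_configuration_or_stun():
    """Use Twilio TURN when credentials are configured; otherwise fall back
    to the public STUN server the old rtc_config already relied on."""
    if os.getenv("TWILIO_ACCOUNT_SID") and os.getenv("TWILIO_AUTH_TOKEN"):
        return get_twilio_turn_credentials()
    return {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}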
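Note on serving: because the Stream UI is now mounted on a FastAPI instance, the module exposes a standard ASGI app instead of relying on a top-level gr.Blocks named demo, and the MODE environment variable picks between the plain Gradio UI and fastrtc's fastphone telephone interface when the file is run directly. Outside the __main__ branch, the mounted app can be served by any ASGI server, for example (a sketch assuming uvicorn is installed):

# Serves the mounted Gradio UI at "/" on port 7860.
import uvicorn

uvicorn.run("app:app", host="0.0.0.0", port=7860)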