Twelve2five committed
Commit 4df6700 · verified · 1 Parent(s): dff5fe4

Update app.py

Files changed (1)
  1. app.py +264 -206
app.py CHANGED
@@ -1,206 +1,264 @@
-import os
-import time
-import gradio as gr
-import numpy as np
-from dotenv import load_dotenv
-from elevenlabs import ElevenLabs
-from fastrtc import (
-    Stream,
-    get_stt_model,
-    ReplyOnPause,
-    AdditionalOutputs
-)
-
-import requests
-import io
-import soundfile as sf
-from gtts import gTTS
-import re
-import inspect
-
-from deepseek import DeepSeekAPI
-
-# Load environment variables
-load_dotenv()
-
-# Initialize clients
-elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
-stt_model = get_stt_model()
-deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
-
-# Add this debug code temporarily to see what methods are available:
-print(dir(deepseek_client))
-
-def response(
-    audio: tuple[int, np.ndarray],
-    chatbot: list[dict] | None = None,
-):
-    chatbot = chatbot or []
-    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
-
-    # Convert speech to text
-    text = stt_model.stt(audio)
-    print("prompt:", text)
-
-    # Add user message to chat
-    chatbot.append({"role": "user", "content": text})
-    yield AdditionalOutputs(chatbot)
-
-    # Get AI response
-    messages.append({"role": "user", "content": text})
-    response_text = get_deepseek_response(messages)
-
-    # Add AI response to chat
-    chatbot.append({"role": "assistant", "content": response_text})
-
-    # Convert response to speech
-    for audio_data in text_to_speech(response_text):
-        if audio_data:
-            yield audio_data
-
-    yield AdditionalOutputs(chatbot)
-
-# Create Gradio interface
-chatbot = gr.Chatbot(type="messages")
-stream = Stream(
-    modality="audio",
-    mode="send-receive",
-    handler=ReplyOnPause(response, input_sample_rate=16000),
-    additional_outputs_handler=lambda a, b: b,
-    additional_inputs=[chatbot],
-    additional_outputs=[chatbot],
-    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"}
-)
-
-# Create FastAPI app and mount stream
-from fastapi import FastAPI
-app = FastAPI()
-app = gr.mount_gradio_app(app, stream.ui, path="/")
-stream.mount(app)  # Mount the stream for telephone/fastphone integration
-
-# Update the chat completion part based on available methods:
-# We'll use direct HTTP requests as a fallback since the API structure is unclear:
-def get_deepseek_response(messages):
-    url = "https://api.deepseek.com/v1/chat/completions"
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}"
-    }
-    payload = {
-        "model": "deepseek-chat",
-        "messages": messages,
-        "temperature": 0.7,
-        "max_tokens": 512
-    }
-    response = requests.post(url, json=payload, headers=headers)
-
-    # Check for error response
-    if response.status_code != 200:
-        print(f"DeepSeek API error: {response.status_code} - {response.text}")
-        return "I'm sorry, I encountered an error processing your request."
-
-    response_json = response.json()
-    return response_json["choices"][0]["message"]["content"]
-
-# Make sure that the text_to_speech function is completely replaced and gTTS is explicitly using US English
-def text_to_speech(text):
-    """Convert text to speech using Google TTS with sentence-by-sentence processing"""
-    try:
-        # Split text into sentences for faster perceived response
-        sentences = re.split(r'(?<=[.!?])\s+', text)
-
-        for sentence in sentences:
-            if not sentence.strip():
-                continue
-
-            # Process each sentence separately
-            mp3_fp = io.BytesIO()
-
-            # Force US English - be explicit about it
-            print(f"Using gTTS with en-us locale for sentence: {sentence[:20]}...")
-            tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
-            tts.write_to_fp(mp3_fp)
-            mp3_fp.seek(0)
-
-            # Process audio data
-            data, samplerate = sf.read(mp3_fp)
-
-            # Convert to mono if stereo
-            if len(data.shape) > 1 and data.shape[1] > 1:
-                data = data[:, 0]
-
-            # Resample to 24000 Hz if needed
-            if samplerate != 24000:
-                data = np.interp(
-                    np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
-                    np.arange(len(data)),
-                    data
-                )
-
-            # Convert to 16-bit integers
-            data = (data * 32767).astype(np.int16)
-
-            # Ensure buffer size is even
-            if len(data) % 2 != 0:
-                data = np.append(data, [0])
-
-            # Reshape and yield in chunks
-            chunk_size = 4800
-            for i in range(0, len(data), chunk_size):
-                chunk = data[i:i+chunk_size]
-                if len(chunk) > 0:
-                    if len(chunk) % 2 != 0:
-                        chunk = np.append(chunk, [0])
-                    chunk = chunk.reshape(1, -1)
-                    yield (24000, chunk)
-    except Exception as e:
-        print(f"Exception in text_to_speech: {e}")
-        yield None
-
-# Add this debug statement AFTER the function definition
-print("text_to_speech function:", inspect.getsource(text_to_speech))
-
-if __name__ == "__main__":
-    os.environ["GRADIO_SSR_MODE"] = "false"
-
-    # Check FastRTC version
-    import fastrtc
-    print(f"FastRTC version: {fastrtc.__version__ if hasattr(fastrtc, '__version__') else 'unknown'}")
-
-    # Try running fastphone with additional diagnostic
-    print("Starting phone service - attempting to inspect fastphone method...")
-    import inspect
-    print(f"FastPhone signature: {inspect.signature(stream.fastphone) if hasattr(stream, 'fastphone') else 'Not available'}")
-
-    try:
-        # Fix: Use keyword argument instead of positional
-        phone_service = stream.fastphone(
-            token=os.getenv("HF_TOKEN"),
-            host="127.0.0.1",
-            port=8000,
-            share_server_tls_certificate=True  # Use keyword argument format
-        )
-        print("Phone service started successfully")
-    except Exception as e:
-        print(f"Error starting phone service: {e}")
-        print("Falling back to web interface...")
-        # Launch with web interface as fallback
-        stream.ui.launch(server_port=7860)
-
-# Remove or comment out the following lines:
-# !pip install -q torch==2.0.1 torchaudio==2.0.2 gradio requests soundfile huggingface_hub
-# !wget -q https://github.com/seasalt-ai/csm/archive/refs/heads/main.zip
-# !unzip -q main.zip
-# !mv csm-main csm
-# !cd csm && pip install -e .
-#
-# # Set up directories
-# import os
-# import sys
-# sys.path.append("/content/csm")
-# voice_samples_dir = "/content/csm_voice_samples"
-# output_dir = "/content/csm_output"
-# os.makedirs(voice_samples_dir, exist_ok=True)
-# os.makedirs(output_dir, exist_ok=True)
-#
-# print("✅ Dependencies installed!")
+import os
+import time
+import gradio as gr
+import numpy as np
+from dotenv import load_dotenv
+from elevenlabs import ElevenLabs
+from fastrtc import (
+    Stream,
+    get_stt_model,
+    ReplyOnPause,
+    AdditionalOutputs
+)
+
+import requests
+import io
+import soundfile as sf
+from gtts import gTTS
+import re
+import inspect
+import torch
+import torchaudio
+import sys
+from huggingface_hub import login, hf_hub_download
+
+from deepseek import DeepSeekAPI
+
+# Load environment variables
+load_dotenv()
+
+# Initialize clients
+elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
+stt_model = get_stt_model()
+deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
+
+# Add this debug code temporarily to see what methods are available:
+print(dir(deepseek_client))
+
+# Set CSM to None to skip that option
+csm_generator = None
+
+def response(
+    audio: tuple[int, np.ndarray],
+    chatbot: list[dict] | None = None,
+):
+    chatbot = chatbot or []
+    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
+
+    # Convert speech to text
+    text = stt_model.stt(audio)
+    print("prompt:", text)
+
+    # Add user message to chat
+    chatbot.append({"role": "user", "content": text})
+    yield AdditionalOutputs(chatbot)
+
+    # Get AI response
+    messages.append({"role": "user", "content": text})
+    response_text = get_deepseek_response(messages)
+
+    # Add AI response to chat
+    chatbot.append({"role": "assistant", "content": response_text})
+
+    # Convert response to speech
+    for audio_data in text_to_speech(response_text):
+        if audio_data:
+            yield audio_data
+
+    yield AdditionalOutputs(chatbot)
+
+# Create Gradio interface
+chatbot = gr.Chatbot(type="messages")
+stream = Stream(
+    modality="audio",
+    mode="send-receive",
+    handler=ReplyOnPause(response, input_sample_rate=16000),
+    additional_outputs_handler=lambda a, b: b,
+    additional_inputs=[chatbot],
+    additional_outputs=[chatbot],
+    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"}
+)
+
+# Create FastAPI app and mount stream
+from fastapi import FastAPI
+app = FastAPI()
+app = gr.mount_gradio_app(app, stream.ui, path="/")
+stream.mount(app)  # Mount the stream for telephone/fastphone integration
+
+# Update the chat completion part based on available methods:
+# We'll use direct HTTP requests as a fallback since the API structure is unclear:
+def get_deepseek_response(messages):
+    url = "https://api.deepseek.com/v1/chat/completions"
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}"
+    }
+    payload = {
+        "model": "deepseek-chat",
+        "messages": messages,
+        "temperature": 0.7,
+        "max_tokens": 512
+    }
+    response = requests.post(url, json=payload, headers=headers)
+
+    # Check for error response
+    if response.status_code != 200:
+        print(f"DeepSeek API error: {response.status_code} - {response.text}")
+        return "I'm sorry, I encountered an error processing your request."
+
+    response_json = response.json()
+    return response_json["choices"][0]["message"]["content"]
+
+# Helper function for gTTS
+def use_gtts_for_sentence(sentence):
+    """Helper function to generate speech with gTTS"""
+    try:
+        # Process each sentence separately
+        mp3_fp = io.BytesIO()
+
+        # Force US English
+        print(f"Using gTTS with en-us locale for sentence: {sentence[:20]}...")
+        tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
+        tts.write_to_fp(mp3_fp)
+        mp3_fp.seek(0)
+
+        # Process audio data
+        data, samplerate = sf.read(mp3_fp)
+
+        # Convert to mono if stereo
+        if len(data.shape) > 1 and data.shape[1] > 1:
+            data = data[:, 0]
+
+        # Resample to 24000 Hz if needed
+        if samplerate != 24000:
+            data = np.interp(
+                np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
+                np.arange(len(data)),
+                data
+            )
+
+        # Convert to 16-bit integers
+        data = (data * 32767).astype(np.int16)
+
+        # Ensure buffer size is even
+        if len(data) % 2 != 0:
+            data = np.append(data, [0])
+
+        # Reshape and yield in chunks
+        chunk_size = 4800
+        for i in range(0, len(data), chunk_size):
+            chunk = data[i:i+chunk_size]
+            if len(chunk) > 0:
+                if len(chunk) % 2 != 0:
+                    chunk = np.append(chunk, [0])
+                chunk = chunk.reshape(1, -1)
+                yield (24000, chunk)
+    except Exception as e:
+        print(f"gTTS error: {e}")
+        yield None
+
+# Replace the text_to_speech function with this version
+def text_to_speech(text):
+    """Convert text to speech using ElevenLabs or gTTS as fallback"""
+    try:
+        # Split text into sentences for faster perceived response
+        sentences = re.split(r'(?<=[.!?])\s+', text)
+
+        # Try ElevenLabs first
+        if os.getenv("ELEVENLABS_API_KEY"):
+            print("Using ElevenLabs for text-to-speech...")
+
+            for sentence in sentences:
+                if not sentence.strip():
+                    continue
+
+                try:
+                    print(f"Generating ElevenLabs speech for: {sentence[:30]}...")
+
+                    # Generate audio using ElevenLabs
+                    audio_data = elevenlabs_client.generate(
+                        text=sentence,
+                        voice="Antoni",  # You can change to any available voice
+                        model="eleven_monolingual_v1"
+                    )
+
+                    # Convert to numpy array
+                    mp3_fp = io.BytesIO(audio_data)
+                    data, samplerate = sf.read(mp3_fp)
+
+                    # Convert to mono if stereo
+                    if len(data.shape) > 1 and data.shape[1] > 1:
+                        data = data[:, 0]
+
+                    # Resample to 24000 Hz if needed
+                    if samplerate != 24000:
+                        data = np.interp(
+                            np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
+                            np.arange(len(data)),
+                            data
+                        )
+
+                    # Convert to 16-bit integers
+                    data = (data * 32767).astype(np.int16)
+
+                    # Ensure buffer size is even
+                    if len(data) % 2 != 0:
+                        data = np.append(data, [0])
+
+                    # Reshape and yield in chunks
+                    chunk_size = 4800
+                    for i in range(0, len(data), chunk_size):
+                        chunk = data[i:i+chunk_size]
+                        if len(chunk) > 0:
+                            if len(chunk) % 2 != 0:
+                                chunk = np.append(chunk, [0])
+                            chunk = chunk.reshape(1, -1)
+                            yield (24000, chunk)
+
+                except Exception as e:
+                    print(f"ElevenLabs error: {e}, falling back to gTTS")
+                    # Fall through to gTTS for this sentence
+                    for audio_chunk in use_gtts_for_sentence(sentence):
+                        if audio_chunk:
+                            yield audio_chunk
+        else:
+            # Fall back to gTTS
+            print("ElevenLabs API key not found, using gTTS...")
+            for sentence in sentences:
+                if sentence.strip():
+                    for audio_chunk in use_gtts_for_sentence(sentence):
+                        if audio_chunk:
+                            yield audio_chunk
+    except Exception as e:
+        print(f"Exception in text_to_speech: {e}")
+        yield None
+
+# Add this debug statement AFTER the function definition
+print("text_to_speech function:", inspect.getsource(text_to_speech))
+
+if __name__ == "__main__":
+    os.environ["GRADIO_SSR_MODE"] = "false"
+
+    # Check FastRTC version
+    import fastrtc
+    print(f"FastRTC version: {fastrtc.__version__ if hasattr(fastrtc, '__version__') else 'unknown'}")
+
+    # Try running fastphone with additional diagnostic
+    print("Starting phone service - attempting to inspect fastphone method...")
+    import inspect
+    print(f"FastPhone signature: {inspect.signature(stream.fastphone) if hasattr(stream, 'fastphone') else 'Not available'}")
+
+    try:
+        # Fix: Use keyword argument instead of positional
+        phone_service = stream.fastphone(
+            token=os.getenv("HF_TOKEN"),
+            host="127.0.0.1",
+            port=8000,
+            share_server_tls_certificate=True  # Use keyword argument format
+        )
+        print("Phone service started successfully")
+    except Exception as e:
+        print(f"Error starting phone service: {e}")
+        print("Falling back to web interface...")
+        # Launch with web interface as fallback
+        stream.ui.launch(server_port=7860)