Twelve2five committed on
Commit 013f6a1 · verified · 1 Parent(s): 3ee34e5

Update app.py

Files changed (1): app.py (+52 -75)
app.py CHANGED
@@ -16,11 +16,6 @@ import io
 import soundfile as sf
 from gtts import gTTS
 import re
-import torch
-import torchaudio
-from huggingface_hub import login, hf_hub_download
-
-from deepseek import DeepSeekAPI
 
 # Load environment variables
 load_dotenv()
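
Reviewer note: the chat_completion wrapper added in the next hunk calls requests.post, yet no import of requests appears anywhere in this diff; presumably it already sits in the unchanged imports above line 16 (not shown here). If it does not, the module would also need:

import requests  # assumed to already exist in the unchanged top-of-file imports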
@@ -28,13 +23,33 @@ load_dotenv()
 # Initialize clients
 elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
 stt_model = get_stt_model()
-deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
 
-# Add this debug code temporarily to see what methods are available:
-print(dir(deepseek_client))
+class DeepSeekAPI:
+    def __init__(self, api_key):
+        self.api_key = api_key
+
+    def chat_completion(self, messages, temperature=0.7, max_tokens=512):
+        url = "https://api.deepseek.com/v1/chat/completions"
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+        payload = {
+            "model": "deepseek-chat",
+            "messages": messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens
+        }
+        response = requests.post(url, json=payload, headers=headers)
+
+        # Check for error response
+        if response.status_code != 200:
+            print(f"DeepSeek API error: {response.status_code} - {response.text}")
+            return {"choices": [{"message": {"content": "I'm sorry, I encountered an error processing your request."}}]}
+
+        return response.json()
 
-# Set CSM to None to skip that option
-csm_generator = None
+deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
 
 def response(
     audio: tuple[int, np.ndarray],
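
The wrapper above mirrors the response shape of DeepSeek's OpenAI-compatible chat-completions endpoint, and on a non-200 status it returns a dict of the same shape, so callers can index ["choices"][0]["message"]["content"] unconditionally. A minimal smoke test, not part of the commit, assuming DEEPSEEK_API_KEY is set in the environment:

# Hypothetical smoke test for the DeepSeekAPI wrapper introduced above.
client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
result = client.chat_completion([{"role": "user", "content": "Say hello in one word."}])
print(result["choices"][0]["message"]["content"])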
@@ -53,7 +68,10 @@ def response(
 
     # Get AI response
     messages.append({"role": "user", "content": text})
-    response_text = get_deepseek_response(messages)
+
+    # Call DeepSeek API
+    response_data = deepseek_client.chat_completion(messages)
+    response_text = response_data["choices"][0]["message"]["content"]
 
     # Add AI response to chat
     chatbot.append({"role": "assistant", "content": response_text})
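
One caveat on the hunk above: chat_completion converts HTTP error statuses into a fallback response, but a transport failure inside requests.post (DNS, timeout, connection reset) would still raise and propagate out of response(). A sketch of call-site hardening, not in the commit:

# Hypothetical hardening: catch transport errors so the voice loop degrades
# gracefully instead of crashing mid-stream.
try:
    response_data = deepseek_client.chat_completion(messages)
except requests.RequestException as e:
    print(f"DeepSeek request failed: {e}")
    response_data = {"choices": [{"message": {"content": "Sorry, I couldn't reach the language model."}}]}
response_text = response_data["choices"][0]["message"]["content"]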
@@ -65,27 +83,21 @@ def response(
 
     yield AdditionalOutputs(chatbot)
 
-# Your existing helper functions remain unchanged
+# Your existing helper functions
 def use_gtts_for_sentence(sentence):
     """Helper function to generate speech with gTTS"""
     try:
-        # Process each sentence separately
         mp3_fp = io.BytesIO()
-
-        # Force US English
         print(f"Using gTTS with en-us locale for sentence: {sentence[:20]}...")
         tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
         tts.write_to_fp(mp3_fp)
         mp3_fp.seek(0)
 
-        # Process audio data
         data, samplerate = sf.read(mp3_fp)
 
-        # Convert to mono if stereo
         if len(data.shape) > 1 and data.shape[1] > 1:
             data = data[:, 0]
 
-        # Resample to 24000 Hz if needed
         if samplerate != 24000:
             data = np.interp(
                 np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
@@ -93,14 +105,11 @@ def use_gtts_for_sentence(sentence):
                 data
             )
 
-        # Convert to 16-bit integers
         data = (data * 32767).astype(np.int16)
 
-        # Ensure buffer size is even
         if len(data) % 2 != 0:
             data = np.append(data, [0])
 
-        # Reshape and yield in chunks
         chunk_size = 4800
         for i in range(0, len(data), chunk_size):
             chunk = data[i:i+chunk_size]
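
The mono-downmix / linear-resample-to-24 kHz / int16 / 4800-sample-chunk sequence above (4800 samples is 200 ms at 24 kHz) is repeated verbatim in the ElevenLabs branch below. A sketch of how both paths could share one generator; the name and defaults are hypothetical, and np.interp does the same plain linear interpolation as the original:

# Hypothetical shared helper (not in the commit): downmix to mono, linearly
# resample to target_rate, convert float [-1, 1] audio to 16-bit PCM, pad to
# an even sample count, and yield fixed-size chunks.
def pcm_chunks(data, samplerate, target_rate=24000, chunk_size=4800):
    if data.ndim > 1 and data.shape[1] > 1:
        data = data[:, 0]
    if samplerate != target_rate:
        n_out = int(len(data) * target_rate / samplerate)
        data = np.interp(np.linspace(0, len(data), n_out),
                         np.arange(len(data)), data)
    data = (data * 32767).astype(np.int16)
    if len(data) % 2 != 0:
        data = np.append(data, [0])
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size]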
@@ -116,10 +125,8 @@ def use_gtts_for_sentence(sentence):
 def text_to_speech(text):
     """Convert text to speech using ElevenLabs or gTTS as fallback"""
     try:
-        # Split text into sentences for faster perceived response
         sentences = re.split(r'(?<=[.!?])\s+', text)
 
-        # Try ElevenLabs first
         if os.getenv("ELEVENLABS_API_KEY"):
             print("Using ElevenLabs for text-to-speech...")
 
@@ -130,22 +137,18 @@ def text_to_speech(text):
                 try:
                     print(f"Generating ElevenLabs speech for: {sentence[:30]}...")
 
-                    # Generate audio using ElevenLabs
                     audio_data = elevenlabs_client.generate(
                         text=sentence,
-                        voice="Antoni", # You can change to any available voice
+                        voice="Antoni",
                         model="eleven_monolingual_v1"
                     )
 
-                    # Convert to numpy array
                     mp3_fp = io.BytesIO(audio_data)
                     data, samplerate = sf.read(mp3_fp)
 
-                    # Convert to mono if stereo
                     if len(data.shape) > 1 and data.shape[1] > 1:
                         data = data[:, 0]
 
-                    # Resample to 24000 Hz if needed
                     if samplerate != 24000:
                         data = np.interp(
                             np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
@@ -153,14 +156,11 @@ def text_to_speech(text):
                             data
                         )
 
-                    # Convert to 16-bit integers
                     data = (data * 32767).astype(np.int16)
 
-                    # Ensure buffer size is even
                     if len(data) % 2 != 0:
                         data = np.append(data, [0])
 
-                    # Reshape and yield in chunks
                     chunk_size = 4800
                     for i in range(0, len(data), chunk_size):
                         chunk = data[i:i+chunk_size]
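
As noted after the gTTS hunk: this branch duplicates that post-processing line for line, and both paths could call the pcm_chunks helper sketched there.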
@@ -172,12 +172,10 @@ def text_to_speech(text):
 
                 except Exception as e:
                     print(f"ElevenLabs error: {e}, falling back to gTTS")
-                    # Fall through to gTTS for this sentence
                     for audio_chunk in use_gtts_for_sentence(sentence):
                         if audio_chunk:
                             yield audio_chunk
         else:
-            # Fall back to gTTS
             print("ElevenLabs API key not found, using gTTS...")
             for sentence in sentences:
                 if sentence.strip():
@@ -188,28 +186,6 @@ def text_to_speech(text):
         print(f"Exception in text_to_speech: {e}")
         yield None
 
-def get_deepseek_response(messages):
-    url = "https://api.deepseek.com/v1/chat/completions"
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}"
-    }
-    payload = {
-        "model": "deepseek-chat",
-        "messages": messages,
-        "temperature": 0.7,
-        "max_tokens": 512
-    }
-    response = requests.post(url, json=payload, headers=headers)
-
-    # Check for error response
-    if response.status_code != 200:
-        print(f"DeepSeek API error: {response.status_code} - {response.text}")
-        return "I'm sorry, I encountered an error processing your request."
-
-    response_json = response.json()
-    return response_json["choices"][0]["message"]["content"]
-
 # WebRTC configuration required for Hugging Face Spaces
 rtc_config = {
     "iceServers": [
@@ -232,24 +208,25 @@
     ]
 }
 
-# Create Gradio interface with the required rtc_configuration
-chatbot = gr.Chatbot(type="messages")
-stream = Stream(
-    modality="audio",
-    mode="send-receive",
-    handler=ReplyOnPause(response, input_sample_rate=16000),
-    additional_outputs_handler=lambda a, b: b,
-    additional_inputs=[chatbot],
-    additional_outputs=[chatbot],
-    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek & ElevenLabs)"},
-    rtc_configuration=rtc_config # Add the WebRTC configuration
-)
-
-# Create the Gradio interface without serving it
-ui = stream.ui
-
-# Export the Gradio app for Hugging Face Spaces to find it
-# In Hugging Face Spaces, this will be automatically served
-demo = ui
+# Initialize Gradio app with a standard pattern that Hugging Face recognizes
+with gr.Blocks(title="LLM Voice Chat") as demo:
+    gr.Markdown("# LLM Voice Chat (Powered by DeepSeek & ElevenLabs)")
+
+    # Create a custom Stream component that Gradio can render
+    chatbot = gr.Chatbot(type="messages")
+
+    # This is the key part - use Stream as a component inside the Gradio app
+    stream_component = Stream(
+        modality="audio",
+        mode="send-receive",
+        handler=ReplyOnPause(response, input_sample_rate=16000),
+        additional_outputs_handler=lambda a, b: b,
+        additional_inputs=[chatbot],
+        additional_outputs=[chatbot],
+        rtc_configuration=rtc_config
+    )
+
+    # Make the stream component appear in the Gradio UI
+    stream_component.render()
 
-# Do not include any server initialization code here - just export the Gradio app
+# The variable 'demo' will be picked up by Hugging Face Spaces
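
Since demo is now a plain gr.Blocks instance, Spaces serves it automatically; for local testing outside Spaces, the standard Gradio entry point (deliberately absent from this commit) would be:

# Hypothetical local entry point; the guard keeps the export-only behavior
# intact when Hugging Face Spaces imports the module.
if __name__ == "__main__":
    demo.launch()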
 