Twelve2five committed
Commit ccc0748 · verified · 1 Parent(s): 01f7ec4

Update app.py

Files changed (1):
  1. app.py +70 -130
app.py CHANGED
@@ -1,33 +1,25 @@
 import os
 import time
+import requests
 import gradio as gr
 import numpy as np
 from dotenv import load_dotenv
 from elevenlabs import ElevenLabs
+from fastapi import FastAPI
 from fastrtc import (
+    AdditionalOutputs,
+    ReplyOnPause,
     Stream,
     get_stt_model,
-    ReplyOnPause,
-    AdditionalOutputs
+    get_twilio_turn_credentials,
 )
-import logging
-import requests
-import io
-import soundfile as sf
-from gtts import gTTS
-import re
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger("voice-assistant")
+from gradio.utils import get_space
+from numpy.typing import NDArray
 
 # Load environment variables
 load_dotenv()
 
-# Initialize clients
-elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
-stt_model = get_stt_model()
-
+# Initialize DeepSeek client
 class DeepSeekAPI:
     def __init__(self, api_key):
         self.api_key = api_key
@@ -48,130 +40,66 @@ class DeepSeekAPI:
 
         # Check for error response
         if response.status_code != 200:
-            logger.error(f"DeepSeek API error: {response.status_code} - {response.text}")
+            print(f"DeepSeek API error: {response.status_code} - {response.text}")
             return {"choices": [{"message": {"content": "I'm sorry, I encountered an error processing your request."}}]}
 
         return response.json()
 
+# Initialize clients
 deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
+tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
+stt_model = get_stt_model()
+
+# Get Twilio TURN credentials
+twilio_credentials = get_twilio_turn_credentials(
+    account_sid=os.getenv("TWILIO_ACCOUNT_SID"),
+    auth_token=os.getenv("TWILIO_AUTH_TOKEN")
+)
+
+# Log Twilio status
+if twilio_credentials:
+    print("Twilio TURN credentials successfully configured")
+else:
+    print("No Twilio credentials found or invalid credentials")
+
 
+# Handler function for voice conversation
 def response(
-    audio: tuple[int, np.ndarray],
-    chatbot: list[tuple] | None = None,
+    audio: tuple[int, NDArray[np.int16 | np.float32]],
+    chatbot: list[dict] | None = None,
 ):
     chatbot = chatbot or []
-
-    # Convert speech to text
+    messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
+    start = time.time()
     text = stt_model.stt(audio)
-    logger.info(f"User said: {text}")
-
-    # Add user message to chat
-    chatbot.append((text, None))
+    print("transcription", time.time() - start)
+    print("prompt", text)
+    chatbot.append({"role": "user", "content": text})
     yield AdditionalOutputs(chatbot)
+    messages.append({"role": "user", "content": text})
 
-    # Get AI response
-    messages = []
-    for user_text, assistant_text in chatbot:
-        messages.append({"role": "user", "content": user_text})
-        if assistant_text:
-            messages.append({"role": "assistant", "content": assistant_text})
-
-    # Call DeepSeek API
-    response_data = deepseek_client.chat_completion(messages)
+    # Replace Groq LLM with DeepSeek
+    response_data = deepseek_client.chat_completion(
+        messages=messages,
+        max_tokens=512
+    )
     response_text = response_data["choices"][0]["message"]["content"]
-    logger.info(f"DeepSeek response: {response_text[:50]}...")
-
-    # Update chatbot with AI response
-    chatbot[-1] = (text, response_text)
+
+    chatbot.append({"role": "assistant", "content": response_text})
+
+    for chunk in tts_client.text_to_speech.convert_as_stream(
+        text=response_text,
+        voice_id="Antoni",  # Changed to Antoni, a default voice
+        model_id="eleven_multilingual_v2",
+        output_format="pcm_24000",
+    ):
+        audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
+        yield (24000, audio_array)
     yield AdditionalOutputs(chatbot)
-
-    # Convert response to speech
-    if os.getenv("ELEVENLABS_API_KEY"):
-        try:
-            logger.info("Using ElevenLabs for speech generation")
-
-            # Use the streaming API for better experience
-            for chunk in elevenlabs_client.text_to_speech.convert_as_stream(
-                text=response_text,
-                voice_id="Antoni",
-                model_id="eleven_monolingual_v1",
-                output_format="pcm_24000"
-            ):
-                audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
-                yield (24000, audio_array)
-
-        except Exception as e:
-            logger.error(f"ElevenLabs error: {e}, falling back to gTTS")
-            # Fall back to gTTS
-            yield from use_gtts_for_text(response_text)
-    else:
-        # Fall back to gTTS
-        logger.info("ElevenLabs API key not found, using gTTS...")
-        yield from use_gtts_for_text(response_text)
-
-def use_gtts_for_text(text):
-    """Helper function to generate speech with gTTS for the entire text"""
-    try:
-        # Split text into sentences for better results
-        sentences = re.split(r'(?<=[.!?])\s+', text)
-
-        for sentence in sentences:
-            if not sentence.strip():
-                continue
-
-            mp3_fp = io.BytesIO()
-            logger.info(f"Using gTTS for: {sentence[:30]}...")
-            tts = gTTS(text=sentence, lang='en-us', tld='com', slow=False)
-            tts.write_to_fp(mp3_fp)
-            mp3_fp.seek(0)
-
-            data, samplerate = sf.read(mp3_fp)
-
-            if len(data.shape) > 1 and data.shape[1] > 1:
-                data = data[:, 0]
-
-            if samplerate != 24000:
-                data = np.interp(
-                    np.linspace(0, len(data), int(len(data) * 24000 / samplerate)),
-                    np.arange(len(data)),
-                    data
-                )
-
-            data = (data * 32767).astype(np.int16)
-
-            # Ensure buffer size is even
-            if len(data) % 2 != 0:
-                data = np.append(data, [0])
-
-            # Reshape and yield in chunks
-            chunk_size = 4800
-            for i in range(0, len(data), chunk_size):
-                chunk = data[i:i+chunk_size]
-                if len(chunk) > 0:
-                    if len(chunk) % 2 != 0:
-                        chunk = np.append(chunk, [0])
-                    chunk = chunk.reshape(1, -1)
-                    yield (24000, chunk)
-    except Exception as e:
-        logger.error(f"gTTS error: {e}")
-        yield None
-
-# Basic WebRTC configuration - just the minimum needed
-rtc_configuration = {
-    "iceServers": [
-        {"urls": ["stun:stun.l.google.com:19302"]},
-        {
-            "urls": ["turn:openrelay.metered.ca:80"],
-            "username": "openrelayproject",
-            "credential": "openrelayproject"
-        }
-    ]
-}
 
-# Create chatbot component for tracking conversation
-chatbot = gr.Chatbot()
 
-# Create Stream outside of any blocks context
+# Create the chatbot and Stream components
+chatbot = gr.Chatbot(type="messages")
 stream = Stream(
     modality="audio",
     mode="send-receive",
@@ -179,13 +107,25 @@ stream = Stream(
     additional_outputs_handler=lambda a, b: b,
     additional_inputs=[chatbot],
     additional_outputs=[chatbot],
-    rtc_configuration=rtc_configuration,
-    ui_args={"title": "LLM Voice Chat (DeepSeek & ElevenLabs)"}
+    rtc_configuration=twilio_credentials,  # Always use Twilio credentials
+    concurrency_limit=5 if get_space() else None,
+    time_limit=90 if get_space() else None,
+    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek, ElevenLabs, and WebRTC ⚡️)"},
 )
 
-# Export the UI directly
-demo = stream.ui
+# Mount the Stream UI to the FastAPI app
+app = FastAPI()
+app = gr.mount_gradio_app(app, stream.ui, path="/")
+
 
-# Expose the demo for Hugging Face Spaces
 if __name__ == "__main__":
-    demo.launch()
+    import os
+
+    os.environ["GRADIO_SSR_MODE"] = "false"
+
+    if (mode := os.getenv("MODE")) == "UI":
+        stream.ui.launch(server_port=7860)
+    elif mode == "PHONE":
+        stream.fastphone(host="0.0.0.0", port=7860)
+    else:
+        stream.ui.launch(server_port=7860)
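
The rewritten entrypoint is configured entirely through the environment. A minimal sketch of how this revision might be exercised locally; the placeholder values and the uvicorn command are assumptions, not taken from the diff:

    # .env — read via load_dotenv(); fill in real keys
    DEEPSEEK_API_KEY=...
    ELEVENLABS_API_KEY=...
    TWILIO_ACCOUNT_SID=...   # optional; the app prints a warning when missing or invalid
    TWILIO_AUTH_TOKEN=...

    # MODE selects the launch branch in __main__; UI is also the fallback default
    MODE=UI python app.py       # Gradio UI on port 7860
    MODE=PHONE python app.py    # fastrtc fastphone endpoint

    # Or serve the FastAPI app that mounts the Stream UI at "/"
    uvicorn app:app --host 0.0.0.0 --port 7860

On Hugging Face Spaces, get_space() is truthy, so the Stream runs with concurrency_limit=5 and time_limit=90; locally both stay None.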