Update app.py
app.py
CHANGED
@@ -2,7 +2,6 @@
 # ruff: noqa: E402

 import json
-import re
 import tempfile
 import os

@@ -17,7 +16,7 @@ from groq import Groq
 from cached_path import cached_path
 from transformers import AutoModelForCausalLM, AutoTokenizer

-# Try to import spaces; if available,
+# Try to import spaces; if available, wrap functions for GPU support.
 try:
     import spaces

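The completed comment names a common Hugging Face Spaces pattern: import `spaces` when it is available and fall back to a no-op otherwise. A minimal sketch of what such a guard plausibly looks like — the `USING_SPACES` flag and the fallback body are assumptions, since only the try/import and the `@gpu_decorator` usages appear in this diff:

# Hypothetical reconstruction of the guarded import the comment describes.
try:
    import spaces

    USING_SPACES = True
except ImportError:
    USING_SPACES = False


def gpu_decorator(func):
    # On ZeroGPU Spaces, request a GPU while the wrapped function runs;
    # everywhere else, return the function unchanged.
    if USING_SPACES:
        return spaces.GPU(func)
    return func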
@@ -70,7 +69,6 @@ def load_f5tts(
 F5TTS_ema_model = load_f5tts()


-
 @gpu_decorator
 def generate_response(messages, apikey):
     """
@@ -88,14 +86,13 @@ def generate_response(messages, apikey):
         model="deepseek-r1-distill-llama-70b",
         stream=False,
     )
-    # Check that we got a valid response.
     if chat_completion.choices and hasattr(chat_completion.choices[0].message, "content"):
         return chat_completion.choices[0].message.content
     return ""


 @gpu_decorator
-def process_audio_input(audio_path, text, history, conv_state):
+def process_audio_input(audio_path, text, apikey, history, conv_state):
     """
     Process audio and/or text input from the user:
     - If an audio file is provided, its transcript is obtained.
@@ -105,7 +102,7 @@ def process_audio_input(audio_path, text, history, conv_state):
         return history, conv_state, ""

     if audio_path:
-        # preprocess_ref_audio_text returns a tuple (audio, transcript)
+        # preprocess_ref_audio_text returns a tuple (audio, transcript)
         _, text = preprocess_ref_audio_text(audio_path, text)

     if not text.strip():
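The transcript fallback here matches the new UI hint "Optional: Leave blank to auto-transcribe". A hedged usage sketch — the empty-string behavior is inferred from that hint, not shown in this hunk:

# Assumed behavior: given an empty transcript, preprocess_ref_audio_text
# transcribes the clip itself and returns (processed_audio, transcript).
_, transcript = preprocess_ref_audio_text(audio_path, "")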
@@ -113,7 +110,7 @@ def process_audio_input(audio_path, text, history, conv_state):

     conv_state.append({"role": "user", "content": text})
     history.append((text, None))
-    response = generate_response(conv_state)
+    response = generate_response(conv_state, apikey)
     conv_state.append({"role": "assistant", "content": response})
     history[-1] = (text, response)
     return history, conv_state, ""
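The `apikey` threaded through `process_audio_input` ends up in `generate_response`, whose body (in the earlier hunk) reads `chat_completion.choices[0].message.content`. A minimal sketch of the per-request pattern this enables; constructing the `Groq` client inside the function is an assumption, since the client setup sits outside these hunks:

from groq import Groq


def generate_response(messages, apikey):
    # Hypothetical client setup: build the client from the key the UI
    # collects, rather than from a server-side environment variable.
    client = Groq(api_key=apikey)
    chat_completion = client.chat.completions.create(
        messages=messages,
        model="deepseek-r1-distill-llama-70b",
        stream=False,
    )
    if chat_completion.choices and hasattr(chat_completion.choices[0].message, "content"):
        return chat_completion.choices[0].message.content
    return ""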
@@ -177,7 +174,7 @@ def infer(
     return (final_sample_rate, final_wave), spectrogram_path, ref_text


-with gr.Blocks() as
+with gr.Blocks() as app:
     gr.Markdown(
         """
# Voice Chat
@@ -189,141 +186,133 @@ Have a conversation with an AI using your reference voice!
"""
     )

-[… 14 removed lines lost in page extraction (start of the old control layout) …]
-                    lines=2,
-                )
-                system_prompt_chat = gr.Textbox(
-                    label="System Prompt",
-                    value=(
-                        "You are not an AI assistant, you are whoever the user says you are. "
-                        "You must stay in character. Keep your responses concise since they will be spoken out loud."
-                    ),
-                    lines=2,
-                )
-
-    chatbot_interface = gr.Chatbot(label="Conversation")
-    with gr.Row():
-        with gr.Column():
-            audio_input_chat = gr.Microphone(label="Speak your message", type="filepath")
-            audio_output_chat = gr.Audio(autoplay=True)
-        with gr.Column():
-            with gr.Row():
-                groq_apikey = gr.Textbox(label="Your Groq API Key")
-                text_input_chat = gr.Textbox(label="Type your message", lines=1)
-            send_btn_chat = gr.Button("Send Message")
-            clear_btn_chat = gr.Button("Clear Conversation")
-
-    # Initialize the conversation state with the system prompt.
-    conversation_state = gr.State(
-        value=[
-            {
-                "role": "system",
-                "content": (
-                    "You are not an AI assistant, you are whoever the user says you are. "
-                    "You must stay in character. Keep your responses concise since they will be spoken out loud."
-                ),
-            }
-        ]
-    )
-
-[… 20 removed lines lost in page extraction (old audio-response code, ending below) …]
-        )
-        return audio_result, ref_text_out
-
-    def clear_conversation():
-        """
-        Clear the chat conversation and reset the conversation state.
-        """
-        initial_state = [
-            {
-                "role": "system",
-                "content": (
-                    "You are not an AI assistant, you are whoever the user says you are. "
-                    "You must stay in character. Keep your responses concise since they will be spoken out loud."
-                ),
-            }
-        ]
-        return [], initial_state
-
-    def update_system_prompt(new_prompt):
-        """
-        Update the system prompt and reset the conversation.
-        """
-        initial_state = [{"role": "system", "content": new_prompt}]
-        return [], initial_state
-
-    # Set up callbacks so that when recording stops, or text is submitted, the chain of processing is run.
-    audio_input_chat.stop_recording(
-        process_audio_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state],
-    ).then(
-        generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
-        outputs=[audio_output_chat, ref_text_chat],
-    ).then(lambda: None, None, audio_input_chat)
-
-    text_input_chat.submit(
-        process_audio_input,
-        inputs=[audio_input_chat, text_input_chat, groq_apikey, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state],
-    ).then(
-        generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
-        outputs=[audio_output_chat, ref_text_chat],
-    ).then(lambda: None, None, text_input_chat)
-
-    send_btn_chat.click(
-        process_audio_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state],
-    ).then(
-        generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
-        outputs=[audio_output_chat, ref_text_chat],
-    ).then(lambda: None, None, text_input_chat)
-
-    clear_btn_chat.click(clear_conversation, outputs=[chatbot_interface, conversation_state])
-    system_prompt_chat.change(
-        update_system_prompt,
-        inputs=system_prompt_chat,
-        outputs=[chatbot_interface, conversation_state],
-    )
-
-[… 2 removed lines lost in page extraction …]
+    with gr.Row():
+        with gr.Column():
+            ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
+        with gr.Column():
+            with gr.Accordion("Advanced Settings", open=False):
+                remove_silence_chat = gr.Checkbox(label="Remove Silences", value=True)
+                ref_text_chat = gr.Textbox(
+                    label="Reference Text",
+                    info="Optional: Leave blank to auto-transcribe",
+                    lines=2,
+                )
+                system_prompt_chat = gr.Textbox(
+                    label="System Prompt",
+                    value=(
+                        "You are not an AI assistant, you are whoever the user says you are. "
+                        "You must stay in character. Keep your responses concise since they will be spoken out loud."
+                    ),
+                    lines=2,
+                )
+
+    chatbot_interface = gr.Chatbot(label="Conversation")
+
+    with gr.Row():
+        with gr.Column():
+            audio_input_chat = gr.Microphone(label="Speak your message", type="filepath")
+            audio_output_chat = gr.Audio(autoplay=True)
+        with gr.Column():
+            groq_apikey = gr.Textbox(label="Your Groq API Key")
+            text_input_chat = gr.Textbox(label="Type your message", lines=1)
+            send_btn_chat = gr.Button("Send Message")
+            clear_btn_chat = gr.Button("Clear Conversation")
+
+    # Initialize the conversation state with the system prompt.
+    conversation_state = gr.State(
+        value=[
+            {
+                "role": "system",
+                "content": (
+                    "You are not an AI assistant, you are whoever the user says you are. "
+                    "You must stay in character. Keep your responses concise since they will be spoken out loud."
+                ),
+            }
+        ]
+    )
+
+    @gpu_decorator
+    def generate_audio_response(history, ref_audio, ref_text, remove_silence):
+        """
+        Generate an audio response from the last AI message in the conversation.
+        """
+        if not history or not ref_audio:
+            return None, ref_text
+
+        last_user_message, last_ai_response = history[-1]
+        if not last_ai_response:
+            return None, ref_text
+
+        audio_result, _, ref_text_out = infer(
+            ref_audio,
+            ref_text,
+            last_ai_response,
+            remove_silence,
+            cross_fade_duration=0.15,
+            speed=1.0,
+            show_info=print,
+        )
+        return audio_result, ref_text_out
+
+    def clear_conversation():
+        """
+        Clear the chat conversation and reset the conversation state.
+        """
+        initial_state = [
+            {
+                "role": "system",
+                "content": (
+                    "You are not an AI assistant, you are whoever the user says you are. "
+                    "You must stay in character. Keep your responses concise since they will be spoken out loud."
+                ),
+            }
+        ]
+        return [], initial_state
+
+    def update_system_prompt(new_prompt):
+        """
+        Update the system prompt and reset the conversation.
+        """
+        initial_state = [{"role": "system", "content": new_prompt}]
+        return [], initial_state
+
+    # Set up callbacks so that when recording stops or text is submitted, the processing chain is run.
+    audio_input_chat.stop_recording(
+        process_audio_input,
+        inputs=[audio_input_chat, text_input_chat, groq_apikey, chatbot_interface, conversation_state],
+        outputs=[chatbot_interface, conversation_state, None],
+    ).then(
+        generate_audio_response,
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
+        outputs=[audio_output_chat, ref_text_chat],
+    ).then(lambda: None, None, audio_input_chat)
+
+    text_input_chat.submit(
+        process_audio_input,
+        inputs=[audio_input_chat, text_input_chat, groq_apikey, chatbot_interface, conversation_state],
+        outputs=[chatbot_interface, conversation_state, None],
+    ).then(
+        generate_audio_response,
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
+        outputs=[audio_output_chat, ref_text_chat],
+    ).then(lambda: None, None, text_input_chat)
+
+    send_btn_chat.click(
+        process_audio_input,
+        inputs=[audio_input_chat, text_input_chat, groq_apikey, chatbot_interface, conversation_state],
+        outputs=[chatbot_interface, conversation_state, None],
+    ).then(
+        generate_audio_response,
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
+        outputs=[audio_output_chat, ref_text_chat],
+    ).then(lambda: None, None, text_input_chat)
+
+    clear_btn_chat.click(clear_conversation, outputs=[chatbot_interface, conversation_state])
+    system_prompt_chat.change(
+        update_system_prompt,
+        inputs=system_prompt_chat,
+        outputs=[chatbot_interface, conversation_state],
+    )


 @click.command()
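For readers unfamiliar with the Gradio patterns this commit leans on, here is a self-contained toy sketch (a hypothetical demo, not this Space's code) of the two ideas the rebuilt block combines: gr.State for per-session conversation history, and .then() chaining to run a follow-up step and then reset an input. Note the commit passes None as the third entry in the outputs lists and clears the textbox in a separate .then(); the sketch keeps that chained-clear style.

import gradio as gr

with gr.Blocks() as demo:
    # gr.State is copied per browser session, so each visitor
    # gets an independent conversation history.
    conv_state = gr.State([{"role": "system", "content": "You are concise."}])
    chatbot = gr.Chatbot(label="Conversation")
    msg_box = gr.Textbox(label="Type your message")

    def respond(message, history, conv):
        conv.append({"role": "user", "content": message})
        reply = f"echo: {message}"  # stand-in for the real LLM call
        conv.append({"role": "assistant", "content": reply})
        return history + [(message, reply)], conv

    def speak(history):
        # Stand-in for the follow-up step; the real app runs
        # generate_audio_response here to synthesize the reply.
        print("would synthesize:", history[-1][1])

    # Chain: update the chat, then run the follow-up step, then clear the box.
    msg_box.submit(
        respond,
        inputs=[msg_box, chatbot, conv_state],
        outputs=[chatbot, conv_state],
    ).then(
        speak,
        inputs=chatbot,
        outputs=None,
    ).then(lambda: None, None, msg_box)

demo.launch()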