NeoPy committed
Commit 21c1f7b · verified · 1 Parent(s): 12443a2

Update app.py

Files changed (1):
  1. app.py +126 -137
app.py CHANGED
@@ -2,7 +2,6 @@
 # ruff: noqa: E402
 
 import json
-import re
 import tempfile
 import os
 
@@ -17,7 +16,7 @@ from groq import Groq
 from cached_path import cached_path
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-# Try to import spaces; if available, set USING_SPACES to True so we can decorate functions for GPU support.
+# Try to import spaces; if available, wrap functions for GPU support.
 try:
     import spaces
 
@@ -70,7 +69,6 @@ def load_f5tts(
 F5TTS_ema_model = load_f5tts()
 
 
-
 @gpu_decorator
 def generate_response(messages, apikey):
     """
@@ -88,14 +86,13 @@ def generate_response(messages, apikey):
         model="deepseek-r1-distill-llama-70b",
         stream=False,
     )
-    # Check that we got a valid response.
     if chat_completion.choices and hasattr(chat_completion.choices[0].message, "content"):
         return chat_completion.choices[0].message.content
     return ""
 
 
 @gpu_decorator
-def process_audio_input(audio_path, text, history, conv_state):
+def process_audio_input(audio_path, text, apikey, history, conv_state):
     """
     Process audio and/or text input from the user:
       - If an audio file is provided, its transcript is obtained.
@@ -105,7 +102,7 @@ def process_audio_input(audio_path, text, history, conv_state):
         return history, conv_state, ""
 
     if audio_path:
-        # preprocess_ref_audio_text returns a tuple (audio, transcript).
+        # preprocess_ref_audio_text returns a tuple (audio, transcript)
         _, text = preprocess_ref_audio_text(audio_path, text)
 
     if not text.strip():
@@ -113,7 +110,7 @@ def process_audio_input(audio_path, text, history, conv_state):
 
     conv_state.append({"role": "user", "content": text})
     history.append((text, None))
-    response = generate_response(conv_state)
+    response = generate_response(conv_state, apikey)
     conv_state.append({"role": "assistant", "content": response})
     history[-1] = (text, response)
     return history, conv_state, ""
@@ -177,7 +174,7 @@ def infer(
     return (final_sample_rate, final_wave), spectrogram_path, ref_text
 
 
-with gr.Blocks() as app_chat:
+with gr.Blocks() as app:
     gr.Markdown(
         """
 # Voice Chat
@@ -189,141 +186,133 @@ Have a conversation with an AI using your reference voice!
 """
     )
 
-    # The chat interface container is visible only if running in a Spaces environment.
-    chat_interface_container = gr.Column(visible=USING_SPACES)
-
-    with chat_interface_container:
-        with gr.Row():
-
-            with gr.Column():
-                ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
-            with gr.Column():
-                with gr.Accordion("Advanced Settings", open=False):
-                    remove_silence_chat = gr.Checkbox(label="Remove Silences", value=True)
-                    ref_text_chat = gr.Textbox(
-                        label="Reference Text",
-                        info="Optional: Leave blank to auto-transcribe",
-                        lines=2,
-                    )
-                    system_prompt_chat = gr.Textbox(
-                        label="System Prompt",
-                        value=(
-                            "You are not an AI assistant, you are whoever the user says you are. "
-                            "You must stay in character. Keep your responses concise since they will be spoken out loud."
-                        ),
-                        lines=2,
-                    )
-
-        chatbot_interface = gr.Chatbot(label="Conversation")
-        with gr.Row():
-            with gr.Column():
-                audio_input_chat = gr.Microphone(label="Speak your message", type="filepath")
-                audio_output_chat = gr.Audio(autoplay=True)
-            with gr.Column():
-                with gr.Row():
-                    groq_apikey = gr.Textbox(label="Your Groq API Key")
-                    text_input_chat = gr.Textbox(label="Type your message", lines=1)
-                    send_btn_chat = gr.Button("Send Message")
-                clear_btn_chat = gr.Button("Clear Conversation")
-
-        # Initialize the conversation state with the system prompt.
-        conversation_state = gr.State(
-            value=[
-                {
-                    "role": "system",
-                    "content": (
+    with gr.Row():
+        with gr.Column():
+            ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
+        with gr.Column():
+            with gr.Accordion("Advanced Settings", open=False):
+                remove_silence_chat = gr.Checkbox(label="Remove Silences", value=True)
+                ref_text_chat = gr.Textbox(
+                    label="Reference Text",
+                    info="Optional: Leave blank to auto-transcribe",
+                    lines=2,
+                )
+                system_prompt_chat = gr.Textbox(
+                    label="System Prompt",
+                    value=(
                         "You are not an AI assistant, you are whoever the user says you are. "
                         "You must stay in character. Keep your responses concise since they will be spoken out loud."
                     ),
-                }
-            ]
-        )
+                    lines=2,
+                )
+
+    chatbot_interface = gr.Chatbot(label="Conversation")
+
+    with gr.Row():
+        with gr.Column():
+            audio_input_chat = gr.Microphone(label="Speak your message", type="filepath")
+            audio_output_chat = gr.Audio(autoplay=True)
+        with gr.Column():
+            groq_apikey = gr.Textbox(label="Your Groq API Key")
+            text_input_chat = gr.Textbox(label="Type your message", lines=1)
+            send_btn_chat = gr.Button("Send Message")
+            clear_btn_chat = gr.Button("Clear Conversation")
+
+    # Initialize the conversation state with the system prompt.
+    conversation_state = gr.State(
+        value=[
+            {
+                "role": "system",
+                "content": (
+                    "You are not an AI assistant, you are whoever the user says you are. "
+                    "You must stay in character. Keep your responses concise since they will be spoken out loud."
+                ),
+            }
+        ]
+    )
 
-        @gpu_decorator
-        def generate_audio_response(history, ref_audio, ref_text, remove_silence):
-            """
-            Generate an audio response from the last AI message in the conversation.
-            """
-            if not history or not ref_audio:
-                return None, ref_text
-
-            last_user_message, last_ai_response = history[-1]
-            if not last_ai_response:
-                return None, ref_text
-
-            audio_result, _, ref_text_out = infer(
-                ref_audio,
-                ref_text,
-                last_ai_response,
-                remove_silence,
-                cross_fade_duration=0.15,
-                speed=1.0,
-                show_info=print,
+    @gpu_decorator
+    def generate_audio_response(history, ref_audio, ref_text, remove_silence):
+        """
+        Generate an audio response from the last AI message in the conversation.
+        """
+        if not history or not ref_audio:
+            return None, ref_text
+
+        last_user_message, last_ai_response = history[-1]
+        if not last_ai_response:
+            return None, ref_text
+
+        audio_result, _, ref_text_out = infer(
+            ref_audio,
+            ref_text,
+            last_ai_response,
+            remove_silence,
+            cross_fade_duration=0.15,
+            speed=1.0,
+            show_info=print,
         )
-            return audio_result, ref_text_out
+        return audio_result, ref_text_out
 
-        def clear_conversation():
-            """
-            Clear the chat conversation and reset the conversation state.
-            """
-            initial_state = [
-                {
-                    "role": "system",
-                    "content": (
-                        "You are not an AI assistant, you are whoever the user says you are. "
-                        "You must stay in character. Keep your responses concise since they will be spoken out loud."
-                    ),
-                }
-            ]
-            return [], initial_state
-
-        def update_system_prompt(new_prompt):
-            """
-            Update the system prompt and reset the conversation.
-            """
-            initial_state = [{"role": "system", "content": new_prompt}]
-            return [], initial_state
-
-        # Set up callbacks so that when recording stops, or text is submitted, the chain of processing is run.
-        audio_input_chat.stop_recording(
-            process_audio_input,
-            inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
-            outputs=[chatbot_interface, conversation_state],
-        ).then(
-            generate_audio_response,
-            inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
-            outputs=[audio_output_chat, ref_text_chat],
-        ).then(lambda: None, None, audio_input_chat)
-
-        text_input_chat.submit(
-            process_audio_input,
-            inputs=[audio_input_chat, text_input_chat, groq_apikey, chatbot_interface, conversation_state],
-            outputs=[chatbot_interface, conversation_state],
-        ).then(
-            generate_audio_response,
-            inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
-            outputs=[audio_output_chat, ref_text_chat],
-        ).then(lambda: None, None, text_input_chat)
-
-        send_btn_chat.click(
-            process_audio_input,
-            inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
-            outputs=[chatbot_interface, conversation_state],
-        ).then(
-            generate_audio_response,
-            inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
-            outputs=[audio_output_chat, ref_text_chat],
-        ).then(lambda: None, None, text_input_chat)
-
-        clear_btn_chat.click(clear_conversation, outputs=[chatbot_interface, conversation_state])
-        system_prompt_chat.change(
-            update_system_prompt,
-            inputs=system_prompt_chat,
-            outputs=[chatbot_interface, conversation_state],
-        )
-
-
-app = app_chat
+    def clear_conversation():
+        """
+        Clear the chat conversation and reset the conversation state.
+        """
+        initial_state = [
+            {
+                "role": "system",
+                "content": (
+                    "You are not an AI assistant, you are whoever the user says you are. "
+                    "You must stay in character. Keep your responses concise since they will be spoken out loud."
+                ),
+            }
+        ]
+        return [], initial_state
+
+    def update_system_prompt(new_prompt):
+        """
+        Update the system prompt and reset the conversation.
+        """
+        initial_state = [{"role": "system", "content": new_prompt}]
+        return [], initial_state
+
+    # Set up callbacks so that when recording stops or text is submitted, the processing chain is run.
+    audio_input_chat.stop_recording(
+        process_audio_input,
+        inputs=[audio_input_chat, text_input_chat, groq_apikey, chatbot_interface, conversation_state],
+        outputs=[chatbot_interface, conversation_state, text_input_chat],
+    ).then(
+        generate_audio_response,
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
+        outputs=[audio_output_chat, ref_text_chat],
+    ).then(lambda: None, None, audio_input_chat)
+
+    text_input_chat.submit(
+        process_audio_input,
+        inputs=[audio_input_chat, text_input_chat, groq_apikey, chatbot_interface, conversation_state],
+        outputs=[chatbot_interface, conversation_state, text_input_chat],
+    ).then(
+        generate_audio_response,
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
+        outputs=[audio_output_chat, ref_text_chat],
+    ).then(lambda: None, None, text_input_chat)
+
+    send_btn_chat.click(
+        process_audio_input,
+        inputs=[audio_input_chat, text_input_chat, groq_apikey, chatbot_interface, conversation_state],
+        outputs=[chatbot_interface, conversation_state, text_input_chat],
+    ).then(
+        generate_audio_response,
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
+        outputs=[audio_output_chat, ref_text_chat],
+    ).then(lambda: None, None, text_input_chat)
+
+    clear_btn_chat.click(clear_conversation, outputs=[chatbot_interface, conversation_state])
+    system_prompt_chat.change(
+        update_system_prompt,
+        inputs=system_prompt_chat,
+        outputs=[chatbot_interface, conversation_state],
+    )
 
 
 @click.command()
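
Note on the key change: the Groq API key entered in the UI now travels from the groq_apikey textbox through process_audio_input into generate_response, which the old code called without credentials. A minimal sketch of that flow, assuming generate_response builds its client from the key it receives (the client construction itself is outside the visible hunks):

from groq import Groq

def generate_response(messages, apikey):
    # Assumed body: construct a client from the per-request key (not shown in
    # the diff) and request a completion with the model named in the hunk above.
    client = Groq(api_key=apikey)
    chat_completion = client.chat.completions.create(
        messages=messages,
        model="deepseek-r1-distill-llama-70b",
        stream=False,
    )
    if chat_completion.choices and hasattr(chat_completion.choices[0].message, "content"):
        return chat_completion.choices[0].message.content
    return ""

def process_audio_input(audio_path, text, apikey, history, conv_state):
    # Transcription via preprocess_ref_audio_text is elided here; this mirrors
    # the committed control flow, threading apikey through to generate_response.
    if not text.strip():
        return history, conv_state, ""
    conv_state.append({"role": "user", "content": text})
    history.append((text, None))
    response = generate_response(conv_state, apikey)
    conv_state.append({"role": "assistant", "content": response})
    history[-1] = (text, response)
    return history, conv_state, ""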
 
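For context on the gpu_decorator applied above: the try/import spaces block at the top of app.py selects between Spaces GPU execution and a plain local call. The except branch is outside the visible hunks, so the following is a sketch of the usual Hugging Face Spaces fallback idiom, not the file's verbatim code:

try:
    import spaces  # available when running on Hugging Face Spaces
    USING_SPACES = True
except ImportError:
    USING_SPACES = False

def gpu_decorator(func):
    # On Spaces, schedule the call on a GPU worker; elsewhere, return func unchanged.
    if USING_SPACES:
        return spaces.GPU(func)
    return func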