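"""Streaming voice-chat demo: record microphone audio, detect a pause, and
reply with assistant audio. Placeholder logic stands in for a real
speech-to-speech model (the comments mention mini omni)."""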
import gradio as gr
import numpy as np
import io
import tempfile
from pydub import AudioSegment
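# Note: pydub shells out to ffmpeg (or libav), which must be on PATH for mp3 export.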
from dataclasses import dataclass, field
@dataclass
class AppState:
    stream: np.ndarray | None = None
    sampling_rate: int = 0
    pause_detected: bool = False
    stopped: bool = False
    started_talking: bool = False
    conversation: list = field(default_factory=list)  # use default_factory for mutable defaults
# Process streamed audio input and detect pauses
def process_audio(audio: tuple, state: AppState):
    # Gradio delivers microphone chunks as (sampling_rate, samples)
    if state.stream is None:
        state.stream = audio[1]
        state.sampling_rate = audio[0]
    else:
        state.stream = np.concatenate((state.stream, audio[1]))

    # Placeholder pause detection: treat more than 1 s of accumulated audio
    # as a pause (replace with a real detector, e.g. the RMS sketch below)
    pause_detected = len(state.stream) > state.sampling_rate * 1
    state.pause_detected = pause_detected

    if state.pause_detected:
        return gr.Audio(recording=False), state  # stop recording
    return None, state
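# A minimal sketch of an energy-based pause detector that could replace the
# length check above. The function name, the 1-second window, and the RMS
# threshold are illustrative assumptions, not part of the original app.
def determine_pause(stream: np.ndarray, sampling_rate: int, threshold: float = 0.01) -> bool:
    """Return True when the trailing second of audio is mostly silence."""
    if len(stream) < sampling_rate:
        return False  # not enough audio yet to judge
    window = stream[-sampling_rate:].astype(np.float64)
    if stream.dtype == np.int16:  # normalize raw microphone samples
        window /= 32768.0
    rms = np.sqrt(np.mean(np.square(window)))
    return rms < threshold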
# Generate the chatbot's reply once a pause has been detected
def response(state: AppState):
    if not state.pause_detected:
        yield None, state
        return

    # Convert the accumulated user audio to WAV and log it in the conversation
    audio_buffer = io.BytesIO()
    segment = AudioSegment(
        state.stream.tobytes(),
        frame_rate=state.sampling_rate,
        sample_width=state.stream.dtype.itemsize,
        channels=1 if len(state.stream.shape) == 1 else state.stream.shape[1],
    )
    segment.export(audio_buffer, format="wav")

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_buffer.getvalue())
    state.conversation.append({"role": "user", "content": {"path": f.name, "mime_type": "audio/wav"}})

    # Simulate the chatbot's response (replace with the mini omni model's output)
    chatbot_response = b"Simulated response audio content"  # placeholder mp3 bytes
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(chatbot_response)
    state.conversation.append({"role": "assistant", "content": {"path": f.name, "mime_type": "audio/mp3"}})

    # Stream the reply to the streaming output component; a real model would
    # yield successive audio chunks here
    yield chatbot_response, state
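# A hedged sketch of streaming real model audio instead of the one-shot
# placeholder above: run_omni_model is a hypothetical generator of mp3 byte
# chunks, and the chunk-by-chunk yield matches output_audio's streaming=True.
def run_omni_model(wav_path: str):
    """Hypothetical model call; a real implementation would stream mp3 bytes."""
    yield b"Simulated response audio content"  # stand-in for model output

def stream_response(state: AppState, wav_path: str):
    for mp3_chunk in run_omni_model(wav_path):
        yield mp3_chunk, state  # each chunk is appended to the playing audio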
# --- Gradio Interface ---

# Re-arm the microphone for the next user turn (unless the conversation was stopped)
def start_recording_user(state: AppState):
    if not state.stopped:
        return gr.Audio(recording=True)
# Build the Gradio app using the Blocks API
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
        with gr.Column():
            chatbot = gr.Chatbot(label="Conversation", type="messages")
            output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
    state = gr.State(value=AppState())

    # Stream microphone chunks into process_audio every 0.5 s, up to 30 s per turn
    stream = input_audio.stream(
        process_audio, [input_audio, state], [input_audio, state], stream_every=0.5, time_limit=30
    )
    # When recording stops, generate the reply, then refresh the chat history
    respond = input_audio.stop_recording(response, [state], [output_audio, state])
    respond.then(lambda s: s.conversation, [state], [chatbot])
    # When playback finishes, start recording the user again
    restart = output_audio.stop(start_recording_user, [state], [input_audio])

    cancel = gr.Button("Stop Conversation", variant="stop")
    cancel.click(
        lambda: (AppState(stopped=True), gr.Audio(recording=False)),
        None,
        [state, input_audio],
        cancels=[respond, restart],
    )

if __name__ == "__main__":
    demo.launch()
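# To run locally (assumed dependencies): pip install gradio pydub numpy,
# ensure ffmpeg is on PATH, then run this file with Python.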