import gradio as gr
import numpy as np
import io
import tempfile
from pydub import AudioSegment
from dataclasses import dataclass, field


@dataclass
class AppState:
    stream: np.ndarray | None = None
    sampling_rate: int = 0
    pause_detected: bool = False
    stopped: bool = False
    started_talking: bool = False  # Reserved for richer turn-taking logic
    conversation: list = field(default_factory=list)  # Mutable default via default_factory


# Accumulate streamed audio chunks and decide when the user has stopped talking
def process_audio(audio: tuple, state: AppState):
    sampling_rate, chunk = audio
    if state.stream is None:
        state.stream = chunk
        state.sampling_rate = sampling_rate
    else:
        state.stream = np.concatenate((state.stream, chunk))

    # Placeholder "pause" check: treat one second of accumulated audio as a
    # complete utterance. Swap in real silence detection for production use.
    state.pause_detected = len(state.stream) > state.sampling_rate * 1

    if state.pause_detected:
        return gr.Audio(recording=False), state  # Stop recording; fires stop_recording
    return None, state


# Generate the chatbot response for either a text or an audio turn
def response(user_input, state: AppState, input_type: str):
    if input_type == "text":
        # Handle text input: echo it back as a stand-in for a real model
        state.conversation.append({"role": "user", "content": user_input})
        bot_response = f"Echo: {user_input}"  # Simulated reply; plug a model in here
        state.conversation.append({"role": "assistant", "content": bot_response})
        yield state.conversation, state
        return

    # Audio turns are only answered once a pause has been detected
    if not state.pause_detected:
        yield None, state
        return

    # Convert the accumulated stream to WAV and store it in the conversation history
    audio_buffer = io.BytesIO()
    segment = AudioSegment(
        state.stream.tobytes(),
        frame_rate=state.sampling_rate,
        sample_width=state.stream.dtype.itemsize,
        channels=1 if len(state.stream.shape) == 1 else state.stream.shape[1],
    )
    segment.export(audio_buffer, format="wav")

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_buffer.getvalue())
    state.conversation.append(
        {"role": "user", "content": {"path": f.name, "mime_type": "audio/wav"}}
    )

    # Simulate the bot's response (replace with mini omni inference)
    chatbot_response = b"Simulated response audio content"  # Placeholder bytes, not valid MP3
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(chatbot_response)
    state.conversation.append(
        {"role": "assistant", "content": {"path": f.name, "mime_type": "audio/mp3"}}
    )

    # Reset the audio buffer so the next utterance starts fresh
    state.stream = None
    state.pause_detected = False
    yield None, state


# Named generator wrappers: Gradio only streams outputs when the event function
# itself is a generator, which wrapping `response` in a plain lambda would hide.
def response_text(user_input, state: AppState):
    yield from response(user_input, state, "text")


def response_audio(state: AppState):
    yield from response(None, state, "audio")


# Resume recording once the bot's audio reply has finished playing
def start_recording_user(state: AppState):
    if not state.stopped:
        return gr.Audio(recording=True)
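
# A hedged sketch (not part of the original demo) of energy-based pause
# detection that could replace the fixed one-second threshold in
# process_audio. The name `detect_pause` and both thresholds are assumptions
# to tune for your microphone; it flags a pause when the trailing window of
# samples stays quiet. Example swap-in inside process_audio:
#   state.pause_detected = detect_pause(state.stream, state.sampling_rate)
def detect_pause(
    stream: np.ndarray,
    sampling_rate: int,
    silence_threshold: float = 500.0,  # Mean |amplitude| for int16 audio; 32767 is full scale
    min_silence_seconds: float = 0.5,  # How long the trailing window must stay quiet
) -> bool:
    tail_samples = int(sampling_rate * min_silence_seconds)
    if stream is None or len(stream) < tail_samples:
        return False  # Not enough audio yet to judge
    tail = stream[-tail_samples:]
    return float(np.abs(tail).mean()) < silence_threshold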

# Gradio interface setup
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(label="Input Audio", sources=["microphone"], type="numpy")
            text_input = gr.Textbox(label="Text Input", placeholder="Type your message here...")
        with gr.Column():
            chatbot = gr.Chatbot(label="Conversation", type="messages")
            output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)

    state = gr.State(value=AppState())

    # Stream microphone chunks to process_audio every 0.5 s, up to 30 s per turn
    stream = input_audio.stream(
        process_audio,
        [input_audio, state],
        [input_audio, state],
        stream_every=0.5,
        time_limit=30,
    )

    # Text turns update the chatbot directly
    text_submit = text_input.submit(response_text, [text_input, state], [chatbot, state])

    # Audio turns run when recording stops (triggered by the pause check)
    respond = input_audio.stop_recording(response_audio, [state], [output_audio, state])
    respond.then(lambda s: s.conversation, [state], [chatbot])

    # Restart recording when the bot's audio playback stops
    restart = output_audio.stop(start_recording_user, [state], [input_audio])

    # Stop button cancels any in-flight response and halts recording
    cancel = gr.Button("Stop Conversation", variant="stop")
    cancel.click(
        lambda: (AppState(stopped=True), gr.Audio(recording=False)),
        None,
        [state, input_audio],
        cancels=[respond, restart],
    )

if __name__ == "__main__":
    demo.launch()
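
# Notes (assumptions, not from the original snippet):
# - Run with `python app.py`; Gradio serves the UI at http://localhost:7860 by default.
# - pydub exports WAV natively, but encoding a real MP3 reply (in place of the
#   placeholder bytes above) would require ffmpeg on the PATH.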