import gradio as gr
from transformers import pipeline
import numpy as np

# Initialize the automatic speech recognition pipeline using a pre-trained model
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

# Global variables to store the accumulated audio data and its sampling rate.
# NOTE: these are module-level, so every caller in this process reads and
# writes the same buffer.
audio_data = None
streaming_rate = None


def capture_audio(stream, new_chunk):
    """
    Capture one chunk of streaming audio and append it to the running stream.

    The chunk is converted to float32, down-mixed to mono if it has more than
    one dimension, and peak-normalized before being appended. The updated
    stream and the chunk's sampling rate are also mirrored into the
    module-level ``audio_data`` / ``streaming_rate`` so that
    ``get_transcript`` can read them.

    Args:
        stream (numpy.ndarray | None): The accumulated audio data up to this
            point, or None if nothing has been captured yet.
        new_chunk (tuple | None): A tuple containing the sampling rate and the
            new audio data chunk. A None or empty chunk is ignored.

    Returns:
        numpy.ndarray | None: The updated stream with the new chunk appended,
        or ``stream`` unchanged if the chunk carried no samples.
    """
    global audio_data
    global streaming_rate

    # Nothing to append: leave both the session stream and the globals as-is.
    if new_chunk is None:
        return stream

    sr, y = new_chunk
    if y is None or np.size(y) == 0:
        # np.max() raises ValueError on an empty array, so bail out early.
        return stream

    streaming_rate = sr

    # Down-mix multi-channel audio (assumed (samples, channels), as in the
    # Gradio streaming ASR guide) so the stream stays one-dimensional.
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)

    # Peak-normalize the chunk. A silent chunk has a peak of 0; dividing by it
    # would fill the stream with NaN, so silent chunks are appended untouched.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Concatenate new audio chunk to the existing stream or start a new one
    stream = y if stream is None else np.concatenate([stream, y])

    # Update the global variable with the new audio data
    audio_data = stream
    return stream


def get_transcript():
    """
    Transcribe the audio accumulated so far by ``capture_audio``.

    Returns:
        str: The transcription of the accumulated audio data, or an empty
        string if no audio has been captured yet.
    """
    # Guard clause: nothing has been recorded (or the buffer is empty).
    if audio_data is None or streaming_rate is None or audio_data.size == 0:
        return ""

    result = transcriber({"sampling_rate": streaming_rate, "raw": audio_data})
    return result["text"]


# Building the Gradio interface using Blocks
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # State variable to manage the streaming data
            state = gr.State()
            # Audio component for real-time audio capture from the microphone
            audio = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
            # Textbox for displaying the transcription
            transcript_box = gr.Textbox(label="Transcript")
            # Button to initiate transcription of the captured audio
            rfrsh_btn = gr.Button("Refresh")

    # Streaming setup to handle real-time audio capture
    audio.stream(fn=capture_audio, inputs=[state, audio], outputs=[state])
    # Button click setup to trigger transcription
    rfrsh_btn.click(fn=get_transcript, outputs=[transcript_box])

# Launch the Gradio interface only when run as a script, so importing this
# module (e.g. to reuse capture_audio) does not start a server.
if __name__ == "__main__":
    demo.launch()