File size: 2,637 Bytes
f0a39fa 86e368d f0a39fa 86e368d f0a39fa 86e368d f0a39fa 86e368d f0a39fa 86e368d f0a39fa 86e368d f0a39fa 86e368d f0a39fa 86e368d f0a39fa 86e368d f0a39fa 86e368d f0a39fa 86e368d f0a39fa 86e368d f0a39fa 86e368d f0a39fa 86e368d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import gradio as gr
from transformers import pipeline
import numpy as np
# Module-level buffers shared between the streaming callback and the
# transcription callback: the audio accumulated so far and the sampling rate
# reported with the most recent chunk. Both stay None until audio arrives.
audio_data = None
streaming_rate = None

# Build the speech-recognition pipeline once at import time so that every
# transcription request reuses the same loaded model.
transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
)
def capture_audio(stream, new_chunk):
    """
    Append one streamed microphone chunk to the running audio buffer.

    The chunk is converted to float32, downmixed to mono if it has more than
    one dimension, and peak-normalized so its largest absolute sample is 1.0.
    Empty or all-zero (silent) chunks are left unscaled, because dividing by a
    zero peak would fill the buffer with NaN values.

    Side effects: updates the module-level ``audio_data`` (the full buffer)
    and ``streaming_rate`` (the sampling rate of the latest chunk). These are
    plain module globals, so they are shared by every caller of this function.

    Args:
        stream (numpy.ndarray | None): The audio accumulated so far, or None
            when this is the first chunk.
        new_chunk (tuple): ``(sampling_rate, samples)`` where ``samples`` is a
            numpy array holding the new audio data.

    Returns:
        numpy.ndarray: The updated stream with the new chunk appended.
    """
    global audio_data
    global streaming_rate

    sr, y = new_chunk
    streaming_rate = sr
    y = y.astype(np.float32)

    # Collapse (samples, channels) input to a single channel so the buffer
    # stays 1-D; 1-D input passes through unchanged.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalize, guarding against empty chunks (np.max would raise) and
    # silent chunks (division by zero would produce NaN).
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    # Concatenate the new chunk onto the existing stream, or start a new one.
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # Publish the buffer for the transcription callback.
    audio_data = stream
    return stream
def get_transcript():
    """
    Transcribe whatever audio has been accumulated so far.

    Reads the module-level ``audio_data`` and ``streaming_rate`` and passes
    them to the module-level ``transcriber``.

    Returns:
        str: The "text" field of the transcriber's result, or an empty string
        when no audio or no sampling rate has been recorded yet.
    """
    global audio_data
    global streaming_rate

    # Nothing captured yet: there is nothing to transcribe.
    if audio_data is None or streaming_rate is None:
        return ""

    payload = {"sampling_rate": streaming_rate, "raw": audio_data}
    result = transcriber(payload)
    return result["text"]
# Build the Gradio interface using Blocks.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # State component that carries the accumulated stream between
            # successive calls to capture_audio.
            state = gr.State()
            # Audio component for real-time capture from the microphone;
            # chunks are delivered to the callback as numpy data.
            audio = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
            # Textbox for displaying the transcription.
            transcript_box = gr.Textbox(label="Transcript")
            # Button that triggers transcription of the captured audio.
            rfrsh_btn = gr.Button("Refresh")

    # Each streamed chunk is appended to the state by capture_audio.
    audio.stream(fn=capture_audio, inputs=[state, audio], outputs=[state])
    # Clicking the button writes the current transcript into the textbox.
    rfrsh_btn.click(fn=get_transcript, outputs=[transcript_box])

# Launch the Gradio interface.
demo.launch()