import gradio as gr
from transformers import pipeline
import numpy as np

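# Load the Whisper ASR pipeline once at startup so the model is reused
# across calls instead of being reloaded for every transcription.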
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
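
# Module-level state: the accumulated audio samples and their sampling rate,
# shared between the streaming callback and the transcription callback.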
audio_data = None
streaming_rate = None


def capture_audio(stream, new_chunk):
    """
    Capture streaming audio and accumulate it in a global variable.

    Args:
        stream (numpy.ndarray): The accumulated audio data up to this point.
        new_chunk (tuple): A tuple containing the sampling rate and the new
            audio data chunk.

    Returns:
        numpy.ndarray: The updated stream with the new chunk appended.
    """
    global audio_data
    global streaming_rate

    sr, y = new_chunk
    streaming_rate = sr

    # Convert the chunk to float32 and normalize it to [-1, 1]. Skip the
    # division for an all-zero (silent) chunk to avoid dividing by zero.
    y = y.astype(np.float32)
    max_abs = np.max(np.abs(y))
    if max_abs > 0:
        y /= max_abs

    # Append the new chunk to the running stream.
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    audio_data = stream
    return stream


def get_transcript():
    """
    Transcribe the accumulated audio data.

    Returns:
        str: The transcription of the accumulated audio data, or an empty
            string if no audio has been captured yet.
    """
    global audio_data
    global streaming_rate

    if audio_data is not None and streaming_rate is not None:
        transcript = transcriber({"sampling_rate": streaming_rate, "raw": audio_data})["text"]
        return transcript
    return ""


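# Build the UI: a streaming microphone input, a transcript box, and a button
# that transcribes whatever audio has accumulated so far.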
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # gr.State carries the accumulated stream between chunk events.
            state = gr.State()
            audio = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
            transcript_box = gr.Textbox(label="Transcript")
            rfrsh_btn = gr.Button("Refresh")

    # Every new microphone chunk is routed through capture_audio, which
    # appends it to the stream held in `state`.
    audio.stream(fn=capture_audio, inputs=[state, audio], outputs=[state])
    # The Refresh button transcribes the audio captured so far.
    rfrsh_btn.click(fn=get_transcript, outputs=[transcript_box])

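# launch() serves the demo locally; the browser will ask for microphone access.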
demo.launch()