# app.py
"""Real-time voice-to-text demo: a Gradio microphone UI backed by Whisper.

Importing this module loads the Whisper model and builds the Gradio
interface (``iface``); running it as a script launches the web app.
"""

import gradio as gr
import numpy as np
import torch
import whisper

# Sample rate (Hz) that Whisper expects for raw waveform input.
WHISPER_SAMPLE_RATE = 16000

# Load a fast Whisper model. You can use "tiny" if you want even faster.
model = whisper.load_model("small")


def _to_whisper_waveform(sample_rate, samples):
    """Convert a raw audio chunk to mono float32 at ``WHISPER_SAMPLE_RATE``.

    Args:
        sample_rate: Sample rate of ``samples`` in Hz.
        samples: Array-like of shape ``(n,)`` or ``(n, channels)``; integer
            PCM or floating point.

    Returns:
        A contiguous 1-D ``float32`` NumPy array resampled to
        ``WHISPER_SAMPLE_RATE``. Integer input is scaled into [-1, 1].
    """
    waveform = np.asarray(samples)

    if np.issubdtype(waveform.dtype, np.integer):
        # Integer PCM: scale by the dtype's full range so values land in
        # [-1, 1]. NOTE(review): assumes signed PCM (e.g. int16); unsigned
        # input would not be re-centred around zero.
        info = np.iinfo(waveform.dtype)
        scale = float(max(abs(info.min), info.max))
        waveform = waveform.astype(np.float32) / scale
    else:
        waveform = waveform.astype(np.float32)

    if waveform.ndim > 1:
        # Down-mix (samples, channels) to mono by averaging the channels.
        waveform = waveform.mean(axis=1)

    if waveform.size and sample_rate != WHISPER_SAMPLE_RATE:
        # Linear-interpolation resampling keeps this dependency-free. It
        # applies no anti-aliasing filter, which is a trade-off accepted
        # here for speech input.
        target_len = max(
            1, int(round(waveform.size * WHISPER_SAMPLE_RATE / sample_rate))
        )
        src_times = np.arange(waveform.size) / sample_rate
        dst_times = np.arange(target_len) / WHISPER_SAMPLE_RATE
        waveform = np.interp(dst_times, src_times, waveform)

    return np.ascontiguousarray(waveform, dtype=np.float32)


def transcribe_audio(audio):
    """Transcribe one audio chunk received from the Gradio ``Audio`` input.

    Args:
        audio: ``(sample_rate, numpy_array)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing has been
            recorded yet.

    Returns:
        The text recognized by Whisper for this chunk, or an empty string
        when there is no audio to transcribe.
    """
    if audio is None:
        return ""

    sample_rate, samples = audio[0], audio[1]
    waveform = _to_whisper_waveform(sample_rate, samples)
    if waveform.size == 0:
        return ""

    # Half precision is only requested when CUDA is available.
    result = model.transcribe(waveform, fp16=torch.cuda.is_available())
    return result["text"]


# Gradio Interface
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(sources=["microphone"], type="numpy", streaming=True),
    outputs=gr.Textbox(label="Recognized Text"),
    live=True,  # Important for real-time streaming
    title="Real-time Voice to Text",
    description="Speak into your microphone and get real-time transcription!",
)

if __name__ == "__main__":
    iface.launch()