"""Gradio front-end for speech transcription with an OpenAI Whisper model.

On start-up the script reads the accepted language names from
``languages.txt`` (comma-separated), builds a Hugging Face
automatic-speech-recognition pipeline and launches a web interface with a
microphone input and a free-text language field.
"""

import torch
from transformers import pipeline
import numpy as np
import gradio as gr

# Use the first CUDA device with half precision when available; otherwise
# fall back to full precision on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-medium"

print("\n\nReading Languages...\n\n")

with open("languages.txt", "r", encoding="utf-8") as file:
    # Normalise every entry (trim + lowercase) and drop empty ones, so a
    # trailing comma in the file cannot make "" count as a valid language.
    # A set gives O(1) membership tests in transcribe().
    languages = {
        language.strip().lower()
        for language in file.read().split(",")
        if language.strip()
    }

print("\n\nInitializing model...\n\n")

transcriber = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    torch_dtype=torch_dtype,
    device=device,
)

print("\n\nModel Ready!!\n\nLaunching Interface...\n\n")


def transcribe(audio, language: str) -> str:
    """Transcribe a recorded clip in the requested language.

    Args:
        audio: ``(sampling_rate, samples)`` pair where ``samples`` is a NumPy
            array, either 1-D or 2-D with channels on axis 1. May be ``None``
            (presumably when nothing was recorded -- confirm against the
            Gradio ``Audio`` component documentation).
        language: Language name typed by the user. It is trimmed and
            lowercased, then checked against the names in ``languages.txt``.

    Returns:
        The transcribed text, or a human-readable error message when the
        language is unknown or no usable audio was supplied.
    """
    # Validate the cheap inputs first so bad requests never touch the model.
    if audio is None:
        return "Error!! No audio received!!"

    language = (language or "").strip().lower()
    if language not in languages:
        return "Error!! Not a valid language!!"

    sr, y = audio
    if y.size == 0:
        # np.max() raises on an empty array; report it instead of crashing.
        return "Error!! No audio received!!"

    # Convert to mono if stereo
    if y.ndim > 1:
        y = y.mean(axis=1)

    # astype() copies, so the in-place scaling below never mutates the
    # caller's array.
    y = y.astype(np.float32)

    # Peak-normalise to [-1, 1]. Skip it for pure silence: dividing by a
    # zero peak would fill the waveform with NaN.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    args = {"task": "transcribe", "language": language}
    return transcriber({"sampling_rate": sr, "raw": y}, generate_kwargs=args)["text"]


demo = gr.Interface(
    transcribe,
    inputs=[
        gr.Audio(sources="microphone"),
        gr.Textbox(label="Language", placeholder="Enter the language"),
    ],
    outputs=["text"],
    title="Whisper Model Interface",
    description=model_id,
)

demo.launch()