import torch
from transformers import pipeline
import numpy as np
import gradio as gr

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "openai/whisper-medium"
print("\n\nReading Languages...\n\n") | |
with open("languages.txt", "r") as file: | |
languages = file.read().strip().split(",") | |
languages = [language.strip().lower() for language in languages] | |
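# Example languages.txt contents (an assumption for illustration; the actual file
# ships with the Space and is not reproduced here):
#   english, hindi, french, german, spanish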
print("\n\nInitializing model...\n\n") | |
transcriber = pipeline( | |
"automatic-speech-recognition", | |
model=model_id, | |
torch_dtype=torch_dtype, | |
device=device, | |
) | |
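# Note (assumption, not part of the original Space): for recordings longer than
# 30 seconds, the pipeline can be built with chunked long-form transcription, e.g.:
#   transcriber = pipeline("automatic-speech-recognition", model=model_id,
#                          chunk_length_s=30, torch_dtype=torch_dtype, device=device)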
print("\n\nModel Ready!!\n\nLaunching Interface...\n\n") | |
def transcribe(audio, language: str):
    if audio is None:
        return "Error: no audio was provided."
    # Validate the requested language before doing any audio processing.
    language = language.strip().lower()
    if language not in languages:
        return f"Error: '{language}' is not a supported language."
    sr, y = audio
    # Convert to mono if stereo.
    if y.ndim > 1:
        y = y.mean(axis=1)
    # Peak-normalize to [-1, 1]; skip silent clips to avoid division by zero.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    args = {"task": "transcribe", "language": language}
    return transcriber({"sampling_rate": sr, "raw": y}, generate_kwargs=args)["text"]
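# Quick local check (an illustrative sketch, not part of the Space): transcribe()
# accepts the same (sample_rate, samples) tuple that gr.Audio produces, so it can be
# exercised without the UI, assuming "english" is listed in languages.txt:
#   sr = 16000
#   y = (np.random.randn(sr) * 0.01).astype(np.float32)  # one second of near-silence
#   print(transcribe((sr, y), "english"))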
demo = gr.Interface(
    transcribe,
    inputs=[
        gr.Audio(sources=["microphone"]),
        gr.Textbox(label="Language", placeholder="Enter the language"),
    ],
    outputs=["text"],
    title="Whisper Model Interface",
    description=f"Automatic speech recognition powered by {model_id}.",
)

demo.launch()
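# Optional (assumption, not in the original Space): when running outside Hugging Face
# Spaces, demo.launch(share=True) requests a temporary public URL, and
# demo.launch(server_name="0.0.0.0") exposes the app from inside a container.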