# AudioToText / app.py
# -*- coding: utf-8 -*-
import torch
import torchaudio
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
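# The acoustic model runs on the GPU when one is available; the CTC beam
# search decoder used below is CPU-only, so emissions are moved back to the
# CPU before decoding.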
ctc_preTrained_object = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
model = ctc_preTrained_object.get_model().to(device)
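# WAV2VEC2_ASR_BASE_960H is a wav2vec 2.0 base model fine-tuned for ASR on
# 960 hours of LibriSpeech; given a 16 kHz waveform it emits frame-wise
# probabilities over a character vocabulary, suitable for CTC decoding.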
from torchaudio.models.decoder import download_pretrained_files
files = download_pretrained_files('librispeech-4-gram')
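# `files` bundles the assets the decoder needs: a lexicon mapping words to
# their token spellings, the token vocabulary, and a KenLM 4-gram language
# model trained on the LibriSpeech LM corpus.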
from torchaudio.models.decoder import ctc_decoder
beam_search_decoder = ctc_decoder(
    lexicon=files.lexicon,
    tokens=files.tokens,
    lm=files.lm,
    nbest=3,
    beam_size=3,
)
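# The decoder performs lexicon-constrained beam search over the CTC emissions,
# rescoring hypotheses with the 4-gram LM. `beam_size` caps how many partial
# hypotheses are kept per frame and `nbest` how many final transcripts are
# returned; a beam of 3 trades some accuracy for speed.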
# Local helper module, only needed for the optional playback line below
import audio_support_functions as myFunc
def theaudio(x):
    # Load the audio file and move it to the selected device
    waveform, sample_rate = torchaudio.load(x)
    waveform = waveform.to(device)
    # myFunc.play_audio(waveform.cpu(), sample_rate)
    # Resample if the file's rate differs from the model's expected 16 kHz
    if sample_rate != ctc_preTrained_object.sample_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, ctc_preTrained_object.sample_rate)
    with torch.inference_mode():
        emissions, _ = model(waveform)  # frame-wise label probabilities, not token IDs
    # The beam search decoder expects CPU tensors
    emissions = emissions.to('cpu')
    beam_search_result = beam_search_decoder(emissions)
    # Words of the best (first) hypothesis for the single input utterance
    beam_search_transcript = " ".join(beam_search_result[0][0].words).strip()
    return beam_search_transcript
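# Quick sanity check outside the UI (hypothetical file name; any format
# torchaudio can load works):
#   print(theaudio("sample.wav"))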
import gradio as gr
iface = gr.Interface(
    fn=theaudio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Aud2Text Using CTC",
    description="Upload an audio file or record one and the AI will transcribe it for you!",
)
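# launch() serves the interface on a local URL; passing share=True would give
# a temporary public link when running outside Hugging Face Spaces.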
iface.launch()