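# Spaces:
# Sleeping
# Sleeping
# NOTE(review): the three lines above appear to be status text picked up when
# this file was captured, not program code -- confirm and drop if so.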
# -*- coding: utf-8 -*-
import torch
import torchaudio

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

import IPython
import matplotlib.pyplot as plt
from torchaudio.utils import download_asset

# Pretrained wav2vec 2.0 ASR bundle: it provides the acoustic model and the
# sample rate that model expects its input audio to have.
ctc_preTrained_object = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
model = ctc_preTrained_object.get_model()
model = model.to(device)
from torchaudio.models.decoder import download_pretrained_files
from torchaudio.models.decoder import ctc_decoder

# Fetch the lexicon, token list and language model distributed under the
# 'librispeech-4-gram' name; `files` exposes their local paths.
files = download_pretrained_files('librispeech-4-gram')

# NOTE: an earlier version opened `files.tokens` here without ever reading or
# closing it. That leaked a file handle for the lifetime of the process, so
# the call was removed; the decoder below reads the token file by path.

# CTC beam-search decoder constrained by the lexicon and scored with the
# language model. It keeps the 3 best hypotheses using a beam of width 3.
beam_search_decoder = ctc_decoder(
    lexicon=files.lexicon,
    tokens=files.tokens,
    lm=files.lm,
    nbest=3,
    beam_size=3,
)

import audio_support_functions as myFunc
def theaudio(x):
    """Transcribe the audio file at path ``x`` and return the recognised text.

    The audio is loaded, down-mixed to a single channel, resampled to the
    rate the pretrained bundle expects, run through the acoustic model and
    decoded with the module-level beam-search decoder.

    Args:
        x: Path to an audio file readable by ``torchaudio.load``. ``None`` or
            an empty string (no audio supplied) is accepted.

    Returns:
        The best hypothesis as a space-separated string of words, or an empty
        string when no audio was supplied or the decoder produced no
        hypothesis.
    """
    # The UI can submit the form without any audio; there is nothing to load.
    if not x:
        return ""

    waveform, sample_rate = torchaudio.load(x)

    # torchaudio.load returns one row per channel, while the model treats the
    # first dimension as a batch. Averaging the channels yields one mono
    # utterance instead of decoding channel 0 only and wasting compute on
    # the remaining channels.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    waveform = waveform.to(device)

    # Resample only when the file's rate differs from the model's rate.
    target_rate = ctc_preTrained_object.sample_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, target_rate
        )

    # Inference only: no autograd bookkeeping needed.
    with torch.inference_mode():
        emissions, _ = model(waveform)

    # The decoder is handed CPU tensors; take the hypotheses for the single
    # utterance in the batch.
    hypotheses = beam_search_decoder(emissions.to('cpu'))[0]
    if not hypotheses:
        return ""

    return " ".join(hypotheses[0].words).strip()
import gradio as gr
import librosa

# Web UI: one audio input, delivered to `theaudio` as a file path, and one
# text output showing the transcript.
iface = gr.Interface(
    theaudio,
    gr.Audio(type="filepath"),
    "text",
    title="Aud2Text Using CTC",
    description="Upload an audio file or record one and the AI will transcribe it for you!",
)

# Start serving the interface.
iface.launch()