# -*- coding: utf-8 -*-
"""Gradio demo that transcribes speech with a pretrained wav2vec 2.0 CTC model.

At import time the script:

* selects CUDA when available, otherwise CPU, and prints the chosen device;
* loads the ``WAV2VEC2_ASR_BASE_960H`` torchaudio pipeline model onto it;
* downloads the ``librispeech-4-gram`` decoder files and builds a
  lexicon-constrained CTC beam-search decoder from them;
* builds and launches a Gradio interface whose handler is :func:`theaudio`.
"""
import torch
import torchaudio

import IPython
import matplotlib.pyplot as plt
from torchaudio.utils import download_asset
from torchaudio.models.decoder import download_pretrained_files
from torchaudio.models.decoder import ctc_decoder

# Project-local helper module; not called by the code below.
import audio_support_functions as myFunc

import gradio as gr
import librosa

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Pretrained acoustic model (emits per-frame CTC token scores).
ctc_preTrained_object = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
model = ctc_preTrained_object.get_model().to(device)

# Lexicon, token list and n-gram language model used by the decoder.
files = download_pretrained_files("librispeech-4-gram")

beam_search_decoder = ctc_decoder(
    lexicon=files.lexicon,
    tokens=files.tokens,
    lm=files.lm,
    nbest=3,
    beam_size=3,
)


def theaudio(x):
    """Transcribe the audio file at path ``x`` and return the text.

    Args:
        x: Path of an audio file readable by ``torchaudio.load``. May be
            ``None`` or empty when the form is submitted without audio.

    Returns:
        The words of the top beam-search hypothesis joined by single
        spaces, or an empty string when there is no audio to decode or the
        decoder yields no hypothesis.
    """
    # Nothing was uploaded or recorded: return an empty transcript instead
    # of letting torchaudio.load fail on a missing path.
    if not x:
        return ""

    waveform, sample_rate = torchaudio.load(x)
    if waveform.numel() == 0:
        return ""
    waveform = waveform.to(device)

    # torchaudio.load yields (channels, frames) but the model reads the
    # first dimension as the batch. Down-mix so a multi-channel file is
    # decoded as one utterance rather than one utterance per channel.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # The model must be fed audio at the pipeline's own sample rate.
    target_rate = ctc_preTrained_object.sample_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, target_rate
        )

    with torch.inference_mode():
        pred_tokens, _ = model(waveform)
        # The beam-search decoder consumes CPU tensors.
        pred_tokens = pred_tokens.to("cpu")

    beam_search_result = beam_search_decoder(pred_tokens)
    if not beam_search_result or not beam_search_result[0]:
        return ""

    # First batch item, highest-ranked hypothesis.
    best_hypothesis = beam_search_result[0][0]
    beam_search_transcript = " ".join(best_hypothesis.words).strip()
    return beam_search_transcript


iface = gr.Interface(
    fn=theaudio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Aud2Text Using CTC",
    description="Upload an audio file or record one and the AI will transcribe it for you!",
)

iface.launch()