File size: 1,661 Bytes
f8b4e77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4775790
 
f8b4e77
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# -*- coding: utf-8 -*-

import torch
import torchaudio

# Run inference on GPU when available; the model and input waveforms are
# both moved to this device before the forward pass.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

import IPython
import matplotlib.pyplot as plt
from torchaudio.utils import download_asset

# Wav2Vec2 base model fine-tuned for ASR on LibriSpeech 960h; exposes
# .get_model() (CTC emission network) and .sample_rate (expected input rate).
ctc_preTrained_object = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H

model = ctc_preTrained_object.get_model().to(device)

from torchaudio.models.decoder import download_pretrained_files

# Downloads the lexicon, token list, and KenLM 4-gram language model
# trained on LibriSpeech; returns a struct with .lexicon/.tokens/.lm paths.
files = download_pretrained_files('librispeech-4-gram')

from torchaudio.models.decoder import ctc_decoder

# NOTE(fix): the original code opened files.tokens here (`f = open(...)`)
# without ever reading or closing it — a leaked file handle. The decoder
# takes the token-file *path* directly, so the open() was dead code; removed.

# Lexicon-constrained beam-search CTC decoder with an external n-gram LM.
# nbest=3 keeps the three best hypotheses; beam_size=3 bounds the search.
beam_search_decoder = ctc_decoder(
    lexicon = files.lexicon,
    tokens = files.tokens,
    lm = files.lm,
    nbest = 3,
    beam_size = 3
)

import audio_support_functions as myFunc

def theaudio(x):
    """Transcribe the audio file at path *x* to text.

    Loads the file, resamples it to the model's expected rate if needed,
    runs the Wav2Vec2 acoustic model, and decodes the emissions with the
    LM-backed beam-search decoder. Returns the top transcript as a string.
    """
    audio, rate = torchaudio.load(x)
    audio = audio.to(device)

    #myFunc.play_audio(audio.cpu(), rate)

    # The pretrained pipeline expects a fixed sample rate; resample only
    # when the input differs.
    target_rate = ctc_preTrained_object.sample_rate
    if rate != target_rate:
        audio = torchaudio.functional.resample(audio, rate, target_rate)

    # Forward pass without autograd bookkeeping; the model returns
    # per-frame emissions (and lengths, which we discard).
    with torch.inference_mode():
        emissions, _ = model(audio)
    #print(emissions.size())

    # The CTC decoder runs on CPU tensors.
    result = beam_search_decoder(emissions.to('cpu'))

    # result[0][0] is the best hypothesis for the first (only) utterance.
    return " ".join(result[0][0].words).strip()

import gradio as gr
import librosa

# Web UI: a single audio input (upload or microphone recording, passed to
# the transcriber as a file path) mapped to a text output.
_ui_config = dict(
    fn=theaudio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Aud2Text Using CTC",
    description="Upload an audio file or record one and the AI will transcribe it for you!",
)

iface = gr.Interface(**_ui_config)

iface.launch()