Spaces:

pratikshahp
/

audio-to-text

Sleeping

File size: 2,223 Bytes

dcf6735
f013d22
 
6e9009d
a976d7a
6488f67
16d11ec
f013d22
 
 
 
 
 
 
 
 
 
 
 
 
dcf6735
7a11ad5
 
 
 
 
d8c4385
0b3bb88
 
f013d22
 
16d11ec
6daa843
e63c494
6daa843
16d11ec
6daa843
dcf6735
 
8ef7225
dcf6735
bcc0290
dcf6735
bcc0290
275e93c
 
e63c494
bcc0290
 
 
e63c494
dcf6735
e63c494
dcf6735
 
e63c494
dcf6735
 
 
bcc0290
 
e63c494

import io

import torch
import torchaudio
from torchaudio.transforms import Resample
from transformers import Speech2Text2Processor, Speech2TextForConditionalGeneration, SpeechEncoderDecoderModel
import streamlit as st
from audio_recorder_streamlit import audio_recorder

def preprocess_audio(audio_bytes, sample_rate=16000):
    """Decode audio into a mono waveform sampled at ``sample_rate`` Hz.

    Args:
        audio_bytes: Encoded audio as raw ``bytes``/``bytearray``. Any other
            value (e.g. a path or an open file-like object) is handed to
            ``torchaudio.load`` unchanged.
        sample_rate: Target sampling rate in Hz.

    Returns:
        The decoded waveform tensor with multi-channel audio averaged down
        to a single channel and resampled to ``sample_rate`` when the
        source rate differs.
    """
    # torchaudio.load expects a path or file-like object, so raw bytes
    # must be wrapped in an in-memory buffer first.
    if isinstance(audio_bytes, (bytes, bytearray)):
        source = io.BytesIO(audio_bytes)
    else:
        source = audio_bytes

    waveform, source_rate = torchaudio.load(source, normalize=True)

    # Down-mix to mono by averaging across the channel dimension.
    if waveform.size(0) > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Compare sampling rates, not the number of samples: the waveform length
    # says nothing about the rate the audio was recorded at.
    if source_rate != sample_rate:
        resampler = Resample(orig_freq=source_rate, new_freq=sample_rate)
        waveform = resampler(waveform)

    return waveform

def transcribe_audio(audio_bytes):
    """Run the speech model on recorded audio and return the decoded text.

    Args:
        audio_bytes: Encoded audio as accepted by ``preprocess_audio``.

    Returns:
        The list of strings produced by ``processor.batch_decode`` (one
        entry per input sequence).

    NOTE(review): the checkpoint name ends in "en-de", so the output is
    presumably a German translation of English speech rather than an English
    transcript -- confirm this is the intended model.
    """
    model_name = "facebook/s2t-wav2vec2-large-en-de"
    target_rate = 16000

    # NOTE(review): both objects are re-created on every call; consider
    # caching them if repeated loading turns out to be slow.
    model = SpeechEncoderDecoderModel.from_pretrained(model_name)
    processor = Speech2Text2Processor.from_pretrained(model_name)

    # Decode, down-mix and resample the recording to the rate passed below.
    waveform = preprocess_audio(audio_bytes, sample_rate=target_rate)

    # Turn the raw waveform into model inputs. The sampling rate is passed
    # explicitly so the feature extractor can validate it.
    inputs = processor(
        waveform.squeeze(0).numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )

    # The Wav2Vec2-style feature extractor behind Speech2Text2Processor
    # yields "input_values" (not "input_features"); forward the attention
    # mask as well when the extractor provides one.
    generate_kwargs = {}
    if "attention_mask" in inputs:
        generate_kwargs["attention_mask"] = inputs["attention_mask"]
    generated_ids = model.generate(inputs["input_values"], **generate_kwargs)

    return processor.batch_decode(generated_ids, skip_special_tokens=True)

# Page heading.
st.title("Audio to Text Transcription with Recording")

# Render the audio_recorder widget; a falsy result is treated as
# "nothing recorded yet".
audio_bytes = audio_recorder()

if not audio_bytes:
    st.write("Please record an audio.")
else:
    # Play the recording back before running the model on it.
    st.audio(audio_bytes, format="audio/wav")

    transcription = transcribe_audio(audio_bytes)

    # An empty result is reported as a failure; otherwise show the first entry.
    if not transcription:
        st.write("Error: Failed to transcribe audio.")
    else:
        st.write("Transcription:")
        st.write(transcription[0])