Spaces:

pratikshahp
/

audio-to-text

Sleeping

File size: 1,863 Bytes

dcf6735
f013d22
 
dcf6735
a976d7a
bcc0290
16d11ec
f013d22
 
 
 
 
 
 
 
 
 
 
 
 
dcf6735
 
 
 
f013d22
 
16d11ec
6daa843
e63c494
6daa843
16d11ec
6daa843
dcf6735
 
8ef7225
dcf6735
bcc0290
dcf6735
bcc0290
 
e63c494
bcc0290
 
 
e63c494
dcf6735
e63c494
dcf6735
 
e63c494
dcf6735
 
 
bcc0290
 
e63c494

import torch
import torchaudio
from torchaudio.transforms import Resample
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
import streamlit as st
from streamlit_audio_recorder import st_audio_recorder

def preprocess_audio(audio_bytes, sample_rate=16000):
    """Decode audio into a mono waveform sampled at ``sample_rate``.

    Args:
        audio_bytes: Encoded audio. Raw ``bytes``/``bytearray``/``memoryview``
            are wrapped in an in-memory file; anything else (a path or a
            file-like object) is handed to ``torchaudio.load`` unchanged.
        sample_rate: Target sampling rate in Hz.

    Returns:
        A ``(1, num_samples)`` tensor: channels are averaged down to one and
        the signal is resampled only when the source rate differs from
        ``sample_rate``.
    """
    import io

    # torchaudio.load expects a path or a file-like object, not a raw buffer.
    if isinstance(audio_bytes, (bytes, bytearray, memoryview)):
        source = io.BytesIO(audio_bytes)
    else:
        source = audio_bytes

    # Keep the sample rate reported by the decoder: it is the only reliable
    # source frequency for resampling.
    waveform, orig_rate = torchaudio.load(source, normalize=True)

    # Down-mix multi-channel audio to mono by averaging the channels.
    if waveform.size(0) > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Compare sampling rates, not the number of samples (shape[1]), which
    # has nothing to do with the frequency of the recording.
    if orig_rate != sample_rate:
        resampler = Resample(orig_freq=orig_rate, new_freq=sample_rate)
        waveform = resampler(waveform)

    return waveform

def transcribe_audio(audio_bytes):
    """Run recorded audio through the Speech2Text checkpoint and decode it.

    The model and processor are loaded with ``from_pretrained`` on every
    call.

    NOTE(review): the checkpoint name ("...mustc-en-fr-st") suggests an
    English-to-French speech *translation* model rather than a same-language
    transcriber -- confirm this is what the app intends.

    Args:
        audio_bytes: Recorded audio, forwarded to ``preprocess_audio``.

    Returns:
        The list of strings produced by ``processor.batch_decode`` for the
        generated token ids (special tokens stripped).
    """
    checkpoint = "facebook/s2t-small-mustc-en-fr-st"
    model = Speech2TextForConditionalGeneration.from_pretrained(checkpoint)
    processor = Speech2TextProcessor.from_pretrained(checkpoint)

    # Resample to the rate the feature extractor was configured for, and tell
    # the processor which rate it is receiving so a mismatch raises instead
    # of silently producing bad features.
    target_rate = processor.feature_extractor.sampling_rate
    waveform = preprocess_audio(audio_bytes, sample_rate=target_rate)

    # The feature extractor is documented for numpy arrays / lists of floats,
    # so hand it a 1-D numpy signal rather than a torch tensor.
    inputs = processor(
        waveform.squeeze(0).numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )

    # Forward the attention mask when the processor produced one so padded
    # frames (if any) are ignored during generation.
    generated_ids = model.generate(
        inputs["input_features"],
        attention_mask=inputs.get("attention_mask"),
    )
    translation = processor.batch_decode(generated_ids, skip_special_tokens=True)

    return translation

st.title("Audio to Text Transcription with Recording")

# Render the recorder widget; its return value is treated as falsy until a
# recording exists (presumably WAV-encoded bytes afterwards -- verify against
# the streamlit_audio_recorder package).
audio_bytes = st_audio_recorder(
    sample_rate=16000,
    codec="wav",
    show_playback_controls=True,
)

if not audio_bytes:
    # Nothing has been captured yet: prompt the user.
    st.write("Please record an audio.")
else:
    # Offer playback of the recording, then run it through the speech model.
    st.audio(audio_bytes, format="audio/wav")
    transcription = transcribe_audio(audio_bytes)

    if not transcription:
        # An empty result list is reported as a failure.
        st.write("Error: Failed to transcribe audio.")
    else:
        st.write("Transcription:")
        st.write(transcription[0])