Spaces:
Sleeping
Sleeping
File size: 2,223 Bytes
dcf6735 f013d22 6e9009d a976d7a 6488f67 16d11ec f013d22 dcf6735 7a11ad5 d8c4385 0b3bb88 f013d22 16d11ec 6daa843 e63c494 6daa843 16d11ec 6daa843 dcf6735 8ef7225 dcf6735 bcc0290 dcf6735 bcc0290 275e93c e63c494 bcc0290 e63c494 dcf6735 e63c494 dcf6735 e63c494 dcf6735 bcc0290 e63c494 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import torch
import torchaudio
from torchaudio.transforms import Resample
from transformers import Speech2Text2Processor, Speech2TextForConditionalGeneration, SpeechEncoderDecoderModel
import streamlit as st
from audio_recorder_streamlit import audio_recorder
def preprocess_audio(audio_bytes, sample_rate=16000):
    """Decode recorded audio into a mono waveform sampled at ``sample_rate`` Hz.

    Args:
        audio_bytes: Encoded audio. Raw ``bytes``/``bytearray`` are wrapped in
            an in-memory file object before decoding; any other value (a path
            or file-like object) is handed to ``torchaudio.load`` unchanged.
        sample_rate: Target sampling rate in Hz.

    Returns:
        The decoded waveform tensor with a single channel dimension, resampled
        to ``sample_rate`` when the source rate differs.
    """
    import io  # local import: only needed to wrap in-memory recordings

    # torchaudio.load expects a path or file-like object, not raw bytes.
    if isinstance(audio_bytes, (bytes, bytearray)):
        source = io.BytesIO(audio_bytes)
    else:
        source = audio_bytes

    # Keep the source sampling rate: it is what decides whether to resample.
    waveform, source_rate = torchaudio.load(source, normalize=True)

    # Down-mix multi-channel audio to mono by averaging the channels.
    if waveform.size(0) > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Compare sampling rates (not the number of frames) before resampling.
    if source_rate != sample_rate:
        resampler = Resample(orig_freq=source_rate, new_freq=sample_rate)
        waveform = resampler(waveform)

    return waveform
def transcribe_audio(audio_bytes):
    """Run the ``facebook/s2t-wav2vec2-large-en-de`` model on recorded audio.

    Args:
        audio_bytes: Encoded audio as accepted by ``preprocess_audio``.

    Returns:
        A list of decoded strings, one per generated sequence.
        NOTE(review): the checkpoint name suggests English speech -> German
        text translation rather than same-language transcription -- confirm
        this is the intended model.
    """
    checkpoint = "facebook/s2t-wav2vec2-large-en-de"
    target_rate = 16000

    # NOTE(review): the model and processor are re-loaded on every call;
    # consider caching them if latency matters.
    model = SpeechEncoderDecoderModel.from_pretrained(checkpoint)
    processor = Speech2Text2Processor.from_pretrained(checkpoint)

    # Decode, down-mix and resample the recording to the rate passed below.
    waveform = preprocess_audio(audio_bytes, sample_rate=target_rate)

    # The wav2vec2 feature extractor consumes a 1-D raw waveform and needs the
    # sampling rate stated explicitly so it can validate the input.
    inputs = processor(
        waveform.squeeze(0).numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )

    # This processor yields ``input_values`` (raw audio), not ``input_features``.
    generated_ids = model.generate(
        inputs=inputs["input_values"],
        attention_mask=inputs.get("attention_mask"),
    )
    return processor.batch_decode(generated_ids, skip_special_tokens=True)
st.title("Audio to Text Transcription with Recording")

# Render the recorder widget and keep whatever it returns for this run.
audio_bytes = audio_recorder()

if not audio_bytes:
    # Nothing recorded yet: prompt the user.
    st.write("Please record an audio.")
else:
    # Play the recording back, then show the model output beneath it.
    st.audio(audio_bytes, format="audio/wav")
    transcription = transcribe_audio(audio_bytes)
    if not transcription:
        st.write("Error: Failed to transcribe audio.")
    else:
        st.write("Transcription:")
        st.write(transcription[0])
|