Spaces:
Sleeping
Sleeping
File size: 1,863 Bytes
dcf6735 f013d22 dcf6735 a976d7a bcc0290 16d11ec f013d22 dcf6735 f013d22 16d11ec 6daa843 e63c494 6daa843 16d11ec 6daa843 dcf6735 8ef7225 dcf6735 bcc0290 dcf6735 bcc0290 e63c494 bcc0290 e63c494 dcf6735 e63c494 dcf6735 e63c494 dcf6735 bcc0290 e63c494 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
import torch
import torchaudio
from torchaudio.transforms import Resample
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
import streamlit as st
from streamlit_audio_recorder import st_audio_recorder
def preprocess_audio(audio_bytes, sample_rate=16000):
    """Decode audio into a mono waveform sampled at ``sample_rate`` Hz.

    Args:
        audio_bytes: Encoded audio (e.g. a WAV file) as ``bytes``/``bytearray``.
            Any other value (path or file-like object) is handed to
            ``torchaudio.load`` unchanged.
        sample_rate: Target sampling rate in Hz.

    Returns:
        The tensor produced by ``torchaudio.load``, averaged over dim 0 when
        it holds more than one channel and resampled when the file's own
        rate differs from ``sample_rate``.
    """
    import io  # local import so the block does not depend on file-level changes

    # torchaudio.load reads from a path or a file-like object, so in-memory
    # bytes have to be wrapped in a buffer first.
    if isinstance(audio_bytes, (bytes, bytearray)):
        source = io.BytesIO(audio_bytes)
    else:
        source = audio_bytes

    # Keep the sample rate reported by the decoder; it is the only reliable
    # source for the original frequency.
    waveform, orig_sample_rate = torchaudio.load(source, normalize=True)

    # Down-mix multi-channel audio to mono (dim 0 is treated as channels).
    if waveform.size(0) > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Compare actual sampling rates. The number of frames (waveform.shape[1])
    # says nothing about the rate and must not be used as ``orig_freq``.
    if orig_sample_rate != sample_rate:
        resampler = Resample(orig_freq=orig_sample_rate, new_freq=sample_rate)
        waveform = resampler(waveform)
    return waveform
def transcribe_audio(audio_bytes):
    """Run the Speech2Text model on recorded audio and return decoded text.

    Args:
        audio_bytes: Encoded audio accepted by ``preprocess_audio``.

    Returns:
        The list of strings produced by ``processor.batch_decode`` (one
        entry per generated sequence).
    """
    model_name = "facebook/s2t-small-mustc-en-fr-st"
    target_sample_rate = 16000

    # NOTE(review): model and processor are re-loaded on every call; consider
    # caching them if this runs once per user interaction.
    # NOTE(review): the checkpoint name suggests English->French speech
    # translation rather than plain transcription -- confirm this is intended.
    model = Speech2TextForConditionalGeneration.from_pretrained(model_name)
    processor = Speech2TextProcessor.from_pretrained(model_name)

    # Decode, down-mix and resample the recording.
    waveform = preprocess_audio(audio_bytes, sample_rate=target_sample_rate)

    # Extract model features. Passing ``sampling_rate`` lets the feature
    # extractor verify that the audio matches the rate it was trained on.
    inputs = processor(
        waveform.squeeze(0).numpy(),
        sampling_rate=target_sample_rate,
        return_tensors="pt",
        padding=True,
    )

    # Forward the attention mask (when the processor returns one) so padded
    # frames are ignored during generation.
    generated_ids = model.generate(
        inputs["input_features"],
        attention_mask=inputs.get("attention_mask"),
    )
    return processor.batch_decode(generated_ids, skip_special_tokens=True)
st.title("Audio to Text Transcription with Recording")

# Use the st_audio_recorder widget to record audio
audio_bytes = st_audio_recorder(sample_rate=16000, codec="wav", show_playback_controls=True)

if audio_bytes:
    # Play the recording back to the user.
    st.audio(audio_bytes, format="audio/wav")

    # Decoding or inference failures raise; catch them at this UI boundary
    # so the page shows the error instead of aborting the script run.
    try:
        transcription = transcribe_audio(audio_bytes)
    except Exception as exc:
        transcription = None
        st.exception(exc)

    if transcription:
        st.write("Transcription:")
        st.write(transcription[0])
    else:
        st.write("Error: Failed to transcribe audio.")
else:
    st.write("Please record an audio.")
|