import io

import torch
import torchaudio
from torchaudio.transforms import Resample
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
import streamlit as st
from streamlit_audio_recorder import st_audio_recorder

def preprocess_audio(audio_bytes, sample_rate=16000):
    # Load the recorded bytes and convert multi-channel audio to mono
    waveform, orig_sample_rate = torchaudio.load(io.BytesIO(audio_bytes), normalize=True)
    if waveform.size(0) > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    # Resample if the recording was captured at a different rate than the target
    if orig_sample_rate != sample_rate:
        resampler = Resample(orig_freq=orig_sample_rate, new_freq=sample_rate)
        waveform = resampler(waveform)
    return waveform

def transcribe_audio(audio_bytes):
    # Note: this checkpoint performs speech translation (English audio -> French text),
    # so the returned string is a French rendering of the recording rather than a verbatim transcript.
    model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-mustc-en-fr-st")
    processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-mustc-en-fr-st")
    # Preprocess the recording into a 16 kHz mono waveform
    waveform = preprocess_audio(audio_bytes)
    # Extract model input features from the raw waveform
    inputs = processor(waveform.squeeze(0).numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    # Generate output token ids and decode them to text
    generated_ids = model.generate(inputs.input_features)
    translation = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return translation
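
# Optional: a minimal caching sketch. Reloading the checkpoint on every call is slow;
# assuming a Streamlit version that provides st.cache_resource, the model and processor
# can be loaded once and reused across reruns. The helper name below is only illustrative.
@st.cache_resource
def load_model_and_processor():
    # Download and instantiate the checkpoint only once per server process
    model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-mustc-en-fr-st")
    processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-mustc-en-fr-st")
    return model, processor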

st.title("Audio to Text Transcription with Recording")

# Use the st_audio_recorder widget to record audio in the browser
audio_bytes = st_audio_recorder(sample_rate=16000, codec="wav", show_playback_controls=True)

# Play back the recording and show the transcribed text
if audio_bytes:
    st.audio(audio_bytes, format="audio/wav")
    transcription = transcribe_audio(audio_bytes)
    if transcription:
        st.write("Transcription:")
        st.write(transcription[0])
    else:
        st.write("Error: Failed to transcribe audio.")
else:
    st.write("Please record an audio clip.")