Spaces:

pratikshahp
/

audio-to-text

Sleeping

App Files Files Community

audio-to-text / app.py

pratikshahp

Update app.py

6e9009d verified over 1 year ago

raw

history blame

2.22 kB

	import torch
	import torchaudio
	from torchaudio.transforms import Resample
	from transformers import Speech2Text2Processor, Speech2TextForConditionalGeneration, SpeechEncoderDecoderModel
	import streamlit as st
	from audio_recorder_streamlit import audio_recorder

	def preprocess_audio(audio_bytes, sample_rate=16000):
	    """Decode recorded audio into a mono waveform sampled at ``sample_rate``.

	    Args:
	        audio_bytes: Encoded audio (e.g. a WAV recording) as a bytes-like
	            object. A path or file-like object is also accepted and is
	            handed to ``torchaudio.load`` unchanged.
	        sample_rate: Target sampling rate in Hz.

	    Returns:
	        The decoded waveform tensor, down-mixed to a single channel and
	        resampled to ``sample_rate`` when the source rate differs.
	    """
	    import io  # local import: only needed to wrap in-memory recordings

	    # torchaudio.load expects a path or a file-like object, not raw bytes,
	    # so in-memory recordings are wrapped in a BytesIO buffer first.
	    source = audio_bytes
	    if isinstance(audio_bytes, (bytes, bytearray, memoryview)):
	        source = io.BytesIO(audio_bytes)

	    # Keep the source sampling rate: it is required to resample correctly.
	    waveform, original_rate = torchaudio.load(source, normalize=True)

	    # Down-mix multi-channel audio by averaging across the channel axis.
	    if waveform.size(0) > 1:
	        waveform = torch.mean(waveform, dim=0, keepdim=True)

	    # Compare sampling rates (not the number of samples) to decide whether
	    # resampling is needed, and resample from the true source rate.
	    if original_rate != sample_rate:
	        resampler = Resample(orig_freq=original_rate, new_freq=sample_rate)
	        waveform = resampler(waveform)

	    return waveform

	@st.cache_resource
	def _load_speech_model():
	    """Load the pretrained speech model and its processor.

	    Wrapped in ``st.cache_resource`` so the weights are loaded once and
	    reused, instead of being reloaded for every recording.
	    """
	    checkpoint = "facebook/s2t-wav2vec2-large-en-de"
	    model = SpeechEncoderDecoderModel.from_pretrained(checkpoint)
	    processor = Speech2Text2Processor.from_pretrained(checkpoint)
	    return model, processor


	def transcribe_audio(audio_bytes):
	    """Run the speech model on a recording and return the decoded text.

	    Args:
	        audio_bytes: Encoded audio as accepted by ``preprocess_audio``.

	    Returns:
	        A list of decoded strings, one per generated sequence.
	    """
	    # NOTE(review): the checkpoint name suggests English-to-German speech
	    # translation rather than same-language transcription -- confirm this
	    # is the intended output for an "audio to text" app.
	    sample_rate = 16000
	    model, processor = _load_speech_model()

	    # Mono waveform at the rate that is also reported to the processor.
	    waveform = preprocess_audio(audio_bytes, sample_rate=sample_rate)

	    # The feature extractor is documented for NumPy arrays / float lists,
	    # and needs the sampling rate stated explicitly.
	    inputs = processor(
	        waveform.squeeze(0).numpy(),
	        sampling_rate=sample_rate,
	        return_tensors="pt",
	        padding=True,
	    )

	    # This processor yields "input_values" (raw-waveform features); there
	    # is no "input_features" entry to read from its output.
	    generate_kwargs = {}
	    if "attention_mask" in inputs:
	        generate_kwargs["attention_mask"] = inputs["attention_mask"]
	    generated_ids = model.generate(inputs["input_values"], **generate_kwargs)

	    return processor.batch_decode(generated_ids, skip_special_tokens=True)

	st.title("Audio to Text Transcription with Recording")

	# Render the recorder widget; its result is falsy until a clip is captured.
	audio_bytes = audio_recorder()

	if not audio_bytes:
	    # Nothing recorded yet: prompt the user.
	    st.write("Please record an audio.")
	else:
	    # Play the clip back, then show the model output or an error line.
	    st.audio(audio_bytes, format="audio/wav")

	    transcription = transcribe_audio(audio_bytes)

	    if not transcription:
	        st.write("Error: Failed to transcribe audio.")
	    else:
	        st.write("Transcription:")
	        st.write(transcription[0])