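"""Streamlit app: capture microphone audio with streamlit-webrtc and
transcribe it using the local `asr` module.

Run with (assuming this file is saved as app.py):

    streamlit run app.py
"""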
import numpy as np
import pydub
import streamlit as st
from streamlit_webrtc import AudioProcessorBase, WebRtcMode, webrtc_streamer

from asr import load_model, inference
# Custom audio processor that buffers raw microphone frames
class AudioProcessor(AudioProcessorBase):
    def __init__(self):
        self.audio_data = []
        self.sample_rate = 48000  # overwritten by the first received frame
        self.channels = 1

    def recv(self, frame):
        # AudioProcessorBase invokes recv() for each incoming av.AudioFrame
        self.sample_rate = frame.sample_rate
        self.channels = len(frame.layout.channels)
        # to_ndarray() already yields int16 samples for s16-format frames
        self.audio_data.append(frame.to_ndarray().flatten())
        return frame

    def get_audio_data(self):
        # Combine all captured frames into a single 1-D sample array
        if self.audio_data:
            return np.concatenate(self.audio_data, axis=0)
        return None
# Title of the app
st.title("Real-Time Speech-to-Text")

# WebRTC streamer to capture microphone input. Passing the class as the
# factory keeps the processor instance inside the WebRTC worker, so it
# survives Streamlit script reruns (a module-level instance would not).
webrtc_ctx = webrtc_streamer(
    key="audio",
    mode=WebRtcMode.SENDONLY,
    audio_processor_factory=AudioProcessor,
    media_stream_constraints={"audio": True, "video": False},
)
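# Note: browsers only expose getUserMedia() on secure origins, so the
# microphone permission prompt appears over HTTPS or on localhost.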
# Load the ASR model once and cache it across Streamlit reruns
@st.cache_resource
def load_asr_model():
    return load_model()

asr_model = load_asr_model()
# Button to process the captured audio and perform ASR
if st.button("Transcribe Audio"):
    processor = webrtc_ctx.audio_processor
    audio_data = processor.get_audio_data() if processor else None
    if audio_data is not None:
        # Wrap the raw samples in a pydub segment, using the rate and
        # channel count observed on the incoming frames (WebRTC streams
        # typically arrive at 48 kHz, not 16 kHz)
        audio_segment = pydub.AudioSegment(
            audio_data.tobytes(),
            frame_rate=processor.sample_rate,
            sample_width=2,  # 16-bit audio
            channels=processor.channels,
        )
        # Perform ASR on the raw audio bytes
        st.info("Transcribing audio...")
        transcription = inference(asr_model, audio_segment.raw_data)
        # Display the transcription
        st.text_area("Transcription", transcription["text"], height=200)
    else:
        st.warning("No audio data captured! Please speak into your microphone.")
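# If the model behind asr.load_model() expects 16 kHz mono input (an
# assumption; the asr module's contract is not shown here), the segment
# can be resampled with pydub before inference:
#
#     audio_segment = audio_segment.set_frame_rate(16000).set_channels(1)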