Spaces:
Sleeping
Sleeping
File size: 2,106 Bytes
1466b77 59f1adb faee479 1466b77 59f1adb faee479 1466b77 faee479 1466b77 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import streamlit as st
from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
import numpy as np
import pydub
from transformers import pipeline
from asr import load_model, inference
# Define a custom audio processor to handle microphone input
class AudioProcessor(AudioProcessorBase):
    """Accumulate microphone audio delivered by a streamlit-webrtc stream.

    Every received frame is converted to 16 kHz, mono, 16-bit PCM and stored
    as a 1-D ``int16`` NumPy array in ``audio_data``. That is the format the
    transcription step of this script assumes when it rebuilds the audio.

    Fixes over the previous version:

    * The capture hook was called ``recv_audio``, which is not one of the
      callbacks streamlit-webrtc invokes on an audio processor (``recv`` /
      ``recv_queued``), so nothing was ever recorded. Both real hooks are
      implemented now; ``recv_audio`` is kept as an alias for old callers.
    * Frames were reinterpreted as int16 without looking at their actual
      sample rate or channel layout. They are now normalised with pydub
      using the metadata carried by each frame.
    """

    # Output format shared by every chunk stored in ``audio_data``.
    TARGET_SAMPLE_RATE = 16000  # Hz
    TARGET_CHANNELS = 1         # mono
    TARGET_SAMPLE_WIDTH = 2     # bytes per sample (16-bit)

    def __init__(self):
        super().__init__()
        # List of 1-D int16 arrays, one per captured frame.
        self.audio_data = []

    def _capture(self, frame):
        """Normalise one audio frame and append its samples to the buffer."""
        # NOTE(review): assumes a packed integer sample format (e.g. "s16"),
        # which is what the WebRTC stack is expected to deliver — confirm if
        # planar or float frames can reach this processor.
        segment = pydub.AudioSegment(
            data=frame.to_ndarray().tobytes(),
            sample_width=frame.format.bytes,
            frame_rate=frame.sample_rate,
            channels=len(frame.layout.channels),
        )
        segment = (
            segment.set_channels(self.TARGET_CHANNELS)
            .set_frame_rate(self.TARGET_SAMPLE_RATE)
            .set_sample_width(self.TARGET_SAMPLE_WIDTH)
        )
        samples = np.array(segment.get_array_of_samples(), dtype=np.int16)
        self.audio_data.append(samples)

    def recv(self, frame):
        """Record a single frame and return it unchanged."""
        self._capture(frame)
        return frame

    async def recv_queued(self, frames):
        """Record every queued frame and return them unchanged.

        Overridden so that all frames of a batch are recorded rather than
        only one of them, which would leave gaps in the recording.
        """
        for frame in frames:
            self._capture(frame)
        return frames

    def recv_audio(self, frame):
        """Backward-compatible alias for :meth:`recv`."""
        return self.recv(frame)

    def get_audio_data(self):
        """Return all captured samples as one int16 array, or ``None``.

        ``None`` is returned when no frame has been captured yet.
        """
        # Snapshot first: frames may still be appended from the streaming
        # side while the script reads the buffer.
        chunks = list(self.audio_data)
        if not chunks:
            return None
        return np.concatenate(chunks, axis=0)
# Title of the app
st.title("Real-Time Speech-to-Text")

# Initialize the audio processor.
# Streamlit re-executes this whole script on every interaction, including the
# click on the transcribe button. Creating a new AudioProcessor on each run
# would hand the button handler a fresh, empty instance while the streamer
# keeps feeding the old one, so the processor is kept in the session state
# and reused across reruns.
if "audio_processor" not in st.session_state:
    st.session_state["audio_processor"] = AudioProcessor()
audio_processor = st.session_state["audio_processor"]

# WebRTC streamer to capture microphone input.
# The factory closes over the already-resolved object instead of touching
# st.session_state itself, because it may be invoked outside the script run.
# NOTE(review): depending on the streamlit-webrtc version, processors may only
# be run in SENDRECV mode (SENDONLY exposing an audio receiver instead) —
# confirm that frames actually reach the processor with this mode.
webrtc_streamer(
    key="audio",
    mode=WebRtcMode.SENDONLY,
    audio_processor_factory=lambda: audio_processor,
    media_stream_constraints={"audio": True, "video": False},
)
# Load the ASR model once and reuse it
@st.cache_resource
def load_asr_model():
    """Return the ASR model produced by ``asr.load_model``.

    Wrapped in ``st.cache_resource`` so the model is built on the first call
    and the same object is handed back on later script reruns instead of
    being reloaded each time.
    """
    model = load_model()
    return model
asr_model = load_asr_model()

# Button to process the captured audio and run ASR on it
if st.button("Transcribe Audio"):
    captured = audio_processor.get_audio_data()
    if captured is None:
        # Nothing has been recorded so far.
        st.warning("No audio data captured! Please speak into your microphone.")
    else:
        # Wrap the raw samples in a pydub segment described as 16 kHz,
        # 16-bit, mono PCM.
        # NOTE(review): these values are fixed here, not read from the
        # stream — they must match what the audio processor stores.
        pcm_bytes = captured.tobytes()
        segment = pydub.AudioSegment(
            data=pcm_bytes,
            sample_width=2,
            frame_rate=16000,
            channels=1,
        )

        # Run the ASR model on the segment's raw PCM bytes
        st.info("Transcribing audio...")
        result = inference(asr_model, segment.raw_data)

        # Show the recognised text
        st.text_area("Transcription", result["text"], height=200)
|