import threading

import streamlit as st
from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
import numpy as np
import pydub
from asr import load_model, inference
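
# NOTE: `asr` is a local helper module (not included in this file). Based on
# how it is called below, it is assumed to expose:
#   load_model()                -> a loaded speech-recognition model
#   inference(model, pcm_bytes) -> a dict with a "text" key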


# Custom audio processor that buffers incoming microphone frames
class AudioProcessor(AudioProcessorBase):
    def __init__(self):
        self._lock = threading.Lock()
        self.audio_data = []
        self.sample_rate = None
        self.channels = 1

    def recv(self, frame):
        # recv() runs on a worker thread, so guard the shared buffer
        with self._lock:
            # Record the stream's actual format (WebRTC commonly delivers
            # 48 kHz audio, not 16 kHz)
            self.sample_rate = frame.sample_rate
            self.channels = len(frame.layout.channels)
            # to_ndarray() yields int16 samples; flatten to 1-D (interleaved)
            self.audio_data.append(frame.to_ndarray().reshape(-1))
        return frame

    def get_audio_data(self):
        # Concatenate all captured frames into a single array
        with self._lock:
            if self.audio_data:
                return np.concatenate(self.audio_data, axis=0)
            return None

# Title of the app
st.title("Real-Time Speech-to-Text")

# WebRTC streamer to capture microphone input. The processor instance is
# created by streamlit_webrtc and exposed on the returned context, which
# survives Streamlit reruns (a module-level instance would not).
webrtc_ctx = webrtc_streamer(
    key="audio",
    mode=WebRtcMode.SENDONLY,
    audio_processor_factory=AudioProcessor,
    media_stream_constraints={"audio": True, "video": False},
)

# Load the ASR model once and cache it across Streamlit reruns
@st.cache_resource
def load_asr_model():
    return load_model()

asr_model = load_asr_model()
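
# For reference, a minimal `asr.py` backing the load_model/inference calls
# above could look like this (hypothetical sketch using a Hugging Face ASR
# pipeline; the model name is an assumption, not part of this app):
#
#     import numpy as np
#     from transformers import pipeline
#
#     def load_model():
#         return pipeline("automatic-speech-recognition",
#                         model="openai/whisper-tiny")
#
#     def inference(model, pcm_bytes):
#         # 16-bit PCM -> float32 in [-1, 1], as the pipeline expects
#         audio = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32) / 32768.0
#         return model({"raw": audio, "sampling_rate": 16000})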

# Button to process the captured audio and run ASR
if st.button("Transcribe Audio"):
    processor = webrtc_ctx.audio_processor
    audio_data = processor.get_audio_data() if processor else None
    if audio_data is not None:
        # Wrap the raw samples in an audio segment using the captured format
        audio_segment = pydub.AudioSegment(
            audio_data.tobytes(),
            frame_rate=processor.sample_rate or 48000,  # WebRTC commonly streams at 48 kHz
            sample_width=2,  # 16-bit samples
            channels=processor.channels,
        )
        # Downmix to mono and resample to 16 kHz, the rate most ASR models expect
        audio_segment = audio_segment.set_channels(1).set_frame_rate(16000)

        # Perform ASR on the raw PCM bytes
        st.info("Transcribing audio...")
        transcription = inference(asr_model, audio_segment.raw_data)

        # Display the transcription
        st.text_area("Transcription", transcription["text"], height=200)
    else:
        st.warning("No audio data captured! Please speak into your microphone.")
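
# Usage sketch (assuming this file is saved as app.py):
#   streamlit run app.py
# Then allow microphone access in the browser, click "START" on the WebRTC
# widget, speak, and press "Transcribe Audio".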