Lguyogiro committed
Commit 59f1adb · 1 Parent(s): 6b2f3c6

try adding mic inp

Files changed (1): app.py  +61 -1
app.py CHANGED
@@ -3,7 +3,9 @@ import datetime
 import logging
 import soundfile
 import streamlit as st
-
+from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
+import numpy as np
+import pydub
 from pathlib import Path
 
 from asr import load_model, inference
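
Note: all three added imports are third-party packages that do not ship with Streamlit. A fail-fast guard along these lines (illustrative, not part of the commit) would make the new requirements explicit:

# Illustrative only: surface the new mic-input dependencies with a clear message.
try:
    import numpy as np
    import pydub
    from streamlit_webrtc import AudioProcessorBase, WebRtcMode, webrtc_streamer
except ImportError as err:
    raise SystemExit(
        "Microphone input requires extra packages: "
        "pip install streamlit-webrtc numpy pydub"
    ) from err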
@@ -12,6 +14,26 @@ LOG_DIR = "./logs"
 DATA_DIR = "./data"
 logger = logging.getLogger(__name__)
 
+
+# Define a custom audio processor to handle microphone input
+class AudioProcessor(AudioProcessorBase):
+    def __init__(self):
+        self.audio_data = []
+
+    def recv_audio(self, frame):
+        # Convert the audio frame to a NumPy array
+        audio_array = np.frombuffer(frame.to_ndarray(), dtype=np.int16)
+        self.audio_data.append(audio_array)
+        return frame
+
+    def get_audio_data(self):
+        # Combine all captured audio data
+        if self.audio_data:
+            combined = np.concatenate(self.audio_data, axis=0)
+            return combined
+        return None
+
+
 def upload_audio() -> Path:
     # Upload audio file
     uploaded_file = st.file_uploader("Choose a audio file(wav, mp3, flac)", type=['wav','mp3','flac'])
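
Note on the hook name: streamlit_webrtc dispatches incoming audio to AudioProcessorBase.recv() (or recv_queued()), not to a method called recv_audio(), so the framework will never invoke the accumulator as committed. A minimal sketch wired to the documented hook; the name RecvAudioProcessor is hypothetical:

import av
from streamlit_webrtc import AudioProcessorBase

class RecvAudioProcessor(AudioProcessorBase):
    """Hypothetical rework of AudioProcessor above, using the documented recv() hook."""

    def __init__(self):
        self.audio_data = []

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # Each incoming frame arrives as an av.AudioFrame; to_ndarray()
        # already yields int16 samples for s16-format frames.
        self.audio_data.append(frame.to_ndarray().flatten())
        return frame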
@@ -68,6 +90,44 @@ def main():
         st.success(f"Inference finished in {process_time}.")
         st.write(f"output: {output['text']}")
 
+    st.title("Microphone Input for ASR")
+
+    # Initialize the audio processor
+    audio_processor = AudioProcessor()
+
+    webrtc_streamer(
+        key="audio",
+        mode=WebRtcMode.SENDONLY,
+        audio_processor_factory=lambda: audio_processor,
+        media_stream_constraints={"audio": True, "video": False},
+    )
+
+
+    if st.button("Process Audio"):
+        audio_data = audio_processor.get_audio_data()
+        if audio_data is not None:
+            # Convert the NumPy array to a WAV-like audio segment
+            audio_segment = pydub.AudioSegment(
+                audio_data.tobytes(),
+                frame_rate=16000,  # Default WebRTC audio frame rate
+                sample_width=2,    # 16-bit audio
+                channels=1         # Mono
+            )
+            # Save or process audio_segment as needed
+            st.success("Audio captured successfully!")
+            # st.audio(audio_segment.export(format="wav"), format="audio/wav")
+        else:
+            st.warning("No audio data captured!")
+
+
+    if st.button("Transcribe Audio"):
+        if audio_data is not None:
+            # Perform ASR on the audio segment
+            transcription = inference(generator, audio_segment.raw_data)
+            st.text_area("Transcription", transcription["text"])
+        else:
+            st.warning("No audio data to transcribe!")
+
 
 if __name__ == "__main__":
     # Setting logger
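
Three caveats in the button flow as committed: audio_data and audio_segment are locals of the "Process Audio" branch, so "Transcribe Audio" raises NameError; Streamlit reruns the script on every click, replacing the module-level AudioProcessor that the lambda captured; and frame_rate=16000 is optimistic, since browsers usually deliver WebRTC audio at 48 kHz. One common workaround, sketched here under the assumption of streamlit-webrtc's SENDONLY receiver API (session-state key names are illustrative), polls ctx.audio_receiver and persists samples in st.session_state:

import queue

import streamlit as st
from streamlit_webrtc import WebRtcMode, webrtc_streamer

ctx = webrtc_streamer(
    key="audio",
    mode=WebRtcMode.SENDONLY,
    audio_receiver_size=1024,
    media_stream_constraints={"audio": True, "video": False},
)

# st.session_state survives Streamlit's reruns; key names are illustrative.
st.session_state.setdefault("pcm_chunks", [])
st.session_state.setdefault("sample_rate", None)

if ctx.audio_receiver:
    try:
        frames = ctx.audio_receiver.get_frames(timeout=1)
    except queue.Empty:
        frames = []
    for frame in frames:
        st.session_state["sample_rate"] = frame.sample_rate  # typically 48000
        st.session_state["pcm_chunks"].append(frame.to_ndarray().tobytes())

With the raw bytes and the observed sample rate persisted, the pydub.AudioSegment construction and the inference(generator, ...) call can then run in either button branch without hitting the NameError.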