Lguyogiro committed on
Commit
c734958
·
1 Parent(s): 8d2723f
Files changed (1) hide show
  1. app.py +86 -60
app.py CHANGED
@@ -1,73 +1,99 @@
 
 
 
1
  import streamlit as st
2
- from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
3
- import numpy as np
4
- import pydub
5
- from transformers import pipeline
6
  from asr import load_model, inference
 
7
 
8
 
9
- # Define a custom audio processor to handle microphone input
10
- class AudioProcessor(AudioProcessorBase):
11
- def __init__(self):
12
- self.audio_data = []
13
-
14
- def recv_audio(self, frame):
15
- # Convert the audio frame to a NumPy array
16
- audio_array = np.frombuffer(frame.to_ndarray(), dtype=np.int16)
17
- self.audio_data.append(audio_array)
18
- return frame
19
-
20
- def get_audio_data(self):
21
- # Combine all captured audio data
22
- if self.audio_data:
23
- combined = np.concatenate(self.audio_data, axis=0)
24
- return combined
25
- return None
26
-
27
- # Title of the app
28
- st.title("Real-Time Speech-to-Text")
29
-
30
- # Initialize the audio processor
31
- audio_processor = AudioProcessor()
32
-
33
- # WebRTC streamer to capture microphone input
34
- webrtc_streamer(
35
- key="audio",
36
- mode=WebRtcMode.SENDONLY,
37
- audio_processor_factory=lambda: audio_processor,
38
- media_stream_constraints={"audio": True, "video": False},
39
- rtc_configuration={
40
- "iceServers": [
41
- {"urls": "stun:stun.l.google.com:19302"} # Public STUN server
42
- ]
43
- },
44
- )
45
-
46
- # Load a pre-trained ASR pipeline from Hugging Face
47
  @st.cache_resource
48
  def load_asr_model():
49
  return load_model()
50
 
51
  asr_model = load_asr_model()
52
 
53
- # Button to process audio and perform ASR
54
- if st.button("Transcribe Audio"):
55
- audio_data = audio_processor.get_audio_data()
56
- if audio_data is not None:
57
- # Convert the NumPy array to a WAV-like audio segment
58
- audio_segment = pydub.AudioSegment(
59
- audio_data.tobytes(),
60
- frame_rate=16000, # Default WebRTC audio frame rate
61
- sample_width=2, # 16-bit audio
62
- channels=1 # Mono
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  )
64
 
65
- # Perform ASR on the audio segment
66
- st.info("Transcribing audio...")
67
- transcription = inference(asr_model, audio_segment.raw_data)
68
-
69
- # Display transcription
70
- st.text_area("Transcription", transcription["text"], height=200)
71
- else:
72
- st.warning("No audio data captured! Please speak into your microphone.")
 
 
 
 
 
 
 
 
 
 
 
73
 
 
 
 
1
+ import os
2
+ import sys
3
+ import datetime
4
  import streamlit as st
 
 
 
 
5
  from asr import load_model, inference
6
+ from audio_recorder_streamlit import audio_recorder
7
 
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
@st.cache_resource
def load_asr_model():
    """Load the ASR model exactly once and reuse it across Streamlit reruns.

    :return: The model object produced by ``asr.load_model``.
    """
    return load_model()


# Module-level handle to the cached model, shared by the transcription helpers.
asr_model = load_asr_model()
14
 
15
+
16
def transcribe(audio_file):
    """Transcribe an open audio file with the locally loaded ASR model.

    Bug fix: the previous body called ``openai.Audio.transcribe`` but the
    ``openai`` module is never imported anywhere in this file, so any call
    raised ``NameError``. The function now delegates to the same
    ``inference`` helper used by ``transcribe_audio``, keeping the whole app
    on the local model.

    :param audio_file: A file-like object (opened in binary mode).
    :return: The transcription result produced by ``inference``.
    """
    transcript = inference(asr_model, audio_file)
    return transcript
19
+
20
+
21
def save_audio_file(audio_bytes, file_extension):
    """
    Persist raw audio bytes to a timestamped file in the working directory.

    :param audio_bytes: Audio data in bytes
    :param file_extension: The extension of the output audio file
    :return: The name of the saved audio file
    """
    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    out_name = "audio_{}.{}".format(stamp, file_extension)

    with open(out_name, "wb") as out_file:
        out_file.write(audio_bytes)

    return out_name
36
+
37
+
38
def transcribe_audio(file_path):
    """
    Transcribe the audio file at the specified path.

    :param file_path: The path of the audio file to transcribe
    :return: The transcribed text
    """
    # Open in binary mode and hand the file object to the local ASR model.
    with open(file_path, "rb") as handle:
        result = inference(asr_model, handle)
    return result
48
+
49
+
50
def main():
    """Streamlit entry point: record or upload audio, then transcribe it.

    Renders two tabs (microphone recording and file upload), saves any
    captured audio to a timestamped ``audio_*`` file in the working
    directory, and on "Transcribe" runs the newest such file through the
    local ASR model, displaying and offering the transcript for download.
    """
    st.title("Anishinaabemowin Transcription")
    tab1, tab2 = st.tabs(["Record Audio", "Upload Audio"])

    # Record Audio tab
    with tab1:
        audio_bytes = audio_recorder()
        if audio_bytes:
            st.audio(audio_bytes, format="audio/wav")
            save_audio_file(audio_bytes, "wav")

    # Upload Audio tab
    with tab2:
        audio_file = st.file_uploader("Upload Audio", type=["wav"])
        if audio_file:
            # Derive the extension from the MIME type, e.g. "audio/wav" -> "wav".
            file_extension = audio_file.type.split('/')[1]
            save_audio_file(audio_file.read(), file_extension)

    # Transcribe button action
    if st.button("Transcribe"):
        # Bug fix: the original passed this list straight to max() without
        # checking it was non-empty, so clicking "Transcribe" before any
        # recording/upload raised ValueError and crashed the app.
        candidates = [f for f in os.listdir(".") if f.startswith("audio")]
        if not candidates:
            st.warning("No audio found. Record or upload a clip first.")
            return

        # Find the newest audio file by creation time.
        audio_file_path = max(candidates, key=os.path.getctime)

        # Transcribe the audio file with the local ASR model.
        transcript_text = transcribe_audio(audio_file_path)

        # Display the transcript
        st.header("Transcript")
        st.write(transcript_text)

        # Save the transcript to a text file.
        # NOTE(review): assumes `inference` returns a plain string — if it
        # returns a dict (e.g. {"text": ...}), f.write would raise TypeError;
        # confirm against asr.inference.
        with open("transcript.txt", "w") as f:
            f.write(transcript_text)

        # Provide a download button for the transcript
        st.download_button("Download Transcript", transcript_text)
91
+
92
+
93
+ if __name__ == "__main__":
94
+ # Set up the working directory
95
+ working_dir = os.path.dirname(os.path.abspath(__file__))
96
+ sys.path.append(working_dir)
97
 
98
+ # Run the main function
99
+ main()