import time
import datetime
import logging
import soundfile
import streamlit as st
from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
import numpy as np
import pydub
from pathlib import Path
from asr import load_model, inference
LOG_DIR = "./logs"
DATA_DIR = "./data"
logger = logging.getLogger(__name__)

# Define a custom audio processor to handle microphone input
class AudioProcessor(AudioProcessorBase):
    def __init__(self):
        self.audio_data = []

    def recv(self, frame):
        # streamlit-webrtc calls recv() for each incoming frame; convert it to a
        # NumPy array of 16-bit samples and keep it for later processing
        audio_array = np.frombuffer(frame.to_ndarray(), dtype=np.int16)
        self.audio_data.append(audio_array)
        return frame

    def get_audio_data(self):
        # Combine all captured audio frames into a single array
        if self.audio_data:
            combined = np.concatenate(self.audio_data, axis=0)
            return combined
        return None
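
# A minimal sketch of an extra helper, not part of the original app: it writes the
# int16 samples returned by AudioProcessor.get_audio_data() to a timestamped WAV
# under DATA_DIR so that captured microphone audio could be passed to inference()
# the same way as an uploaded file. The name save_captured_audio and the 16 kHz
# default sample rate are assumptions.
def save_captured_audio(audio_data: np.ndarray, samplerate: int = 16000) -> Path:
    now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    audio_dir = Path(DATA_DIR) / f"{now_time}"
    audio_dir.mkdir(parents=True, exist_ok=True)
    audio_path = audio_dir / "microphone.wav"
    soundfile.write(audio_path, audio_data, samplerate)  # int16 data -> 16-bit PCM WAV
    return audio_path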

def upload_audio() -> Path:
    # Upload an audio file
    uploaded_file = st.file_uploader("Choose an audio file (wav, mp3, flac)", type=['wav', 'mp3', 'flac'])
    if uploaded_file is not None:
        # Read the uploaded audio
        audio_data, samplerate = soundfile.read(uploaded_file)
        # Make a timestamped save directory
        now = datetime.datetime.now()
        now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
        audio_dir = Path(DATA_DIR) / f"{now_time}"
        audio_dir.mkdir(parents=True, exist_ok=True)
        audio_path = audio_dir / uploaded_file.name
        soundfile.write(audio_path, audio_data, samplerate)
        # Show the saved audio file
        with open(audio_path, 'rb') as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes, format=uploaded_file.type)
        return audio_path

@st.cache_resource(show_spinner=False)
def call_load_model():
    generator = load_model()
    return generator

def main():
    st.header("Speech-to-Text app with Streamlit")
    st.markdown(
        """
        This STT app uses a fine-tuned MMS ASR model.
        """
    )
    audio_path = upload_audio()
    logger.info(f"Uploaded audio file: {audio_path}")
    with st.spinner(text="Wait for loading ASR Model..."):
        generator = call_load_model()
    if audio_path is not None:
        start_time = time.time()
        with st.spinner(text='Wait for inference...'):
            output = inference(generator, audio_path)
        end_time = time.time()
        process_time = time.gmtime(end_time - start_time)
        process_time = time.strftime("%H hour %M min %S secs", process_time)
        st.success(f"Inference finished in {process_time}.")
        st.write(f"output: {output['text']}")

    st.title("Microphone Input for ASR")
    # Start the WebRTC stream; the returned context gives access to the processor
    # instance that actually receives the microphone frames
    webrtc_ctx = webrtc_streamer(
        key="audio",
        mode=WebRtcMode.SENDONLY,
        audio_processor_factory=AudioProcessor,
        media_stream_constraints={"audio": True, "video": False},
    )

    # Streamlit reruns this script on every interaction, so collect the captured
    # audio before handling the buttons to avoid referencing undefined variables
    audio_processor = webrtc_ctx.audio_processor
    audio_data = audio_processor.get_audio_data() if audio_processor is not None else None
    audio_segment = None
    if audio_data is not None:
        # Convert the NumPy array to a WAV-like audio segment
        audio_segment = pydub.AudioSegment(
            audio_data.tobytes(),
            frame_rate=16000,  # assumed sample rate; WebRTC audio is often 48 kHz, adjust to match the stream
            sample_width=2,    # 16-bit audio
            channels=1         # mono
        )

    if st.button("Process Audio"):
        if audio_segment is not None:
            # Save or process audio_segment as needed
            st.success("Audio captured successfully!")
            # st.audio(audio_segment.export(format="wav"), format="audio/wav")
        else:
            st.warning("No audio data captured!")

    if st.button("Transcribe Audio"):
        if audio_segment is not None:
            # Perform ASR on the captured audio
            transcription = inference(generator, audio_segment.raw_data)
            st.text_area("Transcription", transcription["text"])
        else:
            st.warning("No audio data to transcribe!")

if __name__ == "__main__":
    # Set up logging to both the console and a timestamped log file
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter("%(levelname)8s %(asctime)s %(name)s %(message)s")
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    now = datetime.datetime.now()
    now_time = now.strftime('%Y-%m-%d-%H:%M:%S')
    log_dir = Path(LOG_DIR)
    log_dir.mkdir(parents=True, exist_ok=True)
    log_file = log_dir / f"{now_time}.log"
    file_handler = logging.FileHandler(str(log_file), encoding='utf-8')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    logger.info('Start App')
    main()
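
# To launch the app locally, the standard Streamlit invocation applies
# (assuming this file is saved as app.py; the actual filename is not shown here):
#   streamlit run app.py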