rrevo's picture
server
71f6464
raw
history blame
3.39 kB
import httpx
from queue import Queue
import logging
from datetime import UTC, datetime, timedelta
from time import sleep
import pickle
import speech_recognition as sr
from audio_utils import get_microphone, get_speech_recognizer, get_all_audio_queue, to_audio_array, AudioChunk
logger = logging.getLogger(__name__)
TRANSCRIBING_SERVER = "http://localhost:3535/transcribe"
def main():
recording_duration = 2
sample_rate = 16000
energy_threshold = 300
data_queue = Queue()
microphone = get_microphone(sample_rate=sample_rate)
speech_recognizer = get_speech_recognizer(energy_threshold=energy_threshold)
with microphone:
speech_recognizer.adjust_for_ambient_noise(source=microphone)
def record_callback(_, audio: sr.AudioData) -> None:
data = audio.get_raw_data()
data_queue.put(data)
speech_recognizer.listen_in_background(source=microphone, callback=record_callback)
print("\n🎤 Microphone is now listening...\n")
current_audio_chunk = AudioChunk(start_time=datetime.now(tz=UTC))
while True:
try:
now = datetime.now(tz=UTC)
# Pull raw recorded audio from the queue.
if not data_queue.empty():
# Store end time if we're over the recording time limit.
if now - current_audio_chunk.start_time > timedelta(seconds=recording_duration):
current_audio_chunk.end_time = now
# Get audio data from queue
audio_data = get_all_audio_queue(data_queue)
audio_np_array = to_audio_array(audio_data)
if current_audio_chunk.is_complete:
print('start serialize')
serialized = pickle.dumps(current_audio_chunk.audio_array)
print('end serialize')
print('start req')
response = httpx.post(TRANSCRIBING_SERVER, data=serialized)
print('req done', response.text, response.status_code)
# text = transcribe_model.transcribe(current_audio_chunk.audio_array)
# sentence = Sentence(
# start_time=current_audio_chunk.start_time, end_time=current_audio_chunk.end_time, text=text
# )
current_audio_chunk = AudioChunk(
audio_array=audio_np_array, start_time=datetime.now(tz=UTC)
)
# print(sentence.text) # noqa: T201
else:
current_audio_chunk.update_array(audio_np_array)
# Flush stdout
print("", end="", flush=True) # noqa: T201
# Infinite loops are bad for processors, must sleep.
sleep(0.25)
except KeyboardInterrupt:
current_audio_chunk.end_time = datetime.now(tz=UTC)
if current_audio_chunk.is_complete:
logger.warning("⚠️ Transcribing last chunk...")
# text = transcribe_model.transcribe(current_audio_chunk.audio_array)
# sentence = Sentence(
# start_time=current_audio_chunk.start_time, end_time=current_audio_chunk.end_time, text=text
# )
# print(sentence.text) # noqa: T201
break
if __name__ == '__main__':
main()