{ "cells": [ { "cell_type": "code", "execution_count": 29, "id": "5c7d8fe6-69ca-4f29-9046-0b0bc9f31911", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "99ee6b03c5154644998c23c837444e83", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle()), B…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2b3e4f24da8d4c198b5d15f0f3f7399d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import ipywidgets as widgets\n", "from IPython.display import display, clear_output\n", "from threading import Thread\n", "from queue import Queue\n", "import time\n", "\n", "messages = Queue()\n", "recordings = Queue()\n", "\n", "record_button = widgets.Button(\n", " description=\"Record\",\n", " disabled=False,\n", " button_style=\"success\",\n", " icon=\"microphone\"\n", ")\n", "\n", "stop_button = widgets.Button(\n", " description=\"Stop\",\n", " disabled=False,\n", " button_style=\"warning\",\n", " icon=\"stop\"\n", ")\n", "\n", "output = widgets.Output()\n", "\n", "def record_microphone():\n", " while not messages.empty():\n", " time.sleep(1) # Simulate recording\n", " recordings.put(\"Audio recorded.\") # Simulated recorded audio data\n", "\n", "def speech_recognition(output_widget):\n", " while not messages.empty():\n", " time.sleep(2) # Simulate transcription\n", " with output_widget:\n", " clear_output(wait=True)\n", " display(\"Transcription: Hello, how are you?\") # Simulated transcription result\n", "\n", "def start_recording(data):\n", " if not messages.empty():\n", " return # Recording already in progress\n", "\n", " messages.put(True)\n", " with output:\n", " clear_output(wait=True)\n", " display(\"Starting...\")\n", "\n", " record = Thread(target=record_microphone)\n", " record.start()\n", "\n", " transcribe = Thread(target=speech_recognition, args=(output,))\n", " transcribe.start()\n", "\n", "def stop_recording(data):\n", " if messages.empty():\n", " return # No recording in progress\n", "\n", " messages.get()\n", " with output:\n", " clear_output(wait=True)\n", " display(\"Stopped.\")\n", "\n", "record_button.on_click(start_recording)\n", "stop_button.on_click(stop_recording)\n", "\n", "display(widgets.HBox([record_button, stop_button]), output)\n" ] }, { "cell_type": "code", "execution_count": 30, "id": "bdcb9097-ab31-4dcc-9e2a-4e0818fceb3f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: pyaudio in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (0.2.14)\n" ] } ], "source": [ "!python -m pip install pyaudio" ] }, { "cell_type": "code", "execution_count": 31, "id": "34112777-1845-4aff-80de-099ceed52f01", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n", "{'index': 1, 'structVersion': 2, 'name': 'Microphone Array (Intel® Smart ', 'hostApi': 0, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n", "{'index': 2, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Output', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n", "{'index': 3, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n", "{'index': 4, 'structVersion': 2, 'name': 'Primary Sound Capture Driver', 'hostApi': 1, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.12, 'defaultLowOutputLatency': 0.0, 'defaultHighInputLatency': 0.24, 'defaultHighOutputLatency': 0.0, 'defaultSampleRate': 44100.0}\n", "{'index': 5, 'structVersion': 2, 'name': 'Microphone Array (Intel® Smart Sound Technology (Intel® SST))', 'hostApi': 1, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.12, 'defaultLowOutputLatency': 0.0, 'defaultHighInputLatency': 0.24, 'defaultHighOutputLatency': 0.0, 'defaultSampleRate': 44100.0}\n", "{'index': 6, 'structVersion': 2, 'name': 'Primary Sound Driver', 'hostApi': 1, 'maxInputChannels': 0, 'maxOutputChannels': 8, 'defaultLowInputLatency': 0.0, 'defaultLowOutputLatency': 0.12, 'defaultHighInputLatency': 0.0, 'defaultHighOutputLatency': 0.24, 'defaultSampleRate': 44100.0}\n", "{'index': 7, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 1, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.0, 'defaultLowOutputLatency': 0.12, 'defaultHighInputLatency': 0.0, 'defaultHighOutputLatency': 0.24, 'defaultSampleRate': 44100.0}\n", "{'index': 8, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 2, 'maxInputChannels': 0, 'maxOutputChannels': 8, 'defaultLowInputLatency': 0.0, 'defaultLowOutputLatency': 0.003, 'defaultHighInputLatency': 0.0, 'defaultHighOutputLatency': 0.01, 'defaultSampleRate': 48000.0}\n", "{'index': 9, 'structVersion': 2, 'name': 'Microphone Array (Intel® Smart Sound Technology (Intel® SST))', 'hostApi': 2, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.002, 'defaultLowOutputLatency': 0.0, 'defaultHighInputLatency': 0.01, 'defaultHighOutputLatency': 0.0, 'defaultSampleRate': 48000.0}\n", "{'index': 10, 'structVersion': 2, 'name': 'Microphone Array 1 (Intel® Smart Sound Technology (Intel® SST) Microphone)', 'hostApi': 3, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n", "{'index': 11, 'structVersion': 2, 'name': 'Microphone Array 2 (Intel® Smart Sound Technology (Intel® SST) Microphone)', 'hostApi': 3, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 16000.0}\n", "{'index': 12, 'structVersion': 2, 'name': 'Microphone Array 3 (Intel® Smart Sound Technology (Intel® SST) Microphone)', 'hostApi': 3, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 16000.0}\n", "{'index': 13, 'structVersion': 2, 'name': 'Stereo Mix (Realtek HD Audio Stereo input)', 'hostApi': 3, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n", "{'index': 14, 'structVersion': 2, 'name': 'Headphones (Realtek HD Audio 2nd output with SST)', 'hostApi': 3, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n", "{'index': 15, 'structVersion': 2, 'name': 'Speakers (Realtek HD Audio output with SST)', 'hostApi': 3, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n", "{'index': 16, 'structVersion': 2, 'name': 'Microphone (Realtek HD Audio Mic input)', 'hostApi': 3, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 44100.0}\n" ] } ], "source": [ "import pyaudio\n", "\n", "p = pyaudio.PyAudio()\n", "for i in range(p.get_device_count()):\n", " print(p.get_device_info_by_index(i))\n", "\n", "p.terminate()" ] }, { "cell_type": "code", "execution_count": 32, "id": "2e74dacf-1a91-4dfa-bf91-c64c72755d75", "metadata": {}, "outputs": [], "source": [ "import pyaudio\n", "from queue import Queue\n", "\n", "CHANNELS = 1\n", "FRAME_RATE = 16000\n", "RECORD_SECONDS = 20\n", "AUDIO_FORMAT = pyaudio.paInt16\n", "SAMPLE_SIZE = 2\n", "\n", "messages = Queue()\n", "recordings = Queue()\n", "\n", "def record_microphone(chunk=1024):\n", " p = pyaudio.PyAudio()\n", "\n", " stream = p.open(format=AUDIO_FORMAT,\n", " channels=CHANNELS,\n", " rate=FRAME_RATE,\n", " input=True,\n", " input_device_index=1,\n", " frames_per_buffer=chunk)\n", "\n", " frames = []\n", "\n", " while not messages.empty():\n", " data = stream.read(chunk)\n", " frames.append(data)\n", "\n", " if len(frames) >= int(FRAME_RATE * RECORD_SECONDS / chunk):\n", " recordings.put(frames.copy())\n", " frames = []\n", "\n", " stream.stop_stream()\n", " stream.close()\n", " p.terminate()\n" ] }, { "cell_type": "code", "execution_count": 33, "id": "931dc754-e034-45e7-981b-a9210c1fe6e9", "metadata": {}, "outputs": [], "source": [ "import subprocess\n", "import json\n", "from vosk import Model, KaldiRecognizer\n", "\n", "model = Model(model_name=\"vosk-model-en-us-0.42-gigaspeech\")\n", "rec = KaldiRecognizer(model, FRAME_RATE)\n", "rec.SetWords(True)\n", "\n", "def speech_recognition(output):\n", " while not messages.empty():\n", " frames = recordings.get()\n", "\n", " rec.AcceptWaveform(b''.join(frames))\n", " result = rec.Result()\n", " text = json.loads(result)[\"text\"]\n", "\n", " cased = subprocess.check_output(\"python recasepunc/recasepunc.py predict recasepunc/checkpoint\", shell=True, text=True, input=text)\n", " output.append_stdout(cased)" ] }, { "cell_type": "code", "execution_count": null, "id": "a27fb138-d3a9-4e04-83fe-23aca2921d92", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }