Spaces:

samarthsrivastava
/

voice_to_text_system

Build error

File size: 12,317 Bytes

787a546

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "5c7d8fe6-69ca-4f29-9046-0b0bc9f31911",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "99ee6b03c5154644998c23c837444e83",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle()), B…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2b3e4f24da8d4c198b5d15f0f3f7399d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Output()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import ipywidgets as widgets\n",
    "from IPython.display import display, clear_output\n",
    "from threading import Thread\n",
    "from queue import Queue\n",
    "import time\n",
    "\n",
    "messages = Queue()\n",
    "recordings = Queue()\n",
    "\n",
    "record_button = widgets.Button(\n",
    "    description=\"Record\",\n",
    "    disabled=False,\n",
    "    button_style=\"success\",\n",
    "    icon=\"microphone\"\n",
    ")\n",
    "\n",
    "stop_button = widgets.Button(\n",
    "    description=\"Stop\",\n",
    "    disabled=False,\n",
    "    button_style=\"warning\",\n",
    "    icon=\"stop\"\n",
    ")\n",
    "\n",
    "output = widgets.Output()\n",
    "\n",
    "def record_microphone():\n",
    "    while not messages.empty():\n",
    "        time.sleep(1)  # Simulate recording\n",
    "        recordings.put(\"Audio recorded.\")  # Simulated recorded audio data\n",
    "\n",
    "def speech_recognition(output_widget):\n",
    "    while not messages.empty():\n",
    "        time.sleep(2)  # Simulate transcription\n",
    "        with output_widget:\n",
    "            clear_output(wait=True)\n",
    "            display(\"Transcription: Hello, how are you?\")  # Simulated transcription result\n",
    "\n",
    "def start_recording(data):\n",
    "    if not messages.empty():\n",
    "        return  # Recording already in progress\n",
    "\n",
    "    messages.put(True)\n",
    "    with output:\n",
    "        clear_output(wait=True)\n",
    "        display(\"Starting...\")\n",
    "\n",
    "    record = Thread(target=record_microphone)\n",
    "    record.start()\n",
    "\n",
    "    transcribe = Thread(target=speech_recognition, args=(output,))\n",
    "    transcribe.start()\n",
    "\n",
    "def stop_recording(data):\n",
    "    if messages.empty():\n",
    "        return  # No recording in progress\n",
    "\n",
    "    messages.get()\n",
    "    with output:\n",
    "        clear_output(wait=True)\n",
    "        display(\"Stopped.\")\n",
    "\n",
    "record_button.on_click(start_recording)\n",
    "stop_button.on_click(stop_recording)\n",
    "\n",
    "display(widgets.HBox([record_button, stop_button]), output)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "bdcb9097-ab31-4dcc-9e2a-4e0818fceb3f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pyaudio in c:\\users\\samarth srivastava\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (0.2.14)\n"
     ]
    }
   ],
   "source": [
    "!python -m pip install pyaudio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "34112777-1845-4aff-80de-099ceed52f01",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n",
      "{'index': 1, 'structVersion': 2, 'name': 'Microphone Array (IntelÂ® Smart ', 'hostApi': 0, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n",
      "{'index': 2, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Output', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n",
      "{'index': 3, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}\n",
      "{'index': 4, 'structVersion': 2, 'name': 'Primary Sound Capture Driver', 'hostApi': 1, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.12, 'defaultLowOutputLatency': 0.0, 'defaultHighInputLatency': 0.24, 'defaultHighOutputLatency': 0.0, 'defaultSampleRate': 44100.0}\n",
      "{'index': 5, 'structVersion': 2, 'name': 'Microphone Array (IntelÂ® Smart Sound Technology (IntelÂ® SST))', 'hostApi': 1, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.12, 'defaultLowOutputLatency': 0.0, 'defaultHighInputLatency': 0.24, 'defaultHighOutputLatency': 0.0, 'defaultSampleRate': 44100.0}\n",
      "{'index': 6, 'structVersion': 2, 'name': 'Primary Sound Driver', 'hostApi': 1, 'maxInputChannels': 0, 'maxOutputChannels': 8, 'defaultLowInputLatency': 0.0, 'defaultLowOutputLatency': 0.12, 'defaultHighInputLatency': 0.0, 'defaultHighOutputLatency': 0.24, 'defaultSampleRate': 44100.0}\n",
      "{'index': 7, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 1, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.0, 'defaultLowOutputLatency': 0.12, 'defaultHighInputLatency': 0.0, 'defaultHighOutputLatency': 0.24, 'defaultSampleRate': 44100.0}\n",
      "{'index': 8, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 2, 'maxInputChannels': 0, 'maxOutputChannels': 8, 'defaultLowInputLatency': 0.0, 'defaultLowOutputLatency': 0.003, 'defaultHighInputLatency': 0.0, 'defaultHighOutputLatency': 0.01, 'defaultSampleRate': 48000.0}\n",
      "{'index': 9, 'structVersion': 2, 'name': 'Microphone Array (IntelÂ® Smart Sound Technology (IntelÂ® SST))', 'hostApi': 2, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.002, 'defaultLowOutputLatency': 0.0, 'defaultHighInputLatency': 0.01, 'defaultHighOutputLatency': 0.0, 'defaultSampleRate': 48000.0}\n",
      "{'index': 10, 'structVersion': 2, 'name': 'Microphone Array 1 (IntelÂ® Smart Sound Technology (IntelÂ® SST) Microphone)', 'hostApi': 3, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n",
      "{'index': 11, 'structVersion': 2, 'name': 'Microphone Array 2 (IntelÂ® Smart Sound Technology (IntelÂ® SST) Microphone)', 'hostApi': 3, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 16000.0}\n",
      "{'index': 12, 'structVersion': 2, 'name': 'Microphone Array 3 (IntelÂ® Smart Sound Technology (IntelÂ® SST) Microphone)', 'hostApi': 3, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 16000.0}\n",
      "{'index': 13, 'structVersion': 2, 'name': 'Stereo Mix (Realtek HD Audio Stereo input)', 'hostApi': 3, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n",
      "{'index': 14, 'structVersion': 2, 'name': 'Headphones (Realtek HD Audio 2nd output with SST)', 'hostApi': 3, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n",
      "{'index': 15, 'structVersion': 2, 'name': 'Speakers (Realtek HD Audio output with SST)', 'hostApi': 3, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 48000.0}\n",
      "{'index': 16, 'structVersion': 2, 'name': 'Microphone (Realtek HD Audio Mic input)', 'hostApi': 3, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.04, 'defaultHighOutputLatency': 0.04, 'defaultSampleRate': 44100.0}\n"
     ]
    }
   ],
   "source": [
    "import pyaudio\n",
    "\n",
    "p = pyaudio.PyAudio()\n",
    "for i in range(p.get_device_count()):\n",
    "    print(p.get_device_info_by_index(i))\n",
    "\n",
    "p.terminate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "2e74dacf-1a91-4dfa-bf91-c64c72755d75",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyaudio\n",
    "from queue import Queue\n",
    "\n",
    "CHANNELS = 1\n",
    "FRAME_RATE = 16000\n",
    "RECORD_SECONDS = 20\n",
    "AUDIO_FORMAT = pyaudio.paInt16\n",
    "SAMPLE_SIZE = 2\n",
    "\n",
    "messages = Queue()\n",
    "recordings = Queue()\n",
    "\n",
    "def record_microphone(chunk=1024):\n",
    "    p = pyaudio.PyAudio()\n",
    "\n",
    "    stream = p.open(format=AUDIO_FORMAT,\n",
    "                    channels=CHANNELS,\n",
    "                    rate=FRAME_RATE,\n",
    "                    input=True,\n",
    "                    input_device_index=1,\n",
    "                    frames_per_buffer=chunk)\n",
    "\n",
    "    frames = []\n",
    "\n",
    "    while not messages.empty():\n",
    "        data = stream.read(chunk)\n",
    "        frames.append(data)\n",
    "\n",
    "        if len(frames) >= int(FRAME_RATE * RECORD_SECONDS / chunk):\n",
    "            recordings.put(frames.copy())\n",
    "            frames = []\n",
    "\n",
    "    stream.stop_stream()\n",
    "    stream.close()\n",
    "    p.terminate()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "931dc754-e034-45e7-981b-a9210c1fe6e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import subprocess\n",
    "import json\n",
    "from vosk import Model, KaldiRecognizer\n",
    "\n",
    "model = Model(model_name=\"vosk-model-en-us-0.42-gigaspeech\")\n",
    "rec = KaldiRecognizer(model, FRAME_RATE)\n",
    "rec.SetWords(True)\n",
    "\n",
    "def speech_recognition(output):\n",
    "    while not messages.empty():\n",
    "        frames = recordings.get()\n",
    "\n",
    "        rec.AcceptWaveform(b''.join(frames))\n",
    "        result = rec.Result()\n",
    "        text = json.loads(result)[\"text\"]\n",
    "\n",
    "        cased = subprocess.check_output(\"python recasepunc/recasepunc.py predict recasepunc/checkpoint\", shell=True, text=True, input=text)\n",
    "        output.append_stdout(cased)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a27fb138-d3a9-4e04-83fe-23aca2921d92",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}