Spaces:

KingZack
/

ctp-slack-bot

Runtime error

App Files Files Community

LiKenun committed on Apr 14

Commit

93f6882

1 Parent(s): 100a4fd

WebVTT parsing and chunk extraction proof-of-concept

Browse files

Files changed (1) hide show

notebooks/web-vtt.ipynb +126 -0

notebooks/web-vtt.ipynb ADDED Viewed

	@@ -0,0 +1,126 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# WebVTT Reading and Chunking Test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from functools import partial\n",
+    "from html import escape\n",
+    "from io import BytesIO\n",
+    "from IPython.display import display_html\n",
+    "from itertools import chain\n",
+    "import re\n",
+    "from webvtt import Caption, WebVTT\n",
+    "\n",
+    "display_html = partial(display_html, raw=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: replace the placeholder with the path to the WebVTT transcript to parse.\n",
+    "FILE_PATH = 'path/to/transcript.vtt'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the file as raw bytes, then hand them to WebVTT.from_buffer via an in-memory buffer.\n",
+    "with open(FILE_PATH, mode='rb') as file:\n",
+    "    vtt_bytes = file.read()\n",
+    "\n",
+    "web_vtt = WebVTT.from_buffer(BytesIO(vtt_bytes))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<ul><li>__class__</li><li>__delattr__</li><li>__dict__</li><li>__dir__</li><li>__doc__</li><li>__eq__</li><li>__format__</li><li>__ge__</li><li>__getattribute__</li><li>__getitem__</li><li>__getstate__</li><li>__gt__</li><li>__hash__</li><li>__init__</li><li>__init_subclass__</li><li>__le__</li><li>__len__</li><li>__lt__</li><li>__module__</li><li>__ne__</li><li>__new__</li><li>__reduce__</li><li>__reduce_ex__</li><li>__repr__</li><li>__setattr__</li><li>__sizeof__</li><li>__str__</li><li>__subclasshook__</li><li>__weakref__</li><li>_get_destination_file</li><li>_get_lines</li><li>_has_bom</li><li>captions</li><li>content</li><li>encoding</li><li>file</li><li>footer_comments</li><li>from_buffer</li><li>from_sbv</li><li>from_srt</li><li>from_string</li><li>header_comments</li><li>iter_slice</li><li>read</li><li>read_buffer</li><li>save</li><li>save_as_srt</li><li>styles</li><li>total_length</li><li>write</li></ul>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Show every attribute name of the parsed object as an HTML bullet list.\n",
+    "display_html(\n",
+    "    '<ul>'\n",
+    "    + ''.join(f'<li>{escape(name)}</li>' for name in dir(web_vtt))\n",
+    "    + '</ul>'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "                    <strong>Caption</strong> #344\n",
+       "                    <ul>\n",
+       "                        <li><strong>Time:</strong> 01:01:19.390–01:01:22.370</li>\n",
+       "                        <li><strong>Speaker:</strong> CUNY Tech Prep (CTP)</li>\n",
+       "                        <li><strong>Speech:</strong> <em>Alright. You can pick the rooms. Now go into your rooms.</em></li>\n",
+       "                    </ul>\n",
+       "                "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Optional 'Speaker: ' prefix followed by the speech.\n",
+    "# re.DOTALL lets the speech group span line breaks, so multi-line cues are not cut off after their first line.\n",
+    "speaker_speech_pattern = re.compile(r'(?:([^:]+): )?(.*)', re.DOTALL)\n",
+    "\n",
+    "# Zero-based index; the saved output shows this cue carries identifier 344.\n",
+    "match web_vtt.captions[343]:\n",
+    "    case Caption(identifier=identifier, start=start, end=end, text=text):\n",
+    "        # The pattern can match the empty string, so search() always returns a match object here.\n",
+    "        match speaker_speech_pattern.search(text).groups():\n",
+    "            case (speaker, speech):\n",
+    "                # The speaker group is None when the cue has no 'Speaker: ' prefix; escape() would raise on None.\n",
+    "                # The identifier is free-form text (or None), so stringify and escape it as well.\n",
+    "                display_html(f\"\"\"\n",
+    "                    <strong>Caption</strong> #{escape(str(identifier))}\n",
+    "                    <ul>\n",
+    "                        <li><strong>Time:</strong> {start}–{end}</li>\n",
+    "                        <li><strong>Speaker:</strong> {escape(speaker or 'Unknown speaker')}</li>\n",
+    "                        <li><strong>Speech:</strong> <em>{escape(speech)}</em></li>\n",
+    "                    </ul>\n",
+    "                \"\"\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}