File size: 4,832 Bytes
93f6882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62f7cb7
93f6882
 
 
 
 
 
 
62f7cb7
 
93f6882
 
 
 
 
 
 
 
 
 
62f7cb7
 
 
93f6882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62f7cb7
93f6882
62f7cb7
 
93f6882
 
 
 
 
 
 
 
 
 
 
 
62f7cb7
93f6882
 
 
 
 
62f7cb7
93f6882
62f7cb7
 
93f6882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# WebVTT Reading and Chunking Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime, timedelta\n",
    "from functools import partial\n",
    "from html import escape\n",
    "from io import BytesIO\n",
    "from IPython.display import display_html\n",
    "from itertools import chain\n",
    "import re\n",
    "from webvtt import Caption, WebVTT\n",
    "from webvtt.models import Timestamp\n",
    "from zoneinfo import ZoneInfo\n",
    "\n",
    "display_html = partial(display_html, raw=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "FILE_PATH = 'GMT20250411-223535_Recording.transcript.vtt'\n",
    "TIME_ZONE = ZoneInfo(\"America/New_York\")\n",
    "\n",
    "# The file name carries the recording's GMT start stamp as GMT<YYYYMMDD>-<HHMMSS>.\n",
    "# Parse it from FILE_PATH instead of repeating it by hand, so BASE_TIME cannot drift\n",
    "# out of sync when FILE_PATH is pointed at another recording.\n",
    "_recording_stamp = re.search(r'GMT\\d{8}-\\d{6}', FILE_PATH)\n",
    "if _recording_stamp is None:\n",
    "    raise ValueError(f'No GMT<YYYYMMDD>-<HHMMSS> stamp found in {FILE_PATH!r}')\n",
    "\n",
    "# Caption timestamps are later added to BASE_TIME as offsets, so it is kept\n",
    "# timezone-aware and already converted to TIME_ZONE for display.\n",
    "BASE_TIME = (\n",
    "    datetime.strptime(_recording_stamp.group(), 'GMT%Y%m%d-%H%M%S')\n",
    "    .replace(tzinfo=ZoneInfo(\"GMT\"))\n",
    "    .astimezone(TIME_ZONE)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the transcript as raw bytes, then hand the parser an in-memory buffer.\n",
    "with open(FILE_PATH, mode='rb') as vtt_file:\n",
    "    vtt_buffer = BytesIO(vtt_file.read())\n",
    "\n",
    "web_vtt = WebVTT.from_buffer(vtt_buffer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<ul><li>__class__</li><li>__delattr__</li><li>__dict__</li><li>__dir__</li><li>__doc__</li><li>__eq__</li><li>__format__</li><li>__ge__</li><li>__getattribute__</li><li>__getitem__</li><li>__getstate__</li><li>__gt__</li><li>__hash__</li><li>__init__</li><li>__init_subclass__</li><li>__le__</li><li>__len__</li><li>__lt__</li><li>__module__</li><li>__ne__</li><li>__new__</li><li>__reduce__</li><li>__reduce_ex__</li><li>__repr__</li><li>__setattr__</li><li>__sizeof__</li><li>__str__</li><li>__subclasshook__</li><li>__weakref__</li><li>_get_destination_file</li><li>_get_lines</li><li>_has_bom</li><li>captions</li><li>content</li><li>encoding</li><li>file</li><li>footer_comments</li><li>from_buffer</li><li>from_sbv</li><li>from_srt</li><li>from_string</li><li>header_comments</li><li>iter_slice</li><li>read</li><li>read_buffer</li><li>save</li><li>save_as_srt</li><li>styles</li><li>total_length</li><li>write</li></ul>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Show every attribute name reported by dir() for the parsed object as an HTML bullet list.\n",
    "member_items = ''.join(f'<li>{escape(member)}</li>' for member in dir(web_vtt))\n",
    "display_html(f'<ul>{member_items}</ul>')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "                    <strong>Caption</strong> #344\n",
       "                    <ul>\n",
       "                        <li><strong>Start:</strong> Friday, April 11, 2025, 07:36:54 PM EDT</li>\n",
       "                        <li><strong>Speaker:</strong> CUNY Tech Prep (CTP)</li>\n",
       "                        <li><strong>Speech:</strong> Alright. You can pick the rooms. Now go into your rooms.</li>\n",
       "                        <li><strong>End:</strong> Friday, April 11, 2025, 07:36:57 PM EDT</li>\n",
       "                    </ul>\n",
       "                "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Splits caption text shaped like \"Speaker: speech\" into (speaker, speech).\n",
    "# The speaker prefix is optional (speaker is None without it) and limited to the first line;\n",
    "# re.DOTALL lets the speech group run across line breaks so multi-line captions are kept whole.\n",
    "speaker_speech_pattern = re.compile(r'(?:([^:\\n]+): )?(.*)', re.DOTALL)\n",
    "\n",
    "match web_vtt.captions[343]:\n",
    "    case Caption(identifier=identifier, start_time=start_time, end_time=end_time, text=text):\n",
    "        # Every part of the pattern is optional, so search() always finds a match here.\n",
    "        match speaker_speech_pattern.search(text).groups():\n",
    "            case (speaker, speech):\n",
    "                # timedelta(**timestamp.__dict__) assumes the Timestamp attributes are named like\n",
    "                # timedelta keyword arguments -- TODO confirm this still holds after a webvtt-py upgrade.\n",
    "                # A caption without a speaker prefix falls back to 'Unknown', since escape() rejects None.\n",
    "                display_html(f\"\"\"\n",
    "                    <strong>Caption</strong> #{identifier}\n",
    "                    <ul>\n",
    "                        <li><strong>Start:</strong> {BASE_TIME + timedelta(**start_time.__dict__):%A, %B %d, %Y, %I:%M:%S %p %Z}</li>\n",
    "                        <li><strong>Speaker:</strong> {escape(speaker or 'Unknown')}</li>\n",
    "                        <li><strong>Speech:</strong> {escape(speech)}</li>\n",
    "                        <li><strong>End:</strong> {BASE_TIME + timedelta(**end_time.__dict__):%A, %B %d, %Y, %I:%M:%S %p %Z}</li>\n",
    "                    </ul>\n",
    "                \"\"\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}