File size: 4,157 Bytes
93f6882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# WebVTT Reading and Chunking Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library\n",
    "from functools import partial\n",
    "from html import escape\n",
    "from io import BytesIO\n",
    "from itertools import chain\n",
    "import re\n",
    "\n",
    "# Third-party\n",
    "from IPython.display import display_html\n",
    "from webvtt import Caption, WebVTT\n",
    "\n",
    "# This notebook only ever passes ready-made HTML strings to display_html,\n",
    "# so pre-bind raw=True once here instead of repeating it at every call.\n",
    "display_html = partial(display_html, raw=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the WebVTT transcript that the following cells load.\n",
    "# TODO: replace this placeholder with the path to a real .vtt file before running.\n",
    "FILE_PATH = 'transcript.vtt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the transcript as raw bytes, then parse it from an in-memory buffer.\n",
    "with open(FILE_PATH, mode='rb') as file:\n",
    "    vtt_bytes = file.read()\n",
    "\n",
    "web_vtt = WebVTT.from_buffer(BytesIO(vtt_bytes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<ul><li>__class__</li><li>__delattr__</li><li>__dict__</li><li>__dir__</li><li>__doc__</li><li>__eq__</li><li>__format__</li><li>__ge__</li><li>__getattribute__</li><li>__getitem__</li><li>__getstate__</li><li>__gt__</li><li>__hash__</li><li>__init__</li><li>__init_subclass__</li><li>__le__</li><li>__len__</li><li>__lt__</li><li>__module__</li><li>__ne__</li><li>__new__</li><li>__reduce__</li><li>__reduce_ex__</li><li>__repr__</li><li>__setattr__</li><li>__sizeof__</li><li>__str__</li><li>__subclasshook__</li><li>__weakref__</li><li>_get_destination_file</li><li>_get_lines</li><li>_has_bom</li><li>captions</li><li>content</li><li>encoding</li><li>file</li><li>footer_comments</li><li>from_buffer</li><li>from_sbv</li><li>from_srt</li><li>from_string</li><li>header_comments</li><li>iter_slice</li><li>read</li><li>read_buffer</li><li>save</li><li>save_as_srt</li><li>styles</li><li>total_length</li><li>write</li></ul>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Show every attribute name of the parsed WebVTT object as an HTML bullet list.\n",
    "member_items = ''.join(f'<li>{escape(name)}</li>' for name in dir(web_vtt))\n",
    "display_html(f'<ul>{member_items}</ul>')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "                    <strong>Caption</strong> #344\n",
       "                    <ul>\n",
       "                        <li><strong>Time:</strong> 01:01:19.390–01:01:22.370</li>\n",
       "                        <li><strong>Speaker:</strong> CUNY Tech Prep (CTP)</li>\n",
       "                        <li><strong>Speech:</strong> <em>Alright. You can pick the rooms. Now go into your rooms.</em></li>\n",
       "                    </ul>\n",
       "                "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Split caption text into an optional \"Speaker: \" prefix and the speech.\n",
    "# The speaker must sit on a single line; DOTALL lets the speech span several lines.\n",
    "speaker_speech_pattern = re.compile(r'(?:([^:\\n]+): )?(.*)', re.DOTALL)\n",
    "\n",
    "match web_vtt.captions[343]:\n",
    "    case Caption(identifier=identifier, start=start, end=end, text=text):\n",
    "        # Both parts of the pattern are optional, so fullmatch always succeeds;\n",
    "        # speaker is None when the caption carries no \"Speaker: \" prefix.\n",
    "        match speaker_speech_pattern.fullmatch(text).groups():\n",
    "            case (speaker, speech):\n",
    "                # identifier and speaker may be missing and come straight from the\n",
    "                # file, so fall back to a label and escape them before embedding.\n",
    "                display_html(f\"\"\"\n",
    "                    <strong>Caption</strong> #{escape(identifier or 'n/a')}\n",
    "                    <ul>\n",
    "                        <li><strong>Time:</strong> {start}–{end}</li>\n",
    "                        <li><strong>Speaker:</strong> {escape(speaker or 'Unknown')}</li>\n",
    "                        <li><strong>Speech:</strong> <em>{escape(speech)}</em></li>\n",
    "                    </ul>\n",
    "                \"\"\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}