LiKenun committed on
Commit
93f6882
·
1 Parent(s): 100a4fd

WebVTT parsing and chunk extraction proof-of-concept

Browse files
Files changed (1) hide show
  1. notebooks/web-vtt.ipynb +126 -0
notebooks/web-vtt.ipynb ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# WebVTT Reading and Chunking Test"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "from functools import partial\n",
17
+ "from html import escape\n",
18
+ "from io import BytesIO\n",
19
+ "from IPython.display import display_html\n",
20
+ "from itertools import chain\n",
21
+ "import re\n",
22
+ "from webvtt import Caption, WebVTT\n",
23
+ "\n",
24
+ "display_html = partial(display_html, raw=True)"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "FILE_PATH = #"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 3,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "with open(FILE_PATH, 'rb') as file:\n",
43
+ " web_vtt = WebVTT.from_buffer(BytesIO(file.read()))"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 4,
49
+ "metadata": {},
50
+ "outputs": [
51
+ {
52
+ "data": {
53
+ "text/html": [
54
+ "<ul><li>__class__</li><li>__delattr__</li><li>__dict__</li><li>__dir__</li><li>__doc__</li><li>__eq__</li><li>__format__</li><li>__ge__</li><li>__getattribute__</li><li>__getitem__</li><li>__getstate__</li><li>__gt__</li><li>__hash__</li><li>__init__</li><li>__init_subclass__</li><li>__le__</li><li>__len__</li><li>__lt__</li><li>__module__</li><li>__ne__</li><li>__new__</li><li>__reduce__</li><li>__reduce_ex__</li><li>__repr__</li><li>__setattr__</li><li>__sizeof__</li><li>__str__</li><li>__subclasshook__</li><li>__weakref__</li><li>_get_destination_file</li><li>_get_lines</li><li>_has_bom</li><li>captions</li><li>content</li><li>encoding</li><li>file</li><li>footer_comments</li><li>from_buffer</li><li>from_sbv</li><li>from_srt</li><li>from_string</li><li>header_comments</li><li>iter_slice</li><li>read</li><li>read_buffer</li><li>save</li><li>save_as_srt</li><li>styles</li><li>total_length</li><li>write</li></ul>"
55
+ ]
56
+ },
57
+ "metadata": {},
58
+ "output_type": "display_data"
59
+ }
60
+ ],
61
+ "source": [
62
+ "display_html(''.join(chain('<ul>', (f'<li>{escape(member)}</li>' for member in dir(web_vtt)), '</ul>')))"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": 5,
68
+ "metadata": {},
69
+ "outputs": [
70
+ {
71
+ "data": {
72
+ "text/html": [
73
+ "\n",
74
+ " <strong>Caption</strong> #344\n",
75
+ " <ul>\n",
76
+ " <li><strong>Time:</strong> 01:01:19.390–01:01:22.370</li>\n",
77
+ " <li><strong>Speaker:</strong> CUNY Tech Prep (CTP)</li>\n",
78
+ " <li><strong>Speech:</strong> <em>Alright. You can pick the rooms. Now go into your rooms.</em></li>\n",
79
+ " </ul>\n",
80
+ " "
81
+ ]
82
+ },
83
+ "metadata": {},
84
+ "output_type": "display_data"
85
+ }
86
+ ],
87
+ "source": [
88
+ "speaker_speech_pattern = re.compile('(?:([^:]+): )?(.*)')\n",
89
+ "\n",
90
+ "match web_vtt.captions[343]:\n",
91
+ " case Caption(identifier=identifier, start=start, end=end, text=text):\n",
92
+ " match speaker_speech_pattern.search(text).groups():\n",
93
+ " case (speaker, speech):\n",
94
+ " display_html(f\"\"\"\n",
95
+ " <strong>Caption</strong> #{identifier}\n",
96
+ " <ul>\n",
97
+ " <li><strong>Time:</strong> {start}–{end}</li>\n",
98
+ " <li><strong>Speaker:</strong> {escape(speaker)}</li>\n",
99
+ " <li><strong>Speech:</strong> <em>{escape(speech)}</em></li>\n",
100
+ " </ul>\n",
101
+ " \"\"\")"
102
+ ]
103
+ }
104
+ ],
105
+ "metadata": {
106
+ "kernelspec": {
107
+ "display_name": ".venv",
108
+ "language": "python",
109
+ "name": "python3"
110
+ },
111
+ "language_info": {
112
+ "codemirror_mode": {
113
+ "name": "ipython",
114
+ "version": 3
115
+ },
116
+ "file_extension": ".py",
117
+ "mimetype": "text/x-python",
118
+ "name": "python",
119
+ "nbconvert_exporter": "python",
120
+ "pygments_lexer": "ipython3",
121
+ "version": "3.12.3"
122
+ }
123
+ },
124
+ "nbformat": 4,
125
+ "nbformat_minor": 2
126
+ }