Spaces:
Runtime error
Runtime error
File size: 4,157 Bytes
93f6882 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# WebVTT Reading and Chunking Test"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from functools import partial\n",
"from html import escape\n",
"from io import BytesIO\n",
"from IPython.display import display_html\n",
"from itertools import chain\n",
"import re\n",
"from webvtt import Caption, WebVTT\n",
"\n",
"display_html = partial(display_html, raw=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"FILE_PATH = #"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"with open(FILE_PATH, 'rb') as file:\n",
" web_vtt = WebVTT.from_buffer(BytesIO(file.read()))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<ul><li>__class__</li><li>__delattr__</li><li>__dict__</li><li>__dir__</li><li>__doc__</li><li>__eq__</li><li>__format__</li><li>__ge__</li><li>__getattribute__</li><li>__getitem__</li><li>__getstate__</li><li>__gt__</li><li>__hash__</li><li>__init__</li><li>__init_subclass__</li><li>__le__</li><li>__len__</li><li>__lt__</li><li>__module__</li><li>__ne__</li><li>__new__</li><li>__reduce__</li><li>__reduce_ex__</li><li>__repr__</li><li>__setattr__</li><li>__sizeof__</li><li>__str__</li><li>__subclasshook__</li><li>__weakref__</li><li>_get_destination_file</li><li>_get_lines</li><li>_has_bom</li><li>captions</li><li>content</li><li>encoding</li><li>file</li><li>footer_comments</li><li>from_buffer</li><li>from_sbv</li><li>from_srt</li><li>from_string</li><li>header_comments</li><li>iter_slice</li><li>read</li><li>read_buffer</li><li>save</li><li>save_as_srt</li><li>styles</li><li>total_length</li><li>write</li></ul>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_html(''.join(chain('<ul>', (f'<li>{escape(member)}</li>' for member in dir(web_vtt)), '</ul>')))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <strong>Caption</strong> #344\n",
" <ul>\n",
" <li><strong>Time:</strong> 01:01:19.390–01:01:22.370</li>\n",
" <li><strong>Speaker:</strong> CUNY Tech Prep (CTP)</li>\n",
" <li><strong>Speech:</strong> <em>Alright. You can pick the rooms. Now go into your rooms.</em></li>\n",
" </ul>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"speaker_speech_pattern = re.compile('(?:([^:]+): )?(.*)')\n",
"\n",
"match web_vtt.captions[343]:\n",
" case Caption(identifier=identifier, start=start, end=end, text=text):\n",
" match speaker_speech_pattern.search(text).groups():\n",
" case (speaker, speech):\n",
" display_html(f\"\"\"\n",
" <strong>Caption</strong> #{identifier}\n",
" <ul>\n",
" <li><strong>Time:</strong> {start}–{end}</li>\n",
" <li><strong>Speaker:</strong> {escape(speaker)}</li>\n",
" <li><strong>Speech:</strong> <em>{escape(speech)}</em></li>\n",
" </ul>\n",
" \"\"\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|