Spaces:
Runtime error
Runtime error
File size: 4,832 Bytes
93f6882 62f7cb7 93f6882 62f7cb7 93f6882 62f7cb7 93f6882 62f7cb7 93f6882 62f7cb7 93f6882 62f7cb7 93f6882 62f7cb7 93f6882 62f7cb7 93f6882 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# WebVTT Reading and Chunking Test"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime, timedelta\n",
"from functools import partial\n",
"from html import escape\n",
"from io import BytesIO\n",
"from IPython.display import display_html\n",
"from itertools import chain\n",
"import re\n",
"from webvtt import Caption, WebVTT\n",
"from webvtt.models import Timestamp\n",
"from zoneinfo import ZoneInfo\n",
"\n",
"display_html = partial(display_html, raw=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Transcript to load and the time zone used when rendering caption times.\n",
"FILE_PATH = 'GMT20250411-223535_Recording.transcript.vtt'\n",
"TIME_ZONE = ZoneInfo(\"America/New_York\")\n",
"\n",
"# GMT instant matching the stamp embedded in FILE_PATH (presumably the recording start),\n",
"# converted to TIME_ZONE so later cells can add caption offsets to it.\n",
"recording_start_gmt = datetime(2025, 4, 11, 22, 35, 35, tzinfo=ZoneInfo(\"GMT\"))\n",
"BASE_TIME = recording_start_gmt.astimezone(TIME_ZONE)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Read the transcript as raw bytes, then let WebVTT parse it from an in-memory buffer.\n",
"with open(FILE_PATH, mode='rb') as vtt_file:\n",
"    vtt_bytes = vtt_file.read()\n",
"\n",
"web_vtt = WebVTT.from_buffer(BytesIO(vtt_bytes))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<ul><li>__class__</li><li>__delattr__</li><li>__dict__</li><li>__dir__</li><li>__doc__</li><li>__eq__</li><li>__format__</li><li>__ge__</li><li>__getattribute__</li><li>__getitem__</li><li>__getstate__</li><li>__gt__</li><li>__hash__</li><li>__init__</li><li>__init_subclass__</li><li>__le__</li><li>__len__</li><li>__lt__</li><li>__module__</li><li>__ne__</li><li>__new__</li><li>__reduce__</li><li>__reduce_ex__</li><li>__repr__</li><li>__setattr__</li><li>__sizeof__</li><li>__str__</li><li>__subclasshook__</li><li>__weakref__</li><li>_get_destination_file</li><li>_get_lines</li><li>_has_bom</li><li>captions</li><li>content</li><li>encoding</li><li>file</li><li>footer_comments</li><li>from_buffer</li><li>from_sbv</li><li>from_srt</li><li>from_string</li><li>header_comments</li><li>iter_slice</li><li>read</li><li>read_buffer</li><li>save</li><li>save_as_srt</li><li>styles</li><li>total_length</li><li>write</li></ul>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Render every name reported by dir(web_vtt) as an HTML bullet list.\n",
"member_items = ''.join(f'<li>{escape(member)}</li>' for member in dir(web_vtt))\n",
"display_html(f'<ul>{member_items}</ul>')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <strong>Caption</strong> #344\n",
" <ul>\n",
" <li><strong>Start:</strong> Friday, April 11, 2025, 07:36:54 PM EDT</li>\n",
" <li><strong>Speaker:</strong> CUNY Tech Prep (CTP)</li>\n",
" <li><strong>Speech:</strong> Alright. You can pick the rooms. Now go into your rooms.</li>\n",
" <li><strong>End:</strong> Friday, April 11, 2025, 07:36:57 PM EDT</li>\n",
" </ul>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Optional \"Speaker: \" prefix, then the speech. DOTALL lets the speech group span\n",
"# every line of a multi-line cue instead of stopping at the first newline.\n",
"speaker_speech_pattern = re.compile('(?:([^:]+): )?(.*)', re.DOTALL)\n",
"\n",
"match web_vtt.captions[343]:\n",
"    case Caption(identifier=identifier, start_time=start_time, end_time=end_time, text=text):\n",
"        # Every part of the pattern is optional, so search() always matches;\n",
"        # speaker is None when the cue text has no \"Speaker: \" prefix.\n",
"        speaker, speech = speaker_speech_pattern.search(text).groups()\n",
"        speaker_html = '<em>Unknown</em>' if speaker is None else escape(speaker)\n",
"\n",
"        # The cue timestamps are treated as offsets added to BASE_TIME; their attributes\n",
"        # are passed straight through as timedelta keyword arguments.\n",
"        starts_at = BASE_TIME + timedelta(**vars(start_time))\n",
"        ends_at = BASE_TIME + timedelta(**vars(end_time))\n",
"\n",
"        display_html(f\"\"\"\n",
"        <strong>Caption</strong> #{escape(str(identifier))}\n",
"        <ul>\n",
"            <li><strong>Start:</strong> {starts_at:%A, %B %d, %Y, %I:%M:%S %p %Z}</li>\n",
"            <li><strong>Speaker:</strong> {speaker_html}</li>\n",
"            <li><strong>Speech:</strong> {escape(speech)}</li>\n",
"            <li><strong>End:</strong> {ends_at:%A, %B %d, %Y, %I:%M:%S %p %Z}</li>\n",
"        </ul>\n",
"        \"\"\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|