# WebVTT Reading and Chunking Test

## Pure `webvtt-py` as Proof-of-concept

In [1]:
from datetime import datetime, timedelta
from functools import partial
from html import escape
from io import BytesIO
from IPython.display import display_html
from itertools import chain
import re
from webvtt import Caption, WebVTT
from webvtt.models import Timestamp
from zoneinfo import ZoneInfo

# Rebind display_html with raw=True pre-bound, so later calls in this notebook
# do not have to repeat the keyword argument.
display_html = partial(display_html, raw=True)

In [None]:
# Transcript file parsed by the cells below.
FILE_PATH = "GMT20250411-223535_Recording.transcript.vtt"
# Time zone in which BASE_TIME is expressed.
TIME_ZONE = ZoneInfo("America/New_York")
# 2025-04-11 22:35:35 GMT (the stamp that also appears in the file name),
# converted to TIME_ZONE.
_base_time_gmt = datetime(2025, 4, 11, 22, 35, 35, tzinfo=ZoneInfo("GMT"))
BASE_TIME = _base_time_gmt.astimezone(TIME_ZONE)

In [3]:
# Read the whole transcript into memory first, so the file handle is only held
# for the duration of the read.
with open(FILE_PATH, mode="rb") as vtt_file:
    vtt_buffer = BytesIO(vtt_file.read())

# Parse the captions from the in-memory buffer.
web_vtt = WebVTT.from_buffer(vtt_buffer)

In [4]:
# Render every name reported by dir(web_vtt) as an HTML bullet list.
member_items = "".join(f"<li>{escape(member_name)}</li>" for member_name in dir(web_vtt))
display_html(f"<ul>{member_items}</ul>")

In [5]:
speaker_speech_pattern = re.compile("(?:([^:]+): )?(.*)")

match web_vtt.captions[343]:
    case Caption(identifier=identifier, start_time=start_time, end_time=end_time, text=text):
        match speaker_speech_pattern.search(text).groups():
            case (speaker, speech):
                display_html(f"""
                    <strong>Caption</strong> #{identifier}
                    <ul>
                        <li><strong>Start:</strong> {BASE_TIME + timedelta(**start_time.__dict__):%A, %B %d, %Y, %I:%M:%S %p %Z}</li>
                        <li><strong>Speaker:</strong> {escape(speaker)}</li>
                        <li><strong>Speech:</strong> {escape(speech)}</li>
                        <li><strong>End:</strong> {BASE_TIME + timedelta(**end_time.__dict__):%A, %B %d, %Y, %I:%M:%S %p %Z}</li>
                    </ul>
                """)

### Chunking

For chunking to produce pieces with useful context, we must not use each caption (frame) on its own, but bundle it with its surrounding frames (the messages before and after it).

In [6]:
from more_itertools import windowed

In [7]:
# Number of frames bundled into a single chunk.
CHUNK_FRAMES_WINDOW = 5
# Number of frames a chunk shares with the chunk before it.
CHUNK_FRAMES_OVERLAP = 1

In [8]:
# Stand-in frames for the chunking demo: the uppercase letters 'A' through 'Z'.
items = tuple(map(chr, range(ord('A'), ord('Z') + 1)))

# One single-cell table row per item.
row_template = "<tr><td>{}</td></tr>"
item_rows = "".join(row_template.format(letter) for letter in items)
display_html(f"<table>{item_rows}</table>")

0
A
B
C
D
E
F
G
H
I
J


In [9]:
# Advance the window so that consecutive chunks share CHUNK_FRAMES_OVERLAP items.
chunk_step = CHUNK_FRAMES_WINDOW - CHUNK_FRAMES_OVERLAP
chunks = tuple(windowed(items, CHUNK_FRAMES_WINDOW, step=chunk_step))

# One table row per chunk; falsy entries (such as padding in the last chunk)
# are rendered as empty cells.
chunk_rows = []
for chunk in chunks:
    chunk_cells = "".join(f"<td>{item or ''}</td>" for item in chunk)
    chunk_rows.append(f"<tr>{chunk_cells}</tr>")
display_html(f"<table>{''.join(chunk_rows)}</table>")

0,1,2,3,4
A,B,C,D,E
E,F,G,H,I
I,J,K,L,M
M,N,O,P,Q
Q,R,S,T,U
U,V,W,X,Y
Y,Z,,,


## Using the `WebVTTContent` Class

In [10]:
from datetime import datetime
from hashlib import sha256
from zoneinfo import ZoneInfo

from ctp_slack_bot.models import WebVTTContent

In [None]:
# Transcript file loaded by the cells below.
FILE_PATH = "GMT20250411-223535_Recording.transcript.vtt"
# Time zone in which MODIFICATION_TIME is expressed.
TIME_ZONE = ZoneInfo("America/New_York")
# 2025-04-11 22:35:35 GMT (the stamp that also appears in the file name),
# converted to TIME_ZONE.
_modification_time_gmt = datetime(2025, 4, 11, 22, 35, 35, tzinfo=ZoneInfo("GMT"))
MODIFICATION_TIME = _modification_time_gmt.astimezone(TIME_ZONE)

In [12]:
# Read the raw transcript bytes. The variable is deliberately not called
# `bytes`, which would shadow the builtin type for the rest of the kernel session.
with open(FILE_PATH, "rb") as file:
    web_vtt_bytes = file.read()

# Build the content object from the raw bytes. The first argument is the
# SHA-256 hex digest of those bytes (presumably used as the content's
# identifier; confirm against WebVTTContent.from_bytes), the second is a
# metadata mapping carrying the modification time.
web_vtt_content = WebVTTContent.from_bytes(
    sha256(web_vtt_bytes).hexdigest(),
    {"modification_time": MODIFICATION_TIME},
    web_vtt_bytes,
)

In [13]:
# Show the parsed content's start_time; as the last expression of the cell it
# is rendered as the cell output.
web_vtt_content.start_time

datetime.datetime(2025, 4, 11, 22, 35, 35, tzinfo=datetime.timezone.utc)

In [14]:
# Show the value returned by get_chunks() as the cell output.
# NOTE(review): this renders the complete return value; consider displaying
# only a slice if the output becomes too long to share comfortably.
web_vtt_content.get_chunks()

(Chunk(text="iyeshia: For the workshop. We want to set you up.\n\niyeshia: Thank you, Kevin, for a question. We want to set you up for success in year one. And so this workshop is to help you kind of like\n\niyeshia: figure out, or how to adjust, as you're coming into your careers what to expect like your 30 days of work, 60 days of work, 90 days of work when you are starting your full time roles. So with that, said, let us get started.\n\niyeshia: So the topic, of course, is going to be discussing things of like the onboarding process of what it looks like when you start your jobs. How to maneuver or move around in your workplace environments. We'll discuss negotiating raises, because last time we didn't negotiating offers. So now we pass that you already got the offer. So now we'd be at the\n\niyeshia: the race card after that year. Don't try to come into your job already. 5 days in somebody to raise. Wait, and then from there we'll do activity on asking for feedback when you have, l