LiKenun committed on
Commit
c089adf
·
1 Parent(s): 9195c31

Convert WebVTT parser to a dynamically registered MIME type handler

Browse files
notebooks/web_vtt.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
src/ctp_slack_bot/containers.py CHANGED
@@ -1,11 +1,14 @@
1
  from dependency_injector.containers import DeclarativeContainer
2
- from dependency_injector.providers import Resource, Singleton
 
 
3
  from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler
4
  from slack_bolt.async_app import AsyncApp
5
 
6
  from ctp_slack_bot.core.config import Settings
7
  from ctp_slack_bot.db.mongo_db import MongoDBResource
8
- from ctp_slack_bot.db.repositories import MongoVectorizedChunkRepository
 
9
  from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
10
  from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
11
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
@@ -20,6 +23,17 @@ from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
20
  from ctp_slack_bot.services.vectorization_service import VectorizationService
21
 
22
 
 
 
 
 
 
 
 
 
 
 
 
23
  class Container(DeclarativeContainer):
24
  settings = Singleton(Settings)
25
  event_brokerage_service = Singleton(EventBrokerageService)
@@ -37,4 +51,6 @@ class Container(DeclarativeContainer):
37
  slack_bolt_app = Singleton(AsyncApp, token=settings.provided.SLACK_BOT_TOKEN().get_secret_value())
38
  slack_service = Resource(SlackServiceResource, event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
39
  socket_mode_handler = Singleton(lambda _, app, app_token: AsyncSocketModeHandler(app, app_token), slack_service, slack_bolt_app, settings.provided.SLACK_APP_TOKEN().get_secret_value())
 
40
  google_drive_service = Singleton(GoogleDriveService, settings=settings)
 
 
1
  from dependency_injector.containers import DeclarativeContainer
2
+ from dependency_injector.providers import Callable, Resource, Singleton
3
+ from importlib import import_module
4
+ from pathlib import Path
5
  from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler
6
  from slack_bolt.async_app import AsyncApp
7
 
8
  from ctp_slack_bot.core.config import Settings
9
  from ctp_slack_bot.db.mongo_db import MongoDBResource
10
+ from ctp_slack_bot.db.repositories.mongo_db_vectorized_chunk_repository import MongoVectorizedChunkRepository
11
+ from ctp_slack_bot.mime_type_handlers.base import MimeTypeHandlerMeta
12
  from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
13
  from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
14
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
 
23
  from ctp_slack_bot.services.vectorization_service import VectorizationService
24
 
25
 
26
def __load_plugins(plugin_dir) -> None:
    """Import every top-level module found in *plugin_dir*.

    Each ``*.py`` file located directly inside the directory (``__init__.py``
    excepted) is imported under a dotted name made of the directory's path
    components followed by the file stem, e.g. ``pkg/handlers/foo.py`` is
    imported as ``pkg.handlers.foo``. The imported module objects are
    discarded; the imports are performed for their side effects only.

    Args:
        plugin_dir: Directory to scan, given as a ``str`` or ``Path``. It is
            expected to be a relative path whose components mirror the
            importable package path, and it is resolved against the current
            working directory.
    """
    directory = Path(plugin_dir)
    # Build the package prefix from the parsed path components instead of
    # replacing "/" in the raw argument: this tolerates ``Path`` arguments,
    # trailing slashes, a leading "./" and platform-specific separators.
    package = ".".join(directory.parts)
    # Sort so that modules are imported in the same order on every run,
    # whatever order the filesystem happens to list them in.
    for path in sorted(directory.glob("*.py")):
        if path.stem == "__init__":
            continue  # The package initializer is not a plugin module.
        import_module(f"{package}.{path.stem}")
32
+
33
+
34
+ __load_plugins("ctp_slack_bot/mime_type_handlers")
35
+
36
+
37
  class Container(DeclarativeContainer):
38
  settings = Singleton(Settings)
39
  event_brokerage_service = Singleton(EventBrokerageService)
 
51
  slack_bolt_app = Singleton(AsyncApp, token=settings.provided.SLACK_BOT_TOKEN().get_secret_value())
52
  slack_service = Resource(SlackServiceResource, event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
53
  socket_mode_handler = Singleton(lambda _, app, app_token: AsyncSocketModeHandler(app, app_token), slack_service, slack_bolt_app, settings.provided.SLACK_APP_TOKEN().get_secret_value())
54
+ mime_type_handler_factory = Callable(lambda mime_type: MimeTypeHandlerMeta._registry[mime_type]())
55
  google_drive_service = Singleton(GoogleDriveService, settings=settings)
56
+ # file_monitor_service = Singleton(FileMonitorService, settings=settings, google_drive_service=google_drive_service, mime_type_handler_factory=mime_type_handler_factory)
src/ctp_slack_bot/mime_type_handlers/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from ctp_slack_bot.mime_type_handlers.base import BaseMimeTypeHandler, MimeTypeHandlerMeta
2
+ from ctp_slack_bot.mime_type_handlers.text.vtt import WebVTTMimeTypeHandler
src/ctp_slack_bot/mime_type_handlers/base.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABCMeta, abstractmethod
2
+ from typing import Any, ClassVar, Dict, Mapping, Optional
3
+
4
+ from ctp_slack_bot.models import Content
5
+
6
+
7
class MimeTypeHandlerMeta(type):
    """Metaclass that records classes by their ``MIME_TYPE`` attribute.

    Whenever a class using this metaclass is created and a ``MIME_TYPE``
    attribute can be read from it (defined directly or inherited), the class
    is stored in the shared registry under that value, replacing any class
    previously stored for the same value.
    """

    # Shared mapping from MIME type string to the class registered for it.
    _registry: ClassVar[dict[str, type["BaseMimeTypeHandler"]]] = {}

    def __init__(cls, name: str, bases: tuple[type, ...], dict: Dict[str, Any]) -> None:
        """Finish creating *cls* and register it if it exposes ``MIME_TYPE``."""
        super().__init__(name, bases, dict)
        try:
            mime_type = cls.MIME_TYPE
        except AttributeError:
            # No MIME type declared: nothing to register for this class.
            return
        MimeTypeHandlerMeta._registry[mime_type] = cls
15
+
16
+
17
class MimeTypeHandlerABCMeta(MimeTypeHandlerMeta, ABCMeta):
    """Metaclass combining ``MimeTypeHandlerMeta`` with ``ABCMeta``.

    It adds no behavior of its own; it exists so that a class can use both
    metaclasses at once.
    """
19
+
20
+
21
class BaseMimeTypeHandler(metaclass=MimeTypeHandlerABCMeta):
    """Abstract base class for handlers that build ``Content`` from raw bytes.

    Concrete subclasses implement :meth:`from_bytes`.
    """

    @classmethod
    def for_mime_type(cls, mime_type: str) -> Optional[type["BaseMimeTypeHandler"]]:
        """Return the class registered for *mime_type*, or ``None`` if there is none.

        Args:
            mime_type: MIME type string to look up, e.g. ``"text/vtt"``.
        """
        # ``_registry`` is defined on the metaclass; attribute lookup on the
        # class falls through to it.
        return cls._registry.get(mime_type)

    @abstractmethod
    def from_bytes(self, id: str, metadata: Mapping[str, Any], buffer: bytes) -> Content:
        """Build a ``Content`` object from the raw bytes of a document.

        Args:
            id: Identifier to assign to the resulting content.
            metadata: Metadata to attach to the resulting content.
            buffer: Raw bytes of the document to parse.

        Returns:
            The content parsed from *buffer*.
        """
src/ctp_slack_bot/mime_type_handlers/text/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from ctp_slack_bot.mime_type_handlers.text.vtt import WebVTTMimeTypeHandler
src/ctp_slack_bot/mime_type_handlers/text/vtt.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from io import BytesIO
3
+ from re import compile as compile_re
4
+ from typing import Any, Mapping, Optional, Self
5
+ from webvtt import WebVTT
6
+
7
+ from ctp_slack_bot.mime_type_handlers import BaseMimeTypeHandler
8
+ from ctp_slack_bot.models import Content, WebVTTContent, WebVTTFrame
9
+
10
+
11
+ ISO_DATE_TIME_PATTERN = compile_re(r"Start time: (\d{4}-\d{2}-\d{2}(?: \d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:\d{2})?)?)")
12
+
13
class WebVTTMimeTypeHandler(BaseMimeTypeHandler):
    """Handler that parses ``text/vtt`` payloads into ``WebVTTContent``."""

    MIME_TYPE = "text/vtt"

    def from_buffer(self: Self, id: str, metadata: Mapping[str, Any], buffer: bytes) -> WebVTTContent:
        """Parse *buffer* exactly like :meth:`from_bytes`.

        Retained as an alternative name for :meth:`from_bytes`; it delegates
        so that the two entry points cannot drift apart.
        """
        return self.from_bytes(id, metadata, buffer)

    @classmethod
    def __get_start_time(cls, web_vtt: WebVTT) -> Optional[datetime]:
        """Extract the recording start time from the file's header comments.

        Args:
            web_vtt: Parsed WebVTT document whose ``header_comments`` are
                searched for a ``Start time: <ISO date[ time]>`` entry.

        Returns:
            The timestamp of the first header comment that matches, or
            ``None`` when no comment matches or the matched text is not a
            valid ISO timestamp.
        """
        matches = (ISO_DATE_TIME_PATTERN.search(comment)
                   for comment
                   in web_vtt.header_comments)
        try:
            # The ``None`` default covers files without any "Start time:"
            # comment; without it ``next`` would raise ``StopIteration``.
            return next((datetime.fromisoformat(match[1])
                         for match
                         in matches
                         if match),
                        None)
        except ValueError:
            # The text looked like a timestamp but is not a valid one.
            return None

    def from_bytes(self: Self, id: str, metadata: Mapping[str, Any], buffer: bytes) -> WebVTTContent:
        """Build a ``WebVTTContent`` from the raw bytes of a WebVTT file.

        Args:
            id: Identifier to assign to the resulting content.
            metadata: Metadata to attach to the resulting content.
            buffer: Raw bytes of the WebVTT document.

        Returns:
            Content holding one frame per caption (numbered from 1) and the
            start time found in the header comments, if any.
        """
        web_vtt = WebVTT.from_buffer(BytesIO(buffer))
        frames = tuple(WebVTTFrame.from_webvtt_caption(caption, index)
                       for index, caption
                       in enumerate(web_vtt.captions, 1))
        return WebVTTContent(id=id, metadata=metadata, start_time=WebVTTMimeTypeHandler.__get_start_time(web_vtt), frames=frames)
src/ctp_slack_bot/models/webvtt.py CHANGED
@@ -4,7 +4,6 @@ from itertools import starmap
4
  from json import dumps
5
  from more_itertools import windowed
6
  from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
7
- from re import compile as compile_re
8
  from types import MappingProxyType
9
  from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
10
  from webvtt import Caption, WebVTT
@@ -15,7 +14,6 @@ from ctp_slack_bot.models.base import Chunk, Content
15
  CHUNK_FRAMES_OVERLAP = 1
16
  CHUNK_FRAMES_WINDOW = 5
17
  SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
18
- ISO_DATE_TIME_PATTERN = compile_re(r"Start time: (\d{4}-\d{2}-\d{2}(?: \d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:\d{2})?)?)")
19
 
20
 
21
  class WebVTTFrame(BaseModel):
@@ -71,21 +69,3 @@ class WebVTTContent(Content):
71
 
72
  def get_metadata(self: Self) -> Mapping[str, Any]:
73
  return MappingProxyType(self.metadata)
74
-
75
- @classmethod
76
- def __get_start_time(cls, web_vtt: WebVTT) -> Optional[datetime]:
77
- try:
78
- return next(datetime.fromisoformat(result[0])
79
- for result
80
- in map(ISO_DATE_TIME_PATTERN.findall, web_vtt.header_comments)
81
- if result)
82
- except ValueError:
83
- return None
84
-
85
- @classmethod
86
- def from_bytes(cls, id: str, metadata: Mapping[str, Any], buffer: bytes) -> Self:
87
- web_vtt = WebVTT.from_buffer(BytesIO(buffer))
88
- frames = tuple(WebVTTFrame.from_webvtt_caption(caption, index)
89
- for index, caption
90
- in enumerate(web_vtt.captions, 1))
91
- return WebVTTContent(id=id, metadata=MappingProxyType(metadata), start_time=cls.__get_start_time(web_vtt), frames=frames)
 
4
  from json import dumps
5
  from more_itertools import windowed
6
  from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
 
7
  from types import MappingProxyType
8
  from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
9
  from webvtt import Caption, WebVTT
 
14
  CHUNK_FRAMES_OVERLAP = 1
15
  CHUNK_FRAMES_WINDOW = 5
16
  SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
 
17
 
18
 
19
  class WebVTTFrame(BaseModel):
 
69
 
70
  def get_metadata(self: Self) -> Mapping[str, Any]:
71
  return MappingProxyType(self.metadata)