Spaces:
Runtime error
Runtime error
Convert WebVTT parser to a dynamically registered MIME type handler
Browse files- notebooks/web_vtt.ipynb +0 -0
- src/ctp_slack_bot/containers.py +18 -2
- src/ctp_slack_bot/mime_type_handlers/__init__.py +2 -0
- src/ctp_slack_bot/mime_type_handlers/base.py +29 -0
- src/ctp_slack_bot/mime_type_handlers/text/__init__.py +1 -0
- src/ctp_slack_bot/mime_type_handlers/text/vtt.py +39 -0
- src/ctp_slack_bot/models/webvtt.py +0 -20
notebooks/web_vtt.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
src/ctp_slack_bot/containers.py
CHANGED
@@ -1,11 +1,14 @@
|
|
1 |
from dependency_injector.containers import DeclarativeContainer
|
2 |
-
from dependency_injector.providers import Resource, Singleton
|
|
|
|
|
3 |
from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler
|
4 |
from slack_bolt.async_app import AsyncApp
|
5 |
|
6 |
from ctp_slack_bot.core.config import Settings
|
7 |
from ctp_slack_bot.db.mongo_db import MongoDBResource
|
8 |
-
from ctp_slack_bot.db.repositories import MongoVectorizedChunkRepository
|
|
|
9 |
from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
|
10 |
from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
|
11 |
from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
|
@@ -20,6 +23,17 @@ from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
|
|
20 |
from ctp_slack_bot.services.vectorization_service import VectorizationService
|
21 |
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
class Container(DeclarativeContainer):
|
24 |
settings = Singleton(Settings)
|
25 |
event_brokerage_service = Singleton(EventBrokerageService)
|
@@ -37,4 +51,6 @@ class Container(DeclarativeContainer):
|
|
37 |
slack_bolt_app = Singleton(AsyncApp, token=settings.provided.SLACK_BOT_TOKEN().get_secret_value())
|
38 |
slack_service = Resource(SlackServiceResource, event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
|
39 |
socket_mode_handler = Singleton(lambda _, app, app_token: AsyncSocketModeHandler(app, app_token), slack_service, slack_bolt_app, settings.provided.SLACK_APP_TOKEN().get_secret_value())
|
|
|
40 |
google_drive_service = Singleton(GoogleDriveService, settings=settings)
|
|
|
|
1 |
from dependency_injector.containers import DeclarativeContainer
|
2 |
+
from dependency_injector.providers import Callable, Resource, Singleton
|
3 |
+
from importlib import import_module
|
4 |
+
from pathlib import Path
|
5 |
from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler
|
6 |
from slack_bolt.async_app import AsyncApp
|
7 |
|
8 |
from ctp_slack_bot.core.config import Settings
|
9 |
from ctp_slack_bot.db.mongo_db import MongoDBResource
|
10 |
+
from ctp_slack_bot.db.repositories.mongo_db_vectorized_chunk_repository import MongoVectorizedChunkRepository
|
11 |
+
from ctp_slack_bot.mime_type_handlers.base import MimeTypeHandlerMeta
|
12 |
from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
|
13 |
from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
|
14 |
from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
|
|
|
23 |
from ctp_slack_bot.services.vectorization_service import VectorizationService
|
24 |
|
25 |
|
26 |
+
def __load_plugins(plugin_dir) -> None:
    """Import every top-level module found directly inside ``plugin_dir``.

    Importing the modules is what triggers any import-time side effects they
    have (for example, class registration performed by a metaclass).

    Args:
        plugin_dir: Relative path (``str`` or ``Path``) of the package
            directory, e.g. ``"ctp_slack_bot/mime_type_handlers"``. It is
            resolved against the current working directory for the file
            listing, and its components are joined with dots to form the
            package name passed to ``import_module``.
    """
    plugin_path = Path(plugin_dir)
    # Derive the dotted package name from the path components rather than by
    # string replacement, so Path arguments, trailing slashes and "./"
    # prefixes all produce a valid module name.
    package_name = ".".join(plugin_path.parts)
    # glob() yields entries in arbitrary order; sort for a stable import order.
    for path in sorted(plugin_path.glob("*.py")):
        if path.stem == "__init__":
            continue  # The package initializer is not a plugin module.
        import_module(f"{package_name}.{path.stem}")
|
32 |
+
|
33 |
+
|
34 |
+
__load_plugins("ctp_slack_bot/mime_type_handlers")
|
35 |
+
|
36 |
+
|
37 |
class Container(DeclarativeContainer):
|
38 |
settings = Singleton(Settings)
|
39 |
event_brokerage_service = Singleton(EventBrokerageService)
|
|
|
51 |
slack_bolt_app = Singleton(AsyncApp, token=settings.provided.SLACK_BOT_TOKEN().get_secret_value())
|
52 |
slack_service = Resource(SlackServiceResource, event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
|
53 |
socket_mode_handler = Singleton(lambda _, app, app_token: AsyncSocketModeHandler(app, app_token), slack_service, slack_bolt_app, settings.provided.SLACK_APP_TOKEN().get_secret_value())
|
54 |
+
mime_type_handler_factory = Callable(lambda mime_type: MimeTypeHandlerMeta._registry[mime_type]())
|
55 |
google_drive_service = Singleton(GoogleDriveService, settings=settings)
|
56 |
+
# file_monitor_service = Singleton(FileMonitorService, settings=settings, google_drive_service=google_drive_service, mime_type_handler_factory=mime_type_handler_factory)
|
src/ctp_slack_bot/mime_type_handlers/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from ctp_slack_bot.mime_type_handlers.base import BaseMimeTypeHandler, MimeTypeHandlerMeta
|
2 |
+
from ctp_slack_bot.mime_type_handlers.text.vtt import WebVTTMimeTypeHandler
|
src/ctp_slack_bot/mime_type_handlers/base.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABCMeta, abstractmethod
|
2 |
+
from typing import Any, ClassVar, Dict, Mapping, Optional
|
3 |
+
|
4 |
+
from ctp_slack_bot.models import Content
|
5 |
+
|
6 |
+
|
7 |
+
class MimeTypeHandlerMeta(type):
    """Metaclass that records handler classes in a registry keyed by MIME type.

    Any class created with this metaclass that declares a ``MIME_TYPE``
    attribute in its own body is stored in ``_registry`` under that value.
    """

    # Maps a MIME type string to the class that declared it.
    _registry: ClassVar[dict[str, type["BaseMimeTypeHandler"]]] = {}

    def __init__(cls, name: str, bases: tuple[type, ...], dict: Dict[str, Any]) -> None:
        """Finish class creation and register the class if it declares ``MIME_TYPE``.

        Args:
            name: Name of the class being created.
            bases: Base classes of the class being created.
            dict: Namespace of the class body (the name shadows the builtin but
                is kept as-is to leave the signature untouched).
        """
        super().__init__(name, bases, dict)
        # Check the class's own namespace instead of hasattr(): a subclass that
        # merely inherits MIME_TYPE must not replace its parent's registration.
        if "MIME_TYPE" in dict:
            MimeTypeHandlerMeta._registry[cls.MIME_TYPE] = cls
|
15 |
+
|
16 |
+
|
17 |
+
class MimeTypeHandlerABCMeta(MimeTypeHandlerMeta, ABCMeta):
    """Metaclass that inherits from both ``MimeTypeHandlerMeta`` and ``ABCMeta``.

    It adds no behavior of its own; it exists so a single class can use both
    metaclasses at once.
    """
|
19 |
+
|
20 |
+
|
21 |
+
class BaseMimeTypeHandler(metaclass=MimeTypeHandlerABCMeta):
    """Abstract base class for handlers that build ``Content`` from raw bytes."""

    @classmethod
    def for_mime_type(cls, mime_type: str) -> Optional[type["BaseMimeTypeHandler"]]:
        """Return the handler class registered for ``mime_type``.

        Args:
            mime_type: MIME type string to look up, e.g. ``"text/vtt"``.

        Returns:
            The registered handler class, or ``None`` when nothing is
            registered under ``mime_type``.
        """
        # ``_registry`` is defined on the metaclass; attribute lookup on a
        # class falls back to its metaclass, so ``cls._registry`` resolves.
        return cls._registry.get(mime_type)

    @abstractmethod
    def from_bytes(self, id: str, metadata: Mapping[str, Any], buffer: bytes) -> Content:
        """Build a ``Content`` object from a raw byte payload.

        Args:
            id: Identifier to assign to the resulting content.
            metadata: Metadata to attach to the resulting content.
            buffer: Raw bytes of the document to parse.

        Returns:
            The content produced from ``buffer``.
        """
|
src/ctp_slack_bot/mime_type_handlers/text/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from ctp_slack_bot.mime_type_handlers.text.vtt import WebVTTMimeTypeHandler
|
src/ctp_slack_bot/mime_type_handlers/text/vtt.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import datetime
|
2 |
+
from io import BytesIO
|
3 |
+
from re import compile as compile_re
|
4 |
+
from typing import Any, Mapping, Optional, Self
|
5 |
+
from webvtt import WebVTT
|
6 |
+
|
7 |
+
from ctp_slack_bot.mime_type_handlers import BaseMimeTypeHandler
|
8 |
+
from ctp_slack_bot.models import Content, WebVTTContent, WebVTTFrame
|
9 |
+
|
10 |
+
|
11 |
+
ISO_DATE_TIME_PATTERN = compile_re(r"Start time: (\d{4}-\d{2}-\d{2}(?: \d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:\d{2})?)?)")
|
12 |
+
|
13 |
+
class WebVTTMimeTypeHandler(BaseMimeTypeHandler):
    """Handler that parses ``text/vtt`` payloads into ``WebVTTContent``."""

    MIME_TYPE = "text/vtt"

    def from_buffer(self: Self, id: str, metadata: Mapping[str, Any], buffer: bytes) -> WebVTTContent:
        """Parse a WebVTT payload; delegates to :meth:`from_bytes`.

        The previous body duplicated ``from_bytes`` but referenced names that
        are not defined in this scope (``cls`` and ``MappingProxyType``), so
        it could not run. Delegating keeps a single working implementation.
        """
        return self.from_bytes(id, metadata, buffer)

    @classmethod
    def __get_start_time(cls, web_vtt: WebVTT) -> Optional[datetime]:
        """Extract the start time from the header comments of ``web_vtt``.

        Returns:
            The timestamp of the first header comment that matches
            ``ISO_DATE_TIME_PATTERN``, or ``None`` when no comment matches or
            the matched text is not a valid ISO date/time.
        """
        matches = map(ISO_DATE_TIME_PATTERN.findall, web_vtt.header_comments)
        try:
            # Pass a default to next() so a file without a matching header
            # comment yields None instead of raising StopIteration.
            return next((datetime.fromisoformat(result[0])
                         for result
                         in matches
                         if result), None)
        except ValueError:
            return None

    def from_bytes(self: Self, id: str, metadata: Mapping[str, Any], buffer: bytes) -> WebVTTContent:
        """Parse a WebVTT payload into ``WebVTTContent``.

        Args:
            id: Identifier to assign to the resulting content.
            metadata: Metadata to attach to the resulting content.
            buffer: Raw bytes of the WebVTT document.

        Returns:
            Content holding one frame per caption (numbered from 1) and the
            start time found in the header comments, if any.
        """
        web_vtt = WebVTT.from_buffer(BytesIO(buffer))
        frames = tuple(WebVTTFrame.from_webvtt_caption(caption, index)
                       for index, caption
                       in enumerate(web_vtt.captions, 1))
        return WebVTTContent(id=id, metadata=metadata, start_time=WebVTTMimeTypeHandler.__get_start_time(web_vtt), frames=frames)
|
src/ctp_slack_bot/models/webvtt.py
CHANGED
@@ -4,7 +4,6 @@ from itertools import starmap
|
|
4 |
from json import dumps
|
5 |
from more_itertools import windowed
|
6 |
from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
|
7 |
-
from re import compile as compile_re
|
8 |
from types import MappingProxyType
|
9 |
from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
|
10 |
from webvtt import Caption, WebVTT
|
@@ -15,7 +14,6 @@ from ctp_slack_bot.models.base import Chunk, Content
|
|
15 |
CHUNK_FRAMES_OVERLAP = 1
|
16 |
CHUNK_FRAMES_WINDOW = 5
|
17 |
SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
|
18 |
-
ISO_DATE_TIME_PATTERN = compile_re(r"Start time: (\d{4}-\d{2}-\d{2}(?: \d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:\d{2})?)?)")
|
19 |
|
20 |
|
21 |
class WebVTTFrame(BaseModel):
|
@@ -71,21 +69,3 @@ class WebVTTContent(Content):
|
|
71 |
|
72 |
def get_metadata(self: Self) -> Mapping[str, Any]:
|
73 |
return MappingProxyType(self.metadata)
|
74 |
-
|
75 |
-
@classmethod
|
76 |
-
def __get_start_time(cls, web_vtt: WebVTT) -> Optional[datetime]:
|
77 |
-
try:
|
78 |
-
return next(datetime.fromisoformat(result[0])
|
79 |
-
for result
|
80 |
-
in map(ISO_DATE_TIME_PATTERN.findall, web_vtt.header_comments)
|
81 |
-
if result)
|
82 |
-
except ValueError:
|
83 |
-
return None
|
84 |
-
|
85 |
-
@classmethod
|
86 |
-
def from_bytes(cls, id: str, metadata: Mapping[str, Any], buffer: bytes) -> Self:
|
87 |
-
web_vtt = WebVTT.from_buffer(BytesIO(buffer))
|
88 |
-
frames = tuple(WebVTTFrame.from_webvtt_caption(caption, index)
|
89 |
-
for index, caption
|
90 |
-
in enumerate(web_vtt.captions, 1))
|
91 |
-
return WebVTTContent(id=id, metadata=MappingProxyType(metadata), start_time=cls.__get_start_time(web_vtt), frames=frames)
|
|
|
4 |
from json import dumps
|
5 |
from more_itertools import windowed
|
6 |
from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
|
|
|
7 |
from types import MappingProxyType
|
8 |
from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
|
9 |
from webvtt import Caption, WebVTT
|
|
|
14 |
CHUNK_FRAMES_OVERLAP = 1
|
15 |
CHUNK_FRAMES_WINDOW = 5
|
16 |
SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
|
|
|
17 |
|
18 |
|
19 |
class WebVTTFrame(BaseModel):
|
|
|
69 |
|
70 |
def get_metadata(self: Self) -> Mapping[str, Any]:
|
71 |
return MappingProxyType(self.metadata)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|