Spaces:
Runtime error
Runtime error
Use decorator-based plugin registration
Browse files
notebooks/google_drive_web_vtt_vectorizer_and_storer.ipynb
CHANGED
@@ -85,7 +85,7 @@
|
|
85 |
"metadata": {},
|
86 |
"outputs": [],
|
87 |
"source": [
|
88 |
-
"web_vtt_parser = container.
|
89 |
"display_html(f\"<p>{escape(str(type(web_vtt_parser)))}</p><code>{id(web_vtt_parser)}</code>\")"
|
90 |
]
|
91 |
},
|
@@ -95,7 +95,7 @@
|
|
95 |
"metadata": {},
|
96 |
"outputs": [],
|
97 |
"source": [
|
98 |
-
"display_html(f\"<code>{id(container.
|
99 |
]
|
100 |
},
|
101 |
{
|
|
|
85 |
"metadata": {},
|
86 |
"outputs": [],
|
87 |
"source": [
|
88 |
+
"web_vtt_parser = container.mime_type_handlers()[MIME_TYPE]\n",
|
89 |
"display_html(f\"<p>{escape(str(type(web_vtt_parser)))}</p><code>{id(web_vtt_parser)}</code>\")"
|
90 |
]
|
91 |
},
|
|
|
95 |
"metadata": {},
|
96 |
"outputs": [],
|
97 |
"source": [
|
98 |
+
"display_html(f\"<code>{id(container.mime_type_handlers()[MIME_TYPE])}</code>\")"
|
99 |
]
|
100 |
},
|
101 |
{
|
src/ctp_slack_bot/app.py
CHANGED
@@ -18,7 +18,7 @@ async def main() -> None:
|
|
18 |
container = Container()
|
19 |
container.wire(packages=["ctp_slack_bot"])
|
20 |
|
21 |
-
#
|
22 |
application_health_service = await container.application_health_service()
|
23 |
container.content_ingestion_service()
|
24 |
container.question_dispatch_service()
|
|
|
18 |
container = Container()
|
19 |
container.wire(packages=["ctp_slack_bot"])
|
20 |
|
21 |
+
# Instantiate services which should be active from the beginning.
|
22 |
application_health_service = await container.application_health_service()
|
23 |
container.content_ingestion_service()
|
24 |
container.question_dispatch_service()
|
src/ctp_slack_bot/containers.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
from dependency_injector.containers import DeclarativeContainer
|
2 |
-
from dependency_injector.providers import Callable, List, Resource, Singleton
|
3 |
from importlib import import_module
|
4 |
from pkgutil import iter_modules
|
5 |
from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler
|
@@ -9,7 +9,7 @@ from types import ModuleType
|
|
9 |
from ctp_slack_bot.core import Settings
|
10 |
from ctp_slack_bot.db.mongo_db import MongoDBResource
|
11 |
from ctp_slack_bot.db.repositories.mongo_db_vectorized_chunk_repository import MongoVectorizedChunkRepositoryResource
|
12 |
-
from ctp_slack_bot.mime_type_handlers import
|
13 |
from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
|
14 |
from ctp_slack_bot.services.application_health_service import ApplicationHealthService
|
15 |
from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
|
@@ -86,7 +86,9 @@ class Container(DeclarativeContainer): # TODO: audit for potential async-related
|
|
86 |
slack_service,
|
87 |
slack_bolt_app,
|
88 |
settings)
|
89 |
-
|
|
|
|
|
90 |
google_drive_service = Singleton(GoogleDriveService,
|
91 |
settings=settings)
|
92 |
# file_monitor_service = Singleton(FileMonitorService,
|
|
|
1 |
from dependency_injector.containers import DeclarativeContainer
|
2 |
+
from dependency_injector.providers import Callable, Dict, List, Resource, Singleton
|
3 |
from importlib import import_module
|
4 |
from pkgutil import iter_modules
|
5 |
from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler
|
|
|
9 |
from ctp_slack_bot.core import Settings
|
10 |
from ctp_slack_bot.db.mongo_db import MongoDBResource
|
11 |
from ctp_slack_bot.db.repositories.mongo_db_vectorized_chunk_repository import MongoVectorizedChunkRepositoryResource
|
12 |
+
from ctp_slack_bot.mime_type_handlers.base import MimeTypeHandlerRegistry
|
13 |
from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
|
14 |
from ctp_slack_bot.services.application_health_service import ApplicationHealthService
|
15 |
from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
|
|
|
86 |
slack_service,
|
87 |
slack_bolt_app,
|
88 |
settings)
|
89 |
+
mime_type_handlers = Dict ({mime_type: Singleton(handler)
|
90 |
+
for mime_type, handler
|
91 |
+
in MimeTypeHandlerRegistry.get_registry().items()})
|
92 |
google_drive_service = Singleton(GoogleDriveService,
|
93 |
settings=settings)
|
94 |
# file_monitor_service = Singleton(FileMonitorService,
|
src/ctp_slack_bot/mime_type_handlers/__init__.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1 |
-
from ctp_slack_bot.mime_type_handlers.base import MimeTypeHandler,
|
2 |
from ctp_slack_bot.mime_type_handlers.text.vtt import WebVTTMimeTypeHandler
|
|
|
1 |
+
from ctp_slack_bot.mime_type_handlers.base import MimeTypeHandler, MimeTypeHandlerRegistry
|
2 |
from ctp_slack_bot.mime_type_handlers.text.vtt import WebVTTMimeTypeHandler
|
src/ctp_slack_bot/mime_type_handlers/base.py
CHANGED
@@ -1,31 +1,39 @@
|
|
1 |
from abc import ABC, abstractmethod
|
2 |
from functools import lru_cache
|
|
|
|
|
3 |
from typing import Any, ClassVar, Mapping, Optional
|
4 |
|
|
|
5 |
from ctp_slack_bot.models import Content
|
6 |
|
7 |
|
8 |
-
class
|
9 |
|
10 |
-
|
|
|
|
|
11 |
|
12 |
-
def __init__(cls, name: str, bases: tuple[type, ...], dict: dict[str, Any]) -> None:
|
13 |
-
super().__init__(name, bases, dict)
|
14 |
-
if hasattr(cls, "MIME_TYPE"):
|
15 |
-
MimeTypeHandlerMeta._registry[cls.MIME_TYPE] = cls
|
16 |
|
|
|
17 |
|
18 |
-
|
19 |
-
pass
|
20 |
|
|
|
|
|
|
|
|
|
21 |
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
@classmethod
|
25 |
@lru_cache
|
26 |
-
def
|
27 |
return cls._registry.get(mime_type)()
|
28 |
-
|
29 |
-
@abstractmethod
|
30 |
-
def from_bytes(cls, id: str, metadata: Mapping[str, Any], buffer: bytes) -> Content:
|
31 |
-
pass
|
|
|
1 |
from abc import ABC, abstractmethod
|
2 |
from functools import lru_cache
|
3 |
+
from importlib import import_module
|
4 |
+
from types import MappingProxyType
|
5 |
from typing import Any, ClassVar, Mapping, Optional
|
6 |
|
7 |
+
from ctp_slack_bot.core import ApplicationComponentBase
|
8 |
from ctp_slack_bot.models import Content
|
9 |
|
10 |
|
11 |
+
class MimeTypeHandler(ApplicationComponentBase):
    """Abstract interface for components that parse raw bytes into ``Content``.

    Concrete handlers subclass this type and implement ``from_bytes``.
    NOTE(review): whether instantiating a subclass that omits ``from_bytes``
    is rejected depends on ``ApplicationComponentBase`` using an ABC-aware
    metaclass -- confirm against its definition.
    """

    # NOTE(review): the first parameter is named ``cls`` but no ``@classmethod``
    # is applied here, so on this base type it is bound like an instance
    # method -- confirm this matches how subclasses declare it.
    @abstractmethod
    def from_bytes(cls, id: str, metadata: Mapping[str, Any], buffer: bytes) -> Content:
        """Build a ``Content`` object from an in-memory payload.

        Args:
            id: Identifier to attach to the resulting content.
            metadata: Metadata mapping to attach to the resulting content.
            buffer: The raw bytes to parse.

        Returns:
            The parsed ``Content``; this abstract declaration has no
            implementation and returns ``None`` if ever invoked directly.
        """
|
16 |
|
|
|
|
|
|
|
|
|
17 |
|
18 |
+
class MimeTypeHandlerRegistry:
    """Registry that maps MIME type strings to handler classes.

    Handler classes are added with the ``register`` decorator. The registry
    is stored on the class itself, so it is shared by every user of this type.
    """

    # Maps a MIME type string to the handler *class* registered for it.
    _registry: ClassVar[dict[str, type["MimeTypeHandler"]]] = {}

    @classmethod
    def get_registry(cls) -> Mapping[str, type["MimeTypeHandler"]]:
        """Return a read-only view of the registered handler classes.

        The package containing this module is imported first so that any
        handler modules its ``__init__`` pulls in have run their
        ``register`` decorators before the view is handed out.
        """
        import_module(__package__)
        return MappingProxyType(cls._registry)

    @classmethod
    def register(cls, mime_type: str):
        """Return a class decorator that registers a handler for ``mime_type``.

        Args:
            mime_type: The MIME type the decorated class handles.

        Returns:
            A decorator that records the class and returns it unchanged.

        Raises:
            ValueError: Raised by the returned decorator when ``mime_type``
                already has a registered handler.
        """
        def decorator(handler_cls: type["MimeTypeHandler"]) -> type["MimeTypeHandler"]:
            if mime_type in cls._registry:
                raise ValueError(f"The MIME type, {mime_type}, is already registered.")
            cls._registry[mime_type] = handler_cls
            return handler_cls
        return decorator

    @classmethod
    def get_handler(cls, mime_type: str) -> Optional["MimeTypeHandler"]:
        """Return the shared handler instance for ``mime_type``, if any.

        Args:
            mime_type: The MIME type to look up.

        Returns:
            The handler instance (created on first use and reused after), or
            ``None`` when no handler is registered for ``mime_type``.
        """
        handler_cls = cls._registry.get(mime_type)
        if handler_cls is None:
            # Previously the missing entry was called directly, raising
            # TypeError instead of honouring the Optional return type.
            return None
        return cls._instantiate(handler_cls)

    @staticmethod
    @lru_cache
    def _instantiate(handler_cls: type["MimeTypeHandler"]) -> "MimeTypeHandler":
        """Create, once per handler class, the instance ``get_handler`` returns."""
        # Memoizing here rather than on get_handler keeps "not registered"
        # results out of the cache, so a later registration is still seen.
        return handler_cls()
|
|
|
|
|
|
|
|
src/ctp_slack_bot/mime_type_handlers/text/vtt.py
CHANGED
@@ -1,27 +1,28 @@
|
|
1 |
from datetime import datetime
|
2 |
from io import BytesIO
|
3 |
-
from
|
|
|
4 |
from types import MappingProxyType
|
5 |
from typing import Any, Mapping, Optional, Self
|
6 |
from webvtt import WebVTT
|
7 |
|
8 |
-
from ctp_slack_bot.mime_type_handlers import MimeTypeHandler
|
9 |
from ctp_slack_bot.models import Content, WebVTTContent, WebVTTFrame
|
10 |
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
class WebVTTMimeTypeHandler(MimeTypeHandler):
|
16 |
|
17 |
-
|
|
|
|
|
18 |
|
19 |
@classmethod
|
20 |
def __get_start_time(cls, web_vtt: WebVTT) -> Optional[datetime]:
|
21 |
try:
|
22 |
return next(datetime.fromisoformat(result[0])
|
23 |
for result
|
24 |
-
in map(ISO_DATE_TIME_PATTERN.findall, web_vtt.header_comments)
|
25 |
if result)
|
26 |
except (StopIteration, ValueError):
|
27 |
return None
|
@@ -32,3 +33,7 @@ class WebVTTMimeTypeHandler(MimeTypeHandler):
|
|
32 |
for index, caption
|
33 |
in enumerate(web_vtt.captions, 1))
|
34 |
return WebVTTContent(id=id, metadata=MappingProxyType(metadata), start_time=WebVTTMimeTypeHandler.__get_start_time(web_vtt), frames=frames)
|
|
|
|
|
|
|
|
|
|
1 |
from datetime import datetime
|
2 |
from io import BytesIO
|
3 |
+
from pydantic import ConfigDict
|
4 |
+
from re import compile as compile_re, Pattern
|
5 |
from types import MappingProxyType
|
6 |
from typing import Any, Mapping, Optional, Self
|
7 |
from webvtt import WebVTT
|
8 |
|
9 |
+
from ctp_slack_bot.mime_type_handlers import MimeTypeHandler, MimeTypeHandlerRegistry
|
10 |
from ctp_slack_bot.models import Content, WebVTTContent, WebVTTFrame
|
11 |
|
12 |
|
13 |
+
@MimeTypeHandlerRegistry.register("text/vtt")
|
|
|
|
|
14 |
class WebVTTMimeTypeHandler(MimeTypeHandler):
|
15 |
|
16 |
+
model_config = ConfigDict(frozen=True)
|
17 |
+
|
18 |
+
ISO_DATE_TIME_PATTERN: Pattern = compile_re(r"Start time: (\d{4}-\d{2}-\d{2}(?: \d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:\d{2})?)?)")
|
19 |
|
20 |
@classmethod
|
21 |
def __get_start_time(cls, web_vtt: WebVTT) -> Optional[datetime]:
|
22 |
try:
|
23 |
return next(datetime.fromisoformat(result[0])
|
24 |
for result
|
25 |
+
in map(cls.ISO_DATE_TIME_PATTERN.findall, web_vtt.header_comments)
|
26 |
if result)
|
27 |
except (StopIteration, ValueError):
|
28 |
return None
|
|
|
33 |
for index, caption
|
34 |
in enumerate(web_vtt.captions, 1))
|
35 |
return WebVTTContent(id=id, metadata=MappingProxyType(metadata), start_time=WebVTTMimeTypeHandler.__get_start_time(web_vtt), frames=frames)
|
36 |
+
|
37 |
+
@property
|
38 |
+
def name(self: Self) -> str:
|
39 |
+
return "web_vtt_mime_type_handler"
|