Spaces:
Runtime error
Runtime error
Refactor #3
Browse files- .env.template +4 -0
- README.md +14 -9
- pyproject.toml +2 -1
- scripts/run-dev.sh +1 -1
- src/ctp_slack_bot/app.py +10 -3
- src/ctp_slack_bot/containers.py +11 -5
- src/ctp_slack_bot/core/__init__.py +1 -0
- src/ctp_slack_bot/core/abstractions.py +31 -0
- src/ctp_slack_bot/core/config.py +47 -46
- src/ctp_slack_bot/core/logging.py +3 -2
- src/ctp_slack_bot/db/mongo_db.py +20 -17
- src/ctp_slack_bot/db/repositories/mongo_db_vectorized_chunk_repository.py +12 -8
- src/ctp_slack_bot/db/repositories/vector_repository_base.py +15 -18
- src/ctp_slack_bot/db/repositories/vectorized_chunk_repository.py +4 -2
- src/ctp_slack_bot/mime_type_handlers/base.py +4 -4
- src/ctp_slack_bot/mime_type_handlers/text/vtt.py +1 -0
- src/ctp_slack_bot/models/base.py +10 -8
- src/ctp_slack_bot/models/google_drive.py +2 -2
- src/ctp_slack_bot/models/slack.py +14 -5
- src/ctp_slack_bot/models/webvtt.py +3 -3
- src/ctp_slack_bot/services/__init__.py +1 -0
- src/ctp_slack_bot/services/answer_retrieval_service.py +9 -10
- src/ctp_slack_bot/services/application_database_service.py +9 -10
- src/ctp_slack_bot/services/application_health_service.py +25 -0
- src/ctp_slack_bot/services/content_ingestion_service.py +13 -10
- src/ctp_slack_bot/services/context_retrieval_service.py +12 -12
- src/ctp_slack_bot/services/embeddings_model_service.py +19 -16
- src/ctp_slack_bot/services/event_brokerage_service.py +11 -10
- src/ctp_slack_bot/services/google_drive_service.py +27 -24
- src/ctp_slack_bot/services/http_client_service.py +14 -0
- src/ctp_slack_bot/services/http_server_service.py +15 -0
- src/ctp_slack_bot/services/language_model_service.py +20 -19
- src/ctp_slack_bot/services/question_dispatch_service.py +12 -10
- src/ctp_slack_bot/services/schedule_service.py +16 -14
- src/ctp_slack_bot/services/slack_service.py +28 -16
- src/ctp_slack_bot/services/vectorization_service.py +10 -10
- src/ctp_slack_bot/utils/secret_stripper.py +1 -0
.env.template
CHANGED
@@ -1,5 +1,9 @@
|
|
1 |
# Copy this file and modify. Do not save or commit the secrets!
|
2 |
|
|
|
|
|
|
|
|
|
3 |
# APScheduler Configuration
|
4 |
SCHEDULER_TIMEZONE=UTC
|
5 |
|
|
|
1 |
# Copy this file and modify. Do not save or commit the secrets!
|
2 |
|
3 |
+
# HTTP Server Configuration
|
4 |
+
HTTP_HOST=0.0.0.0
|
5 |
+
HTTP_PORT=8080
|
6 |
+
|
7 |
# APScheduler Configuration
|
8 |
SCHEDULER_TIMEZONE=UTC
|
9 |
|
README.md
CHANGED
@@ -24,10 +24,10 @@ You need to configure it first. This is done via environment variables, or an `.
|
|
24 |
|
25 |
Obtaining the values requires setting up API tokens/secrets with:
|
26 |
|
27 |
-
* Slack: for `
|
28 |
-
* MongoDB: for `
|
29 |
-
* OpenAI: for `
|
30 |
-
* Google Drive: for `
|
31 |
* For Google Drive, set up a service account. It’s the only supported authentication type.
|
32 |
|
33 |
### Normally
|
@@ -58,12 +58,14 @@ pip3 install -e .
|
|
58 |
|
59 |
Make a copy of `.env.template` as `.env` and define the environment variables. (You can also define them by other means, but this has the least friction.) This file should not be committed and is excluded by `.gitignore`!
|
60 |
|
61 |
-
If `localhost` port `
|
62 |
|
63 |
```sh
|
64 |
scripts/run-dev.sh
|
65 |
```
|
66 |
|
|
|
|
|
67 |
## Tech Stack
|
68 |
|
69 |
* Hugging Face Spaces for hosting
|
@@ -77,14 +79,17 @@ scripts/run-dev.sh
|
|
77 |
|
78 |
## General Project Structure
|
79 |
|
|
|
|
|
80 |
* `src/`
|
81 |
* `ctp_slack_bot/`
|
82 |
* `core/`: fundamental components like configuration (using pydantic), logging setup (loguru), and custom exceptions
|
83 |
-
* `db/`:
|
84 |
-
* `repositories/`:
|
85 |
-
* `models/`:
|
86 |
* `services/`: business logic
|
87 |
* `answer_retrieval_service.py`: obtains an answer to a question from a language model using relevant context
|
|
|
88 |
* `content_ingestion_service.py`: converts content into chunks and stores them into the database
|
89 |
* `context_retrieval_service.py`: queries for relevant context from the database to answer a question
|
90 |
* `embeddings_model_service.py`: converts text to embeddings
|
@@ -95,7 +100,7 @@ scripts/run-dev.sh
|
|
95 |
* `slack_service.py`: handles events from Slack and sends back responses
|
96 |
* `vector_database_service.py`: stores and queries chunks
|
97 |
* `vectorization_service.py`: converts chunks into chunks with embeddings
|
98 |
-
* `tasks/`:
|
99 |
* `utils/`: reusable utilities
|
100 |
* `app.py`: application entry point
|
101 |
* `containers.py`: the dependency injection container
|
|
|
24 |
|
25 |
Obtaining the values requires setting up API tokens/secrets with:
|
26 |
|
27 |
+
* Slack: for `slack_bot_token` and `slack_app_token`
|
28 |
+
* MongoDB: for `mongodb_uri`
|
29 |
+
* OpenAI: for `openai_api_key`
|
30 |
+
* Google Drive: for `google_project_id`, `google_client_id`, `google_client_email`, `google_private_key_id`, and `google_private_key`
|
31 |
* For Google Drive, set up a service account. It’s the only supported authentication type.
|
32 |
|
33 |
### Normally
|
|
|
58 |
|
59 |
Make a copy of `.env.template` as `.env` and define the environment variables. (You can also define them by other means, but this has the least friction.) This file should not be committed and is excluded by `.gitignore`!
|
60 |
|
61 |
+
If `localhost` port `8080` is free, running the following will make the application available on that port:
|
62 |
|
63 |
```sh
|
64 |
scripts/run-dev.sh
|
65 |
```
|
66 |
|
67 |
+
Visiting http://localhost:8080/health will return HTTP status OK and a payload containing the health status of individual components if everything is working.
|
68 |
+
|
69 |
## Tech Stack
|
70 |
|
71 |
* Hugging Face Spaces for hosting
|
|
|
79 |
|
80 |
## General Project Structure
|
81 |
|
82 |
+
Not every file or folder is listed, but the important stuff is here.
|
83 |
+
|
84 |
* `src/`
|
85 |
* `ctp_slack_bot/`
|
86 |
* `core/`: fundamental components like configuration (using pydantic), logging setup (loguru), and custom exceptions
|
87 |
+
* `db/`: data connection and interface logic
|
88 |
+
* `repositories/`: data collection/table interface logic
|
89 |
+
* `models/`: data models
|
90 |
* `services/`: business logic
|
91 |
* `answer_retrieval_service.py`: obtains an answer to a question from a language model using relevant context
|
92 |
+
* `application_health_service.py`: collects the health status of the application components
|
93 |
* `content_ingestion_service.py`: converts content into chunks and stores them into the database
|
94 |
* `context_retrieval_service.py`: queries for relevant context from the database to answer a question
|
95 |
* `embeddings_model_service.py`: converts text to embeddings
|
|
|
100 |
* `slack_service.py`: handles events from Slack and sends back responses
|
101 |
* `vector_database_service.py`: stores and queries chunks
|
102 |
* `vectorization_service.py`: converts chunks into chunks with embeddings
|
103 |
+
* `tasks/`: scheduled tasks to run in the background
|
104 |
* `utils/`: reusable utilities
|
105 |
* `app.py`: application entry point
|
106 |
* `containers.py`: the dependency injection container
|
pyproject.toml
CHANGED
@@ -19,7 +19,7 @@ classifiers = [
|
|
19 |
"Operating System :: OS Independent",
|
20 |
]
|
21 |
dependencies = [
|
22 |
-
"pydantic>=2.11.2",
|
23 |
"pydantic-settings>=2.8.1",
|
24 |
"cachetools>=5.5.2",
|
25 |
"more-itertools>=10.6.0",
|
@@ -30,6 +30,7 @@ dependencies = [
|
|
30 |
"apscheduler>=3.11.0",
|
31 |
# "tenacity>=9.1.2",
|
32 |
# "pybreaker>=1.3.0",
|
|
|
33 |
"aiohttp>=3.11.16",
|
34 |
"webvtt-py>=0.5.1",
|
35 |
"slack-sdk>=3.35.0",
|
|
|
19 |
"Operating System :: OS Independent",
|
20 |
]
|
21 |
dependencies = [
|
22 |
+
"pydantic[email]>=2.11.2",
|
23 |
"pydantic-settings>=2.8.1",
|
24 |
"cachetools>=5.5.2",
|
25 |
"more-itertools>=10.6.0",
|
|
|
30 |
"apscheduler>=3.11.0",
|
31 |
# "tenacity>=9.1.2",
|
32 |
# "pybreaker>=1.3.0",
|
33 |
+
"httpx>=0.28.1",
|
34 |
"aiohttp>=3.11.16",
|
35 |
"webvtt-py>=0.5.1",
|
36 |
"slack-sdk>=3.35.0",
|
scripts/run-dev.sh
CHANGED
@@ -2,4 +2,4 @@
|
|
2 |
|
3 |
parent_path=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P)
|
4 |
|
5 |
-
|
|
|
2 |
|
3 |
parent_path=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P)
|
4 |
|
5 |
+
log_level=DEBUG python3 "${parent_path}/../src/ctp_slack_bot/app.py"
|
src/ctp_slack_bot/app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from aiohttp.web import Application as WebApplication, AppRunner as WebAppRunner, Response, TCPSite
|
2 |
from asyncio import all_tasks, CancelledError, create_task, current_task, get_running_loop, run
|
|
|
3 |
from loguru import logger
|
4 |
from signal import SIGINT, SIGTERM
|
5 |
from typing import Any, Callable
|
@@ -18,17 +19,22 @@ async def main() -> None:
|
|
18 |
container.wire(packages=["ctp_slack_bot"])
|
19 |
|
20 |
# Kick off services which should be active from the start.
|
|
|
21 |
container.content_ingestion_service()
|
22 |
container.question_dispatch_service()
|
23 |
container.schedule_service()
|
24 |
|
25 |
-
async def health(request):
|
26 |
-
|
|
|
|
|
|
|
|
|
27 |
http_server = WebApplication()
|
28 |
http_server.router.add_get("/health", health)
|
29 |
web_app_runner = WebAppRunner(http_server)
|
30 |
await web_app_runner.setup()
|
31 |
-
website = TCPSite(web_app_runner, "0.0.0.0", 8080)
|
32 |
await website.start()
|
33 |
|
34 |
async def handle_shutdown_signal() -> None:
|
@@ -61,5 +67,6 @@ async def main() -> None:
|
|
61 |
await socket_mode_handler.close_async()
|
62 |
await container.shutdown_resources()
|
63 |
|
|
|
64 |
if __name__ == "__main__":
|
65 |
run(main())
|
|
|
1 |
from aiohttp.web import Application as WebApplication, AppRunner as WebAppRunner, Response, TCPSite
|
2 |
from asyncio import all_tasks, CancelledError, create_task, current_task, get_running_loop, run
|
3 |
+
from json import dumps
|
4 |
from loguru import logger
|
5 |
from signal import SIGINT, SIGTERM
|
6 |
from typing import Any, Callable
|
|
|
19 |
container.wire(packages=["ctp_slack_bot"])
|
20 |
|
21 |
# Kick off services which should be active from the start.
|
22 |
+
application_health_service = await container.application_health_service()
|
23 |
container.content_ingestion_service()
|
24 |
container.question_dispatch_service()
|
25 |
container.schedule_service()
|
26 |
|
27 |
+
async def health(request): # TODO: Abstract away
|
28 |
+
health_statuses = await application_health_service.get_health()
|
29 |
+
if all(health_statuses.values()):
|
30 |
+
return Response(text=dumps(dict(health_statuses)), content_type="application/json")
|
31 |
+
else:
|
32 |
+
return Response(body=dumps(dict(health_statuses)), content_type="application/json", status=503)
|
33 |
http_server = WebApplication()
|
34 |
http_server.router.add_get("/health", health)
|
35 |
web_app_runner = WebAppRunner(http_server)
|
36 |
await web_app_runner.setup()
|
37 |
+
website = TCPSite(web_app_runner, "0.0.0.0", 8080) # TODO: Un-hard-code
|
38 |
await website.start()
|
39 |
|
40 |
async def handle_shutdown_signal() -> None:
|
|
|
67 |
await socket_mode_handler.close_async()
|
68 |
await container.shutdown_resources()
|
69 |
|
70 |
+
|
71 |
if __name__ == "__main__":
|
72 |
run(main())
|
src/ctp_slack_bot/containers.py
CHANGED
@@ -1,21 +1,23 @@
|
|
1 |
from dependency_injector.containers import DeclarativeContainer
|
2 |
-
from dependency_injector.providers import Callable, Resource, Singleton
|
3 |
from importlib import import_module
|
4 |
from pkgutil import iter_modules
|
5 |
from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler
|
6 |
from slack_bolt.async_app import AsyncApp
|
7 |
from types import ModuleType
|
8 |
|
9 |
-
from ctp_slack_bot.core
|
10 |
from ctp_slack_bot.db.mongo_db import MongoDBResource
|
11 |
from ctp_slack_bot.db.repositories.mongo_db_vectorized_chunk_repository import MongoVectorizedChunkRepositoryResource
|
12 |
-
from ctp_slack_bot.mime_type_handlers
|
13 |
from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
|
|
|
14 |
from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
|
15 |
from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
|
16 |
from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
|
17 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
18 |
from ctp_slack_bot.services.google_drive_service import GoogleDriveService
|
|
|
19 |
from ctp_slack_bot.services.language_model_service import LanguageModelService
|
20 |
from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
|
21 |
from ctp_slack_bot.services.schedule_service import ScheduleServiceResource
|
@@ -42,6 +44,7 @@ class Container(DeclarativeContainer): # TODO: audit for potential async-related
|
|
42 |
event_brokerage_service = Singleton(EventBrokerageService)
|
43 |
schedule_service = Resource (ScheduleServiceResource,
|
44 |
settings=settings)
|
|
|
45 |
mongo_db = Resource (MongoDBResource,
|
46 |
settings=settings)
|
47 |
vectorized_chunk_repository = Resource (MongoVectorizedChunkRepositoryResource,
|
@@ -73,12 +76,13 @@ class Container(DeclarativeContainer): # TODO: audit for potential async-related
|
|
73 |
content_ingestion_service=content_ingestion_service,
|
74 |
context_retrieval_service=context_retrieval_service,
|
75 |
answer_retrieval_service=answer_retrieval_service)
|
76 |
-
slack_bolt_app = Singleton(lambda settings: AsyncApp(token=settings.
|
77 |
settings)
|
78 |
slack_service = Resource (SlackServiceResource,
|
79 |
event_brokerage_service=event_brokerage_service,
|
|
|
80 |
slack_bolt_app=slack_bolt_app)
|
81 |
-
socket_mode_handler = Singleton(lambda _, app, settings: AsyncSocketModeHandler(app, settings.
|
82 |
slack_service,
|
83 |
slack_bolt_app,
|
84 |
settings)
|
@@ -89,3 +93,5 @@ class Container(DeclarativeContainer): # TODO: audit for potential async-related
|
|
89 |
# settings=settings,
|
90 |
# google_drive_service=google_drive_service,
|
91 |
# mime_type_handler_factory=mime_type_handler_factory)
|
|
|
|
|
|
1 |
from dependency_injector.containers import DeclarativeContainer
|
2 |
+
from dependency_injector.providers import Callable, List, Resource, Singleton
|
3 |
from importlib import import_module
|
4 |
from pkgutil import iter_modules
|
5 |
from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler
|
6 |
from slack_bolt.async_app import AsyncApp
|
7 |
from types import ModuleType
|
8 |
|
9 |
+
from ctp_slack_bot.core import Settings
|
10 |
from ctp_slack_bot.db.mongo_db import MongoDBResource
|
11 |
from ctp_slack_bot.db.repositories.mongo_db_vectorized_chunk_repository import MongoVectorizedChunkRepositoryResource
|
12 |
+
from ctp_slack_bot.mime_type_handlers import MimeTypeHandler
|
13 |
from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
|
14 |
+
from ctp_slack_bot.services.application_health_service import ApplicationHealthService
|
15 |
from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
|
16 |
from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
|
17 |
from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
|
18 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
19 |
from ctp_slack_bot.services.google_drive_service import GoogleDriveService
|
20 |
+
from ctp_slack_bot.services.http_client_service import HTTPClientServiceResource
|
21 |
from ctp_slack_bot.services.language_model_service import LanguageModelService
|
22 |
from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
|
23 |
from ctp_slack_bot.services.schedule_service import ScheduleServiceResource
|
|
|
44 |
event_brokerage_service = Singleton(EventBrokerageService)
|
45 |
schedule_service = Resource (ScheduleServiceResource,
|
46 |
settings=settings)
|
47 |
+
http_client = Resource (HTTPClientServiceResource)
|
48 |
mongo_db = Resource (MongoDBResource,
|
49 |
settings=settings)
|
50 |
vectorized_chunk_repository = Resource (MongoVectorizedChunkRepositoryResource,
|
|
|
76 |
content_ingestion_service=content_ingestion_service,
|
77 |
context_retrieval_service=context_retrieval_service,
|
78 |
answer_retrieval_service=answer_retrieval_service)
|
79 |
+
slack_bolt_app = Singleton(lambda settings: AsyncApp(token=settings.slack_bot_token.get_secret_value()),
|
80 |
settings)
|
81 |
slack_service = Resource (SlackServiceResource,
|
82 |
event_brokerage_service=event_brokerage_service,
|
83 |
+
http_client=http_client,
|
84 |
slack_bolt_app=slack_bolt_app)
|
85 |
+
socket_mode_handler = Singleton(lambda _, app, settings: AsyncSocketModeHandler(app, settings.slack_app_token.get_secret_value()),
|
86 |
slack_service,
|
87 |
slack_bolt_app,
|
88 |
settings)
|
|
|
93 |
# settings=settings,
|
94 |
# google_drive_service=google_drive_service,
|
95 |
# mime_type_handler_factory=mime_type_handler_factory)
|
96 |
+
application_health_service = Singleton(ApplicationHealthService,
|
97 |
+
services=List(mongo_db, slack_service))
|
src/ctp_slack_bot/core/__init__.py
CHANGED
@@ -1 +1,2 @@
|
|
|
|
1 |
from ctp_slack_bot.core.config import Settings
|
|
|
1 |
+
from ctp_slack_bot.core.abstractions import AbstractBaseModel, ApplicationComponentBase, HealthReportingApplicationComponentBase
|
2 |
from ctp_slack_bot.core.config import Settings
|
src/ctp_slack_bot/core/abstractions.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
from loguru import logger
|
3 |
+
from pydantic import BaseModel
|
4 |
+
from typing import Any, Self
|
5 |
+
|
6 |
+
|
7 |
+
class AbstractModelMetaclass(type(BaseModel), type(ABC)):
|
8 |
+
pass
|
9 |
+
|
10 |
+
|
11 |
+
class AbstractBaseModel(BaseModel, ABC, metaclass=AbstractModelMetaclass):
|
12 |
+
pass
|
13 |
+
|
14 |
+
|
15 |
+
class ApplicationComponentBase(AbstractBaseModel):
|
16 |
+
|
17 |
+
def __init__(self: Self, **data: dict[str, Any]) -> None:
|
18 |
+
super().__init__(**data)
|
19 |
+
logger.debug("Created {}", self.__class__.__name__)
|
20 |
+
|
21 |
+
@property
|
22 |
+
@abstractmethod
|
23 |
+
def name(self: Self) -> str:
|
24 |
+
pass
|
25 |
+
|
26 |
+
|
27 |
+
class HealthReportingApplicationComponentBase(ApplicationComponentBase):
|
28 |
+
|
29 |
+
@abstractmethod
|
30 |
+
async def is_healthy(self: Self) -> bool:
|
31 |
+
pass
|
src/ctp_slack_bot/core/config.py
CHANGED
@@ -1,78 +1,79 @@
|
|
1 |
from loguru import logger
|
2 |
-
from pydantic import Field, MongoDsn, NonNegativeFloat, NonNegativeInt, PositiveInt, SecretStr
|
3 |
from pydantic_settings import BaseSettings, SettingsConfigDict
|
4 |
from types import MappingProxyType
|
5 |
from typing import Literal, Mapping, Optional, Self
|
6 |
|
|
|
7 |
class Settings(BaseSettings):
|
8 |
"""
|
9 |
Application settings loaded from environment variables.
|
10 |
"""
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
17 |
|
18 |
# Logging Configuration ― not actually used to configure Loguru, but defined to prevent warnings about “unknown” environment variables
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
21 |
|
22 |
# APScheduler Configuration
|
23 |
-
|
24 |
|
25 |
# Slack Configuration
|
26 |
-
|
27 |
-
|
28 |
|
29 |
# Vectorization Configuration
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
|
36 |
# MongoDB Configuration
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
|
43 |
# Hugging Face Configuration
|
44 |
-
|
45 |
|
46 |
# OpenAI Configuration
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
|
53 |
# Google Drive Configuration
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
GOOGLE_TOKEN_URI: str = "https://oauth2.googleapis.com/token"
|
62 |
-
GOOGLE_AUTH_PROVIDER_CERT_URL: str = "https://www.googleapis.com/oauth2/v1/certs"
|
63 |
-
GOOGLE_CLIENT_CERT_URL: str = "https://www.googleapis.com/robot/v1/metadata/x509/ctp-slack-bot-714%40voltaic-reducer-294821.iam.gserviceaccount.com"
|
64 |
-
GOOGLE_UNIVERSE_DOMAIN: str = "googleapis.com"
|
65 |
|
66 |
# File Monitoring Configuration
|
67 |
-
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
frozen=True
|
75 |
-
)
|
76 |
|
77 |
def get_extra_environment_variables(self: Self) -> Mapping[str, str]:
|
78 |
return MappingProxyType(self.__pydantic_extra__)
|
|
|
1 |
from loguru import logger
|
2 |
+
from pydantic import EmailStr, Field, MongoDsn, NonNegativeFloat, NonNegativeInt, PositiveInt, SecretStr
|
3 |
from pydantic_settings import BaseSettings, SettingsConfigDict
|
4 |
from types import MappingProxyType
|
5 |
from typing import Literal, Mapping, Optional, Self
|
6 |
|
7 |
+
|
8 |
class Settings(BaseSettings):
|
9 |
"""
|
10 |
Application settings loaded from environment variables.
|
11 |
"""
|
12 |
|
13 |
+
model_config = SettingsConfigDict(
|
14 |
+
case_sensitive=False,
|
15 |
+
env_file=".env",
|
16 |
+
env_file_encoding="utf-8",
|
17 |
+
extra="allow",
|
18 |
+
frozen=True
|
19 |
+
)
|
20 |
|
21 |
# Logging Configuration ― not actually used to configure Loguru, but defined to prevent warnings about “unknown” environment variables
|
22 |
+
log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = Field(default_factory=lambda data: "DEBUG" if data.get("DEBUG", False) else "INFO")
|
23 |
+
log_format: Literal["text", "json"] = "json"
|
24 |
+
|
25 |
+
# HTTP Server Configuration
|
26 |
+
http_host: str = "0.0.0.0"
|
27 |
+
http_port: PositiveInt = 8080
|
28 |
|
29 |
# APScheduler Configuration
|
30 |
+
scheduler_timezone: Optional[str] = "UTC"
|
31 |
|
32 |
# Slack Configuration
|
33 |
+
slack_bot_token: SecretStr
|
34 |
+
slack_app_token: SecretStr
|
35 |
|
36 |
# Vectorization Configuration
|
37 |
+
embedding_model: str
|
38 |
+
vector_dimension: PositiveInt
|
39 |
+
chunk_size: PositiveInt
|
40 |
+
chunk_overlap: NonNegativeInt
|
41 |
+
top_k_matches: PositiveInt
|
42 |
|
43 |
# MongoDB Configuration
|
44 |
+
mongodb_uri: SecretStr # TODO: Contemplate switching to MongoDsn type for the main URL, and separate out the credentials to SecretStr variables.
|
45 |
+
mongodb_name: str
|
46 |
+
vectorized_chunks_collection_name: str = "vectorized_chunks"
|
47 |
+
vectorized_chunks_search_index_name: Optional[str] = None
|
48 |
+
score_threshold: NonNegativeFloat
|
49 |
|
50 |
# Hugging Face Configuration
|
51 |
+
hf_api_token: Optional[SecretStr] = None # TODO: Currently, this is unused.
|
52 |
|
53 |
# OpenAI Configuration
|
54 |
+
openai_api_key: SecretStr
|
55 |
+
chat_model: str
|
56 |
+
max_tokens: PositiveInt
|
57 |
+
temperature: NonNegativeFloat
|
58 |
+
system_prompt: str
|
59 |
|
60 |
# Google Drive Configuration
|
61 |
+
google_drive_root_id: str
|
62 |
+
google_project_id: str
|
63 |
+
google_private_key_id: SecretStr
|
64 |
+
google_private_key: SecretStr
|
65 |
+
google_client_id: str
|
66 |
+
google_client_email: EmailStr
|
67 |
+
google_token_uri: str = "https://oauth2.googleapis.com/token"
|
|
|
|
|
|
|
|
|
68 |
|
69 |
# File Monitoring Configuration
|
70 |
+
file_monitor_root_path: str = ""
|
71 |
|
72 |
+
def __init__(self: Self, **data) -> None:
|
73 |
+
super().__init__(**data)
|
74 |
+
logger.debug("Created {}", self.__class__.__name__)
|
75 |
+
if self.__pydantic_extra__:
|
76 |
+
logger.warning("Extra unrecognized environment variables were provided: {}", ", ".join(self.__pydantic_extra__))
|
|
|
|
|
77 |
|
78 |
def get_extra_environment_variables(self: Self) -> Mapping[str, str]:
|
79 |
return MappingProxyType(self.__pydantic_extra__)
|
src/ctp_slack_bot/core/logging.py
CHANGED
@@ -4,6 +4,7 @@ from os import access, getenv, W_OK
|
|
4 |
from sys import stderr
|
5 |
from typing import Self
|
6 |
|
|
|
7 |
class InterceptHandler(Handler):
|
8 |
"""
|
9 |
Intercept standard logging messages toward Loguru.
|
@@ -39,8 +40,8 @@ def setup_logging() -> None:
|
|
39 |
"""
|
40 |
|
41 |
# Get logger configuration from environment variables.
|
42 |
-
log_level = getenv("
|
43 |
-
log_format = getenv("
|
44 |
|
45 |
# Remove default loguru handler.
|
46 |
logger.remove()
|
|
|
4 |
from sys import stderr
|
5 |
from typing import Self
|
6 |
|
7 |
+
|
8 |
class InterceptHandler(Handler):
|
9 |
"""
|
10 |
Intercept standard logging messages toward Loguru.
|
|
|
40 |
"""
|
41 |
|
42 |
# Get logger configuration from environment variables.
|
43 |
+
log_level = getenv("log_level", "INFO")
|
44 |
+
log_format = getenv("log_format", "text")
|
45 |
|
46 |
# Remove default loguru handler.
|
47 |
logger.remove()
|
src/ctp_slack_bot/db/mongo_db.py
CHANGED
@@ -1,34 +1,29 @@
|
|
1 |
from dependency_injector.resources import AsyncResource
|
2 |
-
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection
|
3 |
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
|
4 |
from loguru import logger
|
5 |
-
from pydantic import
|
6 |
-
from typing import Any,
|
7 |
|
8 |
-
from ctp_slack_bot.core
|
9 |
from ctp_slack_bot.utils import sanitize_mongo_db_uri
|
10 |
|
11 |
|
12 |
-
class MongoDB(
|
13 |
"""
|
14 |
MongoDB connection manager using Motor for async operations.
|
15 |
"""
|
16 |
-
settings: Settings
|
17 |
-
_client: PrivateAttr = PrivateAttr()
|
18 |
-
_db: PrivateAttr = PrivateAttr()
|
19 |
|
20 |
-
|
21 |
-
frozen=True
|
22 |
-
arbitrary_types_allowed = True
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
|
28 |
def connect(self: Self) -> None:
|
29 |
"""Initialize MongoDB client with settings."""
|
30 |
try:
|
31 |
-
connection_string = self.settings.
|
32 |
logger.debug("Connecting to MongoDB using URI: {}", sanitize_mongo_db_uri(connection_string))
|
33 |
|
34 |
# Create client with appropriate settings.
|
@@ -43,7 +38,7 @@ class MongoDB(BaseModel):
|
|
43 |
)
|
44 |
|
45 |
# Get the database name.
|
46 |
-
db_name = self.settings.
|
47 |
|
48 |
self._db = self._client[db_name]
|
49 |
logger.debug("MongoDB client initialized for database: {}", db_name)
|
@@ -104,9 +99,17 @@ class MongoDB(BaseModel):
|
|
104 |
self._client = None
|
105 |
self._db = None
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
class MongoDBResource(AsyncResource):
|
108 |
async def init(self: Self, settings: Settings) -> MongoDB:
|
109 |
-
logger.info("Initializing MongoDB connection for database: {}", settings.
|
110 |
mongo_db = MongoDB(settings=settings)
|
111 |
mongo_db.connect()
|
112 |
await self._test_connection(mongo_db)
|
|
|
1 |
from dependency_injector.resources import AsyncResource
|
2 |
+
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase, AsyncIOMotorCollection
|
3 |
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
|
4 |
from loguru import logger
|
5 |
+
from pydantic import ConfigDict
|
6 |
+
from typing import Any, Self
|
7 |
|
8 |
+
from ctp_slack_bot.core import HealthReportingApplicationComponentBase, Settings
|
9 |
from ctp_slack_bot.utils import sanitize_mongo_db_uri
|
10 |
|
11 |
|
12 |
+
class MongoDB(HealthReportingApplicationComponentBase):
|
13 |
"""
|
14 |
MongoDB connection manager using Motor for async operations.
|
15 |
"""
|
|
|
|
|
|
|
16 |
|
17 |
+
model_config = ConfigDict(frozen=True)
|
|
|
|
|
18 |
|
19 |
+
settings: Settings
|
20 |
+
_client: AsyncIOMotorClient
|
21 |
+
_db: AsyncIOMotorDatabase
|
22 |
|
23 |
def connect(self: Self) -> None:
|
24 |
"""Initialize MongoDB client with settings."""
|
25 |
try:
|
26 |
+
connection_string = self.settings.mongodb_uri.get_secret_value()
|
27 |
logger.debug("Connecting to MongoDB using URI: {}", sanitize_mongo_db_uri(connection_string))
|
28 |
|
29 |
# Create client with appropriate settings.
|
|
|
38 |
)
|
39 |
|
40 |
# Get the database name.
|
41 |
+
db_name = self.settings.mongodb_name
|
42 |
|
43 |
self._db = self._client[db_name]
|
44 |
logger.debug("MongoDB client initialized for database: {}", db_name)
|
|
|
99 |
self._client = None
|
100 |
self._db = None
|
101 |
|
102 |
+
@property
|
103 |
+
def name(self: Self) -> str:
|
104 |
+
return "mongo_db"
|
105 |
+
|
106 |
+
async def is_healthy(self: Self) -> bool:
|
107 |
+
return await self.ping()
|
108 |
+
|
109 |
+
|
110 |
class MongoDBResource(AsyncResource):
|
111 |
async def init(self: Self, settings: Settings) -> MongoDB:
|
112 |
+
logger.info("Initializing MongoDB connection for database: {}", settings.mongodb_name)
|
113 |
mongo_db = MongoDB(settings=settings)
|
114 |
mongo_db.connect()
|
115 |
await self._test_connection(mongo_db)
|
src/ctp_slack_bot/db/repositories/mongo_db_vectorized_chunk_repository.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from dependency_injector.resources import AsyncResource
|
2 |
from loguru import logger
|
3 |
from pymongo import ASCENDING, ReturnDocument
|
4 |
-
from typing import Any, Collection,
|
5 |
|
6 |
from ctp_slack_bot.core import Settings
|
7 |
from ctp_slack_bot.models import Chunk, VectorizedChunk, VectorQuery
|
@@ -13,10 +13,6 @@ from ctp_slack_bot.db.repositories.vector_repository_base import VectorRepositor
|
|
13 |
class MongoVectorizedChunkRepository(VectorRepositoryBase, VectorizedChunkRepository):
|
14 |
"""MongoDB implementation of VectorizedChunkRepository"""
|
15 |
|
16 |
-
def __init__(self: Self, **data: Dict[str, Any]) -> None:
|
17 |
-
super().__init__(**data)
|
18 |
-
logger.debug("Created {}", self.__class__.__name__)
|
19 |
-
|
20 |
async def count_by_id(self: Self, parent_id: str, chunk_id: Optional[str] = None) -> int:
|
21 |
if chunk_id is None:
|
22 |
return await self.collection.count_documents({"parent_id": parent_id})
|
@@ -54,7 +50,7 @@ class MongoVectorizedChunkRepository(VectorRepositoryBase, VectorizedChunkReposi
|
|
54 |
pipeline = [
|
55 |
{
|
56 |
"$vectorSearch": {
|
57 |
-
"index": self.settings.
|
58 |
"path": "embedding",
|
59 |
"queryVector": query.query_embeddings,
|
60 |
"numCandidates": query.k * 10,
|
@@ -133,12 +129,20 @@ class MongoVectorizedChunkRepository(VectorRepositoryBase, VectorizedChunkReposi
|
|
133 |
await super().ensure_indices_exist()
|
134 |
index_name = "parent_chunk_unique"
|
135 |
existing_indices = await self.collection.index_information()
|
136 |
-
|
|
|
|
|
|
|
137 |
await self.collection.create_index([("parent_id", ASCENDING), ("chunk_id", ASCENDING)], unique=True, name=index_name)
|
138 |
|
|
|
|
|
|
|
|
|
|
|
139 |
class MongoVectorizedChunkRepositoryResource(AsyncResource):
|
140 |
async def init(self: Self, settings: Settings, mongo_db: MongoDB) -> MongoVectorizedChunkRepository:
|
141 |
-
vectorized_chunk_collection = await mongo_db.get_collection(settings.
|
142 |
vectorized_chunk_repository = MongoVectorizedChunkRepository(settings=settings, collection=vectorized_chunk_collection)
|
143 |
await vectorized_chunk_repository.ensure_indices_exist()
|
144 |
return vectorized_chunk_repository
|
|
|
1 |
from dependency_injector.resources import AsyncResource
|
2 |
from loguru import logger
|
3 |
from pymongo import ASCENDING, ReturnDocument
|
4 |
+
from typing import Any, Collection, Iterable, Mapping, Optional, Self, Sequence, Set
|
5 |
|
6 |
from ctp_slack_bot.core import Settings
|
7 |
from ctp_slack_bot.models import Chunk, VectorizedChunk, VectorQuery
|
|
|
13 |
class MongoVectorizedChunkRepository(VectorRepositoryBase, VectorizedChunkRepository):
|
14 |
"""MongoDB implementation of VectorizedChunkRepository"""
|
15 |
|
|
|
|
|
|
|
|
|
16 |
async def count_by_id(self: Self, parent_id: str, chunk_id: Optional[str] = None) -> int:
|
17 |
if chunk_id is None:
|
18 |
return await self.collection.count_documents({"parent_id": parent_id})
|
|
|
50 |
pipeline = [
|
51 |
{
|
52 |
"$vectorSearch": {
|
53 |
+
"index": self.settings.vectorized_chunks_search_index_name or f"{self.collection.name}_vector_index",
|
54 |
"path": "embedding",
|
55 |
"queryVector": query.query_embeddings,
|
56 |
"numCandidates": query.k * 10,
|
|
|
129 |
await super().ensure_indices_exist()
|
130 |
index_name = "parent_chunk_unique"
|
131 |
existing_indices = await self.collection.index_information()
|
132 |
+
logger.debug("{} existing indices were found: {}", len(existing_indices), existing_indices)
|
133 |
+
if index_name in existing_indices:
|
134 |
+
logger.debug("Index, {}, already exists; duplicate index will not be created.", index_name)
|
135 |
+
else:
|
136 |
await self.collection.create_index([("parent_id", ASCENDING), ("chunk_id", ASCENDING)], unique=True, name=index_name)
|
137 |
|
138 |
+
@property
|
139 |
+
def name(self: Self) -> str:
|
140 |
+
return "mongo_db_vectorized_chunk_repository"
|
141 |
+
|
142 |
+
|
143 |
class MongoVectorizedChunkRepositoryResource(AsyncResource):
|
144 |
async def init(self: Self, settings: Settings, mongo_db: MongoDB) -> MongoVectorizedChunkRepository:
|
145 |
+
vectorized_chunk_collection = await mongo_db.get_collection(settings.vectorized_chunks_collection_name)
|
146 |
vectorized_chunk_repository = MongoVectorizedChunkRepository(settings=settings, collection=vectorized_chunk_collection)
|
147 |
await vectorized_chunk_repository.ensure_indices_exist()
|
148 |
return vectorized_chunk_repository
|
src/ctp_slack_bot/db/repositories/vector_repository_base.py
CHANGED
@@ -1,22 +1,20 @@
|
|
1 |
-
from abc import ABC
|
2 |
from loguru import logger
|
3 |
from motor.motor_asyncio import AsyncIOMotorCollection
|
4 |
-
from pydantic import
|
5 |
from pymongo.operations import SearchIndexModel
|
6 |
from typing import Self
|
7 |
|
8 |
-
from ctp_slack_bot.core import Settings
|
9 |
|
10 |
-
|
|
|
11 |
"""MongoDB implementation of VectorizedChunkRepository"""
|
12 |
|
|
|
|
|
13 |
settings: Settings
|
14 |
collection: AsyncIOMotorCollection
|
15 |
|
16 |
-
class Config:
|
17 |
-
frozen=True
|
18 |
-
arbitrary_types_allowed = True
|
19 |
-
|
20 |
async def ensure_indices_exist(self: Self) -> None:
|
21 |
"""Ensure that indices exist."""
|
22 |
await self.ensure_search_index_exists()
|
@@ -25,12 +23,12 @@ class VectorRepositoryBase(ABC, BaseModel):
|
|
25 |
"""
|
26 |
Ensure that a vector search index exists.
|
27 |
"""
|
28 |
-
index_name = self.settings.
|
29 |
try:
|
30 |
-
|
31 |
-
logger.debug("{} existing indices were found: {}", len(
|
32 |
-
if index_name in
|
33 |
-
logger.debug("Index
|
34 |
return
|
35 |
|
36 |
# Create search index model using MongoDB's recommended approach.
|
@@ -40,7 +38,7 @@ class VectorRepositoryBase(ABC, BaseModel):
|
|
40 |
{
|
41 |
"type": "vector",
|
42 |
"path": "embedding",
|
43 |
-
"numDimensions": self.settings.
|
44 |
"similarity": "cosine",
|
45 |
"quantization": "scalar"
|
46 |
}
|
@@ -50,13 +48,12 @@ class VectorRepositoryBase(ABC, BaseModel):
|
|
50 |
type="vectorSearch"
|
51 |
)
|
52 |
result = await self.collection.create_search_index(search_index_model)
|
53 |
-
logger.info("Vector search index
|
54 |
except Exception as e:
|
55 |
if "command not found" in str(e).lower():
|
56 |
logger.warning("Vector search not supported by this MongoDB instance. Some functionality may be limited.")
|
57 |
-
# Create a fallback standard index on embedding field.
|
58 |
-
|
59 |
-
logger.info("Created standard index on {} field as fallback.", "embedding")
|
60 |
else:
|
61 |
logger.error("Failed to create any index: {}", e)
|
62 |
raise
|
|
|
|
|
1 |
from loguru import logger
|
2 |
from motor.motor_asyncio import AsyncIOMotorCollection
|
3 |
+
from pydantic import ConfigDict
|
4 |
from pymongo.operations import SearchIndexModel
|
5 |
from typing import Self
|
6 |
|
7 |
+
from ctp_slack_bot.core import ApplicationComponentBase, Settings
|
8 |
|
9 |
+
|
10 |
+
class VectorRepositoryBase(ApplicationComponentBase):
|
11 |
"""MongoDB implementation of VectorizedChunkRepository"""
|
12 |
|
13 |
+
model_config = ConfigDict(arbitrary_types_allowed=True, frozen=True)
|
14 |
+
|
15 |
settings: Settings
|
16 |
collection: AsyncIOMotorCollection
|
17 |
|
|
|
|
|
|
|
|
|
18 |
async def ensure_indices_exist(self: Self) -> None:
|
19 |
"""Ensure that indices exist."""
|
20 |
await self.ensure_search_index_exists()
|
|
|
23 |
"""
|
24 |
Ensure that a vector search index exists.
|
25 |
"""
|
26 |
+
index_name = self.settings.vectorized_chunks_search_index_name or f"{self.collection.name}_vector_index"
|
27 |
try:
|
28 |
+
existing_indices = [index["name"] async for index in self.collection.list_search_indexes()]
|
29 |
+
logger.debug("{} existing indices were found: {}", len(existing_indices), existing_indices)
|
30 |
+
if index_name in existing_indices:
|
31 |
+
logger.debug("Index, {}, already exists; duplicate index will not be created.", index_name)
|
32 |
return
|
33 |
|
34 |
# Create search index model using MongoDB's recommended approach.
|
|
|
38 |
{
|
39 |
"type": "vector",
|
40 |
"path": "embedding",
|
41 |
+
"numDimensions": self.settings.vector_dimension,
|
42 |
"similarity": "cosine",
|
43 |
"quantization": "scalar"
|
44 |
}
|
|
|
48 |
type="vectorSearch"
|
49 |
)
|
50 |
result = await self.collection.create_search_index(search_index_model)
|
51 |
+
logger.info("Vector search index, {}, created for collection {}.", result, self.collection.name)
|
52 |
except Exception as e:
|
53 |
if "command not found" in str(e).lower():
|
54 |
logger.warning("Vector search not supported by this MongoDB instance. Some functionality may be limited.")
|
55 |
+
await self.collection.create_index("embedding") # Create a fallback standard index on embedding field.
|
56 |
+
logger.info("Created standard index on field, {}, as fallback.", "embedding")
|
|
|
57 |
else:
|
58 |
logger.error("Failed to create any index: {}", e)
|
59 |
raise
|
src/ctp_slack_bot/db/repositories/vectorized_chunk_repository.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
-
from abc import
|
2 |
from pydantic import BaseModel
|
3 |
from typing import Any, Collection, Iterable, Mapping, Optional, Self, Sequence, Set
|
4 |
|
|
|
5 |
from ctp_slack_bot.models import Chunk, VectorizedChunk, VectorQuery
|
6 |
|
7 |
-
|
|
|
8 |
"""Repository interface for VectorizedChunk entities."""
|
9 |
|
10 |
@abstractmethod
|
|
|
1 |
+
from abc import abstractmethod
|
2 |
from pydantic import BaseModel
|
3 |
from typing import Any, Collection, Iterable, Mapping, Optional, Self, Sequence, Set
|
4 |
|
5 |
+
from ctp_slack_bot.core import ApplicationComponentBase
|
6 |
from ctp_slack_bot.models import Chunk, VectorizedChunk, VectorQuery
|
7 |
|
8 |
+
|
9 |
+
class VectorizedChunkRepository(ApplicationComponentBase):
|
10 |
"""Repository interface for VectorizedChunk entities."""
|
11 |
|
12 |
@abstractmethod
|
src/ctp_slack_bot/mime_type_handlers/base.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
-
from abc import
|
2 |
from functools import lru_cache
|
3 |
-
from typing import Any, ClassVar,
|
4 |
|
5 |
from ctp_slack_bot.models import Content
|
6 |
|
@@ -9,13 +9,13 @@ class MimeTypeHandlerMeta(type):
|
|
9 |
|
10 |
_registry: ClassVar[dict[str, type["BaseMimeTypeHandler"]]] = {}
|
11 |
|
12 |
-
def __init__(cls, name: str, bases: tuple[type, ...], dict:
|
13 |
super().__init__(name, bases, dict)
|
14 |
if hasattr(cls, "MIME_TYPE"):
|
15 |
MimeTypeHandlerMeta._registry[cls.MIME_TYPE] = cls
|
16 |
|
17 |
|
18 |
-
class MimeTypeHandlerABCMeta(MimeTypeHandlerMeta,
|
19 |
pass
|
20 |
|
21 |
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
from functools import lru_cache
|
3 |
+
from typing import Any, ClassVar, Mapping, Optional
|
4 |
|
5 |
from ctp_slack_bot.models import Content
|
6 |
|
|
|
9 |
|
10 |
_registry: ClassVar[dict[str, type["BaseMimeTypeHandler"]]] = {}
|
11 |
|
12 |
+
def __init__(cls, name: str, bases: tuple[type, ...], dict: dict[str, Any]) -> None:
|
13 |
super().__init__(name, bases, dict)
|
14 |
if hasattr(cls, "MIME_TYPE"):
|
15 |
MimeTypeHandlerMeta._registry[cls.MIME_TYPE] = cls
|
16 |
|
17 |
|
18 |
+
class MimeTypeHandlerABCMeta(MimeTypeHandlerMeta, type(ABC)):
|
19 |
pass
|
20 |
|
21 |
|
src/ctp_slack_bot/mime_type_handlers/text/vtt.py
CHANGED
@@ -11,6 +11,7 @@ from ctp_slack_bot.models import Content, WebVTTContent, WebVTTFrame
|
|
11 |
|
12 |
ISO_DATE_TIME_PATTERN = compile_re(r"Start time: (\d{4}-\d{2}-\d{2}(?: \d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:\d{2})?)?)")
|
13 |
|
|
|
14 |
class WebVTTMimeTypeHandler(MimeTypeHandler):
|
15 |
|
16 |
MIME_TYPE = "text/vtt"
|
|
|
11 |
|
12 |
ISO_DATE_TIME_PATTERN = compile_re(r"Start time: (\d{4}-\d{2}-\d{2}(?: \d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:\d{2})?)?)")
|
13 |
|
14 |
+
|
15 |
class WebVTTMimeTypeHandler(MimeTypeHandler):
|
16 |
|
17 |
MIME_TYPE = "text/vtt"
|
src/ctp_slack_bot/models/base.py
CHANGED
@@ -1,19 +1,21 @@
|
|
1 |
-
from abc import
|
2 |
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
|
3 |
from typing import Any, final, Mapping, Optional, Self
|
4 |
|
|
|
5 |
from ctp_slack_bot.utils import to_deep_immutable
|
6 |
|
7 |
|
8 |
class Chunk(BaseModel):
|
9 |
"""A class representing a chunk of content."""
|
10 |
|
|
|
|
|
11 |
text: str # The text representation
|
12 |
parent_id: str # The source content’s identity
|
13 |
chunk_id: str # This chunk’s identity—unique within the source content
|
14 |
-
metadata: Mapping[str, Any] = Field(default_factory=
|
15 |
-
|
16 |
-
model_config = ConfigDict(frozen=True)
|
17 |
|
18 |
@field_validator('metadata')
|
19 |
@classmethod
|
@@ -32,12 +34,12 @@ class VectorQuery(BaseModel):
|
|
32 |
filter_metadata: Optional filters for metadata fields
|
33 |
"""
|
34 |
|
|
|
|
|
35 |
query_embeddings: tuple[float, ...]
|
36 |
k: int
|
37 |
score_threshold: float = Field(default=0.7)
|
38 |
-
filter_metadata: Mapping[str, Any] = Field(default_factory=
|
39 |
-
|
40 |
-
model_config = ConfigDict(frozen=True)
|
41 |
|
42 |
@field_validator('filter_metadata')
|
43 |
@classmethod
|
@@ -52,7 +54,7 @@ class VectorizedChunk(Chunk):
|
|
52 |
embedding: tuple[float, ...] # The vector representation
|
53 |
|
54 |
|
55 |
-
class Content(
|
56 |
"""An abstract base class for all types of content."""
|
57 |
|
58 |
model_config = ConfigDict(frozen=True)
|
|
|
1 |
+
from abc import abstractmethod
|
2 |
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
3 |
+
from types import MappingProxyType
|
4 |
from typing import Any, final, Mapping, Optional, Self
|
5 |
|
6 |
+
from ctp_slack_bot.core import AbstractBaseModel
|
7 |
from ctp_slack_bot.utils import to_deep_immutable
|
8 |
|
9 |
|
10 |
class Chunk(BaseModel):
|
11 |
"""A class representing a chunk of content."""
|
12 |
|
13 |
+
model_config = ConfigDict(frozen=True)
|
14 |
+
|
15 |
text: str # The text representation
|
16 |
parent_id: str # The source content’s identity
|
17 |
chunk_id: str # This chunk’s identity—unique within the source content
|
18 |
+
metadata: Mapping[str, Any] = Field(default_factory=lambda: MappingProxyType({}))
|
|
|
|
|
19 |
|
20 |
@field_validator('metadata')
|
21 |
@classmethod
|
|
|
34 |
filter_metadata: Optional filters for metadata fields
|
35 |
"""
|
36 |
|
37 |
+
model_config = ConfigDict(frozen=True)
|
38 |
+
|
39 |
query_embeddings: tuple[float, ...]
|
40 |
k: int
|
41 |
score_threshold: float = Field(default=0.7)
|
42 |
+
filter_metadata: Mapping[str, Any] = Field(default_factory=lambda: MappingProxyType({}))
|
|
|
|
|
43 |
|
44 |
@field_validator('filter_metadata')
|
45 |
@classmethod
|
|
|
54 |
embedding: tuple[float, ...] # The vector representation
|
55 |
|
56 |
|
57 |
+
class Content(AbstractBaseModel):
|
58 |
"""An abstract base class for all types of content."""
|
59 |
|
60 |
model_config = ConfigDict(frozen=True)
|
src/ctp_slack_bot/models/google_drive.py
CHANGED
@@ -6,14 +6,14 @@ from typing import Self
|
|
6 |
class GoogleDriveMetadata(BaseModel):
|
7 |
"""Represents Google Drive file or folder metadata."""
|
8 |
|
|
|
|
|
9 |
id: str
|
10 |
name: str
|
11 |
modified_time: datetime
|
12 |
mime_type: str
|
13 |
folder_path: str
|
14 |
|
15 |
-
model_config = ConfigDict(frozen=True)
|
16 |
-
|
17 |
@classmethod
|
18 |
def from_folder_path_and_dict(cls, folder_path: str, dict: dict) -> Self:
|
19 |
id = dict["id"]
|
|
|
6 |
class GoogleDriveMetadata(BaseModel):
|
7 |
"""Represents Google Drive file or folder metadata."""
|
8 |
|
9 |
+
model_config = ConfigDict(frozen=True)
|
10 |
+
|
11 |
id: str
|
12 |
name: str
|
13 |
modified_time: datetime
|
14 |
mime_type: str
|
15 |
folder_path: str
|
16 |
|
|
|
|
|
17 |
@classmethod
|
18 |
def from_folder_path_and_dict(cls, folder_path: str, dict: dict) -> Self:
|
19 |
id = dict["id"]
|
src/ctp_slack_bot/models/slack.py
CHANGED
@@ -1,21 +1,26 @@
|
|
1 |
from datetime import datetime
|
2 |
from json import dumps
|
3 |
-
from pydantic import BaseModel, ConfigDict, PositiveInt
|
4 |
from types import MappingProxyType
|
5 |
from typing import Any, Literal, Mapping, Optional, Self
|
6 |
|
7 |
from ctp_slack_bot.models.base import Chunk, Content
|
8 |
|
|
|
9 |
class SlackEventPayload(BaseModel):
|
10 |
"""Represents a general event payload from Slack."""
|
|
|
|
|
|
|
11 |
type: str
|
12 |
event_ts: str
|
13 |
|
14 |
-
model_config = ConfigDict(extra='allow', frozen=True)
|
15 |
|
16 |
class SlackEvent(BaseModel):
|
17 |
"""Represents a general event from Slack."""
|
18 |
|
|
|
|
|
19 |
token: str
|
20 |
team_id: str
|
21 |
api_app_id: str
|
@@ -25,24 +30,25 @@ class SlackEvent(BaseModel):
|
|
25 |
event_time: int
|
26 |
authed_users: tuple[str, ...]
|
27 |
|
28 |
-
model_config = ConfigDict(frozen=True)
|
29 |
|
30 |
class SlackUserTimestampPair(BaseModel):
|
31 |
"""Represents a Slack user-timestamp pair."""
|
32 |
|
|
|
|
|
33 |
user: str
|
34 |
ts: str
|
35 |
|
36 |
-
model_config = ConfigDict(frozen=True)
|
37 |
|
38 |
class SlackReaction(BaseModel):
|
39 |
"""Represents a Slack reaction information."""
|
40 |
|
|
|
|
|
41 |
name: str
|
42 |
count: PositiveInt
|
43 |
users: tuple[str, ...]
|
44 |
|
45 |
-
model_config = ConfigDict(frozen=True)
|
46 |
|
47 |
class SlackMessage(Content):
|
48 |
"""Represents a message from Slack after adaptation."""
|
@@ -76,9 +82,12 @@ class SlackMessage(Content):
|
|
76 |
"modificationTime": datetime.fromtimestamp(float(self.ts))
|
77 |
})
|
78 |
|
|
|
79 |
class SlackResponse(BaseModel): # TODO: This should also be based on Content as it is a SlackMessage―just not one for which we know the identity yet.
|
80 |
"""Represents a response message to be sent to Slack."""
|
81 |
|
|
|
|
|
82 |
text: str
|
83 |
channel: Optional[str]
|
84 |
thread_ts: Optional[str] = None
|
|
|
1 |
from datetime import datetime
|
2 |
from json import dumps
|
3 |
+
from pydantic import BaseModel, ConfigDict, PositiveInt
|
4 |
from types import MappingProxyType
|
5 |
from typing import Any, Literal, Mapping, Optional, Self
|
6 |
|
7 |
from ctp_slack_bot.models.base import Chunk, Content
|
8 |
|
9 |
+
|
10 |
class SlackEventPayload(BaseModel):
|
11 |
"""Represents a general event payload from Slack."""
|
12 |
+
|
13 |
+
model_config = ConfigDict(extra='allow', frozen=True)
|
14 |
+
|
15 |
type: str
|
16 |
event_ts: str
|
17 |
|
|
|
18 |
|
19 |
class SlackEvent(BaseModel):
|
20 |
"""Represents a general event from Slack."""
|
21 |
|
22 |
+
model_config = ConfigDict(frozen=True)
|
23 |
+
|
24 |
token: str
|
25 |
team_id: str
|
26 |
api_app_id: str
|
|
|
30 |
event_time: int
|
31 |
authed_users: tuple[str, ...]
|
32 |
|
|
|
33 |
|
34 |
class SlackUserTimestampPair(BaseModel):
|
35 |
"""Represents a Slack user-timestamp pair."""
|
36 |
|
37 |
+
model_config = ConfigDict(frozen=True)
|
38 |
+
|
39 |
user: str
|
40 |
ts: str
|
41 |
|
|
|
42 |
|
43 |
class SlackReaction(BaseModel):
|
44 |
"""Represents a Slack reaction information."""
|
45 |
|
46 |
+
model_config = ConfigDict(frozen=True)
|
47 |
+
|
48 |
name: str
|
49 |
count: PositiveInt
|
50 |
users: tuple[str, ...]
|
51 |
|
|
|
52 |
|
53 |
class SlackMessage(Content):
|
54 |
"""Represents a message from Slack after adaptation."""
|
|
|
82 |
"modificationTime": datetime.fromtimestamp(float(self.ts))
|
83 |
})
|
84 |
|
85 |
+
|
86 |
class SlackResponse(BaseModel): # TODO: This should also be based on Content as it is a SlackMessage―just not one for which we know the identity yet.
|
87 |
"""Represents a response message to be sent to Slack."""
|
88 |
|
89 |
+
model_config = ConfigDict(frozen=True)
|
90 |
+
|
91 |
text: str
|
92 |
channel: Optional[str]
|
93 |
thread_ts: Optional[str] = None
|
src/ctp_slack_bot/models/webvtt.py
CHANGED
@@ -18,14 +18,14 @@ SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
|
|
18 |
class WebVTTFrame(BaseModel):
|
19 |
"""Represents a WebVTT frame"""
|
20 |
|
|
|
|
|
21 |
identifier: str
|
22 |
start: timedelta
|
23 |
end: timedelta
|
24 |
speaker: Optional[str] = None
|
25 |
speech: str
|
26 |
|
27 |
-
model_config = ConfigDict(frozen=True)
|
28 |
-
|
29 |
@classmethod
|
30 |
def from_webvtt_caption(cls, caption: Caption, index: int) -> Self:
|
31 |
identifier = caption.identifier if caption.identifier else str(index)
|
@@ -42,7 +42,7 @@ class WebVTTContent(Content):
|
|
42 |
"""Represents parsed WebVTT content."""
|
43 |
|
44 |
id: str
|
45 |
-
metadata: Mapping[str, Any] = Field(default_factory=
|
46 |
start_time: Optional[datetime]
|
47 |
frames: tuple[WebVTTFrame, ...]
|
48 |
|
|
|
18 |
class WebVTTFrame(BaseModel):
|
19 |
"""Represents a WebVTT frame"""
|
20 |
|
21 |
+
model_config = ConfigDict(frozen=True)
|
22 |
+
|
23 |
identifier: str
|
24 |
start: timedelta
|
25 |
end: timedelta
|
26 |
speaker: Optional[str] = None
|
27 |
speech: str
|
28 |
|
|
|
|
|
29 |
@classmethod
|
30 |
def from_webvtt_caption(cls, caption: Caption, index: int) -> Self:
|
31 |
identifier = caption.identifier if caption.identifier else str(index)
|
|
|
42 |
"""Represents parsed WebVTT content."""
|
43 |
|
44 |
id: str
|
45 |
+
metadata: Mapping[str, Any] = Field(default_factory=lambda: MappingProxyType({}))
|
46 |
start_time: Optional[datetime]
|
47 |
frames: tuple[WebVTTFrame, ...]
|
48 |
|
src/ctp_slack_bot/services/__init__.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
|
|
|
2 |
from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
|
3 |
from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
|
4 |
from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
|
|
|
1 |
from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
|
2 |
+
from ctp_slack_bot.services.application_health_service import ApplicationHealthService
|
3 |
from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
|
4 |
from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
|
5 |
from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
|
src/ctp_slack_bot/services/answer_retrieval_service.py
CHANGED
@@ -1,30 +1,25 @@
|
|
1 |
from loguru import logger
|
2 |
-
from pydantic import
|
3 |
from typing import Collection, Self
|
4 |
|
5 |
-
from ctp_slack_bot.core import Settings
|
6 |
from ctp_slack_bot.enums import EventType
|
7 |
from ctp_slack_bot.models import Chunk, SlackMessage, SlackResponse
|
8 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
9 |
from ctp_slack_bot.services.language_model_service import LanguageModelService
|
10 |
|
11 |
|
12 |
-
class AnswerRetrievalService(
|
13 |
"""
|
14 |
Service for context-based answer retrievel from a language model.
|
15 |
"""
|
16 |
|
|
|
|
|
17 |
settings: Settings
|
18 |
event_brokerage_service: EventBrokerageService
|
19 |
language_model_service: LanguageModelService
|
20 |
|
21 |
-
class Config:
|
22 |
-
frozen=True
|
23 |
-
|
24 |
-
def __init__(self: Self, **data) -> None:
|
25 |
-
super().__init__(**data)
|
26 |
-
logger.debug("Created {}", self.__class__.__name__)
|
27 |
-
|
28 |
async def push(self: Self, question: SlackMessage, context: Collection[Chunk]) -> None:
|
29 |
channel_to_respond_to = question.channel
|
30 |
thread_to_respond_to = question.thread_ts if question.thread_ts else question.ts
|
@@ -32,3 +27,7 @@ class AnswerRetrievalService(BaseModel):
|
|
32 |
logger.debug("Pushing response to channel {} and thread {}: {}", channel_to_respond_to, thread_to_respond_to, answer)
|
33 |
slack_response = SlackResponse(text=answer, channel=channel_to_respond_to, thread_ts=thread_to_respond_to)
|
34 |
await self.event_brokerage_service.publish(EventType.OUTGOING_SLACK_RESPONSE, slack_response)
|
|
|
|
|
|
|
|
|
|
1 |
from loguru import logger
|
2 |
+
from pydantic import ConfigDict
|
3 |
from typing import Collection, Self
|
4 |
|
5 |
+
from ctp_slack_bot.core import ApplicationComponentBase, Settings
|
6 |
from ctp_slack_bot.enums import EventType
|
7 |
from ctp_slack_bot.models import Chunk, SlackMessage, SlackResponse
|
8 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
9 |
from ctp_slack_bot.services.language_model_service import LanguageModelService
|
10 |
|
11 |
|
12 |
+
class AnswerRetrievalService(ApplicationComponentBase):
|
13 |
"""
|
14 |
Service for context-based answer retrievel from a language model.
|
15 |
"""
|
16 |
|
17 |
+
model_config = ConfigDict(frozen=True)
|
18 |
+
|
19 |
settings: Settings
|
20 |
event_brokerage_service: EventBrokerageService
|
21 |
language_model_service: LanguageModelService
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
async def push(self: Self, question: SlackMessage, context: Collection[Chunk]) -> None:
|
24 |
channel_to_respond_to = question.channel
|
25 |
thread_to_respond_to = question.thread_ts if question.thread_ts else question.ts
|
|
|
27 |
logger.debug("Pushing response to channel {} and thread {}: {}", channel_to_respond_to, thread_to_respond_to, answer)
|
28 |
slack_response = SlackResponse(text=answer, channel=channel_to_respond_to, thread_ts=thread_to_respond_to)
|
29 |
await self.event_brokerage_service.publish(EventType.OUTGOING_SLACK_RESPONSE, slack_response)
|
30 |
+
|
31 |
+
@property
|
32 |
+
def name(self: Self) -> str:
|
33 |
+
return "answer_retrieval_service"
|
src/ctp_slack_bot/services/application_database_service.py
CHANGED
@@ -1,25 +1,20 @@
|
|
1 |
from datetime import datetime
|
2 |
from loguru import logger
|
3 |
-
from pydantic import
|
4 |
from typing import Iterable, Mapping, Self
|
5 |
|
6 |
-
from ctp_slack_bot.core import Settings
|
7 |
from ctp_slack_bot.db import MongoDB
|
8 |
|
9 |
|
10 |
-
class ApplicationDatabaseService(
|
11 |
"""Service for retrieving and persisting application state."""
|
12 |
|
|
|
|
|
13 |
settings: Settings
|
14 |
mongo_db: MongoDB # TODO: This should be replaced following the repository pattern―one repository class per collection.
|
15 |
|
16 |
-
class Config:
|
17 |
-
frozen=True
|
18 |
-
|
19 |
-
def __init__(self: Self, **data) -> None:
|
20 |
-
super().__init__(**data)
|
21 |
-
logger.debug("Created {}", self.__class__.__name__)
|
22 |
-
|
23 |
async def get_last_modification_times_by_file_paths(self: Self, file_paths: Iterable[str]) -> Mapping[str, datetime]:
|
24 |
"""Retrieve the last modification time for each file path."""
|
25 |
raise NotImplementedError() # TODO
|
@@ -27,3 +22,7 @@ class ApplicationDatabaseService(BaseModel):
|
|
27 |
async def set_last_modification_time_by_file_path(self: Self, file_path: str, modification_time: datetime) -> None:
|
28 |
"""Set the last modification time for a file path."""
|
29 |
raise NotImplementedError() # TODO
|
|
|
|
|
|
|
|
|
|
1 |
from datetime import datetime
|
2 |
from loguru import logger
|
3 |
+
from pydantic import ConfigDict
|
4 |
from typing import Iterable, Mapping, Self
|
5 |
|
6 |
+
from ctp_slack_bot.core import ApplicationComponentBase, Settings
|
7 |
from ctp_slack_bot.db import MongoDB
|
8 |
|
9 |
|
10 |
+
class ApplicationDatabaseService(ApplicationComponentBase):
|
11 |
"""Service for retrieving and persisting application state."""
|
12 |
|
13 |
+
model_config = ConfigDict(frozen=True)
|
14 |
+
|
15 |
settings: Settings
|
16 |
mongo_db: MongoDB # TODO: This should be replaced following the repository pattern―one repository class per collection.
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
async def get_last_modification_times_by_file_paths(self: Self, file_paths: Iterable[str]) -> Mapping[str, datetime]:
|
19 |
"""Retrieve the last modification time for each file path."""
|
20 |
raise NotImplementedError() # TODO
|
|
|
22 |
async def set_last_modification_time_by_file_path(self: Self, file_path: str, modification_time: datetime) -> None:
|
23 |
"""Set the last modification time for a file path."""
|
24 |
raise NotImplementedError() # TODO
|
25 |
+
|
26 |
+
@property
|
27 |
+
def name(self: Self) -> str:
|
28 |
+
return "application_database_service"
|
src/ctp_slack_bot/services/application_health_service.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from loguru import logger
|
2 |
+
from pydantic import ConfigDict
|
3 |
+
from types import MappingProxyType
|
4 |
+
from typing import Collection, Mapping, Self
|
5 |
+
|
6 |
+
from ctp_slack_bot.core import ApplicationComponentBase, HealthReportingApplicationComponentBase
|
7 |
+
|
8 |
+
|
9 |
+
class ApplicationHealthService(ApplicationComponentBase):
|
10 |
+
"""
|
11 |
+
Service for checking and reporting application health.
|
12 |
+
"""
|
13 |
+
|
14 |
+
model_config = ConfigDict(frozen=True)
|
15 |
+
|
16 |
+
services: list[HealthReportingApplicationComponentBase]
|
17 |
+
|
18 |
+
async def get_health(self: Self) -> Mapping[str, bool]:
|
19 |
+
return MappingProxyType({service.name: await service.is_healthy()
|
20 |
+
for service
|
21 |
+
in self.services})
|
22 |
+
|
23 |
+
@property
|
24 |
+
def name(self: Self) -> str:
|
25 |
+
return "application_health_service"
|
src/ctp_slack_bot/services/content_ingestion_service.py
CHANGED
@@ -1,32 +1,31 @@
|
|
1 |
from loguru import logger
|
2 |
-
from pydantic import
|
3 |
-
from typing import Self, Sequence, Set
|
4 |
|
5 |
-
from ctp_slack_bot.core import Settings
|
6 |
from ctp_slack_bot.db.repositories import VectorizedChunkRepository
|
7 |
from ctp_slack_bot.enums import EventType
|
8 |
from ctp_slack_bot.models import Chunk, Content, SlackMessage
|
9 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
10 |
from ctp_slack_bot.services.vectorization_service import VectorizationService
|
11 |
|
12 |
-
|
|
|
13 |
"""
|
14 |
Service for ingesting content.
|
15 |
"""
|
16 |
|
|
|
|
|
17 |
settings: Settings
|
18 |
event_brokerage_service: EventBrokerageService
|
19 |
vectorized_chunk_repository: VectorizedChunkRepository
|
20 |
vectorization_service: VectorizationService
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
def __init__(self: Self, **data) -> None:
|
26 |
-
super().__init__(**data)
|
27 |
self.event_brokerage_service.subscribe(EventType.INCOMING_CONTENT, self.process_incoming_content)
|
28 |
# self.event_brokerage_service.subscribe(EventType.INCOMING_SLACK_MESSAGE, self.process_incoming_slack_message)
|
29 |
-
logger.debug("Created {}", self.__class__.__name__)
|
30 |
|
31 |
async def process_incoming_content(self: Self, content: Content) -> None:
|
32 |
logger.debug("Content ingestion service received content with metadata: {}", content.get_metadata())
|
@@ -46,3 +45,7 @@ class ContentIngestionService(BaseModel):
|
|
46 |
async def __vectorize_and_store_chunks_in_database(self: Self, chunks: Sequence[Chunk]) -> Set[str]:
|
47 |
vectorized_chunks = await self.vectorization_service.vectorize(chunks)
|
48 |
return await self.vectorized_chunk_repository.insert_many(vectorized_chunks)
|
|
|
|
|
|
|
|
|
|
1 |
from loguru import logger
|
2 |
+
from pydantic import ConfigDict
|
3 |
+
from typing import Any, Self, Sequence, Set
|
4 |
|
5 |
+
from ctp_slack_bot.core import ApplicationComponentBase, Settings
|
6 |
from ctp_slack_bot.db.repositories import VectorizedChunkRepository
|
7 |
from ctp_slack_bot.enums import EventType
|
8 |
from ctp_slack_bot.models import Chunk, Content, SlackMessage
|
9 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
10 |
from ctp_slack_bot.services.vectorization_service import VectorizationService
|
11 |
|
12 |
+
|
13 |
+
class ContentIngestionService(ApplicationComponentBase):
|
14 |
"""
|
15 |
Service for ingesting content.
|
16 |
"""
|
17 |
|
18 |
+
model_config = ConfigDict(frozen=True)
|
19 |
+
|
20 |
settings: Settings
|
21 |
event_brokerage_service: EventBrokerageService
|
22 |
vectorized_chunk_repository: VectorizedChunkRepository
|
23 |
vectorization_service: VectorizationService
|
24 |
|
25 |
+
def model_post_init(self: Self, context: Any, /) -> None:
|
26 |
+
super().model_post_init(context)
|
|
|
|
|
|
|
27 |
self.event_brokerage_service.subscribe(EventType.INCOMING_CONTENT, self.process_incoming_content)
|
28 |
# self.event_brokerage_service.subscribe(EventType.INCOMING_SLACK_MESSAGE, self.process_incoming_slack_message)
|
|
|
29 |
|
30 |
async def process_incoming_content(self: Self, content: Content) -> None:
|
31 |
logger.debug("Content ingestion service received content with metadata: {}", content.get_metadata())
|
|
|
45 |
async def __vectorize_and_store_chunks_in_database(self: Self, chunks: Sequence[Chunk]) -> Set[str]:
|
46 |
vectorized_chunks = await self.vectorization_service.vectorize(chunks)
|
47 |
return await self.vectorized_chunk_repository.insert_many(vectorized_chunks)
|
48 |
+
|
49 |
+
@property
|
50 |
+
def name(self: Self) -> str:
|
51 |
+
return "content_ingestion_service"
|
src/ctp_slack_bot/services/context_retrieval_service.py
CHANGED
@@ -1,28 +1,24 @@
|
|
1 |
from loguru import logger
|
2 |
-
from pydantic import
|
3 |
from typing import Self, Sequence
|
4 |
|
5 |
-
from ctp_slack_bot.core
|
6 |
from ctp_slack_bot.db.repositories import VectorizedChunkRepository
|
7 |
from ctp_slack_bot.models import Chunk, SlackMessage, VectorQuery, VectorizedChunk
|
8 |
from ctp_slack_bot.services.vectorization_service import VectorizationService
|
9 |
|
10 |
-
|
|
|
11 |
"""
|
12 |
Service for retrieving relevant context from the vector database based on user questions.
|
13 |
"""
|
14 |
|
|
|
|
|
15 |
settings: Settings
|
16 |
vectorization_service: VectorizationService
|
17 |
vectorized_chunk_repository: VectorizedChunkRepository
|
18 |
|
19 |
-
class Config:
|
20 |
-
frozen=True
|
21 |
-
|
22 |
-
def __init__(self: Self, **data) -> None:
|
23 |
-
super().__init__(**data)
|
24 |
-
logger.debug("Created {}", self.__class__.__name__)
|
25 |
-
|
26 |
async def get_context(self: Self, message: SlackMessage) -> Sequence[Chunk]:
|
27 |
"""
|
28 |
Retrieve relevant context for a given SlackMessage by vectorizing the message and
|
@@ -43,8 +39,8 @@ class ContextRetrievalService(BaseModel):
|
|
43 |
|
44 |
query = VectorQuery(
|
45 |
query_embeddings=vectorized_message_chunks[0].embedding,
|
46 |
-
k=self.settings.
|
47 |
-
score_threshold=self.settings.
|
48 |
filter_metadata={} # Can be expanded to include filters based on message metadata
|
49 |
)
|
50 |
|
@@ -55,3 +51,7 @@ class ContextRetrievalService(BaseModel):
|
|
55 |
except Exception as e:
|
56 |
logger.error("An error occurred while searching the vector database for context: {}", e)
|
57 |
return ()
|
|
|
|
|
|
|
|
|
|
1 |
from loguru import logger
|
2 |
+
from pydantic import ConfigDict
|
3 |
from typing import Self, Sequence
|
4 |
|
5 |
+
from ctp_slack_bot.core import ApplicationComponentBase, Settings
|
6 |
from ctp_slack_bot.db.repositories import VectorizedChunkRepository
|
7 |
from ctp_slack_bot.models import Chunk, SlackMessage, VectorQuery, VectorizedChunk
|
8 |
from ctp_slack_bot.services.vectorization_service import VectorizationService
|
9 |
|
10 |
+
|
11 |
+
class ContextRetrievalService(ApplicationComponentBase):
|
12 |
"""
|
13 |
Service for retrieving relevant context from the vector database based on user questions.
|
14 |
"""
|
15 |
|
16 |
+
model_config = ConfigDict(frozen=True)
|
17 |
+
|
18 |
settings: Settings
|
19 |
vectorization_service: VectorizationService
|
20 |
vectorized_chunk_repository: VectorizedChunkRepository
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
async def get_context(self: Self, message: SlackMessage) -> Sequence[Chunk]:
|
23 |
"""
|
24 |
Retrieve relevant context for a given SlackMessage by vectorizing the message and
|
|
|
39 |
|
40 |
query = VectorQuery(
|
41 |
query_embeddings=vectorized_message_chunks[0].embedding,
|
42 |
+
k=self.settings.top_k_matches,
|
43 |
+
score_threshold=self.settings.score_threshold,
|
44 |
filter_metadata={} # Can be expanded to include filters based on message metadata
|
45 |
)
|
46 |
|
|
|
51 |
except Exception as e:
|
52 |
logger.error("An error occurred while searching the vector database for context: {}", e)
|
53 |
return ()
|
54 |
+
|
55 |
+
@property
|
56 |
+
def name(self: Self) -> str:
|
57 |
+
return "context_retrieval_service"
|
src/ctp_slack_bot/services/embeddings_model_service.py
CHANGED
@@ -1,25 +1,24 @@
|
|
1 |
from loguru import logger
|
2 |
from openai import AsyncOpenAI
|
3 |
-
from pydantic import
|
4 |
-
from typing import Any,
|
5 |
|
6 |
-
from ctp_slack_bot.core import Settings
|
7 |
|
8 |
-
|
|
|
9 |
"""
|
10 |
Service for embeddings model operations.
|
11 |
"""
|
12 |
|
13 |
-
|
14 |
-
_open_ai_client: PrivateAttr = PrivateAttr()
|
15 |
|
16 |
-
|
17 |
-
|
18 |
|
19 |
-
def
|
20 |
-
super().
|
21 |
-
self._open_ai_client = AsyncOpenAI(api_key=self.settings.
|
22 |
-
logger.debug("Created {}", self.__class__.__name__)
|
23 |
|
24 |
async def get_embeddings(self: Self, texts: Sequence[str]) -> Sequence[Sequence[float]]:
|
25 |
"""Get embeddings for a collection of texts using OpenAI’s API.
|
@@ -28,20 +27,24 @@ class EmbeddingsModelService(BaseModel):
|
|
28 |
texts (Collection[str]): Collection of text chunks to embed
|
29 |
|
30 |
Returns:
|
31 |
-
NDArray: Array of embeddings with shape (n_texts,
|
32 |
|
33 |
Raises:
|
34 |
ValueError: If the embedding dimensions don't match expected size
|
35 |
"""
|
36 |
logger.debug("Creating embeddings for {} text string(s)…", len(texts))
|
37 |
response = await self._open_ai_client.embeddings.create(
|
38 |
-
model=self.settings.
|
39 |
input=texts,
|
40 |
encoding_format="float" # Ensure we get raw float values.
|
41 |
)
|
42 |
embeddings = tuple(tuple(data.embedding) for data in response.data)
|
43 |
match embeddings:
|
44 |
-
case (first, _) if len(first) != self.settings.
|
45 |
-
logger.error("Embedding dimension mismatch and/or misconfiguration: expected configured dimension {}, but got {}.", self.settings.
|
46 |
raise ValueError() # TODO: raise a more specific type.
|
47 |
return embeddings
|
|
|
|
|
|
|
|
|
|
1 |
from loguru import logger
|
2 |
from openai import AsyncOpenAI
|
3 |
+
from pydantic import ConfigDict
|
4 |
+
from typing import Any, Sequence, Self
|
5 |
|
6 |
+
from ctp_slack_bot.core import ApplicationComponentBase, Settings
|
7 |
|
8 |
+
|
9 |
+
class EmbeddingsModelService(ApplicationComponentBase):
|
10 |
"""
|
11 |
Service for embeddings model operations.
|
12 |
"""
|
13 |
|
14 |
+
model_config = ConfigDict(frozen=True)
|
|
|
15 |
|
16 |
+
settings: Settings
|
17 |
+
_open_ai_client: AsyncOpenAI
|
18 |
|
19 |
+
def model_post_init(self: Self, context: Any, /) -> None:
|
20 |
+
super().model_post_init(context)
|
21 |
+
self._open_ai_client = AsyncOpenAI(api_key=self.settings.openai_api_key.get_secret_value())
|
|
|
22 |
|
23 |
async def get_embeddings(self: Self, texts: Sequence[str]) -> Sequence[Sequence[float]]:
|
24 |
"""Get embeddings for a collection of texts using OpenAI’s API.
|
|
|
27 |
texts (Collection[str]): Collection of text chunks to embed
|
28 |
|
29 |
Returns:
|
30 |
+
NDArray: Array of embeddings with shape (n_texts, vector_dimension)
|
31 |
|
32 |
Raises:
|
33 |
ValueError: If the embedding dimensions don't match expected size
|
34 |
"""
|
35 |
logger.debug("Creating embeddings for {} text string(s)…", len(texts))
|
36 |
response = await self._open_ai_client.embeddings.create(
|
37 |
+
model=self.settings.embedding_model,
|
38 |
input=texts,
|
39 |
encoding_format="float" # Ensure we get raw float values.
|
40 |
)
|
41 |
embeddings = tuple(tuple(data.embedding) for data in response.data)
|
42 |
match embeddings:
|
43 |
+
case (first, _) if len(first) != self.settings.vector_dimension:
|
44 |
+
logger.error("Embedding dimension mismatch and/or misconfiguration: expected configured dimension {}, but got {}.", self.settings.vector_dimension, len(first))
|
45 |
raise ValueError() # TODO: raise a more specific type.
|
46 |
return embeddings
|
47 |
+
|
48 |
+
@property
|
49 |
+
def name(self: Self) -> str:
|
50 |
+
return "embeddings_model_service"
|
src/ctp_slack_bot/services/event_brokerage_service.py
CHANGED
@@ -1,24 +1,21 @@
|
|
1 |
from asyncio import create_task, iscoroutinefunction, to_thread
|
2 |
from collections import defaultdict
|
3 |
from loguru import logger
|
4 |
-
from pydantic import
|
5 |
-
from typing import Any, Callable,
|
6 |
|
|
|
7 |
from ctp_slack_bot.enums import EventType
|
8 |
|
9 |
-
|
|
|
10 |
"""
|
11 |
Service for brokering events between services.
|
12 |
"""
|
13 |
|
14 |
-
|
15 |
-
|
16 |
-
class Config:
|
17 |
-
frozen=True
|
18 |
|
19 |
-
|
20 |
-
super().__init__(**data)
|
21 |
-
logger.debug("Created {}", self.__class__.__name__)
|
22 |
|
23 |
def subscribe(self: Self, type: EventType, callback: Callable) -> None:
|
24 |
"""Subscribe to an event type with a callback function."""
|
@@ -45,3 +42,7 @@ class EventBrokerageService(BaseModel):
|
|
45 |
create_task(to_thread(callback, data))
|
46 |
except Exception as e:
|
47 |
logger.error("Error scheduling synchronous callback to handle event {}: {}", type, e)
|
|
|
|
|
|
|
|
|
|
1 |
from asyncio import create_task, iscoroutinefunction, to_thread
|
2 |
from collections import defaultdict
|
3 |
from loguru import logger
|
4 |
+
from pydantic import ConfigDict, PrivateAttr
|
5 |
+
from typing import Any, Callable, List, MutableMapping, Self
|
6 |
|
7 |
+
from ctp_slack_bot.core import ApplicationComponentBase
|
8 |
from ctp_slack_bot.enums import EventType
|
9 |
|
10 |
+
|
11 |
+
class EventBrokerageService(ApplicationComponentBase):
|
12 |
"""
|
13 |
Service for brokering events between services.
|
14 |
"""
|
15 |
|
16 |
+
model_config = ConfigDict(frozen=True)
|
|
|
|
|
|
|
17 |
|
18 |
+
_subscribers: MutableMapping[EventType, list[Callable]] = PrivateAttr(default_factory=lambda: defaultdict(list))
|
|
|
|
|
19 |
|
20 |
def subscribe(self: Self, type: EventType, callback: Callable) -> None:
|
21 |
"""Subscribe to an event type with a callback function."""
|
|
|
42 |
create_task(to_thread(callback, data))
|
43 |
except Exception as e:
|
44 |
logger.error("Error scheduling synchronous callback to handle event {}: {}", type, e)
|
45 |
+
|
46 |
+
@property
|
47 |
+
def name(self: Self) -> str:
|
48 |
+
return "event_brokerage_service"
|
src/ctp_slack_bot/services/google_drive_service.py
CHANGED
@@ -1,17 +1,17 @@
|
|
1 |
from datetime import datetime
|
2 |
from cachetools import TTLCache
|
3 |
from functools import reduce
|
4 |
-
from google.oauth2 import
|
5 |
-
from googleapiclient.discovery import build
|
6 |
from googleapiclient.http import MediaIoBaseDownload
|
7 |
from googleapiclient.errors import HttpError
|
8 |
from io import BytesIO
|
9 |
from itertools import chain
|
10 |
from loguru import logger
|
11 |
-
from pydantic import
|
12 |
-
from typing import Collection, Optional, Self
|
13 |
|
14 |
-
from ctp_slack_bot.core import Settings
|
15 |
from ctp_slack_bot.models import GoogleDriveMetadata
|
16 |
|
17 |
|
@@ -19,40 +19,39 @@ FOLDER_MIME_TYPE: str = "application/vnd.google-apps.folder"
|
|
19 |
PATH_SEPARATOR: str = "/"
|
20 |
|
21 |
|
22 |
-
class GoogleDriveService(
|
23 |
"""Service for interacting with Google Drive."""
|
24 |
|
25 |
-
|
26 |
-
_google_drive_client: PrivateAttr = PrivateAttr()
|
27 |
-
_folder_cache: PrivateAttr = PrivateAttr(default_factory=lambda: TTLCache(maxsize=256, ttl=60))
|
28 |
|
29 |
-
|
30 |
-
|
|
|
31 |
|
32 |
-
def
|
33 |
-
super().
|
34 |
-
credentials =
|
35 |
"type": "service_account",
|
36 |
-
"project_id": self.settings.
|
37 |
-
"private_key_id": self.settings.
|
38 |
-
"private_key": self.settings.
|
39 |
-
"client_email": self.settings.
|
40 |
-
"client_id": self.settings.
|
41 |
-
"token_uri": self.settings.
|
42 |
}, scopes=["https://www.googleapis.com/auth/drive"])
|
43 |
self._google_drive_client = build('drive', 'v3', credentials=credentials)
|
44 |
-
logger.
|
45 |
|
46 |
def _resolve_folder_id(self: Self, folder_path: str) -> Optional[str]:
|
47 |
"""Resolve a folder path to a Google Drive ID."""
|
48 |
|
49 |
if not folder_path:
|
50 |
-
return self.settings.
|
51 |
|
52 |
if folder_path in self._folder_cache:
|
53 |
return self._folder_cache[folder_path]
|
54 |
|
55 |
-
current_id = self.settings.
|
56 |
try:
|
57 |
for part in folder_path.split(PATH_SEPARATOR):
|
58 |
results = self._google_drive_client.files().list(
|
@@ -112,7 +111,7 @@ class GoogleDriveService(BaseModel):
|
|
112 |
match item_path.rsplit(PATH_SEPARATOR, 1):
|
113 |
case [item_name]:
|
114 |
folder_path = ""
|
115 |
-
folder_id = self.settings.
|
116 |
case [folder_path, item_name]:
|
117 |
folder_id = self._resolve_folder_id(folder_path)
|
118 |
|
@@ -151,3 +150,7 @@ class GoogleDriveService(BaseModel):
|
|
151 |
except HttpError as e:
|
152 |
logger.error("Error reading file by ID, {}: {}", file_id, e)
|
153 |
return None
|
|
|
|
|
|
|
|
|
|
1 |
from datetime import datetime
|
2 |
from cachetools import TTLCache
|
3 |
from functools import reduce
|
4 |
+
from google.oauth2.service_account import Credentials
|
5 |
+
from googleapiclient.discovery import build, Resource
|
6 |
from googleapiclient.http import MediaIoBaseDownload
|
7 |
from googleapiclient.errors import HttpError
|
8 |
from io import BytesIO
|
9 |
from itertools import chain
|
10 |
from loguru import logger
|
11 |
+
from pydantic import ConfigDict, PrivateAttr
|
12 |
+
from typing import Any, Collection, Optional, Self
|
13 |
|
14 |
+
from ctp_slack_bot.core import ApplicationComponentBase, Settings
|
15 |
from ctp_slack_bot.models import GoogleDriveMetadata
|
16 |
|
17 |
|
|
|
19 |
PATH_SEPARATOR: str = "/"
|
20 |
|
21 |
|
22 |
+
class GoogleDriveService(ApplicationComponentBase):
|
23 |
"""Service for interacting with Google Drive."""
|
24 |
|
25 |
+
model_config = ConfigDict(frozen=True)
|
|
|
|
|
26 |
|
27 |
+
settings: Settings
|
28 |
+
_google_drive_client: Resource
|
29 |
+
_folder_cache: TTLCache = PrivateAttr(default_factory=lambda: TTLCache(maxsize=256, ttl=60))
|
30 |
|
31 |
+
def model_post_init(self: Self, context: Any, /) -> None:
|
32 |
+
super().model_post_init(context)
|
33 |
+
credentials = Credentials.from_service_account_info({
|
34 |
"type": "service_account",
|
35 |
+
"project_id": self.settings.google_project_id,
|
36 |
+
"private_key_id": self.settings.google_private_key_id.get_secret_value(),
|
37 |
+
"private_key": self.settings.google_private_key.get_secret_value(),
|
38 |
+
"client_email": self.settings.google_client_email,
|
39 |
+
"client_id": self.settings.google_client_id,
|
40 |
+
"token_uri": self.settings.google_token_uri,
|
41 |
}, scopes=["https://www.googleapis.com/auth/drive"])
|
42 |
self._google_drive_client = build('drive', 'v3', credentials=credentials)
|
43 |
+
logger.info(type(self._google_drive_client))
|
44 |
|
45 |
def _resolve_folder_id(self: Self, folder_path: str) -> Optional[str]:
|
46 |
"""Resolve a folder path to a Google Drive ID."""
|
47 |
|
48 |
if not folder_path:
|
49 |
+
return self.settings.google_drive_root_id
|
50 |
|
51 |
if folder_path in self._folder_cache:
|
52 |
return self._folder_cache[folder_path]
|
53 |
|
54 |
+
current_id = self.settings.google_drive_root_id
|
55 |
try:
|
56 |
for part in folder_path.split(PATH_SEPARATOR):
|
57 |
results = self._google_drive_client.files().list(
|
|
|
111 |
match item_path.rsplit(PATH_SEPARATOR, 1):
|
112 |
case [item_name]:
|
113 |
folder_path = ""
|
114 |
+
folder_id = self.settings.google_drive_root_id
|
115 |
case [folder_path, item_name]:
|
116 |
folder_id = self._resolve_folder_id(folder_path)
|
117 |
|
|
|
150 |
except HttpError as e:
|
151 |
logger.error("Error reading file by ID, {}: {}", file_id, e)
|
152 |
return None
|
153 |
+
|
154 |
+
@property
|
155 |
+
def name(self: Self) -> str:
|
156 |
+
return "google_drive_service"
|
src/ctp_slack_bot/services/http_client_service.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dependency_injector.resources import AsyncResource
|
2 |
+
from httpx import AsyncClient
|
3 |
+
from typing import Self
|
4 |
+
|
5 |
+
|
6 |
+
# TODO: Implement HTTPClientService to abstract away underlying HTTP client.
|
7 |
+
|
8 |
+
|
9 |
+
class HTTPClientServiceResource(AsyncResource):
|
10 |
+
async def init(self: Self) -> AsyncClient:
|
11 |
+
return AsyncClient()
|
12 |
+
|
13 |
+
async def shutdown(self: Self, client: AsyncClient) -> None:
|
14 |
+
await client.aclose()
|
src/ctp_slack_bot/services/http_server_service.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from aiohttp.web import Application, AppRunner, Response, TCPSite
|
2 |
+
from dependency_injector.resources import AsyncResource
|
3 |
+
from pydantic import ConfigDict
|
4 |
+
from typing import Self
|
5 |
+
|
6 |
+
from ctp_slack_bot.core import ApplicationComponentBase, Settings
|
7 |
+
|
8 |
+
|
9 |
+
class HTTPServerService(ApplicationComponentBase):
|
10 |
+
model_config = ConfigDict(frozen=True)
|
11 |
+
|
12 |
+
settings: Settings
|
13 |
+
|
14 |
+
async def listen(self: Self) -> None:
|
15 |
+
pass
|
src/ctp_slack_bot/services/language_model_service.py
CHANGED
@@ -1,28 +1,26 @@
|
|
1 |
from datetime import datetime
|
2 |
from loguru import logger
|
3 |
from openai import AsyncOpenAI
|
4 |
-
from
|
5 |
-
from
|
6 |
-
from typing import Collection, Self
|
7 |
|
8 |
-
from ctp_slack_bot.core import Settings
|
9 |
from ctp_slack_bot.models import Chunk
|
10 |
|
11 |
-
|
|
|
12 |
"""
|
13 |
Service for language model operations.
|
14 |
"""
|
15 |
|
16 |
-
|
17 |
-
_open_ai_client: PrivateAttr = PrivateAttr()
|
18 |
|
19 |
-
|
20 |
-
|
21 |
|
22 |
-
def
|
23 |
-
super().
|
24 |
-
self._open_ai_client = AsyncOpenAI(api_key=self.settings.
|
25 |
-
logger.debug("Created {}", self.__class__.__name__)
|
26 |
|
27 |
async def answer_question(self, asker: str, question: str, context: Collection[Chunk]) -> str: # TODO: generify into just another agent.
|
28 |
"""Generate a response using OpenAI’s API with retrieved context.
|
@@ -36,7 +34,7 @@ class LanguageModelService(BaseModel):
|
|
36 |
"""
|
37 |
logger.debug("Generating response for question “{}” using {} context chunks…", question, len(context))
|
38 |
messages = [
|
39 |
-
{"role": "system", "content": self.settings.
|
40 |
{"role": "user", "content": (
|
41 |
f"""Inquirer Name: {asker}
|
42 |
|
@@ -48,11 +46,14 @@ class LanguageModelService(BaseModel):
|
|
48 |
Context from class materials and transcripts:
|
49 |
{'\n\n'.join(chunk.text for chunk in context)}""")}
|
50 |
]
|
51 |
-
response
|
52 |
-
model=self.settings.
|
53 |
messages=messages,
|
54 |
-
max_tokens=self.settings.
|
55 |
-
temperature=self.settings.
|
56 |
)
|
57 |
-
|
58 |
return response.choices[0].message.content
|
|
|
|
|
|
|
|
|
|
1 |
from datetime import datetime
|
2 |
from loguru import logger
|
3 |
from openai import AsyncOpenAI
|
4 |
+
from pydantic import ConfigDict
|
5 |
+
from typing import Any, Collection, Self
|
|
|
6 |
|
7 |
+
from ctp_slack_bot.core import ApplicationComponentBase, Settings
|
8 |
from ctp_slack_bot.models import Chunk
|
9 |
|
10 |
+
|
11 |
+
class LanguageModelService(ApplicationComponentBase):
|
12 |
"""
|
13 |
Service for language model operations.
|
14 |
"""
|
15 |
|
16 |
+
model_config = ConfigDict(arbitrary_types_allowed=True, frozen=True)
|
|
|
17 |
|
18 |
+
settings: Settings
|
19 |
+
_open_ai_client: AsyncOpenAI
|
20 |
|
21 |
+
def model_post_init(self: Self, context: Any, /) -> None:
|
22 |
+
super().model_post_init(context)
|
23 |
+
self._open_ai_client = AsyncOpenAI(api_key=self.settings.openai_api_key.get_secret_value())
|
|
|
24 |
|
25 |
async def answer_question(self, asker: str, question: str, context: Collection[Chunk]) -> str: # TODO: generify into just another agent.
|
26 |
"""Generate a response using OpenAI’s API with retrieved context.
|
|
|
34 |
"""
|
35 |
logger.debug("Generating response for question “{}” using {} context chunks…", question, len(context))
|
36 |
messages = [
|
37 |
+
{"role": "system", "content": self.settings.system_prompt},
|
38 |
{"role": "user", "content": (
|
39 |
f"""Inquirer Name: {asker}
|
40 |
|
|
|
46 |
Context from class materials and transcripts:
|
47 |
{'\n\n'.join(chunk.text for chunk in context)}""")}
|
48 |
]
|
49 |
+
response = await self._open_ai_client.chat.completions.create(
|
50 |
+
model=self.settings.chat_model,
|
51 |
messages=messages,
|
52 |
+
max_tokens=self.settings.max_tokens,
|
53 |
+
temperature=self.settings.temperature
|
54 |
)
|
|
|
55 |
return response.choices[0].message.content
|
56 |
+
|
57 |
+
@property
|
58 |
+
def name(self: Self) -> str:
|
59 |
+
return "language_model_service"
|
src/ctp_slack_bot/services/question_dispatch_service.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
from loguru import logger
|
2 |
-
from pydantic import
|
3 |
-
from typing import Self
|
4 |
|
5 |
-
from ctp_slack_bot.core import Settings
|
6 |
from ctp_slack_bot.enums import EventType
|
7 |
from ctp_slack_bot.models import Chunk, SlackMessage
|
8 |
from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
|
@@ -10,26 +10,28 @@ from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalSer
|
|
10 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
11 |
|
12 |
|
13 |
-
class QuestionDispatchService(
|
14 |
"""
|
15 |
Service for determining whether a Slack message constitutes a question.
|
16 |
"""
|
17 |
|
|
|
|
|
18 |
settings: Settings
|
19 |
event_brokerage_service: EventBrokerageService
|
20 |
context_retrieval_service: ContextRetrievalService
|
21 |
answer_retrieval_service: AnswerRetrievalService
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
def __init__(self: Self, **data) -> None:
|
27 |
-
super().__init__(**data)
|
28 |
self.event_brokerage_service.subscribe(EventType.INCOMING_SLACK_MESSAGE, self.__process_incoming_slack_message)
|
29 |
-
logger.debug("Created {}", self.__class__.__name__)
|
30 |
|
31 |
async def __process_incoming_slack_message(self: Self, message: SlackMessage) -> None:
|
32 |
if message.subtype != 'bot_message':
|
33 |
logger.debug("Question dispatch service received an answerable question: {}", message.text)
|
34 |
context = await self.context_retrieval_service.get_context(message)
|
35 |
await self.answer_retrieval_service.push(message, context)
|
|
|
|
|
|
|
|
|
|
1 |
from loguru import logger
|
2 |
+
from pydantic import ConfigDict
|
3 |
+
from typing import Any, Self
|
4 |
|
5 |
+
from ctp_slack_bot.core import ApplicationComponentBase, Settings
|
6 |
from ctp_slack_bot.enums import EventType
|
7 |
from ctp_slack_bot.models import Chunk, SlackMessage
|
8 |
from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
|
|
|
10 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
11 |
|
12 |
|
13 |
+
class QuestionDispatchService(ApplicationComponentBase):
|
14 |
"""
|
15 |
Service for determining whether a Slack message constitutes a question.
|
16 |
"""
|
17 |
|
18 |
+
model_config = ConfigDict(arbitrary_types_allowed=True, frozen=True)
|
19 |
+
|
20 |
settings: Settings
|
21 |
event_brokerage_service: EventBrokerageService
|
22 |
context_retrieval_service: ContextRetrievalService
|
23 |
answer_retrieval_service: AnswerRetrievalService
|
24 |
|
25 |
+
def model_post_init(self: Self, context: Any, /) -> None:
|
26 |
+
super().model_post_init(context)
|
|
|
|
|
|
|
27 |
self.event_brokerage_service.subscribe(EventType.INCOMING_SLACK_MESSAGE, self.__process_incoming_slack_message)
|
|
|
28 |
|
29 |
async def __process_incoming_slack_message(self: Self, message: SlackMessage) -> None:
|
30 |
if message.subtype != 'bot_message':
|
31 |
logger.debug("Question dispatch service received an answerable question: {}", message.text)
|
32 |
context = await self.context_retrieval_service.get_context(message)
|
33 |
await self.answer_retrieval_service.push(message, context)
|
34 |
+
|
35 |
+
@property
|
36 |
+
def name(self: Self) -> str:
|
37 |
+
return "question_dispatch_service"
|
src/ctp_slack_bot/services/schedule_service.py
CHANGED
@@ -4,29 +4,26 @@ from asyncio import create_task, iscoroutinefunction, to_thread
|
|
4 |
from datetime import datetime
|
5 |
from dependency_injector.resources import Resource
|
6 |
from loguru import logger
|
7 |
-
from pydantic import
|
8 |
from pytz import timezone
|
9 |
-
from typing import Optional, Self
|
10 |
|
11 |
-
from ctp_slack_bot.core import Settings
|
12 |
|
13 |
-
|
|
|
14 |
"""
|
15 |
Service for running scheduled tasks.
|
16 |
"""
|
17 |
|
18 |
-
|
19 |
-
_scheduler: PrivateAttr
|
20 |
|
21 |
-
|
22 |
-
|
23 |
|
24 |
-
def
|
25 |
-
super().
|
26 |
-
|
27 |
-
self._configure_jobs()
|
28 |
-
self._scheduler = AsyncIOScheduler(timezone=timezone(zone))
|
29 |
-
logger.debug("Created {}", self.__class__.__name__)
|
30 |
|
31 |
def _configure_jobs(self: Self) -> None:
|
32 |
# Example jobs (uncomment and implement as needed)
|
@@ -55,6 +52,11 @@ class ScheduleService(BaseModel):
|
|
55 |
else:
|
56 |
logger.debug("The scheduler is not running. There is no scheduler to shut down.")
|
57 |
|
|
|
|
|
|
|
|
|
|
|
58 |
class ScheduleServiceResource(Resource):
|
59 |
def init(self: Self, settings: Settings) -> ScheduleService:
|
60 |
logger.info("Starting scheduler…")
|
|
|
4 |
from datetime import datetime
|
5 |
from dependency_injector.resources import Resource
|
6 |
from loguru import logger
|
7 |
+
from pydantic import ConfigDict
|
8 |
from pytz import timezone
|
9 |
+
from typing import Any, Optional, Self
|
10 |
|
11 |
+
from ctp_slack_bot.core import ApplicationComponentBase, Settings
|
12 |
|
13 |
+
|
14 |
+
class ScheduleService(ApplicationComponentBase):
|
15 |
"""
|
16 |
Service for running scheduled tasks.
|
17 |
"""
|
18 |
|
19 |
+
model_config = ConfigDict(frozen=True)
|
|
|
20 |
|
21 |
+
settings: Settings
|
22 |
+
_scheduler: AsyncIOScheduler
|
23 |
|
24 |
+
def model_post_init(self: Self, context: Any, /) -> None:
|
25 |
+
super().model_post_init(context)
|
26 |
+
self._scheduler = AsyncIOScheduler(timezone=timezone(self.settings.scheduler_timezone))
|
|
|
|
|
|
|
27 |
|
28 |
def _configure_jobs(self: Self) -> None:
|
29 |
# Example jobs (uncomment and implement as needed)
|
|
|
52 |
else:
|
53 |
logger.debug("The scheduler is not running. There is no scheduler to shut down.")
|
54 |
|
55 |
+
@property
|
56 |
+
def name(self: Self) -> str:
|
57 |
+
return "schedule_service"
|
58 |
+
|
59 |
+
|
60 |
class ScheduleServiceResource(Resource):
|
61 |
def init(self: Self, settings: Settings) -> ScheduleService:
|
62 |
logger.info("Starting scheduler…")
|
src/ctp_slack_bot/services/slack_service.py
CHANGED
@@ -1,12 +1,14 @@
|
|
1 |
from dependency_injector.resources import AsyncResource
|
|
|
2 |
from loguru import logger
|
3 |
from openai import OpenAI
|
4 |
-
from pydantic import
|
5 |
from re import compile as compile_re
|
6 |
from slack_bolt.async_app import AsyncApp
|
7 |
from slack_sdk.web.async_slack_response import AsyncSlackResponse
|
8 |
-
from typing import Any, Mapping, Self
|
9 |
|
|
|
10 |
from ctp_slack_bot.enums import EventType
|
11 |
from ctp_slack_bot.models import SlackMessage, SlackResponse
|
12 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
@@ -15,25 +17,23 @@ from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
|
15 |
SLACK_USER_MENTION_PATTERN = compile_re(r"<@([A-Z0-9]+)>")
|
16 |
|
17 |
|
18 |
-
class SlackService(
|
19 |
"""
|
20 |
Service for interfacing with Slack.
|
21 |
"""
|
22 |
|
|
|
|
|
23 |
event_brokerage_service: EventBrokerageService
|
|
|
24 |
slack_bolt_app: AsyncApp
|
25 |
-
user_id_name_map:
|
26 |
-
|
27 |
-
class Config:
|
28 |
-
arbitrary_types_allowed = True
|
29 |
-
frozen=True
|
30 |
|
31 |
-
def
|
32 |
-
super().
|
33 |
self.event_brokerage_service.subscribe(EventType.OUTGOING_SLACK_RESPONSE, self.send_message)
|
34 |
-
logger.debug("Created {}", self.__class__.__name__)
|
35 |
|
36 |
-
def adapt_event_payload(self: Self, event: Mapping[str, Any]) -> SlackMessage:
|
37 |
text = SLACK_USER_MENTION_PATTERN.sub(lambda match: f"@{self.user_id_name_map.get(match.group(1))}", event.get("text", "")) # TODO: permit look-up of Slack again when not found.
|
38 |
user_id = event.get("user")
|
39 |
return SlackMessage(
|
@@ -41,7 +41,7 @@ class SlackService(BaseModel):
|
|
41 |
subtype=event.get("subtype"),
|
42 |
channel=event.get("channel"),
|
43 |
channel_type=event.get("channel_type"),
|
44 |
-
user=self.
|
45 |
bot_id=event.get("bot_id"),
|
46 |
thread_ts=event.get("thread_ts"),
|
47 |
text=text,
|
@@ -50,7 +50,7 @@ class SlackService(BaseModel):
|
|
50 |
)
|
51 |
|
52 |
async def process_message(self: Self, event: Mapping[str, Any]) -> None:
|
53 |
-
slack_message = self.adapt_event_payload(event.get("event", {}))
|
54 |
logger.debug("Received message from Slack: {}", slack_message)
|
55 |
await self.event_brokerage_service.publish(EventType.INCOMING_SLACK_MESSAGE, slack_message)
|
56 |
|
@@ -70,9 +70,21 @@ class SlackService(BaseModel):
|
|
70 |
self.slack_bolt_app.event("app_mention")(self.handle_app_mention_event)
|
71 |
logger.debug("Registered 2 handlers for Slack Bolt message and app mention events.")
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
class SlackServiceResource(AsyncResource):
|
75 |
-
async def init(self: Self, event_brokerage_service: EventBrokerageService, slack_bolt_app: AsyncApp) -> SlackService:
|
76 |
match await slack_bolt_app.client.users_list():
|
77 |
case AsyncSlackResponse(status_code=200, data={"ok": True, "members": users}):
|
78 |
user_id_name_map = {id: display_name
|
@@ -83,7 +95,7 @@ class SlackServiceResource(AsyncResource):
|
|
83 |
case something:
|
84 |
user_id_name_map = {}
|
85 |
logger.error("Could not obtain a list of user name for the workspace.")
|
86 |
-
slack_service = SlackService(event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app, user_id_name_map=user_id_name_map)
|
87 |
slack_service.initialize()
|
88 |
return slack_service
|
89 |
|
|
|
1 |
from dependency_injector.resources import AsyncResource
|
2 |
+
from httpx import AsyncClient
|
3 |
from loguru import logger
|
4 |
from openai import OpenAI
|
5 |
+
from pydantic import ConfigDict
|
6 |
from re import compile as compile_re
|
7 |
from slack_bolt.async_app import AsyncApp
|
8 |
from slack_sdk.web.async_slack_response import AsyncSlackResponse
|
9 |
+
from typing import Any, Mapping, MutableMapping, Self
|
10 |
|
11 |
+
from ctp_slack_bot.core import HealthReportingApplicationComponentBase
|
12 |
from ctp_slack_bot.enums import EventType
|
13 |
from ctp_slack_bot.models import SlackMessage, SlackResponse
|
14 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
|
|
17 |
SLACK_USER_MENTION_PATTERN = compile_re(r"<@([A-Z0-9]+)>")
|
18 |
|
19 |
|
20 |
+
class SlackService(HealthReportingApplicationComponentBase):
|
21 |
"""
|
22 |
Service for interfacing with Slack.
|
23 |
"""
|
24 |
|
25 |
+
model_config = ConfigDict(arbitrary_types_allowed=True, frozen=True)
|
26 |
+
|
27 |
event_brokerage_service: EventBrokerageService
|
28 |
+
http_client: AsyncClient
|
29 |
slack_bolt_app: AsyncApp
|
30 |
+
user_id_name_map: MutableMapping[str, str]
|
|
|
|
|
|
|
|
|
31 |
|
32 |
+
def model_post_init(self: Self, context: Any, /) -> None:
|
33 |
+
super().model_post_init(context)
|
34 |
self.event_brokerage_service.subscribe(EventType.OUTGOING_SLACK_RESPONSE, self.send_message)
|
|
|
35 |
|
36 |
+
async def adapt_event_payload(self: Self, event: Mapping[str, Any]) -> SlackMessage:
|
37 |
text = SLACK_USER_MENTION_PATTERN.sub(lambda match: f"@{self.user_id_name_map.get(match.group(1))}", event.get("text", "")) # TODO: permit look-up of Slack again when not found.
|
38 |
user_id = event.get("user")
|
39 |
return SlackMessage(
|
|
|
41 |
subtype=event.get("subtype"),
|
42 |
channel=event.get("channel"),
|
43 |
channel_type=event.get("channel_type"),
|
44 |
+
user=await self._get_user_display_name(user_id),
|
45 |
bot_id=event.get("bot_id"),
|
46 |
thread_ts=event.get("thread_ts"),
|
47 |
text=text,
|
|
|
50 |
)
|
51 |
|
52 |
async def process_message(self: Self, event: Mapping[str, Any]) -> None:
|
53 |
+
slack_message = await self.adapt_event_payload(event.get("event", {}))
|
54 |
logger.debug("Received message from Slack: {}", slack_message)
|
55 |
await self.event_brokerage_service.publish(EventType.INCOMING_SLACK_MESSAGE, slack_message)
|
56 |
|
|
|
70 |
self.slack_bolt_app.event("app_mention")(self.handle_app_mention_event)
|
71 |
logger.debug("Registered 2 handlers for Slack Bolt message and app mention events.")
|
72 |
|
73 |
+
@property
|
74 |
+
def name(self: Self) -> str:
|
75 |
+
return "slack_service"
|
76 |
+
|
77 |
+
async def is_healthy(self: Self) -> bool:
|
78 |
+
response = await self.http_client.get("https://slack-status.com/api/v2.0.0/current")
|
79 |
+
return response.status_code == 200
|
80 |
+
|
81 |
+
async def _get_user_display_name(self: Self, user_id: str) -> str:
|
82 |
+
return self.user_id_name_map.get(user_id, f"<@{user_id}>")
|
83 |
+
# TODO: Handle new users.
|
84 |
+
|
85 |
|
86 |
class SlackServiceResource(AsyncResource):
|
87 |
+
async def init(self: Self, event_brokerage_service: EventBrokerageService, http_client: AsyncClient, slack_bolt_app: AsyncApp) -> SlackService:
|
88 |
match await slack_bolt_app.client.users_list():
|
89 |
case AsyncSlackResponse(status_code=200, data={"ok": True, "members": users}):
|
90 |
user_id_name_map = {id: display_name
|
|
|
95 |
case something:
|
96 |
user_id_name_map = {}
|
97 |
logger.error("Could not obtain a list of user name for the workspace.")
|
98 |
+
slack_service = SlackService(event_brokerage_service=event_brokerage_service, http_client=http_client, slack_bolt_app=slack_bolt_app, user_id_name_map=user_id_name_map)
|
99 |
slack_service.initialize()
|
100 |
return slack_service
|
101 |
|
src/ctp_slack_bot/services/vectorization_service.py
CHANGED
@@ -1,26 +1,22 @@
|
|
1 |
from loguru import logger
|
2 |
-
from pydantic import
|
3 |
from typing import Self, Sequence
|
4 |
|
5 |
-
from ctp_slack_bot.core import Settings
|
6 |
from ctp_slack_bot.models import Chunk, VectorizedChunk
|
7 |
from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
|
8 |
|
9 |
-
|
|
|
10 |
"""
|
11 |
Service for vectorizing chunks of text data.
|
12 |
"""
|
13 |
|
|
|
|
|
14 |
settings: Settings
|
15 |
embeddings_model_service: EmbeddingsModelService
|
16 |
|
17 |
-
class Config:
|
18 |
-
frozen=True
|
19 |
-
|
20 |
-
def __init__(self: Self, **data) -> None:
|
21 |
-
super().__init__(**data)
|
22 |
-
logger.debug("Created {}", self.__class__.__name__)
|
23 |
-
|
24 |
async def vectorize(self: Self, chunks: Sequence[Chunk]) -> Sequence[VectorizedChunk]:
|
25 |
embeddings = await self.embeddings_model_service.get_embeddings([chunk.text for chunk in chunks])
|
26 |
return tuple(VectorizedChunk(
|
@@ -32,3 +28,7 @@ class VectorizationService(BaseModel):
|
|
32 |
)
|
33 |
for chunk, embedding
|
34 |
in zip(chunks, embeddings))
|
|
|
|
|
|
|
|
|
|
1 |
from loguru import logger
|
2 |
+
from pydantic import ConfigDict
|
3 |
from typing import Self, Sequence
|
4 |
|
5 |
+
from ctp_slack_bot.core import ApplicationComponentBase, Settings
|
6 |
from ctp_slack_bot.models import Chunk, VectorizedChunk
|
7 |
from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
|
8 |
|
9 |
+
|
10 |
+
class VectorizationService(ApplicationComponentBase):
|
11 |
"""
|
12 |
Service for vectorizing chunks of text data.
|
13 |
"""
|
14 |
|
15 |
+
model_config = ConfigDict(frozen=True)
|
16 |
+
|
17 |
settings: Settings
|
18 |
embeddings_model_service: EmbeddingsModelService
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
async def vectorize(self: Self, chunks: Sequence[Chunk]) -> Sequence[VectorizedChunk]:
|
21 |
embeddings = await self.embeddings_model_service.get_embeddings([chunk.text for chunk in chunks])
|
22 |
return tuple(VectorizedChunk(
|
|
|
28 |
)
|
29 |
for chunk, embedding
|
30 |
in zip(chunks, embeddings))
|
31 |
+
|
32 |
+
@property
|
33 |
+
def name(self: Self) -> str:
|
34 |
+
return "vectorization_service"
|
src/ctp_slack_bot/utils/secret_stripper.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from urllib.parse import urlparse, urlunparse
|
2 |
|
|
|
3 |
def sanitize_mongo_db_uri(uri: str) -> str:
|
4 |
parts = urlparse(uri)
|
5 |
sanitized_netloc = ":".join(filter(None, (parts.hostname, parts.port)))
|
|
|
1 |
from urllib.parse import urlparse, urlunparse
|
2 |
|
3 |
+
|
4 |
def sanitize_mongo_db_uri(uri: str) -> str:
|
5 |
parts = urlparse(uri)
|
6 |
sanitized_netloc = ":".join(filter(None, (parts.hostname, parts.port)))
|