W.I.P.

- pyproject.toml +0 -1
- src/ctp_slack_bot/__init__.py +0 -1
- src/ctp_slack_bot/api/main.py +40 -5
- src/ctp_slack_bot/api/routes.py +48 -3
- src/ctp_slack_bot/containers.py +14 -19
- src/ctp_slack_bot/core/__init__.py +0 -1
- src/ctp_slack_bot/core/logging.py +7 -4
- src/ctp_slack_bot/core/response_rendering.py +1 -1
- src/ctp_slack_bot/db/mongo_db.py +12 -8
- src/ctp_slack_bot/enums.py +6 -0
- src/ctp_slack_bot/models/__init__.py +2 -3
- src/ctp_slack_bot/models/base.py +36 -45
- src/ctp_slack_bot/models/content.py +0 -19
- src/ctp_slack_bot/models/slack.py +91 -11
- src/ctp_slack_bot/models/vector_query.py +4 -3
- src/ctp_slack_bot/services/__init__.py +2 -0
- src/ctp_slack_bot/services/answer_retrieval_service.py +13 -47
- src/ctp_slack_bot/services/content_ingestion_service.py +27 -1
- src/ctp_slack_bot/services/context_retrieval_service.py +39 -37
- src/ctp_slack_bot/services/embeddings_model_service.py +48 -0
- src/ctp_slack_bot/services/event_brokerage_service.py +29 -21
- src/ctp_slack_bot/services/language_model_service.py +56 -0
- src/ctp_slack_bot/services/question_dispatch_service.py +9 -6
- src/ctp_slack_bot/services/slack_service.py +31 -5
- src/ctp_slack_bot/services/vector_database_service.py +18 -16
- src/ctp_slack_bot/services/vectorization_service.py +17 -50
- src/ctp_slack_bot/tasks/scheduler.py +1 -1
pyproject.toml
CHANGED
@@ -33,7 +33,6 @@ dependencies = [
     "apscheduler>=3.11.0",
     "slack-sdk>=3.35.0",
     "pymongo>=4.11.3 ",
-    "numpy>=2.2.4",
     "webvtt-py>=0.5.1",
     "openai>=1.70.0",
     # "langchain>=0.3.23",
src/ctp_slack_bot/__init__.py
CHANGED
@@ -1 +0,0 @@
-from ctp_slack_bot.containers import Container
src/ctp_slack_bot/api/main.py
CHANGED
@@ -1,12 +1,19 @@
 from contextlib import asynccontextmanager
+from dependency_injector.wiring import inject, Provide
 from fastapi import FastAPI, HTTPException, Depends
 from loguru import logger
-from typing import AsyncGenerator
-from
+from typing import Any, AsyncGenerator
+from slack_bolt import App
+from slack_bolt.adapter.socket_mode import SocketModeHandler
+from starlette.requests import Request
+from starlette.responses import Response
+from threading import Thread
+from typing import Any, Dict, Self
 
-from ctp_slack_bot import Container
+from ctp_slack_bot.containers import Container
 from ctp_slack_bot.api.routes import router
-from ctp_slack_bot.core import Settings
+from ctp_slack_bot.core.config import Settings
+from ctp_slack_bot.core.logging import setup_logging
 from ctp_slack_bot.core.response_rendering import PrettyJSONResponse
 from ctp_slack_bot.tasks import start_scheduler, stop_scheduler
 
@@ -16,6 +23,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
     Lifespan context manager for FastAPI application.
     Handles startup and shutdown events.
     """
+
     # Initialize container and wire the container to modules that need dependency injection.
     container = Container()
     container.wire(packages=['ctp_slack_bot'])
@@ -25,10 +33,38 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
     setup_logging(container)
     logger.info("Starting application")
 
+    # Log route being served.
+    logger.info("Serving {} route(s):{}", len(app.routes), ''.join(f"\n* {", ".join(route.methods)} {route.path}" for route in app.routes))
+
     # Start the scheduler.
     scheduler = start_scheduler(container)
     logger.info("Started scheduler")
 
+    # Initialize primordial dependencies in container.
+    container.primordial_services()
+
+    # Start Slack socket mode in a background thread and set up an event handler for the Bolt app.
+    bolt_app = container.slack_bolt_app()
+    slack_service = container.slack_service()
+    @bolt_app.event("message")
+    def handle_message(body: Dict[str, Any]) -> None:
+        logger.debug("Received Slack message event: {}", body)
+        slack_service.process_message(body)
+    @bolt_app.event("app_mention")
+    def handle_message(body: Dict[str, Any]) -> None:
+        logger.debug("Received Slack app mention event: {}", body)
+        slack_service.process_message(body)
+
+    # Start Socket Mode handler in a background thread
+    socket_mode_handler = SocketModeHandler(
+        app=bolt_app,
+        app_token=container.settings().SLACK_APP_TOKEN.get_secret_value()
+    )
+    socket_thread = Thread(target=socket_mode_handler.start)
+    socket_thread.daemon = True
+    socket_thread.start()
+    logger.info("Started Slack Socket Mode handler")
+
     yield # control to FastAPI until shutdown.
 
     # Shutdown.
@@ -36,7 +72,6 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
     stop_scheduler(scheduler)
     logger.info("Stopped scheduler")
 
-
 app = FastAPI(
     title="CTP Slack Bot",
     description="A Slack bot for processing and analyzing Zoom transcripts using AI",
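For orientation, the lifespan hook above follows a common FastAPI pattern: start a blocking worker in a daemon thread during startup, then yield control to the app. A minimal, self-contained sketch of that pattern (a toy worker stands in for Slack's SocketModeHandler; none of this is part of the commit):

from contextlib import asynccontextmanager
from threading import Thread
from time import sleep
from typing import AsyncGenerator

from fastapi import FastAPI


def blocking_worker() -> None:
    # Stand-in for SocketModeHandler.start(): blocks until the process exits.
    while True:
        sleep(1)


@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator:
    worker = Thread(target=blocking_worker, daemon=True)  # daemon=True: thread dies with the process
    worker.start()
    yield  # the application serves requests here
    # Nothing to join on shutdown: daemon threads are not waited on.


app = FastAPI(lifespan=lifespan)

The daemon flag is what lets the commit skip any explicit Socket Mode teardown in the shutdown branch of the lifespan.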
src/ctp_slack_bot/api/routes.py
CHANGED
@@ -2,9 +2,9 @@ from fastapi import APIRouter, Depends, HTTPException, status
 from dependency_injector.wiring import inject, Provide
 from loguru import logger
 
-from ctp_slack_bot import Container
-from ctp_slack_bot.core import Settings
-from ctp_slack_bot.services import SlackService
+from ctp_slack_bot.containers import Container
+from ctp_slack_bot.core.config import Settings
+from ctp_slack_bot.services import AnswerRetrievalService, ContentIngestionService, ContextRetrievalService, EmbeddingsModelService, LanguageModelService, QuestionDispatchService, SlackService, VectorDatabaseService, VectorizationService
 
 router = APIRouter(prefix="/api/v1")
 
@@ -15,6 +15,51 @@ async def get_env(settings: Settings = Depends(Provide[Container.settings])) ->
         raise HTTPException(status_code=404)
     return settings
 
+@router.get("/answer_retrieval_service")
+@inject
+async def test(answer_retrieval_service: AnswerRetrievalService = Depends(Provide[Container.answer_retrieval_service])) -> None:
+    pass
+
+@router.get("/content_ingestion_service")
+@inject
+async def test(content_ingestion_service: ContentIngestionService = Depends(Provide[Container.content_ingestion_service])) -> None:
+    pass
+
+@router.get("/context_retrieval_service")
+@inject
+async def test(context_retrieval_service: ContextRetrievalService = Depends(Provide[Container.context_retrieval_service])) -> None:
+    pass
+
+@router.get("/embeddings_model_service")
+@inject
+async def test(embeddings_model_service: EmbeddingsModelService = Depends(Provide[Container.embeddings_model_service])) -> None:
+    pass
+
+@router.get("/language_model_service")
+@inject
+async def test(language_model_service: LanguageModelService = Depends(Provide[Container.language_model_service])) -> None:
+    pass
+
+@router.get("/question_dispatch_service")
+@inject
+async def test(question_dispatch_service: QuestionDispatchService = Depends(Provide[Container.question_dispatch_service])) -> None:
+    pass
+
+@router.get("/slack_service")
+@inject
+async def test(slack_service: SlackService = Depends(Provide[Container.slack_service])) -> None:
+    pass
+
+@router.get("/vector_database_service")
+@inject
+async def test(vector_database_service: VectorDatabaseService = Depends(Provide[Container.vector_database_service])) -> None:
+    pass
+
+@router.get("/vectorization_service")
+@inject
+async def test(vectorization_service: VectorizationService = Depends(Provide[Container.vectorization_service])) -> None:
+    pass
+
 # @router.post("/transcripts/analyze", response_model=TranscriptResponse)
 # async def analyze_transcript(
 #     request: TranscriptRequest,
src/ctp_slack_bot/containers.py
CHANGED
@@ -1,13 +1,16 @@
 from dependency_injector.containers import DeclarativeContainer
-from dependency_injector.providers import Factory, Singleton
-from
+from dependency_injector.providers import Factory, List, Singleton
+from slack_bolt import App
+from slack_bolt.adapter.socket_mode import SocketModeHandler
 
 from ctp_slack_bot.core.config import Settings
 from ctp_slack_bot.db.mongo_db import MongoDB
 from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
 from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
 from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
+from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
 from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
+from ctp_slack_bot.services.language_model_service import LanguageModelService
 from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
 from ctp_slack_bot.services.slack_service import SlackService
 from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
@@ -16,29 +19,21 @@ from ctp_slack_bot.services.vectorization_service import VectorizationService
 
 class Container(DeclarativeContainer):
     settings = Singleton(Settings)
-
     event_brokerage_service = Singleton(EventBrokerageService)
-
-    mongo_db = Singleton(MongoDB, settings=settings)
-
+    slack_bolt_app = Factory(App, token=settings.provided.SLACK_BOT_TOKEN().get_secret_value())
+    mongo_db = Singleton(MongoDB, settings=settings) # TODO: we could really use less commitment to MongoDB.
     # Repositories
     # transcript_repository = Factory(
     #     # Your transcript repository class
-    #
+    #     mongo_db=mongo_db
     # )
-
-    open_ai_client = Factory(OpenAI, api_key=settings.provided.OPENAI_API_KEY) # TODO: poor practice to do it this way; create a LanguageModelService that creates an OpenAI client.
-
     vector_database_service = Singleton(VectorDatabaseService, settings=settings, mongo_db=mongo_db)
-
-    vectorization_service = Singleton(VectorizationService, settings=settings,
-
+    embeddings_model_service = Singleton(EmbeddingsModelService, settings=settings)
+    vectorization_service = Singleton(VectorizationService, settings=settings, embeddings_model_service=embeddings_model_service)
     content_ingestion_service = Singleton(ContentIngestionService, settings=settings, event_brokerage_service=event_brokerage_service, vector_database_service=vector_database_service, vectorization_service=vectorization_service)
-
     context_retrieval_service = Singleton(ContextRetrievalService, settings=settings, vectorization_service=vectorization_service, vector_database_service=vector_database_service)
-
-    answer_retrieval_service = Singleton(AnswerRetrievalService, settings=settings, event_brokerage_service=event_brokerage_service,
-
+    language_model_service = Singleton(LanguageModelService, settings=settings)
+    answer_retrieval_service = Singleton(AnswerRetrievalService, settings=settings, event_brokerage_service=event_brokerage_service, language_model_service=language_model_service)
     question_dispatch_service = Singleton(QuestionDispatchService, settings=settings, event_brokerage_service=event_brokerage_service, content_ingestion_service=content_ingestion_service, context_retrieval_service=context_retrieval_service, answer_retrieval_service=answer_retrieval_service)
-
-
+    slack_service = Singleton(SlackService, event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
+    primordial_services = List(settings, event_brokerage_service, slack_bolt_app, slack_service, question_dispatch_service, content_ingestion_service)
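As a quick reference, here is a minimal dependency-injector sketch of the provider pattern used above: a Singleton provider memoizes one instance, a Factory builds a new object per call, and dependencies are wired by provider reference. Toy classes only; not part of the commit.

from dependency_injector.containers import DeclarativeContainer
from dependency_injector.providers import Factory, Singleton


class Settings:
    pass


class Service:
    def __init__(self, settings: Settings) -> None:
        self.settings = settings


class Container(DeclarativeContainer):
    settings = Singleton(Settings)                        # one shared Settings instance
    service = Singleton(Service, settings=settings)       # dependency injected by provider reference
    throwaway = Factory(Service, settings=settings)       # fresh Service on every call


container = Container()
assert container.service() is container.service()          # Singleton: same object each time
assert container.throwaway() is not container.throwaway()  # Factory: new object each time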
src/ctp_slack_bot/core/__init__.py
CHANGED
@@ -1,2 +1 @@
 from ctp_slack_bot.core.config import Settings
-from ctp_slack_bot.core.logging import logger, setup_logging
src/ctp_slack_bot/core/logging.py
CHANGED
@@ -1,7 +1,10 @@
+from dependency_injector.wiring import Provide
 from logging import __file__ as logging_file, basicConfig, currentframe, getLogger, Handler, INFO, LogRecord
 from loguru import logger
 from sys import stderr
-from typing import
+from typing import Self
+
+from ctp_slack_bot.containers import Container
 
 class InterceptHandler(Handler):
     """
@@ -11,7 +14,7 @@ class InterceptHandler(Handler):
     to Loguru, allowing unified logging across the application.
     """
 
-    def emit(self, record: LogRecord) -> None:
+    def emit(self: Self, record: LogRecord) -> None:
         # Get corresponding Loguru level if it exists
         try:
             level = logger.level(record.levelname).name
@@ -29,7 +32,7 @@ class InterceptHandler(Handler):
         )
 
 
-def setup_logging(container: "Container") -> None:
+def setup_logging(container: "Container") -> None: # TODO: Perhaps get rid of the container dependence since we only need two settings.
     """
     Configure logging with Loguru.
 
@@ -37,7 +40,7 @@ def setup_logging(container: "Container") -> None:
     configures the log format based on settings, and intercepts
     standard logging messages.
     """
-    from ctp_slack_bot import Container
+    from ctp_slack_bot.containers import Container
     settings = container.settings() if container else Provide[Container.settings]
 
     # Remove default loguru handler
src/ctp_slack_bot/core/response_rendering.py
CHANGED
@@ -10,4 +10,4 @@ class PrettyJSONResponse(JSONResponse):
         allow_nan=False,
         indent=4,
         separators=(", ", ": "),
-        ).encode(
+        ).encode()
src/ctp_slack_bot/db/mongo_db.py
CHANGED
@@ -1,6 +1,6 @@
 #from motor.motor_asyncio import AsyncIOMotorClient
 from loguru import logger
-from pydantic import BaseModel, model_validator
+from pydantic import BaseModel, model_validator, PrivateAttr
 #from pymongo import IndexModel, ASCENDING
 from typing import Optional, Self
 
@@ -13,19 +13,23 @@ class MongoDB(BaseModel):
     """
 
     settings: Settings
+    _client: PrivateAttr = PrivateAttr()
+    _db: PrivateAttr = PrivateAttr()
+    _vector_collection: PrivateAttr = PrivateAttr()
+    _initialized: PrivateAttr = PrivateAttr()
+
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
+        self._client = None
+        self._db = None
+        self._vector_collection = None
+        self._initialized = False
 
     @model_validator(mode='after')
     def post_init(self: Self) -> Self:
         logger.debug("Created {}", self.__class__.__name__)
         return self
 
-    def __init__(self: Self, settings: Settings) -> Self:
-        #self.client: Optional[AsyncIOMotorClient] = None
-        #self.db = None
-        #self.vector_collection = None
-        #self.initialized = False
-        pass # The above initialization needs to be done some other way.
-
     # async def connect(self):
     #     """
     #     Connect to MongoDB using connection string from settings.
src/ctp_slack_bot/enums.py
ADDED
@@ -0,0 +1,6 @@
+from enum import auto, StrEnum
+
+class EventType(StrEnum):
+    INCOMING_CONTENT = auto()
+    INCOMING_SLACK_MESSAGE = auto()
+    OUTGOING_SLACK_RESPONSE = auto()
src/ctp_slack_bot/models/__init__.py
CHANGED
@@ -1,4 +1,3 @@
-from ctp_slack_bot.models.base import
-from ctp_slack_bot.models.
-from ctp_slack_bot.models.slack import SlackMessage
+from ctp_slack_bot.models.base import Chunk, Content, VectorizedChunk
+from ctp_slack_bot.models.slack import SlackEventPayload, SlackMessage, SlackReaction, SlackResponse, SlackUserTimestampPair
 from ctp_slack_bot.models.vector_query import VectorQuery
src/ctp_slack_bot/models/base.py
CHANGED
@@ -1,61 +1,52 @@
 from abc import ABC, abstractmethod
-from
-from
-from typing import Dict, List, Optional, Union, Any, ClassVar
-import hashlib
-import json
+from pydantic import BaseModel, ConfigDict
+from typing import Any, Dict, final, Self, Sequence
 
 
-class
-    """A class representing
-
-
-
+class Chunk(BaseModel):
+    """A class representing a chunk of content."""
+
+    text: str # The text representation
+    parent_id: str # The source content’s identity
+    chunk_id: str # This chunk’s identity—unique within the source content
+    metadata: Dict[str, Any]
+
+    model_config = ConfigDict(frozen=True)
 
 
-class Content(BaseModel):
-    """A class representing ingested content."""
-
-
+@final
+class VectorizedChunk(Chunk):
+    """A class representing a vectorized chunk of content."""
+
+    embedding: Sequence[float] # The vector representation
+
+    model_config = ConfigDict(frozen=True)
 
 
-class Ingestible(ABC, BaseModel):
-    """An abstract base class for ingestible content."""
-
-
+class Content(ABC, BaseModel):
+    """An abstract base class for all types of content."""
+
+    model_config = ConfigDict(frozen=True)
+
+    @abstractmethod
+    def get_chunks(self: Self) -> Sequence[Chunk]:
+        pass
+
+    @abstractmethod
+    def get_metadata(self: Self) -> Dict[str, Any]:
+        pass
 
     @property
     @abstractmethod
-    def
-        """
-        Return content ready for vectorization.
-
-        This could be:
-        - A single string
-        - A list of strings (pre-chunked)
-        - A more complex structure that can be recursively processed
-        """
+    def get_text(self: Self) -> str:
         pass
-
-    def get_chunks(self) -> List[str]:
-        """
-        Split content into chunks suitable for vectorization.
-        Override this in subclasses for specialized chunking logic.
-        """
-        content = self.content
-        if isinstance(content, str):
-            # Simple chunking by character count
-            return [content[i:i+self.chunk_size]
-                    for i in range(0, len(content), self.chunk_size)]
-        elif isinstance(content, list):
-            # Content is already chunked
-            return content
-        else:
-            raise ValueError(f"Unsupported content type: {type(content)}")
 
     @property
-
-
-
+    @abstractmethod
+    def bytes(self: Self) -> bytes:
+        pass
+
+    @property
+    @abstractmethod
+    def id(self: Self) -> str:
+        pass
src/ctp_slack_bot/models/content.py
DELETED
@@ -1,19 +0,0 @@
-from pydantic import BaseModel, Field
-from typing import Optional, List, Dict, Any
-from ctp_slack_bot.models.slack import SlackMessage
-
-class RetreivedContext(BaseModel):
-    """Represents a the context of a question from Slack returned from the Vector Store Database.
-
-    contextual_text: The text that is relevant to the question.
-    metadata_source: The source of the contextual text.
-    similarity_score: The similarity score of the contextual text to the question.
-
-    in_reation_to_question: OPTINAL: The question that the contextual text is related to.
-    """
-    contextual_text: str
-    metadata_source: str
-    similarity_score: float
-
-    said_by: str = Optional[None]
-    in_reation_to_question: str = Optional[None]
src/ctp_slack_bot/models/slack.py
CHANGED
@@ -1,16 +1,96 @@
-from
-from
+from datetime import datetime
+from json import dumps
+from pydantic import BaseModel, ConfigDict, PositiveInt, PrivateAttr
+from typing import Any, Dict, Literal, Optional, Self, Sequence
 
-
+from ctp_slack_bot.models.base import Chunk, Content
+
+class SlackEventPayload(BaseModel):
+    """Represents a general event payload from Slack."""
+    type: str
+    event_ts: str
+
+    model_config = ConfigDict(extra='allow', frozen=True)
+
+class SlackEvent(BaseModel):
+    """Represents a general event from Slack."""
+
+    token: str
+    team_id: str
+    api_app_id: str
+    event: SlackEventPayload
+    type: str
+    event_id: str
+    event_time: int
+    authed_users: Sequence[str]
+
+    model_config = ConfigDict(frozen=True)
+
+class SlackUserTimestampPair(BaseModel):
+    """Represents a Slack user-timestamp pair."""
+
+    user: str
+    ts: str
+
+    model_config = ConfigDict(frozen=True)
+
+class SlackReaction(BaseModel):
+    """Represents a Slack reaction information."""
+
+    name: str
+    count: PositiveInt
+    users: Sequence[str]
+
+    model_config = ConfigDict(frozen=True)
+
+class SlackMessage(Content):
     """Represents a message from Slack after adaptation."""
-
-
-
+
+    type: Literal["app_mention", "message"]
+    subtype: Optional[str] = None
+    channel: str
+    channel_type: Optional[str] = None
+    user: Optional[str] = None
+    bot_id: Optional[str] = None
     thread_ts: Optional[str] = None
-
-
-
+    text: str
+    ts: str
+    edited: Optional[SlackUserTimestampPair] = None
+    event_ts: str
+    deleted_ts: Optional[str] = None
+    hidden: bool = False
+    is_starred: Optional[bool] = None
+    pinned_to: Optional[Sequence[str]] = None
+    reactions: Optional[Sequence[SlackReaction]] = None
+    _canonical_json: PrivateAttr
+
+    def __init__(self: Self, **data: Dict[str, Any]) -> None:
+        super().__init__(**data)
+        self._canonical_json = PrivateAttr(default_factory=lambda: dumps(data, sort_keys=True).encode())
+
+    def get_chunks(self: Self) -> Sequence[Chunk]:
+        return (Chunk(text=self.text, parent_id=self.id, chunk_id="", metadata=self.metadata), )
+
+    def get_metadata(self: Self) -> Dict[str, Any]:
+        return {
+            "modificationTime": datetime.fromtimestamp(float(self.ts))
+        }
+
+    def get_text(self: Self) -> str:
+        return self.text
+
     @property
-    def
+    def bytes(self: Self) -> bytes:
+        return self._canonical_json
+
+    @property
+    def id(self: Self) -> str:
         """Unique identifier for this message."""
-        return f"slack:{self.
+        return f"slack-message:{self.channel}:{self.ts}"
+
+class SlackResponse(BaseModel): # TODO: This should also be based on Content as it is a SlackMessage―just not one for which we know the identity yet.
+    """Represents a response message to be sent to Slack."""
+
+    text: str
+    channel: Optional[str]
+    thread_ts: Optional[str] = None
src/ctp_slack_bot/models/vector_query.py
CHANGED
@@ -1,5 +1,5 @@
-from pydantic import BaseModel, Field,
-from typing import
+from pydantic import BaseModel, Field, model_validator
+from typing import Any, Dict, Optional, Sequence
 
 class VectorQuery(BaseModel):
     """Model for vector database similarity search queries.
@@ -10,7 +10,8 @@ class VectorQuery(BaseModel):
     score_threshold: Minimum similarity score threshold for inclusion in results
     filter_metadata: Optional filters for metadata fields
     """
-
+
+    query_embeddings: Sequence[float]
     k: int
     score_threshold: float = Field(default=0.7)
     filter_metadata: Optional[Dict[str, Any]] = None
src/ctp_slack_bot/services/__init__.py
CHANGED
@@ -1,7 +1,9 @@
 from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
 from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
 from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
+from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
 from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
+from ctp_slack_bot.services.language_model_service import LanguageModelService
 from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
 from ctp_slack_bot.services.slack_service import SlackService
 from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
src/ctp_slack_bot/services/answer_retrieval_service.py
CHANGED
@@ -1,65 +1,31 @@
-# from asyncio import create_task
 from loguru import logger
-from openai import OpenAI
 from pydantic import BaseModel, model_validator
-from typing import
+from typing import Collection, Self
 
 from ctp_slack_bot.core import Settings
-from ctp_slack_bot.
+from ctp_slack_bot.enums import EventType
+from ctp_slack_bot.models import Chunk, SlackMessage, SlackResponse
 from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
+from ctp_slack_bot.services.language_model_service import LanguageModelService
 
 
 class AnswerRetrievalService(BaseModel):
     """
-    Service for language model
+    Service for context-based answer retrievel from a language model.
     """
 
     settings: Settings
     event_brokerage_service: EventBrokerageService
-
-
-    class Config:
-        arbitrary_types_allowed = True
+    language_model_service: LanguageModelService
 
     @model_validator(mode='after')
     def post_init(self: Self) -> Self:
         logger.debug("Created {}", self.__class__.__name__)
         return self
 
-    def
-
-
-
-
-
-        Returns:
-            str: Generated answer
-        """
-        # Prepare context string from retrieved chunks
-        context_str = ""
-        for c in context:
-            context_str += f"{c.contextual_text}\n"
-
-
-        # logger.info(f"Generating response for question: {question}")
-        # logger.info(f"Using {len(context)} context chunks")
-
-        # Create messages for the chat completion
-        messages = [
-            {"role": "system", "content": settings.SYSTEM_PROMPT},
-            {"role": "user", "content":
-                f"""Student Auestion: {question.text}
-                Context from class materials and transcripts: {context_str}
-                Please answer the Student Auestion based on the Context from class materials and transcripts. If the context doesn't contain relevant information, acknowledge that and suggest asking the professor."""}
-        ]
-
-        # Generate response
-        response = self.client.chat.completions.create(
-            model=settings.CHAT_MODEL,
-            messages=messages,
-            max_tokens=settings.MAX_TOKENS,
-            temperature=settings.TEMPERATURE
-        )
-
-        return response.choices[0].message.content
+    def push(self: Self, question: SlackMessage, context: Collection[Chunk]) -> None:
+        channel_to_respond_to = question.channel
+        thread_to_respond_to = question.thread_ts if question.thread_ts else question.ts
+        answer = self.language_model_service.answer_question(question.text, context)
+        slack_response = SlackResponse(text=answer, channel=channel_to_respond_to, thread_ts=thread_to_respond_to)
+        self.event_brokerage_service.publish(EventType.OUTGOING_SLACK_RESPONSE, slack_response)
src/ctp_slack_bot/services/content_ingestion_service.py
CHANGED
@@ -1,8 +1,11 @@
 from loguru import logger
 from pydantic import BaseModel, model_validator
-from typing import Self
+from typing import Self, Sequence
 
 from ctp_slack_bot.core import Settings
+from ctp_slack_bot.enums import EventType
+from ctp_slack_bot.models import Chunk, Content, SlackMessage
+from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
 from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
 from ctp_slack_bot.services.vectorization_service import VectorizationService
 
@@ -12,10 +15,33 @@ class ContentIngestionService(BaseModel):
     """
 
     settings: Settings
+    event_brokerage_service: EventBrokerageService
     vector_database_service: VectorDatabaseService
     vectorization_service: VectorizationService
 
     @model_validator(mode='after')
     def post_init(self: Self) -> Self:
         logger.debug("Created {}", self.__class__.__name__)
+        self.event_brokerage_service.subscribe(EventType.INCOMING_CONTENT, self.process_incoming_content)
+        self.event_brokerage_service.subscribe(EventType.INCOMING_SLACK_MESSAGE, self.process_incoming_slack_message)
         return self
+
+    def process_incoming_content(self: Self, content: Content) -> None:
+        logger.debug("Content ingestion service received content with metadata: {}", content.get_metadata())
+        # if self.vector_database_service.has_content(content.id) # TODO
+        #     logger.debug("Ignored content with ID {} because it already exists in the database.", content.id)
+        #     return
+        chunks = content.get_chunks()
+        self.__vectorize_and_store_chunks_in_database(chunks)
+        logger.debug("Stored {} vectorized chunk(s) in the database.", len(chunks))
+
+    def process_incoming_slack_message(self: Self, slack_message: SlackMessage) -> None:
+        logger.debug("Content ingestion service received a Slack message: {}", slack_message.text)
+        chunks = slack_message.get_chunks()
+        self.__vectorize_and_store_chunks_in_database(chunks)
+        logger.debug("Stored {} vectorized chunk(s) in the database.", len(chunks))
+
+    def __vectorize_and_store_chunks_in_database(self: Self, chunks: Sequence[Chunk]) -> None:
+        # vectorized_chunks = self.vectorization_service.vectorize(chunks) # TODO
+        # self.vector_database_service.store(vectorized_chunks) # TODO
+        pass
src/ctp_slack_bot/services/context_retrieval_service.py
CHANGED
@@ -1,9 +1,9 @@
 from loguru import logger
 from pydantic import BaseModel, model_validator
-from typing import
+from typing import Self, Sequence
 
 from ctp_slack_bot.core.config import Settings
-from ctp_slack_bot.models import
+from ctp_slack_bot.models import Chunk, SlackMessage, VectorQuery, VectorizedChunk
 from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
 from ctp_slack_bot.services.vectorization_service import VectorizationService
 
@@ -20,14 +20,15 @@ class ContextRetrievalService(BaseModel):
     def post_init(self: Self) -> Self:
         logger.debug("Created {}", self.__class__.__name__)
         return self
-
-
-
-
-
-
-
-
+
+    # Should not allow initialization calls to bubble up all the way to the surface ― sequester in `post_init` or the class on which it depends.
+    # async def initialize(self):
+    #     """
+    #     Initialize the required services.
+    #     """
+    #     await self.vector_database_service.initialize()
+
+    def get_context(self, message: SlackMessage) -> Sequence[Chunk]:
         """
         Retrieve relevant context for a given Slack message.
 
@@ -43,36 +44,37 @@ class ContextRetrievalService(BaseModel):
         Returns:
             List[RetreivedContext]: List of retrieved context items with similarity scores
         """
-        if not message.is_question:
-
-
+        # if not message.is_question:
+        #     logger.debug(f"Message {message.key} is not a question, skipping context retrieval")
+        #     return []
 
-        try:
-
-
-
-
-
+        # try:
+        #     # Vectorize the message text
+        #     embeddings = self.vectorization_service.vectorize([message.text])
+        #     if embeddings is None or len(embeddings) == 0:
+        #         logger.error(f"Failed to generate embedding for message: {message.key}")
+        #         return []
 
-
+        #     query_embedding = embeddings[0].tolist()
 
-
-
-
-
-
-
+        #     # Create vector query
+        #     vector_query = VectorQuery(
+        #         query_text=message.text,
+        #         k=self.settings.TOP_K_MATCHES,
+        #         score_threshold=0.7 # Minimum similarity threshold
+        #     )
 
-
-
-
-
-
+        #     # Search for similar content chunks in vector database
+        #     context_results = await self.vector_database_service.search_by_similarity(
+        #         query=vector_query,
+        #         query_embedding=query_embedding
+        #     )
 
-
-
+        #     logger.info(f"Retrieved {len(context_results)} context items for message: {message.key}")
+        #     return context_results
 
-        except Exception as e:
-
-
-
+        # except Exception as e:
+        #     logger.error(f"Error retrieving context for message {message.key}: {str(e)}")
+        #     return []
+        return (VectorizedChunk(text="Mock context chunk", parent_id="lol", chunk_id="no", metadata={}),
+                VectorizedChunk(text="Moar mock context chunk", parent_id="lol", chunk_id="wut", metadata={}))
src/ctp_slack_bot/services/embeddings_model_service.py
ADDED
@@ -0,0 +1,48 @@
+from loguru import logger
+from openai import OpenAI
+from pydantic import BaseModel, PrivateAttr, model_validator
+from typing import Any, Dict, Sequence, Self
+
+from ctp_slack_bot.core import Settings
+
+class EmbeddingsModelService(BaseModel):
+    """
+    Service for embeddings model operations.
+    """
+
+    settings: Settings
+    _open_ai_client: PrivateAttr = PrivateAttr()
+
+    def __init__(self: Self, **data: Dict[str, Any]) -> None:
+        super().__init__(**data)
+        self._open_ai_client = OpenAI(api_key=self.settings.OPENAI_API_KEY.get_secret_value())
+
+    @model_validator(mode='after')
+    def post_init(self: Self) -> Self:
+        logger.debug("Created {}", self.__class__.__name__)
+        return self
+
+    def get_embeddings(self: Self, texts: Sequence[str]) -> Sequence[Sequence[float]]:
+        """Get embeddings for a collection of texts using OpenAI’s API.
+
+        Args:
+            texts (Collection[str]): Collection of text chunks to embed
+
+        Returns:
+            NDArray: Array of embeddings with shape (n_texts, VECTOR_DIMENSION)
+
+        Raises:
+            ValueError: If the embedding dimensions don't match expected size
+        """
+        logger.debug("Creating embeddings for {} text string(s)…", len(texts))
+        response = self._open_ai_client.embeddings.create(
+            model=self.settings.EMBEDDING_MODEL,
+            input=texts,
+            encoding_format="float" # Ensure we get raw float values.
+        )
+        embeddings = tuple(tuple(data.embedding) for data in response.data)
+        match embeddings:
+            case (first, _) if len(first) != self.settings.VECTOR_DIMENSION:
+                logger.error("Embedding dimension mismatch and/or misconfiguration: expected configured dimension {}, but got {}.", self.settings.VECTOR_DIMENSION, len(first))
+                raise ValueError() # TODO: raise a more specific type.
+        return embeddings
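One subtlety in get_embeddings above: the structural pattern `case (first, _)` only matches a sequence of exactly two embeddings, so the dimension guard never runs for batches of any other size. An isolated illustration of that behavior follows; EXPECTED_DIMENSION and check are illustrative names, not part of the commit.

EXPECTED_DIMENSION = 3


def check(embeddings: tuple[tuple[float, ...], ...]) -> None:
    match embeddings:
        case (first, _) if len(first) != EXPECTED_DIMENSION:
            # Only reached when the batch holds exactly two embeddings.
            raise ValueError(f"expected {EXPECTED_DIMENSION} dimensions, got {len(first)}")


check(((0.1, 0.2, 0.3), (0.4, 0.5, 0.6)))  # two embeddings, right dimension: no error
check(((0.1, 0.2),))                       # wrong dimension, but a batch of one: guard never runs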
src/ctp_slack_bot/services/event_brokerage_service.py
CHANGED
@@ -1,38 +1,46 @@
-
+
+from asyncio import create_task, iscoroutinefunction, to_thread
+from collections import defaultdict
 from loguru import logger
-from
-from pydantic import BaseModel, model_validator
+from pydantic import BaseModel, model_validator, PrivateAttr
 from typing import Any, Callable, Dict, List, Self
 
-from ctp_slack_bot.
-from ctp_slack_bot.models import RetreivedContext, SlackMessage
-from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
-from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
-
+from ctp_slack_bot.enums import EventType
 
 class EventBrokerageService(BaseModel):
     """
     Service for brokering events between services.
     """
 
-
-
-    class Config:
-        arbitrary_types_allowed = True
+    _subscribers: PrivateAttr = PrivateAttr(default_factory=lambda: defaultdict(list))
 
     @model_validator(mode='after')
     def post_init(self: Self) -> Self:
         logger.debug("Created {}", self.__class__.__name__)
         return self
 
-    def subscribe(self: Self,
+    def subscribe(self: Self, type: EventType, callback: Callable) -> None:
         """Subscribe to an event type with a callback function."""
-
-
-
-
-
+        logger.debug("1 new subscriber is listening for {} events.", type)
+        subscribers = self._subscribers[type]
+        subscribers.append(callback)
+        logger.debug("Event type {} has {} subscriber(s).", type, len(subscribers))
+
+    def publish(self: Self, type: EventType, data: Any = None) -> None:
         """Publish an event with optional data to all subscribers."""
-
-
-
+        subscribers = self._subscribers[type]
+        if not subscribers:
+            logger.debug("No subscribers handle event {}: {}", type, len(subscribers), data)
+            return
+        logger.debug("Broadcasting event {} to {} subscriber(s): {}", type, len(subscribers), data)
+        for callback in subscribers:
+            if iscoroutinefunction(callback):
+                task: create_task(callback(data))
+                task.add_done_callback(lambda done_task: logger.error("Error in asynchronous event callback handling event {}: {}", done_task.exception())
+                                       if done_task.exception()
+                                       else None)
+            else:
+                try:
+                    create_task(to_thread(callback, data))
+                except Exception as e:
+                    logger.error("Error scheduling synchronous callback to handle event {}: {}", type, e)
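For readers new to the pattern, here is a stripped-down synchronous version of the publish/subscribe broker implemented above. This is toy code, not part of the commit; the real service additionally dispatches callbacks via the asyncio event loop.

from collections import defaultdict
from enum import StrEnum, auto
from typing import Any, Callable


class EventType(StrEnum):
    INCOMING_SLACK_MESSAGE = auto()
    OUTGOING_SLACK_RESPONSE = auto()


class Broker:
    def __init__(self) -> None:
        # One callback list per event type; defaultdict avoids explicit key checks.
        self._subscribers: dict[EventType, list[Callable[[Any], None]]] = defaultdict(list)

    def subscribe(self, event_type: EventType, callback: Callable[[Any], None]) -> None:
        self._subscribers[event_type].append(callback)

    def publish(self, event_type: EventType, data: Any = None) -> None:
        for callback in self._subscribers[event_type]:
            callback(data)


broker = Broker()
broker.subscribe(EventType.INCOMING_SLACK_MESSAGE, lambda data: print("received:", data))
broker.publish(EventType.INCOMING_SLACK_MESSAGE, "hello")  # prints: received: hello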
src/ctp_slack_bot/services/language_model_service.py
ADDED
@@ -0,0 +1,56 @@
+from loguru import logger
+from openai import OpenAI
+from openai.types.chat import ChatCompletion
+from pydantic import BaseModel, PrivateAttr, model_validator
+from typing import Collection, Self
+
+from ctp_slack_bot.core import Settings
+from ctp_slack_bot.models import Chunk
+
+class LanguageModelService(BaseModel):
+    """
+    Service for language model operations.
+    """
+
+    settings: Settings
+    _open_ai_client: PrivateAttr = PrivateAttr()
+
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
+        self._open_ai_client = OpenAI(api_key=self.settings.OPENAI_API_KEY.get_secret_value())
+
+    @model_validator(mode='after')
+    def post_init(self: Self) -> Self:
+        logger.debug("Created {}", self.__class__.__name__)
+        return self
+
+    def answer_question(self, question: str, context: Collection[Chunk]) -> str:
+        """Generate a response using OpenAI’s API with retrieved context.
+
+        Args:
+            question (str): The user’s question
+            context (List[RetreivedContext]): The context retreived for answering the question
+
+        Returns:
+            str: Generated answer
+        """
+        logger.debug("Generating response for question “{}” using {} context chunks…", question, len(context))
+        messages = [
+            {"role": "system", "content": self.settings.SYSTEM_PROMPT},
+            {"role": "user", "content":
+                f"""Student Question: {question}
+
+                Context from class materials and transcripts:
+                {'\n'.join(chunk.text for chunk in context)}
+
+                Please answer the Student Question based on the Context from class materials and transcripts. If the context doesn’t contain relevant information, acknowledge that and suggest asking the professor."""}
+        ]
+        # response: ChatCompletion = self._open_ai_client.chat.completions.create(
+        #     model=self.settings.CHAT_MODEL,
+        #     messages=messages,
+        #     max_tokens=self.settings.MAX_TOKENS,
+        #     temperature=self.settings.TEMPERATURE
+        # )
+
+        # return response.choices[0].message.content
+        return f"A mock response to “{question}”"
src/ctp_slack_bot/services/question_dispatch_service.py
CHANGED
@@ -1,11 +1,11 @@
 # from asyncio import create_task
 from loguru import logger
-from openai import OpenAI
 from pydantic import BaseModel, model_validator
-from typing import
+from typing import Self
 
 from ctp_slack_bot.core import Settings
-from ctp_slack_bot.
+from ctp_slack_bot.enums import EventType
+from ctp_slack_bot.models import Chunk, SlackMessage
 from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
 from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
 from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
@@ -24,8 +24,11 @@ class QuestionDispatchService(BaseModel):
     @model_validator(mode='after')
     def post_init(self: Self) -> Self:
         logger.debug("Created {}", self.__class__.__name__)
+        self.event_brokerage_service.subscribe(EventType.INCOMING_SLACK_MESSAGE, self.__process_incoming_slack_message)
         return self
 
-    def
-
-
+    def __process_incoming_slack_message(self: Self, message: SlackMessage) -> None:
+        if message.subtype != 'bot_message':
+            logger.debug("Question dispatch service received an answerable question: {}", message.text)
+            context = self.context_retrieval_service.get_context(message)
+            self.answer_retrieval_service.push(message, context)
src/ctp_slack_bot/services/slack_service.py
CHANGED
@@ -1,11 +1,11 @@
-# from asyncio import create_task
 from loguru import logger
 from openai import OpenAI
 from pydantic import BaseModel, model_validator
-from …
+from slack_bolt import App
+from typing import Any, Dict, Self
 
-from ctp_slack_bot.…
-from ctp_slack_bot.models import …
+from ctp_slack_bot.enums import EventType
+from ctp_slack_bot.models import SlackMessage, SlackResponse
 from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
 
 
@@ -14,10 +14,36 @@ class SlackService(BaseModel):
     Service for interfacing with Slack.
     """
 
-    settings: Settings
     event_brokerage_service: EventBrokerageService
+    slack_bolt_app: App
+
+    class Config:
+        arbitrary_types_allowed = True
 
     @model_validator(mode='after')
     def post_init(self: Self) -> Self:
+        self.event_brokerage_service.subscribe(EventType.OUTGOING_SLACK_RESPONSE, self.send_message)
         logger.debug("Created {}", self.__class__.__name__)
         return self
+
+    def adapt_event_payload(self: Self, event: Dict[str, Any]) -> SlackMessage:
+        return SlackMessage(
+            type=event.get("type"),
+            subtype=event.get("subtype"),
+            channel=event.get("channel"),
+            channel_type=event.get("channel_type"),
+            user=event.get("user"),
+            bot_id=event.get("bot_id"),
+            thread_ts=event.get("thread_ts"),
+            text=event.get("text", ""),
+            ts=event.get("ts"),
+            event_ts=event.get("event_ts")
+        )
+
+    def process_message(self: Self, event: Dict[str, Any]) -> None:
+        slack_message = self.adapt_event_payload(event.get("event", {}))
+        logger.debug("Received message from Slack: {}", slack_message)
+        self.event_brokerage_service.publish(EventType.INCOMING_SLACK_MESSAGE, slack_message)
+
+    def send_message(self: Self, message: SlackResponse) -> None:
+        self.slack_bolt_app.client.chat_postMessage(channel=message.channel, text=message.text, thread_ts=message.thread_ts)
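Note: the hunks above do not show how Slack's HTTP payloads reach process_message(). A hypothetical wiring using slack_bolt's standard event listener API (the function names here are illustrative, not taken from the repository):

from slack_bolt import App

from ctp_slack_bot.services.slack_service import SlackService


def register_listeners(app: App, slack_service: SlackService) -> None:
    """Hypothetical glue: forward Slack "message" events into the service layer."""

    @app.event("message")
    def handle_message_event(body, logger):
        # Bolt hands over the full request body; SlackService.process_message()
        # unwraps body["event"] itself via adapt_event_payload().
        slack_service.process_message(body)

Because send_message() is registered against OUTGOING_SLACK_RESPONSE in post_init, replies published by the answer pipeline are posted back to the originating channel and thread via chat_postMessage.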
src/ctp_slack_bot/services/vector_database_service.py
CHANGED
@@ -1,10 +1,10 @@
 from loguru import logger
 from pydantic import BaseModel, model_validator
-from typing import Any, Dict, List, Optional, Self
+from typing import Any, Collection, Dict, List, Optional, Self, Sequence
 
 from ctp_slack_bot.core import Settings
 from ctp_slack_bot.db import MongoDB
-from ctp_slack_bot.models import …
+from ctp_slack_bot.models import Chunk, Content, VectorizedChunk, VectorQuery
 
 class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
     """
@@ -19,13 +19,15 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
         logger.debug("Created {}", self.__class__.__name__)
         return self
 
-
-
-
-
-
+    # Should not allow initialization calls to bubble up all the way to the surface ― sequester in `post_init` or the class on which it depends.
+    # async def initialize(self) -> None:
+    #     """
+    #     Initialize the database connection.
+    #     """
+    #     await self.mongo_db.initialize()
 
-
+    # TODO: Weight cost of going all async.
+    async def store(self, chunks: Collection[VectorizedChunk]) -> None:
         """
         Store text and its embedding vector in the database.
 
@@ -37,8 +39,8 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
         Returns:
             str: The ID of the stored document
         """
-        if not …
-            await …
+        if not self.mongo_db.initialized:
+            await self.mongo_db.initialize()
 
         try:
             # Create document to store
@@ -49,7 +51,7 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
             }
 
             # Insert into collection
-            result = await …
+            result = await self.mongo_db.vector_collection.insert_one(document)
             logger.debug(f"Stored document with ID: {result.inserted_id}")
 
             return str(result.inserted_id)
@@ -57,7 +59,7 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
             logger.error(f"Error storing embedding: {str(e)}")
             raise
 
-    async def search_by_similarity(self, query: VectorQuery…
+    async def search_by_similarity(self, query: VectorQuery) -> Sequence[Chunk]:
         """
         Query the vector database for similar documents.
 
@@ -68,8 +70,8 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
         Returns:
             List[RetreivedContext]: List of similar documents with similarity scores
         """
-        if not …
-            await …
+        if not self.mongo_db.initialized:
+            await self.mongo_db.initialize()
 
         try:
             # Build aggregation pipeline for vector search
@@ -100,7 +102,7 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
                 pipeline.insert(1, {"$match": metadata_filter})
 
             # Execute the pipeline
-            results = await …
+            results = await self.mongo_db.vector_collection.aggregate(pipeline).to_list(length=query.k)
 
             # Convert to RetreivedContext objects directly
             context_results = []
@@ -113,7 +115,7 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
                     continue
 
                 context_results.append(
-                    …
+                    Content(
                         contextual_text=result["text"],
                         metadata_source=result["metadata"].get("source", "unknown"),
                         similarity_score=normalized_score,
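Note: the aggregation pipeline that search_by_similarity() builds (old lines 75-100) is not included in these hunks. A minimal sketch of what a MongoDB Atlas Vector Search pipeline of this shape typically looks like; the index name, field names, and candidate multiplier are assumptions, not values from the repository:

from typing import Any, Dict, List


def build_vector_search_pipeline(query_vector: List[float], k: int) -> List[Dict[str, Any]]:
    # Illustrative only: not the pipeline built by the service above.
    return [
        {
            "$vectorSearch": {
                "index": "vector_index",      # Atlas Vector Search index name (assumed)
                "path": "embedding",          # document field holding the stored vector (assumed)
                "queryVector": query_vector,
                "numCandidates": max(10 * k, 100),
                "limit": k,
            }
        },
        {
            "$project": {
                "text": 1,
                "metadata": 1,
                "score": {"$meta": "vectorSearchScore"},  # similarity score exposed by Atlas
            }
        },
    ]

The hunk around line 102 shows the service optionally inserting a $match stage for metadata filtering right after the search stage, then reading the results with to_list(length=query.k).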
src/ctp_slack_bot/services/vectorization_service.py
CHANGED
@@ -1,10 +1,13 @@
 from loguru import logger
-…
+from numpy import array
+from numpy.typing import NDArray
 from openai import OpenAI
 from pydantic import BaseModel, model_validator
-from typing import …
+from typing import Self, Sequence
 
 from ctp_slack_bot.core import Settings
+from ctp_slack_bot.models import Chunk, VectorizedChunk
+from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
 
 class VectorizationService(BaseModel):
     """
@@ -12,57 +15,21 @@ class VectorizationService(BaseModel):
     """
 
     settings: Settings
-
-
-    class Config:
-        arbitrary_types_allowed = True
+    embeddings_model_service: EmbeddingsModelService
 
     @model_validator(mode='after')
     def post_init(self: Self) -> Self:
         logger.debug("Created {}", self.__class__.__name__)
         return self
-
-    def get_embeddings(self, texts: List[str]) -> np.ndarray:
-        """Get embeddings for a list of texts using OpenAI's API.
-
-        Args:
-            texts (List[str]): List of text chunks to embed
-
-        Returns:
-            np.ndarray: Array of embeddings with shape (n_texts, VECTOR_DIMENSION)
-
-        Raises:
-            ValueError: If the embedding dimensions don't match expected size
-        """
-        try:
-            # Use the initialized client instead of the global openai module
-            response = self.client.embeddings.create(
-                model=self.settings.EMBEDDING_MODEL,
-                input=texts,
-                encoding_format="float" # Ensure we get raw float values
-            )
-
-            # Extract embeddings and verify dimensions
-            embeddings = np.array([data.embedding for data in response.data])
-
-            if embeddings.shape[1] != self.settings.VECTOR_DIMENSION:
-                raise ValueError(
-                    f"Embedding dimension mismatch. Expected {self.settings.VECTOR_DIMENSION}, "
-                    f"but got {embeddings.shape[1]}. Please update VECTOR_DIMENSION "
-                    f"in config.py to match the model's output."
-                )
-
-            return embeddings
-
-        except Exception as e:
-            print(f"Error getting embeddings: {str(e)}")
-            pass
 
-    def …
-
-
-
-
-
-
-
+    def vectorize(self: Self, chunks: Sequence[Chunk]) -> Sequence[VectorizedChunk]:
+        embeddings = self.embeddings_model_service.get_embeddings([chunk.text for chunk in chunks])
+        return tuple(VectorizedChunk(
+                         text=chunk.text,
+                         parent_id=chunk.parent_id,
+                         chunk_id=chunk.chunk_id,
+                         metadata=chunk.metadata,
+                         embedding=embedding
+                     )
+                     for chunk, embedding
+                     in zip(chunks, embeddings))
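Note: the body of EmbeddingsModelService is not reproduced in this view; vectorize() only assumes it exposes get_embeddings(). A minimal sketch of such a wrapper, modeled on the inline code removed from VectorizationService above (class name, constructor arguments, and the default model are assumptions):

from typing import Sequence

from numpy import array
from numpy.typing import NDArray
from openai import OpenAI


class EmbeddingsClientSketch:
    """Illustrative batch-embedding wrapper; not the repository's EmbeddingsModelService."""

    def __init__(self, api_key: str, model: str = "text-embedding-3-small") -> None:
        self.client = OpenAI(api_key=api_key)
        self.model = model

    def get_embeddings(self, texts: Sequence[str]) -> NDArray:
        # One batched API call; OpenAI returns one embedding per input, in order.
        response = self.client.embeddings.create(model=self.model, input=list(texts))
        return array([item.embedding for item in response.data])

vectorize() then zips each Chunk with its row of the returned array to build VectorizedChunk instances, so the only coupling between the two services is the get_embeddings() signature.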
src/ctp_slack_bot/tasks/scheduler.py
CHANGED
@@ -6,7 +6,7 @@ from loguru import logger
 from pytz import timezone
 from typing import Optional
 
-from ctp_slack_bot import Container
+from ctp_slack_bot.containers import Container
 
 @inject
 def start_scheduler(container: Container) -> AsyncIOScheduler:
|