LiKenun committed on
Commit
bb5dde5
·
1 Parent(s): 86644e7

Repository pattern refactor W.I.P.

Browse files
notebooks/google_drive_web_vtt_vectorizer_and_storer.ipynb CHANGED
@@ -11,31 +11,7 @@
11
  "cell_type": "code",
12
  "execution_count": null,
13
  "metadata": {},
14
- "outputs": [
15
- {
16
- "name": "stderr",
17
- "output_type": "stream",
18
- "text": [
19
- "\u001b[32m2025-04-20 00:08:14.649\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n",
20
- "\u001b[32m2025-04-20 00:08:14.652\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.google_drive_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m44\u001b[0m - \u001b[34m\u001b[1mCreated GoogleDriveService\u001b[0m\n",
21
- "\u001b[32m2025-04-20 00:08:14.664\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m22\u001b[0m - \u001b[34m\u001b[1mCreated EmbeddingsModelService\u001b[0m\n",
22
- "\u001b[32m2025-04-20 00:08:14.664\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vectorization_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m22\u001b[0m - \u001b[34m\u001b[1mCreated VectorizationService\u001b[0m\n"
23
- ]
24
- },
25
- {
26
- "name": "stderr",
27
- "output_type": "stream",
28
- "text": [
29
- "\u001b[32m2025-04-20 00:08:14.666\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36minit\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1mInitializing MongoDB connection for database: ctp_slack_bot_dev\u001b[0m\n",
30
- "\u001b[32m2025-04-20 00:08:14.666\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m26\u001b[0m - \u001b[34m\u001b[1mCreated MongoDB\u001b[0m\n",
31
- "\u001b[32m2025-04-20 00:08:14.667\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mconnect\u001b[0m:\u001b[36m32\u001b[0m - \u001b[34m\u001b[1mConnecting to MongoDB using URI: mongodb+srv://ctp-slack-bot.xkipuvm.mongodb.net/?retryWrites=true&w=majority&appName=ctp-slack-bot\u001b[0m\n",
32
- "\u001b[32m2025-04-20 00:08:14.667\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mconnect\u001b[0m:\u001b[36m49\u001b[0m - \u001b[34m\u001b[1mMongoDB client initialized for database: ctp_slack_bot_dev\u001b[0m\n",
33
- "\u001b[32m2025-04-20 00:08:15.043\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
34
- "\u001b[32m2025-04-20 00:08:15.044\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36m_test_connection\u001b[0m:\u001b[36m186\u001b[0m - \u001b[1mMongoDB connection test successful!\u001b[0m\n",
35
- "\u001b[32m2025-04-20 00:08:15.044\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m21\u001b[0m - \u001b[34m\u001b[1mCreated VectorDatabaseService\u001b[0m\n"
36
- ]
37
- }
38
- ],
39
  "source": [
40
  "from datetime import datetime\n",
41
  "from functools import partial\n",
@@ -67,7 +43,7 @@
67
  },
68
  {
69
  "cell_type": "code",
70
- "execution_count": 6,
71
  "metadata": {},
72
  "outputs": [],
73
  "source": [
@@ -85,64 +61,9 @@
85
  },
86
  {
87
  "cell_type": "code",
88
- "execution_count": 7,
89
  "metadata": {},
90
- "outputs": [
91
- {
92
- "data": {
93
- "text/html": [
94
- "<p>Found 11 files/folders.</p>"
95
- ]
96
- },
97
- "metadata": {},
98
- "output_type": "display_data"
99
- },
100
- {
101
- "data": {
102
- "text/html": [
103
- "<ul><li>/Friday Building AI Applications Session</li><li>/Friday Building AI Applications Session/GMT20250411-223535_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250404-231749_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250328-223256_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250321-223330_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250314-223145_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250307-233135_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250228-233632_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250221-233332_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250214-234809_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250207-233258_Recording.transcript.vtt</li></ul>"
104
- ]
105
- },
106
- "metadata": {},
107
- "output_type": "display_data"
108
- },
109
- {
110
- "data": {
111
- "text/html": [
112
- "<p>11 files/folders pass the modification time (<em>2024-08-30 00:00:00+00:00</em>) cut-off.</p>"
113
- ]
114
- },
115
- "metadata": {},
116
- "output_type": "display_data"
117
- },
118
- {
119
- "data": {
120
- "text/html": [
121
- "<ul><li>/Friday Building AI Applications Session</li><li>/Friday Building AI Applications Session/GMT20250411-223535_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250404-231749_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250328-223256_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250321-223330_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250314-223145_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250307-233135_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250228-233632_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250221-233332_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250214-234809_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250207-233258_Recording.transcript.vtt</li></ul>"
122
- ]
123
- },
124
- "metadata": {},
125
- "output_type": "display_data"
126
- },
127
- {
128
- "data": {
129
- "text/html": [
130
- "<p>11 files/folders pass the modification time (<em>2024-08-30 00:00:00+00:00</em>) cut-off and MIME type (<em>text/vtt</em>) criterion.</p>"
131
- ]
132
- },
133
- "metadata": {},
134
- "output_type": "display_data"
135
- },
136
- {
137
- "data": {
138
- "text/html": [
139
- "<ul><li>/Friday Building AI Applications Session/GMT20250411-223535_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250404-231749_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250328-223256_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250321-223330_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250314-223145_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250307-233135_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250228-233632_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250221-233332_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250214-234809_Recording.transcript.vtt</li><li>/Friday Building AI Applications Session/GMT20250207-233258_Recording.transcript.vtt</li></ul>"
140
- ]
141
- },
142
- "metadata": {},
143
- "output_type": "display_data"
144
- }
145
- ],
146
  "source": [
147
  "item_metadata = google_drive_service.list_directory(\"\", True)\n",
148
  "display_html(f\"<p>Found {len(item_metadata)} files/folders.</p>\")\n",
@@ -159,19 +80,9 @@
159
  },
160
  {
161
  "cell_type": "code",
162
- "execution_count": 8,
163
  "metadata": {},
164
- "outputs": [
165
- {
166
- "data": {
167
- "text/html": [
168
- "Processed 10 files."
169
- ]
170
- },
171
- "metadata": {},
172
- "output_type": "display_data"
173
- }
174
- ],
175
  "source": [
176
  "web_vtts = tuple(WebVTTContent.from_bytes(f\"googledrive:{metadata.folder_path}/{metadata.name}\",\n",
177
  " {\n",
@@ -188,521 +99,9 @@
188
  },
189
  {
190
  "cell_type": "code",
191
- "execution_count": 9,
192
  "metadata": {},
193
- "outputs": [
194
- {
195
- "data": {
196
- "text/html": [
197
- "Chunked GMT20250411-223535_Recording.transcript.vtt into 86 chunks."
198
- ]
199
- },
200
- "metadata": {},
201
- "output_type": "display_data"
202
- },
203
- {
204
- "name": "stderr",
205
- "output_type": "stream",
206
- "text": [
207
- "\u001b[32m2025-04-20 00:08:52.269\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 86 text string(s)…\u001b[0m\n"
208
- ]
209
- },
210
- {
211
- "data": {
212
- "text/html": [
213
- "Vectorized GMT20250411-223535_Recording.transcript.vtt’s 86 chunks."
214
- ]
215
- },
216
- "metadata": {},
217
- "output_type": "display_data"
218
- },
219
- {
220
- "name": "stderr",
221
- "output_type": "stream",
222
- "text": [
223
- "\u001b[32m2025-04-20 00:08:54.190\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 86 chunks\u001b[0m\n",
224
- "\u001b[32m2025-04-20 00:08:54.216\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
225
- "\u001b[32m2025-04-20 00:08:54.217\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
226
- "\u001b[32m2025-04-20 00:08:54.240\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m110\u001b[0m - \u001b[1mCollection 'vectors' does not exist. Creating it…\u001b[0m\n",
227
- "\u001b[32m2025-04-20 00:08:54.301\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m113\u001b[0m - \u001b[34m\u001b[1mSuccessfully created collection: vectors\u001b[0m\n",
228
- "\u001b[32m2025-04-20 00:08:54.302\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
229
- "\u001b[32m2025-04-20 00:08:54.320\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
230
- "\u001b[32m2025-04-20 00:08:54.321\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
231
- "\u001b[32m2025-04-20 00:08:54.341\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
232
- "\u001b[32m2025-04-20 00:08:54.491\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
233
- "\u001b[32m2025-04-20 00:08:54.494\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 86 documents into vectors collection\u001b[0m\n",
234
- "\u001b[32m2025-04-20 00:08:55.239\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 86 vector chunks in database\u001b[0m\n"
235
- ]
236
- },
237
- {
238
- "data": {
239
- "text/html": [
240
- "Stored GMT20250411-223535_Recording.transcript.vtt’s 86 vectorized chunks to the database."
241
- ]
242
- },
243
- "metadata": {},
244
- "output_type": "display_data"
245
- },
246
- {
247
- "data": {
248
- "text/html": [
249
- "Chunked GMT20250404-231749_Recording.transcript.vtt into 56 chunks."
250
- ]
251
- },
252
- "metadata": {},
253
- "output_type": "display_data"
254
- },
255
- {
256
- "name": "stderr",
257
- "output_type": "stream",
258
- "text": [
259
- "\u001b[32m2025-04-20 00:08:55.241\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 56 text string(s)…\u001b[0m\n"
260
- ]
261
- },
262
- {
263
- "data": {
264
- "text/html": [
265
- "Vectorized GMT20250404-231749_Recording.transcript.vtt’s 56 chunks."
266
- ]
267
- },
268
- "metadata": {},
269
- "output_type": "display_data"
270
- },
271
- {
272
- "name": "stderr",
273
- "output_type": "stream",
274
- "text": [
275
- "\u001b[32m2025-04-20 00:08:56.099\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 56 chunks\u001b[0m\n",
276
- "\u001b[32m2025-04-20 00:08:56.119\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
277
- "\u001b[32m2025-04-20 00:08:56.120\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
278
- "\u001b[32m2025-04-20 00:08:56.151\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
279
- "\u001b[32m2025-04-20 00:08:56.151\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
280
- "\u001b[32m2025-04-20 00:08:56.170\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
281
- "\u001b[32m2025-04-20 00:08:56.170\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
282
- "\u001b[32m2025-04-20 00:08:56.199\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
283
- "\u001b[32m2025-04-20 00:08:56.341\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
284
- "\u001b[32m2025-04-20 00:08:56.341\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 56 documents into vectors collection\u001b[0m\n",
285
- "\u001b[32m2025-04-20 00:08:56.732\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 56 vector chunks in database\u001b[0m\n"
286
- ]
287
- },
288
- {
289
- "data": {
290
- "text/html": [
291
- "Stored GMT20250404-231749_Recording.transcript.vtt’s 56 vectorized chunks to the database."
292
- ]
293
- },
294
- "metadata": {},
295
- "output_type": "display_data"
296
- },
297
- {
298
- "data": {
299
- "text/html": [
300
- "Chunked GMT20250328-223256_Recording.transcript.vtt into 359 chunks."
301
- ]
302
- },
303
- "metadata": {},
304
- "output_type": "display_data"
305
- },
306
- {
307
- "name": "stderr",
308
- "output_type": "stream",
309
- "text": [
310
- "\u001b[32m2025-04-20 00:08:56.735\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 359 text string(s)…\u001b[0m\n"
311
- ]
312
- },
313
- {
314
- "data": {
315
- "text/html": [
316
- "Vectorized GMT20250328-223256_Recording.transcript.vtt’s 359 chunks."
317
- ]
318
- },
319
- "metadata": {},
320
- "output_type": "display_data"
321
- },
322
- {
323
- "name": "stderr",
324
- "output_type": "stream",
325
- "text": [
326
- "\u001b[32m2025-04-20 00:09:00.360\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 359 chunks\u001b[0m\n",
327
- "\u001b[32m2025-04-20 00:09:00.384\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
328
- "\u001b[32m2025-04-20 00:09:00.384\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
329
- "\u001b[32m2025-04-20 00:09:00.404\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
330
- "\u001b[32m2025-04-20 00:09:00.404\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
331
- "\u001b[32m2025-04-20 00:09:00.424\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
332
- "\u001b[32m2025-04-20 00:09:00.424\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
333
- "\u001b[32m2025-04-20 00:09:00.445\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
334
- "\u001b[32m2025-04-20 00:09:00.588\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
335
- "\u001b[32m2025-04-20 00:09:00.590\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 359 documents into vectors collection\u001b[0m\n",
336
- "\u001b[32m2025-04-20 00:09:04.394\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 359 vector chunks in database\u001b[0m\n"
337
- ]
338
- },
339
- {
340
- "data": {
341
- "text/html": [
342
- "Stored GMT20250328-223256_Recording.transcript.vtt’s 359 vectorized chunks to the database."
343
- ]
344
- },
345
- "metadata": {},
346
- "output_type": "display_data"
347
- },
348
- {
349
- "data": {
350
- "text/html": [
351
- "Chunked GMT20250321-223330_Recording.transcript.vtt into 314 chunks."
352
- ]
353
- },
354
- "metadata": {},
355
- "output_type": "display_data"
356
- },
357
- {
358
- "name": "stderr",
359
- "output_type": "stream",
360
- "text": [
361
- "\u001b[32m2025-04-20 00:09:04.397\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 314 text string(s)…\u001b[0m\n"
362
- ]
363
- },
364
- {
365
- "data": {
366
- "text/html": [
367
- "Vectorized GMT20250321-223330_Recording.transcript.vtt’s 314 chunks."
368
- ]
369
- },
370
- "metadata": {},
371
- "output_type": "display_data"
372
- },
373
- {
374
- "name": "stderr",
375
- "output_type": "stream",
376
- "text": [
377
- "\u001b[32m2025-04-20 00:09:07.348\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 314 chunks\u001b[0m\n",
378
- "\u001b[32m2025-04-20 00:09:07.369\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
379
- "\u001b[32m2025-04-20 00:09:07.370\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
380
- "\u001b[32m2025-04-20 00:09:07.389\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
381
- "\u001b[32m2025-04-20 00:09:07.389\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
382
- "\u001b[32m2025-04-20 00:09:07.410\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
383
- "\u001b[32m2025-04-20 00:09:07.410\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
384
- "\u001b[32m2025-04-20 00:09:07.430\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
385
- "\u001b[32m2025-04-20 00:09:07.566\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
386
- "\u001b[32m2025-04-20 00:09:07.568\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 314 documents into vectors collection\u001b[0m\n",
387
- "\u001b[32m2025-04-20 00:09:11.153\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 314 vector chunks in database\u001b[0m\n"
388
- ]
389
- },
390
- {
391
- "data": {
392
- "text/html": [
393
- "Stored GMT20250321-223330_Recording.transcript.vtt’s 314 vectorized chunks to the database."
394
- ]
395
- },
396
- "metadata": {},
397
- "output_type": "display_data"
398
- },
399
- {
400
- "data": {
401
- "text/html": [
402
- "Chunked GMT20250314-223145_Recording.transcript.vtt into 331 chunks."
403
- ]
404
- },
405
- "metadata": {},
406
- "output_type": "display_data"
407
- },
408
- {
409
- "name": "stderr",
410
- "output_type": "stream",
411
- "text": [
412
- "\u001b[32m2025-04-20 00:09:11.157\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 331 text string(s)…\u001b[0m\n"
413
- ]
414
- },
415
- {
416
- "data": {
417
- "text/html": [
418
- "Vectorized GMT20250314-223145_Recording.transcript.vtt’s 331 chunks."
419
- ]
420
- },
421
- "metadata": {},
422
- "output_type": "display_data"
423
- },
424
- {
425
- "name": "stderr",
426
- "output_type": "stream",
427
- "text": [
428
- "\u001b[32m2025-04-20 00:09:14.751\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 331 chunks\u001b[0m\n",
429
- "\u001b[32m2025-04-20 00:09:14.774\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
430
- "\u001b[32m2025-04-20 00:09:14.774\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
431
- "\u001b[32m2025-04-20 00:09:14.794\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
432
- "\u001b[32m2025-04-20 00:09:14.794\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
433
- "\u001b[32m2025-04-20 00:09:14.813\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
434
- "\u001b[32m2025-04-20 00:09:14.813\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
435
- "\u001b[32m2025-04-20 00:09:14.834\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
436
- "\u001b[32m2025-04-20 00:09:14.948\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
437
- "\u001b[32m2025-04-20 00:09:14.950\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 331 documents into vectors collection\u001b[0m\n",
438
- "\u001b[32m2025-04-20 00:09:18.640\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 331 vector chunks in database\u001b[0m\n"
439
- ]
440
- },
441
- {
442
- "data": {
443
- "text/html": [
444
- "Stored GMT20250314-223145_Recording.transcript.vtt’s 331 vectorized chunks to the database."
445
- ]
446
- },
447
- "metadata": {},
448
- "output_type": "display_data"
449
- },
450
- {
451
- "data": {
452
- "text/html": [
453
- "Chunked GMT20250307-233135_Recording.transcript.vtt into 280 chunks."
454
- ]
455
- },
456
- "metadata": {},
457
- "output_type": "display_data"
458
- },
459
- {
460
- "name": "stderr",
461
- "output_type": "stream",
462
- "text": [
463
- "\u001b[32m2025-04-20 00:09:18.643\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 280 text string(s)…\u001b[0m\n"
464
- ]
465
- },
466
- {
467
- "data": {
468
- "text/html": [
469
- "Vectorized GMT20250307-233135_Recording.transcript.vtt’s 280 chunks."
470
- ]
471
- },
472
- "metadata": {},
473
- "output_type": "display_data"
474
- },
475
- {
476
- "name": "stderr",
477
- "output_type": "stream",
478
- "text": [
479
- "\u001b[32m2025-04-20 00:09:22.256\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 280 chunks\u001b[0m\n",
480
- "\u001b[32m2025-04-20 00:09:22.278\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
481
- "\u001b[32m2025-04-20 00:09:22.279\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
482
- "\u001b[32m2025-04-20 00:09:22.297\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
483
- "\u001b[32m2025-04-20 00:09:22.297\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
484
- "\u001b[32m2025-04-20 00:09:22.344\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
485
- "\u001b[32m2025-04-20 00:09:22.345\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
486
- "\u001b[32m2025-04-20 00:09:22.368\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
487
- "\u001b[32m2025-04-20 00:09:22.505\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
488
- "\u001b[32m2025-04-20 00:09:22.507\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 280 documents into vectors collection\u001b[0m\n",
489
- "\u001b[32m2025-04-20 00:09:24.988\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 280 vector chunks in database\u001b[0m\n"
490
- ]
491
- },
492
- {
493
- "data": {
494
- "text/html": [
495
- "Stored GMT20250307-233135_Recording.transcript.vtt’s 280 vectorized chunks to the database."
496
- ]
497
- },
498
- "metadata": {},
499
- "output_type": "display_data"
500
- },
501
- {
502
- "data": {
503
- "text/html": [
504
- "Chunked GMT20250228-233632_Recording.transcript.vtt into 233 chunks."
505
- ]
506
- },
507
- "metadata": {},
508
- "output_type": "display_data"
509
- },
510
- {
511
- "name": "stderr",
512
- "output_type": "stream",
513
- "text": [
514
- "\u001b[32m2025-04-20 00:09:24.991\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 233 text string(s)…\u001b[0m\n"
515
- ]
516
- },
517
- {
518
- "data": {
519
- "text/html": [
520
- "Vectorized GMT20250228-233632_Recording.transcript.vtt’s 233 chunks."
521
- ]
522
- },
523
- "metadata": {},
524
- "output_type": "display_data"
525
- },
526
- {
527
- "name": "stderr",
528
- "output_type": "stream",
529
- "text": [
530
- "\u001b[32m2025-04-20 00:09:28.628\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 233 chunks\u001b[0m\n",
531
- "\u001b[32m2025-04-20 00:09:28.648\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
532
- "\u001b[32m2025-04-20 00:09:28.649\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
533
- "\u001b[32m2025-04-20 00:09:28.669\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
534
- "\u001b[32m2025-04-20 00:09:28.669\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
535
- "\u001b[32m2025-04-20 00:09:28.688\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
536
- "\u001b[32m2025-04-20 00:09:28.688\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
537
- "\u001b[32m2025-04-20 00:09:28.709\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
538
- "\u001b[32m2025-04-20 00:09:28.836\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
539
- "\u001b[32m2025-04-20 00:09:28.838\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 233 documents into vectors collection\u001b[0m\n",
540
- "\u001b[32m2025-04-20 00:09:31.039\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 233 vector chunks in database\u001b[0m\n"
541
- ]
542
- },
543
- {
544
- "data": {
545
- "text/html": [
546
- "Stored GMT20250228-233632_Recording.transcript.vtt’s 233 vectorized chunks to the database."
547
- ]
548
- },
549
- "metadata": {},
550
- "output_type": "display_data"
551
- },
552
- {
553
- "data": {
554
- "text/html": [
555
- "Chunked GMT20250221-233332_Recording.transcript.vtt into 278 chunks."
556
- ]
557
- },
558
- "metadata": {},
559
- "output_type": "display_data"
560
- },
561
- {
562
- "name": "stderr",
563
- "output_type": "stream",
564
- "text": [
565
- "\u001b[32m2025-04-20 00:09:31.042\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 278 text string(s)…\u001b[0m\n"
566
- ]
567
- },
568
- {
569
- "data": {
570
- "text/html": [
571
- "Vectorized GMT20250221-233332_Recording.transcript.vtt’s 278 chunks."
572
- ]
573
- },
574
- "metadata": {},
575
- "output_type": "display_data"
576
- },
577
- {
578
- "name": "stderr",
579
- "output_type": "stream",
580
- "text": [
581
- "\u001b[32m2025-04-20 00:09:36.119\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 278 chunks\u001b[0m\n",
582
- "\u001b[32m2025-04-20 00:09:36.138\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
583
- "\u001b[32m2025-04-20 00:09:36.138\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
584
- "\u001b[32m2025-04-20 00:09:36.157\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
585
- "\u001b[32m2025-04-20 00:09:36.158\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
586
- "\u001b[32m2025-04-20 00:09:36.177\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
587
- "\u001b[32m2025-04-20 00:09:36.177\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
588
- "\u001b[32m2025-04-20 00:09:36.198\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
589
- "\u001b[32m2025-04-20 00:09:36.314\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
590
- "\u001b[32m2025-04-20 00:09:36.316\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 278 documents into vectors collection\u001b[0m\n",
591
- "\u001b[32m2025-04-20 00:09:38.707\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 278 vector chunks in database\u001b[0m\n"
592
- ]
593
- },
594
- {
595
- "data": {
596
- "text/html": [
597
- "Stored GMT20250221-233332_Recording.transcript.vtt’s 278 vectorized chunks to the database."
598
- ]
599
- },
600
- "metadata": {},
601
- "output_type": "display_data"
602
- },
603
- {
604
- "data": {
605
- "text/html": [
606
- "Chunked GMT20250214-234809_Recording.transcript.vtt into 97 chunks."
607
- ]
608
- },
609
- "metadata": {},
610
- "output_type": "display_data"
611
- },
612
- {
613
- "name": "stderr",
614
- "output_type": "stream",
615
- "text": [
616
- "\u001b[32m2025-04-20 00:09:38.710\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 97 text string(s)…\u001b[0m\n"
617
- ]
618
- },
619
- {
620
- "data": {
621
- "text/html": [
622
- "Vectorized GMT20250214-234809_Recording.transcript.vtt’s 97 chunks."
623
- ]
624
- },
625
- "metadata": {},
626
- "output_type": "display_data"
627
- },
628
- {
629
- "name": "stderr",
630
- "output_type": "stream",
631
- "text": [
632
- "\u001b[32m2025-04-20 00:09:40.479\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 97 chunks\u001b[0m\n",
633
- "\u001b[32m2025-04-20 00:09:40.499\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
634
- "\u001b[32m2025-04-20 00:09:40.499\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
635
- "\u001b[32m2025-04-20 00:09:40.529\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
636
- "\u001b[32m2025-04-20 00:09:40.529\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
637
- "\u001b[32m2025-04-20 00:09:40.548\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
638
- "\u001b[32m2025-04-20 00:09:40.548\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
639
- "\u001b[32m2025-04-20 00:09:40.568\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
640
- "\u001b[32m2025-04-20 00:09:40.678\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
641
- "\u001b[32m2025-04-20 00:09:40.679\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 97 documents into vectors collection\u001b[0m\n",
642
- "\u001b[32m2025-04-20 00:09:41.562\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 97 vector chunks in database\u001b[0m\n"
643
- ]
644
- },
645
- {
646
- "data": {
647
- "text/html": [
648
- "Stored GMT20250214-234809_Recording.transcript.vtt’s 97 vectorized chunks to the database."
649
- ]
650
- },
651
- "metadata": {},
652
- "output_type": "display_data"
653
- },
654
- {
655
- "data": {
656
- "text/html": [
657
- "Chunked GMT20250207-233258_Recording.transcript.vtt into 209 chunks."
658
- ]
659
- },
660
- "metadata": {},
661
- "output_type": "display_data"
662
- },
663
- {
664
- "name": "stderr",
665
- "output_type": "stream",
666
- "text": [
667
- "\u001b[32m2025-04-20 00:09:41.565\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 209 text string(s)…\u001b[0m\n"
668
- ]
669
- },
670
- {
671
- "data": {
672
- "text/html": [
673
- "Vectorized GMT20250207-233258_Recording.transcript.vtt’s 209 chunks."
674
- ]
675
- },
676
- "metadata": {},
677
- "output_type": "display_data"
678
- },
679
- {
680
- "name": "stderr",
681
- "output_type": "stream",
682
- "text": [
683
- "\u001b[32m2025-04-20 00:09:44.152\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 209 chunks\u001b[0m\n",
684
- "\u001b[32m2025-04-20 00:09:44.178\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
685
- "\u001b[32m2025-04-20 00:09:44.178\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
686
- "\u001b[32m2025-04-20 00:09:44.197\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
687
- "\u001b[32m2025-04-20 00:09:44.198\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
688
- "\u001b[32m2025-04-20 00:09:44.221\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
689
- "\u001b[32m2025-04-20 00:09:44.222\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
690
- "\u001b[32m2025-04-20 00:09:44.247\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
691
- "\u001b[32m2025-04-20 00:09:44.390\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
692
- "\u001b[32m2025-04-20 00:09:44.391\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 209 documents into vectors collection\u001b[0m\n",
693
- "\u001b[32m2025-04-20 00:09:46.410\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 209 vector chunks in database\u001b[0m\n"
694
- ]
695
- },
696
- {
697
- "data": {
698
- "text/html": [
699
- "Stored GMT20250207-233258_Recording.transcript.vtt’s 209 vectorized chunks to the database."
700
- ]
701
- },
702
- "metadata": {},
703
- "output_type": "display_data"
704
- }
705
- ],
706
  "source": [
707
  "for web_vtt in web_vtts:\n",
708
  " chunks = web_vtt.get_chunks()\n",
 
11
  "cell_type": "code",
12
  "execution_count": null,
13
  "metadata": {},
14
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  "source": [
16
  "from datetime import datetime\n",
17
  "from functools import partial\n",
 
43
  },
44
  {
45
  "cell_type": "code",
46
+ "execution_count": null,
47
  "metadata": {},
48
  "outputs": [],
49
  "source": [
 
61
  },
62
  {
63
  "cell_type": "code",
64
+ "execution_count": null,
65
  "metadata": {},
66
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  "source": [
68
  "item_metadata = google_drive_service.list_directory(\"\", True)\n",
69
  "display_html(f\"<p>Found {len(item_metadata)} files/folders.</p>\")\n",
 
80
  },
81
  {
82
  "cell_type": "code",
83
+ "execution_count": null,
84
  "metadata": {},
85
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
86
  "source": [
87
  "web_vtts = tuple(WebVTTContent.from_bytes(f\"googledrive:{metadata.folder_path}/{metadata.name}\",\n",
88
  " {\n",
 
99
  },
100
  {
101
  "cell_type": "code",
102
+ "execution_count": null,
103
  "metadata": {},
104
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  "source": [
106
  "for web_vtt in web_vtts:\n",
107
  " chunks = web_vtt.get_chunks()\n",
src/ctp_slack_bot/containers.py CHANGED
@@ -5,7 +5,7 @@ from slack_bolt.async_app import AsyncApp
5
 
6
  from ctp_slack_bot.core.config import Settings
7
  from ctp_slack_bot.db.mongo_db import MongoDBResource
8
- from ctp_slack_bot.db.repositories import MongoVectorizedChunkRepository
9
  from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
10
  from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
11
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
@@ -20,13 +20,13 @@ from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
20
  from ctp_slack_bot.services.vectorization_service import VectorizationService
21
 
22
 
23
- class Container(DeclarativeContainer):
24
  settings = Singleton(Settings)
25
  event_brokerage_service = Singleton(EventBrokerageService)
26
  schedule_service = Resource(ScheduleServiceResource, settings=settings)
27
  mongo_db = Resource(MongoDBResource, settings=settings) # TODO: generalize to any database.
28
- vectorized_chunk_repository = Singleton(MongoVectorizedChunkRepository, mongo_db=mongo_db)
29
- vector_database_service = Singleton(VectorDatabaseService, settings=settings, mongo_db=mongo_db)
30
  embeddings_model_service = Singleton(EmbeddingsModelService, settings=settings)
31
  vectorization_service = Singleton(VectorizationService, settings=settings, embeddings_model_service=embeddings_model_service)
32
  content_ingestion_service = Singleton(ContentIngestionService, settings=settings, event_brokerage_service=event_brokerage_service, vector_database_service=vector_database_service, vectorization_service=vectorization_service)
 
5
 
6
  from ctp_slack_bot.core.config import Settings
7
  from ctp_slack_bot.db.mongo_db import MongoDBResource
8
+ from ctp_slack_bot.db.repositories.mongo_db_vectorized_chunk_repository import MongoVectorizedChunkRepositoryResource
9
  from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
10
  from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
11
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
 
20
  from ctp_slack_bot.services.vectorization_service import VectorizationService
21
 
22
 
23
+ class Container(DeclarativeContainer): # TODO: audit for potential async-related bugs.
24
  settings = Singleton(Settings)
25
  event_brokerage_service = Singleton(EventBrokerageService)
26
  schedule_service = Resource(ScheduleServiceResource, settings=settings)
27
  mongo_db = Resource(MongoDBResource, settings=settings) # TODO: generalize to any database.
28
+ vectorized_chunk_repository = Resource(MongoVectorizedChunkRepositoryResource, settings=settings, mongo_db=mongo_db)
29
+ vector_database_service = Singleton(VectorDatabaseService, settings=settings, vectorized_chunk_repository=vectorized_chunk_repository)
30
  embeddings_model_service = Singleton(EmbeddingsModelService, settings=settings)
31
  vectorization_service = Singleton(VectorizationService, settings=settings, embeddings_model_service=embeddings_model_service)
32
  content_ingestion_service = Singleton(ContentIngestionService, settings=settings, event_brokerage_service=event_brokerage_service, vector_database_service=vector_database_service, vectorization_service=vectorization_service)
src/ctp_slack_bot/db/mongo_db.py CHANGED
@@ -1,15 +1,14 @@
1
- from asyncio import create_task
2
  from dependency_injector.resources import AsyncResource
3
- from motor.motor_asyncio import AsyncIOMotorClient
4
  from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
5
- from pymongo.operations import SearchIndexModel
6
  from loguru import logger
7
  from pydantic import BaseModel, PrivateAttr
8
- from typing import Any, Dict, Optional, Self
9
 
10
  from ctp_slack_bot.core.config import Settings
11
  from ctp_slack_bot.utils import sanitize_mongo_db_uri
12
 
 
13
  class MongoDB(BaseModel):
14
  """
15
  MongoDB connection manager using Motor for async operations.
@@ -19,6 +18,7 @@ class MongoDB(BaseModel):
19
  _db: PrivateAttr = PrivateAttr()
20
 
21
  class Config:
 
22
  arbitrary_types_allowed = True
23
 
24
  def __init__(self: Self, **data: Dict[str, Any]) -> None:
@@ -31,7 +31,7 @@ class MongoDB(BaseModel):
31
  connection_string = self.settings.MONGODB_URI.get_secret_value()
32
  logger.debug("Connecting to MongoDB using URI: {}", sanitize_mongo_db_uri(connection_string))
33
 
34
- # Create client with appropriate settings
35
  self._client = AsyncIOMotorClient(
36
  connection_string,
37
  serverSelectionTimeoutMS=5000,
@@ -42,7 +42,7 @@ class MongoDB(BaseModel):
42
  w="majority"
43
  )
44
 
45
- # Set database
46
  db_name = self.settings.MONGODB_NAME
47
 
48
  self._db = self._client[db_name]
@@ -54,116 +54,50 @@ class MongoDB(BaseModel):
54
  self._db = None
55
  raise
56
 
57
- @property
58
- def client(self: Self) -> AsyncIOMotorClient:
59
- """Get the MongoDB client instance."""
60
- if not hasattr(self, '_client') or self._client is None:
61
- logger.warning("MongoDB client not initialized. Attempting to initialize…")
62
- self.connect()
63
- if not hasattr(self, '_client') or self._client is None:
64
- raise ConnectionError("Failed to initialize MongoDB client.")
65
- return self._client
66
-
67
- @property
68
- def db(self: Self) -> Any:
69
- """Get the MongoDB database instance."""
70
- if not hasattr(self, '_db') or self._db is None:
71
- logger.warning("MongoDB database not initialized. Attempting to initialize client…")
72
- self.connect()
73
- if not hasattr(self, '_db') or self._db is None:
74
- raise ConnectionError("Failed to initialize MongoDB database.")
75
- return self._db
76
-
77
  async def ping(self: Self) -> bool:
78
  """Check if MongoDB connection is alive."""
79
  try:
80
- # Get client to ensure we're connected
81
- client = self.client
82
-
83
- # Try a simple ping command
84
- await client.admin.command('ping')
85
  logger.debug("MongoDB connection is active!")
86
  return True
87
  except (ConnectionFailure, ServerSelectionTimeoutError) as e:
88
  logger.error("MongoDB connection failed: {}", e)
89
- return False
90
  except Exception as e:
91
  logger.error("Unexpected error during MongoDB ping: {}", e)
92
- return False
93
 
94
- async def get_collection(self: Self, name: str) -> Any:
95
  """
96
- Get a collection by name with validation.
97
- Creates the collection if it doesn't exist.
98
  """
99
- # First ensure we can connect at all
100
  if not await self.ping():
101
  logger.error("Cannot get collection '{}' because a MongoDB connection is not available.", name)
102
  raise ConnectionError("MongoDB connection is not available.")
103
 
104
  try:
105
- # Get all collection names to check if this one exists
106
  logger.debug("Checking if collection '{}' exists…", name)
107
- collection_names = await self.db.list_collection_names()
108
 
109
  if name not in collection_names:
110
  logger.info("Collection '{}' does not exist. Creating it…", name)
111
- # Create the collection
112
- await self.db.create_collection(name)
 
113
  logger.debug("Successfully created collection: {}", name)
114
  else:
115
  logger.debug("Collection '{}' already exists!", name)
116
 
117
- # Get and return the collection
118
- collection = self.db[name]
119
  return collection
120
  except Exception as e:
121
  logger.error("Error accessing collection '{}': {}", name, e)
122
  raise
123
 
124
- async def create_indexes(self: Self, collection_name: str) -> None:
125
- """
126
- Create a vector search index on a collection.
127
-
128
- Args:
129
- collection_name: Name of the collection
130
- """
131
- collection = await self.get_collection(collection_name)
132
-
133
- try:
134
- # Create search index model using MongoDB's recommended approach
135
- search_index_model = SearchIndexModel(
136
- definition={
137
- "fields": [
138
- {
139
- "type": "vector",
140
- "path": "embedding",
141
- "numDimensions": self.settings.VECTOR_DIMENSION,
142
- "similarity": "cosine",
143
- "quantization": "scalar"
144
- }
145
- ]
146
- },
147
- name=f"{collection_name}_vector_index",
148
- type="vectorSearch"
149
- )
150
-
151
- # Create the search index using the motor collection
152
- result = await collection.create_search_index(search_index_model)
153
- logger.info("Vector search index '{}' created for collection {}.", result, collection_name)
154
-
155
- except Exception as e:
156
- if "command not found" in str(e).lower():
157
- logger.warning("Vector search not supported by this MongoDB instance. Some functionality may be limited.")
158
- # Create a fallback standard index on embedding field
159
- await collection.create_index("embedding")
160
- logger.info("Created standard index on 'embedding' field as fallback.")
161
- else:
162
- logger.error("Failed to create vector index: {}", e)
163
- raise
164
-
165
- async def close(self: Self) -> None:
166
- """Close MongoDB connection."""
167
  if self._client:
168
  self._client.close()
169
  logger.info("Closed MongoDB connection.")
@@ -193,6 +127,6 @@ class MongoDBResource(AsyncResource):
193
  async def shutdown(self: Self, mongo_db: MongoDB) -> None:
194
  """Close MongoDB connection on shutdown."""
195
  try:
196
- await mongo_db.close()
197
  except Exception as e:
198
  logger.error("Error closing MongoDB connection: {}", e)
 
 
1
  from dependency_injector.resources import AsyncResource
2
+ from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection
3
  from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
 
4
  from loguru import logger
5
  from pydantic import BaseModel, PrivateAttr
6
+ from typing import Any, Dict, Self
7
 
8
  from ctp_slack_bot.core.config import Settings
9
  from ctp_slack_bot.utils import sanitize_mongo_db_uri
10
 
11
+
12
  class MongoDB(BaseModel):
13
  """
14
  MongoDB connection manager using Motor for async operations.
 
18
  _db: PrivateAttr = PrivateAttr()
19
 
20
  class Config:
21
+ frozen=True
22
  arbitrary_types_allowed = True
23
 
24
  def __init__(self: Self, **data: Dict[str, Any]) -> None:
 
31
  connection_string = self.settings.MONGODB_URI.get_secret_value()
32
  logger.debug("Connecting to MongoDB using URI: {}", sanitize_mongo_db_uri(connection_string))
33
 
34
+ # Create client with appropriate settings.
35
  self._client = AsyncIOMotorClient(
36
  connection_string,
37
  serverSelectionTimeoutMS=5000,
 
42
  w="majority"
43
  )
44
 
45
+ # Get the database name.
46
  db_name = self.settings.MONGODB_NAME
47
 
48
  self._db = self._client[db_name]
 
54
  self._db = None
55
  raise
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  async def ping(self: Self) -> bool:
58
  """Check if MongoDB connection is alive."""
59
  try:
60
+ await self._client.admin.command("ping")
 
 
 
 
61
  logger.debug("MongoDB connection is active!")
62
  return True
63
  except (ConnectionFailure, ServerSelectionTimeoutError) as e:
64
  logger.error("MongoDB connection failed: {}", e)
 
65
  except Exception as e:
66
  logger.error("Unexpected error during MongoDB ping: {}", e)
67
+ return False
68
 
69
+ async def get_collection(self: Self, name: str) -> AsyncIOMotorCollection:
70
  """
71
+ Get a collection by name or creates it if it doesn’t exist.
 
72
  """
73
+ # First ensure we can connect at all.
74
  if not await self.ping():
75
  logger.error("Cannot get collection '{}' because a MongoDB connection is not available.", name)
76
  raise ConnectionError("MongoDB connection is not available.")
77
 
78
  try:
79
+ # Get all collection names to check if this one exists.
80
  logger.debug("Checking if collection '{}' exists…", name)
81
+ collection_names = await self._db.list_collection_names()
82
 
83
  if name not in collection_names:
84
  logger.info("Collection '{}' does not exist. Creating it…", name)
85
+
86
+ # Create the collection.
87
+ await self._db.create_collection(name)
88
  logger.debug("Successfully created collection: {}", name)
89
  else:
90
  logger.debug("Collection '{}' already exists!", name)
91
 
92
+ # Get and return the collection.
93
+ collection = self._db[name]
94
  return collection
95
  except Exception as e:
96
  logger.error("Error accessing collection '{}': {}", name, e)
97
  raise
98
 
99
+ def close(self: Self) -> None:
100
+ """Close the MongoDB connection."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  if self._client:
102
  self._client.close()
103
  logger.info("Closed MongoDB connection.")
 
127
  async def shutdown(self: Self, mongo_db: MongoDB) -> None:
128
  """Close MongoDB connection on shutdown."""
129
  try:
130
+ mongo_db.close()
131
  except Exception as e:
132
  logger.error("Error closing MongoDB connection: {}", e)
src/ctp_slack_bot/db/repositories/__init__.py CHANGED
@@ -1,2 +1,3 @@
1
  from ctp_slack_bot.db.repositories.mongo_db_vectorized_chunk_repository import MongoVectorizedChunkRepository
2
  from ctp_slack_bot.db.repositories.vectorized_chunk_repository import VectorizedChunkRepository
 
 
1
  from ctp_slack_bot.db.repositories.mongo_db_vectorized_chunk_repository import MongoVectorizedChunkRepository
2
  from ctp_slack_bot.db.repositories.vectorized_chunk_repository import VectorizedChunkRepository
3
+ from ctp_slack_bot.db.repositories.vector_repository_base import VectorRepositoryBase
src/ctp_slack_bot/db/repositories/mongo_db_vectorized_chunk_repository.py CHANGED
@@ -1,65 +1,155 @@
1
- from typing import List, Optional, Dict, Any
2
- import pymongo
3
- from bson import ObjectId
 
4
 
5
- from ctp_slack_bot.db import MongoDB
 
 
6
  from ctp_slack_bot.db.repositories.vectorized_chunk_repository import VectorizedChunkRepository
7
- from ctp_slack_bot.models.base import VectorizedChunk
8
-
9
- class MongoVectorizedChunkRepository(VectorizedChunkRepository):
10
- """MongoDB implementation of VectorizedChunkRepository."""
11
-
12
- def __init__(self, mongo_db: MongoDB):
13
- self.mongo_db = mongo_db
14
- self.collection = self.mongo_db.db.get_collection("vectorized_chunks")
15
-
16
- # Create indexes for efficient queries
17
- self.collection.create_index("chunk_id")
18
- self.collection.create_index("parent_id")
19
-
20
- async def find_by_id(self, id: str) -> Optional[VectorizedChunk]:
21
- doc = await self.collection.find_one({"_id": ObjectId(id)})
22
- return self._map_to_entity(doc) if doc else None
23
-
24
- async def find_all(self) -> List[VectorizedChunk]:
25
- cursor = self.collection.find({})
26
- return [self._map_to_entity(doc) async for doc in cursor]
27
-
28
- async def find_by_parent_id(self, parent_id: str) -> List[VectorizedChunk]:
 
 
 
 
29
  cursor = self.collection.find({"parent_id": parent_id})
30
- return [self._map_to_entity(doc) async for doc in cursor]
31
-
32
- async def save(self, chunk: VectorizedChunk) -> VectorizedChunk:
33
- doc = self._map_to_document(chunk)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- if "_id" in doc and doc["_id"]:
36
- # Update existing document
37
- await self.collection.replace_one({"_id": doc["_id"]}, doc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  else:
39
- # Insert new document
40
- result = await self.collection.insert_one(doc)
41
- doc["_id"] = result.inserted_id
42
-
43
- return self._map_to_entity(doc)
44
-
45
- async def delete(self, id: str) -> bool:
46
- result = await self.collection.delete_one({"_id": ObjectId(id)})
47
- return result.deleted_count > 0
48
-
49
- async def find_by_metadata(self, metadata_query: Dict[str, Any]) -> List[VectorizedChunk]:
50
- # Convert the metadata query to MongoDB query format
51
- query = {f"metadata.{k}": v for k, v in metadata_query.items()}
52
- cursor = self.collection.find(query)
53
- return [self._map_to_entity(doc) async for doc in cursor]
54
-
55
- def _map_to_document(self, chunk: VectorizedChunk) -> Dict[str, Any]:
56
- """Convert a VectorizedChunk to a MongoDB document."""
57
- doc = chunk.model_dump()
58
- # Handle any special conversions needed
59
- return doc
60
-
61
- def _map_to_entity(self, doc: Dict[str, Any]) -> VectorizedChunk:
62
- """Convert a MongoDB document to a VectorizedChunk."""
63
- if "_id" in doc:
64
- doc["id"] = str(doc.pop("_id"))
65
- return VectorizedChunk(**doc)
 
1
+ from dependency_injector.resources import AsyncResource
2
+ from loguru import logger
3
+ from pymongo import ReturnDocument
4
+ from typing import Any, Collection, Dict, Iterable, Mapping, Optional, Self, Sequence, Set
5
 
6
+ from ctp_slack_bot.core import Settings
7
+ from ctp_slack_bot.models import Chunk, VectorizedChunk, VectorQuery
8
+ from ctp_slack_bot.db.mongo_db import MongoDB
9
  from ctp_slack_bot.db.repositories.vectorized_chunk_repository import VectorizedChunkRepository
10
+ from ctp_slack_bot.db.repositories.vector_repository_base import VectorRepositoryBase
11
+
12
+
13
class MongoVectorizedChunkRepository(VectorRepositoryBase, VectorizedChunkRepository):
    """MongoDB implementation of VectorizedChunkRepository.

    Each stored document is the `model_dump()` of a `VectorizedChunk`; chunks
    are addressed by the pair (`parent_id`, `chunk_id`).
    """

    def __init__(self: Self, **data: Any) -> None:
        super().__init__(**data)
        logger.debug("Created {}", self.__class__.__name__)

    async def count_by_id(self: Self, parent_id: str, chunk_id: Optional[str] = None) -> int:
        """Count the chunks of `parent_id`, narrowed to `chunk_id` when one is given."""
        criteria = {"parent_id": parent_id}
        if chunk_id is not None:
            criteria["chunk_id"] = chunk_id
        return await self.collection.count_documents(criteria)

    async def find_all(self: Self) -> Collection[VectorizedChunk]:
        """Return every chunk in the collection."""
        cursor = self.collection.find()
        return [VectorizedChunk(**document) async for document in cursor] # TODO: mutable until async support is extended to tuples

    async def find_by_metadata(self: Self, metadata_query: Mapping[str, Any]) -> Collection[VectorizedChunk]:
        """Return the chunks whose `metadata` sub-fields equal every key/value pair in `metadata_query`."""
        query = {f"metadata.{key}": value for key, value in metadata_query.items()}
        cursor = self.collection.find(query)
        return [VectorizedChunk(**document) async for document in cursor] # TODO: mutable until async support is extended to tuples

    async def find_by_parent_id(self: Self, parent_id: str) -> Collection[VectorizedChunk]:
        """Return all chunks that belong to `parent_id`."""
        cursor = self.collection.find({"parent_id": parent_id})
        return [VectorizedChunk(**document) async for document in cursor] # TODO: mutable until async support is extended to tuples

    async def find_by_parent_and_chunk_ids(self: Self, parent_id: str, chunk_id: str) -> Optional[VectorizedChunk]:
        """Return the single chunk identified by (`parent_id`, `chunk_id`), or None when there is no match."""
        document = await self.collection.find_one({"parent_id": parent_id, "chunk_id": chunk_id})
        return VectorizedChunk(**document) if document else None

    # NOTE: a second `find_by_vector(query_embedding, k, score_threshold)` used to be
    # defined before this one. Python keeps only the last definition of a name in a
    # class body, so it was never callable; it has been removed.
    async def find_by_vector(self: Self, query: VectorQuery) -> Sequence[Chunk]:
        """
        Query the vector database for similar documents.

        Args:
            query: VectorQuery object with search parameters

        Returns:
            Sequence[Chunk]: Tuple of similar chunks; each chunk's metadata
            carries an extra "similarity_score" entry. Embeddings are not returned.
        """
        # Build aggregation pipeline for vector search using official MongoDB format.
        pipeline = [
            {
                "$vectorSearch": {
                    "index": f"{self.collection.name}_vector_index",
                    "path": "embedding",
                    "queryVector": query.query_embeddings,
                    "numCandidates": query.k * 10,
                    "limit": query.k
                }
            },
            {
                "$project": {
                    "text": 1,
                    "metadata": 1,
                    "parent_id": 1,
                    "chunk_id": 1,
                    "score": { "$meta": "vectorSearchScore" }
                }
            },
            {
                "$match": {
                    "score": { "$gte": query.score_threshold }
                }
            }
        ]
        if query.filter_metadata: # Add metadata filters if provided.
            metadata_filter = {f"metadata.{key}": value for key, value in query.filter_metadata.items()}
            # Placed right after $vectorSearch, which must stay the first stage.
            pipeline.insert(1, {"$match": metadata_filter})

        # Execute the vector search pipeline.
        results = await self.collection.aggregate(pipeline).to_list(length=query.k)

        # Convert results to Chunk objects ― don’t care about the embeddings.
        return tuple(Chunk(text=result["text"],
                           parent_id=result["parent_id"],
                           chunk_id=result["chunk_id"],
                           metadata={**result["metadata"], "similarity_score": result.get("score", 0)})
                     for result
                     in results)

    async def insert_one(self: Self, chunk: VectorizedChunk) -> str:
        """Insert `chunk` and return the new document's identifier as a string."""
        document = chunk.model_dump()
        result = await self.collection.insert_one(document)
        return str(result.inserted_id)

    async def insert_many(self: Self, chunks: Iterable[VectorizedChunk]) -> Set[str]:
        """Insert `chunks` and return the new documents' identifiers as strings.

        An empty `chunks` is a no-op that returns an empty set.
        """
        documents = [chunk.model_dump() for chunk in chunks]
        if not documents: # The driver rejects an empty batch, so there is nothing to send.
            return frozenset()
        result = await self.collection.insert_many(documents)
        return frozenset(map(str, result.inserted_ids))

    async def replace_all(self: Self, chunks: Iterable[VectorizedChunk]) -> Set[str]:
        """Replace all stored chunks of every parent present in `chunks` with `chunks`.

        The delete and the insert run inside one transaction. An empty `chunks`
        is a no-op that returns an empty set.
        """
        parent_ids = set()
        documents = []
        for chunk in chunks:
            parent_ids.add(chunk.parent_id)
            documents.append(chunk.model_dump())
        if not documents: # No parents to clear and nothing to insert.
            return frozenset()
        async with await self.collection.database.client.start_session() as session:
            async with session.start_transaction():
                await self.collection.delete_many({"parent_id": {"$in": tuple(parent_ids)}}, session=session)
                insert_result = await self.collection.insert_many(documents, session=session)
                return frozenset(map(str, insert_result.inserted_ids))

    async def replace_one(self: Self, chunk: VectorizedChunk) -> str:
        """Upsert `chunk` by (`parent_id`, `chunk_id`) and return the stored document's identifier as a string."""
        result = await self.collection.find_one_and_replace(
            {"parent_id": chunk.parent_id, "chunk_id": chunk.chunk_id},
            chunk.model_dump(),
            upsert=True,
            return_document=ReturnDocument.AFTER
        )
        # Stringified to honour the declared return type and to match insert_one.
        return str(result["_id"])

    async def delete(self: Self, parent_id: str, chunk_id: Optional[str] = None) -> int:
        """Delete one chunk when `chunk_id` is given, otherwise every chunk of `parent_id`; return the number deleted."""
        if chunk_id is not None:
            result = await self.collection.delete_one({"parent_id": parent_id, "chunk_id": chunk_id})
        else:
            result = await self.collection.delete_many({"parent_id": parent_id})
        return result.deleted_count
148
+
149
+
150
class MongoVectorizedChunkRepositoryResource(AsyncResource):
    """Async resource that builds a `MongoVectorizedChunkRepository` and creates its indexes."""

    async def init(self: Self, settings: Settings, mongo_db: MongoDB) -> MongoVectorizedChunkRepository:
        """
        Build the repository over the "vectorized_chunks" collection.

        Args:
            settings: Application settings handed to the repository.
            mongo_db: MongoDB wrapper used to obtain the collection.

        Returns:
            The repository, after `create_indexes()` has completed.
        """
        repository = MongoVectorizedChunkRepository(
            settings=settings,
            collection=await mongo_db.get_collection("vectorized_chunks"),
        )
        await repository.create_indexes()
        return repository
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ctp_slack_bot/db/repositories/vector_repository_base.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC
2
+ from loguru import logger
3
+ from motor.motor_asyncio import AsyncIOMotorCollection
4
+ from pydantic import BaseModel
5
+ from pymongo.operations import SearchIndexModel
6
+ from typing import Self
7
+
8
+ from ctp_slack_bot.core import Settings
9
+
10
class VectorRepositoryBase(ABC, BaseModel):
    """Base for repositories that keep embeddings in a MongoDB collection.

    Holds the application settings and the collection handle, and knows how to
    create the collection's vector search index.
    """

    settings: Settings
    collection: AsyncIOMotorCollection

    class Config:
        frozen=True
        arbitrary_types_allowed = True

    async def create_indexes(self: Self) -> None:
        """
        Create the vector search index named "<collection name>_vector_index".

        Nothing is created when a search index with that name already exists.
        If the failure message contains "command not found", a standard index
        on the "embedding" field is created instead; any other failure is
        logged and re-raised.
        """
        index_name = f"{self.collection.name}_vector_index"
        try:
            search_index_names = [index["name"] async for index in self.collection.list_search_indexes()]
            logger.debug("{} existing indices were found: {}", len(search_index_names), search_index_names)
            if index_name in search_index_names:
                logger.debug("Index '{}' already exists; duplicate index will not be created.", index_name)
                return

            # Describe the single vector field to be indexed.
            embedding_field = {
                "type": "vector",
                "path": "embedding",
                "numDimensions": self.settings.VECTOR_DIMENSION,
                "similarity": "cosine",
                "quantization": "scalar"
            }
            search_index_model = SearchIndexModel(
                definition={"fields": [embedding_field]},
                name=index_name,
                type="vectorSearch"
            )
            created_name = await self.collection.create_search_index(search_index_model)
            logger.info("Vector search index '{}' created for collection {}.", created_name, self.collection.name)
        except Exception as error:
            if "command not found" not in str(error).lower():
                logger.error("Failed to create vector index: {}", error)
                raise
            logger.warning("Vector search not supported by this MongoDB instance. Some functionality may be limited.")
            # Create a fallback standard index on embedding field.
            await self.collection.create_index("embedding")
            logger.info("Created standard index on 'embedding' field as fallback.")
src/ctp_slack_bot/db/repositories/vectorized_chunk_repository.py CHANGED
@@ -1,30 +1,52 @@
1
- from typing import List, Optional, Dict, Any
 
 
2
 
3
- from ctp_slack_bot.models.base import VectorizedChunk
4
 
5
- class VectorizedChunkRepository:
6
  """Repository interface for VectorizedChunk entities."""
7
 
8
- async def find_by_id(self, id: str) -> Optional[VectorizedChunk]:
9
- """Find a chunk by its ID."""
10
  pass
11
 
12
- async def find_all(self) -> List[VectorizedChunk]:
13
- """Find all chunks."""
14
  pass
15
 
16
- async def find_by_parent_id(self, parent_id: str) -> List[VectorizedChunk]:
17
- """Find chunks by parent document ID."""
18
  pass
19
 
20
- async def save(self, chunk: VectorizedChunk) -> VectorizedChunk:
21
- """Save a chunk to the database."""
22
  pass
23
 
24
- async def delete(self, id: str) -> bool:
25
- """Delete a chunk by its ID."""
26
  pass
27
 
28
- async def find_by_metadata(self, metadata_query: Dict[str, Any]) -> List[VectorizedChunk]:
29
- """Find chunks by metadata criteria."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  pass
 
1
+ from abc import ABC, abstractmethod
2
+ from pydantic import BaseModel
3
+ from typing import Any, Collection, Iterable, Mapping, Optional, Self, Sequence, Set
4
 
5
+ from ctp_slack_bot.models import Chunk, VectorizedChunk, VectorQuery
6
 
7
class VectorizedChunkRepository(ABC, BaseModel):
    """Repository interface for VectorizedChunk entities."""

    @abstractmethod
    async def count_by_id(self: Self, parent_id: str, chunk_id: Optional[str] = None) -> int:
        """Count chunks of `parent_id`, optionally narrowed to a single `chunk_id`."""

    @abstractmethod
    async def find_all(self: Self) -> Collection[VectorizedChunk]:
        """Return every stored chunk."""

    @abstractmethod
    async def find_by_metadata(self: Self, metadata_query: Mapping[str, Any]) -> Collection[VectorizedChunk]:
        """Return the chunks selected by `metadata_query`."""

    @abstractmethod
    async def find_by_parent_id(self: Self, parent_id: str) -> Collection[VectorizedChunk]:
        """Return the chunks that belong to `parent_id`."""

    @abstractmethod
    async def find_by_parent_and_chunk_ids(self: Self, parent_id: str, chunk_id: str) -> Optional[VectorizedChunk]:
        """Return the chunk identified by (`parent_id`, `chunk_id`), if any."""

    @abstractmethod
    async def find_by_vector(self: Self, query: VectorQuery) -> Sequence[Chunk]:
        """Return the chunks selected by the vector search described in `query`."""

    @abstractmethod
    async def insert_one(self: Self, chunk: VectorizedChunk) -> str:
        """Store `chunk`; the returned string presumably identifies the stored record."""

    @abstractmethod
    async def insert_many(self: Self, chunks: Iterable[VectorizedChunk]) -> Set[str]:
        """Store `chunks`; the returned strings presumably identify the stored records."""

    @abstractmethod
    async def replace_all(self: Self, chunks: Iterable[VectorizedChunk]) -> Set[str]:
        """Replace stored chunks with `chunks`; the returned strings presumably identify the stored records."""

    @abstractmethod
    async def replace_one(self: Self, chunk: VectorizedChunk) -> str:
        """Replace the stored counterpart of `chunk`; the returned string presumably identifies the stored record."""

    @abstractmethod
    async def delete(self: Self, parent_id: str, chunk_id: Optional[str] = None) -> int:
        """Delete chunks of `parent_id`, optionally only `chunk_id`; the returned int is presumably the number deleted."""
src/ctp_slack_bot/models/base.py CHANGED
@@ -1,6 +1,8 @@
1
  from abc import ABC, abstractmethod
2
- from pydantic import BaseModel, ConfigDict, Field
3
- from typing import Any, final, Mapping, Self, Sequence, Optional
 
 
4
 
5
 
6
  class Chunk(BaseModel):
@@ -9,10 +11,15 @@ class Chunk(BaseModel):
9
  text: str # The text representation
10
  parent_id: str # The source content’s identity
11
  chunk_id: str # This chunk’s identity—unique within the source content
12
- metadata: Mapping[str, Any]
13
 
14
  model_config = ConfigDict(frozen=True)
15
 
 
 
 
 
 
16
 
17
  @final
18
  class VectorQuery(BaseModel):
@@ -25,19 +32,24 @@ class VectorQuery(BaseModel):
25
  filter_metadata: Optional filters for metadata fields
26
  """
27
 
28
- query_embeddings: Sequence[float]
29
  k: int
30
  score_threshold: float = Field(default=0.7)
31
- filter_metadata: Optional[Mapping[str, Any]] = None
32
 
33
  model_config = ConfigDict(frozen=True)
34
 
 
 
 
 
 
35
 
36
  @final
37
  class VectorizedChunk(Chunk):
38
  """A class representing a vectorized chunk of content."""
39
 
40
- embedding: Sequence[float] # The vector representation
41
 
42
 
43
  class Content(ABC, BaseModel):
@@ -50,7 +62,7 @@ class Content(ABC, BaseModel):
50
  pass
51
 
52
  @abstractmethod
53
- def get_chunks(self: Self) -> Sequence[Chunk]:
54
  pass
55
 
56
  @abstractmethod
 
1
  from abc import ABC, abstractmethod
2
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
3
+ from typing import Any, final, Mapping, Optional, Self
4
+
5
+ from ctp_slack_bot.utils import to_deep_immutable
6
 
7
 
8
  class Chunk(BaseModel):
 
11
  text: str # The text representation
12
  parent_id: str # The source content’s identity
13
  chunk_id: str # This chunk’s identity—unique within the source content
14
+ metadata: Mapping[str, Any] = Field(default_factory=dict)
15
 
16
  model_config = ConfigDict(frozen=True)
17
 
18
+ @field_validator('metadata')
19
+ @classmethod
20
+ def __make_metadata_readonly(cls, value: Mapping[str, Any]) -> Mapping[str, Any]:
21
+ return to_deep_immutable(value)
22
+
23
 
24
  @final
25
  class VectorQuery(BaseModel):
 
32
  filter_metadata: Optional filters for metadata fields
33
  """
34
 
35
+ query_embeddings: tuple[float, ...]
36
  k: int
37
  score_threshold: float = Field(default=0.7)
38
+ filter_metadata: Mapping[str, Any] = Field(default_factory=dict)
39
 
40
  model_config = ConfigDict(frozen=True)
41
 
42
+ @field_validator('filter_metadata')
43
+ @classmethod
44
+ def __make_metadata_readonly(cls, value: Mapping[str, Any]) -> Mapping[str, Any]:
45
+ return to_deep_immutable(value)
46
+
47
 
48
  @final
49
  class VectorizedChunk(Chunk):
50
  """A class representing a vectorized chunk of content."""
51
 
52
+ embedding: tuple[float, ...] # The vector representation
53
 
54
 
55
  class Content(ABC, BaseModel):
 
62
  pass
63
 
64
  @abstractmethod
65
+ def get_chunks(self: Self) -> tuple[Chunk, ...]:
66
  pass
67
 
68
  @abstractmethod
src/ctp_slack_bot/models/slack.py CHANGED
@@ -2,7 +2,7 @@ from datetime import datetime
2
  from json import dumps
3
  from pydantic import BaseModel, ConfigDict, PositiveInt, PrivateAttr
4
  from types import MappingProxyType
5
- from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
6
 
7
  from ctp_slack_bot.models.base import Chunk, Content
8
 
@@ -23,7 +23,7 @@ class SlackEvent(BaseModel):
23
  type: str
24
  event_id: str
25
  event_time: int
26
- authed_users: Sequence[str]
27
 
28
  model_config = ConfigDict(frozen=True)
29
 
@@ -40,7 +40,7 @@ class SlackReaction(BaseModel):
40
 
41
  name: str
42
  count: PositiveInt
43
- users: Sequence[str]
44
 
45
  model_config = ConfigDict(frozen=True)
46
 
@@ -61,14 +61,14 @@ class SlackMessage(Content):
61
  deleted_ts: Optional[str] = None
62
  hidden: bool = False
63
  is_starred: Optional[bool] = None
64
- pinned_to: Optional[Sequence[str]] = None
65
- reactions: Optional[Sequence[SlackReaction]] = None
66
 
67
  def get_id(self: Self) -> str:
68
  """Unique identifier for this message."""
69
  return f"slack-message:{self.channel}:{self.ts}"
70
 
71
- def get_chunks(self: Self) -> Sequence[Chunk]:
72
  return (Chunk(text=self.text, parent_id=self.get_id(), chunk_id="", metadata=self.get_metadata()), )
73
 
74
  def get_metadata(self: Self) -> Mapping[str, Any]:
 
2
  from json import dumps
3
  from pydantic import BaseModel, ConfigDict, PositiveInt, PrivateAttr
4
  from types import MappingProxyType
5
+ from typing import Any, Literal, Mapping, Optional, Self
6
 
7
  from ctp_slack_bot.models.base import Chunk, Content
8
 
 
23
  type: str
24
  event_id: str
25
  event_time: int
26
+ authed_users: tuple[str, ...]
27
 
28
  model_config = ConfigDict(frozen=True)
29
 
 
40
 
41
  name: str
42
  count: PositiveInt
43
+ users: tuple[str, ...]
44
 
45
  model_config = ConfigDict(frozen=True)
46
 
 
61
  deleted_ts: Optional[str] = None
62
  hidden: bool = False
63
  is_starred: Optional[bool] = None
64
+ pinned_to: Optional[tuple[str, ...]] = None
65
+ reactions: Optional[tuple[SlackReaction, ...]] = None
66
 
67
  def get_id(self: Self) -> str:
68
  """Unique identifier for this message."""
69
  return f"slack-message:{self.channel}:{self.ts}"
70
 
71
+ def get_chunks(self: Self) -> tuple[Chunk]:
72
  return (Chunk(text=self.text, parent_id=self.get_id(), chunk_id="", metadata=self.get_metadata()), )
73
 
74
  def get_metadata(self: Self) -> Mapping[str, Any]:
src/ctp_slack_bot/models/webvtt.py CHANGED
@@ -1,15 +1,13 @@
1
  from datetime import datetime, timedelta
2
  from io import BytesIO
3
- from itertools import starmap
4
- from json import dumps
5
  from more_itertools import windowed
6
- from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
7
  from re import compile as compile_re
8
- from types import MappingProxyType
9
- from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
10
  from webvtt import Caption, WebVTT
11
 
12
  from ctp_slack_bot.models.base import Chunk, Content
 
13
 
14
 
15
  CHUNK_FRAMES_OVERLAP = 1
@@ -47,12 +45,12 @@ class WebVTTContent(Content):
47
  id: str
48
  metadata: Mapping[str, Any] = Field(default_factory=dict)
49
  start_time: Optional[datetime]
50
- frames: Sequence[WebVTTFrame]
51
 
52
  def get_id(self: Self) -> str:
53
  return self.id
54
 
55
- def get_chunks(self: Self) -> Sequence[Chunk]:
56
  windows = (tuple(filter(None, window))
57
  for window
58
  in windowed(self.frames, CHUNK_FRAMES_WINDOW, step=CHUNK_FRAMES_WINDOW-CHUNK_FRAMES_OVERLAP))
@@ -64,13 +62,13 @@ class WebVTTContent(Content):
64
  metadata={
65
  "start": self.start_time + frames[0].start if self.start_time else None,
66
  "end": self.start_time + frames[-1].end if self.start_time else None,
67
- "speakers": tuple(frame.speaker for frame in frames if frame.speaker)
68
  })
69
  for frames
70
  in windows)
71
 
72
  def get_metadata(self: Self) -> Mapping[str, Any]:
73
- return MappingProxyType(self.metadata)
74
 
75
  @classmethod
76
  def __get_start_time(cls, web_vtt: WebVTT) -> Optional[datetime]:
@@ -88,4 +86,9 @@ class WebVTTContent(Content):
88
  frames = tuple(WebVTTFrame.from_webvtt_caption(caption, index)
89
  for index, caption
90
  in enumerate(web_vtt.captions, 1))
91
- return WebVTTContent(id=id, metadata=MappingProxyType(metadata), start_time=cls.__get_start_time(web_vtt), frames=frames)
 
 
 
 
 
 
1
  from datetime import datetime, timedelta
2
  from io import BytesIO
 
 
3
  from more_itertools import windowed
4
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
5
  from re import compile as compile_re
6
+ from typing import Any, Literal, Mapping, Optional, Self
 
7
  from webvtt import Caption, WebVTT
8
 
9
  from ctp_slack_bot.models.base import Chunk, Content
10
+ from ctp_slack_bot.utils import to_deep_immutable
11
 
12
 
13
  CHUNK_FRAMES_OVERLAP = 1
 
45
  id: str
46
  metadata: Mapping[str, Any] = Field(default_factory=dict)
47
  start_time: Optional[datetime]
48
+ frames: tuple[WebVTTFrame, ...]
49
 
50
  def get_id(self: Self) -> str:
51
  return self.id
52
 
53
+ def get_chunks(self: Self) -> tuple[Chunk]:
54
  windows = (tuple(filter(None, window))
55
  for window
56
  in windowed(self.frames, CHUNK_FRAMES_WINDOW, step=CHUNK_FRAMES_WINDOW-CHUNK_FRAMES_OVERLAP))
 
62
  metadata={
63
  "start": self.start_time + frames[0].start if self.start_time else None,
64
  "end": self.start_time + frames[-1].end if self.start_time else None,
65
+ "speakers": (frame.speaker for frame in frames if frame.speaker)
66
  })
67
  for frames
68
  in windows)
69
 
70
  def get_metadata(self: Self) -> Mapping[str, Any]:
71
+ return self.metadata
72
 
73
  @classmethod
74
  def __get_start_time(cls, web_vtt: WebVTT) -> Optional[datetime]:
 
86
  frames = tuple(WebVTTFrame.from_webvtt_caption(caption, index)
87
  for index, caption
88
  in enumerate(web_vtt.captions, 1))
89
+ return WebVTTContent(id=id, metadata=metadata, start_time=cls.__get_start_time(web_vtt), frames=frames)
90
+
91
+ @field_validator('metadata')
92
+ @classmethod
93
+ def __make_metadata_readonly(cls, value: Mapping[str, Any]) -> Mapping[str, Any]:
94
+ return to_deep_immutable(value)
src/ctp_slack_bot/services/content_ingestion_service.py CHANGED
@@ -30,9 +30,9 @@ class ContentIngestionService(BaseModel):
30
 
31
  async def process_incoming_content(self: Self, content: Content) -> None:
32
  logger.debug("Content ingestion service received content with metadata: {}", content.get_metadata())
33
- # if self.vector_database_service.has_content(content.get_id()) # TODO
34
- # logger.debug("Ignored content with ID {} because it already exists in the database.", content.get_id())
35
- # return
36
  chunks = content.get_chunks()
37
  await self.__vectorize_and_store_chunks_in_database(chunks)
38
  logger.debug("Stored {} vectorized chunk(s) in the database.", len(chunks))
@@ -44,6 +44,5 @@ class ContentIngestionService(BaseModel):
44
  logger.debug("Stored {} vectorized chunk(s) in the database.", len(chunks))
45
 
46
  async def __vectorize_and_store_chunks_in_database(self: Self, chunks: Sequence[Chunk]) -> None:
47
- vectorized_chunks = self.vectorization_service.vectorize(chunks) # TODO
48
- await self.vector_database_service.store(vectorized_chunks) # TODO
49
-
 
30
 
31
  async def process_incoming_content(self: Self, content: Content) -> None:
32
  logger.debug("Content ingestion service received content with metadata: {}", content.get_metadata())
33
+ if self.vector_database_service.content_exists(content.get_id()):
34
+ logger.debug("Ignored content with identifier, {}, because it already exists in the database.", content.get_id())
35
+ return
36
  chunks = content.get_chunks()
37
  await self.__vectorize_and_store_chunks_in_database(chunks)
38
  logger.debug("Stored {} vectorized chunk(s) in the database.", len(chunks))
 
44
  logger.debug("Stored {} vectorized chunk(s) in the database.", len(chunks))
45
 
46
  async def __vectorize_and_store_chunks_in_database(self: Self, chunks: Sequence[Chunk]) -> None:
47
+ vectorized_chunks = self.vectorization_service.vectorize(chunks)
48
+ await self.vector_database_service.store(vectorized_chunks)
 
src/ctp_slack_bot/services/context_retrieval_service.py CHANGED
@@ -34,33 +34,23 @@ class ContextRetrievalService(BaseModel):
34
  Returns:
35
  Sequence[Chunk]: List of retrieved context items with similarity scores
36
  """
37
- # Extract chunks from the message
38
- message_chunks = message.get_chunks()
39
-
40
- # Vectorize the chunks
41
- vectorized_chunks = self.vectorization_service.vectorize(message_chunks)
42
-
43
- # Create vector query using the first chunk's embedding (typically there's only one chunk for a message)
44
- if not vectorized_chunks:
45
- logger.warning("No vectorized chunks were created for message")
46
- return []
47
-
48
  query = VectorQuery(
49
- query_embeddings=vectorized_chunks[0].embedding,
50
  k=self.settings.TOP_K_MATCHES,
51
  score_threshold=self.settings.SCORE_THRESHOLD,
52
- filter_metadata=None # Can be expanded to include filters based on message metadata
53
  )
54
-
55
- # Perform similarity search
56
  try:
57
- results = await self.vector_database_service.search_by_similarity(query)
58
- # logger.info(f"Retrieved {len(results)} context chunks for query")
59
  return results
60
  except Exception as e:
61
- logger.error(f"Error retrieving context: {str(e)}")
62
- return []
63
-
64
- # test return statement
65
- # return (VectorizedChunk(text="Mock context chunk", parent_id="lol", chunk_id="no", metadata={}, embedding=tuple()),
66
- # VectorizedChunk(text="Moar mock context chunk", parent_id="lol", chunk_id="wut", metadata={}, embedding=tuple()))
 
34
  Returns:
35
  Sequence[Chunk]: List of retrieved context items with similarity scores
36
  """
37
+ message_chunks = message.get_chunks() # Guaranteed to have exactly 1 chunk
38
+
39
+ try:
40
+ vectorized_message_chunks = self.vectorization_service.vectorize(message_chunks)
41
+ except Exception as e:
42
+ logger.error("An error occurred while vectorizing the question, “{}”: {}", message.text, e)
43
+
 
 
 
 
44
  query = VectorQuery(
45
+ query_embeddings=vectorized_message_chunks[0].embedding,
46
  k=self.settings.TOP_K_MATCHES,
47
  score_threshold=self.settings.SCORE_THRESHOLD,
48
+ filter_metadata={} # Can be expanded to include filters based on message metadata
49
  )
50
+
 
51
  try:
52
+ results = await self.vector_database_service.find_by_vector(query)
 
53
  return results
54
  except Exception as e:
55
+ logger.error("An error occurred while searching the vector database for context: {}", e)
56
+ return ()
 
 
 
 
src/ctp_slack_bot/services/vector_database_service.py CHANGED
@@ -1,17 +1,18 @@
1
  from loguru import logger
2
  from pydantic import BaseModel
3
- from typing import Any, Collection, Dict, List, Optional, Self, Sequence
4
 
5
  from ctp_slack_bot.core import Settings
6
- from ctp_slack_bot.db import MongoDB
7
  from ctp_slack_bot.models import Chunk, VectorizedChunk, VectorQuery
8
 
9
  class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
10
  """
11
  Service for storing and retrieving vector embeddings from MongoDB.
12
  """
 
13
  settings: Settings
14
- mongo_db: MongoDB
15
 
16
  class Config:
17
  frozen=True
@@ -19,157 +20,48 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
19
  def __init__(self: Self, **data) -> None:
20
  super().__init__(**data)
21
  logger.debug("Created {}", self.__class__.__name__)
22
-
23
- async def store(self: Self, chunks: Collection[VectorizedChunk]) -> None:
24
- """
25
- Stores vectorized chunks and their embedding vectors in the database.
26
-
27
- Args:
28
- chunks: Collection of VectorizedChunk objects to store
29
-
30
- Returns: None
31
- """
32
- if not chunks:
33
- logger.debug("No chunks to store")
34
- return
35
-
36
- try:
37
- # Get the vector collection - this will create it if it doesn't exist
38
- logger.debug("Getting vectors collection for storing {} chunks", len(chunks))
39
- vector_collection = await self.mongo_db.get_collection("vectors")
40
-
41
- # Ensure vector search index exists
42
- logger.debug("Creating vector search index for vectors collection")
43
- await self.mongo_db.create_indexes("vectors")
44
-
45
- # Create documents to store, ensuring compatibility with BSON
46
- documents = []
47
- for chunk in chunks:
48
- # Convert embedding to standard list format (important for BSON compatibility)
49
- embedding = list(chunk.embedding) if not isinstance(chunk.embedding, list) else chunk.embedding
50
-
51
- # Build document with proper structure
52
- document = {
53
- "text": chunk.text,
54
- "embedding": embedding,
55
- "metadata": chunk.metadata,
56
- "parent_id": chunk.parent_id,
57
- "chunk_id": chunk.chunk_id
58
- }
59
- documents.append(document)
60
-
61
- # Insert into collection as a batch
62
- logger.debug("Inserting {} documents into vectors collection", len(documents))
63
- result = await vector_collection.insert_many(documents)
64
- logger.info("Stored {} vector chunks in database", len(result.inserted_ids))
65
-
66
- except Exception as e:
67
- logger.error("Error storing vector embeddings: {}", str(e))
68
- # Include more diagnostic information
69
- logger.debug("MongoDB connection info: URI defined: {}, DB name: {}",
70
- bool(self.settings.MONGODB_URI), self.settings.MONGODB_NAME)
71
- raise
72
 
73
- async def content_exists(self: Self, key: str)-> bool: # TODO: implement this.
74
  """
75
- Check if content exists in the database.
76
 
77
  Args:
78
- key: The key to check for content existence
 
79
  """
80
- pass
 
81
 
82
- async def search_by_similarity(self: Self, query: VectorQuery) -> Sequence[Chunk]:
83
  """
84
- Query the vector database for similar documents.
85
-
86
  Args:
87
- query: VectorQuery object with search parameters
88
 
89
  Returns:
90
- Sequence[Chunk]: List of similar chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  """
92
  try:
93
- # Get the vector collection
94
- logger.debug("Getting vectors collection for similarity search")
95
- vector_collection = await self.mongo_db.get_collection("vectors")
96
-
97
- # Build aggregation pipeline for vector search using official MongoDB format
98
- logger.debug("Building vector search pipeline with query embedding dimension: {}", len(query.query_embeddings))
99
- pipeline = [
100
- {
101
- "$vectorSearch": {
102
- "index": "vectors_vector_index",
103
- "path": "embedding",
104
- "queryVector": query.query_embeddings, #list(query.query_embeddings),
105
- "numCandidates": query.k * 10,
106
- "limit": query.k
107
- }
108
- },
109
- {
110
- "$project": {
111
- "text": 1,
112
- "metadata": 1,
113
- "parent_id": 1,
114
- "chunk_id": 1,
115
- "score": { "$meta": "vectorSearchScore" }
116
- }
117
- }
118
- ]
119
-
120
- # Add metadata filters if provided
121
- if query.filter_metadata:
122
- metadata_filter = {f"metadata.{k}": v for k, v in query.filter_metadata.items()}
123
- pipeline.insert(1, {"$match": metadata_filter})
124
- logger.debug("Added metadata filters to search: {}", query.filter_metadata)
125
-
126
- # Add score threshold filter if needed
127
- if query.score_threshold > 0:
128
- pipeline.append({
129
- "$match": {
130
- "score": { "$gte": query.score_threshold }
131
- }
132
- })
133
- logger.debug("Added score threshold filter: {}", query.score_threshold)
134
-
135
- try:
136
- # Execute the vector search pipeline
137
- logger.debug("Executing vector search pipeline")
138
- results = await vector_collection.aggregate(pipeline).to_list(length=query.k)
139
- logger.debug("Vector search returned {} results", len(results))
140
- except Exception as e:
141
- logger.warning("Vector search failed: {}. Falling back to basic text search.", str(e))
142
- # Fall back to basic filtering with limit
143
- query_filter = {}
144
- if query.filter_metadata:
145
- query_filter.update({f"metadata.{k}": v for k, v in query.filter_metadata.items()})
146
-
147
- logger.debug("Executing fallback basic search with filter: {}", query_filter)
148
- results = await vector_collection.find(query_filter).limit(query.k).to_list(length=query.k)
149
- logger.debug("Fallback search returned {} results", len(results))
150
-
151
- # Convert results to Chunk objects
152
- chunks = []
153
- for result in results:
154
- chunk = Chunk(
155
- text=result["text"],
156
- parent_id=result["parent_id"],
157
- chunk_id=result["chunk_id"],
158
- metadata={
159
- **result["metadata"],
160
- "similarity_score": result.get("score", 0)
161
- }
162
- )
163
- chunks.append(chunk)
164
-
165
- logger.info("Found {} similar chunks with similarity search", len(chunks))
166
- return chunks
167
-
168
  except Exception as e:
169
- logger.error("Error in similarity search: {}", str(e))
170
- # Include additional diagnostic information
171
- logger.debug("MongoDB connection info: URI defined: {}, DB name: {}",
172
- bool(self.settings.MONGODB_URI), self.settings.MONGODB_NAME)
173
- logger.debug("Query details: k={}, dimension={}",
174
- query.k, len(query.query_embeddings) if query.query_embeddings else "None")
175
  raise
 
1
  from loguru import logger
2
  from pydantic import BaseModel
3
+ from typing import Iterable, Optional, Self, Sequence
4
 
5
  from ctp_slack_bot.core import Settings
6
+ from ctp_slack_bot.db.repositories import VectorizedChunkRepository
7
  from ctp_slack_bot.models import Chunk, VectorizedChunk, VectorQuery
8
 
9
  class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
10
  """
11
  Service for storing and retrieving vector embeddings from MongoDB.
12
  """
13
+
14
  settings: Settings
15
+ vectorized_chunk_repository: VectorizedChunkRepository
16
 
17
  class Config:
18
  frozen=True
 
20
  def __init__(self: Self, **data) -> None:
21
  super().__init__(**data)
22
  logger.debug("Created {}", self.__class__.__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ async def content_exists(self: Self, parent_id: str, chunk_id: Optional[str] = None)-> bool:
25
  """
26
+ Check if the content identified by the parent and optionally the chunk identifiers exist in the database.
27
 
28
  Args:
29
+ parent_id: the identifier of the source content
30
+ chunk_id: the identifier of the chunk within the source content
31
  """
32
+ matching_chunk_count = await self.vectorized_chunk_repository.count_by_id(parent_id, chunk_id)
33
+ return 0 < matching_chunk_count
34
 
35
+ async def find_by_vector(self: Self, query: VectorQuery) -> Sequence[Chunk]:
36
  """
37
+ Query the vector database for similar chunks.
38
+
39
  Args:
40
+ query: the query criteria
41
 
42
  Returns:
43
+ Sequence[Chunk]: an ordered collection of similar chunks
44
+ """
45
+ try:
46
+ result = await self.vectorized_chunk_repository.find_by_vector(query)
47
+ logger.debug("Found {} chunks in the database by similarity search.", len(result))
48
+ return result
49
+ except Exception as e:
50
+ logger.error("Error finding chunks by vector: {}", str(e))
51
+ raise
52
+
53
+ async def store(self: Self, chunks: Iterable[VectorizedChunk]) -> None:
54
+ """
55
+ Stores vectorized chunks and their embedding vectors in the database.
56
+
57
+ Args:
58
+ chunks: a collection of vectorized chunks to store
59
+
60
+ Returns: None
61
  """
62
  try:
63
+ inserted_ids = await self.vectorized_chunk_repository.insert_many(chunks)
64
+ logger.debug("Stored {} vectorized chunks in the database.", len(inserted_ids))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  except Exception as e:
66
+ logger.error("Error storing vectorized chunks: {}", str(e))
 
 
 
 
 
67
  raise
src/ctp_slack_bot/utils/__init__.py CHANGED
@@ -1 +1,2 @@
 
1
  from ctp_slack_bot.utils.secret_stripper import sanitize_mongo_db_uri
 
1
+ from ctp_slack_bot.utils.immutable import to_deep_immutable
2
  from ctp_slack_bot.utils.secret_stripper import sanitize_mongo_db_uri
src/ctp_slack_bot/utils/immutable.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from types import MappingProxyType
2
+ from collections.abc import Iterable, Mapping, Sequence, Set
3
+ from typing import Any
4
+
5
+
6
def to_deep_immutable(obj: Any) -> Any:
    """Recursively convert mutable containers to immutable equivalents.

    Conversions applied, in order:

    * ``str`` and ``bytes`` are returned unchanged (they are iterable, but
      already immutable leaves).
    * ``bytearray`` becomes ``bytes``.
    * Any ``Mapping`` becomes a ``MappingProxyType`` over a new ``dict`` whose
      keys and values have both been converted.
    * Any ``Set`` becomes a ``frozenset`` of converted items.
    * Any other iterable becomes a ``tuple`` of converted items. One-shot
      iterators (e.g. generators) are consumed in the process.
    * Everything else is returned as-is.

    Args:
        obj: The value to convert.

    Returns:
        The immutable counterpart of ``obj``, or ``obj`` itself when it is not
        a recognised container.
    """
    # Text and immutable byte strings are iterable; treat them as leaves so
    # they are not exploded into tuples of characters / ints.
    if isinstance(obj, (str, bytes)):
        return obj

    # The immutable equivalent of a bytearray is bytes, not a tuple of ints.
    if isinstance(obj, bytearray):
        return bytes(obj)

    # Handle mappings (dict-like).
    if isinstance(obj, Mapping):
        return MappingProxyType(
            {to_deep_immutable(key): to_deep_immutable(value) for key, value in obj.items()}
        )

    # Handle sets.
    if isinstance(obj, Set):
        return frozenset(to_deep_immutable(item) for item in obj)

    # Handle every remaining iterable (list/tuple-like, iterators, ...).
    # Sequence is a subclass of Iterable, so a single check covers both.
    if isinstance(obj, Iterable):
        return tuple(to_deep_immutable(item) for item in obj)

    # Return anything else as-is.
    return obj