Spaces:
Runtime error
Runtime error
Add Google Drive-to-MongoDB WebVTT vectorization pipeline notebook
Browse files
notebooks/google_drive.ipynb
CHANGED
@@ -9,7 +9,7 @@
|
|
9 |
},
|
10 |
{
|
11 |
"cell_type": "code",
|
12 |
-
"execution_count":
|
13 |
"metadata": {},
|
14 |
"outputs": [
|
15 |
{
|
@@ -28,8 +28,6 @@
|
|
28 |
"from textwrap import wrap\n",
|
29 |
"\n",
|
30 |
"from ctp_slack_bot.containers import Container\n",
|
31 |
-
"from ctp_slack_bot.models import GoogleDriveMetadata\n",
|
32 |
-
"from ctp_slack_bot.services import GoogleDriveService\n",
|
33 |
"\n",
|
34 |
"display_html = partial(display_html, raw=True)\n",
|
35 |
"\n",
|
|
|
9 |
},
|
10 |
{
|
11 |
"cell_type": "code",
|
12 |
+
"execution_count": null,
|
13 |
"metadata": {},
|
14 |
"outputs": [
|
15 |
{
|
|
|
28 |
"from textwrap import wrap\n",
|
29 |
"\n",
|
30 |
"from ctp_slack_bot.containers import Container\n",
|
|
|
|
|
31 |
"\n",
|
32 |
"display_html = partial(display_html, raw=True)\n",
|
33 |
"\n",
|
notebooks/google_drive_web_vtt_vectorizer_and_storer.ipynb
ADDED
@@ -0,0 +1,585 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"# Google Drive WebVTT Vectorizer and Storer"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": null,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [
|
15 |
+
{
|
16 |
+
"name": "stderr",
|
17 |
+
"output_type": "stream",
|
18 |
+
"text": [
|
19 |
+
"\u001b[32m2025-04-19 19:21:27.333\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n",
|
20 |
+
"\u001b[32m2025-04-19 19:21:27.334\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n",
|
21 |
+
"\u001b[32m2025-04-19 19:21:27.337\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.google_drive_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreated GoogleDriveService\u001b[0m\n",
|
22 |
+
"\u001b[32m2025-04-19 19:21:27.361\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m22\u001b[0m - \u001b[34m\u001b[1mCreated EmbeddingsModelService\u001b[0m\n",
|
23 |
+
"\u001b[32m2025-04-19 19:21:27.362\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vectorization_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m22\u001b[0m - \u001b[34m\u001b[1mCreated VectorizationService\u001b[0m\n"
|
24 |
+
]
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"name": "stderr",
|
28 |
+
"output_type": "stream",
|
29 |
+
"text": [
|
30 |
+
"\u001b[32m2025-04-19 19:21:27.364\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36minit\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1mInitializing MongoDB connection for database: ctp_slack_bot\u001b[0m\n",
|
31 |
+
"\u001b[32m2025-04-19 19:21:27.364\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m26\u001b[0m - \u001b[34m\u001b[1mCreated MongoDB\u001b[0m\n",
|
32 |
+
"\u001b[32m2025-04-19 19:21:27.364\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mconnect\u001b[0m:\u001b[36m32\u001b[0m - \u001b[34m\u001b[1mConnecting to MongoDB using URI: mongodb+srv://ctp-slack-bot.xkipuvm.mongodb.net/?retryWrites=true&w=majority&appName=ctp-slack-bot\u001b[0m\n",
|
33 |
+
"\u001b[32m2025-04-19 19:21:27.365\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mconnect\u001b[0m:\u001b[36m49\u001b[0m - \u001b[34m\u001b[1mMongoDB client initialized for database: ctp_slack_bot\u001b[0m\n",
|
34 |
+
"\u001b[32m2025-04-19 19:21:27.825\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
35 |
+
"\u001b[32m2025-04-19 19:21:27.825\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36m_test_connection\u001b[0m:\u001b[36m186\u001b[0m - \u001b[1mMongoDB connection test successful!\u001b[0m\n",
|
36 |
+
"\u001b[32m2025-04-19 19:21:27.825\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m21\u001b[0m - \u001b[34m\u001b[1mCreated VectorDatabaseService\u001b[0m\n"
|
37 |
+
]
|
38 |
+
}
|
39 |
+
],
|
40 |
+
"source": [
|
41 |
+
"from datetime import datetime\n",
|
42 |
+
"from functools import partial\n",
|
43 |
+
"from html import escape\n",
|
44 |
+
"from IPython.display import display_html\n",
|
45 |
+
"from itertools import chain\n",
|
46 |
+
"from textwrap import wrap\n",
|
47 |
+
"from zoneinfo import ZoneInfo\n",
|
48 |
+
"\n",
|
49 |
+
"from ctp_slack_bot.containers import Container\n",
|
50 |
+
"from ctp_slack_bot.models import WebVTTContent\n",
|
51 |
+
"\n",
|
52 |
+
"display_html = partial(display_html, raw=True)\n",
|
53 |
+
"\n",
|
54 |
+
"container = Container()\n",
|
55 |
+
"google_drive_service = container.google_drive_service()\n",
|
56 |
+
"vectorization_service = container.vectorization_service()\n",
|
57 |
+
"vector_database_service = container.vector_database_service()"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"cell_type": "markdown",
|
62 |
+
"metadata": {},
|
63 |
+
"source": [
|
64 |
+
"## Configuration\n",
|
65 |
+
"\n",
|
66 |
+
"⚠️ Configure the values below before running the rest of the notebook, to avoid processing the wrong file type or re-uploading files that were already uploaded."
|
67 |
+
]
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"cell_type": "code",
|
71 |
+
"execution_count": 2,
|
72 |
+
"metadata": {},
|
73 |
+
"outputs": [],
|
74 |
+
"source": [
|
75 |
+
"MIME_TYPE = \"text/vtt\" # This should probably not be changed.\n",
|
76 |
+
"\n",
|
77 |
+
"MODIFICATION_TIME_CUTOFF = datetime(2024, 8, 30, tzinfo=ZoneInfo(\"UTC\"))"
|
78 |
+
]
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"cell_type": "markdown",
|
82 |
+
"metadata": {},
|
83 |
+
"source": [
|
84 |
+
"## Upload"
|
85 |
+
]
|
86 |
+
},
|
87 |
+
{
|
88 |
+
"cell_type": "code",
|
89 |
+
"execution_count": 3,
|
90 |
+
"metadata": {},
|
91 |
+
"outputs": [
|
92 |
+
{
|
93 |
+
"data": {
|
94 |
+
"text/html": [
|
95 |
+
"<p>Found 7 files/folders.</p>"
|
96 |
+
]
|
97 |
+
},
|
98 |
+
"metadata": {},
|
99 |
+
"output_type": "display_data"
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"data": {
|
103 |
+
"text/html": [
|
104 |
+
"<ul><li>Week-03-Analytics-Friday-2024-09-13.cc.vtt</li><li>Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt</li><li>Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt</li><li>Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt</li><li>Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt</li><li>Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt</li><li>Week-01-Setup-Pandas-Friday-2024-08-30.vtt</li></ul>"
|
105 |
+
]
|
106 |
+
},
|
107 |
+
"metadata": {},
|
108 |
+
"output_type": "display_data"
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"data": {
|
112 |
+
"text/html": [
|
113 |
+
"<p>7 files/folders pass the modification time (<em>2024-08-30 00:00:00+00:00</em>) cut-off.</p>"
|
114 |
+
]
|
115 |
+
},
|
116 |
+
"metadata": {},
|
117 |
+
"output_type": "display_data"
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"data": {
|
121 |
+
"text/html": [
|
122 |
+
"<ul><li>Week-03-Analytics-Friday-2024-09-13.cc.vtt</li><li>Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt</li><li>Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt</li><li>Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt</li><li>Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt</li><li>Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt</li><li>Week-01-Setup-Pandas-Friday-2024-08-30.vtt</li></ul>"
|
123 |
+
]
|
124 |
+
},
|
125 |
+
"metadata": {},
|
126 |
+
"output_type": "display_data"
|
127 |
+
},
|
128 |
+
{
|
129 |
+
"data": {
|
130 |
+
"text/html": [
|
131 |
+
"<p>7 files/folders pass the modification time (<em>2024-08-30 00:00:00+00:00</em>) cut-off and MIME type (<em>text/vtt</em>) criterion.</p>"
|
132 |
+
]
|
133 |
+
},
|
134 |
+
"metadata": {},
|
135 |
+
"output_type": "display_data"
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"data": {
|
139 |
+
"text/html": [
|
140 |
+
"<ul><li>Week-03-Analytics-Friday-2024-09-13.cc.vtt</li><li>Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt</li><li>Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt</li><li>Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt</li><li>Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt</li><li>Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt</li><li>Week-01-Setup-Pandas-Friday-2024-08-30.vtt</li></ul>"
|
141 |
+
]
|
142 |
+
},
|
143 |
+
"metadata": {},
|
144 |
+
"output_type": "display_data"
|
145 |
+
}
|
146 |
+
],
|
147 |
+
"source": [
|
148 |
+
"item_metadata = google_drive_service.list_directory(\"\")\n",
|
149 |
+
"display_html(f\"<p>Found {len(item_metadata)} files/folders.</p>\")\n",
|
150 |
+
"display_html(\"\".join(chain(\"<ul>\", (f\"<li>{escape(metadata.name)}</li>\" for metadata in item_metadata), \"</ul>\")))\n",
|
151 |
+
"\n",
|
152 |
+
"recent_metadata = tuple(filter(lambda metadata: MODIFICATION_TIME_CUTOFF <= metadata.modified_time, item_metadata))\n",
|
153 |
+
"display_html(f\"<p>{len(recent_metadata)} files/folders pass the modification time (<em>{MODIFICATION_TIME_CUTOFF}</em>) cut-off.</p>\")  # Count the filtered tuple, not the full listing.\n",
|
154 |
+
"display_html(\"\".join(chain(\"<ul>\", (f\"<li>{escape(metadata.name)}</li>\" for metadata in recent_metadata), \"</ul>\")))\n",
|
155 |
+
"\n",
|
156 |
+
"metadata_to_process = tuple(filter(lambda metadata: metadata.mime_type == MIME_TYPE, recent_metadata))\n",
|
157 |
+
"display_html(f\"<p>{len(metadata_to_process)} files/folders pass the modification time (<em>{MODIFICATION_TIME_CUTOFF}</em>) cut-off and MIME type (<em>{MIME_TYPE}</em>) criterion.</p>\")  # Count the fully filtered tuple, not the full listing.\n",
|
158 |
+
"display_html(\"\".join(chain(\"<ul>\", (f\"<li>{escape(metadata.name)}</li>\" for metadata in metadata_to_process), \"</ul>\")))"
|
159 |
+
]
|
160 |
+
},
|
161 |
+
{
|
162 |
+
"cell_type": "code",
|
163 |
+
"execution_count": 4,
|
164 |
+
"metadata": {},
|
165 |
+
"outputs": [
|
166 |
+
{
|
167 |
+
"data": {
|
168 |
+
"text/html": [
|
169 |
+
"Processed 7 files."
|
170 |
+
]
|
171 |
+
},
|
172 |
+
"metadata": {},
|
173 |
+
"output_type": "display_data"
|
174 |
+
}
|
175 |
+
],
|
176 |
+
"source": [
|
177 |
+
"web_vtts = tuple(WebVTTContent.from_bytes(f\"googledrive:{metadata.folder_path}/{metadata.name}\",\n",
|
178 |
+
" {\n",
|
179 |
+
" \"filename\": metadata.name,\n",
|
180 |
+
" \"mimeType\": metadata.mime_type,\n",
|
181 |
+
" \"modificationTime\": metadata.modified_time\n",
|
182 |
+
" },\n",
|
183 |
+
" google_drive_service.read_file_by_id(metadata.id))\n",
|
184 |
+
" for metadata\n",
|
185 |
+
" in metadata_to_process)\n",
|
186 |
+
"\n",
|
187 |
+
"display_html(f\"Processed {len(web_vtts)} files.\")"
|
188 |
+
]
|
189 |
+
},
|
190 |
+
{
|
191 |
+
"cell_type": "code",
|
192 |
+
"execution_count": 5,
|
193 |
+
"metadata": {},
|
194 |
+
"outputs": [
|
195 |
+
{
|
196 |
+
"data": {
|
197 |
+
"text/html": [
|
198 |
+
"Chunked Week-03-Analytics-Friday-2024-09-13.cc.vtt into 496 chunks."
|
199 |
+
]
|
200 |
+
},
|
201 |
+
"metadata": {},
|
202 |
+
"output_type": "display_data"
|
203 |
+
},
|
204 |
+
{
|
205 |
+
"name": "stderr",
|
206 |
+
"output_type": "stream",
|
207 |
+
"text": [
|
208 |
+
"\u001b[32m2025-04-19 19:21:37.826\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 496 text string(s)…\u001b[0m\n"
|
209 |
+
]
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"data": {
|
213 |
+
"text/html": [
|
214 |
+
"Vectorized Week-03-Analytics-Friday-2024-09-13.cc.vtt’s 496 chunks."
|
215 |
+
]
|
216 |
+
},
|
217 |
+
"metadata": {},
|
218 |
+
"output_type": "display_data"
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"name": "stderr",
|
222 |
+
"output_type": "stream",
|
223 |
+
"text": [
|
224 |
+
"\u001b[32m2025-04-19 19:21:42.297\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 496 chunks\u001b[0m\n",
|
225 |
+
"\u001b[32m2025-04-19 19:21:42.319\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
226 |
+
"\u001b[32m2025-04-19 19:21:42.320\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
227 |
+
"\u001b[32m2025-04-19 19:21:42.340\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
228 |
+
"\u001b[32m2025-04-19 19:21:42.341\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
|
229 |
+
"\u001b[32m2025-04-19 19:21:42.360\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
230 |
+
"\u001b[32m2025-04-19 19:21:42.360\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
231 |
+
"\u001b[32m2025-04-19 19:21:42.380\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
232 |
+
"\u001b[32m2025-04-19 19:21:42.500\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
|
233 |
+
"\u001b[32m2025-04-19 19:21:42.505\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 496 documents into vectors collection\u001b[0m\n",
|
234 |
+
"\u001b[32m2025-04-19 19:21:48.862\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 496 vector chunks in database\u001b[0m\n"
|
235 |
+
]
|
236 |
+
},
|
237 |
+
{
|
238 |
+
"data": {
|
239 |
+
"text/html": [
|
240 |
+
"Stored Week-03-Analytics-Friday-2024-09-13.cc.vtt’s 496 vectorized chunks to the database."
|
241 |
+
]
|
242 |
+
},
|
243 |
+
"metadata": {},
|
244 |
+
"output_type": "display_data"
|
245 |
+
},
|
246 |
+
{
|
247 |
+
"data": {
|
248 |
+
"text/html": [
|
249 |
+
"Chunked Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt into 321 chunks."
|
250 |
+
]
|
251 |
+
},
|
252 |
+
"metadata": {},
|
253 |
+
"output_type": "display_data"
|
254 |
+
},
|
255 |
+
{
|
256 |
+
"name": "stderr",
|
257 |
+
"output_type": "stream",
|
258 |
+
"text": [
|
259 |
+
"\u001b[32m2025-04-19 19:21:48.866\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 321 text string(s)…\u001b[0m\n"
|
260 |
+
]
|
261 |
+
},
|
262 |
+
{
|
263 |
+
"data": {
|
264 |
+
"text/html": [
|
265 |
+
"Vectorized Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt’s 321 chunks."
|
266 |
+
]
|
267 |
+
},
|
268 |
+
"metadata": {},
|
269 |
+
"output_type": "display_data"
|
270 |
+
},
|
271 |
+
{
|
272 |
+
"name": "stderr",
|
273 |
+
"output_type": "stream",
|
274 |
+
"text": [
|
275 |
+
"\u001b[32m2025-04-19 19:21:52.629\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 321 chunks\u001b[0m\n",
|
276 |
+
"\u001b[32m2025-04-19 19:21:52.652\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
277 |
+
"\u001b[32m2025-04-19 19:21:52.652\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
278 |
+
"\u001b[32m2025-04-19 19:21:52.671\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
279 |
+
"\u001b[32m2025-04-19 19:21:52.672\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
|
280 |
+
"\u001b[32m2025-04-19 19:21:52.691\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
281 |
+
"\u001b[32m2025-04-19 19:21:52.691\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
282 |
+
"\u001b[32m2025-04-19 19:21:52.712\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
283 |
+
"\u001b[32m2025-04-19 19:21:52.829\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
|
284 |
+
"\u001b[32m2025-04-19 19:21:52.831\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 321 documents into vectors collection\u001b[0m\n",
|
285 |
+
"\u001b[32m2025-04-19 19:21:58.227\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 321 vector chunks in database\u001b[0m\n"
|
286 |
+
]
|
287 |
+
},
|
288 |
+
{
|
289 |
+
"data": {
|
290 |
+
"text/html": [
|
291 |
+
"Stored Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt’s 321 vectorized chunks to the database."
|
292 |
+
]
|
293 |
+
},
|
294 |
+
"metadata": {},
|
295 |
+
"output_type": "display_data"
|
296 |
+
},
|
297 |
+
{
|
298 |
+
"data": {
|
299 |
+
"text/html": [
|
300 |
+
"Chunked Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt into 337 chunks."
|
301 |
+
]
|
302 |
+
},
|
303 |
+
"metadata": {},
|
304 |
+
"output_type": "display_data"
|
305 |
+
},
|
306 |
+
{
|
307 |
+
"name": "stderr",
|
308 |
+
"output_type": "stream",
|
309 |
+
"text": [
|
310 |
+
"\u001b[32m2025-04-19 19:21:58.231\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 337 text string(s)…\u001b[0m\n"
|
311 |
+
]
|
312 |
+
},
|
313 |
+
{
|
314 |
+
"data": {
|
315 |
+
"text/html": [
|
316 |
+
"Vectorized Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt’s 337 chunks."
|
317 |
+
]
|
318 |
+
},
|
319 |
+
"metadata": {},
|
320 |
+
"output_type": "display_data"
|
321 |
+
},
|
322 |
+
{
|
323 |
+
"name": "stderr",
|
324 |
+
"output_type": "stream",
|
325 |
+
"text": [
|
326 |
+
"\u001b[32m2025-04-19 19:22:02.126\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 337 chunks\u001b[0m\n",
|
327 |
+
"\u001b[32m2025-04-19 19:22:02.147\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
328 |
+
"\u001b[32m2025-04-19 19:22:02.147\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
329 |
+
"\u001b[32m2025-04-19 19:22:02.167\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
330 |
+
"\u001b[32m2025-04-19 19:22:02.167\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
|
331 |
+
"\u001b[32m2025-04-19 19:22:02.186\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
332 |
+
"\u001b[32m2025-04-19 19:22:02.187\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
333 |
+
"\u001b[32m2025-04-19 19:22:02.207\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
334 |
+
"\u001b[32m2025-04-19 19:22:02.352\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
|
335 |
+
"\u001b[32m2025-04-19 19:22:02.354\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 337 documents into vectors collection\u001b[0m\n",
|
336 |
+
"\u001b[32m2025-04-19 19:22:08.520\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 337 vector chunks in database\u001b[0m\n"
|
337 |
+
]
|
338 |
+
},
|
339 |
+
{
|
340 |
+
"data": {
|
341 |
+
"text/html": [
|
342 |
+
"Stored Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt’s 337 vectorized chunks to the database."
|
343 |
+
]
|
344 |
+
},
|
345 |
+
"metadata": {},
|
346 |
+
"output_type": "display_data"
|
347 |
+
},
|
348 |
+
{
|
349 |
+
"data": {
|
350 |
+
"text/html": [
|
351 |
+
"Chunked Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt into 341 chunks."
|
352 |
+
]
|
353 |
+
},
|
354 |
+
"metadata": {},
|
355 |
+
"output_type": "display_data"
|
356 |
+
},
|
357 |
+
{
|
358 |
+
"name": "stderr",
|
359 |
+
"output_type": "stream",
|
360 |
+
"text": [
|
361 |
+
"\u001b[32m2025-04-19 19:22:08.524\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 341 text string(s)…\u001b[0m\n"
|
362 |
+
]
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"data": {
|
366 |
+
"text/html": [
|
367 |
+
"Vectorized Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt’s 341 chunks."
|
368 |
+
]
|
369 |
+
},
|
370 |
+
"metadata": {},
|
371 |
+
"output_type": "display_data"
|
372 |
+
},
|
373 |
+
{
|
374 |
+
"name": "stderr",
|
375 |
+
"output_type": "stream",
|
376 |
+
"text": [
|
377 |
+
"\u001b[32m2025-04-19 19:22:12.675\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 341 chunks\u001b[0m\n",
|
378 |
+
"\u001b[32m2025-04-19 19:22:12.712\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
379 |
+
"\u001b[32m2025-04-19 19:22:12.712\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
380 |
+
"\u001b[32m2025-04-19 19:22:12.731\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
381 |
+
"\u001b[32m2025-04-19 19:22:12.731\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
|
382 |
+
"\u001b[32m2025-04-19 19:22:12.750\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
383 |
+
"\u001b[32m2025-04-19 19:22:12.751\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
384 |
+
"\u001b[32m2025-04-19 19:22:12.773\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
385 |
+
"\u001b[32m2025-04-19 19:22:12.924\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
|
386 |
+
"\u001b[32m2025-04-19 19:22:12.926\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 341 documents into vectors collection\u001b[0m\n",
|
387 |
+
"\u001b[32m2025-04-19 19:22:18.356\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 341 vector chunks in database\u001b[0m\n"
|
388 |
+
]
|
389 |
+
},
|
390 |
+
{
|
391 |
+
"data": {
|
392 |
+
"text/html": [
|
393 |
+
"Stored Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt’s 341 vectorized chunks to the database."
|
394 |
+
]
|
395 |
+
},
|
396 |
+
"metadata": {},
|
397 |
+
"output_type": "display_data"
|
398 |
+
},
|
399 |
+
{
|
400 |
+
"data": {
|
401 |
+
"text/html": [
|
402 |
+
"Chunked Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt into 378 chunks."
|
403 |
+
]
|
404 |
+
},
|
405 |
+
"metadata": {},
|
406 |
+
"output_type": "display_data"
|
407 |
+
},
|
408 |
+
{
|
409 |
+
"name": "stderr",
|
410 |
+
"output_type": "stream",
|
411 |
+
"text": [
|
412 |
+
"\u001b[32m2025-04-19 19:22:18.360\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 378 text string(s)…\u001b[0m\n"
|
413 |
+
]
|
414 |
+
},
|
415 |
+
{
|
416 |
+
"data": {
|
417 |
+
"text/html": [
|
418 |
+
"Vectorized Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt’s 378 chunks."
|
419 |
+
]
|
420 |
+
},
|
421 |
+
"metadata": {},
|
422 |
+
"output_type": "display_data"
|
423 |
+
},
|
424 |
+
{
|
425 |
+
"name": "stderr",
|
426 |
+
"output_type": "stream",
|
427 |
+
"text": [
|
428 |
+
"\u001b[32m2025-04-19 19:22:21.808\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 378 chunks\u001b[0m\n",
|
429 |
+
"\u001b[32m2025-04-19 19:22:21.841\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
430 |
+
"\u001b[32m2025-04-19 19:22:21.841\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
431 |
+
"\u001b[32m2025-04-19 19:22:21.873\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
432 |
+
"\u001b[32m2025-04-19 19:22:21.874\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
|
433 |
+
"\u001b[32m2025-04-19 19:22:21.894\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
434 |
+
"\u001b[32m2025-04-19 19:22:21.894\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
435 |
+
"\u001b[32m2025-04-19 19:22:21.914\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
436 |
+
"\u001b[32m2025-04-19 19:22:22.029\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
|
437 |
+
"\u001b[32m2025-04-19 19:22:22.035\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 378 documents into vectors collection\u001b[0m\n",
|
438 |
+
"\u001b[32m2025-04-19 19:22:28.108\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 378 vector chunks in database\u001b[0m\n"
|
439 |
+
]
|
440 |
+
},
|
441 |
+
{
|
442 |
+
"data": {
|
443 |
+
"text/html": [
|
444 |
+
"Stored Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt’s 378 vectorized chunks to the database."
|
445 |
+
]
|
446 |
+
},
|
447 |
+
"metadata": {},
|
448 |
+
"output_type": "display_data"
|
449 |
+
},
|
450 |
+
{
|
451 |
+
"data": {
|
452 |
+
"text/html": [
|
453 |
+
"Chunked Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt into 680 chunks."
|
454 |
+
]
|
455 |
+
},
|
456 |
+
"metadata": {},
|
457 |
+
"output_type": "display_data"
|
458 |
+
},
|
459 |
+
{
|
460 |
+
"name": "stderr",
|
461 |
+
"output_type": "stream",
|
462 |
+
"text": [
|
463 |
+
"\u001b[32m2025-04-19 19:22:28.113\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 680 text string(s)…\u001b[0m\n"
|
464 |
+
]
|
465 |
+
},
|
466 |
+
{
|
467 |
+
"data": {
|
468 |
+
"text/html": [
|
469 |
+
"Vectorized Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt’s 680 chunks."
|
470 |
+
]
|
471 |
+
},
|
472 |
+
"metadata": {},
|
473 |
+
"output_type": "display_data"
|
474 |
+
},
|
475 |
+
{
|
476 |
+
"name": "stderr",
|
477 |
+
"output_type": "stream",
|
478 |
+
"text": [
|
479 |
+
"\u001b[32m2025-04-19 19:22:34.652\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 680 chunks\u001b[0m\n",
|
480 |
+
"\u001b[32m2025-04-19 19:22:34.671\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
481 |
+
"\u001b[32m2025-04-19 19:22:34.671\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
482 |
+
"\u001b[32m2025-04-19 19:22:34.705\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
483 |
+
"\u001b[32m2025-04-19 19:22:34.705\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
|
484 |
+
"\u001b[32m2025-04-19 19:22:34.720\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
485 |
+
"\u001b[32m2025-04-19 19:22:34.720\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
486 |
+
"\u001b[32m2025-04-19 19:22:34.740\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
487 |
+
"\u001b[32m2025-04-19 19:22:34.859\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
|
488 |
+
"\u001b[32m2025-04-19 19:22:34.866\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 680 documents into vectors collection\u001b[0m\n",
|
489 |
+
"\u001b[32m2025-04-19 19:22:43.431\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 680 vector chunks in database\u001b[0m\n"
|
490 |
+
]
|
491 |
+
},
|
492 |
+
{
|
493 |
+
"data": {
|
494 |
+
"text/html": [
|
495 |
+
"Stored Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt’s 680 vectorized chunks to the database."
|
496 |
+
]
|
497 |
+
},
|
498 |
+
"metadata": {},
|
499 |
+
"output_type": "display_data"
|
500 |
+
},
|
501 |
+
{
|
502 |
+
"data": {
|
503 |
+
"text/html": [
|
504 |
+
"Chunked Week-01-Setup-Pandas-Friday-2024-08-30.vtt into 742 chunks."
|
505 |
+
]
|
506 |
+
},
|
507 |
+
"metadata": {},
|
508 |
+
"output_type": "display_data"
|
509 |
+
},
|
510 |
+
{
|
511 |
+
"name": "stderr",
|
512 |
+
"output_type": "stream",
|
513 |
+
"text": [
|
514 |
+
"\u001b[32m2025-04-19 19:22:43.438\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 742 text string(s)…\u001b[0m\n"
|
515 |
+
]
|
516 |
+
},
|
517 |
+
{
|
518 |
+
"data": {
|
519 |
+
"text/html": [
|
520 |
+
"Vectorized Week-01-Setup-Pandas-Friday-2024-08-30.vtt’s 742 chunks."
|
521 |
+
]
|
522 |
+
},
|
523 |
+
"metadata": {},
|
524 |
+
"output_type": "display_data"
|
525 |
+
},
|
526 |
+
{
|
527 |
+
"name": "stderr",
|
528 |
+
"output_type": "stream",
|
529 |
+
"text": [
|
530 |
+
"\u001b[32m2025-04-19 19:22:50.402\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 742 chunks\u001b[0m\n",
|
531 |
+
"\u001b[32m2025-04-19 19:22:50.426\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
532 |
+
"\u001b[32m2025-04-19 19:22:50.426\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
533 |
+
"\u001b[32m2025-04-19 19:22:50.452\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
534 |
+
"\u001b[32m2025-04-19 19:22:50.452\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
|
535 |
+
"\u001b[32m2025-04-19 19:22:50.475\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
536 |
+
"\u001b[32m2025-04-19 19:22:50.475\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
537 |
+
"\u001b[32m2025-04-19 19:22:50.508\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
538 |
+
"\u001b[32m2025-04-19 19:22:50.617\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
|
539 |
+
"\u001b[32m2025-04-19 19:22:50.626\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 742 documents into vectors collection\u001b[0m\n",
|
540 |
+
"\u001b[32m2025-04-19 19:23:01.166\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 742 vector chunks in database\u001b[0m\n"
|
541 |
+
]
|
542 |
+
},
|
543 |
+
{
|
544 |
+
"data": {
|
545 |
+
"text/html": [
|
546 |
+
"Stored Week-01-Setup-Pandas-Friday-2024-08-30.vtt’s 742 vectorized chunks to the database."
|
547 |
+
]
|
548 |
+
},
|
549 |
+
"metadata": {},
|
550 |
+
"output_type": "display_data"
|
551 |
+
}
|
552 |
+
],
|
553 |
+
"source": [
|
554 |
+
"for web_vtt in web_vtts:\n",
|
555 |
+
" chunks = web_vtt.get_chunks()\n",
|
556 |
+
" display_html(f\"Chunked {web_vtt.get_metadata().get(\"filename\")} into {len(chunks)} chunks.\")\n",
|
557 |
+
" vectorized_chunks = vectorization_service.vectorize(chunks)\n",
|
558 |
+
" display_html(f\"Vectorized {web_vtt.get_metadata().get(\"filename\")}’s {len(vectorized_chunks)} chunks.\")\n",
|
559 |
+
" await (await vector_database_service).store(vectorized_chunks)\n",
|
560 |
+
" display_html(f\"Stored {web_vtt.get_metadata().get(\"filename\")}’s {len(vectorized_chunks)} vectorized chunks to the database.\")"
|
561 |
+
]
|
562 |
+
}
|
563 |
+
],
|
564 |
+
"metadata": {
|
565 |
+
"kernelspec": {
|
566 |
+
"display_name": ".venv",
|
567 |
+
"language": "python",
|
568 |
+
"name": "python3"
|
569 |
+
},
|
570 |
+
"language_info": {
|
571 |
+
"codemirror_mode": {
|
572 |
+
"name": "ipython",
|
573 |
+
"version": 3
|
574 |
+
},
|
575 |
+
"file_extension": ".py",
|
576 |
+
"mimetype": "text/x-python",
|
577 |
+
"name": "python",
|
578 |
+
"nbconvert_exporter": "python",
|
579 |
+
"pygments_lexer": "ipython3",
|
580 |
+
"version": "3.12.3"
|
581 |
+
}
|
582 |
+
},
|
583 |
+
"nbformat": 4,
|
584 |
+
"nbformat_minor": 2
|
585 |
+
}
|
src/ctp_slack_bot/models/webvtt.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from datetime import datetime, timedelta
|
2 |
from io import BytesIO
|
|
|
3 |
from json import dumps
|
4 |
from more_itertools import windowed
|
5 |
from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
|
@@ -25,8 +26,8 @@ class WebVTTFrame(BaseModel):
|
|
25 |
model_config = ConfigDict(frozen=True)
|
26 |
|
27 |
@classmethod
|
28 |
-
def from_webvtt_caption(cls: type["WebVTTFrame"], caption: Caption) -> Self:
|
29 |
-
identifier = caption.identifier
|
30 |
start = timedelta(**caption.start_time.__dict__)
|
31 |
end = timedelta(**caption.end_time.__dict__)
|
32 |
match caption.text.split(SPEAKER_SPEECH_TEXT_SEPARATOR, 1):
|
@@ -56,9 +57,9 @@ class WebVTTContent(Content):
|
|
56 |
parent_id=self.get_id(),
|
57 |
chunk_id=f"{frames[0].identifier}-{frames[-1].identifier}",
|
58 |
metadata={
|
59 |
-
"start": frames[0].start,
|
60 |
-
"end": frames[-1].end,
|
61 |
-
"speakers":
|
62 |
})
|
63 |
for frames
|
64 |
in windows)
|
@@ -68,5 +69,5 @@ class WebVTTContent(Content):
|
|
68 |
|
69 |
@classmethod
|
70 |
def from_bytes(cls: type["WebVTTContent"], id: str, metadata: Mapping[str, Any], buffer: bytes) -> Self:
|
71 |
-
frames = tuple(
|
72 |
return WebVTTContent(id=id, metadata=MappingProxyType(metadata), frames=frames)
|
|
|
1 |
from datetime import datetime, timedelta
|
2 |
from io import BytesIO
|
3 |
+
from itertools import starmap
|
4 |
from json import dumps
|
5 |
from more_itertools import windowed
|
6 |
from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
|
|
|
26 |
model_config = ConfigDict(frozen=True)
|
27 |
|
28 |
@classmethod
|
29 |
+
def from_webvtt_caption(cls: type["WebVTTFrame"], index: int, caption: Caption) -> Self:
|
30 |
+
identifier = caption.identifier if caption.identifier else str(index)
|
31 |
start = timedelta(**caption.start_time.__dict__)
|
32 |
end = timedelta(**caption.end_time.__dict__)
|
33 |
match caption.text.split(SPEAKER_SPEECH_TEXT_SEPARATOR, 1):
|
|
|
57 |
parent_id=self.get_id(),
|
58 |
chunk_id=f"{frames[0].identifier}-{frames[-1].identifier}",
|
59 |
metadata={
|
60 |
+
"start": str(frames[0].start), # TODO: This is a harder problem: to get the offsets to become real datetimes so that they can be queryable using MongoDB.
|
61 |
+
"end": str(frames[-1].end),
|
62 |
+
"speakers": [frame.speaker for frame in frames if frame.speaker]
|
63 |
})
|
64 |
for frames
|
65 |
in windows)
|
|
|
69 |
|
70 |
@classmethod
|
71 |
def from_bytes(cls: type["WebVTTContent"], id: str, metadata: Mapping[str, Any], buffer: bytes) -> Self:
|
72 |
+
frames = tuple(starmap(WebVTTFrame.from_webvtt_caption, enumerate(WebVTT.from_buffer(BytesIO(buffer)).captions, 1)))
|
73 |
return WebVTTContent(id=id, metadata=MappingProxyType(metadata), frames=frames)
|
src/ctp_slack_bot/services/vector_database_service.py
CHANGED
@@ -172,4 +172,4 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
|
|
172 |
bool(self.settings.MONGODB_URI), self.settings.MONGODB_NAME)
|
173 |
logger.debug("Query details: k={}, dimension={}",
|
174 |
query.k, len(query.query_embeddings) if query.query_embeddings else "None")
|
175 |
-
raise
|
|
|
172 |
bool(self.settings.MONGODB_URI), self.settings.MONGODB_NAME)
|
173 |
logger.debug("Query details: k={}, dimension={}",
|
174 |
query.k, len(query.query_embeddings) if query.query_embeddings else "None")
|
175 |
+
raise
|