Kevin Li commited on
Commit
c1b84d6
·
unverified ·
2 Parent(s): dfc575a fb92766

Merge pull request #5 from CUNYTechPrep/refactor-2

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .env.template +19 -19
  2. .github/workflows/main.yml +19 -0
  3. Dockerfile +5 -2
  4. README.MD → README.md +53 -41
  5. notebooks/container.ipynb +102 -0
  6. notebooks/google_drive.ipynb +0 -0
  7. notebooks/google_drive_web_vtt_vectorizer_and_storer.ipynb +585 -0
  8. notebooks/web_vtt.ipynb +355 -0
  9. pyproject.toml +15 -14
  10. scripts/run-dev.sh +1 -1
  11. src/ctp_slack_bot/__init__.py +0 -1
  12. src/ctp_slack_bot/api/__init__.py +0 -1
  13. src/ctp_slack_bot/api/main.py +0 -70
  14. src/ctp_slack_bot/api/routes.py +0 -67
  15. src/ctp_slack_bot/app.py +53 -0
  16. src/ctp_slack_bot/containers.py +21 -25
  17. src/ctp_slack_bot/core/__init__.py +0 -1
  18. src/ctp_slack_bot/core/config.py +37 -14
  19. src/ctp_slack_bot/core/logging.py +34 -30
  20. src/ctp_slack_bot/core/response_rendering.py +0 -13
  21. src/ctp_slack_bot/db/mongo_db.py +167 -94
  22. src/ctp_slack_bot/db/repositories/__init__.py +2 -0
  23. src/ctp_slack_bot/db/repositories/mongo_db_vectorized_chunk_repository.py +65 -0
  24. src/ctp_slack_bot/db/repositories/vectorized_chunk_repository.py +30 -0
  25. src/ctp_slack_bot/enums.py +6 -0
  26. src/ctp_slack_bot/models/__init__.py +4 -4
  27. src/ctp_slack_bot/models/base.py +44 -47
  28. src/ctp_slack_bot/models/content.py +0 -19
  29. src/ctp_slack_bot/models/google_drive.py +25 -0
  30. src/ctp_slack_bot/models/slack.py +80 -12
  31. src/ctp_slack_bot/models/vector_query.py +0 -16
  32. src/ctp_slack_bot/models/webvtt.py +73 -0
  33. src/ctp_slack_bot/services/GOOGLE_DRIVE_README.md +0 -228
  34. src/ctp_slack_bot/services/__init__.py +3 -0
  35. src/ctp_slack_bot/services/answer_retrieval_service.py +18 -49
  36. src/ctp_slack_bot/services/application_database_service.py +29 -0
  37. src/ctp_slack_bot/services/content_ingestion_service.py +33 -5
  38. src/ctp_slack_bot/services/context_retrieval_service.py +38 -44
  39. src/ctp_slack_bot/services/embeddings_model_service.py +47 -0
  40. src/ctp_slack_bot/services/event_brokerage_service.py +31 -22
  41. src/ctp_slack_bot/services/google_drive_access.py +0 -623
  42. src/ctp_slack_bot/services/google_drive_basic_usage.py +0 -178
  43. src/ctp_slack_bot/services/google_drive_service.py +142 -0
  44. src/ctp_slack_bot/services/language_model_service.py +55 -0
  45. src/ctp_slack_bot/services/question_dispatch_service.py +15 -10
  46. src/ctp_slack_bot/services/schedule_service.py +68 -0
  47. src/ctp_slack_bot/services/slack_service.py +55 -9
  48. src/ctp_slack_bot/services/vector_database_service.py +115 -86
  49. src/ctp_slack_bot/services/vectorization_service.py +19 -53
  50. src/ctp_slack_bot/tasks/__init__.py +0 -1
.env.template CHANGED
@@ -1,41 +1,41 @@
1
  # Copy this file and modify. Do not save or commit the secrets!
2
 
3
- # Application Configuration
4
- DEBUG=false
5
-
6
- # Logging Configuration
7
- LOG_LEVEL=INFO
8
- LOG_FORMAT=text
9
-
10
  # APScheduler Configuration
11
  SCHEDULER_TIMEZONE=UTC
12
 
13
- # API Configuration
14
- API_HOST=0.0.0.0
15
- API_PORT=8000
16
-
17
  # Slack Configuration
18
  SLACK_BOT_TOKEN=🪙
19
- SLACK_SIGNING_SECRET=🔏
20
  SLACK_APP_TOKEN=🦥
21
 
22
  # Vectorization Configuration
23
  EMBEDDING_MODEL=🌮
24
- VECTOR_DIMENSION=9001
25
- CHUNK_SIZE=42
26
- CHUNK_OVERLAP=37
27
- TOP_K_MATCHES=1
28
 
29
  # MongoDB Configuration
30
  MONGODB_URI=mongodb+srv://username:[email protected]/database?retryWrites=true&w=majority
31
  MONGODB_NAME=ctp_slack_bot
 
32
 
33
  # Hugging Face Configuration
34
  HF_API_TOKEN=🤗
35
 
36
  # OpenAI Configuration
37
  OPENAI_API_KEY=😐
38
- CHAT_MODEL=🙊
39
- MAX_TOKENS=42
40
- TEMPERATURE=0.5
41
  SYSTEM_PROMPT="You are a helpful teaching assistant for a data science class.\nBased on the student's question, you will be given context retrieved from class transcripts and materials to answer their question.\nYour responses should be:\n\n1. Accurate and based on the class content\n2. Clear and educational\n3. Concise but complete\nIf you're unsure about something, acknowledge it and suggest asking the professor."
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Copy this file and modify. Do not save or commit the secrets!
2
 
 
 
 
 
 
 
 
3
  # APScheduler Configuration
4
  SCHEDULER_TIMEZONE=UTC
5
 
 
 
 
 
6
  # Slack Configuration
7
  SLACK_BOT_TOKEN=🪙
 
8
  SLACK_APP_TOKEN=🦥
9
 
10
  # Vectorization Configuration
11
  EMBEDDING_MODEL=🌮
12
+ VECTOR_DIMENSION=1536
13
+ CHUNK_SIZE=1000
14
+ CHUNK_OVERLAP=200
15
+ TOP_K_MATCHES=5
16
 
17
  # MongoDB Configuration
18
  MONGODB_URI=mongodb+srv://username:[email protected]/database?retryWrites=true&w=majority
19
  MONGODB_NAME=ctp_slack_bot
20
+ SCORE_THRESHOLD=0.5
21
 
22
  # Hugging Face Configuration
23
  HF_API_TOKEN=🤗
24
 
25
  # OpenAI Configuration
26
  OPENAI_API_KEY=😐
27
+ CHAT_MODEL=gpt-3.5-turbo
28
+ MAX_TOKENS=150
29
+ TEMPERATURE=0.8
30
  SYSTEM_PROMPT="You are a helpful teaching assistant for a data science class.\nBased on the student's question, you will be given context retrieved from class transcripts and materials to answer their question.\nYour responses should be:\n\n1. Accurate and based on the class content\n2. Clear and educational\n3. Concise but complete\nIf you're unsure about something, acknowledge it and suggest asking the professor."
31
+
32
+ # Google Drive Configuration
33
+ GOOGLE_DRIVE_ROOT_ID=1NB91EcIUXbOVcdCkXOAHdmWrDfgoh9fQ
34
+ GOOGLE_PROJECT_ID=insufferable-slacker-123456
35
+ GOOGLE_PRIVATE_KEY_ID=1a2b3c4d5e6f748891091d21304e506674829507
36
+ GOOGLE_PRIVATE_KEY="-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASC...\n-----END PRIVATE KEY-----\n"
37
+ GOOGLE_CLIENT_EMAIL=botty-bot@insufferable-slacker-123456.iam.gserviceaccount.com
38
+ GOOGLE_CLIENT_ID=123456789012345678901
39
+
40
+ # File Monitoring Configuration
41
+ FILE_MONITOR_ROOT_PATH=Transcripts/Friday
.github/workflows/main.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+ # to run this workflow manually from the Actions tab
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ sync-to-hub:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v3
13
+ with:
14
+ fetch-depth: 0
15
+ lfs: true
16
+ - name: Push to hub
17
+ env:
18
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
19
+ run: git push https://KingZack:[email protected]/spaces/KingZack/ctp-slack-bot main
Dockerfile CHANGED
@@ -5,7 +5,7 @@ WORKDIR /app
5
  # Set environment variables.
6
  ENV PYTHONDONTWRITEBYTECODE=1 \
7
  PYTHONUNBUFFERED=1 \
8
- PYTHONPATH=/app
9
 
10
  # Install system dependencies.
11
  RUN apt-get update \
@@ -25,5 +25,8 @@ RUN pip install --no-cache-dir .
25
  RUN useradd -m appuser
26
  USER appuser
27
 
 
 
 
28
  # Run the application.
29
- CMD ["uvicorn", "src.ctp_slack_bot.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
 
5
  # Set environment variables.
6
  ENV PYTHONDONTWRITEBYTECODE=1 \
7
  PYTHONUNBUFFERED=1 \
8
+ PYTHONPATH=/app/src
9
 
10
  # Install system dependencies.
11
  RUN apt-get update \
 
25
  RUN useradd -m appuser
26
  USER appuser
27
 
28
+ # Expose a volume mount for logs ― Hugging Face Spaces requires specifically /data.
29
+ VOLUME /data
30
+
31
  # Run the application.
32
+ CMD ["python", "-m", "ctp_slack_bot.app"]
README.MD → README.md RENAMED
@@ -1,42 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
  # CTP Slack Bot
2
 
3
  ## _Modus Operandi_ in a Nutshell
4
 
5
- * Intelligently responds to Slack messages based on a repository of data.
6
  * Periodically checks for new content to add to its repository.
7
 
8
- ## Tech Stack
9
-
10
- * Hugging Face Spaces for hosting and serverless API
11
- * Google Drive for reference data (i.e., the material to be incorporated into the bot’s knowledge base)
12
- * MongoDB for data persistence
13
- * Docker for containerization
14
- * Python
15
- * FastAPI for serving HTTP requests
16
- * httpx for making HTTP requests
17
- * APScheduler for running periodic tasks in the background
18
- * See `pyproject.toml` for additional Python packages.
19
-
20
- ## General Project Structure
21
-
22
- * `src/`
23
- * `ctp_slack_bot/`
24
- * `api/`: FastAPI application structure
25
- * `routes.py`: API endpoint definitions
26
- * `core/`: fundamental components like configuration (using pydantic), logging setup (loguru), and custom exceptions
27
- * `db/`: database connection
28
- * `repositories/`: repository pattern implementation
29
- * `models/`: Pydantic models for data validation and serialization
30
- * `services/`: business logic
31
- * `tasks/`: background scheduled jobs
32
- * `utils/`: reusable utilities
33
- * `tests/`: unit tests
34
- * `scripts/`: utility scripts for development, deployment, etc.
35
- * `run-dev.sh`: script to run the application locally
36
- * `notebooks/`: Jupyter notebooks for exploration and model development
37
- * `.env`: local environment variables for development purposes (to be created for local use only from `.env.template`)
38
- * `Dockerfile`: Docker container build definition
39
-
40
  ## How to Run the Application
41
 
42
  ### Normally
@@ -52,7 +32,7 @@ docker build . -t ctp-slack-bot
52
  Run it with:
53
 
54
  ```sh
55
- docker run --env-file=.env -p 8000:8000 --name my-ctp-slack-bot-instance ctp-slack-bot
56
  ```
57
 
58
  ### For Development
@@ -73,13 +53,45 @@ If `localhost` port `8000` is free, running the following will make the applicat
73
  scripts/run-dev.sh
74
  ```
75
 
76
- You can check that it’s reachable by visiting [http://localhost:8000/health](http://localhost:8000/health).
77
 
78
- ```text
79
- $ curl http://localhost:8000/health
80
- {"status":"healthy"}
81
- ```
 
 
 
 
82
 
83
- In debug mode (`DEBUG=true`), [http://localhost:8000/api/v1/env](http://localhost:8000/api/v1/env) will pretty-print the non-sensitive environment variables as JSON.
84
 
85
- Uvicorn will restart the application automatically when any source files are changed.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: CTP Slack Bot
3
+ emoji: 🦥
4
+ colorFrom: red
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ short_description: Spring 2025 CTP Slack Bot RAG system
10
+ ---
11
+
12
+
13
  # CTP Slack Bot
14
 
15
  ## _Modus Operandi_ in a Nutshell
16
 
17
+ * Intelligently responds to Slack messages (when mentioned) based on a repository of data.
18
  * Periodically checks for new content to add to its repository.
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  ## How to Run the Application
21
 
22
  ### Normally
 
32
  Run it with:
33
 
34
  ```sh
35
+ docker run --volume ./logs:/app/logs/ --env-file=.env -p 8000:8000 --name my-ctp-slack-bot-instance ctp-slack-bot
36
  ```
37
 
38
  ### For Development
 
53
  scripts/run-dev.sh
54
  ```
55
 
56
+ ## Tech Stack
57
 
58
+ * Hugging Face Spaces for hosting
59
+ * OpenAI for embeddings and language models
60
+ * Google Drive for reference data (i.e., the material to be incorporated into the bot’s knowledge base)
61
+ * MongoDB for data persistence
62
+ * Docker for containerization
63
+ * Python
64
+ * Slack Bolt client for interfacing with Slack
65
+ * See `pyproject.toml` for additional Python packages.
66
 
67
+ ## General Project Structure
68
 
69
+ * `src/`
70
+ * `ctp_slack_bot/`
71
+ * `core/`: fundamental components like configuration (using pydantic), logging setup (loguru), and custom exceptions
72
+ * `db/`: database connection
73
+ * `repositories/`: repository pattern implementation
74
+ * `models/`: Pydantic models for data validation and serialization
75
+ * `services/`: business logic
76
+ * `answer_retrieval_service.py`: obtains an answer to a question from a language model using relevant context
77
+ * `content_ingestion_service.py`: converts content into chunks and stores them into the database
78
+ * `context_retrieval_service.py`: queries for relevant context from the database to answer a question
79
+ * `embeddings_model_service.py`: converts text to embeddings
80
+ * `event_brokerage_service.py`: brokers events between decoupled components
81
+ * `language_model_service.py`: answers questions using relevant context
82
+ * `question_dispatch_service.py`: listens for questions and retrieves relevant context to get answers
83
+ * `schedule_service.py`: runs background jobs
84
+ * `slack_service.py`: handles events from Slack and sends back responses
85
+ * `vector_database_service.py`: stores and queries chunks
86
+ * `vectorization_service.py`: converts chunks into chunks with embeddings
87
+ * `tasks/`: background scheduled jobs
88
+ * `utils/`: reusable utilities
89
+ * `app.py`: application entry point
90
+ * `containers.py`: the dependency injection container
91
+ * `tests/`: unit tests
92
+ * `scripts/`: utility scripts for development, deployment, etc.
93
+ * `run-dev.sh`: script to run the application locally
94
+ * `notebooks/`: Jupyter notebooks for exploration and model development
95
+ * `.env`: local environment variables for development purposes (to be created for local use only from `.env.template`)
96
+ * `Dockerfile`: Docker container build definition
97
+ * `pyproject.toml`: project definition and dependencies
notebooks/container.ipynb ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Loading Dependency Injection Container in Jupyter Notebook"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 4,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "from ctp_slack_bot.containers import Container\n",
17
+ "from ctp_slack_bot.services import VectorDatabaseService\n",
18
+ "\n",
19
+ "container = Container()\n",
20
+ "container.wire(packages=['ctp_slack_bot'])"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 2,
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "name": "stderr",
30
+ "output_type": "stream",
31
+ "text": [
32
+ "\u001b[32m2025-04-19 16:43:46.927\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n"
33
+ ]
34
+ },
35
+ {
36
+ "data": {
37
+ "text/plain": [
38
+ "Settings(LOG_LEVEL='INFO', LOG_FORMAT='json', SCHEDULER_TIMEZONE='America/New_York', SLACK_BOT_TOKEN=SecretStr('**********'), SLACK_APP_TOKEN=SecretStr('**********'), EMBEDDING_MODEL='text-embedding-3-small', VECTOR_DIMENSION=1536, CHUNK_SIZE=1000, CHUNK_OVERLAP=200, TOP_K_MATCHES=5, MONGODB_URI=SecretStr('**********'), MONGODB_NAME='ctp_slack_bot', SCORE_THRESHOLD=0.5, HF_API_TOKEN=SecretStr('**********'), OPENAI_API_KEY=SecretStr('**********'), CHAT_MODEL='gpt-3.5-turbo', MAX_TOKENS=150, TEMPERATURE=0.8, SYSTEM_PROMPT=\"You are a helpful teaching assistant for a data science class.\\nBased on the students question, you will be given context retreived from class transcripts and materials to answer their question.\\nYour responses should be:\\n\\n1. Accurate and based on the class content\\n2. Clear and educational\\n3. Concise but complete\\nIf you're unsure about something, acknowledge it and suggest asking the professor.\", GOOGLE_PROJECT_ID='voltaic-reducer-294821', GOOGLE_PRIVATE_KEY_ID=SecretStr('**********'), GOOGLE_PRIVATE_KEY=SecretStr('**********'), GOOGLE_CLIENT_ID='102943207835073856980', GOOGLE_CLIENT_EMAIL='[email protected]', GOOGLE_AUTH_URI='https://accounts.google.com/o/oauth2/auth', GOOGLE_TOKEN_URI='https://oauth2.googleapis.com/token', GOOGLE_AUTH_PROVIDER_CERT_URL='https://www.googleapis.com/oauth2/v1/certs', GOOGLE_CLIENT_CERT_URL='https://www.googleapis.com/robot/v1/metadata/x509/ctp-slack-bot-714%40voltaic-reducer-294821.iam.gserviceaccount.com', GOOGLE_UNIVERSE_DOMAIN='googleapis.com', FILE_MONITOR_ROOT_PATH='Transcripts/Friday Building AI Applications Session')"
39
+ ]
40
+ },
41
+ "execution_count": 2,
42
+ "metadata": {},
43
+ "output_type": "execute_result"
44
+ }
45
+ ],
46
+ "source": [
47
+ "container.settings()"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "metadata": {},
54
+ "outputs": [
55
+ {
56
+ "name": "stderr",
57
+ "output_type": "stream",
58
+ "text": [
59
+ "\u001b[32m2025-04-19 16:45:25.997\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n"
60
+ ]
61
+ },
62
+ {
63
+ "name": "stderr",
64
+ "output_type": "stream",
65
+ "text": [
66
+ "\u001b[32m2025-04-19 16:45:25.999\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36minit\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1mInitializing MongoDB connection for database: ctp_slack_bot\u001b[0m\n",
67
+ "\u001b[32m2025-04-19 16:45:25.999\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m26\u001b[0m - \u001b[34m\u001b[1mCreated MongoDB\u001b[0m\n",
68
+ "\u001b[32m2025-04-19 16:45:25.999\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mconnect\u001b[0m:\u001b[36m32\u001b[0m - \u001b[34m\u001b[1mConnecting to MongoDB using URI: mongodb+srv://ctp-slack-bot.xkipuvm.mongodb.net/?retryWrites=true&w=majority&appName=ctp-slack-bot\u001b[0m\n",
69
+ "\u001b[32m2025-04-19 16:45:26.000\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mconnect\u001b[0m:\u001b[36m49\u001b[0m - \u001b[34m\u001b[1mMongoDB client initialized for database: ctp_slack_bot\u001b[0m\n",
70
+ "\u001b[32m2025-04-19 16:45:26.279\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
71
+ "\u001b[32m2025-04-19 16:45:26.280\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36m_test_connection\u001b[0m:\u001b[36m186\u001b[0m - \u001b[1mMongoDB connection test successful!\u001b[0m\n",
72
+ "\u001b[32m2025-04-19 16:45:26.280\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m21\u001b[0m - \u001b[34m\u001b[1mCreated VectorDatabaseService\u001b[0m\n"
73
+ ]
74
+ }
75
+ ],
76
+ "source": [
77
+ "vector_database_service: VectorDatabaseService = container.vector_database_service()"
78
+ ]
79
+ }
80
+ ],
81
+ "metadata": {
82
+ "kernelspec": {
83
+ "display_name": ".venv",
84
+ "language": "python",
85
+ "name": "python3"
86
+ },
87
+ "language_info": {
88
+ "codemirror_mode": {
89
+ "name": "ipython",
90
+ "version": 3
91
+ },
92
+ "file_extension": ".py",
93
+ "mimetype": "text/x-python",
94
+ "name": "python",
95
+ "nbconvert_exporter": "python",
96
+ "pygments_lexer": "ipython3",
97
+ "version": "3.12.3"
98
+ }
99
+ },
100
+ "nbformat": 4,
101
+ "nbformat_minor": 2
102
+ }
notebooks/google_drive.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/google_drive_web_vtt_vectorizer_and_storer.ipynb ADDED
@@ -0,0 +1,585 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Google Drive WebVTT Vectorizer and Storer"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": null,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "name": "stderr",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "\u001b[32m2025-04-19 19:21:27.333\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n",
20
+ "\u001b[32m2025-04-19 19:21:27.334\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n",
21
+ "\u001b[32m2025-04-19 19:21:27.337\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.google_drive_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreated GoogleDriveService\u001b[0m\n",
22
+ "\u001b[32m2025-04-19 19:21:27.361\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m22\u001b[0m - \u001b[34m\u001b[1mCreated EmbeddingsModelService\u001b[0m\n",
23
+ "\u001b[32m2025-04-19 19:21:27.362\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vectorization_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m22\u001b[0m - \u001b[34m\u001b[1mCreated VectorizationService\u001b[0m\n"
24
+ ]
25
+ },
26
+ {
27
+ "name": "stderr",
28
+ "output_type": "stream",
29
+ "text": [
30
+ "\u001b[32m2025-04-19 19:21:27.364\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36minit\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1mInitializing MongoDB connection for database: ctp_slack_bot\u001b[0m\n",
31
+ "\u001b[32m2025-04-19 19:21:27.364\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m26\u001b[0m - \u001b[34m\u001b[1mCreated MongoDB\u001b[0m\n",
32
+ "\u001b[32m2025-04-19 19:21:27.364\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mconnect\u001b[0m:\u001b[36m32\u001b[0m - \u001b[34m\u001b[1mConnecting to MongoDB using URI: mongodb+srv://ctp-slack-bot.xkipuvm.mongodb.net/?retryWrites=true&w=majority&appName=ctp-slack-bot\u001b[0m\n",
33
+ "\u001b[32m2025-04-19 19:21:27.365\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mconnect\u001b[0m:\u001b[36m49\u001b[0m - \u001b[34m\u001b[1mMongoDB client initialized for database: ctp_slack_bot\u001b[0m\n",
34
+ "\u001b[32m2025-04-19 19:21:27.825\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
35
+ "\u001b[32m2025-04-19 19:21:27.825\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36m_test_connection\u001b[0m:\u001b[36m186\u001b[0m - \u001b[1mMongoDB connection test successful!\u001b[0m\n",
36
+ "\u001b[32m2025-04-19 19:21:27.825\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m21\u001b[0m - \u001b[34m\u001b[1mCreated VectorDatabaseService\u001b[0m\n"
37
+ ]
38
+ }
39
+ ],
40
+ "source": [
41
+ "from datetime import datetime\n",
42
+ "from functools import partial\n",
43
+ "from html import escape\n",
44
+ "from IPython.display import display_html\n",
45
+ "from itertools import chain\n",
46
+ "from textwrap import wrap\n",
47
+ "from zoneinfo import ZoneInfo\n",
48
+ "\n",
49
+ "from ctp_slack_bot.containers import Container\n",
50
+ "from ctp_slack_bot.models import WebVTTContent\n",
51
+ "\n",
52
+ "display_html = partial(display_html, raw=True)\n",
53
+ "\n",
54
+ "container = Container()\n",
55
+ "google_drive_service = container.google_drive_service()\n",
56
+ "vectorization_service = container.vectorization_service()\n",
57
+ "vector_database_service = container.vector_database_service()"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "markdown",
62
+ "metadata": {},
63
+ "source": [
64
+ "## Configuration\n",
65
+ "\n",
66
+ "⚠️ Configure before running the code to avoid processing the wrong file type or re-uploading past files which were already uploaded."
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 2,
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "MIME_TYPE = \"text/vtt\" # This should probably not be changed.\n",
76
+ "\n",
77
+ "MODIFICATION_TIME_CUTOFF = datetime(2024, 8, 30, tzinfo=ZoneInfo(\"UTC\"))"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "markdown",
82
+ "metadata": {},
83
+ "source": [
84
+ "## Upload"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 3,
90
+ "metadata": {},
91
+ "outputs": [
92
+ {
93
+ "data": {
94
+ "text/html": [
95
+ "<p>Found 7 files/folders.</p>"
96
+ ]
97
+ },
98
+ "metadata": {},
99
+ "output_type": "display_data"
100
+ },
101
+ {
102
+ "data": {
103
+ "text/html": [
104
+ "<ul><li>Week-03-Analytics-Friday-2024-09-13.cc.vtt</li><li>Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt</li><li>Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt</li><li>Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt</li><li>Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt</li><li>Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt</li><li>Week-01-Setup-Pandas-Friday-2024-08-30.vtt</li></ul>"
105
+ ]
106
+ },
107
+ "metadata": {},
108
+ "output_type": "display_data"
109
+ },
110
+ {
111
+ "data": {
112
+ "text/html": [
113
+ "<p>7 files/folders pass the modification time (<em>2024-08-30 00:00:00+00:00</em>) cut-off.</p>"
114
+ ]
115
+ },
116
+ "metadata": {},
117
+ "output_type": "display_data"
118
+ },
119
+ {
120
+ "data": {
121
+ "text/html": [
122
+ "<ul><li>Week-03-Analytics-Friday-2024-09-13.cc.vtt</li><li>Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt</li><li>Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt</li><li>Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt</li><li>Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt</li><li>Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt</li><li>Week-01-Setup-Pandas-Friday-2024-08-30.vtt</li></ul>"
123
+ ]
124
+ },
125
+ "metadata": {},
126
+ "output_type": "display_data"
127
+ },
128
+ {
129
+ "data": {
130
+ "text/html": [
131
+ "<p>7 files/folders pass the modification time (<em>2024-08-30 00:00:00+00:00</em>) cut-off and MIME type (<em>text/vtt</em>) criterion.</p>"
132
+ ]
133
+ },
134
+ "metadata": {},
135
+ "output_type": "display_data"
136
+ },
137
+ {
138
+ "data": {
139
+ "text/html": [
140
+ "<ul><li>Week-03-Analytics-Friday-2024-09-13.cc.vtt</li><li>Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt</li><li>Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt</li><li>Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt</li><li>Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt</li><li>Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt</li><li>Week-01-Setup-Pandas-Friday-2024-08-30.vtt</li></ul>"
141
+ ]
142
+ },
143
+ "metadata": {},
144
+ "output_type": "display_data"
145
+ }
146
+ ],
147
+ "source": [
148
+ "item_metadata = google_drive_service.list_directory(\"\")\n",
149
+ "display_html(f\"<p>Found {len(item_metadata)} files/folders.</p>\")\n",
150
+ "display_html(\"\".join(chain(\"<ul>\", (f\"<li>{escape(metadata.name)}</li>\" for metadata in item_metadata), \"</ul>\")))\n",
151
+ "\n",
152
+ "recent_metadata = tuple(filter(lambda metadata: MODIFICATION_TIME_CUTOFF <= metadata.modified_time, item_metadata))\n",
153
+ "display_html(f\"<p>{len(item_metadata)} files/folders pass the modification time (<em>{MODIFICATION_TIME_CUTOFF}</em>) cut-off.</p>\")\n",
154
+ "display_html(\"\".join(chain(\"<ul>\", (f\"<li>{escape(metadata.name)}</li>\" for metadata in recent_metadata), \"</ul>\")))\n",
155
+ "\n",
156
+ "metadata_to_process = tuple(filter(lambda metadata: metadata.mime_type == MIME_TYPE, recent_metadata))\n",
157
+ "display_html(f\"<p>{len(item_metadata)} files/folders pass the modification time (<em>{MODIFICATION_TIME_CUTOFF}</em>) cut-off and MIME type (<em>{MIME_TYPE}</em>) criterion.</p>\")\n",
158
+ "display_html(\"\".join(chain(\"<ul>\", (f\"<li>{escape(metadata.name)}</li>\" for metadata in metadata_to_process), \"</ul>\")))"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 4,
164
+ "metadata": {},
165
+ "outputs": [
166
+ {
167
+ "data": {
168
+ "text/html": [
169
+ "Processed 7 files."
170
+ ]
171
+ },
172
+ "metadata": {},
173
+ "output_type": "display_data"
174
+ }
175
+ ],
176
+ "source": [
177
+ "web_vtts = tuple(WebVTTContent.from_bytes(f\"googledrive:{metadata.folder_path}/{metadata.name}\",\n",
178
+ " {\n",
179
+ " \"filename\": metadata.name,\n",
180
+ " \"mimeType\": metadata.mime_type,\n",
181
+ " \"modificationTime\": metadata.modified_time\n",
182
+ " },\n",
183
+ " google_drive_service.read_file_by_id(metadata.id))\n",
184
+ " for metadata\n",
185
+ " in metadata_to_process)\n",
186
+ "\n",
187
+ "display_html(f\"Processed {len(web_vtts)} files.\")"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": 5,
193
+ "metadata": {},
194
+ "outputs": [
195
+ {
196
+ "data": {
197
+ "text/html": [
198
+ "Chunked Week-03-Analytics-Friday-2024-09-13.cc.vtt into 496 chunks."
199
+ ]
200
+ },
201
+ "metadata": {},
202
+ "output_type": "display_data"
203
+ },
204
+ {
205
+ "name": "stderr",
206
+ "output_type": "stream",
207
+ "text": [
208
+ "\u001b[32m2025-04-19 19:21:37.826\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 496 text string(s)…\u001b[0m\n"
209
+ ]
210
+ },
211
+ {
212
+ "data": {
213
+ "text/html": [
214
+ "Vectorized Week-03-Analytics-Friday-2024-09-13.cc.vtt’s 496 chunks."
215
+ ]
216
+ },
217
+ "metadata": {},
218
+ "output_type": "display_data"
219
+ },
220
+ {
221
+ "name": "stderr",
222
+ "output_type": "stream",
223
+ "text": [
224
+ "\u001b[32m2025-04-19 19:21:42.297\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 496 chunks\u001b[0m\n",
225
+ "\u001b[32m2025-04-19 19:21:42.319\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
226
+ "\u001b[32m2025-04-19 19:21:42.320\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
227
+ "\u001b[32m2025-04-19 19:21:42.340\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
228
+ "\u001b[32m2025-04-19 19:21:42.341\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
229
+ "\u001b[32m2025-04-19 19:21:42.360\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
230
+ "\u001b[32m2025-04-19 19:21:42.360\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
231
+ "\u001b[32m2025-04-19 19:21:42.380\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
232
+ "\u001b[32m2025-04-19 19:21:42.500\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
233
+ "\u001b[32m2025-04-19 19:21:42.505\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 496 documents into vectors collection\u001b[0m\n",
234
+ "\u001b[32m2025-04-19 19:21:48.862\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 496 vector chunks in database\u001b[0m\n"
235
+ ]
236
+ },
237
+ {
238
+ "data": {
239
+ "text/html": [
240
+ "Stored Week-03-Analytics-Friday-2024-09-13.cc.vtt’s 496 vectorized chunks to the database."
241
+ ]
242
+ },
243
+ "metadata": {},
244
+ "output_type": "display_data"
245
+ },
246
+ {
247
+ "data": {
248
+ "text/html": [
249
+ "Chunked Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt into 321 chunks."
250
+ ]
251
+ },
252
+ "metadata": {},
253
+ "output_type": "display_data"
254
+ },
255
+ {
256
+ "name": "stderr",
257
+ "output_type": "stream",
258
+ "text": [
259
+ "\u001b[32m2025-04-19 19:21:48.866\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 321 text string(s)…\u001b[0m\n"
260
+ ]
261
+ },
262
+ {
263
+ "data": {
264
+ "text/html": [
265
+ "Vectorized Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt’s 321 chunks."
266
+ ]
267
+ },
268
+ "metadata": {},
269
+ "output_type": "display_data"
270
+ },
271
+ {
272
+ "name": "stderr",
273
+ "output_type": "stream",
274
+ "text": [
275
+ "\u001b[32m2025-04-19 19:21:52.629\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 321 chunks\u001b[0m\n",
276
+ "\u001b[32m2025-04-19 19:21:52.652\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
277
+ "\u001b[32m2025-04-19 19:21:52.652\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
278
+ "\u001b[32m2025-04-19 19:21:52.671\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
279
+ "\u001b[32m2025-04-19 19:21:52.672\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
280
+ "\u001b[32m2025-04-19 19:21:52.691\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
281
+ "\u001b[32m2025-04-19 19:21:52.691\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
282
+ "\u001b[32m2025-04-19 19:21:52.712\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
283
+ "\u001b[32m2025-04-19 19:21:52.829\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
284
+ "\u001b[32m2025-04-19 19:21:52.831\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 321 documents into vectors collection\u001b[0m\n",
285
+ "\u001b[32m2025-04-19 19:21:58.227\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 321 vector chunks in database\u001b[0m\n"
286
+ ]
287
+ },
288
+ {
289
+ "data": {
290
+ "text/html": [
291
+ "Stored Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt’s 321 vectorized chunks to the database."
292
+ ]
293
+ },
294
+ "metadata": {},
295
+ "output_type": "display_data"
296
+ },
297
+ {
298
+ "data": {
299
+ "text/html": [
300
+ "Chunked Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt into 337 chunks."
301
+ ]
302
+ },
303
+ "metadata": {},
304
+ "output_type": "display_data"
305
+ },
306
+ {
307
+ "name": "stderr",
308
+ "output_type": "stream",
309
+ "text": [
310
+ "\u001b[32m2025-04-19 19:21:58.231\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 337 text string(s)…\u001b[0m\n"
311
+ ]
312
+ },
313
+ {
314
+ "data": {
315
+ "text/html": [
316
+ "Vectorized Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt’s 337 chunks."
317
+ ]
318
+ },
319
+ "metadata": {},
320
+ "output_type": "display_data"
321
+ },
322
+ {
323
+ "name": "stderr",
324
+ "output_type": "stream",
325
+ "text": [
326
+ "\u001b[32m2025-04-19 19:22:02.126\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 337 chunks\u001b[0m\n",
327
+ "\u001b[32m2025-04-19 19:22:02.147\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
328
+ "\u001b[32m2025-04-19 19:22:02.147\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
329
+ "\u001b[32m2025-04-19 19:22:02.167\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
330
+ "\u001b[32m2025-04-19 19:22:02.167\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
331
+ "\u001b[32m2025-04-19 19:22:02.186\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
332
+ "\u001b[32m2025-04-19 19:22:02.187\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
333
+ "\u001b[32m2025-04-19 19:22:02.207\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
334
+ "\u001b[32m2025-04-19 19:22:02.352\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
335
+ "\u001b[32m2025-04-19 19:22:02.354\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 337 documents into vectors collection\u001b[0m\n",
336
+ "\u001b[32m2025-04-19 19:22:08.520\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 337 vector chunks in database\u001b[0m\n"
337
+ ]
338
+ },
339
+ {
340
+ "data": {
341
+ "text/html": [
342
+ "Stored Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt’s 337 vectorized chunks to the database."
343
+ ]
344
+ },
345
+ "metadata": {},
346
+ "output_type": "display_data"
347
+ },
348
+ {
349
+ "data": {
350
+ "text/html": [
351
+ "Chunked Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt into 341 chunks."
352
+ ]
353
+ },
354
+ "metadata": {},
355
+ "output_type": "display_data"
356
+ },
357
+ {
358
+ "name": "stderr",
359
+ "output_type": "stream",
360
+ "text": [
361
+ "\u001b[32m2025-04-19 19:22:08.524\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 341 text string(s)…\u001b[0m\n"
362
+ ]
363
+ },
364
+ {
365
+ "data": {
366
+ "text/html": [
367
+ "Vectorized Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt’s 341 chunks."
368
+ ]
369
+ },
370
+ "metadata": {},
371
+ "output_type": "display_data"
372
+ },
373
+ {
374
+ "name": "stderr",
375
+ "output_type": "stream",
376
+ "text": [
377
+ "\u001b[32m2025-04-19 19:22:12.675\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 341 chunks\u001b[0m\n",
378
+ "\u001b[32m2025-04-19 19:22:12.712\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
379
+ "\u001b[32m2025-04-19 19:22:12.712\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
380
+ "\u001b[32m2025-04-19 19:22:12.731\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
381
+ "\u001b[32m2025-04-19 19:22:12.731\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
382
+ "\u001b[32m2025-04-19 19:22:12.750\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
383
+ "\u001b[32m2025-04-19 19:22:12.751\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
384
+ "\u001b[32m2025-04-19 19:22:12.773\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
385
+ "\u001b[32m2025-04-19 19:22:12.924\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
386
+ "\u001b[32m2025-04-19 19:22:12.926\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 341 documents into vectors collection\u001b[0m\n",
387
+ "\u001b[32m2025-04-19 19:22:18.356\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 341 vector chunks in database\u001b[0m\n"
388
+ ]
389
+ },
390
+ {
391
+ "data": {
392
+ "text/html": [
393
+ "Stored Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt’s 341 vectorized chunks to the database."
394
+ ]
395
+ },
396
+ "metadata": {},
397
+ "output_type": "display_data"
398
+ },
399
+ {
400
+ "data": {
401
+ "text/html": [
402
+ "Chunked Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt into 378 chunks."
403
+ ]
404
+ },
405
+ "metadata": {},
406
+ "output_type": "display_data"
407
+ },
408
+ {
409
+ "name": "stderr",
410
+ "output_type": "stream",
411
+ "text": [
412
+ "\u001b[32m2025-04-19 19:22:18.360\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 378 text string(s)…\u001b[0m\n"
413
+ ]
414
+ },
415
+ {
416
+ "data": {
417
+ "text/html": [
418
+ "Vectorized Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt’s 378 chunks."
419
+ ]
420
+ },
421
+ "metadata": {},
422
+ "output_type": "display_data"
423
+ },
424
+ {
425
+ "name": "stderr",
426
+ "output_type": "stream",
427
+ "text": [
428
+ "\u001b[32m2025-04-19 19:22:21.808\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 378 chunks\u001b[0m\n",
429
+ "\u001b[32m2025-04-19 19:22:21.841\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
430
+ "\u001b[32m2025-04-19 19:22:21.841\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
431
+ "\u001b[32m2025-04-19 19:22:21.873\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
432
+ "\u001b[32m2025-04-19 19:22:21.874\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
433
+ "\u001b[32m2025-04-19 19:22:21.894\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
434
+ "\u001b[32m2025-04-19 19:22:21.894\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
435
+ "\u001b[32m2025-04-19 19:22:21.914\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
436
+ "\u001b[32m2025-04-19 19:22:22.029\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
437
+ "\u001b[32m2025-04-19 19:22:22.035\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 378 documents into vectors collection\u001b[0m\n",
438
+ "\u001b[32m2025-04-19 19:22:28.108\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 378 vector chunks in database\u001b[0m\n"
439
+ ]
440
+ },
441
+ {
442
+ "data": {
443
+ "text/html": [
444
+ "Stored Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt’s 378 vectorized chunks to the database."
445
+ ]
446
+ },
447
+ "metadata": {},
448
+ "output_type": "display_data"
449
+ },
450
+ {
451
+ "data": {
452
+ "text/html": [
453
+ "Chunked Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt into 680 chunks."
454
+ ]
455
+ },
456
+ "metadata": {},
457
+ "output_type": "display_data"
458
+ },
459
+ {
460
+ "name": "stderr",
461
+ "output_type": "stream",
462
+ "text": [
463
+ "\u001b[32m2025-04-19 19:22:28.113\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 680 text string(s)…\u001b[0m\n"
464
+ ]
465
+ },
466
+ {
467
+ "data": {
468
+ "text/html": [
469
+ "Vectorized Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt’s 680 chunks."
470
+ ]
471
+ },
472
+ "metadata": {},
473
+ "output_type": "display_data"
474
+ },
475
+ {
476
+ "name": "stderr",
477
+ "output_type": "stream",
478
+ "text": [
479
+ "\u001b[32m2025-04-19 19:22:34.652\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 680 chunks\u001b[0m\n",
480
+ "\u001b[32m2025-04-19 19:22:34.671\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
481
+ "\u001b[32m2025-04-19 19:22:34.671\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
482
+ "\u001b[32m2025-04-19 19:22:34.705\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
483
+ "\u001b[32m2025-04-19 19:22:34.705\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
484
+ "\u001b[32m2025-04-19 19:22:34.720\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
485
+ "\u001b[32m2025-04-19 19:22:34.720\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
486
+ "\u001b[32m2025-04-19 19:22:34.740\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
487
+ "\u001b[32m2025-04-19 19:22:34.859\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
488
+ "\u001b[32m2025-04-19 19:22:34.866\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 680 documents into vectors collection\u001b[0m\n",
489
+ "\u001b[32m2025-04-19 19:22:43.431\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 680 vector chunks in database\u001b[0m\n"
490
+ ]
491
+ },
492
+ {
493
+ "data": {
494
+ "text/html": [
495
+ "Stored Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt’s 680 vectorized chunks to the database."
496
+ ]
497
+ },
498
+ "metadata": {},
499
+ "output_type": "display_data"
500
+ },
501
+ {
502
+ "data": {
503
+ "text/html": [
504
+ "Chunked Week-01-Setup-Pandas-Friday-2024-08-30.vtt into 742 chunks."
505
+ ]
506
+ },
507
+ "metadata": {},
508
+ "output_type": "display_data"
509
+ },
510
+ {
511
+ "name": "stderr",
512
+ "output_type": "stream",
513
+ "text": [
514
+ "\u001b[32m2025-04-19 19:22:43.438\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 742 text string(s)…\u001b[0m\n"
515
+ ]
516
+ },
517
+ {
518
+ "data": {
519
+ "text/html": [
520
+ "Vectorized Week-01-Setup-Pandas-Friday-2024-08-30.vtt’s 742 chunks."
521
+ ]
522
+ },
523
+ "metadata": {},
524
+ "output_type": "display_data"
525
+ },
526
+ {
527
+ "name": "stderr",
528
+ "output_type": "stream",
529
+ "text": [
530
+ "\u001b[32m2025-04-19 19:22:50.402\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 742 chunks\u001b[0m\n",
531
+ "\u001b[32m2025-04-19 19:22:50.426\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
532
+ "\u001b[32m2025-04-19 19:22:50.426\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
533
+ "\u001b[32m2025-04-19 19:22:50.452\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
534
+ "\u001b[32m2025-04-19 19:22:50.452\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
535
+ "\u001b[32m2025-04-19 19:22:50.475\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
536
+ "\u001b[32m2025-04-19 19:22:50.475\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
537
+ "\u001b[32m2025-04-19 19:22:50.508\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
538
+ "\u001b[32m2025-04-19 19:22:50.617\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
539
+ "\u001b[32m2025-04-19 19:22:50.626\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 742 documents into vectors collection\u001b[0m\n",
540
+ "\u001b[32m2025-04-19 19:23:01.166\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 742 vector chunks in database\u001b[0m\n"
541
+ ]
542
+ },
543
+ {
544
+ "data": {
545
+ "text/html": [
546
+ "Stored Week-01-Setup-Pandas-Friday-2024-08-30.vtt’s 742 vectorized chunks to the database."
547
+ ]
548
+ },
549
+ "metadata": {},
550
+ "output_type": "display_data"
551
+ }
552
+ ],
553
+ "source": [
554
+ "for web_vtt in web_vtts:\n",
555
+ " chunks = web_vtt.get_chunks()\n",
556
+ " display_html(f\"Chunked {web_vtt.get_metadata().get(\"filename\")} into {len(chunks)} chunks.\")\n",
557
+ " vectorized_chunks = vectorization_service.vectorize(chunks)\n",
558
+ " display_html(f\"Vectorized {web_vtt.get_metadata().get(\"filename\")}’s {len(vectorized_chunks)} chunks.\")\n",
559
+ " await (await vector_database_service).store(vectorized_chunks)\n",
560
+ " display_html(f\"Stored {web_vtt.get_metadata().get(\"filename\")}’s {len(vectorized_chunks)} vectorized chunks to the database.\")"
561
+ ]
562
+ }
563
+ ],
564
+ "metadata": {
565
+ "kernelspec": {
566
+ "display_name": ".venv",
567
+ "language": "python",
568
+ "name": "python3"
569
+ },
570
+ "language_info": {
571
+ "codemirror_mode": {
572
+ "name": "ipython",
573
+ "version": 3
574
+ },
575
+ "file_extension": ".py",
576
+ "mimetype": "text/x-python",
577
+ "name": "python",
578
+ "nbconvert_exporter": "python",
579
+ "pygments_lexer": "ipython3",
580
+ "version": "3.12.3"
581
+ }
582
+ },
583
+ "nbformat": 4,
584
+ "nbformat_minor": 2
585
+ }
notebooks/web_vtt.ipynb ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# WebVTT Reading and Chunking Test"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "## Pure `webvtt-py` as Proof-of-concept"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "from datetime import datetime, timedelta\n",
24
+ "from functools import partial\n",
25
+ "from html import escape\n",
26
+ "from io import BytesIO\n",
27
+ "from IPython.display import display_html\n",
28
+ "from itertools import chain\n",
29
+ "import re\n",
30
+ "from webvtt import Caption, WebVTT\n",
31
+ "from webvtt.models import Timestamp\n",
32
+ "from zoneinfo import ZoneInfo\n",
33
+ "\n",
34
+ "display_html = partial(display_html, raw=True)"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "FILE_PATH = \"GMT20250411-223535_Recording.transcript.vtt\"\n",
44
+ "TIME_ZONE = ZoneInfo(\"America/New_York\")\n",
45
+ "BASE_TIME = datetime(2025, 4, 11, hour=22, minute=35, second=35, tzinfo=ZoneInfo(\"GMT\")).astimezone(TIME_ZONE)"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 3,
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "with open(FILE_PATH, \"rb\") as file:\n",
55
+ " web_vtt = WebVTT.from_buffer(BytesIO(file.read()))"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 4,
61
+ "metadata": {},
62
+ "outputs": [
63
+ {
64
+ "data": {
65
+ "text/html": [
66
+ "<ul><li>__class__</li><li>__delattr__</li><li>__dict__</li><li>__dir__</li><li>__doc__</li><li>__eq__</li><li>__format__</li><li>__ge__</li><li>__getattribute__</li><li>__getitem__</li><li>__getstate__</li><li>__gt__</li><li>__hash__</li><li>__init__</li><li>__init_subclass__</li><li>__le__</li><li>__len__</li><li>__lt__</li><li>__module__</li><li>__ne__</li><li>__new__</li><li>__reduce__</li><li>__reduce_ex__</li><li>__repr__</li><li>__setattr__</li><li>__sizeof__</li><li>__str__</li><li>__subclasshook__</li><li>__weakref__</li><li>_get_destination_file</li><li>_get_lines</li><li>_has_bom</li><li>captions</li><li>content</li><li>encoding</li><li>file</li><li>footer_comments</li><li>from_buffer</li><li>from_sbv</li><li>from_srt</li><li>from_string</li><li>header_comments</li><li>iter_slice</li><li>read</li><li>read_buffer</li><li>save</li><li>save_as_srt</li><li>styles</li><li>total_length</li><li>write</li></ul>"
67
+ ]
68
+ },
69
+ "metadata": {},
70
+ "output_type": "display_data"
71
+ }
72
+ ],
73
+ "source": [
74
+ "display_html(\"\".join(chain(\"<ul>\", (f\"<li>{escape(member)}</li>\" for member in dir(web_vtt)), \"</ul>\")))"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 5,
80
+ "metadata": {},
81
+ "outputs": [
82
+ {
83
+ "data": {
84
+ "text/html": [
85
+ "\n",
86
+ " <strong>Caption</strong> #344\n",
87
+ " <ul>\n",
88
+ " <li><strong>Start:</strong> Friday, April 11, 2025, 07:36:54 PM EDT</li>\n",
89
+ " <li><strong>Speaker:</strong> CUNY Tech Prep (CTP)</li>\n",
90
+ " <li><strong>Speech:</strong> Alright. You can pick the rooms. Now go into your rooms.</li>\n",
91
+ " <li><strong>End:</strong> Friday, April 11, 2025, 07:36:57 PM EDT</li>\n",
92
+ " </ul>\n",
93
+ " "
94
+ ]
95
+ },
96
+ "metadata": {},
97
+ "output_type": "display_data"
98
+ }
99
+ ],
100
+ "source": [
101
+ "speaker_speech_pattern = re.compile(\"(?:([^:]+): )?(.*)\")\n",
102
+ "\n",
103
+ "match web_vtt.captions[343]:\n",
104
+ " case Caption(identifier=identifier, start_time=start_time, end_time=end_time, text=text):\n",
105
+ " match speaker_speech_pattern.search(text).groups():\n",
106
+ " case (speaker, speech):\n",
107
+ " display_html(f\"\"\"\n",
108
+ " <strong>Caption</strong> #{identifier}\n",
109
+ " <ul>\n",
110
+ " <li><strong>Start:</strong> {BASE_TIME + timedelta(**start_time.__dict__):%A, %B %d, %Y, %I:%M:%S %p %Z}</li>\n",
111
+ " <li><strong>Speaker:</strong> {escape(speaker)}</li>\n",
112
+ " <li><strong>Speech:</strong> {escape(speech)}</li>\n",
113
+ " <li><strong>End:</strong> {BASE_TIME + timedelta(**end_time.__dict__):%A, %B %d, %Y, %I:%M:%S %p %Z}</li>\n",
114
+ " </ul>\n",
115
+ " \"\"\")"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "markdown",
120
+ "metadata": {},
121
+ "source": [
122
+ "### Chunking\n",
123
+ "\n",
124
+ "In order for chunking to produce bits with useful context, we must not only use the caption (frame) itself, but bundle it with its surrounding frames (before and after messages)."
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 6,
130
+ "metadata": {},
131
+ "outputs": [],
132
+ "source": [
133
+ "from more_itertools import windowed"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 7,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "CHUNK_FRAMES_OVERLAP = 1\n",
143
+ "CHUNK_FRAMES_WINDOW = 5"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 8,
149
+ "metadata": {},
150
+ "outputs": [
151
+ {
152
+ "data": {
153
+ "text/html": [
154
+ "<table><tr><td>A</td></tr><tr><td>B</td></tr><tr><td>C</td></tr><tr><td>D</td></tr><tr><td>E</td></tr><tr><td>F</td></tr><tr><td>G</td></tr><tr><td>H</td></tr><tr><td>I</td></tr><tr><td>J</td></tr><tr><td>K</td></tr><tr><td>L</td></tr><tr><td>M</td></tr><tr><td>N</td></tr><tr><td>O</td></tr><tr><td>P</td></tr><tr><td>Q</td></tr><tr><td>R</td></tr><tr><td>S</td></tr><tr><td>T</td></tr><tr><td>U</td></tr><tr><td>V</td></tr><tr><td>W</td></tr><tr><td>X</td></tr><tr><td>Y</td></tr><tr><td>Z</td></tr></table>"
155
+ ]
156
+ },
157
+ "metadata": {},
158
+ "output_type": "display_data"
159
+ }
160
+ ],
161
+ "source": [
162
+ "items = tuple(chr(code_point) for code_point in range(ord('A'), ord('[')))\n",
163
+ "display_html(f\"<table>{\"\".join(map(\"<tr><td>{}</td></tr>\".format, items))}</table>\")"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": 9,
169
+ "metadata": {},
170
+ "outputs": [
171
+ {
172
+ "data": {
173
+ "text/html": [
174
+ "<table><tr><td>A</td><td>B</td><td>C</td><td>D</td><td>E</td></tr><tr><td>E</td><td>F</td><td>G</td><td>H</td><td>I</td></tr><tr><td>I</td><td>J</td><td>K</td><td>L</td><td>M</td></tr><tr><td>M</td><td>N</td><td>O</td><td>P</td><td>Q</td></tr><tr><td>Q</td><td>R</td><td>S</td><td>T</td><td>U</td></tr><tr><td>U</td><td>V</td><td>W</td><td>X</td><td>Y</td></tr><tr><td>Y</td><td>Z</td><td></td><td></td><td></td></tr></table>"
175
+ ]
176
+ },
177
+ "metadata": {},
178
+ "output_type": "display_data"
179
+ }
180
+ ],
181
+ "source": [
182
+ "chunks = tuple(windowed(items, CHUNK_FRAMES_WINDOW, step=(CHUNK_FRAMES_WINDOW - CHUNK_FRAMES_OVERLAP)))\n",
183
+ "display_html(f\"<table>{\"\".join(f\"<tr>{\"\".join(f\"<td>{item if item else \"\"}</td>\" for item in chunk)}</tr>\" for chunk in chunks)}</table>\")"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "markdown",
188
+ "metadata": {},
189
+ "source": [
190
+ "## Using the `WebVTTFile` Class"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": 10,
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "from datetime import datetime\n",
200
+ "from hashlib import sha256\n",
201
+ "from zoneinfo import ZoneInfo\n",
202
+ "\n",
203
+ "from ctp_slack_bot.models import WebVTTContent"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "metadata": {},
210
+ "outputs": [],
211
+ "source": [
212
+ "FILE_PATH = \"GMT20250411-223535_Recording.transcript.vtt\"\n",
213
+ "TIME_ZONE = ZoneInfo(\"America/New_York\")\n",
214
+ "MODIFICATION_TIME = datetime(2025, 4, 11, hour=22, minute=35, second=35, tzinfo=ZoneInfo(\"GMT\")).astimezone(TIME_ZONE)"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": 12,
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "with open(FILE_PATH, \"rb\") as file:\n",
224
+ " bytes = file.read()\n",
225
+ " web_vtt_content = WebVTTContent.from_bytes(sha256(bytes).hexdigest(), {\"modification_time\": MODIFICATION_TIME}, bytes)"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 13,
231
+ "metadata": {},
232
+ "outputs": [
233
+ {
234
+ "data": {
235
+ "text/plain": [
236
+ "(Chunk(text=\"iyeshia: For the workshop. We want to set you up.\\n\\niyeshia: Thank you, Kevin, for a question. We want to set you up for success in year one. And so this workshop is to help you kind of like\\n\\niyeshia: figure out, or how to adjust, as you're coming into your careers what to expect like your 30 days of work, 60 days of work, 90 days of work when you are starting your full time roles. So with that, said, let us get started.\\n\\niyeshia: So the topic, of course, is going to be discussing things of like the onboarding process of what it looks like when you start your jobs. How to maneuver or move around in your workplace environments. We'll discuss negotiating raises, because last time we didn't negotiating offers. So now we pass that you already got the offer. So now we'd be at the\\n\\niyeshia: the race card after that year. Don't try to come into your job already. 5 days in somebody to raise. Wait, and then from there we'll do activity on asking for feedback when you have, like your supervisor or manager, and you want to discuss things like that.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='1-5', metadata={'start': datetime.timedelta(0), 'end': datetime.timedelta(seconds=60, microseconds=379000), 'speakers': frozenset({'iyeshia'})}),\n",
237
+ " Chunk(text=\"iyeshia: the race card after that year. Don't try to come into your job already. 5 days in somebody to raise. Wait, and then from there we'll do activity on asking for feedback when you have, like your supervisor or manager, and you want to discuss things like that.\\n\\niyeshia: So let's kick it off with the onboarding process.\\n\\niyeshia: So with this, what you can expect ideally when you start your your job. There could be some type of welcome package. They might have a folder. They might have an email electronically or things like that. But it's gonna describe the details of like the company's environment. What your 1st day, or your 1st week or 1st month, a couple of months, might look like. As you're starting your onboarding process and the paperwork they might even show with you on the 1st day\\n\\niyeshia: work. You might be paired up with a Buddy or other people who might be hired at the same day, or maybe someone who was hired a year before, and they might be shadowing you to help you join and to get comfortable with your work environment.\\n\\niyeshia: and then also, your manager will. Hopefully, our supervisor would let you know what to expect. As you're starting your new\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='5-9', metadata={'start': datetime.timedelta(seconds=45, microseconds=930000), 'end': datetime.timedelta(seconds=108, microseconds=640000), 'speakers': frozenset({'iyeshia'})}),\n",
238
+ " Chunk(text=\"iyeshia: and then also, your manager will. Hopefully, our supervisor would let you know what to expect. As you're starting your new\\n\\niyeshia: job or career, and then from there, if you're unsure about your onboarding process as you're starting off, please ask questions to your manager or supervisor. The best part is to ask as many questions as you can. You're new, you're learning. They understand that. So they want to hear from you and your input\\n\\niyeshia: from there, I would say, I'm just looking at the\\n\\niyeshia: the chat. Yes, prepare for a lot of paperwork. Yes, I mean W. 2 W. Fours. They might have you fill out all those things. And that was 2. Okay, all right, Kevin.\\n\\niyeshia: So from there we'll kick it off. So an idea of what that could look like for you from 30 days to 60 days to 90 days to infinity and beyond like buzz light year, but from there you would hopefully to have intros with your your team, your manager, different departments. When you're starting\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='9-13', metadata={'start': datetime.timedelta(seconds=102, microseconds=82000), 'end': datetime.timedelta(seconds=166, microseconds=199000), 'speakers': frozenset({'iyeshia'})}),\n",
239
+ " Chunk(text=\"iyeshia: So from there we'll kick it off. So an idea of what that could look like for you from 30 days to 60 days to 90 days to infinity and beyond like buzz light year, but from there you would hopefully to have intros with your your team, your manager, different departments. When you're starting\\n\\niyeshia: they'll go over etiquette with you of like what you can expect. At the job that can include your attire, your desk hygiene communication, checking in with managers or teams.\\n\\niyeshia: Once you, after the 30 days we get to maybe days, 60 days, and then you're able to develop like your needs. Gain a better understanding of the company, develop plans and deliverables and outcomes. And then you go into your 90 days of being on the job where you're kind of learning your role. You're kind of getting adjust, you're being more effective and being becoming more independent.\\n\\niyeshia: And then from there you be able to understand, like, after the 90 days that you're kind of like settled in maybe months 4 to 6, or maybe the whole year. You should be settled into your role, understanding what's going on understanding how different departments move and things like that. So this is just the overview of what that looks like. It's not necessarily concrete, because every job is different.\\n\\niyeshia: But this is just to give an idea of what you can expect of that. And please just be mindful like with every workshop. I'm definitely going to send you the Powerpoint at the end. So if you want to look over that on your own time, you definitely can.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='13-17', metadata={'start': datetime.timedelta(seconds=147, microseconds=8000), 'end': datetime.timedelta(seconds=233, microseconds=730000), 'speakers': frozenset({'iyeshia'})}),\n",
240
+ " Chunk(text=\"iyeshia: But this is just to give an idea of what you can expect of that. And please just be mindful like with every workshop. I'm definitely going to send you the Powerpoint at the end. So if you want to look over that on your own time, you definitely can.\\n\\niyeshia: And so now that we've got through the onboarding process, this is probably the quickest we've done onboarding process because Kevin did it in 2 weeks. So from there we are going to move to navigating the workplace environment.\\n\\niyeshia: And so with that said, some things that are really important in your workplace environment is building relationships. Whether that's with your peers, your colleagues. Your manager. Trying to have a mentor mentee connection. All relationships are important.\\n\\niyeshia: With that I would say that when it comes to identifying your relationship needs, you want to know what you're expecting like, what? How do you need to show up in your role. What do you need from others? Understanding those type of things can help build better, I would say. Connections with your teammates and things of that nature when it's time to like cover problems or solve projects and things like that.\\n\\niyeshia: Another thing, too, you want to focus on is your Eiq. Emotional intelligence and communication that is basically pretty much helpful on the ability of recognizing your own emotions. Are you adequate enough, or know where your emotions are where you can get things done, what you need, what you don't need? Can you articulate that to your employer when you know those you can be able to identify and handle your emotions.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='17-21', metadata={'start': datetime.timedelta(seconds=220, microseconds=406000), 'end': datetime.timedelta(seconds=315, microseconds=170000), 'speakers': frozenset({'iyeshia'})}),\n",
241
+ " Chunk(text=\"iyeshia: Another thing, too, you want to focus on is your Eiq. Emotional intelligence and communication that is basically pretty much helpful on the ability of recognizing your own emotions. Are you adequate enough, or know where your emotions are where you can get things done, what you need, what you don't need? Can you articulate that to your employer when you know those you can be able to identify and handle your emotions.\\n\\niyeshia: And you can add basically help also to learn how to understand and help others. As well.\\n\\niyeshia: Another thing, as far as building relationships goes, is practicing, mindful listening. So the best way to truly listen is to talk less, and of course to understand more. And so when you learn from your teammates, listen as much as you can gain as much knowledge as you can from others, and that's gonna help you kinda conduct, or, you know, be a better team player. In your work environment.\\n\\niyeshia: And then a few things that you can do is\\n\\niyeshia: another way to help build a relationship is manager boundaries, you know, saying what is for you, scheduling time? With colleagues trying not to go over certain tasks or assignments. So that time management is gonna definitely help when you want to focus on your boundaries and you want to set schedules to maybe build connections with your team, and these are ways that you can go about it. Introduce yourself to people, whether your peers, whether it's\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='21-25', metadata={'start': datetime.timedelta(seconds=288, microseconds=600000), 'end': datetime.timedelta(seconds=376, microseconds=110000), 'speakers': frozenset({'iyeshia'})}),\n",
242
+ " Chunk(text=\"iyeshia: another way to help build a relationship is manager boundaries, you know, saying what is for you, scheduling time? With colleagues trying not to go over certain tasks or assignments. So that time management is gonna definitely help when you want to focus on your boundaries and you want to set schedules to maybe build connections with your team, and these are ways that you can go about it. Introduce yourself to people, whether your peers, whether it's\\n\\niyeshia: I don't care if it's a janitor security. The Cfo treat everybody equal and the same. And get to know. Get to know people because you just never know when you're going to need someone or work with someone. During that time.\\n\\niyeshia: And so those are the ways you can go about it. Greet people. You can invite people to coffee breaks, do quick message, check-in, and things of that nature, and then from there the 6 or 7 1, i think, are really important in the workplace environment. Some of the things you want to do is show gratitude, embrace others, give.\\n\\niyeshia: you know, credit where credit is due. Don't try to take anybody's ideas. If it comes to projects and things like that, that is a serious no-no show gratitude, and by any means necessary, try to avoid any gossip, any issues with office politics stay out of it. This is your first.st\\n\\niyeshia: This might be your 1st real like role, as far as like full time. In your career. So you just want to make sure you just keep in the peace and be respectful from there. Gossiping is kind of a big deal and a big no-no as well. So just be mindful of that.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='25-29', metadata={'start': datetime.timedelta(seconds=351, microseconds=10000), 'end': datetime.timedelta(seconds=438, microseconds=590000), 'speakers': frozenset({'iyeshia'})}),\n",
243
+ " Chunk(text=\"iyeshia: This might be your 1st real like role, as far as like full time. In your career. So you just want to make sure you just keep in the peace and be respectful from there. Gossiping is kind of a big deal and a big no-no as well. So just be mindful of that.\\n\\niyeshia: So the next thing, as far as we're talking about building relationship goals, you definitely want to also build those relationships, as I stated, with your peers. And things like that. Your coworkers? But you want to make sure you build a relationship with your manager. And just remember that it's important to have a relationship with your manager. But that's not the only relationship that's like you should focus on, you know. Like, I said before, you want to be a team play. You want to treat everybody equally because you just never know who you connect with.\\n\\niyeshia: But when it comes to that manager time, or asking for I would say, supervisions or meetings with them. You can ask questions. Those are always encouraged. You can ask them about their you know, supervisor style. Are they transformative? Are they hands on?\\n\\niyeshia: Do they like feedback directly towards them? Is everything written email? How are they? What's their work? Style? You can even ask them for the expectations of what is this like in a role like, what are your expectations, as far as how you show up in your role to them? And what are they looking for like with the measurements of success. Of course we always tell fellows to document everything that you do, as far as like when it comes to any goals that you bring any success.\\n\\niyeshia: rate, that you have many tasks that you might have brought to the table any of your accomplishments I know some people carry, or they write down like a accomplishment form of all the things that they've done, which, while they were at work to help with the ideas of what they bring to the table when it's time to come up for that, raise negotiation process. 
So just make sure you also update your resume as we go along, too.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='29-33', metadata={'start': datetime.timedelta(seconds=424, microseconds=830000), 'end': datetime.timedelta(seconds=536, microseconds=219000), 'speakers': frozenset({'iyeshia'})}),\n",
244
+ " Chunk(text=\"iyeshia: rate, that you have many tasks that you might have brought to the table any of your accomplishments I know some people carry, or they write down like a accomplishment form of all the things that they've done, which, while they were at work to help with the ideas of what they bring to the table when it's time to come up for that, raise negotiation process. So just make sure you also update your resume as we go along, too.\\n\\niyeshia: and then to talk with your manager about not only your successes and what you accomplish, but maybe areas of where you can grow and what you've been struggling to focus on so they can help support you with that as well.\\n\\niyeshia: Be observant in meetings when you're meeting with your team and other people. So that way you could learn about what else is going on, or whatever what everybody else is doing. So you can see how things work together. If you want to connect and socialize, you can ask people to lunch or coffee chats and things like that, and then always just remain proactive. You know it's always a good gesture to ask for teammate. It's like, Hey, is there anything you need before you know the end of the day? Or before I'm about to leave. You know things like that. It's always\\n\\niyeshia: helpful, too, because you never know when it's like your time, and someone is asking or offering help to you. And you're like, Oh, yeah, definitely need help with this. So it's always great to return their favor.\\n\\niyeshia: And so\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='33-37', metadata={'start': datetime.timedelta(seconds=511, microseconds=850000), 'end': datetime.timedelta(seconds=589, microseconds=330000), 'speakers': frozenset({'iyeshia'})}),\n",
245
+ " Chunk(text=\"iyeshia: And so\\n\\niyeshia: from there I would say, overall in regards of meeting with your supervisor, depending on how they do it. It could be quarterly it could be every other month. It could be 3 times throughout the year. They have a performance review. And so some companies like to start with, maybe January, you start, or maybe June, you started\\n\\niyeshia: working with them, and you track goals and what you could accomplish. With your manager until, like the next meeting, you have to go over just to make sure that you're on track with your goals throughout the throughout the year, as you've been working with your with your company.\\n\\niyeshia: That you got hired by, and so sometimes they'll do like a mid year review report to see your progress. If there's any touch points they could assist you with or support you with. You can meet with them with one on one meetings. If you feel like that's too long, and you want to make suggestions to meet with them sooner. Maybe you want to do every 3 months\\n\\niyeshia: just to see what's going on and how you can stay on track, and so I would say. Performance reviews, I guess, could be nerve wracking if it's like your 1st time, because you don't know what to expect.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='37-41', metadata={'start': datetime.timedelta(seconds=587, microseconds=800000), 'end': datetime.timedelta(seconds=654, microseconds=640000), 'speakers': frozenset({'iyeshia'})}),\n",
246
+ " Chunk(text=\"iyeshia: just to see what's going on and how you can stay on track, and so I would say. Performance reviews, I guess, could be nerve wracking if it's like your 1st time, because you don't know what to expect.\\n\\niyeshia: but of course you'll get used to it. As it progresses. But then, of course, you're still maintaining those connections with your supervisor, so you can definitely ask them questions of what you can expect from a performance review and things like that.\\n\\niyeshia: I'll pause here. If anybody has any questions about anything that I've mentioned. Anything like that?\\n\\niyeshia: Any questions? Are we all good.\\n\\nCUNY Tech Prep (CTP): Now's your chance before you forget what you wanted to ask.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='41-45', metadata={'start': datetime.timedelta(seconds=645, microseconds=172000), 'end': datetime.timedelta(seconds=682, microseconds=250000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
247
+ " Chunk(text=\"CUNY Tech Prep (CTP): Now's your chance before you forget what you wanted to ask.\\n\\nCUNY Tech Prep (CTP): No takers.\\n\\nCUNY Tech Prep (CTP): I have a few comments.\\n\\niyeshia: You want to go ahead, Kevin.\\n\\nCUNY Tech Prep (CTP): Well, self, I see self document as also having a secondary goal, particularly if you find yourself in\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='45-49', metadata={'start': datetime.timedelta(seconds=678, microseconds=110000), 'end': datetime.timedelta(seconds=700, microseconds=910000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
248
+ " Chunk(text=\"CUNY Tech Prep (CTP): Well, self, I see self document as also having a secondary goal, particularly if you find yourself in\\n\\nCUNY Tech Prep (CTP): not such a nice work environment.\\n\\nCUNY Tech Prep (CTP): It helps prevent people from gaslighting. You, for example.\\n\\nCUNY Tech Prep (CTP): And like it keeps you out of trouble. Let's say cause if you self document, then\\n\\nCUNY Tech Prep (CTP): you know exactly what was decided on.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='49-53', metadata={'start': datetime.timedelta(seconds=693, microseconds=509000), 'end': datetime.timedelta(seconds=720, microseconds=809000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
249
+ " Chunk(text=\"CUNY Tech Prep (CTP): you know exactly what was decided on.\\n\\nCUNY Tech Prep (CTP): And you're just following exactly what was said.\\n\\niyeshia: That is correct.\\n\\nCUNY Tech Prep (CTP): And then the setting boundaries right.\\n\\nCUNY Tech Prep (CTP): and there are some. There are some\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='53-57', metadata={'start': datetime.timedelta(seconds=717, microseconds=970000), 'end': datetime.timedelta(seconds=732, microseconds=590000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
250
+ " Chunk(text=\"CUNY Tech Prep (CTP): and there are some. There are some\\n\\nCUNY Tech Prep (CTP): bosses who will push your boundaries. Try to get you to like\\n\\nCUNY Tech Prep (CTP): do overtime. Stay longer than like\\n\\nCUNY Tech Prep (CTP): your stay longer than what's on like the contract, or whatever.\\n\\nCUNY Tech Prep (CTP): If you give an inch sometimes they'll take a mile, so\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='57-61', metadata={'start': datetime.timedelta(seconds=729, microseconds=400000), 'end': datetime.timedelta(seconds=749, microseconds=960000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
251
+ " Chunk(text=\"CUNY Tech Prep (CTP): If you give an inch sometimes they'll take a mile, so\\n\\nCUNY Tech Prep (CTP): you should be very clear on\\n\\nCUNY Tech Prep (CTP): your time. Your time limits, like.\\n\\nCUNY Tech Prep (CTP): you know, have always have an out, for\\n\\nCUNY Tech Prep (CTP): when too much is being requested of you.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='61-65', metadata={'start': datetime.timedelta(seconds=745, microseconds=275000), 'end': datetime.timedelta(seconds=767, microseconds=120000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
252
+ " Chunk(text=\"CUNY Tech Prep (CTP): when too much is being requested of you.\\n\\nCUNY Tech Prep (CTP): My usual go to is like, Oh, I I have like I have a meeting for Ctp, or like I have class.\\n\\niyeshia: Very good. That's good to good to know. And I know. David. Put in the chat like for an example of documentation. On March 16, th at 4, 35, you said, and I quote that is, that is exactly.\\n\\nCUNY Tech Prep (CTP): Under my lap.\\n\\niyeshia: But if you're in that situation, you definitely, it's so fresh, and it's so like truthful, like someone's like, no, I'm not going to doubt that someone made that.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='65-69', metadata={'start': datetime.timedelta(seconds=764, microseconds=400000), 'end': datetime.timedelta(seconds=803, microseconds=550000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
253
+ " Chunk(text=\"iyeshia: But if you're in that situation, you definitely, it's so fresh, and it's so like truthful, like someone's like, no, I'm not going to doubt that someone made that.\\n\\nCUNY Tech Prep (CTP): Yeah.\\n\\niyeshia: We wrote that and gave them the time so absolutely documentation goals for the good and for the bad. So definitely. Thank you for sharing that Kevin and David?\\n\\niyeshia: And so with that said, We'll go on to the the next slide. Which is a question of is my manager the same as having a mentor. Does anybody want to come off the come off mute and say yes or no?\\n\\niyeshia: I can just call on Kyle.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='69-73', metadata={'start': datetime.timedelta(seconds=795, microseconds=400000), 'end': datetime.timedelta(seconds=831, microseconds=790000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
254
+ " Chunk(text=\"iyeshia: I can just call on Kyle.\\n\\nCUNY Tech Prep (CTP): Kyle, you there.\\n\\nKyle Schoenhardt: No, it's not.\\n\\niyeshia: Okay, let's see.\\n\\niyeshia: Yay, good job, PAL. The answer is, no.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='73-77', metadata={'start': datetime.timedelta(seconds=828, microseconds=820000), 'end': datetime.timedelta(seconds=844, microseconds=930000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'Kyle Schoenhardt', 'iyeshia'})}),\n",
255
+ " Chunk(text=\"iyeshia: Yay, good job, PAL. The answer is, no.\\n\\niyeshia: Did you want to give more input?\\n\\nKyle Schoenhardt: Yeah. Sure.\\n\\niyeshia: Yeah.\\n\\nKyle Schoenhardt: Well, I mean, sometimes you can just have really bad managers who are there to cover their own self, make themselves look good sometimes at your expense, or they micromanage, or you just don't click well with that person. For whatever reason a mentor is akin to a leader, I think they are there to lift you up and show you\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='77-81', metadata={'start': datetime.timedelta(seconds=841, microseconds=340000), 'end': datetime.timedelta(seconds=869, microseconds=440000), 'speakers': frozenset({'Kyle Schoenhardt', 'iyeshia'})}),\n",
256
+ " Chunk(text=\"Kyle Schoenhardt: Well, I mean, sometimes you can just have really bad managers who are there to cover their own self, make themselves look good sometimes at your expense, or they micromanage, or you just don't click well with that person. For whatever reason a mentor is akin to a leader, I think they are there to lift you up and show you\\n\\nKyle Schoenhardt: how you can improve on yourself like a coach.\\n\\nKyle Schoenhardt: Constantly giving you feedback, whether positive or negative.\\n\\nKyle Schoenhardt: I would say someone you would\\n\\nKyle Schoenhardt: go to immediately like. If the 1st person you think of that you need help with something is not your manager, then that's\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='81-85', metadata={'start': datetime.timedelta(seconds=850, microseconds=340000), 'end': datetime.timedelta(seconds=885, microseconds=510000), 'speakers': frozenset({'Kyle Schoenhardt'})}),\n",
257
+ " Chunk(text=\"Kyle Schoenhardt: go to immediately like. If the 1st person you think of that you need help with something is not your manager, then that's\\n\\nKyle Schoenhardt: a good indicator, that that person is not a mentor, or, if you need help with something, your your 1st go to person to that you think of is\\n\\nKyle Schoenhardt: someone else that is probably who your mentor is most likely to be, could be a coworker. It could be a manager, but it's not always.\\n\\niyeshia: Got it. Thank you, Kevin. I mean. Thank you, Kyle, said Kevin. Thank you. Kyle. Appreciate that. With that, said, I don't feel like I need to add any more. I feel like Kyle took that. So I'm gonna move on to the day.\\n\\niyeshia: So the next question is, should my manager, be my mentor.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='85-89', metadata={'start': datetime.timedelta(seconds=879, microseconds=360000), 'end': datetime.timedelta(seconds=919, microseconds=30000), 'speakers': frozenset({'Kyle Schoenhardt', 'iyeshia'})}),\n",
258
+ " Chunk(text=\"iyeshia: So the next question is, should my manager, be my mentor.\\n\\niyeshia: Alison.\\n\\nAllison Lee: Well, you you can't force a mentor mentee relationship if that's not how it's going to work.\\n\\nAllison Lee: But it is possible for your manager to be some kind of mentor figure.\\n\\niyeshia: Thank you.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='89-93', metadata={'start': datetime.timedelta(seconds=914, microseconds=565000), 'end': datetime.timedelta(seconds=945, microseconds=810000), 'speakers': frozenset({'iyeshia', 'Allison Lee'})}),\n",
259
+ " Chunk(text=\"iyeshia: Thank you.\\n\\niyeshia: So with that, said.\\n\\niyeshia: that depends. So I appreciate Allison. Your response. It definitely depends. Can't force them. But of course, if you do get along with your supervisor, and you want to ask them that\\n\\niyeshia: by all means. But good, answers everyone.\\n\\niyeshia: So now we go more in depth of what can good mentorship look like? And so from there I would say, mentors, as\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='93-97', metadata={'start': datetime.timedelta(seconds=944, microseconds=920000), 'end': datetime.timedelta(seconds=975, microseconds=362000), 'speakers': frozenset({'iyeshia'})}),\n",
260
+ " Chunk(text=\"iyeshia: So now we go more in depth of what can good mentorship look like? And so from there I would say, mentors, as\\n\\niyeshia: Kyle touched on was that they provide support, wisdom to help you succeed in certain examples are, this is pretty much sharing any ideas you might have with them from paying program with you on a code base providing feedback, maybe on a slide deck to helping you remind that it's impossible to know everything. So they're kind of reassuring you in your in your role as you're starting your career.\\n\\niyeshia: and then you want to make sure your mentor is a is a safe space for you at the time. Sometimes your mentor. You can talk to your mentor about your manager sometimes if they are difficult or not, and so from there it's a form of trust\\n\\niyeshia: with your with your mentor. So if you have, if you are blessed to have a supervisor who can be both roles, a manager and a mentor. Go for it, if you're like. I'm still learning. I'm only 3, 30 days in 60 days, 90 days. Take your time, then. So that is definitely something to to know from that.\\n\\niyeshia: And then questions of Where can I find? A mentor? And so, before I even answer this question, who can tell me what erg stands for\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='97-101', metadata={'start': datetime.timedelta(seconds=964, microseconds=630000), 'end': datetime.timedelta(seconds=1046, microseconds=430000), 'speakers': frozenset({'iyeshia'})}),\n",
261
+ " Chunk(text=\"iyeshia: And then questions of Where can I find? A mentor? And so, before I even answer this question, who can tell me what erg stands for\\n\\niyeshia: anyone?\\n\\niyeshia: Go ahead, Devon, please.\\n\\nDevin Xie (no cam): Employee resource groups.\\n\\niyeshia: Thank you so much, Devin. I appreciate you and blouse right there. Next to erg. So the examples of that can be any groups that they have at your job related to Lgbtq. It could be groups related to race and identity. It could be anything from parenthood. I wish they had groups related for auntiehood and things of that nature. But it's all about finding your community and resources for things to help support you while you're working\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='101-105', metadata={'start': datetime.timedelta(seconds=1035, microseconds=839000), 'end': datetime.timedelta(seconds=1085, microseconds=780000), 'speakers': frozenset({'iyeshia', 'Devin Xie (no cam)'})}),\n",
262
+ " Chunk(text=\"iyeshia: Thank you so much, Devin. I appreciate you and blouse right there. Next to erg. So the examples of that can be any groups that they have at your job related to Lgbtq. It could be groups related to race and identity. It could be anything from parenthood. I wish they had groups related for auntiehood and things of that nature. But it's all about finding your community and resources for things to help support you while you're working\\n\\niyeshia: in some of your environments. And then, when you have your community, you can always reflect on interests related to tech.\\n\\niyeshia: or maybe research on your company like, who's in your area. And you could always reach out to some people for informational interviews. If you're really trying to seek this mentor Mentee relationship from people who are at your company. So just to keep that in mind.\\n\\niyeshia: I think I saw something.\\n\\niyeshia: Auntie Hood. Yes, and then I think, Mingle, said Manager supervisors are not your friend. Their one and only job is to find a person that can get the job done. Okay, come on, now, very good. And so\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='105-109', metadata={'start': datetime.timedelta(seconds=1057, microseconds=780000), 'end': datetime.timedelta(seconds=1131, microseconds=240000), 'speakers': frozenset({'iyeshia'})}),\n",
263
+ " Chunk(text=\"iyeshia: Auntie Hood. Yes, and then I think, Mingle, said Manager supervisors are not your friend. Their one and only job is to find a person that can get the job done. Okay, come on, now, very good. And so\\n\\niyeshia: with that, said, I think y'all know the roles between manager and mentor, and I appreciate that.\\n\\niyeshia: So now the next part is negotiating raises. So the last workshop we did was negotiating offers, as I stated before. So this one's gonna be a little different. You got the job. So now, after that whole success in your 1st year you want to start discussing maybe time for a raise. So let's get into that.\\n\\niyeshia: So you did a great job.\\n\\niyeshia: 1st year you knocked it out. You got outcomes, you got successes. You're amazing. On the 1st year what happens now?\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='109-113', metadata={'start': datetime.timedelta(seconds=1114, microseconds=170000), 'end': datetime.timedelta(seconds=1167, microseconds=119000), 'speakers': frozenset({'iyeshia'})}),\n",
264
+ " Chunk(text=\"iyeshia: 1st year you knocked it out. You got outcomes, you got successes. You're amazing. On the 1st year what happens now?\\n\\niyeshia: Your success is going to be measured by achievements, contributions into your organization, and that could be rewarded with\\n\\niyeshia: money or something else you value that could be related to time. Things of that nature. You want to go up based off your benefits. As we stated before, in the last workshop, you might wanna negotiate that. But if you want to talk about money first.st That's okay, too.\\n\\niyeshia: And these are gonna help you, too, as well with your I would say. Manager or supervisor. Meetings\\n\\niyeshia: from there.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='113-117', metadata={'start': datetime.timedelta(seconds=1160, microseconds=790000), 'end': datetime.timedelta(seconds=1199, microseconds=450000), 'speakers': frozenset({'iyeshia'})}),\n",
265
+ " Chunk(text=\"iyeshia: from there.\\n\\niyeshia: So just remember that it's okay when you when you flex those negotiating offers or flex those muscles during conversations around raises. It's not bragging. If you're talking about your achievements and things like that. It's okay to to talk about your successes, you know, especially during a raise time, because you're trying to show your manager or prove what you brought to the to the table. So keep that in mind.\\n\\niyeshia: So how does it look.\\n\\nCUNY Tech Prep (CTP): Comments, sorry.\\n\\niyeshia: Yeah, that is.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='117-121', metadata={'start': datetime.timedelta(seconds=1198, microseconds=703000), 'end': datetime.timedelta(seconds=1228, microseconds=390000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
266
+ " Chunk(text='iyeshia: Yeah, that is.\\n\\nCUNY Tech Prep (CTP): Something you would also document. If your manager praises you, you document that.\\n\\niyeshia: That.\\n\\nCUNY Tech Prep (CTP): Is evidence you can use in your negotiations.\\n\\niyeshia: That is such a fact.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='121-125', metadata={'start': datetime.timedelta(seconds=1227, microseconds=350000), 'end': datetime.timedelta(seconds=1240, microseconds=380000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
267
+ " Chunk(text=\"iyeshia: That is such a fact.\\n\\niyeshia: I literally just copy to paste everything, my manager said. Yep, one of my negotiation days. Yep, so thank you, Kevin, for saying that? So with that said, if you have those those meetings with them, document not only what you say, but what they said, as Kevin mentioned.\\n\\niyeshia: That was great in the negotiating offer. So how else do we prepare for this?\\n\\niyeshia: You're going to research? Yes, you're going to gather all your feedback, whether it's from your colleagues and meetings, whether it's from the success that you hear from your manager or tips from people that you work with, you're going to make sure you learn about your role. What's going on in the market. Just research is going to be your best.\\n\\niyeshia: Put input on this as well. When you're talking about your salary. The next thing you want to do is list the accomplishments. Keep those documents. Don't wait to the last minute you get to the end of the year. You're like, what did I do? It's been 12 months, like.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='125-129', metadata={'start': datetime.timedelta(seconds=1238, microseconds=990000), 'end': datetime.timedelta(seconds=1296, microseconds=189000), 'speakers': frozenset({'iyeshia'})}),\n",
268
+ " Chunk(text=\"iyeshia: Put input on this as well. When you're talking about your salary. The next thing you want to do is list the accomplishments. Keep those documents. Don't wait to the last minute you get to the end of the year. You're like, what did I do? It's been 12 months, like.\\n\\niyeshia: yeah, document everything, because you might forget some stuff. So that's definitely gonna help, too.\\n\\niyeshia: With that, said, you want to make sure you remind everyone. Maybe you save a bunch of money for the company. Oh, maybe you help them with other accomplishments, or maybe you spend off a project that's done really well. For your department. Share it. So please feel free to do that.\\n\\niyeshia: and then that will also help you keep your resume updated as well. So you don't have to worry about trying to\\n\\niyeshia: scatter or get all your thoughts together at the last minute.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='129-133', metadata={'start': datetime.timedelta(seconds=1281, microseconds=940000), 'end': datetime.timedelta(seconds=1331, microseconds=399000), 'speakers': frozenset({'iyeshia'})}),\n",
269
+ " Chunk(text=\"iyeshia: scatter or get all your thoughts together at the last minute.\\n\\niyeshia: And then with that status also, your manager needs to have the facts, too, to convince their boss to approve you for a raise. So if your manager is giving you the praises already, they're like, yeah, I did say that like\\n\\niyeshia: as well. Even if they make a joke like saying to you like, Hey, you deserve a raise document that you could go right back to like, you know. April 11th at 5, at 6 58 pm. You said, I deserve a raise this time like it. Just everything will just work for you in your favor for that, so please feel free to do that.\\n\\niyeshia: And so now you did the you did the raise. You had the meeting with your your manager. They're proposing it to the Supervisor, or things of that nature. I know different companies work in different ways, so they might have you go directly to your boss's boss to talk about the raise, or whoever is in charge of that\\n\\niyeshia: common, to negotiate that with them. But every company is different. But if they say yes, that's great job all done. Now, what if you get to a conversation where they say, No, what do you do, then? Well, there are alternatives for that. You can ask to work on, maybe towards a promotion. You know what I'm saying as far as if they say based off your level. We can't go any higher than that\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='133-137', metadata={'start': datetime.timedelta(seconds=1328, microseconds=370000), 'end': datetime.timedelta(seconds=1410, microseconds=720000), 'speakers': frozenset({'iyeshia'})}),\n",
270
+ " Chunk(text=\"iyeshia: common, to negotiate that with them. But every company is different. But if they say yes, that's great job all done. Now, what if you get to a conversation where they say, No, what do you do, then? Well, there are alternatives for that. You can ask to work on, maybe towards a promotion. You know what I'm saying as far as if they say based off your level. We can't go any higher than that\\n\\niyeshia: negotiate for promotion which would include maybe getting a title change, or better money that comes with it. This is why we say research, because you can definitely research what's going on in the market saying, Hey, that's my job. But the title is different.\\n\\niyeshia: Look that up and like definitely propose that if you want to. You can even ask for a faster review cycle. If they say something like, Hey, we can't give that to you. Just yet today. But let's revisit this topic on the 6 months, maybe, like, hey? Can we meet sooner, maybe in 3 months, to discuss more about how I can go about this\\n\\niyeshia: and then you could simply, if they say no. Ask why? Because you don't want to hear anything as far as like knowing that period. No, they should give you an explanation for it. So always ask questions with that to help like what's driving? That? Was it bad timing? Is there a gap? Is there their cap? Is there certain budgets. Did I miss anything that could help? So they can definitely\\n\\niyeshia: share with you and tell you that information of why they might have done. It could be a whole timing thing. It could be a budget thing. But just keep in mind to keep so just to keep in mind you could ask for like. Go around it 3 these ways, let's say 3 different ways. You can go about the answer and no from there. 
With that, said, does anyone have any questions so far?\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='137-141', metadata={'start': datetime.timedelta(seconds=1386, microseconds=520000), 'end': datetime.timedelta(seconds=1487, microseconds=429000), 'speakers': frozenset({'iyeshia'})}),\n",
271
+ " Chunk(text=\"iyeshia: share with you and tell you that information of why they might have done. It could be a whole timing thing. It could be a budget thing. But just keep in mind to keep so just to keep in mind you could ask for like. Go around it 3 these ways, let's say 3 different ways. You can go about the answer and no from there. With that, said, does anyone have any questions so far?\\n\\niyeshia: Nobody. Okay. Devin.\\n\\nCUNY Tech Prep (CTP): Devin does Devon.\\n\\nDevin Xie (no cam): Just curious. So like, say, we\\n\\nDevin Xie (no cam): find some opportunity after we graduate from Cuny Tech fair.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='141-145', metadata={'start': datetime.timedelta(seconds=1467, microseconds=260000), 'end': datetime.timedelta(seconds=1503, microseconds=140000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia', 'Devin Xie (no cam)'})}),\n",
272
+ " Chunk(text=\"Devin Xie (no cam): find some opportunity after we graduate from Cuny Tech fair.\\n\\nDevin Xie (no cam): And then we have questions about this stuff like.\\n\\nDevin Xie (no cam): let's say we work there for like a year. And we\\n\\nDevin Xie (no cam): we stop. We we want to ask for some advice. Can we still hit you guys up.\\n\\niyeshia: Yeah, but you become alumni. You're not just gonna drop you all off in May and be like, bye. No, you can definitely you'll be invited. May like, after the graduation, I want to say in the summertime you'll get an invite to the alumni slack channel and you can join\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='145-149', metadata={'start': datetime.timedelta(seconds=1499, microseconds=630000), 'end': datetime.timedelta(seconds=1531, microseconds=469000), 'speakers': frozenset({'iyeshia', 'Devin Xie (no cam)'})}),\n",
273
+ " Chunk(text=\"iyeshia: Yeah, but you become alumni. You're not just gonna drop you all off in May and be like, bye. No, you can definitely you'll be invited. May like, after the graduation, I want to say in the summertime you'll get an invite to the alumni slack channel and you can join\\n\\niyeshia: that, and I will be gladly to assist you. There. We have a career coach there, but usually all the the staff is on the Ctv team is on the alumni channel. So yeah, definitely. But we also like, I said before, Devin, save the Powerpoint, too.\\n\\niyeshia: Just putting that out there? So yeah, good question.\\n\\niyeshia: Okay?\\n\\niyeshia: And so the next part is after the conversation for the the raise. You want to make sure. The conversation goes well, timing is going to be a part of that. So clarifying the process, asking them like, you know, when should I expect the raise? You know that's not being thirsty. That's that's your money. You can ask questions about it. And what's the next step for that?\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='149-153', metadata={'start': datetime.timedelta(seconds=1513, microseconds=30000), 'end': datetime.timedelta(seconds=1577, microseconds=890000), 'speakers': frozenset({'iyeshia'})}),\n",
274
+ " Chunk(text=\"iyeshia: And so the next part is after the conversation for the the raise. You want to make sure. The conversation goes well, timing is going to be a part of that. So clarifying the process, asking them like, you know, when should I expect the raise? You know that's not being thirsty. That's that's your money. You can ask questions about it. And what's the next step for that?\\n\\niyeshia: You can always confirm with your manager? Like. If the reason they said no, was it because there's certain maybe I would say physical years of like, how they what deadline they have for the New Year or the new budget. Time or deadline, was it? Did I miss it when I asked for a salary? Or when's the next time I should ask for a salary. Increase, and things like that. Cause your your department, or you would hope the team that you're on will show you throughout the year of like what's coming up and what you can expect.\\n\\niyeshia: So you definitely want to plan ahead next time. If they say no, and then review the work and the feedback asking for feedback. Was it my, the way that I would propose the raise? Is there anything I could do to get? You know better on that? That would help with the mentor, of course.\\n\\niyeshia: Cause the person you're proposing it to might not give the input. But definitely, a mentor is gonna help you with that as well to see what's going on. You could definitely check in with your manager. If they had any feedback they might tell your manager to like, let them know like this is why they might have said No or this? Why, they might have said, Not yet, or they'll say yes later. So keep that in mind.\\n\\niyeshia: and then let's see right\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='153-157', metadata={'start': datetime.timedelta(seconds=1553, microseconds=290000), 'end': datetime.timedelta(seconds=1648, microseconds=679000), 'speakers': frozenset({'iyeshia'})}),\n",
275
+ " Chunk(text=\"iyeshia: and then let's see right\\n\\niyeshia: from there we'll go to the activity.\\n\\niyeshia: And so from there, this is an activity of asking for feedback.\\n\\niyeshia: And we're gonna do a scenario of you want to ask for feedback from your manager.\\n\\niyeshia: and you previously had passed up for raise and want to learn more about how you can ensure success earning one in the next review cycle.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='157-161', metadata={'start': datetime.timedelta(seconds=1641, microseconds=970000), 'end': datetime.timedelta(seconds=1673, microseconds=539000), 'speakers': frozenset({'iyeshia'})}),\n",
276
+ " Chunk(text=\"iyeshia: and you previously had passed up for raise and want to learn more about how you can ensure success earning one in the next review cycle.\\n\\niyeshia: So this part is, how would you start that conversation in your weekly check in?\\n\\niyeshia: So since we're virtual, we're gonna have, I'm gonna give you about 30 seconds to come up with your own answer, and then type it in the chat.\\n\\niyeshia: So review the scenario now and then we'll start in 30 seconds.\\n\\niyeshia: So\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='161-165', metadata={'start': datetime.timedelta(seconds=1665, microseconds=550000), 'end': datetime.timedelta(seconds=1692, microseconds=620000), 'speakers': frozenset({'iyeshia'})}),\n",
277
+ " Chunk(text='iyeshia: So\\n\\niyeshia: we set the timer for 30.\\n\\niyeshia: Okay?\\n\\niyeshia: Goes now\\n\\niyeshia: 10 seconds.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='165-169', metadata={'start': datetime.timedelta(seconds=1691, microseconds=890000), 'end': datetime.timedelta(seconds=1727, microseconds=70000), 'speakers': frozenset({'iyeshia'})}),\n",
278
+ " Chunk(text='iyeshia: 10 seconds.\\n\\niyeshia: Okay, time is up.\\n\\niyeshia: Okay, nice.\\n\\niyeshia: And look for a raise on to guarantee a raise in this performance. Review. Awesome. Thank you. Ty\\n\\niyeshia: and Mckenzie. Thank you.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='169-173', metadata={'start': datetime.timedelta(seconds=1725, microseconds=970000), 'end': datetime.timedelta(seconds=1767, microseconds=160000), 'speakers': frozenset({'iyeshia'})}),\n",
279
+ " Chunk(text='iyeshia: and Mckenzie. Thank you.\\n\\niyeshia: 13.\\n\\niyeshia: Some feedback to see what I can build. Awesome.\\n\\niyeshia: Hey, boys!\\n\\niyeshia: Oh, my God this time to reach out a bit. Okay, okay for me.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='173-177', metadata={'start': datetime.timedelta(seconds=1765, microseconds=20000), 'end': datetime.timedelta(seconds=1785, microseconds=509000), 'speakers': frozenset({'iyeshia'})}),\n",
280
+ " Chunk(text='iyeshia: Oh, my God this time to reach out a bit. Okay, okay for me.\\n\\niyeshia: No.\\n\\niyeshia: Okay.\\n\\niyeshia: Any improvement that you see that I cannot. Okay, thank you.\\n\\niyeshia: Let me check in with you.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='177-181', metadata={'start': datetime.timedelta(seconds=1780, microseconds=400000), 'end': datetime.timedelta(seconds=1810, microseconds=859000), 'speakers': frozenset({'iyeshia'})}),\n",
281
+ " Chunk(text=\"iyeshia: Let me check in with you.\\n\\niyeshia: There we go.\\n\\niyeshia: Okay, perfect.\\n\\niyeshia: So what I can make for the next recycle. Awesome. Thank you all for sharing so far, I'm gonna move on to the the next part. I think I kind of skipped\\n\\niyeshia: ahead.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='181-185', metadata={'start': datetime.timedelta(seconds=1807, microseconds=139000), 'end': datetime.timedelta(seconds=1830, microseconds=670000), 'speakers': frozenset({'iyeshia'})}),\n",
282
+ " Chunk(text=\"iyeshia: ahead.\\n\\niyeshia: Okay.\\n\\niyeshia: so right now, we have a role play example between a manager and you. Let's say you would.\\n\\niyeshia: it could be data science. Related. Right? So from here, I'm going to\\n\\niyeshia: probably volunteer, because I'm not sure if people will volunteer to be the manager and someone be you\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='185-189', metadata={'start': datetime.timedelta(seconds=1829, microseconds=480000), 'end': datetime.timedelta(seconds=1857, microseconds=657000), 'speakers': frozenset({'iyeshia'})}),\n",
283
+ " Chunk(text=\"iyeshia: probably volunteer, because I'm not sure if people will volunteer to be the manager and someone be you\\n\\niyeshia: So let me see who I can get.\\n\\niyeshia: Okay, I'll go with David for manager, and I'll go for\\n\\niyeshia: Let's try, Kevin for you.\\n\\niyeshia: If you have to read this role, play example.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='189-193', metadata={'start': datetime.timedelta(seconds=1850, microseconds=520000), 'end': datetime.timedelta(seconds=1877, microseconds=689000), 'speakers': frozenset({'iyeshia'})}),\n",
284
+ " Chunk(text='iyeshia: If you have to read this role, play example.\\n\\nDavid Rodriguez: Should I start now?\\n\\nCUNY Tech Prep (CTP): Kevin, you there?\\n\\nCUNY Tech Prep (CTP): Kevin? Chen.\\n\\nKevin Zheng: Right, right.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='193-197', metadata={'start': datetime.timedelta(seconds=1874, microseconds=660000), 'end': datetime.timedelta(seconds=1892, microseconds=270000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'David Rodriguez', 'iyeshia', 'Kevin Zheng'})}),\n",
285
+ " Chunk(text=\"Kevin Zheng: Right, right.\\n\\nCUNY Tech Prep (CTP): Alright!\\n\\nDavid Rodriguez: Great I'll start.\\n\\nDavid Rodriguez: Is there anything else you'd like to talk about?\\n\\nKevin Zheng: Yes, as you know, I've been taking on additional responsibilities since we used the team, and I'd like to speak to you about my conversation package.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='197-201', metadata={'start': datetime.timedelta(seconds=1891, microseconds=450000), 'end': datetime.timedelta(seconds=1910, microseconds=499000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'David Rodriguez', 'Kevin Zheng'})}),\n",
286
+ " Chunk(text=\"Kevin Zheng: Yes, as you know, I've been taking on additional responsibilities since we used the team, and I'd like to speak to you about my conversation package.\\n\\nDavid Rodriguez: We really appreciate your hard work.\\n\\nDavid Rodriguez: but it's still a tough economy, and we're not really in a position to give you anything more than a 2% raise. We can talk about a raise at your next review in about 6 months.\\n\\nKevin Zheng: I do understand that the economy has made things difficult. Can we set a time to discuss my compensation again before my next schedule Review.\\n\\nKevin Zheng: I appreciate an opportunity to talk in more detail on the additional work I've taken on, and its impact.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='201-205', metadata={'start': datetime.timedelta(seconds=1901, microseconds=690000), 'end': datetime.timedelta(seconds=1938, microseconds=959000), 'speakers': frozenset({'David Rodriguez', 'Kevin Zheng'})}),\n",
287
+ " Chunk(text=\"Kevin Zheng: I appreciate an opportunity to talk in more detail on the additional work I've taken on, and its impact.\\n\\nDavid Rodriguez: Sure that makes sense.\\n\\nDavid Rodriguez: I want to make sure you heard how about a month.\\n\\nKevin Zheng: Great. Thank you. I'll find some time on your calendar for us to meet.\\n\\niyeshia: Thank you. So with that, said, I. Just want to open up the the floor. To everyone. What did you notice?\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='205-209', metadata={'start': datetime.timedelta(seconds=1933, microseconds=720000), 'end': datetime.timedelta(seconds=1967, microseconds=303000), 'speakers': frozenset({'David Rodriguez', 'iyeshia', 'Kevin Zheng'})}),\n",
288
+ " Chunk(text=\"iyeshia: Thank you. So with that, said, I. Just want to open up the the floor. To everyone. What did you notice?\\n\\niyeshia: that during the the role play. That the let's say the data scientists who was played by Kevin,\\n\\niyeshia: did as far as like, maybe something different from your responses that you put in the chat. Did y'all notice anything differently?\\n\\niyeshia: Hey, Devin?\\n\\nDevin Xie (no cam): I don't know if I'm correct. But I think\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='209-213', metadata={'start': datetime.timedelta(seconds=1957, microseconds=300000), 'end': datetime.timedelta(seconds=2005, microseconds=496000), 'speakers': frozenset({'iyeshia', 'Devin Xie (no cam)'})}),\n",
289
+ " Chunk(text=\"Devin Xie (no cam): I don't know if I'm correct. But I think\\n\\nDevin Xie (no cam): the data scientists or us in this situation, we try to like Scheduler, a review like\\n\\nDevin Xie (no cam): in a later time.\\n\\niyeshia: absolutely. Thank you. He took initiative and be like, you know, hey, let me, let me get on your calendar for next time, instead of just like waiting around, you know, people be like, Oh, I'll get back to you and things like that. He's like, no, we can. We can discuss later, like, what's your schedule like? So that\\n\\niyeshia: that forwardness of just, you know, following up and seeing it through is definitely helpful.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='213-217', metadata={'start': datetime.timedelta(seconds=2002, microseconds=950000), 'end': datetime.timedelta(seconds=2041, microseconds=590000), 'speakers': frozenset({'iyeshia', 'Devin Xie (no cam)'})}),\n",
290
+ " Chunk(text=\"iyeshia: that forwardness of just, you know, following up and seeing it through is definitely helpful.\\n\\niyeshia: So and so, for now I would say this would take about maybe\\n\\niyeshia: so final reflection. We could talk about this for like maybe 3\\xa0min, or anybody could just like popcorn it out unless I just call on them. But for today's learning from the workshop what are some things you can generally expect when you 1st join a company? What is a manager's role in your success? And how do you find out your measures of success? Does anyone want to\\n\\niyeshia: volunteer and answer any of the any of the 3 questions that are of their choice\\n\\niyeshia: before I call on someone.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='217-221', metadata={'start': datetime.timedelta(seconds=2035, microseconds=850000), 'end': datetime.timedelta(seconds=2087, microseconds=550000), 'speakers': frozenset({'iyeshia'})}),\n",
291
+ " Chunk(text=\"iyeshia: before I call on someone.\\n\\niyeshia: Okay, anybody but Devin.\\n\\niyeshia: See, I'm gonna go with anthony.\\n\\nAnthony Jerez: Yes, I'm here.\\n\\niyeshia: Which question would you like to answer? You had to reflect.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='221-225', metadata={'start': datetime.timedelta(seconds=2086, microseconds=20000), 'end': datetime.timedelta(seconds=2122, microseconds=210000), 'speakers': frozenset({'Anthony Jerez', 'iyeshia'})}),\n",
292
+ " Chunk(text=\"iyeshia: Which question would you like to answer? You had to reflect.\\n\\nAnthony Jerez: On, I would say the 1st one.\\n\\niyeshia: Okay, go for it.\\n\\nAnthony Jerez: So some major things that I would expect would be we're going through like sessions like orientation, and like onboarding\\n\\nAnthony Jerez: also knowledge about like some some resources resources that we would have access to at any point.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='225-229', metadata={'start': datetime.timedelta(seconds=2119, microseconds=390000), 'end': datetime.timedelta(seconds=2147, microseconds=390000), 'speakers': frozenset({'Anthony Jerez', 'iyeshia'})}),\n",
293
+ " Chunk(text=\"Anthony Jerez: also knowledge about like some some resources resources that we would have access to at any point.\\n\\nAnthony Jerez: And yeah, stuff like that. I would say.\\n\\niyeshia: Thank you, Anthony, for sharing.\\n\\niyeshia: and then let me see, trying to see who's not making eye contact. Oh, oh, not everybody looks okay. So let's go with\\n\\niyeshia: Ibrahim.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='229-233', metadata={'start': datetime.timedelta(seconds=2139, microseconds=43000), 'end': datetime.timedelta(seconds=2167, microseconds=810000), 'speakers': frozenset({'Anthony Jerez', 'iyeshia'})}),\n",
294
+ " Chunk(text=\"iyeshia: Ibrahim.\\n\\nIbrahim Faruquee: Yeah, I'll answer question, too.\\n\\nIbrahim Faruquee: So your manager's role is mainly like for the company to manage like people and make sure that the right persons for the right job, but they can be like a mentor figure for you. So like, if there can be like good mentors who like help you throughout the process and help you with a raise, or they could also like, be difficult and make that like harder for you. But they're kind of. It's not like there's nothing to be, I guess, expected from a manager. It's just like\\n\\nIbrahim Faruquee: what they like. What do you, I guess. What do you end up with.\\n\\nIbrahim Faruquee: or what do you make the most of.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='233-237', metadata={'start': datetime.timedelta(seconds=2166, microseconds=780000), 'end': datetime.timedelta(seconds=2208, microseconds=880000), 'speakers': frozenset({'iyeshia', 'Ibrahim Faruquee'})}),\n",
295
+ " Chunk(text=\"Ibrahim Faruquee: or what do you make the most of.\\n\\niyeshia: Awesome. Thank you.\\n\\niyeshia: And then for the 3rd question.\\n\\niyeshia: and we're gonna go for Isabel.\\n\\nIsabel Loçi: Hello!\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='237-241', metadata={'start': datetime.timedelta(seconds=2207, microseconds=390000), 'end': datetime.timedelta(seconds=2223, microseconds=750000), 'speakers': frozenset({'Isabel Loçi', 'iyeshia', 'Ibrahim Faruquee'})}),\n",
296
+ " Chunk(text=\"Isabel Loçi: Hello!\\n\\niyeshia: Hello!\\n\\nIsabel Loçi: Sorry. My Internet's horrible, and might I might disconnect?\\n\\nIsabel Loçi: I'll see if I can answer the 3rd one. How do you find your measures of success.\\n\\nIsabel Loçi: I would say, ask for feedback from other people elsewhere, from other colleagues, from your manager.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='241-245', metadata={'start': datetime.timedelta(seconds=2222, microseconds=900000), 'end': datetime.timedelta(seconds=2245, microseconds=189000), 'speakers': frozenset({'Isabel Loçi', 'iyeshia'})}),\n",
297
+ " Chunk(text=\"Isabel Loçi: I would say, ask for feedback from other people elsewhere, from other colleagues, from your manager.\\n\\nIsabel Loçi: That way you get a better understanding of where you are right now. And also I would say to also look back on the goals that you've set for yourself, and see if you've reached those goals as well, and that would be a good measure of success.\\n\\niyeshia: Okay, very good. All right.\\n\\niyeshia: So yeah, definitely helped make my life easier with this presentation. So thank you. I'm glad things are sticking and so with that said, We will go and launch Kahoo. But before I do that I definitely want to say just be mindful of these things.\\n\\niyeshia: When you are starting in your 1st year, in your career. As it was stated in one of the slides, you don't have to have it all figured out is the perfect time to ask questions. You're gonna make mistakes, or you're not. But if you do, it's okay. Because it's all gonna be a learning process. For your 1st year, and your managers expect that.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='245-249', metadata={'start': datetime.timedelta(seconds=2238, microseconds=660000), 'end': datetime.timedelta(seconds=2306, microseconds=319000), 'speakers': frozenset({'Isabel Loçi', 'iyeshia'})}),\n",
298
+ " Chunk(text=\"iyeshia: When you are starting in your 1st year, in your career. As it was stated in one of the slides, you don't have to have it all figured out is the perfect time to ask questions. You're gonna make mistakes, or you're not. But if you do, it's okay. Because it's all gonna be a learning process. For your 1st year, and your managers expect that.\\n\\niyeshia: So just keep that in mind.\\n\\niyeshia: And then, if you are going to seek, you know, support, I think. It was great that it's a bell, stated asking for feedback from your manager, but you could also ask for feedback from your teammates, too. Cause they, if you work with them closely. If you have a team to see, like what your areas of strengths are your areas of growth.\\n\\niyeshia: and things that you're learning. That could be helpful. Towards that process if you're going up for a raise. But sometimes people could see our strengths stronger or clearer, or even faster than we can, and we don't even realize it.\\n\\niyeshia: And then even asking your mentors, too, as well, can be helpful. And then.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='249-253', metadata={'start': datetime.timedelta(seconds=2282, microseconds=771000), 'end': datetime.timedelta(seconds=2345, microseconds=799000), 'speakers': frozenset({'iyeshia'})}),\n",
299
+ " Chunk(text=\"iyeshia: And then even asking your mentors, too, as well, can be helpful. And then.\\n\\niyeshia: if you are going to negotiate, remember to keep for raise, to keep that documented focus on your skills. Make sure you do your research on the market and definitely, just try to figure out if you can negotiate other things.\\n\\niyeshia: And when it comes to relationships, at work, you wanna make sure to treat everybody equally so I hope that that helps. If you didn't get anything else. I hope that's what helps you with them\\n\\niyeshia: with your 1st year? As you enter into your careers. And so with that said, we'll go into Kahoot.\\n\\niyeshia: and so I'm going to launch it now.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='253-257', metadata={'start': datetime.timedelta(seconds=2341, microseconds=80000), 'end': datetime.timedelta(seconds=2390, microseconds=330000), 'speakers': frozenset({'iyeshia'})}),\n",
300
+ " Chunk(text=\"iyeshia: and so I'm going to launch it now.\\n\\niyeshia: Let's get it started.\\n\\niyeshia: I don't think my headphones died so\\n\\niyeshia: got 33 people on here, and only 16.\\n\\niyeshia: Okay.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='257-261', metadata={'start': datetime.timedelta(seconds=2387, microseconds=420000), 'end': datetime.timedelta(seconds=2445, microseconds=90000), 'speakers': frozenset({'iyeshia'})}),\n",
301
+ " Chunk(text=\"iyeshia: Okay.\\n\\niyeshia: sound. Good.\\n\\niyeshia: 33.\\n\\niyeshia: Well, I didn't cut myself. That's Kevin. You're playing too.\\n\\niyeshia: Figure out how to be successful on my own.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='261-265', metadata={'start': datetime.timedelta(seconds=2444, microseconds=230000), 'end': datetime.timedelta(seconds=2550, microseconds=965000), 'speakers': frozenset({'iyeshia'})}),\n",
302
+ " Chunk(text=\"iyeshia: Figure out how to be successful on my own.\\n\\niyeshia: Oh, you do not have to figure that out.\\n\\niyeshia: That's why we tell you, have mentors, extra peers and things of that nature.\\n\\niyeshia: Well, yeah, shout out to the 22. It's okay. One. I'll take the 22 others, you know. Wow!\\n\\niyeshia: Your boss. My goodness, okay, is in the lead.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='265-269', metadata={'start': datetime.timedelta(seconds=2547, microseconds=780000), 'end': datetime.timedelta(seconds=2583, microseconds=779000), 'speakers': frozenset({'iyeshia'})}),\n",
303
+ " Chunk(text=\"iyeshia: Your boss. My goodness, okay, is in the lead.\\n\\niyeshia: So let's go ahead\\n\\niyeshia: who should not go to\\n\\niyeshia: thank you definitely. The worst thing you could do is talk to no one. If you need support with something.\\n\\niyeshia: So I hope.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='269-273', metadata={'start': datetime.timedelta(seconds=2578, microseconds=507000), 'end': datetime.timedelta(seconds=2624, microseconds=130000), 'speakers': frozenset({'iyeshia'})}),\n",
304
+ " Chunk(text=\"iyeshia: So I hope.\\n\\nCUNY Tech Prep (CTP): I am shocked.\\n\\niyeshia: That one should you not go to? So yeah.\\n\\niyeshia: let's see. Okay, Jamie is in the name.\\n\\niyeshia: Okay, let's go.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='273-277', metadata={'start': datetime.timedelta(seconds=2622, microseconds=675000), 'end': datetime.timedelta(seconds=2641, microseconds=959000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
305
+ " Chunk(text=\"iyeshia: Okay, let's go.\\n\\niyeshia: 3rd question, what are not considerations to mention when providing reasons for a salary increase.\\n\\niyeshia: There aren't enough.\\n\\niyeshia: Okay? 18. Yes, the cost of living. That is correct. You should not consider that\\n\\niyeshia: They don't, they don't. They don't care so definitely the other ones. You could do that on your own when you're doing your negotiating your your budget. But don't come out and say, like, Hey, the cost of living in this city? They're like\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='277-281', metadata={'start': datetime.timedelta(seconds=2640, microseconds=140000), 'end': datetime.timedelta(seconds=2695, microseconds=309000), 'speakers': frozenset({'iyeshia'})}),\n",
306
+ " Chunk(text=\"iyeshia: They don't, they don't. They don't care so definitely the other ones. You could do that on your own when you're doing your negotiating your your budget. But don't come out and say, like, Hey, the cost of living in this city? They're like\\n\\niyeshia: or virtual.\\n\\niyeshia: our office in California, we have no idea. So yeah, just just keep that in mind. So good job to the the cost of living folks.\\n\\niyeshia: Okay, David Rv is in the lead.\\n\\niyeshia: Okay, let's go to the next question.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='281-285', metadata={'start': datetime.timedelta(seconds=2680, microseconds=250000), 'end': datetime.timedelta(seconds=2715, microseconds=419000), 'speakers': frozenset({'iyeshia'})}),\n",
307
+ " Chunk(text=\"iyeshia: Okay, let's go to the next question.\\n\\niyeshia: what is a thoughtful way to actually negotiate?\\n\\niyeshia: So we can negotiate? Very good. It's a thoughtful way to act\\n\\niyeshia: and I think most of y'all got that in the chat. I saw some other answers. I'm gonna leave that questionable. But for the ones who did shout out to y'all.\\n\\niyeshia: So I think this is the last question.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='285-289', metadata={'start': datetime.timedelta(seconds=2712, microseconds=460000), 'end': datetime.timedelta(seconds=2758, microseconds=389000), 'speakers': frozenset({'iyeshia'})}),\n",
308
+ " Chunk(text=\"iyeshia: So I think this is the last question.\\n\\niyeshia: But Kyle is in the lead now, and so shouts to Kyle. So here goes the last question.\\n\\niyeshia: The most important relationship at work is with my manager.\\n\\niyeshia: Shout out to the people who said, False I said, it is important, but not the most important. Yeah, there's team this\\n\\niyeshia: Ceos, what about yourself? You know, things like that? So I just want to keep that in mind. So\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='289-293', metadata={'start': datetime.timedelta(seconds=2755, microseconds=680000), 'end': datetime.timedelta(seconds=2795, microseconds=579000), 'speakers': frozenset({'iyeshia'})}),\n",
309
+ " Chunk(text=\"iyeshia: Ceos, what about yourself? You know, things like that? So I just want to keep that in mind. So\\n\\niyeshia: yeah, let's always about that. So let's go to the windows.\\n\\niyeshia: Okay, let's okay.\\n\\niyeshia: Number one.\\n\\niyeshia: Okay, at the bottom.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='293-297', metadata={'start': datetime.timedelta(seconds=2788, microseconds=670000), 'end': datetime.timedelta(seconds=2827, microseconds=966000), 'speakers': frozenset({'iyeshia'})}),\n",
310
+ " Chunk(text=\"iyeshia: Okay, at the bottom.\\n\\niyeshia: Okay, with that, said\\n\\niyeshia: the last thing I will do. These are some follow up questions that you can ask your career coach. If I'm your career coach, you could definitely ask me that.\\n\\niyeshia: But how much of a raise. Can you ask for? When do you? Should you start a retirement fund? I would say, Asap, how long should you take to figure out if your company is a good fit, and how do you approach a conflict with a manager or coworker? So if you have any questions about those, please feel free to reach out to me or your career coach, if you would like to discuss further details, and I do want to be mindful of time.\\n\\niyeshia: And so I want to thank you for your time, and just want to let you know. This is the feedback form that really helps me with this presentation\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='297-301', metadata={'start': datetime.timedelta(seconds=2822, microseconds=600000), 'end': datetime.timedelta(seconds=2879, microseconds=310000), 'speakers': frozenset({'iyeshia'})}),\n",
311
+ " Chunk(text=\"iyeshia: And so I want to thank you for your time, and just want to let you know. This is the feedback form that really helps me with this presentation\\n\\niyeshia: and help me to deliver it better or worse. So if I did a good job, that's great. But I'm going to put this in the chat.\\n\\niyeshia: So you could fill that out now and then. Also want to invite you all to Rsvp. For Ctp's graduation.\\n\\niyeshia: So I would say, you can do that right now as well\\n\\niyeshia: and please register as a student. For those who can attend. You're more than welcome for the I believe the May 20th ones. If you cannot attend because you have a final, you have an internship. It is okay. There's no pressure. We're not going to be like, Hey, you can't you got to make it? No, we totally get it, I mean, we understand. So blessings on your finals\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='301-305', metadata={'start': datetime.timedelta(seconds=2870, microseconds=460000), 'end': datetime.timedelta(seconds=2919, microseconds=640000), 'speakers': frozenset({'iyeshia'})}),\n",
312
+ " Chunk(text=\"iyeshia: and please register as a student. For those who can attend. You're more than welcome for the I believe the May 20th ones. If you cannot attend because you have a final, you have an internship. It is okay. There's no pressure. We're not going to be like, Hey, you can't you got to make it? No, we totally get it, I mean, we understand. So blessings on your finals\\n\\niyeshia: and your projects. But for those who can't attend come through. It's going to be great to see your projects to see each other one last time, like Demo Night. And it's gonna be it's going to be a great time as we close out the the cohort in in May. So, and also to Devin's question, just one more time. We won't leave you hanging you will get an invite to be alumni\\n\\niyeshia: for Ctp, and that way you'll be with everybody who did the cohorts before your cohorts, one through 9 and so it'll be one through 10 now. And so that'll be like over a thousand people in that slack channel. So you can definitely network with your peers and the people who came before you. So yeah, just keep that in mind.\\n\\niyeshia: So thank you all. And I will stop sharing.\\n\\niyeshia: And yeah, please. Rsvp for the graduation. And please fill out that feedback form. It is greatly appreciative. I want to thank you for your time lessons on your projects. And yeah, if any of my fellows have any questions about the presentation, you can highlight me on slack. I am there to support you, and other than that. I want to thank you. And, Kevin, I think it's all yours now.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='305-309', metadata={'start': datetime.timedelta(seconds=2901, microseconds=130000), 'end': datetime.timedelta(seconds=2988, microseconds=469000), 'speakers': frozenset({'iyeshia'})}),\n",
313
+ " Chunk(text=\"iyeshia: And yeah, please. Rsvp for the graduation. And please fill out that feedback form. It is greatly appreciative. I want to thank you for your time lessons on your projects. And yeah, if any of my fellows have any questions about the presentation, you can highlight me on slack. I am there to support you, and other than that. I want to thank you. And, Kevin, I think it's all yours now.\\n\\nCUNY Tech Prep (CTP): Definitely. Thank you, Aisha, for the valuable tips. I think. A lot of students, a lot of the students I've spoken to, at least are.\\n\\nCUNY Tech Prep (CTP): have got recently gotten jobs or are very close to getting them, and\\n\\nCUNY Tech Prep (CTP): they will find this material very useful. I'm actually kind of glad I remember to click record at the beginning, because some of them are like in traffic right now.\\n\\niyeshia: Got it. Okay.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='309-313', metadata={'start': datetime.timedelta(seconds=2964, microseconds=60000), 'end': datetime.timedelta(seconds=3011, microseconds=947000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
314
+ " Chunk(text=\"iyeshia: Got it. Okay.\\n\\niyeshia: I'm glad.\\n\\nCUNY Tech Prep (CTP): Okay, thank you. So I'm gonna give you all 10\\xa0min to fill this out. Since you got 2 things to fill out. One is the inviting yourself to the graduation, and then 2 is the survey.\\n\\nCUNY Tech Prep (CTP): Alright, so we will come back at 7, 35.\\n\\nCUNY Tech Prep (CTP): Oh, yes, there's good news for those of you who missed it.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='313-317', metadata={'start': datetime.timedelta(seconds=3010, microseconds=980000), 'end': datetime.timedelta(seconds=3063, microseconds=720000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
315
+ " Chunk(text=\"CUNY Tech Prep (CTP): Oh, yes, there's good news for those of you who missed it.\\n\\nCUNY Tech Prep (CTP): There's no homework for the next 2 weeks, and there's spring break. So which means.\\n\\nCUNY Tech Prep (CTP): after this class, I'll be seeing you the second Friday from now.\\n\\nCUNY Tech Prep (CTP): Not next Friday.\\n\\nCUNY Tech Prep (CTP): No, a break is not exactly a break, so you have projects.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='317-321', metadata={'start': datetime.timedelta(seconds=3060, microseconds=740000), 'end': datetime.timedelta(seconds=3115, microseconds=180000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
316
+ " Chunk(text='CUNY Tech Prep (CTP): No, a break is not exactly a break, so you have projects.\\n\\nCUNY Tech Prep (CTP): This is time to do your projects.\\n\\nCUNY Tech Prep (CTP): Alright, so just as a gift to all the people who are in class.\\n\\nCUNY Tech Prep (CTP): If you check the homework sheet.\\n\\nCUNY Tech Prep (CTP): there is actually a column where you can grade yourselves. You can give yourself any emoji you want.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='321-325', metadata={'start': datetime.timedelta(seconds=3110, microseconds=350000), 'end': datetime.timedelta(seconds=3275, microseconds=10000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
317
+ " Chunk(text=\"CUNY Tech Prep (CTP): there is actually a column where you can grade yourselves. You can give yourself any emoji you want.\\n\\nCUNY Tech Prep (CTP): I'll let you figure out which one that is\\n\\nCUNY Tech Prep (CTP): alright. We're back.\\n\\nCUNY Tech Prep (CTP): So go for the rest of this day. So we're gonna I'm gonna put you in breakout rooms\\n\\nCUNY Tech Prep (CTP): for your projects.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='325-329', metadata={'start': datetime.timedelta(seconds=3269, microseconds=390000), 'end': datetime.timedelta(seconds=3591, microseconds=359000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
318
+ " Chunk(text='CUNY Tech Prep (CTP): for your projects.\\n\\nCUNY Tech Prep (CTP): And what I want you to do is I need to think about the state of the project. You, the the state the project is in.\\n\\nCUNY Tech Prep (CTP): I will be coming around to check in\\n\\nCUNY Tech Prep (CTP): because you have 2 weeks and no homework.\\n\\nCUNY Tech Prep (CTP): I want you to put your all into the project. So', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='329-333', metadata={'start': datetime.timedelta(seconds=3589, microseconds=600000), 'end': datetime.timedelta(seconds=3613, microseconds=269000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
319
+ " Chunk(text='CUNY Tech Prep (CTP): I want you to put your all into the project. So\\n\\nCUNY Tech Prep (CTP): let me make the breakout rooms first.st\\n\\nCUNY Tech Prep (CTP): Basically, what I want you to do is plan out the next 2 weeks. Okay, what do you want? What? What is missing from\\n\\nCUNY Tech Prep (CTP): your project that you need to complete it?\\n\\nCUNY Tech Prep (CTP): And how are you going to get there in the next 2 weeks?', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='333-337', metadata={'start': datetime.timedelta(seconds=3609, microseconds=440000), 'end': datetime.timedelta(seconds=3646, microseconds=619000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
320
+ " Chunk(text=\"CUNY Tech Prep (CTP): And how are you going to get there in the next 2 weeks?\\n\\nCUNY Tech Prep (CTP): Because after the next 2 weeks you literally have only 2 weeks left.\\n\\nCUNY Tech Prep (CTP): There's class. There's week 11, and then there's week 12\\n\\nCUNY Tech Prep (CTP): week. 13 is like May May 10th or May 9, th\\n\\nCUNY Tech Prep (CTP): and then the week after that, I believe, is\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='337-341', metadata={'start': datetime.timedelta(seconds=3643, microseconds=720000), 'end': datetime.timedelta(seconds=3672, microseconds=696000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
321
+ " Chunk(text=\"CUNY Tech Prep (CTP): and then the week after that, I believe, is\\n\\nCUNY Tech Prep (CTP): when you're going to do Demos.\\n\\nCUNY Tech Prep (CTP): I could be wrong.\\n\\nCUNY Tech Prep (CTP): Alright. You can pick the rooms. Now go into your rooms.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='341-344', metadata={'start': datetime.timedelta(seconds=3670, microseconds=320000), 'end': datetime.timedelta(seconds=3682, microseconds=370000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}))"
322
+ ]
323
+ },
324
+ "execution_count": 13,
325
+ "metadata": {},
326
+ "output_type": "execute_result"
327
+ }
328
+ ],
329
+ "source": [
330
+ "web_vtt_content.get_chunks()"
331
+ ]
332
+ }
333
+ ],
334
+ "metadata": {
335
+ "kernelspec": {
336
+ "display_name": ".venv",
337
+ "language": "python",
338
+ "name": "python3"
339
+ },
340
+ "language_info": {
341
+ "codemirror_mode": {
342
+ "name": "ipython",
343
+ "version": 3
344
+ },
345
+ "file_extension": ".py",
346
+ "mimetype": "text/x-python",
347
+ "name": "python",
348
+ "nbconvert_exporter": "python",
349
+ "pygments_lexer": "ipython3",
350
+ "version": "3.12.3"
351
+ }
352
+ },
353
+ "nbformat": 4,
354
+ "nbformat_minor": 2
355
+ }
pyproject.toml CHANGED
@@ -7,7 +7,7 @@ name = "ctp-slack-bot"
7
  version = "0.1.0"
8
  description = "A Slack bot for processing and analyzing Zoom transcripts using AI"
9
  readme = "README.md"
10
- requires-python = ">=3.11.9"
11
  license = {text = "MIT"}
12
  authors = [
13
  {name = "Your Name", email = "[email protected]"}
@@ -19,26 +19,27 @@ classifiers = [
19
  "Operating System :: OS Independent",
20
  ]
21
  dependencies = [
22
- "dependency-injector>=4.46.0",
23
  "pydantic>=2.11.2",
24
  "pydantic-settings>=2.8.1",
25
- "fastapi>=0.115.12",
26
- "uvicorn>=0.34.0",
27
- "loguru>=0.7.3",
28
  "python-dotenv>=1.1.0",
29
- "httpx>=0.28.1",
30
- "tenacity>=9.1.2",
31
- "pybreaker>=1.3.0",
32
  "pytz>=2025.2",
33
  "apscheduler>=3.11.0",
 
 
 
 
34
  "slack-sdk>=3.35.0",
 
35
  "pymongo>=4.11.3 ",
36
- "numpy>=2.2.4",
37
- "webvtt-py>=0.5.1",
38
  "openai>=1.70.0",
39
- # "langchain>=0.3.23",
40
- # "transformers>=4.51.0",
41
- # "torch>=2.6.0",
42
  ]
43
 
44
  [project.optional-dependencies]
@@ -49,7 +50,7 @@ dev = [
49
  "types-pytz>=2025.2",
50
  "black>=25.1.0",
51
  "isort>=6.0.1",
52
- "ruff>=0.11.4",
53
  ]
54
 
55
  [project.urls]
 
7
  version = "0.1.0"
8
  description = "A Slack bot for processing and analyzing Zoom transcripts using AI"
9
  readme = "README.md"
10
+ requires-python = ">=3.12"
11
  license = {text = "MIT"}
12
  authors = [
13
  {name = "Your Name", email = "[email protected]"}
 
19
  "Operating System :: OS Independent",
20
  ]
21
  dependencies = [
 
22
  "pydantic>=2.11.2",
23
  "pydantic-settings>=2.8.1",
24
+ "cachetools>=5.5.2",
25
+ "more-itertools>=10.6.0",
 
26
  "python-dotenv>=1.1.0",
27
+ "loguru>=0.7.3",
28
+ "dependency-injector>=4.46.0",
 
29
  "pytz>=2025.2",
30
  "apscheduler>=3.11.0",
31
+ # "tenacity>=9.1.2",
32
+ # "pybreaker>=1.3.0",
33
+ "aiohttp>=3.11.16",
34
+ "webvtt-py>=0.5.1",
35
  "slack-sdk>=3.35.0",
36
+ "slack_bolt>=1.23.0",
37
  "pymongo>=4.11.3 ",
38
+ "motor>=3.7.0",
 
39
  "openai>=1.70.0",
40
+ "google-api-python-client>=2.167.0",
41
+ "google-auth>=2.39.0",
42
+ "google-auth-oauthlib>=1.2.1"
43
  ]
44
 
45
  [project.optional-dependencies]
 
50
  "types-pytz>=2025.2",
51
  "black>=25.1.0",
52
  "isort>=6.0.1",
53
+ "ruff>=0.11.4"
54
  ]
55
 
56
  [project.urls]
scripts/run-dev.sh CHANGED
@@ -2,4 +2,4 @@
2
 
3
  parent_path=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P)
4
 
5
- python3 "${parent_path}/../src/ctp_slack_bot/api/main.py"
 
2
 
3
  parent_path=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P)
4
 
5
+ LOG_LEVEL=DEBUG python3 "${parent_path}/../src/ctp_slack_bot/app.py"
src/ctp_slack_bot/__init__.py CHANGED
@@ -1 +0,0 @@
1
- from ctp_slack_bot.containers import Container
 
 
src/ctp_slack_bot/api/__init__.py DELETED
@@ -1 +0,0 @@
1
- from ctp_slack_bot.api.main import app, run
 
 
src/ctp_slack_bot/api/main.py DELETED
@@ -1,70 +0,0 @@
1
- from contextlib import asynccontextmanager
2
- from fastapi import FastAPI, HTTPException, Depends
3
- from loguru import logger
4
- from typing import AsyncGenerator
5
- from dependency_injector.wiring import inject, Provide
6
-
7
- from ctp_slack_bot import Container
8
- from ctp_slack_bot.api.routes import router
9
- from ctp_slack_bot.core import Settings, setup_logging
10
- from ctp_slack_bot.core.response_rendering import PrettyJSONResponse
11
- from ctp_slack_bot.tasks import start_scheduler, stop_scheduler
12
-
13
- @asynccontextmanager
14
- async def lifespan(app: FastAPI) -> AsyncGenerator:
15
- """
16
- Lifespan context manager for FastAPI application.
17
- Handles startup and shutdown events.
18
- """
19
- # Initialize container and wire the container to modules that need dependency injection.
20
- container = Container()
21
- container.wire(packages=['ctp_slack_bot'])
22
- app.container = container
23
-
24
- # Setup logging.
25
- setup_logging(container)
26
- logger.info("Starting application")
27
-
28
- # Start the scheduler.
29
- scheduler = start_scheduler(container)
30
- logger.info("Started scheduler")
31
-
32
- yield # control to FastAPI until shutdown.
33
-
34
- # Shutdown.
35
- logger.info("Shutting down application")
36
- stop_scheduler(scheduler)
37
- logger.info("Stopped scheduler")
38
-
39
-
40
- app = FastAPI(
41
- title="CTP Slack Bot",
42
- description="A Slack bot for processing and analyzing Zoom transcripts using AI",
43
- version="0.1.0",
44
- lifespan=lifespan,
45
- )
46
-
47
- # Include routers.
48
- app.include_router(router)
49
-
50
- # Provide a minimalist health check endpoint for clients to detect availability.
51
- @app.get("/health")
52
- async def get_health() -> dict[str, str]:
53
- """Health check"""
54
- return {
55
- "status": "healthy"
56
- }
57
-
58
- # Alternate starting path for development
59
- def run() -> None:
60
- import uvicorn
61
- settings = Settings() # type: ignore
62
- uvicorn.run(
63
- "main:app",
64
- host=settings.API_HOST,
65
- port=settings.API_PORT,
66
- reload=settings.DEBUG
67
- )
68
-
69
- if __name__ == "__main__":
70
- run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ctp_slack_bot/api/routes.py DELETED
@@ -1,67 +0,0 @@
1
- from fastapi import APIRouter, Depends, HTTPException, status
2
- from dependency_injector.wiring import inject, Provide
3
- from loguru import logger
4
-
5
- from ctp_slack_bot import Container
6
- from ctp_slack_bot.core import Settings
7
- from ctp_slack_bot.services import SlackService
8
-
9
- router = APIRouter(prefix="/api/v1")
10
-
11
- @router.get("/env", response_model=Settings)
12
- @inject
13
- async def get_env(settings: Settings = Depends(Provide[Container.settings])) -> Settings:
14
- if not settings.DEBUG:
15
- raise HTTPException(status_code=404)
16
- return settings
17
-
18
- # @router.post("/transcripts/analyze", response_model=TranscriptResponse)
19
- # async def analyze_transcript(
20
- # request: TranscriptRequest,
21
- # transcript_service: TranscriptService = Depends(get_transcript_service),
22
- # ):
23
- # """
24
- # Analyze a Zoom transcript and return insights.
25
- # """
26
- # logger.info(f"Analyzing transcript: {request.transcript_id}")
27
- # try:
28
- # result = await transcript_service.analyze_transcript(request)
29
- # return result
30
- # except Exception as e:
31
- # logger.error(f"Error analyzing transcript: {e}")
32
- # raise HTTPException(
33
- # status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
34
- # detail="Failed to analyze transcript",
35
- # )
36
-
37
-
38
- # @router.post("/slack/message")
39
- # async def send_slack_message(
40
- # channel: str,
41
- # message: str,
42
- # slack_service: SlackService = Depends(get_slack_service),
43
- # ):
44
- # """
45
- # Send a message to a Slack channel.
46
- # """
47
- # logger.info(f"Sending message to Slack channel: {channel}")
48
- # try:
49
- # result = await slack_service.send_message(channel, message)
50
- # return {"status": "success", "message_ts": result.get("ts")}
51
- # except Exception as e:
52
- # logger.error(f"Error sending Slack message: {e}")
53
- # raise HTTPException(
54
- # status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
55
- # detail="Failed to send Slack message",
56
- # )
57
-
58
-
59
- # @router.post("/slack/webhook", include_in_schema=False)
60
- # async def slack_webhook(
61
- # slack_service: SlackService = Depends(get_slack_service),
62
- # ):
63
- # """
64
- # Webhook endpoint for Slack events.
65
- # """
66
- # # This would typically handle Slack verification and event processing
67
- # return {"challenge": "challenge_token"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ctp_slack_bot/app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from asyncio import all_tasks, CancelledError, create_task, current_task, get_running_loop, run
2
+ from loguru import logger
3
+ from signal import SIGINT, SIGTERM
4
+ from typing import Any, Callable
5
+
6
+ from ctp_slack_bot.containers import Container
7
+ from ctp_slack_bot.core.logging import setup_logging
8
+
9
async def handle_shutdown_signal() -> None:
    """Cancel every pending task except the current one so the app can shut down gracefully."""
    logger.info("Received shutdown signal.")
    me = current_task()
    # Snapshot the task set first; cancelling while iterating a live view is fragile.
    pending = [candidate for candidate in all_tasks() if candidate is not me and not candidate.done()]
    for pending_task in pending:
        pending_task.cancel()
        logger.trace("Cancelled task {}.", pending_task.get_name())
    logger.info("Cancelled all tasks.")
16
+
17
def create_shutdown_signal_handler() -> Callable[[], None]:
    """Build a synchronous callback suitable for loop.add_signal_handler().

    Signal handlers must not be coroutines, so the returned callable merely
    schedules the async shutdown routine on the running event loop.
    """
    def _on_signal() -> None:
        create_task(handle_shutdown_signal())
    return _on_signal
21
+
22
async def main() -> None:
    """Application entry point: configure logging, wire the DI container, start the
    Slack Socket Mode handler, and shut everything down cleanly on SIGINT/SIGTERM.
    """
    # Setup logging.
    setup_logging()
    logger.info("Starting application…")

    # Set up dependency injection container.
    container = Container()
    container.wire(packages=['ctp_slack_bot'])

    # Kick off services which should be active from the start.
    # Calling the providers instantiates the singletons/resources eagerly.
    container.content_ingestion_service()
    container.question_dispatch_service()
    container.schedule_service()

    # Start the Slack socket mode handler in the background.
    socket_mode_handler = container.socket_mode_handler()
    slack_bolt_task = create_task(socket_mode_handler.start_async())
    shutdown_signal_handler = create_shutdown_signal_handler()
    loop = get_running_loop()
    # Both termination signals funnel into the same handler, which cancels all tasks.
    loop.add_signal_handler(SIGINT, shutdown_signal_handler)
    loop.add_signal_handler(SIGTERM, shutdown_signal_handler)
    try:
        logger.info("Starting Slack Socket Mode handler…")
        # Block here until the handler task is cancelled by the signal handler.
        await slack_bolt_task
    except CancelledError:
        logger.info("Shutting down application…")
    finally:
        # Close the socket first, then release DI-managed resources (DB connections, etc.).
        await socket_mode_handler.close_async()
        await container.shutdown_resources()

if __name__ == "__main__":
    run(main())
src/ctp_slack_bot/containers.py CHANGED
@@ -1,44 +1,40 @@
1
  from dependency_injector.containers import DeclarativeContainer
2
- from dependency_injector.providers import Factory, Singleton
3
- from openai import OpenAI
 
4
 
5
  from ctp_slack_bot.core.config import Settings
6
- from ctp_slack_bot.db.mongo_db import MongoDB
 
7
  from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
8
  from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
9
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
 
10
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
 
 
11
  from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
12
- from ctp_slack_bot.services.slack_service import SlackService
 
13
  from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
14
  from ctp_slack_bot.services.vectorization_service import VectorizationService
15
 
16
 
17
  class Container(DeclarativeContainer):
18
  settings = Singleton(Settings)
19
-
20
  event_brokerage_service = Singleton(EventBrokerageService)
21
-
22
- mongo_db = Singleton(MongoDB, settings=settings)
23
-
24
- # Repositories
25
- # transcript_repository = Factory(
26
- # # Your transcript repository class
27
- # db=db
28
- # )
29
-
30
- open_ai_client = Factory(OpenAI, api_key=settings.provided.OPENAI_API_KEY) # TODO: poor practice to do it this way; create a LanguageModelService that creates an OpenAI client.
31
-
32
  vector_database_service = Singleton(VectorDatabaseService, settings=settings, mongo_db=mongo_db)
33
-
34
- vectorization_service = Singleton(VectorizationService, settings=settings, client=open_ai_client)
35
-
36
  content_ingestion_service = Singleton(ContentIngestionService, settings=settings, event_brokerage_service=event_brokerage_service, vector_database_service=vector_database_service, vectorization_service=vectorization_service)
37
-
38
  context_retrieval_service = Singleton(ContextRetrievalService, settings=settings, vectorization_service=vectorization_service, vector_database_service=vector_database_service)
39
-
40
- answer_retrieval_service = Singleton(AnswerRetrievalService, settings=settings, event_brokerage_service=event_brokerage_service, client=open_ai_client)
41
-
42
  question_dispatch_service = Singleton(QuestionDispatchService, settings=settings, event_brokerage_service=event_brokerage_service, content_ingestion_service=content_ingestion_service, context_retrieval_service=context_retrieval_service, answer_retrieval_service=answer_retrieval_service)
43
-
44
- slack_service = Singleton(SlackService, settings=settings, event_brokerage_service=event_brokerage_service)
 
 
 
1
  from dependency_injector.containers import DeclarativeContainer
2
+ from dependency_injector.providers import Resource, Singleton
3
+ from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler
4
+ from slack_bolt.async_app import AsyncApp
5
 
6
  from ctp_slack_bot.core.config import Settings
7
+ from ctp_slack_bot.db.mongo_db import MongoDBResource
8
+ from ctp_slack_bot.db.repositories import MongoVectorizedChunkRepository
9
  from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
10
  from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
11
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
12
+ from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
13
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
14
+ from ctp_slack_bot.services.google_drive_service import GoogleDriveService
15
+ from ctp_slack_bot.services.language_model_service import LanguageModelService
16
  from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
17
+ from ctp_slack_bot.services.schedule_service import ScheduleServiceResource
18
+ from ctp_slack_bot.services.slack_service import SlackServiceResource
19
  from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
20
  from ctp_slack_bot.services.vectorization_service import VectorizationService
21
 
22
 
23
  class Container(DeclarativeContainer):
24
  settings = Singleton(Settings)
 
25
  event_brokerage_service = Singleton(EventBrokerageService)
26
+ schedule_service = Resource(ScheduleServiceResource, settings=settings)
27
+ mongo_db = Resource(MongoDBResource, settings=settings) # TODO: generalize to any database.
28
+ vectorized_chunk_repository = Singleton(MongoVectorizedChunkRepository, mongo_db=mongo_db)
 
 
 
 
 
 
 
 
29
  vector_database_service = Singleton(VectorDatabaseService, settings=settings, mongo_db=mongo_db)
30
+ embeddings_model_service = Singleton(EmbeddingsModelService, settings=settings)
31
+ vectorization_service = Singleton(VectorizationService, settings=settings, embeddings_model_service=embeddings_model_service)
 
32
  content_ingestion_service = Singleton(ContentIngestionService, settings=settings, event_brokerage_service=event_brokerage_service, vector_database_service=vector_database_service, vectorization_service=vectorization_service)
 
33
  context_retrieval_service = Singleton(ContextRetrievalService, settings=settings, vectorization_service=vectorization_service, vector_database_service=vector_database_service)
34
+ language_model_service = Singleton(LanguageModelService, settings=settings)
35
+ answer_retrieval_service = Singleton(AnswerRetrievalService, settings=settings, event_brokerage_service=event_brokerage_service, language_model_service=language_model_service)
 
36
  question_dispatch_service = Singleton(QuestionDispatchService, settings=settings, event_brokerage_service=event_brokerage_service, content_ingestion_service=content_ingestion_service, context_retrieval_service=context_retrieval_service, answer_retrieval_service=answer_retrieval_service)
37
+ slack_bolt_app = Singleton(AsyncApp, token=settings.provided.SLACK_BOT_TOKEN().get_secret_value())
38
+ slack_service = Resource(SlackServiceResource, event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
39
+ socket_mode_handler = Singleton(lambda _, app, app_token: AsyncSocketModeHandler(app, app_token), slack_service, slack_bolt_app, settings.provided.SLACK_APP_TOKEN().get_secret_value())
40
+ google_drive_service = Singleton(GoogleDriveService, settings=settings)
src/ctp_slack_bot/core/__init__.py CHANGED
@@ -1,2 +1 @@
1
  from ctp_slack_bot.core.config import Settings
2
- from ctp_slack_bot.core.logging import logger, setup_logging
 
1
  from ctp_slack_bot.core.config import Settings
 
src/ctp_slack_bot/core/config.py CHANGED
@@ -1,28 +1,29 @@
 
1
  from pydantic import Field, MongoDsn, NonNegativeFloat, NonNegativeInt, PositiveInt, SecretStr
2
  from pydantic_settings import BaseSettings, SettingsConfigDict
3
- from typing import Literal, Optional
 
4
 
5
- class Settings(BaseSettings): # TODO: Strong guarantees of validity, because garbage in = garbage out, and settings flow into all the nooks and crannies
6
  """
7
  Application settings loaded from environment variables.
8
  """
9
- # Application Configuration
10
- DEBUG: bool = False
11
 
12
- # Logging Configuration
 
 
 
 
 
 
13
  LOG_LEVEL: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = Field(default_factory=lambda data: "DEBUG" if data.get("DEBUG", False) else "INFO")
14
  LOG_FORMAT: Literal["text", "json"] = "json"
15
 
16
  # APScheduler Configuration
17
- SCHEDULER_TIMEZONE: str = "UTC"
18
-
19
- # API Configuration
20
- API_HOST: str
21
- API_PORT: PositiveInt
22
 
23
  # Slack Configuration
24
  SLACK_BOT_TOKEN: SecretStr
25
- SLACK_SIGNING_SECRET: SecretStr
26
  SLACK_APP_TOKEN: SecretStr
27
 
28
  # Vectorization Configuration
@@ -31,23 +32,45 @@ class Settings(BaseSettings): # TODO: Strong guarantees of validity, because gar
31
  CHUNK_SIZE: PositiveInt
32
  CHUNK_OVERLAP: NonNegativeInt
33
  TOP_K_MATCHES: PositiveInt
34
-
35
  # MongoDB Configuration
36
  MONGODB_URI: SecretStr # TODO: Contemplate switching to MongoDsn type for the main URL, and separate out the credentials to SecretStr variables.
37
  MONGODB_NAME: str
 
38
 
39
  # Hugging Face Configuration
40
- HF_API_TOKEN: Optional[SecretStr] = None
41
 
42
  # OpenAI Configuration
43
- OPENAI_API_KEY: Optional[SecretStr] = None
44
  CHAT_MODEL: str
45
  MAX_TOKENS: PositiveInt
46
  TEMPERATURE: NonNegativeFloat
47
  SYSTEM_PROMPT: str
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  model_config = SettingsConfigDict(
50
  env_file=".env",
51
  env_file_encoding="utf-8",
52
  case_sensitive=True,
 
 
53
  )
 
 
 
 
1
+ from loguru import logger
2
  from pydantic import Field, MongoDsn, NonNegativeFloat, NonNegativeInt, PositiveInt, SecretStr
3
  from pydantic_settings import BaseSettings, SettingsConfigDict
4
+ from types import MappingProxyType
5
+ from typing import Literal, Mapping, Optional, Self
6
 
7
+ class Settings(BaseSettings):
8
  """
9
  Application settings loaded from environment variables.
10
  """
 
 
11
 
12
+ def __init__(self: Self, **data) -> None:
13
+ super().__init__(**data)
14
+ logger.debug("Created {}", self.__class__.__name__)
15
+ if self.__pydantic_extra__:
16
+ logger.warning("Extra unrecognized environment variables were provided: {}", ", ".join(self.__pydantic_extra__))
17
+
18
+ # Logging Configuration ― not actually used to configure Loguru, but defined to prevent warnings about “unknown” environment variables
19
  LOG_LEVEL: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = Field(default_factory=lambda data: "DEBUG" if data.get("DEBUG", False) else "INFO")
20
  LOG_FORMAT: Literal["text", "json"] = "json"
21
 
22
  # APScheduler Configuration
23
+ SCHEDULER_TIMEZONE: Optional[str] = "UTC"
 
 
 
 
24
 
25
  # Slack Configuration
26
  SLACK_BOT_TOKEN: SecretStr
 
27
  SLACK_APP_TOKEN: SecretStr
28
 
29
  # Vectorization Configuration
 
32
  CHUNK_SIZE: PositiveInt
33
  CHUNK_OVERLAP: NonNegativeInt
34
  TOP_K_MATCHES: PositiveInt
35
+
36
  # MongoDB Configuration
37
  MONGODB_URI: SecretStr # TODO: Contemplate switching to MongoDsn type for the main URL, and separate out the credentials to SecretStr variables.
38
  MONGODB_NAME: str
39
+ SCORE_THRESHOLD: NonNegativeFloat
40
 
41
  # Hugging Face Configuration
42
+ HF_API_TOKEN: Optional[SecretStr] = None # TODO: Currently, this is unused.
43
 
44
  # OpenAI Configuration
45
+ OPENAI_API_KEY: SecretStr
46
  CHAT_MODEL: str
47
  MAX_TOKENS: PositiveInt
48
  TEMPERATURE: NonNegativeFloat
49
  SYSTEM_PROMPT: str
50
 
51
+ # Google Drive Configuration
52
+ GOOGLE_DRIVE_ROOT_ID: str
53
+ GOOGLE_PROJECT_ID: str
54
+ GOOGLE_PRIVATE_KEY_ID: SecretStr
55
+ GOOGLE_PRIVATE_KEY: SecretStr
56
+ GOOGLE_CLIENT_ID: str
57
+ GOOGLE_CLIENT_EMAIL: str
58
+ GOOGLE_AUTH_URI: str = "https://accounts.google.com/o/oauth2/auth"
59
+ GOOGLE_TOKEN_URI: str = "https://oauth2.googleapis.com/token"
60
+ GOOGLE_AUTH_PROVIDER_CERT_URL: str = "https://www.googleapis.com/oauth2/v1/certs"
61
+ GOOGLE_CLIENT_CERT_URL: str = "https://www.googleapis.com/robot/v1/metadata/x509/ctp-slack-bot-714%40voltaic-reducer-294821.iam.gserviceaccount.com"
62
+ GOOGLE_UNIVERSE_DOMAIN: str = "googleapis.com"
63
+
64
+ # File Monitoring Configuration
65
+ FILE_MONITOR_ROOT_PATH: Optional[str] = None
66
+
67
  model_config = SettingsConfigDict(
68
  env_file=".env",
69
  env_file_encoding="utf-8",
70
  case_sensitive=True,
71
+ extra="allow",
72
+ frozen=True
73
  )
74
+
75
+ def get_extra_environment_variables(self: Self) -> Mapping[str, str]:
76
+ return MappingProxyType(self.__pydantic_extra__)
src/ctp_slack_bot/core/logging.py CHANGED
@@ -1,7 +1,8 @@
1
- from logging import __file__ as logging_file, basicConfig, currentframe, getLogger, Handler, INFO, LogRecord
2
  from loguru import logger
 
3
  from sys import stderr
4
- from typing import Dict, Union
5
 
6
  class InterceptHandler(Handler):
7
  """
@@ -11,7 +12,7 @@ class InterceptHandler(Handler):
11
  to Loguru, allowing unified logging across the application.
12
  """
13
 
14
- def emit(self, record: LogRecord) -> None:
15
  # Get corresponding Loguru level if it exists
16
  try:
17
  level = logger.level(record.levelname).name
@@ -29,22 +30,23 @@ class InterceptHandler(Handler):
29
  )
30
 
31
 
32
- def setup_logging(container: "Container") -> None:
33
  """
34
  Configure logging with Loguru.
35
 
36
- This function sets up Loguru as the main logging provider,
37
- configures the log format based on settings, and intercepts
38
- standard logging messages.
39
  """
40
- from ctp_slack_bot import Container
41
- settings = container.settings() if container else Provide[Container.settings]
42
 
43
- # Remove default loguru handler
 
 
 
 
44
  logger.remove()
45
 
46
- # Determine log format
47
- if settings.LOG_FORMAT == "json":
48
  log_format = {
49
  "time": "{time:YYYY-MM-DD HH:mm:ss.SSS}",
50
  "level": "{level}",
@@ -62,33 +64,35 @@ def setup_logging(container: "Container") -> None:
62
  "<level>{message}</level>"
63
  )
64
 
65
- # Add console handler
66
  logger.add(
67
  stderr,
68
  format=format_string,
69
- level=settings.LOG_LEVEL,
70
- serialize=(settings.LOG_FORMAT == "json"),
71
  backtrace=True,
72
  diagnose=True,
73
  )
74
 
75
- # Add file handler for non-DEBUG environments
76
- if settings.LOG_LEVEL != "DEBUG":
77
- logger.add(
78
- "logs/app.log",
79
- rotation="10 MB",
80
- retention="1 week",
81
- compression="zip",
82
- format=format_string,
83
- level=settings.LOG_LEVEL,
84
- serialize=(settings.LOG_FORMAT == "json"),
85
- )
86
 
87
- # Intercept standard logging messages
88
  basicConfig(handlers=[InterceptHandler()], level=0, force=True)
89
 
90
- # Update logging levels for some noisy libraries
91
- for logger_name in ("uvicorn", "uvicorn.error", "fastapi", "httpx", "apscheduler", "pymongo"):
92
  getLogger(logger_name).setLevel(INFO)
 
 
93
 
94
- logger.info(f"Logging configured with level {settings.LOG_LEVEL}")
 
1
+ from logging import __file__ as logging_file, basicConfig, currentframe, getLogger, Handler, INFO, LogRecord, WARNING
2
  from loguru import logger
3
+ from os import getenv
4
  from sys import stderr
5
+ from typing import Self
6
 
7
  class InterceptHandler(Handler):
8
  """
 
12
  to Loguru, allowing unified logging across the application.
13
  """
14
 
15
+ def emit(self: Self, record: LogRecord) -> None:
16
  # Get corresponding Loguru level if it exists
17
  try:
18
  level = logger.level(record.levelname).name
 
30
  )
31
 
32
 
33
+ def setup_logging() -> None:
34
  """
35
  Configure logging with Loguru.
36
 
37
+ This function sets up Loguru as the main logging provider, configures the log format based on environment variables,
38
+ and intercepts standard logging messages.
 
39
  """
 
 
40
 
41
+ # Get logger configuration from environment variables.
42
+ log_level = getenv("LOG_LEVEL", "INFO")
43
+ log_format = getenv("LOG_FORMAT", "text")
44
+
45
+ # Remove default loguru handler.
46
  logger.remove()
47
 
48
+ # Determine log format.
49
+ if log_format == "json":
50
  log_format = {
51
  "time": "{time:YYYY-MM-DD HH:mm:ss.SSS}",
52
  "level": "{level}",
 
64
  "<level>{message}</level>"
65
  )
66
 
67
+ # Add console handler.
68
  logger.add(
69
  stderr,
70
  format=format_string,
71
+ level=log_level,
72
+ serialize=(log_format == "json"),
73
  backtrace=True,
74
  diagnose=True,
75
  )
76
 
77
+ # Add file handler for non-DEBUG environments.
78
+ # if log_level != "DEBUG":
79
+ # logger.add(
80
+ # "/data/app.log",
81
+ # rotation="10 MB",
82
+ # retention="1 week",
83
+ # compression="zip",
84
+ # format=format_string,
85
+ # level=log_level,
86
+ # serialize=(log_format == "json"),
87
+ # )
88
 
89
+ # Intercept standard logging messages.
90
  basicConfig(handlers=[InterceptHandler()], level=0, force=True)
91
 
92
+ # Update logging levels for some noisy libraries.
93
+ for logger_name in ("uvicorn", "uvicorn.error", "fastapi", "httpx", "pymongo"):
94
  getLogger(logger_name).setLevel(INFO)
95
+ for logger_name in ("apscheduler"):
96
+ getLogger(logger_name).setLevel(WARNING)
97
 
98
+ logger.info(f"Logging configured with level {log_level}")
src/ctp_slack_bot/core/response_rendering.py DELETED
@@ -1,13 +0,0 @@
1
- from json import dumps
2
- from starlette.responses import JSONResponse
3
- from typing import Any, Self
4
-
5
- class PrettyJSONResponse(JSONResponse):
6
- def render(self: Self, content: Any) -> bytes:
7
- return dumps(
8
- content,
9
- ensure_ascii=False,
10
- allow_nan=False,
11
- indent=4,
12
- separators=(", ", ": "),
13
- ).encode("utf-8")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ctp_slack_bot/db/mongo_db.py CHANGED
@@ -1,125 +1,198 @@
1
- from pymongo import MongoClient, ASCENDING
 
 
 
 
2
  from loguru import logger
3
- from pydantic import BaseModel, model_validator, ConfigDict
4
- from typing import Optional, Self, Any
5
 
6
  from ctp_slack_bot.core.config import Settings
 
7
 
8
  class MongoDB(BaseModel):
9
  """
10
- MongoDB connection and initialization class.
11
- Handles connection to MongoDB, database selection, and index creation.
12
  """
13
-
14
  settings: Settings
15
- client: Optional[MongoClient] = None
16
- db: Optional[Any] = None
17
- vector_collection: Optional[Any] = None
18
- initialized: bool = False
19
-
20
- model_config = ConfigDict(arbitrary_types_allowed=True)
21
-
22
- @model_validator(mode='after')
23
- def post_init(self: Self) -> Self:
24
- logger.debug("Created {}", self.__class__.__name__)
25
- return self
26
 
27
- def connect(self):
28
- """
29
- Connect to MongoDB using connection string from settings.
30
- """
31
- if self.client is not None:
32
- return
33
 
34
- if not self.settings.MONGODB_URI:
35
- raise ValueError("MONGODB_URI is not set in environment variables")
 
36
 
 
 
37
  try:
38
- # Create MongoDB connection
39
- self.client = MongoClient(self.settings.MONGODB_URI.get_secret_value())
40
- self.db = self.client[self.settings.MONGODB_NAME]
41
- self.vector_collection = self.db["vector_store"]
42
- logger.info(f"Connected to MongoDB: {self.settings.MONGODB_NAME}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  except Exception as e:
44
- logger.error(f"Error connecting to MongoDB: {str(e)}")
 
 
45
  raise
46
 
47
- def initialize(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  """
49
- Initialize MongoDB with required collections and indexes.
 
50
  """
51
- if self.initialized:
52
- return
53
-
54
- if not self.client:
55
- self.connect()
56
-
57
  try:
58
- # Create vector index for similarity search
59
- self.create_vector_index()
60
- self.initialized = True
61
- logger.info("MongoDB initialized successfully")
 
 
 
 
 
 
 
 
 
 
 
62
  except Exception as e:
63
- logger.error(f"Error initializing MongoDB: {str(e)}")
64
  raise
65
 
66
- def create_vector_index(self):
67
  """
68
- Create vector index for similarity search using MongoDB Atlas Vector Search.
 
 
 
69
  """
 
 
70
  try:
71
- # Check if index already exists
72
- existing_indexes = list(self.vector_collection.list_indexes())
73
- index_names = [index.get('name') for index in existing_indexes]
74
-
75
- if "vector_index" not in index_names:
76
- # Create vector search index
77
- index_definition = {
78
- "mappings": {
79
- "dynamic": True,
80
- "fields": {
81
- "embedding": {
82
- "dimensions": self.settings.VECTOR_DIMENSION,
83
- "similarity": "cosine",
84
- "type": "knnVector"
85
- }
86
- }
87
- }
88
- }
89
-
90
- # Create the index
91
- self.db.command({
92
- "createIndexes": self.vector_collection.name,
93
- "indexes": [
94
  {
95
- "name": "vector_index",
96
- "key": {"embedding": "vector"},
97
- "weights": {"embedding": 1},
98
- "vectorSearchOptions": index_definition
 
99
  }
100
  ]
101
- })
102
-
103
- # Create additional metadata indexes for filtering
104
- self.vector_collection.create_index([("metadata.source", ASCENDING)])
105
- self.vector_collection.create_index([("metadata.timestamp", ASCENDING)])
106
-
107
- logger.info("Vector search index created")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  else:
109
- logger.info("Vector search index already exists")
110
-
111
  except Exception as e:
112
- logger.error(f"Error creating vector index: {str(e)}")
113
  raise
114
 
115
- def close(self):
116
- """
117
- Close MongoDB connection.
118
- """
119
- if self.client:
120
- self.client.close()
121
- self.client = None
122
- self.db = None
123
- self.vector_collection = None
124
- self.initialized = False
125
- logger.info("MongoDB connection closed")
 
1
+ from asyncio import create_task
2
+ from dependency_injector.resources import AsyncResource
3
+ from motor.motor_asyncio import AsyncIOMotorClient
4
+ from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
5
+ from pymongo.operations import SearchIndexModel
6
  from loguru import logger
7
+ from pydantic import BaseModel, PrivateAttr
8
+ from typing import Any, Dict, Optional, Self
9
 
10
  from ctp_slack_bot.core.config import Settings
11
+ from ctp_slack_bot.utils import sanitize_mongo_db_uri
12
 
13
  class MongoDB(BaseModel):
14
  """
15
+ MongoDB connection manager using Motor for async operations.
 
16
  """
 
17
  settings: Settings
18
+ _client: PrivateAttr = PrivateAttr()
19
+ _db: PrivateAttr = PrivateAttr()
 
 
 
 
 
 
 
 
 
20
 
21
+ class Config:
22
+ arbitrary_types_allowed = True
 
 
 
 
23
 
24
+ def __init__(self: Self, **data: Dict[str, Any]) -> None:
25
+ super().__init__(**data)
26
+ logger.debug("Created {}", self.__class__.__name__)
27
 
28
+ def connect(self: Self) -> None:
29
+ """Initialize MongoDB client with settings."""
30
  try:
31
+ connection_string = self.settings.MONGODB_URI.get_secret_value()
32
+ logger.debug("Connecting to MongoDB using URI: {}", sanitize_mongo_db_uri(connection_string))
33
+
34
+ # Create client with appropriate settings
35
+ self._client = AsyncIOMotorClient(
36
+ connection_string,
37
+ serverSelectionTimeoutMS=5000,
38
+ connectTimeoutMS=10000,
39
+ socketTimeoutMS=45000,
40
+ maxPoolSize=100,
41
+ retryWrites=True,
42
+ w="majority"
43
+ )
44
+
45
+ # Set database
46
+ db_name = self.settings.MONGODB_NAME
47
+
48
+ self._db = self._client[db_name]
49
+ logger.debug("MongoDB client initialized for database: {}", db_name)
50
+
51
  except Exception as e:
52
+ logger.error("Failed to initialize MongoDB client: {}", e)
53
+ self._client = None
54
+ self._db = None
55
  raise
56
 
57
+ @property
58
+ def client(self: Self) -> AsyncIOMotorClient:
59
+ """Get the MongoDB client instance."""
60
+ if not hasattr(self, '_client') or self._client is None:
61
+ logger.warning("MongoDB client not initialized. Attempting to initialize…")
62
+ self.connect()
63
+ if not hasattr(self, '_client') or self._client is None:
64
+ raise ConnectionError("Failed to initialize MongoDB client.")
65
+ return self._client
66
+
67
+ @property
68
+ def db(self: Self) -> Any:
69
+ """Get the MongoDB database instance."""
70
+ if not hasattr(self, '_db') or self._db is None:
71
+ logger.warning("MongoDB database not initialized. Attempting to initialize client…")
72
+ self.connect()
73
+ if not hasattr(self, '_db') or self._db is None:
74
+ raise ConnectionError("Failed to initialize MongoDB database.")
75
+ return self._db
76
+
77
+ async def ping(self: Self) -> bool:
78
+ """Check if MongoDB connection is alive."""
79
+ try:
80
+ # Get client to ensure we're connected
81
+ client = self.client
82
+
83
+ # Try a simple ping command
84
+ await client.admin.command('ping')
85
+ logger.debug("MongoDB connection is active!")
86
+ return True
87
+ except (ConnectionFailure, ServerSelectionTimeoutError) as e:
88
+ logger.error("MongoDB connection failed: {}", e)
89
+ return False
90
+ except Exception as e:
91
+ logger.error("Unexpected error during MongoDB ping: {}", e)
92
+ return False
93
+
94
+ async def get_collection(self: Self, name: str) -> Any:
95
  """
96
+ Get a collection by name with validation.
97
+ Creates the collection if it doesn't exist.
98
  """
99
+ # First ensure we can connect at all
100
+ if not await self.ping():
101
+ logger.error("Cannot get collection '{}' because a MongoDB connection is not available.", name)
102
+ raise ConnectionError("MongoDB connection is not available.")
103
+
 
104
  try:
105
+ # Get all collection names to check if this one exists
106
+ logger.debug("Checking if collection '{}' exists…", name)
107
+ collection_names = await self.db.list_collection_names()
108
+
109
+ if name not in collection_names:
110
+ logger.info("Collection '{}' does not exist. Creating it…", name)
111
+ # Create the collection
112
+ await self.db.create_collection(name)
113
+ logger.debug("Successfully created collection: {}", name)
114
+ else:
115
+ logger.debug("Collection '{}' already exists!", name)
116
+
117
+ # Get and return the collection
118
+ collection = self.db[name]
119
+ return collection
120
  except Exception as e:
121
+ logger.error("Error accessing collection '{}': {}", name, e)
122
  raise
123
 
124
+ async def create_indexes(self: Self, collection_name: str) -> None:
125
  """
126
+ Create a vector search index on a collection.
127
+
128
+ Args:
129
+ collection_name: Name of the collection
130
  """
131
+ collection = await self.get_collection(collection_name)
132
+
133
  try:
134
+ # Create search index model using MongoDB's recommended approach
135
+ search_index_model = SearchIndexModel(
136
+ definition={
137
+ "fields": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  {
139
+ "type": "vector",
140
+ "path": "embedding",
141
+ "numDimensions": self.settings.VECTOR_DIMENSION,
142
+ "similarity": "cosine",
143
+ "quantization": "scalar"
144
  }
145
  ]
146
+ },
147
+ name=f"{collection_name}_vector_index",
148
+ type="vectorSearch"
149
+ )
150
+
151
+ # Create the search index using the motor collection
152
+ result = await collection.create_search_index(search_index_model)
153
+ logger.info("Vector search index '{}' created for collection {}.", result, collection_name)
154
+
155
+ except Exception as e:
156
+ if "command not found" in str(e).lower():
157
+ logger.warning("Vector search not supported by this MongoDB instance. Some functionality may be limited.")
158
+ # Create a fallback standard index on embedding field
159
+ await collection.create_index("embedding")
160
+ logger.info("Created standard index on 'embedding' field as fallback.")
161
+ else:
162
+ logger.error("Failed to create vector index: {}", e)
163
+ raise
164
+
165
+ async def close(self: Self) -> None:
166
+ """Close MongoDB connection."""
167
+ if self._client:
168
+ self._client.close()
169
+ logger.info("Closed MongoDB connection.")
170
+ self._client = None
171
+ self._db = None
172
+
173
+ class MongoDBResource(AsyncResource):
174
+ async def init(self: Self, settings: Settings) -> MongoDB:
175
+ logger.info("Initializing MongoDB connection for database: {}", settings.MONGODB_NAME)
176
+ mongo_db = MongoDB(settings=settings)
177
+ mongo_db.connect()
178
+ await self._test_connection(mongo_db)
179
+ return mongo_db
180
+
181
+ async def _test_connection(self: Self, mongo_db: MongoDB) -> None:
182
+ """Test MongoDB connection and log the result."""
183
+ try:
184
+ is_connected = await mongo_db.ping()
185
+ if is_connected:
186
+ logger.info("MongoDB connection test successful!")
187
  else:
188
+ logger.error("MongoDB connection test failed!")
 
189
  except Exception as e:
190
+ logger.error("Error testing MongoDB connection: {}", e)
191
  raise
192
 
193
+ async def shutdown(self: Self, mongo_db: MongoDB) -> None:
194
+ """Close MongoDB connection on shutdown."""
195
+ try:
196
+ await mongo_db.close()
197
+ except Exception as e:
198
+ logger.error("Error closing MongoDB connection: {}", e)
 
 
 
 
 
src/ctp_slack_bot/db/repositories/__init__.py CHANGED
@@ -0,0 +1,2 @@
 
 
 
1
+ from ctp_slack_bot.db.repositories.mongo_db_vectorized_chunk_repository import MongoVectorizedChunkRepository
2
+ from ctp_slack_bot.db.repositories.vectorized_chunk_repository import VectorizedChunkRepository
src/ctp_slack_bot/db/repositories/mongo_db_vectorized_chunk_repository.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Dict, Any
2
+ import pymongo
3
+ from bson import ObjectId
4
+
5
+ from ctp_slack_bot.db import MongoDB
6
+ from ctp_slack_bot.db.repositories.vectorized_chunk_repository import VectorizedChunkRepository
7
+ from ctp_slack_bot.models.base import VectorizedChunk
8
+
9
+ class MongoVectorizedChunkRepository(VectorizedChunkRepository):
10
+ """MongoDB implementation of VectorizedChunkRepository."""
11
+
12
+ def __init__(self, mongo_db: MongoDB):
13
+ self.mongo_db = mongo_db
14
+ self.collection = self.mongo_db.db.get_collection("vectorized_chunks")
15
+
16
+ # Create indexes for efficient queries
17
+ self.collection.create_index("chunk_id")
18
+ self.collection.create_index("parent_id")
19
+
20
+ async def find_by_id(self, id: str) -> Optional[VectorizedChunk]:
21
+ doc = await self.collection.find_one({"_id": ObjectId(id)})
22
+ return self._map_to_entity(doc) if doc else None
23
+
24
+ async def find_all(self) -> List[VectorizedChunk]:
25
+ cursor = self.collection.find({})
26
+ return [self._map_to_entity(doc) async for doc in cursor]
27
+
28
+ async def find_by_parent_id(self, parent_id: str) -> List[VectorizedChunk]:
29
+ cursor = self.collection.find({"parent_id": parent_id})
30
+ return [self._map_to_entity(doc) async for doc in cursor]
31
+
32
+ async def save(self, chunk: VectorizedChunk) -> VectorizedChunk:
33
+ doc = self._map_to_document(chunk)
34
+
35
+ if "_id" in doc and doc["_id"]:
36
+ # Update existing document
37
+ await self.collection.replace_one({"_id": doc["_id"]}, doc)
38
+ else:
39
+ # Insert new document
40
+ result = await self.collection.insert_one(doc)
41
+ doc["_id"] = result.inserted_id
42
+
43
+ return self._map_to_entity(doc)
44
+
45
+ async def delete(self, id: str) -> bool:
46
+ result = await self.collection.delete_one({"_id": ObjectId(id)})
47
+ return result.deleted_count > 0
48
+
49
+ async def find_by_metadata(self, metadata_query: Dict[str, Any]) -> List[VectorizedChunk]:
50
+ # Convert the metadata query to MongoDB query format
51
+ query = {f"metadata.{k}": v for k, v in metadata_query.items()}
52
+ cursor = self.collection.find(query)
53
+ return [self._map_to_entity(doc) async for doc in cursor]
54
+
55
+ def _map_to_document(self, chunk: VectorizedChunk) -> Dict[str, Any]:
56
+ """Convert a VectorizedChunk to a MongoDB document."""
57
+ doc = chunk.model_dump()
58
+ # Handle any special conversions needed
59
+ return doc
60
+
61
+ def _map_to_entity(self, doc: Dict[str, Any]) -> VectorizedChunk:
62
+ """Convert a MongoDB document to a VectorizedChunk."""
63
+ if "_id" in doc:
64
+ doc["id"] = str(doc.pop("_id"))
65
+ return VectorizedChunk(**doc)
src/ctp_slack_bot/db/repositories/vectorized_chunk_repository.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Dict, Any
2
+
3
+ from ctp_slack_bot.models.base import VectorizedChunk
4
+
5
class VectorizedChunkRepository:
    """Repository interface for VectorizedChunk entities.

    Concrete implementations (e.g. the MongoDB-backed repository) must
    override every method.  The base methods raise NotImplementedError so a
    missing override fails loudly instead of silently returning None, which
    the previous ``pass`` bodies did.  Annotations are strings so this
    interface can be imported without evaluating the model types.
    """

    async def find_by_id(self, id: str) -> "Optional[VectorizedChunk]":
        """Find a chunk by its ID; return None when no chunk matches."""
        raise NotImplementedError

    async def find_all(self) -> "List[VectorizedChunk]":
        """Find all chunks."""
        raise NotImplementedError

    async def find_by_parent_id(self, parent_id: str) -> "List[VectorizedChunk]":
        """Find chunks by parent document ID."""
        raise NotImplementedError

    async def save(self, chunk: "VectorizedChunk") -> "VectorizedChunk":
        """Save a chunk to the database and return the persisted entity."""
        raise NotImplementedError

    async def delete(self, id: str) -> bool:
        """Delete a chunk by its ID; return True when a chunk was removed."""
        raise NotImplementedError

    async def find_by_metadata(self, metadata_query: "Dict[str, Any]") -> "List[VectorizedChunk]":
        """Find chunks matching the given metadata criteria."""
        raise NotImplementedError
src/ctp_slack_bot/enums.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from enum import auto, StrEnum
2
+
3
+ class EventType(StrEnum):
4
+ INCOMING_CONTENT = auto()
5
+ INCOMING_SLACK_MESSAGE = auto()
6
+ OUTGOING_SLACK_RESPONSE = auto()
src/ctp_slack_bot/models/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- from ctp_slack_bot.models.base import Content, Ingestible, Metadata
2
- from ctp_slack_bot.models.content import RetreivedContext
3
- from ctp_slack_bot.models.slack import SlackMessage
4
- from ctp_slack_bot.models.vector_query import VectorQuery
 
1
+ from ctp_slack_bot.models.base import Chunk, Content, VectorizedChunk, VectorQuery
2
+ from ctp_slack_bot.models.google_drive import GoogleDriveMetadata
3
+ from ctp_slack_bot.models.slack import SlackEventPayload, SlackMessage, SlackReaction, SlackResponse, SlackUserTimestampPair
4
+ from ctp_slack_bot.models.webvtt import WebVTTContent, WebVTTFrame
src/ctp_slack_bot/models/base.py CHANGED
@@ -1,61 +1,58 @@
1
  from abc import ABC, abstractmethod
2
- from datetime import datetime
3
- from pydantic import BaseModel, Field, validator
4
- from typing import Dict, List, Optional, Union, Any, ClassVar
5
- import hashlib
6
- import json
7
 
8
 
9
- class Metadata(BaseModel):
10
- """A class representing metadata about content."""
11
 
12
- id: str # The content’s identity consistent across modifications
13
- modification_time: datetime # The content’s modification for detection of alterations
14
- hash: str # The content’s hash for detection of alterations
 
15
 
 
16
 
17
- class Content(BaseModel):
18
- """A class representing ingested content."""
19
 
20
- metadata: Metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
 
 
 
 
 
 
22
 
 
23
 
24
- class Ingestible(ABC, BaseModel):
25
- """An abstract base class for ingestible content."""
26
 
27
- metadata: Metadata
 
 
 
28
 
29
- @property
30
  @abstractmethod
31
- def content(self) -> Content:
32
- """
33
- Return content ready for vectorization.
34
-
35
- This could be:
36
- - A single string
37
- - A list of strings (pre-chunked)
38
- - A more complex structure that can be recursively processed
39
- """
40
  pass
41
-
42
- def get_chunks(self) -> List[str]:
43
- """
44
- Split content into chunks suitable for vectorization.
45
- Override this in subclasses for specialized chunking logic.
46
- """
47
- content = self.content
48
- if isinstance(content, str):
49
- # Simple chunking by character count
50
- return [content[i:i+self.chunk_size]
51
- for i in range(0, len(content), self.chunk_size)]
52
- elif isinstance(content, list):
53
- # Content is already chunked
54
- return content
55
- else:
56
- raise ValueError(f"Unsupported content type: {type(content)}")
57
-
58
- @property
59
- def key(self) -> str:
60
- """Convenience accessor for the metadata key."""
61
- return self.metadata.key
 
1
  from abc import ABC, abstractmethod
2
+ from pydantic import BaseModel, ConfigDict, Field
3
+ from typing import Any, final, Mapping, Self, Sequence, Optional
 
 
 
4
 
5
 
6
class Chunk(BaseModel):
    """A class representing a chunk of content.

    A chunk is the unit of text handed to the vectorization pipeline.
    Instances are immutable once constructed (frozen model).
    """

    text: str  # The text representation
    parent_id: str  # The source content’s identity
    chunk_id: str  # This chunk’s identity—unique within the source content
    metadata: Mapping[str, Any]  # Source-specific metadata attached to this chunk

    model_config = ConfigDict(frozen=True)
15
 
 
 
16
 
17
@final
class VectorQuery(BaseModel):
    """Model for vector database similarity search queries.

    Attributes:
        query_embeddings: The embedding vector used for similarity search
        k: Number of similar documents to retrieve
        score_threshold: Minimum similarity score threshold for inclusion in results
        filter_metadata: Optional filters for metadata fields
    """

    query_embeddings: Sequence[float]
    k: int
    score_threshold: float = Field(default=0.7)
    filter_metadata: Optional[Mapping[str, Any]] = None

    model_config = ConfigDict(frozen=True)
34
+
35
+
36
@final
class VectorizedChunk(Chunk):
    """A class representing a vectorized chunk of content.

    Extends Chunk with the embedding produced by the vectorization service.
    """

    embedding: Sequence[float]  # The vector representation
41
 
 
 
42
 
43
class Content(ABC, BaseModel):
    """An abstract base class for all types of content.

    Subclasses adapt a concrete source (e.g. a Slack message or a WebVTT
    transcript) to the ingestion pipeline: a stable identity, chunked text,
    and source-level metadata.
    """

    model_config = ConfigDict(frozen=True)

    @abstractmethod
    def get_id(self: Self) -> str:
        """Return a stable, unique identifier for this content."""
        pass

    @abstractmethod
    def get_chunks(self: Self) -> Sequence[Chunk]:
        """Split this content into chunks suitable for vectorization."""
        pass

    @abstractmethod
    def get_metadata(self: Self) -> Mapping[str, Any]:
        """Return source-level metadata describing this content."""
        pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ctp_slack_bot/models/content.py DELETED
@@ -1,19 +0,0 @@
1
- from pydantic import BaseModel, Field
2
- from typing import Optional, List, Dict, Any
3
- from ctp_slack_bot.models.slack import SlackMessage
4
-
5
- class RetreivedContext(BaseModel):
6
- """Represents a the context of a question from Slack returned from the Vector Store Database.
7
-
8
- contextual_text: The text that is relevant to the question.
9
- metadata_source: The source of the contextual text.
10
- similarity_score: The similarity score of the contextual text to the question.
11
-
12
- in_reation_to_question: OPTINAL: The question that the contextual text is related to.
13
- """
14
- contextual_text: str
15
- metadata_source: str
16
- similarity_score: float
17
-
18
- said_by: str = Optional[None]
19
- in_reation_to_question: str = Optional[None]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ctp_slack_bot/models/google_drive.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from pydantic import BaseModel, ConfigDict
3
+ from typing import Self
4
+
5
+ from ctp_slack_bot.models import FileContent
6
+
7
+
8
class GoogleDriveMetadata(BaseModel):
    """Represents Google Drive file or folder metadata."""

    id: str  # Drive file/folder ID
    name: str  # Display name in Drive
    modified_time: datetime  # Last modification time reported by the Drive API
    mime_type: str  # Drive MIME type (distinguishes folders from files)
    folder_path: str  # Path of the containing folder as supplied by the caller

    model_config = ConfigDict(frozen=True)

    @classmethod
    def from_folder_path_and_dict(cls: type["GoogleDriveMetadata"], folder_path: str, dict: dict) -> Self:
        """Build metadata from a folder path and a raw Drive API response mapping.

        NOTE(review): the second parameter shadows the ``dict`` builtin; the
        name is kept for keyword-argument compatibility, but consider
        renaming it together with its call sites.
        """
        # Fix: construct via cls(...) instead of the hard-coded class name so
        # this alternate constructor also works for subclasses.
        return cls(id=dict["id"],
                   name=dict["name"],
                   modified_time=datetime.fromisoformat(dict["modifiedTime"]),
                   mime_type=dict["mimeType"],
                   folder_path=folder_path)
src/ctp_slack_bot/models/slack.py CHANGED
@@ -1,16 +1,84 @@
1
- from pydantic import BaseModel, Field
2
- from typing import Optional, List, Dict, Any
 
 
 
3
 
4
- class SlackMessage(BaseModel):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  """Represents a message from Slack after adaptation."""
6
- channel_id: str
7
- user_id: str
8
- text: str
 
 
 
 
9
  thread_ts: Optional[str] = None
10
- timestamp: str
11
- is_question: bool = False
12
-
13
- @property
14
- def key(self) -> str:
 
 
 
 
 
 
15
  """Unique identifier for this message."""
16
- return f"slack:{self.channel_id}:{self.timestamp}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from json import dumps
3
+ from pydantic import BaseModel, ConfigDict, PositiveInt, PrivateAttr
4
+ from types import MappingProxyType
5
+ from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
6
 
7
+ from ctp_slack_bot.models.base import Chunk, Content
8
+
9
class SlackEventPayload(BaseModel):
    """Represents a general event payload from Slack.

    Extra fields are allowed because Slack payloads vary by event type; only
    the discriminating fields are declared here.
    """
    type: str  # Slack event type, e.g. "message" or "app_mention"
    event_ts: str  # Event timestamp string

    model_config = ConfigDict(extra='allow', frozen=True)
15
+
16
class SlackEvent(BaseModel):
    """Represents a general event from Slack (the outer Events API envelope)."""

    token: str  # Verification token sent by Slack
    team_id: str  # Workspace the event originated from
    api_app_id: str  # App the event was delivered to
    event: SlackEventPayload  # The inner, type-specific payload
    type: str  # Envelope type (e.g. "event_callback")
    event_id: str  # Unique identifier for this event delivery
    event_time: int  # Epoch seconds when the event occurred
    authed_users: Sequence[str]  # Users on whose behalf the app received the event

    model_config = ConfigDict(frozen=True)
29
+
30
class SlackUserTimestampPair(BaseModel):
    """Represents a Slack user-timestamp pair (e.g. the "edited" field of a message)."""

    user: str  # User ID
    ts: str  # Timestamp string

    model_config = ConfigDict(frozen=True)
37
+
38
class SlackReaction(BaseModel):
    """Represents a Slack reaction information."""

    name: str  # Emoji name of the reaction
    count: PositiveInt  # Number of users who reacted
    users: Sequence[str]  # IDs of the reacting users

    model_config = ConfigDict(frozen=True)
46
+
47
class SlackMessage(Content):
    """Represents a message from Slack after adaptation."""

    type: Literal["app_mention", "message"]
    subtype: Optional[str] = None
    channel: str  # Channel the message was posted in
    channel_type: Optional[str] = None
    user: Optional[str] = None  # Author user ID (absent for bot messages)
    bot_id: Optional[str] = None  # Author bot ID (absent for user messages)
    thread_ts: Optional[str] = None  # Timestamp of the parent thread, if threaded
    text: str
    ts: str  # Message timestamp: epoch seconds as a decimal string
    edited: Optional[SlackUserTimestampPair] = None
    event_ts: str
    deleted_ts: Optional[str] = None
    hidden: bool = False
    is_starred: Optional[bool] = None
    pinned_to: Optional[Sequence[str]] = None
    reactions: Optional[Sequence[SlackReaction]] = None

    def get_id(self: Self) -> str:
        """Unique identifier for this message."""
        return f"slack-message:{self.channel}:{self.ts}"

    def get_chunks(self: Self) -> Sequence[Chunk]:
        """A Slack message is short, so it always maps to exactly one chunk."""
        return (Chunk(text=self.text, parent_id=self.get_id(), chunk_id="", metadata=self.get_metadata()), )

    def get_metadata(self: Self) -> Mapping[str, Any]:
        """Return message metadata with a UTC-aware modification time."""
        from datetime import timezone  # local import: the module imports only the datetime class

        return MappingProxyType({
            # Fix: interpret the Slack epoch timestamp as UTC.  The previous
            # naive fromtimestamp() used the host's local time zone, making
            # stored modification times machine-dependent.
            "modificationTime": datetime.fromtimestamp(float(self.ts), tz=timezone.utc)
        })
78
+
79
class SlackResponse(BaseModel): # TODO: This should also be based on Content as it is a SlackMessage―just not one for which we know the identity yet.
    """Represents a response message to be sent to Slack."""

    text: str  # Body of the reply
    # Fix: default added so the field is genuinely optional, matching its
    # annotation; previously it was annotated Optional but still required.
    channel: Optional[str] = None
    thread_ts: Optional[str] = None  # Thread to reply in; omitted for top-level messages
src/ctp_slack_bot/models/vector_query.py DELETED
@@ -1,16 +0,0 @@
1
- from pydantic import BaseModel, Field, validator
2
- from typing import Optional, List, Dict, Any
3
-
4
- class VectorQuery(BaseModel):
5
- """Model for vector database similarity search queries.
6
-
7
- Attributes:
8
- query_text: The text to be vectorized and used for similarity search
9
- k: Number of similar documents to retrieve
10
- score_threshold: Minimum similarity score threshold for inclusion in results
11
- filter_metadata: Optional filters for metadata fields
12
- """
13
- query_text: str
14
- k: int
15
- score_threshold: float = Field(default=0.7)
16
- filter_metadata: Optional[Dict[str, Any]] = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ctp_slack_bot/models/webvtt.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime, timedelta
2
+ from io import BytesIO
3
+ from itertools import starmap
4
+ from json import dumps
5
+ from more_itertools import windowed
6
+ from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
7
+ from types import MappingProxyType
8
+ from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
9
+ from webvtt import Caption, WebVTT
10
+
11
+ from ctp_slack_bot.models.base import Chunk, Content
12
+
13
+ CHUNK_FRAMES_OVERLAP = 1
14
+ CHUNK_FRAMES_WINDOW = 5
15
+ SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
16
+
17
class WebVTTFrame(BaseModel):
    """Represents a WebVTT frame"""

    identifier: str  # Caption identifier, or the 1-based caption index when absent
    start: timedelta  # Offset of the frame's start from the beginning of the media
    end: timedelta  # Offset of the frame's end
    speaker: Optional[str] = None  # Speaker name when the caption reads "Speaker: speech"
    speech: str  # Spoken text

    model_config = ConfigDict(frozen=True)

    @classmethod
    def from_webvtt_caption(cls: type["WebVTTFrame"], index: int, caption: Caption) -> Self:
        """Build a frame from a webvtt Caption, using *index* as a fallback identifier."""
        identifier = caption.identifier if caption.identifier else str(index)
        # NOTE(review): assumes the webvtt timestamp object's __dict__ keys
        # exactly match timedelta's keyword arguments — confirm against the
        # webvtt-py version in use; a renamed attribute would raise TypeError.
        start = timedelta(**caption.start_time.__dict__)
        end = timedelta(**caption.end_time.__dict__)
        # maxsplit=1 guarantees one or two parts, so this match is exhaustive.
        match caption.text.split(SPEAKER_SPEECH_TEXT_SEPARATOR, 1):
            case [speaker, speech]:
                return cls(identifier=identifier, start=start, end=end, speaker=speaker, speech=speech)
            case [speech]:
                return cls(identifier=identifier, start=start, end=end, speech=speech)
38
+
39
+
40
class WebVTTContent(Content):
    """Represents parsed WebVTT content."""

    id: str  # Stable identity of the source transcript
    metadata: Mapping[str, Any] = Field(default_factory=dict)  # Source-level metadata
    frames: Sequence[WebVTTFrame]  # Parsed caption frames, in order

    def get_id(self: Self) -> str:
        """Return the transcript's stable identifier."""
        return self.id

    def get_chunks(self: Self) -> Sequence[Chunk]:
        """Chunk the transcript into overlapping windows of consecutive frames."""
        # windowed() pads the final window with None; filter(None, ...) drops the padding.
        windows = (tuple(filter(None, window))
                   for window
                   in windowed(self.frames, CHUNK_FRAMES_WINDOW, step=CHUNK_FRAMES_WINDOW-CHUNK_FRAMES_OVERLAP))
        # Fix: join speaker and speech with SPEAKER_SPEECH_TEXT_SEPARATOR
        # instead of a hard-coded ": ", keeping chunk text consistent with
        # how WebVTTFrame parsed the captions.
        return tuple(Chunk(text="\n\n".join(SPEAKER_SPEECH_TEXT_SEPARATOR.join(filter(None, (frame.speaker, frame.speech)))
                                            for frame
                                            in frames),
                           parent_id=self.get_id(),
                           chunk_id=f"{frames[0].identifier}-{frames[-1].identifier}",
                           metadata={
                               "start": str(frames[0].start), # TODO: This is a harder problem: to get the offsets to become real datetimes so that they can be queryable using MongoDB.
                               "end": str(frames[-1].end),
                               "speakers": [frame.speaker for frame in frames if frame.speaker]
                           })
                     for frames
                     in windows)

    def get_metadata(self: Self) -> Mapping[str, Any]:
        """Return source-level metadata as a read-only mapping."""
        return MappingProxyType(self.metadata)

    @classmethod
    def from_bytes(cls: type["WebVTTContent"], id: str, metadata: Mapping[str, Any], buffer: bytes) -> Self:
        """Parse raw WebVTT bytes into content; frames are numbered from 1."""
        frames = tuple(starmap(WebVTTFrame.from_webvtt_caption, enumerate(WebVTT.from_buffer(BytesIO(buffer)).captions, 1)))
        # Fix: construct via cls so this alternate constructor works for subclasses.
        return cls(id=id, metadata=MappingProxyType(metadata), frames=frames)
src/ctp_slack_bot/services/GOOGLE_DRIVE_README.md DELETED
@@ -1,228 +0,0 @@
1
- # Google Drive Access Module
2
-
3
- This Python module provides a simplified way to interact with Google Drive, focusing on easy access to files in nested folders using path-like syntax. It handles various Google file formats and provides comprehensive metadata for files and folders.
4
-
5
- ## Features
6
-
7
- - **Path-based folder access**: Access files using simple paths like `folder1/folder2/folder3`
8
- - **Efficient caching**: Folder IDs are cached to improve performance
9
- - **Comprehensive metadata**: Get detailed information about files and folders
10
- - **Read various file types**:
11
- - Text files
12
- - Google Docs
13
- - VTT files
14
- - **Robust folder finding**: Works with exact and partial name matching
15
- - **Simple API**: Designed for ease of use with minimal code
16
-
17
- ## Setup Instructions
18
-
19
- ### 1. Create a Google Cloud Project
20
-
21
- 1. Go to the [Google Cloud Console](https://console.cloud.google.com/)
22
- 2. Click on the project dropdown at the top of the page and select "New Project"
23
- 3. Enter a project name and click "Create"
24
- 4. Once created, make sure your new project is selected in the dropdown
25
-
26
- ### 2. Enable the Google Drive API
27
-
28
- 1. In the Google Cloud Console, navigate to "APIs & Services" > "Library" in the left sidebar
29
- 2. Search for "Google Drive API" in the search bar
30
- 3. Click on "Google Drive API" in the results
31
- 4. Click the "Enable" button
32
-
33
- ### 3. Create OAuth Credentials
34
-
35
- 1. In the Google Cloud Console, go to "APIs & Services" > "Credentials" in the left sidebar
36
- 2. Click "Create Credentials" at the top and select "OAuth client ID"
37
- 3. If prompted to configure the OAuth consent screen:
38
- - Choose "External" user type (or "Internal" if you're in a Google Workspace organization)
39
- - Fill in the required information (App name, User support email, Developer contact email)
40
- - Click "Save and Continue"
41
- - Add the following scopes:
42
- - `.../auth/drive` (Full access to Google Drive)
43
- - Click "Save and Continue" and complete the registration
44
- 4. Return to the "Create OAuth client ID" screen
45
- 5. Select "Desktop application" as the Application type
46
- 6. Enter a name for your OAuth client (e.g., "Google Drive Access Desktop")
47
- 7. Click "Create"
48
- 8. Download the JSON file (this is your `client_secret.json`)
49
-
50
- ### 4. Project Setup
51
-
52
- 1. Setup a virtual environment and install dependencies:
53
- ```bash
54
- python -m venv venv
55
- source venv/bin/activate # On Windows: venv\Scripts\activate
56
- pip install -r requirements.txt
57
- ```
58
-
59
- 2. Place your credentials:
60
- - Create a `credentials` directory in your project root
61
- - Move the downloaded OAuth client JSON file to the `credentials` directory
62
- - Rename it to `client_secret.json`
63
-
64
- ### 5. Authentication Process
65
-
66
- When you run the application for the first time:
67
- 1. A browser window will open automatically
68
- 2. You'll be asked to sign in to your Google account
69
- 3. You'll see a consent screen asking for permission to access your Google Drive
70
- 4. After granting permission, the browser will display a success message
71
- 5. The application will save a token file (`token.pickle`) in the credentials directory for future use
72
-
73
- ## Usage Guide
74
-
75
- The `EasyGoogleDrive` class provides several methods to interact with Google Drive. Here's how to use the core functionality:
76
-
77
- ### Basic Usage
78
-
79
- ```python
80
- from google_drive_access import EasyGoogleDrive
81
-
82
- # Initialize the Google Drive client
83
- drive = EasyGoogleDrive()
84
-
85
- # Example folder path - replace with your actual folder path
86
- folder_path = "Spring-2025-BAI"
87
- subfolder_path = "Spring-2025-BAI/transcripts"
88
- ```
89
-
90
- ### Listing Folders
91
-
92
- ```python
93
- # List folders in a directory
94
- folders = drive.get_folders_in_folder(folder_path)
95
-
96
- # Access folder properties
97
- for folder in folders:
98
- print(f"Folder: {folder['name']}")
99
- print(f" Created: {folder.get('createdTimeFormatted', 'Unknown')}")
100
- print(f" Modified: {folder.get('modifiedTimeFormatted', 'Unknown')}")
101
- ```
102
-
103
- ### Listing Files
104
-
105
- ```python
106
- # List files in a directory
107
- files = drive.get_files_in_folder(subfolder_path)
108
-
109
- # Access file properties
110
- for file in files:
111
- print(f"File: {file['name']}")
112
- print(f" Type: {file.get('fileType', 'Unknown')}")
113
- print(f" Created: {file.get('createdTimeFormatted', 'Unknown')}")
114
- print(f" Modified: {file.get('modifiedTimeFormatted', 'Unknown')}")
115
- print(f" Size: {file.get('sizeFormatted', 'Unknown')}")
116
- ```
117
-
118
- ### Getting a Specific File
119
-
120
- ```python
121
- # Get a specific file with metadata
122
- file = drive.get_file("example.txt", subfolder_path, include_metadata=True)
123
-
124
- if file:
125
- print(f"File: {file['name']}")
126
- print(f" Type: {file.get('fileType', 'Unknown')}")
127
- print(f" Created: {file.get('createdTimeFormatted', 'Unknown')}")
128
- print(f" Modified: {file.get('modifiedTimeFormatted', 'Unknown')}")
129
- print(f" Size: {file.get('sizeFormatted', 'Unknown')}")
130
- ```
131
-
132
- ### Getting All Items in a Folder
133
-
134
- ```python
135
- # Get all items (files and folders) in a folder
136
- all_items = drive.get_all_files_in_folder(folder_path)
137
-
138
- # Access item properties
139
- for item in all_items:
140
- item_type = "Folder" if item.get('mimeType') == drive.MIME_TYPES['folder'] else item.get('fileType', 'Unknown')
141
- print(f"Item: {item['name']} ({item_type})")
142
- ```
143
-
144
- ### Checking if a File Exists
145
-
146
- ```python
147
- # Check if a file exists
148
- exists = drive.file_exists("example.txt", subfolder_path)
149
- print(f"File exists: {exists}")
150
- ```
151
-
152
- ### Getting File Modified Time
153
-
154
- ```python
155
- # Get file modified time
156
- modified_time = drive.get_file_modified_time("example.txt", subfolder_path)
157
- if modified_time:
158
- print(f"Last modified: {modified_time}")
159
- ```
160
-
161
- ### Reading File Content
162
-
163
- ```python
164
- # Get file with content
165
- file_with_content = drive.get_file("example.txt", subfolder_path, include_content=True)
166
-
167
- if file_with_content and 'file_content' in file_with_content:
168
- content = file_with_content['file_content']
169
- if content:
170
- print(f"Content: {content[:100]}...") # Print first 100 characters
171
- ```
172
-
173
- ## Complete Example
174
-
175
- For a complete example of how to use the `EasyGoogleDrive` class, see the `basic_usage.py` file included in this package. This file demonstrates all the core functionality with practical examples.
176
-
177
- ## Key Concepts
178
-
179
- ### Path-based Folder Access
180
-
181
- The module uses a simple path-like syntax to access folders:
182
-
183
- ```python
184
- # Access a deeply nested folder
185
- folder_path = "folder1/folder2/folder3"
186
- files = drive.get_files_in_folder(folder_path)
187
- ```
188
-
189
- This makes it much easier to work with nested folder structures compared to using folder IDs.
190
-
191
- ### Metadata Fields
192
-
193
- The module provides comprehensive metadata for files and folders, including:
194
-
195
- - **Creation and modification dates**: Both as datetime objects and formatted strings
196
- - **File size**: Both in bytes and human-readable format (KB, MB, GB)
197
- - **File type**: Simplified type based on MIME type
198
- - **Owner information**: Names and email addresses of file owners
199
- - **Sharing status**: Whether the file is shared
200
- - **Web links**: Direct links to view the file in a browser
201
-
202
- ## Error Handling
203
-
204
- The module includes comprehensive error handling:
205
-
206
- - **Authentication errors**: Clear messages when credentials are missing or invalid
207
- - **Folder not found**: Helpful messages when a folder in the path cannot be found
208
- - **File not found**: Attempts partial name matching before giving up
209
- - **Decoding errors**: Handles issues with file content encoding
210
-
211
- ## Dependencies
212
-
213
- - **Required**:
214
- - google-auth-oauthlib
215
- - google-auth-httplib2
216
- - google-api-python-client
217
- - python-dateutil
218
-
219
- ## Security Notes
220
-
221
- - Never commit your `client_secret.json` or token files to version control
222
- - Add `credentials/` to your `.gitignore` file
223
- - Keep your credentials secure and don't share them
224
- - For production applications, consider using service accounts with the minimum required permissions
225
-
226
- ## Contributing
227
-
228
- Feel free to contribute to this project by submitting issues or pull requests.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ctp_slack_bot/services/__init__.py CHANGED
@@ -1,7 +1,10 @@
1
  from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
2
  from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
3
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
 
4
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
 
 
5
  from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
6
  from ctp_slack_bot.services.slack_service import SlackService
7
  from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
 
1
  from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
2
  from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
3
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
4
+ from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
5
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
6
+ from ctp_slack_bot.services.google_drive_service import GoogleDriveService
7
+ from ctp_slack_bot.services.language_model_service import LanguageModelService
8
  from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
9
  from ctp_slack_bot.services.slack_service import SlackService
10
  from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
src/ctp_slack_bot/services/answer_retrieval_service.py CHANGED
@@ -1,65 +1,34 @@
1
- # from asyncio import create_task
2
  from loguru import logger
3
- from openai import OpenAI
4
- from pydantic import BaseModel, model_validator
5
- from typing import List, Optional, Self, Tuple
6
 
7
  from ctp_slack_bot.core import Settings
8
- from ctp_slack_bot.models import RetreivedContext, SlackMessage
 
9
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
 
10
 
11
 
12
- class AnswerRetrievalService(BaseModel): # TODO: this should separate the OpenAI backend out into its own service.
13
  """
14
- Service for language model operations.
15
  """
16
 
17
  settings: Settings
18
  event_brokerage_service: EventBrokerageService
19
- client: OpenAI # TODO: this should separate the OpenAI backend out into its own service, one that is agnostic.
20
 
21
  class Config:
22
- arbitrary_types_allowed = True
23
 
24
- @model_validator(mode='after')
25
- def post_init(self: Self) -> Self:
26
  logger.debug("Created {}", self.__class__.__name__)
27
- return self
28
 
29
- def generate_answer(self, question: SlackMessage, context: List[RetreivedContext]) -> str:
30
- """Generate a response using OpenAI's API with retrieved context.
31
-
32
- Args:
33
- question (str): The user's question
34
- context (List[RetreivedContext]): List of RetreivedContext
35
-
36
- Returns:
37
- str: Generated answer
38
- """
39
- # Prepare context string from retrieved chunks
40
- context_str = ""
41
- for c in context:
42
- context_str += f"{c.contextual_text}\n"
43
-
44
-
45
- # logger.info(f"Generating response for question: {question}")
46
- # logger.info(f"Using {len(context)} context chunks")
47
-
48
- # Create messages for the chat completion
49
- messages = [
50
- {"role": "system", "content": settings.SYSTEM_PROMPT},
51
- {"role": "user", "content":
52
- f"""Student Auestion: {question.text}
53
- Context from class materials and transcripts: {context_str}
54
- Please answer the Student Auestion based on the Context from class materials and transcripts. If the context doesn't contain relevant information, acknowledge that and suggest asking the professor."""}
55
- ]
56
-
57
- # Generate response
58
- response = self.client.chat.completions.create(
59
- model=settings.CHAT_MODEL,
60
- messages=messages,
61
- max_tokens=settings.MAX_TOKENS,
62
- temperature=settings.TEMPERATURE
63
- )
64
-
65
- return response.choices[0].message.content
 
 
1
  from loguru import logger
2
+ from pydantic import BaseModel
3
+ from typing import Collection, Self
 
4
 
5
  from ctp_slack_bot.core import Settings
6
+ from ctp_slack_bot.enums import EventType
7
+ from ctp_slack_bot.models import Chunk, SlackMessage, SlackResponse
8
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
9
+ from ctp_slack_bot.services.language_model_service import LanguageModelService
10
 
11
 
12
class AnswerRetrievalService(BaseModel):
    """
    Service for context-based answer retrieval from a language model.
    """

    settings: Settings  # Application configuration
    event_brokerage_service: EventBrokerageService  # Bus used to publish the outgoing response
    language_model_service: LanguageModelService  # Backend that generates the answer

    class Config:
        frozen=True

    def __init__(self: Self, **data) -> None:
        super().__init__(**data)
        logger.debug("Created {}", self.__class__.__name__)

    async def push(self: Self, question: SlackMessage, context: Collection[Chunk]) -> None:
        """Answer *question* using *context* and publish the reply to Slack."""
        channel_to_respond_to = question.channel
        # Reply in the existing thread when there is one; otherwise start a
        # thread on the message itself.
        thread_to_respond_to = question.thread_ts if question.thread_ts else question.ts
        # NOTE(review): answer_question is called without await inside an
        # async method — confirm it is a regular (blocking) function, not a
        # coroutine, or the answer would be a coroutine object here.
        answer = self.language_model_service.answer_question(question.text, context)
        logger.debug("Pushing response to channel {} and thread {}: {}", channel_to_respond_to, thread_to_respond_to, answer)
        slack_response = SlackResponse(text=answer, channel=channel_to_respond_to, thread_ts=thread_to_respond_to)
        await self.event_brokerage_service.publish(EventType.OUTGOING_SLACK_RESPONSE, slack_response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ctp_slack_bot/services/application_database_service.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from loguru import logger
3
+ from pydantic import BaseModel, PrivateAttr
4
+ from typing import Iterable, Mapping, Self
5
+
6
+ from ctp_slack_bot.core import Settings
7
+ from ctp_slack_bot.db import MongoDB
8
+
9
+
10
class ApplicationDatabaseService(BaseModel):
    """Service for retrieving and persisting application state."""

    settings: Settings  # Application configuration
    mongo_db: MongoDB # TODO: This should be replaced following the repository pattern―one repository class per collection.

    class Config:
        frozen=True

    def __init__(self: Self, **data) -> None:
        super().__init__(**data)
        logger.debug("Created {}", self.__class__.__name__)

    async def get_last_modification_times_by_file_paths(self: Self, file_paths: Iterable[str]) -> Mapping[str, datetime]:
        """Retrieve the last modification time for each file path.

        Raises:
            NotImplementedError: persistence has not been implemented yet.
        """
        raise NotImplementedError() # TODO

    async def set_last_modification_time_by_file_path(self: Self, file_path: str, modification_time: datetime) -> None:
        """Set the last modification time for a file path.

        Raises:
            NotImplementedError: persistence has not been implemented yet.
        """
        raise NotImplementedError() # TODO
src/ctp_slack_bot/services/content_ingestion_service.py CHANGED
@@ -1,8 +1,11 @@
1
  from loguru import logger
2
- from pydantic import BaseModel, model_validator
3
- from typing import Self
4
 
5
  from ctp_slack_bot.core import Settings
 
 
 
6
  from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
7
  from ctp_slack_bot.services.vectorization_service import VectorizationService
8
 
@@ -12,10 +15,35 @@ class ContentIngestionService(BaseModel):
12
  """
13
 
14
  settings: Settings
 
15
  vector_database_service: VectorDatabaseService
16
  vectorization_service: VectorizationService
17
 
18
- @model_validator(mode='after')
19
- def post_init(self: Self) -> Self:
 
 
 
 
 
20
  logger.debug("Created {}", self.__class__.__name__)
21
- return self
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from loguru import logger
2
+ from pydantic import BaseModel
3
+ from typing import Self, Sequence
4
 
5
  from ctp_slack_bot.core import Settings
6
+ from ctp_slack_bot.enums import EventType
7
+ from ctp_slack_bot.models import Chunk, Content, SlackMessage
8
+ from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
9
  from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
10
  from ctp_slack_bot.services.vectorization_service import VectorizationService
11
 
 
15
  """
16
 
17
  settings: Settings
18
+ event_brokerage_service: EventBrokerageService
19
  vector_database_service: VectorDatabaseService
20
  vectorization_service: VectorizationService
21
 
22
+ class Config:
23
+ frozen=True
24
+
25
def __init__(self: Self, **data) -> None:
    """Construct the service and register its handlers for incoming content and Slack messages."""
    super().__init__(**data)
    for event_type, handler in (
        (EventType.INCOMING_CONTENT, self.process_incoming_content),
        (EventType.INCOMING_SLACK_MESSAGE, self.process_incoming_slack_message),
    ):
        self.event_brokerage_service.subscribe(event_type, handler)
    logger.debug("Created {}", self.__class__.__name__)
30
+
31
async def process_incoming_content(self: Self, content: Content) -> None:
    """Handle an INCOMING_CONTENT event: chunk, vectorize, and persist the content."""
    logger.debug("Content ingestion service received content with metadata: {}", content.get_metadata())
    # TODO: skip content whose ID already exists in the vector database,
    # e.g. via self.vector_database_service.has_content(content.get_id()).
    content_chunks = content.get_chunks()
    await self.__vectorize_and_store_chunks_in_database(content_chunks)
    logger.debug("Stored {} vectorized chunk(s) in the database.", len(content_chunks))
39
+
40
async def process_incoming_slack_message(self: Self, slack_message: SlackMessage) -> None:
    """Handle an INCOMING_SLACK_MESSAGE event: chunk, vectorize, and persist the message."""
    logger.debug("Content ingestion service received a Slack message: {}", slack_message.text)
    message_chunks = slack_message.get_chunks()
    await self.__vectorize_and_store_chunks_in_database(message_chunks)
    logger.debug("Stored {} vectorized chunk(s) in the database.", len(message_chunks))
45
+
46
async def __vectorize_and_store_chunks_in_database(self: Self, chunks: Sequence[Chunk]) -> None:
    """Embed the given chunks and write the vectorized results to the vector store."""
    embedded_chunks = self.vectorization_service.vectorize(chunks)  # TODO
    await self.vector_database_service.store(embedded_chunks)  # TODO
49
+
src/ctp_slack_bot/services/context_retrieval_service.py CHANGED
@@ -1,9 +1,9 @@
1
  from loguru import logger
2
- from pydantic import BaseModel, model_validator
3
- from typing import Self, List
4
 
5
  from ctp_slack_bot.core.config import Settings
6
- from ctp_slack_bot.models import RetreivedContext, SlackMessage, VectorQuery
7
  from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
8
  from ctp_slack_bot.services.vectorization_service import VectorizationService
9
 
@@ -16,57 +16,51 @@ class ContextRetrievalService(BaseModel):
16
  vectorization_service: VectorizationService
17
  vector_database_service: VectorDatabaseService
18
 
19
- @model_validator(mode='after')
20
- def post_init(self: Self) -> Self:
 
 
 
21
  logger.debug("Created {}", self.__class__.__name__)
22
- return self
23
-
24
- def get_context(self, message: SlackMessage) -> List[RetreivedContext]:
25
  """
26
- Retrieve relevant context for a given Slack message.
27
-
28
- This function:
29
- 1. Extracts the question text from the message
30
- 2. Vectorizes the question using VectorizationService
31
- 3. Queries VectorDatabaseService for similar context
32
- 4. Returns the relevant context as a list of RetreivedContext objects
33
 
34
  Args:
35
  message: The SlackMessage containing the user's question
36
 
37
  Returns:
38
- List[RetreivedContext]: List of retrieved context items with similarity scores
39
  """
40
- if not message.is_question:
41
- logger.debug(f"Message {message.key} is not a question, skipping context retrieval")
 
 
 
 
 
 
 
42
  return []
 
 
 
 
 
 
 
43
 
 
44
  try:
45
- # Vectorize the message text
46
- embeddings = self.vectorization_service.get_embeddings([message.text])
47
- if embeddings is None or len(embeddings) == 0:
48
- logger.error(f"Failed to generate embedding for message: {message.key}")
49
- return []
50
-
51
- query_embedding = embeddings[0].tolist()
52
-
53
- # Create vector query
54
- vector_query = VectorQuery(
55
- query_text=message.text,
56
- k=self.settings.TOP_K_MATCHES,
57
- score_threshold=0.7 # Minimum similarity threshold
58
- )
59
-
60
- # Search for similar content in vector database
61
- context_results = self.vector_database_service.search_by_similarity(
62
- query=vector_query,
63
- query_embedding=query_embedding
64
- )
65
-
66
- logger.info(f"Retrieved {len(context_results)} context items for message: {message.key}")
67
- return context_results
68
-
69
  except Exception as e:
70
- logger.error(f"Error retrieving context for message {message.key}: {str(e)}")
71
  return []
72
-
 
 
 
 
1
  from loguru import logger
2
+ from pydantic import BaseModel
3
+ from typing import Self, Sequence
4
 
5
  from ctp_slack_bot.core.config import Settings
6
+ from ctp_slack_bot.models import Chunk, SlackMessage, VectorQuery, VectorizedChunk
7
  from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
8
  from ctp_slack_bot.services.vectorization_service import VectorizationService
9
 
 
16
  vectorization_service: VectorizationService
17
  vector_database_service: VectorDatabaseService
18
 
19
+ class Config:
20
+ frozen=True
21
+
22
def __init__(self: Self, **data) -> None:
    """Construct the service and log its creation."""
    super().__init__(**data)
    logger.debug("Created {}", self.__class__.__name__)
25
+
26
async def get_context(self: Self, message: SlackMessage) -> Sequence[Chunk]:
    """
    Retrieve relevant context for a given SlackMessage by vectorizing the message and
    querying the vectorstore.

    Args:
        message: The SlackMessage containing the user's question

    Returns:
        Sequence[Chunk]: retrieved context items that scored above the similarity threshold;
        empty when vectorization produced nothing or the search failed
    """
    # Extract chunks from the message and embed them.
    message_chunks = message.get_chunks()
    vectorized_chunks = self.vectorization_service.vectorize(message_chunks)

    # A message typically yields a single chunk; bail out if vectorization produced nothing.
    if not vectorized_chunks:
        logger.warning("No vectorized chunks were created for message")
        return []

    # Query the vector store using the first chunk's embedding.
    query = VectorQuery(
        query_embeddings=vectorized_chunks[0].embedding,
        k=self.settings.TOP_K_MATCHES,
        score_threshold=self.settings.SCORE_THRESHOLD,
        filter_metadata=None  # Can be expanded to include filters based on message metadata.
    )

    # Context retrieval is best-effort: a search failure is logged and yields no context.
    try:
        return await self.vector_database_service.search_by_similarity(query)
    except Exception as e:
        # Use loguru's lazy brace formatting, consistent with the rest of the file.
        logger.error("Error retrieving context: {}", e)
        return []
src/ctp_slack_bot/services/embeddings_model_service.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from loguru import logger
2
+ from openai import OpenAI
3
+ from pydantic import BaseModel, PrivateAttr
4
+ from typing import Any, Dict, Sequence, Self
5
+
6
+ from ctp_slack_bot.core import Settings
7
+
8
class EmbeddingsModelService(BaseModel):
    """
    Service for embeddings model operations.
    """

    settings: Settings
    # Annotate with the actual client type; PrivateAttr is a field declaration, not a type.
    _open_ai_client: OpenAI = PrivateAttr()

    class Config:
        frozen = True

    def __init__(self: Self, **data: Dict[str, Any]) -> None:
        """Construct the service, create the OpenAI client, and log the creation."""
        super().__init__(**data)
        self._open_ai_client = OpenAI(api_key=self.settings.OPENAI_API_KEY.get_secret_value())
        logger.debug("Created {}", self.__class__.__name__)

    def get_embeddings(self: Self, texts: Sequence[str]) -> Sequence[Sequence[float]]:
        """Get embeddings for a collection of texts using OpenAI’s API.

        Args:
            texts: Sequence of text chunks to embed

        Returns:
            A tuple of embedding vectors, one per input text, each of length VECTOR_DIMENSION

        Raises:
            ValueError: If the embedding dimensions don't match expected size
        """
        logger.debug("Creating embeddings for {} text string(s)…", len(texts))
        response = self._open_ai_client.embeddings.create(
            model=self.settings.EMBEDDING_MODEL,
            input=texts,
            encoding_format="float"  # Ensure we get raw float values.
        )
        embeddings = tuple(tuple(data.embedding) for data in response.data)
        # Bug fix: the previous `match embeddings: case (first, _)` sequence pattern only
        # matched when EXACTLY two embeddings were returned, silently skipping the dimension
        # check for one or three-plus texts. Validate the first vector whenever any exist.
        if embeddings and len(embeddings[0]) != self.settings.VECTOR_DIMENSION:
            logger.error("Embedding dimension mismatch and/or misconfiguration: expected configured dimension {}, but got {}.", self.settings.VECTOR_DIMENSION, len(embeddings[0]))
            raise ValueError()  # TODO: raise a more specific type.
        return embeddings
src/ctp_slack_bot/services/event_brokerage_service.py CHANGED
@@ -1,38 +1,47 @@
1
- # from asyncio import create_task
 
2
  from loguru import logger
3
- from openai import OpenAI
4
- from pydantic import BaseModel, model_validator
5
  from typing import Any, Callable, Dict, List, Self
6
 
7
- from ctp_slack_bot.core import Settings
8
- from ctp_slack_bot.models import RetreivedContext, SlackMessage
9
- from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
10
- from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
11
-
12
 
13
  class EventBrokerageService(BaseModel):
14
  """
15
  Service for brokering events between services.
16
  """
17
 
18
- subscribers: Dict[str, List[Callable]] = {}
19
 
20
  class Config:
21
- arbitrary_types_allowed = True
22
 
23
- @model_validator(mode='after')
24
- def post_init(self: Self) -> Self:
25
  logger.debug("Created {}", self.__class__.__name__)
26
- return self
27
 
28
- def subscribe(self: Self, event_type: str, callback: Callable) -> None:
29
  """Subscribe to an event type with a callback function."""
30
- if event_type not in self.subscribers:
31
- self.subscribers[event_type] = []
32
- self.subscribers[event_type].append(callback)
33
-
34
- def publish(self: Self, event_type: str, data: Any = None) -> None:
 
35
  """Publish an event with optional data to all subscribers."""
36
- if event_type in self.subscribers:
37
- for callback in self.subscribers[event_type]:
38
- callback(data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from asyncio import create_task, iscoroutinefunction, to_thread
2
+ from collections import defaultdict
3
  from loguru import logger
4
+ from pydantic import BaseModel, PrivateAttr
 
5
  from typing import Any, Callable, Dict, List, Self
6
 
7
+ from ctp_slack_bot.enums import EventType
 
 
 
 
8
 
9
class EventBrokerageService(BaseModel):
    """
    Service for brokering events between services.
    """

    # Maps each event type to the list of callbacks subscribed to it.
    _subscribers: Dict[EventType, List[Callable]] = PrivateAttr(default_factory=lambda: defaultdict(list))

    class Config:
        frozen = True

    def __init__(self: Self, **data) -> None:
        """Construct the service and log its creation."""
        super().__init__(**data)
        logger.debug("Created {}", self.__class__.__name__)

    def subscribe(self: Self, type: EventType, callback: Callable) -> None:  # NOTE(review): `type` shadows the builtin, but renaming could break keyword callers.
        """Subscribe to an event type with a callback function."""
        logger.debug("1 new subscriber is listening for {} events.", type)
        subscribers = self._subscribers[type]
        subscribers.append(callback)
        logger.debug("Event type {} has {} subscriber(s).", type, len(subscribers))

    async def publish(self: Self, type: EventType, data: Any = None) -> None:
        """Publish an event with optional data to all subscribers.

        Coroutine callbacks are scheduled on the running event loop; synchronous
        callbacks are dispatched to a worker thread so they cannot block the loop.
        """
        subscribers = self._subscribers[type]
        if not subscribers:
            # Bug fix: the original passed three arguments to a two-placeholder format
            # string (including a pointless len() of an empty list).
            logger.debug("No subscribers handle event {}: {}", type, data)
            return
        logger.debug("Broadcasting event {} to {} subscriber(s): {}", type, len(subscribers), data)
        for callback in subscribers:
            if iscoroutinefunction(callback):
                task = create_task(callback(data))
                # Surface exceptions from fire-and-forget tasks, which would otherwise be swallowed.
                task.add_done_callback(lambda done_task: logger.error("Error in asynchronous event callback handling event {}: {}", type, done_task.exception())
                                       if done_task.exception()
                                       else None)
            else:
                try:
                    create_task(to_thread(callback, data))
                except Exception as e:
                    logger.error("Error scheduling synchronous callback to handle event {}: {}", type, e)
src/ctp_slack_bot/services/google_drive_access.py DELETED
@@ -1,623 +0,0 @@
1
- """
2
- Easy Google Drive Access
3
-
4
- A simplified module for accessing Google Drive files in nested folders.
5
- Designed to make it as easy as possible to access files using path-like syntax.
6
- """
7
-
8
- import os
9
- import pickle
10
- import io
11
- import datetime
12
- from typing import List, Dict, Optional, Any, Union
13
-
14
- from google.oauth2.credentials import Credentials
15
- from google_auth_oauthlib.flow import InstalledAppFlow
16
- from google.auth.transport.requests import Request
17
- from googleapiclient.discovery import build
18
- from googleapiclient.http import MediaIoBaseDownload
19
- from googleapiclient.errors import HttpError
20
-
21
-
22
- class EasyGoogleDrive:
23
- """
24
- Simplified Google Drive access focused on accessing files in nested folders.
25
- """
26
-
27
- # Define the scopes needed for the application
28
- SCOPES = ['https://www.googleapis.com/auth/drive']
29
-
30
- # Define common MIME types
31
- MIME_TYPES = {
32
- 'folder': 'application/vnd.google-apps.folder',
33
- 'document': 'application/vnd.google-apps.document',
34
- 'spreadsheet': 'application/vnd.google-apps.spreadsheet',
35
- 'text': 'text/plain',
36
- 'pdf': 'application/pdf',
37
- 'image': 'image/jpeg',
38
- 'video': 'video/mp4',
39
- 'audio': 'audio/mpeg',
40
- }
41
-
42
- # Define metadata fields to retrieve
43
- FILE_FIELDS = 'id, name, mimeType, createdTime, modifiedTime, size, description, webViewLink, thumbnailLink, owners, shared, sharingUser, lastModifyingUser, capabilities, permissions'
44
- FOLDER_FIELDS = 'id, name, createdTime, modifiedTime, description, webViewLink, owners, shared, sharingUser, lastModifyingUser, capabilities, permissions'
45
-
46
- def __init__(self, credentials_dir: str = 'credentials'):
47
- """Initialize the Google Drive access."""
48
- self.credentials_dir = credentials_dir
49
- self.credentials_path = os.path.join(credentials_dir, 'client_secret.json')
50
- self.token_path = os.path.join(credentials_dir, 'token.pickle')
51
-
52
- # Ensure credentials directory exists
53
- os.makedirs(credentials_dir, exist_ok=True)
54
-
55
- # Initialize the Drive API service
56
- self.service = build('drive', 'v3', credentials=self._get_credentials())
57
-
58
- # Cache for folder IDs to avoid repeated lookups
59
- self.folder_id_cache = {}
60
-
61
- def _get_credentials(self) -> Credentials:
62
- """Get and refresh Google Drive API credentials."""
63
- creds = None
64
-
65
- # Load existing token if it exists
66
- if os.path.exists(self.token_path):
67
- with open(self.token_path, 'rb') as token:
68
- creds = pickle.load(token)
69
-
70
- # If credentials need refresh or don't exist
71
- if not creds or not creds.valid:
72
- if creds and creds.expired and creds.refresh_token:
73
- creds.refresh(Request())
74
- else:
75
- if not os.path.exists(self.credentials_path):
76
- raise FileNotFoundError(
77
- f"Client secrets file not found at {self.credentials_path}. "
78
- "Please follow the setup instructions in the README."
79
- )
80
-
81
- flow = InstalledAppFlow.from_client_secrets_file(
82
- self.credentials_path, self.SCOPES)
83
- creds = flow.run_local_server(port=0)
84
-
85
- # Save the credentials for future use
86
- with open(self.token_path, 'wb') as token:
87
- pickle.dump(creds, token)
88
-
89
- return creds
90
-
91
- def _format_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
92
- """
93
- Format metadata for easier reading and usage.
94
-
95
- Args:
96
- metadata: Raw metadata from Google Drive API
97
-
98
- Returns:
99
- Formatted metadata dictionary
100
- """
101
- formatted = metadata.copy()
102
-
103
- # Format dates
104
- for date_field in ['createdTime', 'modifiedTime']:
105
- if date_field in formatted:
106
- try:
107
- # Convert ISO 8601 string to datetime object
108
- dt = datetime.datetime.fromisoformat(formatted[date_field].replace('Z', '+00:00'))
109
- formatted[date_field] = dt
110
- # Add a formatted date string for easier reading
111
- formatted[f"{date_field}Formatted"] = dt.strftime('%Y-%m-%d %H:%M:%S')
112
- except (ValueError, AttributeError):
113
- pass
114
-
115
- # Format size
116
- if 'size' in formatted and formatted['size']:
117
- try:
118
- size_bytes = int(formatted['size'])
119
- # Add human-readable size
120
- if size_bytes < 1024:
121
- formatted['sizeFormatted'] = f"{size_bytes} B"
122
- elif size_bytes < 1024 * 1024:
123
- formatted['sizeFormatted'] = f"{size_bytes / 1024:.1f} KB"
124
- elif size_bytes < 1024 * 1024 * 1024:
125
- formatted['sizeFormatted'] = f"{size_bytes / (1024 * 1024):.1f} MB"
126
- else:
127
- formatted['sizeFormatted'] = f"{size_bytes / (1024 * 1024 * 1024):.1f} GB"
128
- except (ValueError, TypeError):
129
- pass
130
-
131
- # Extract owner names
132
- if 'owners' in formatted and formatted['owners']:
133
- formatted['ownerNames'] = [owner.get('displayName', 'Unknown') for owner in formatted['owners']]
134
- formatted['ownerEmails'] = [owner.get('emailAddress', 'Unknown') for owner in formatted['owners']]
135
-
136
- # Add file type description
137
- if 'mimeType' in formatted:
138
- mime_type = formatted['mimeType']
139
- for key, value in self.MIME_TYPES.items():
140
- if mime_type == value:
141
- formatted['fileType'] = key
142
- break
143
- else:
144
- # If not found in our predefined types
145
- formatted['fileType'] = mime_type.split('/')[-1]
146
-
147
- return formatted
148
-
149
- def get_folder_id(self, folder_path: str) -> Optional[str]:
150
- """
151
- Get a folder ID from a path like 'folder1/folder2/folder3'.
152
-
153
- Args:
154
- folder_path: Path to the folder, using '/' as separator
155
-
156
- Returns:
157
- The folder ID if found, None otherwise
158
- """
159
- # Check if we've already resolved this path
160
- if folder_path in self.folder_id_cache:
161
- return self.folder_id_cache[folder_path]
162
-
163
- # If it looks like an ID already, return it
164
- if len(folder_path) > 25 and '/' not in folder_path:
165
- return folder_path
166
-
167
- # Split the path into components
168
- parts = folder_path.split('/')
169
-
170
- # Start from the root
171
- current_folder_id = None
172
- current_path = ""
173
-
174
- # Traverse the path one folder at a time
175
- for i, folder_name in enumerate(parts):
176
- if not folder_name: # Skip empty parts
177
- continue
178
-
179
- # Update the current path for caching
180
- if current_path:
181
- current_path += f"/{folder_name}"
182
- else:
183
- current_path = folder_name
184
-
185
- # Check if we've already resolved this subpath
186
- if current_path in self.folder_id_cache:
187
- current_folder_id = self.folder_id_cache[current_path]
188
- continue
189
-
190
- # Search for the folder by name
191
- query = f"mimeType='{self.MIME_TYPES['folder']}' and name='{folder_name}'"
192
- if current_folder_id:
193
- query += f" and '{current_folder_id}' in parents"
194
-
195
- try:
196
- results = self.service.files().list(
197
- q=query,
198
- spaces='drive',
199
- fields='files(id, name)',
200
- pageSize=10
201
- ).execute()
202
-
203
- files = results.get('files', [])
204
- if not files:
205
- # Try a more flexible search if exact match fails
206
- query = query.replace(f"name='{folder_name}'", f"name contains '{folder_name}'")
207
- results = self.service.files().list(
208
- q=query,
209
- spaces='drive',
210
- fields='files(id, name)',
211
- pageSize=10
212
- ).execute()
213
-
214
- files = results.get('files', [])
215
- if not files:
216
- print(f"Could not find folder '{folder_name}' in path '{folder_path}'")
217
- return None
218
-
219
- # Use the first match
220
- current_folder_id = files[0]['id']
221
-
222
- # Cache this result
223
- self.folder_id_cache[current_path] = current_folder_id
224
-
225
- except HttpError as error:
226
- print(f"Error finding folder: {error}")
227
- return None
228
-
229
- return current_folder_id
230
-
231
- def get_folders_in_folder(self, folder_path: str, include_metadata: bool = True) -> List[Dict[str, Any]]:
232
- """
233
- Get all subfolders in a folder specified by path.
234
-
235
- Args:
236
- folder_path: Path to the folder, using '/' as separator
237
- include_metadata: Whether to include detailed metadata (default: True)
238
-
239
- Returns:
240
- List of folder metadata dictionaries
241
- """
242
- # Get the folder ID
243
- folder_id = self.get_folder_id(folder_path)
244
- if not folder_id:
245
- print(f"Could not find folder: '{folder_path}'")
246
- return []
247
-
248
- # List all folders in this folder
249
- query = f"'{folder_id}' in parents and mimeType = '{self.MIME_TYPES['folder']}'"
250
-
251
- try:
252
- results = self.service.files().list(
253
- q=query,
254
- spaces='drive',
255
- fields=f'files({self.FOLDER_FIELDS})' if include_metadata else 'files(id, name)',
256
- pageSize=1000
257
- ).execute()
258
-
259
- folders = results.get('files', [])
260
-
261
- # Format metadata if requested
262
- if include_metadata and folders:
263
- folders = [self._format_metadata(folder) for folder in folders]
264
-
265
- if folders:
266
- print(f"Found {len(folders)} subfolders in '{folder_path}':")
267
- for folder in folders:
268
- if include_metadata and 'createdTimeFormatted' in folder:
269
- print(f" - {folder['name']} (Created: {folder['createdTimeFormatted']})")
270
- else:
271
- print(f" - {folder['name']}")
272
- else:
273
- print(f"No subfolders found in '{folder_path}'")
274
-
275
- return folders
276
-
277
- except HttpError as error:
278
- print(f"Error listing folders: {error}")
279
- return []
280
-
281
- def get_files_in_folder(self, folder_path: str, include_metadata: bool = True, include_content: bool = False) -> List[Dict[str, Any]]:
282
- """
283
- Get all files in a folder specified by path.
284
-
285
- Args:
286
- folder_path: Path to the folder, using '/' as separator
287
- include_metadata: Whether to include detailed metadata (default: True)
288
- include_content: Whether to include file content (default: False)
289
-
290
- Returns:
291
- List of file metadata dictionaries, optionally including file content
292
- """
293
- # Get the folder ID
294
- folder_id = self.get_folder_id(folder_path)
295
- if not folder_id:
296
- print(f"Could not find folder: '{folder_path}'")
297
- return []
298
-
299
- # List all non-folder files in this folder
300
- query = f"'{folder_id}' in parents and mimeType != '{self.MIME_TYPES['folder']}'"
301
-
302
- try:
303
- results = self.service.files().list(
304
- q=query,
305
- spaces='drive',
306
- fields=f'files({self.FILE_FIELDS})' if include_metadata else 'files(id, name, mimeType)',
307
- pageSize=1000
308
- ).execute()
309
-
310
- files = results.get('files', [])
311
-
312
- # Format metadata if requested
313
- if include_metadata and files:
314
- files = [self._format_metadata(file) for file in files]
315
-
316
- # Add file content if requested
317
- if include_content and files:
318
- for file in files:
319
- try:
320
- # Skip files that are likely not text-based
321
- if any(ext in file['name'].lower() for ext in ['.jpg', '.png', '.gif', '.mp3', '.mp4']):
322
- print(f"Skipping content for non-text file: {file['name']}")
323
- file['file_content'] = None
324
- continue
325
-
326
- # Read the file content
327
- content = self.read_file_from_object(file)
328
- file['file_content'] = content
329
-
330
- if content is not None:
331
- print(f"Successfully read content for: {file['name']} ({len(content)} characters)")
332
- else:
333
- print(f"Unable to read content for: {file['name']}")
334
- except Exception as e:
335
- print(f"Error reading content for {file['name']}: {e}")
336
- file['file_content'] = None
337
-
338
- if files:
339
- print(f"Found {len(files)} files in '{folder_path}':")
340
- for file in files:
341
- if include_metadata and 'createdTimeFormatted' in file:
342
- print(f" - {file['name']} ({file.get('fileType', 'Unknown')}, Created: {file['createdTimeFormatted']})")
343
- else:
344
- print(f" - {file['name']} ({file.get('mimeType', 'Unknown')})")
345
- else:
346
- print(f"No files found in '{folder_path}'")
347
-
348
- return files
349
-
350
- except HttpError as error:
351
- print(f"Error listing files: {error}")
352
- return []
353
-
354
- def get_file(self, file_name: str, folder_path: str, include_metadata: bool = True, include_content: bool = False) -> Optional[Dict[str, Any]]:
355
- """
356
- Get a specific file by name from a folder.
357
-
358
- Args:
359
- file_name: Name of the file to get
360
- folder_path: Path to the folder containing the file
361
- include_metadata: Whether to include detailed metadata (default: True)
362
- include_content: Whether to include file content (default: False)
363
-
364
- Returns:
365
- File metadata dictionary, optionally including content, or None if file not found
366
- """
367
- # Get the folder ID
368
- folder_id = self.get_folder_id(folder_path)
369
- if not folder_id:
370
- print(f"Could not find folder: '{folder_path}'")
371
- return None
372
-
373
- # Find the file by name in this folder
374
- query = f"'{folder_id}' in parents and name = '{file_name}'"
375
-
376
- try:
377
- results = self.service.files().list(
378
- q=query,
379
- spaces='drive',
380
- fields=f'files({self.FILE_FIELDS})' if include_metadata else 'files(id, name, mimeType)',
381
- pageSize=1
382
- ).execute()
383
-
384
- files = results.get('files', [])
385
- if not files:
386
- # Try a more flexible search
387
- query = query.replace(f"name = '{file_name}'", f"name contains '{file_name}'")
388
- results = self.service.files().list(
389
- q=query,
390
- spaces='drive',
391
- fields=f'files({self.FILE_FIELDS})' if include_metadata else 'files(id, name, mimeType)',
392
- pageSize=10
393
- ).execute()
394
-
395
- files = results.get('files', [])
396
- if not files:
397
- print(f"Could not find file '{file_name}' in '{folder_path}'")
398
- return None
399
-
400
- # Use the first match
401
- file = files[0]
402
-
403
- # Format metadata if requested
404
- if include_metadata:
405
- file = self._format_metadata(file)
406
-
407
- # Add file content if requested
408
- if include_content:
409
- try:
410
- # Skip files that are likely not text-based
411
- if any(ext in file['name'].lower() for ext in ['.jpg', '.png', '.gif', '.mp3', '.mp4']):
412
- print(f"Skipping content for non-text file: {file['name']}")
413
- file['file_content'] = None
414
- else:
415
- # Read the file content
416
- content = self.read_file_from_object(file)
417
- file['file_content'] = content
418
-
419
- if content is not None:
420
- print(f"Successfully read content for: {file['name']} ({len(content)} characters)")
421
- else:
422
- print(f"Unable to read content for: {file['name']}")
423
- except Exception as e:
424
- print(f"Error reading content for {file['name']}: {e}")
425
- file['file_content'] = None
426
-
427
- print(f"Found file: {file['name']}")
428
- return file
429
-
430
- except HttpError as error:
431
- print(f"Error getting file: {error}")
432
- return None
433
-
434
- def get_all_files_in_folder(self, folder_path: str, include_metadata: bool = True, include_content: bool = False) -> List[Dict[str, Any]]:
435
- """
436
- Get all items (files and folders) in a folder specified by path.
437
-
438
- Args:
439
- folder_path: Path to the folder, using '/' as separator
440
- include_metadata: Whether to include detailed metadata (default: True)
441
- include_content: Whether to include file content (default: False)
442
-
443
- Returns:
444
- List of file and folder metadata dictionaries, optionally including file content
445
- """
446
- # Get the folder ID
447
- folder_id = self.get_folder_id(folder_path)
448
- if not folder_id:
449
- print(f"Could not find folder: '{folder_path}'")
450
- return []
451
-
452
- # List all items in this folder
453
- query = f"'{folder_id}' in parents"
454
-
455
- try:
456
- results = self.service.files().list(
457
- q=query,
458
- spaces='drive',
459
- fields=f'files({self.FILE_FIELDS})' if include_metadata else 'files(id, name, mimeType)',
460
- pageSize=1000
461
- ).execute()
462
-
463
- items = results.get('files', [])
464
-
465
- # Format metadata if requested
466
- if include_metadata and items:
467
- items = [self._format_metadata(item) for item in items]
468
-
469
- # Add file content if requested
470
- if include_content and items:
471
- for item in items:
472
- # Skip folders and non-text files
473
- if item.get('mimeType') == self.MIME_TYPES['folder'] or any(ext in item['name'].lower() for ext in ['.jpg', '.png', '.gif', '.mp3', '.mp4']):
474
- item['file_content'] = None
475
- continue
476
-
477
- try:
478
- # Read the file content
479
- content = self.read_file_from_object(item)
480
- item['file_content'] = content
481
-
482
- if content is not None:
483
- print(f"Successfully read content for: {item['name']} ({len(content)} characters)")
484
- else:
485
- print(f"Unable to read content for: {item['name']}")
486
- except Exception as e:
487
- print(f"Error reading content for {item['name']}: {e}")
488
- item['file_content'] = None
489
-
490
- if items:
491
- print(f"Found {len(items)} items in '{folder_path}':")
492
- for item in items:
493
- if include_metadata and 'createdTimeFormatted' in item:
494
- item_type = 'Folder' if item.get('mimeType') == self.MIME_TYPES['folder'] else item.get('fileType', 'Unknown')
495
- print(f" - {item['name']} ({item_type}, Created: {item['createdTimeFormatted']})")
496
- else:
497
- item_type = 'Folder' if item.get('mimeType') == self.MIME_TYPES['folder'] else item.get('mimeType', 'Unknown')
498
- print(f" - {item['name']} ({item_type})")
499
- else:
500
- print(f"No items found in '{folder_path}'")
501
-
502
- return items
503
-
504
- except HttpError as error:
505
- print(f"Error listing items: {error}")
506
- return []
507
-
508
- def file_exists(self, file_name: str, folder_path: str) -> bool:
509
- """
510
- Check if a file exists at the specified path in Google Drive.
511
-
512
- Args:
513
- file_name: Name of the file to check
514
- folder_path: Path to the folder containing the file
515
-
516
- Returns:
517
- True if the file exists, False otherwise
518
- """
519
- # Get the folder ID
520
- folder_id = self.get_folder_id(folder_path)
521
- if not folder_id:
522
- print(f"Could not find folder: '{folder_path}'")
523
- return False
524
-
525
- # Check if the file exists in this folder
526
- query = f"'{folder_id}' in parents and name = '{file_name}'"
527
-
528
- try:
529
- results = self.service.files().list(
530
- q=query,
531
- spaces='drive',
532
- fields='files(id, name)',
533
- pageSize=1
534
- ).execute()
535
-
536
- files = results.get('files', [])
537
- if not files:
538
- # Try a more flexible search
539
- query = query.replace(f"name = '{file_name}'", f"name contains '{file_name}'")
540
- results = self.service.files().list(
541
- q=query,
542
- spaces='drive',
543
- fields='files(id, name)',
544
- pageSize=10
545
- ).execute()
546
-
547
- files = results.get('files', [])
548
- if not files:
549
- print(f"File '{file_name}' does not exist in '{folder_path}'")
550
- return False
551
-
552
- # File exists
553
- print(f"File '{file_name}' exists in '{folder_path}'")
554
- return True
555
-
556
- except HttpError as error:
557
- print(f"Error checking if file exists: {error}")
558
- return False
559
-
560
- def get_file_modified_time(self, file_name: str, folder_path: str) -> Optional[datetime.datetime]:
561
- """
562
- Get the last modified time of a file.
563
-
564
- Args:
565
- file_name: Name of the file
566
- folder_path: Path to the folder containing the file
567
-
568
- Returns:
569
- The last modified time as a datetime object, or None if the file doesn't exist
570
- """
571
- # Get the file metadata
572
- file = self.get_file(file_name, folder_path, include_metadata=True)
573
- if not file:
574
- return None
575
-
576
- # Return the modified time
577
- return file.get('modifiedTime')
578
-
579
- def read_file_from_object(self, file_object: Dict[str, Any]) -> Optional[str]:
580
- """
581
- Read the contents of a file using a file object.
582
-
583
- Args:
584
- file_object: A Google file object with at least 'id' and 'mimeType' fields
585
-
586
- Returns:
587
- The file contents as a string, or None if the file couldn't be read
588
- """
589
- file_id = file_object.get('id')
590
- mime_type = file_object.get('mimeType')
591
-
592
- if not file_id or not mime_type:
593
- print("File object is missing 'id' or 'mimeType' fields.")
594
- return None
595
-
596
- try:
597
- # Read the file based on its type
598
- if mime_type == self.MIME_TYPES['document']:
599
- # Export Google Doc as plain text
600
- response = self.service.files().export(
601
- fileId=file_id,
602
- mimeType='text/plain'
603
- ).execute()
604
- return response.decode('utf-8')
605
-
606
- else:
607
- # Download regular files
608
- request = self.service.files().get_media(fileId=file_id)
609
- fh = io.BytesIO()
610
- downloader = MediaIoBaseDownload(fh, request)
611
-
612
- done = False
613
- while not done:
614
- _, done = downloader.next_chunk()
615
-
616
- return fh.getvalue().decode('utf-8')
617
-
618
- except HttpError as error:
619
- print(f"Error reading file: {error}")
620
- return None
621
- except Exception as e:
622
- print(f"Error decoding file content: {e}")
623
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ctp_slack_bot/services/google_drive_basic_usage.py DELETED
@@ -1,178 +0,0 @@
1
- """
2
- Basic Usage Examples for EasyGoogleDrive
3
-
4
- This file demonstrates how to use the EasyGoogleDrive class to interact with Google Drive.
5
- It provides examples of the main functionality without printing all results to keep the output clean.
6
- """
7
-
8
- from google_drive_access import EasyGoogleDrive
9
- import datetime
10
-
11
- def main():
12
- """
13
- Main function demonstrating the basic usage of EasyGoogleDrive.
14
- """
15
- # Initialize the Google Drive client
16
- # This will prompt for authentication the first time it's run
17
- drive = EasyGoogleDrive()
18
-
19
- # Example folder path - replace with your actual folder path
20
- folder_path = "Spring-2025-BAI"
21
- subfolder_path = "Spring-2025-BAI/transcripts"
22
-
23
- print("=== Basic Usage Examples for EasyGoogleDrive ===\n")
24
-
25
- # Example 1: List folders in a directory
26
- print("Example 1: Listing folders in a directory")
27
- print("----------------------------------------")
28
- folders = drive.get_folders_in_folder(folder_path)
29
-
30
- # Print only the first 3 folders (if any exist)
31
- if folders:
32
- print(f"Found {len(folders)} folders. Showing first 3:")
33
- for i, folder in enumerate(folders[:3]):
34
- print(f" - {folder['name']} (Created: {folder.get('createdTimeFormatted', 'Unknown')})")
35
- if len(folders) > 3:
36
- print(f" ... and {len(folders) - 3} more folders")
37
- else:
38
- print("No folders found.")
39
- print()
40
-
41
- # Example 2: List files in a directory
42
- print("Example 2: Listing files in a directory")
43
- print("--------------------------------------")
44
- files = drive.get_files_in_folder(subfolder_path)
45
-
46
- # Print only the first 3 files (if any exist)
47
- if files:
48
- print(f"Found {len(files)} files. Showing first 3:")
49
- for i, file in enumerate(files[:3]):
50
- file_type = file.get('fileType', 'Unknown')
51
- created_time = file.get('createdTimeFormatted', 'Unknown')
52
- print(f" - {file['name']} ({file_type}, Created: {created_time})")
53
- if len(files) > 3:
54
- print(f" ... and {len(files) - 3} more files")
55
- else:
56
- print("No files found.")
57
- print()
58
-
59
- # Example 3: Get a specific file
60
- print("Example 3: Getting a specific file")
61
- print("--------------------------------")
62
- # Use the first file found in the previous example, or a default if none were found
63
- file_name = files[-1]['name'] if files and len(files) > 0 else "example.txt"
64
-
65
- file = drive.get_file(file_name, subfolder_path, include_metadata=True)
66
- if file:
67
- print(f"File found: {file['name']}")
68
- print(f" Type: {file.get('fileType', 'Unknown')}")
69
- print(f" Created: {file.get('createdTimeFormatted', 'Unknown')}")
70
- print(f" Modified: {file.get('modifiedTimeFormatted', 'Unknown')}")
71
- print(f" Size: {file.get('sizeFormatted', 'Unknown')}")
72
- else:
73
- print(f"File '{file_name}' not found.")
74
- print()
75
-
76
- # Example 4: Get all items in a folder (files and folders)
77
- print("Example 4: Getting all items in a folder")
78
- print("--------------------------------------")
79
- all_items = drive.get_all_files_in_folder(folder_path)
80
-
81
- # Print only the first 3 items (if any exist)
82
- if all_items:
83
- print(f"Found {len(all_items)} items. Showing first 3:")
84
- for i, item in enumerate(all_items[:3]):
85
- item_type = "Folder" if item.get('mimeType') == drive.MIME_TYPES['folder'] else item.get('fileType', 'Unknown')
86
- created_time = item.get('createdTimeFormatted', 'Unknown')
87
- print(f" - {item['name']} ({item_type}, Created: {created_time})")
88
- if len(all_items) > 3:
89
- print(f" ... and {len(all_items) - 3} more items")
90
- else:
91
- print("No items found.")
92
- print()
93
-
94
- # Example 5: Check if a file exists
95
- print("Example 5: Checking if a file exists")
96
- print("----------------------------------")
97
- # Use the same file name from Example 3
98
- file_to_check = file_name
99
-
100
- exists = drive.file_exists(file_to_check, subfolder_path)
101
- print(f"File '{file_to_check}' {'exists' if exists else 'does not exist'} in '{subfolder_path}'.")
102
- print()
103
-
104
- # Example 6: Get file modified time
105
- print("Example 6: Getting file modified time")
106
- print("-----------------------------------")
107
- # Use the same file name from Example 3
108
- file_to_check_time = file_name
109
-
110
- modified_time = drive.get_file_modified_time(file_to_check_time, subfolder_path)
111
- if modified_time:
112
- print(f"File '{file_to_check_time}' was last modified on: {modified_time}")
113
- else:
114
- print(f"Could not get modified time for '{file_to_check_time}'.")
115
- print()
116
-
117
- # Example 7: Get file with content
118
- print("Example 7: Getting file with content")
119
- print("----------------------------------")
120
- # Use the same file name from Example 3
121
- file_with_content = file_name
122
-
123
- file_with_content_obj = drive.get_file(file_with_content, subfolder_path, include_content=True)
124
- if file_with_content_obj and 'file_content' in file_with_content_obj:
125
- content = file_with_content_obj['file_content']
126
- if content:
127
- print(f"File '{file_with_content}' content (first 100 chars):")
128
- print(f" {content[:100]}...")
129
- else:
130
- print(f"File '{file_with_content}' has no content or content could not be read.")
131
- else:
132
- print(f"File '{file_with_content}' not found or content could not be retrieved.")
133
- print()
134
-
135
- # Example 8: Get contents of all files in a folder
136
- print("Example 8: Getting contents of all files in a folder")
137
- print("------------------------------------------------")
138
- # Get all files with content
139
- all_files_with_content = drive.get_files_in_folder(subfolder_path, include_content=True)
140
-
141
- if all_files_with_content:
142
- print(f"Found {len(all_files_with_content)} files. Showing content preview for first 3:")
143
- for i, file in enumerate(all_files_with_content[:3]):
144
- print(f" File: {file['name']}")
145
- if 'file_content' in file and file['file_content']:
146
- content = file['file_content']
147
- print(f" Content preview: {content[:50]}...")
148
- else:
149
- print(f" No content available or file is not text-based.")
150
-
151
- if len(all_files_with_content) > 3:
152
- print(f" ... and {len(all_files_with_content) - 3} more files with content")
153
- else:
154
- print("No files found or no content could be retrieved.")
155
- print()
156
-
157
- # Example 9: Get content from a specific file using read_file_from_object
158
- print("Example 9: Getting content from a specific file using read_file_from_object")
159
- print("------------------------------------------------------------------------")
160
- # Get a file object first
161
- file_obj = drive.get_file(file_name, subfolder_path)
162
-
163
- if file_obj:
164
- # Read the content directly from the file object
165
- content = drive.read_file_from_object(file_obj)
166
- if content:
167
- print(f"File '{file_obj['name']}' content (first 100 chars):")
168
- print(f" {content[:100]}...")
169
- else:
170
- print(f"File '{file_obj['name']}' has no content or content could not be read.")
171
- else:
172
- print(f"File '{file_name}' not found.")
173
- print()
174
-
175
- print("=== End of Examples ===")
176
-
177
- if __name__ == "__main__":
178
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ctp_slack_bot/services/google_drive_service.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from cachetools import TTLCache
3
+ from google.oauth2 import service_account
4
+ from googleapiclient.discovery import build
5
+ from googleapiclient.http import MediaIoBaseDownload
6
+ from googleapiclient.errors import HttpError
7
+ from io import BytesIO
8
+ from loguru import logger
9
+ from pydantic import BaseModel, PrivateAttr
10
+ from typing import Collection, Dict, List, Optional, Self
11
+
12
+ from ctp_slack_bot.core import Settings
13
+ from ctp_slack_bot.models import GoogleDriveMetadata
14
+
15
+
16
+ FOLDER_MIME_TYPE: str = "application/vnd.google-apps.folder"
17
+ PATH_SEPARATOR: str = "/"
18
+
19
+
20
+ class GoogleDriveService(BaseModel):
21
+ """Service for interacting with Google Drive."""
22
+
23
+ settings: Settings
24
+ _google_drive_client: PrivateAttr = PrivateAttr()
25
+ _folder_cache: PrivateAttr = PrivateAttr(default_factory=lambda: TTLCache(maxsize=256, ttl=60))
26
+
27
+ class Config:
28
+ frozen=True
29
+
30
+ def __init__(self: Self, **data) -> None:
31
+ super().__init__(**data)
32
+ credentials = service_account.Credentials.from_service_account_info({
33
+ "type": "service_account",
34
+ "project_id": self.settings.GOOGLE_PROJECT_ID,
35
+ "private_key_id": self.settings.GOOGLE_PRIVATE_KEY_ID.get_secret_value(),
36
+ "private_key": self.settings.GOOGLE_PRIVATE_KEY.get_secret_value(),
37
+ "client_email": self.settings.GOOGLE_CLIENT_EMAIL,
38
+ "client_id": self.settings.GOOGLE_CLIENT_ID,
39
+ "token_uri": self.settings.GOOGLE_TOKEN_URI,
40
+ }, scopes=["https://www.googleapis.com/auth/drive"])
41
+ self._google_drive_client = build('drive', 'v3', credentials=credentials)
42
+ logger.debug("Created {}", self.__class__.__name__)
43
+
44
+ def _resolve_folder_id(self: Self, folder_path: str) -> Optional[str]:
45
+ """Resolve a folder path to a Google Drive ID."""
46
+
47
+ if not folder_path:
48
+ return self.settings.GOOGLE_DRIVE_ROOT_ID
49
+
50
+ if folder_path in self._folder_cache:
51
+ return self._folder_cache[folder_path]
52
+
53
+ current_id = self.settings.GOOGLE_DRIVE_ROOT_ID
54
+ try:
55
+ for part in folder_path.split(PATH_SEPARATOR):
56
+ results = self._google_drive_client.files().list(
57
+ q=f"name='{part.replace("\\", "\\\\").replace("'", "\\'")}' and mimeType='{FOLDER_MIME_TYPE}' and '{current_id}' in parents",
58
+ fields="files(id,name)",
59
+ supportsAllDrives=True,
60
+ includeItemsFromAllDrives=True
61
+ ).execute()
62
+ match results:
63
+ case {"files": [ {"id": id} ]}:
64
+ current_id = id
65
+ case _:
66
+ logger.debug("Folder not found by path: {}", folder_path)
67
+ return None
68
+ except HttpError as e:
69
+ logger.error("Error resolving folder path: {}", folder_path)
70
+ return None
71
+
72
+ self._folder_cache[folder_path] = current_id
73
+ return current_id
74
+
75
+ def list_directory(self: Self, folder_path: str) -> Collection[GoogleDriveMetadata]:
76
+ """List contents of a directory with basic metadata."""
77
+
78
+ folder_id = self._resolve_folder_id(folder_path)
79
+ if not folder_id:
80
+ logger.debug("Folder not found by path: {}", folder_path)
81
+ return ()
82
+
83
+ try:
84
+ results = self._google_drive_client.files().list(
85
+ q=f"'{folder_id}' in parents",
86
+ fields="files(id,name,mimeType,modifiedTime)",
87
+ supportsAllDrives=True,
88
+ includeItemsFromAllDrives=True,
89
+ pageSize=1000
90
+ ).execute()
91
+ return tuple(GoogleDriveMetadata.from_folder_path_and_dict(folder_path, result)
92
+ for result
93
+ in results.get('files', ()))
94
+ except HttpError as e:
95
+ logger.error("Error listing folder by path, {}: {}", folder_path, e)
96
+ return ()
97
+
98
+ def get_metadata(self: Self, item_path: str) -> Optional[GoogleDriveMetadata]:
99
+ """Get metadata for a specific file/folder by path."""
100
+
101
+ match item_path.rsplit(PATH_SEPARATOR, 1):
102
+ case [item_name]:
103
+ folder_path = ""
104
+ folder_id = self.settings.GOOGLE_DRIVE_ROOT_ID
105
+ case [folder_path, item_name]:
106
+ folder_id = self._resolve_folder_id(folder_path)
107
+
108
+ if not folder_id:
109
+ logger.debug("Folder not found by path: {}", folder_path)
110
+ return None
111
+
112
+ try:
113
+ results = self._google_drive_client.files().list(
114
+ q=f"name='{item_name}' and '{folder_id}' in parents",
115
+ fields="files(id,name,mimeType,modifiedTime)",
116
+ supportsAllDrives=True,
117
+ includeItemsFromAllDrives=True,
118
+ pageSize=1
119
+ ).execute()
120
+ match results:
121
+ case {"files": [result]}:
122
+ return GoogleDriveMetadata.from_folder_path_and_dict(folder_path, result)
123
+ except HttpError as e:
124
+ logger.error("Error getting metadata for item by path, {}: {}", item_path, e)
125
+
126
+ logger.debug("Item not found by path: {}", item_path)
127
+ return None
128
+
129
+ def read_file_by_id(self: Self, file_id: str) -> Optional[bytes]:
130
+ """Read contents of a file by its unique identifier."""
131
+
132
+ try:
133
+ request = self._google_drive_client.files().get_media(fileId=file_id)
134
+ buffer = BytesIO()
135
+ downloader = MediaIoBaseDownload(buffer, request)
136
+ done = False
137
+ while not done:
138
+ _, done = downloader.next_chunk()
139
+ return buffer.getvalue()
140
+ except HttpError as e:
141
+ logger.error("Error reading file by ID, {}: {}", file_id, e)
142
+ return None
src/ctp_slack_bot/services/language_model_service.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from loguru import logger
2
+ from openai import OpenAI
3
+ from openai.types.chat import ChatCompletion
4
+ from pydantic import BaseModel, PrivateAttr
5
+ from typing import Collection, Self
6
+
7
+ from ctp_slack_bot.core import Settings
8
+ from ctp_slack_bot.models import Chunk
9
+
10
+ class LanguageModelService(BaseModel):
11
+ """
12
+ Service for language model operations.
13
+ """
14
+
15
+ settings: Settings
16
+ _open_ai_client: PrivateAttr = PrivateAttr()
17
+
18
+ class Config:
19
+ frozen=True
20
+
21
+ def __init__(self: Self, **data) -> None:
22
+ super().__init__(**data)
23
+ self._open_ai_client = OpenAI(api_key=self.settings.OPENAI_API_KEY.get_secret_value())
24
+ logger.debug("Created {}", self.__class__.__name__)
25
+
26
+ def answer_question(self, question: str, context: Collection[Chunk]) -> str:
27
+ """Generate a response using OpenAI’s API with retrieved context.
28
+
29
+ Args:
30
+ question (str): The user’s question
31
+ context (List[RetreivedContext]): The context retreived for answering the question
32
+
33
+ Returns:
34
+ str: Generated answer
35
+ """
36
+ logger.debug("Generating response for question “{}” using {} context chunks…", question, len(context))
37
+ messages = [
38
+ {"role": "system", "content": self.settings.SYSTEM_PROMPT},
39
+ {"role": "user", "content":
40
+ f"""Student Question: {question}
41
+
42
+ Context from class materials and transcripts:
43
+ {'\n'.join(chunk.text for chunk in context)}
44
+
45
+ Please answer the Student Question based on the Context from class materials and transcripts. If the context doesn’t contain relevant information, acknowledge that and suggest asking the professor."""}
46
+ ]
47
+ response: ChatCompletion = self._open_ai_client.chat.completions.create(
48
+ model=self.settings.CHAT_MODEL,
49
+ messages=messages,
50
+ max_tokens=self.settings.MAX_TOKENS,
51
+ temperature=self.settings.TEMPERATURE
52
+ )
53
+
54
+ return response.choices[0].message.content
55
+ # return f"Mock response to “{question}”"
src/ctp_slack_bot/services/question_dispatch_service.py CHANGED
@@ -1,11 +1,11 @@
1
  # from asyncio import create_task
2
  from loguru import logger
3
- from openai import OpenAI
4
- from pydantic import BaseModel, model_validator
5
- from typing import List, Optional, Self, Tuple
6
 
7
  from ctp_slack_bot.core import Settings
8
- from ctp_slack_bot.models import RetreivedContext, SlackMessage
 
9
  from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
10
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
11
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
@@ -21,11 +21,16 @@ class QuestionDispatchService(BaseModel):
21
  context_retrieval_service: ContextRetrievalService
22
  answer_retrieval_service: AnswerRetrievalService
23
 
24
- @model_validator(mode='after')
25
- def post_init(self: Self) -> Self:
 
 
 
 
26
  logger.debug("Created {}", self.__class__.__name__)
27
- return self
28
 
29
- def push(self: Self, message: SlackMessage) -> None:
30
- context = self.context_retrieval_service.get_context(message)
31
- self.answer_retrieval_service.generate_answer(message, context)
 
 
 
1
  # from asyncio import create_task
2
  from loguru import logger
3
+ from pydantic import BaseModel
4
+ from typing import Self
 
5
 
6
  from ctp_slack_bot.core import Settings
7
+ from ctp_slack_bot.enums import EventType
8
+ from ctp_slack_bot.models import Chunk, SlackMessage
9
  from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
10
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
11
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
 
21
  context_retrieval_service: ContextRetrievalService
22
  answer_retrieval_service: AnswerRetrievalService
23
 
24
+ class Config:
25
+ frozen=True
26
+
27
+ def __init__(self: Self, **data) -> None:
28
+ super().__init__(**data)
29
+ self.event_brokerage_service.subscribe(EventType.INCOMING_SLACK_MESSAGE, self.__process_incoming_slack_message)
30
  logger.debug("Created {}", self.__class__.__name__)
 
31
 
32
+ async def __process_incoming_slack_message(self: Self, message: SlackMessage) -> None:
33
+ if message.subtype != 'bot_message':
34
+ logger.debug("Question dispatch service received an answerable question: {}", message.text)
35
+ context = await self.context_retrieval_service.get_context(message)
36
+ await self.answer_retrieval_service.push(message, context)
src/ctp_slack_bot/services/schedule_service.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from apscheduler.schedulers.asyncio import AsyncIOScheduler
2
+ from apscheduler.triggers.cron import CronTrigger
3
+ from asyncio import create_task, iscoroutinefunction, to_thread
4
+ from datetime import datetime
5
+ from dependency_injector.resources import Resource
6
+ from loguru import logger
7
+ from pydantic import BaseModel, PrivateAttr
8
+ from pytz import timezone
9
+ from typing import Optional, Self
10
+
11
+ from ctp_slack_bot.core import Settings
12
+
13
+ class ScheduleService(BaseModel):
14
+ """
15
+ Service for running scheduled tasks.
16
+ """
17
+
18
+ settings: Settings
19
+ _scheduler: PrivateAttr
20
+
21
+ class Config:
22
+ frozen=True
23
+
24
+ def __init__(self: Self, **data) -> None:
25
+ super().__init__(**data)
26
+ zone = self.settings.SCHEDULER_TIMEZONE
27
+ self._configure_jobs()
28
+ self._scheduler = AsyncIOScheduler(timezone=timezone(zone))
29
+ logger.debug("Created {}", self.__class__.__name__)
30
+
31
+ def _configure_jobs(self: Self) -> None:
32
+ # Example jobs (uncomment and implement as needed)
33
+ # self._scheduler.add_job(
34
+ # send_error_report,
35
+ # CronTrigger(hour=7, minute=0),
36
+ # id="daily_error_report",
37
+ # name="Daily Error Report",
38
+ # replace_existing=True,
39
+ # )
40
+ # self._scheduler.add_job(
41
+ # cleanup_old_transcripts,
42
+ # CronTrigger(day_of_week="sun", hour=1, minute=0),
43
+ # id="weekly_transcript_cleanup",
44
+ # name="Weekly Transcript Cleanup",
45
+ # replace_existing=True,
46
+ # )
47
+ pass
48
+
49
+ def start(self: Self) -> None:
50
+ self._scheduler.start()
51
+
52
+ def stop(self: Self) -> None:
53
+ if self._scheduler.running:
54
+ self._scheduler.shutdown()
55
+ else:
56
+ logger.debug("The scheduler is not running. There is no scheduler to shut down.")
57
+
58
+ class ScheduleServiceResource(Resource):
59
+ def init(self: Self, settings: Settings) -> ScheduleService:
60
+ logger.info("Starting scheduler…")
61
+ schedule_service = ScheduleService(settings=settings)
62
+ schedule_service.start()
63
+ return schedule_service
64
+
65
+ def shutdown(self: Self, schedule_service: ScheduleService) -> None:
66
+ """Stop scheduler on shutdown."""
67
+ schedule_service.stop()
68
+ logger.info("Stopped scheduler.")
src/ctp_slack_bot/services/slack_service.py CHANGED
@@ -1,11 +1,12 @@
1
- # from asyncio import create_task
2
  from loguru import logger
3
  from openai import OpenAI
4
- from pydantic import BaseModel, model_validator
5
- from typing import List, Optional, Self, Tuple
 
6
 
7
- from ctp_slack_bot.core import Settings
8
- from ctp_slack_bot.models import RetreivedContext, SlackMessage
9
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
10
 
11
 
@@ -14,10 +15,55 @@ class SlackService(BaseModel):
14
  Service for interfacing with Slack.
15
  """
16
 
17
- settings: Settings
18
  event_brokerage_service: EventBrokerageService
 
19
 
20
- @model_validator(mode='after')
21
- def post_init(self: Self) -> Self:
 
 
 
 
 
22
  logger.debug("Created {}", self.__class__.__name__)
23
- return self
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dependency_injector.resources import Resource
2
  from loguru import logger
3
  from openai import OpenAI
4
+ from pydantic import BaseModel
5
+ from slack_bolt.async_app import AsyncApp
6
+ from typing import Any, Mapping, Self
7
 
8
+ from ctp_slack_bot.enums import EventType
9
+ from ctp_slack_bot.models import SlackMessage, SlackResponse
10
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
11
 
12
 
 
15
  Service for interfacing with Slack.
16
  """
17
 
 
18
  event_brokerage_service: EventBrokerageService
19
+ slack_bolt_app: AsyncApp
20
 
21
+ class Config:
22
+ arbitrary_types_allowed = True
23
+ frozen=True
24
+
25
+ def __init__(self: Self, **data) -> None:
26
+ super().__init__(**data)
27
+ self.event_brokerage_service.subscribe(EventType.OUTGOING_SLACK_RESPONSE, self.send_message)
28
  logger.debug("Created {}", self.__class__.__name__)
29
+
30
+ def adapt_event_payload(self: Self, event: Mapping[str, Any]) -> SlackMessage:
31
+ return SlackMessage(
32
+ type=event.get("type"),
33
+ subtype=event.get("subtype"),
34
+ channel=event.get("channel"),
35
+ channel_type=event.get("channel_type"),
36
+ user=event.get("user"),
37
+ bot_id=event.get("bot_id"),
38
+ thread_ts=event.get("thread_ts"),
39
+ text=event.get("text", ""),
40
+ ts=event.get("ts"),
41
+ event_ts=event.get("event_ts")
42
+ )
43
+
44
+ async def process_message(self: Self, event: Mapping[str, Any]) -> None:
45
+ slack_message = self.adapt_event_payload(event.get("event", {}))
46
+ logger.debug("Received message from Slack: {}", slack_message)
47
+ await self.event_brokerage_service.publish(EventType.INCOMING_SLACK_MESSAGE, slack_message)
48
+
49
+ async def send_message(self: Self, message: SlackResponse) -> None:
50
+ await self.slack_bolt_app.client.chat_postMessage(channel=message.channel, text=message.text, thread_ts=message.thread_ts)
51
+
52
+ async def handle_message_event(self: Self, body: Mapping[str, Any]) -> None:
53
+ logger.debug("Ignored regular message: {}", body.get("event", {}).get("text"))
54
+ # await self.process_message(body)
55
+
56
+ async def handle_app_mention_event(self: Self, body: Mapping[str, Any]) -> None:
57
+ logger.debug("Received app mention for processing: {}", body.get("event", {}).get("text"))
58
+ await self.process_message(body)
59
+
60
+ def register(self: Self) -> None:
61
+ self.slack_bolt_app.event("message")(self.handle_message_event)
62
+ self.slack_bolt_app.event("app_mention")(self.handle_app_mention_event)
63
+ logger.debug("Registered 2 handlers for Slack Bolt message and app mention events.")
64
+
65
+ class SlackServiceResource(Resource):
66
+ def init(self: Self, event_brokerage_service: EventBrokerageService, slack_bolt_app: AsyncApp) -> SlackService:
67
+ slack_service = SlackService(event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
68
+ slack_service.register()
69
+ return slack_service
src/ctp_slack_bot/services/vector_database_service.py CHANGED
@@ -1,111 +1,118 @@
1
  from loguru import logger
2
- from pydantic import BaseModel, model_validator
3
- from typing import Any, Dict, List, Self
4
 
5
  from ctp_slack_bot.core import Settings
6
  from ctp_slack_bot.db import MongoDB
7
- from ctp_slack_bot.models import VectorQuery, RetreivedContext
8
 
9
  class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
10
  """
11
  Service for storing and retrieving vector embeddings from MongoDB.
12
  """
13
-
14
  settings: Settings
15
  mongo_db: MongoDB
16
 
17
- @model_validator(mode='after')
18
- def post_init(self: Self) -> Self:
 
 
 
19
  logger.debug("Created {}", self.__class__.__name__)
20
- return self
21
 
22
- def content_exists(self, text: str) -> bool:
23
  """
24
- Check if a text content already exists in the database.
25
 
26
  Args:
27
- text: The text content to check for existence
28
 
29
- Returns:
30
- bool: True if the content exists, False otherwise
31
  """
32
- if not self.mongo_db.initialized:
33
- self.mongo_db.initialize()
 
34
 
35
  try:
36
- # Check if the content already exists
37
- result = self.mongo_db.vector_collection.find_one({"text": text})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- return result is not None
40
  except Exception as e:
41
- logger.error(f"Error checking content existence: {str(e)}")
 
 
 
42
  raise
43
 
44
- def store(self, text: str, embedding: List[float], metadata: Dict[str, Any]) -> str:
45
  """
46
- Store text and its embedding vector in the database.
47
 
48
  Args:
49
- text: The text content to store
50
- embedding: The vector embedding of the text
51
- metadata: Additional metadata about the text (source, timestamp, etc.)
52
-
53
- Returns:
54
- str: The ID of the stored document
55
  """
56
- if not self.mongo_db.initialized:
57
- self.mongo_db.initialize()
58
-
59
- try:
60
- # Create document to store
61
- document = {
62
- "text": text,
63
- "embedding": embedding,
64
- "metadata": metadata
65
- }
66
-
67
- # Insert into collection
68
- result = self.mongo_db.vector_collection.insert_one(document)
69
- logger.debug(f"Stored document with ID: {result.inserted_id}")
70
-
71
- return str(result.inserted_id)
72
- except Exception as e:
73
- logger.error(f"Error storing embedding: {str(e)}")
74
- raise
75
-
76
- def search_by_similarity(self, query: VectorQuery, query_embedding: List[float]) -> List[RetreivedContext]:
77
  """
78
  Query the vector database for similar documents.
79
 
80
  Args:
81
  query: VectorQuery object with search parameters
82
- query_embedding: The vector embedding of the query text
83
-
84
  Returns:
85
- List[RetreivedContext]: List of similar documents with similarity scores
86
  """
87
- if not self.mongo_db.initialized:
88
- self.mongo_db.initialize()
89
-
90
  try:
91
- # Build aggregation pipeline for vector search
 
 
 
 
 
92
  pipeline = [
93
  {
94
- "$search": {
95
- "index": "vector_index",
96
- "knnBeta": {
97
- "vector": query_embedding,
98
- "path": "embedding",
99
- "k": query.k
100
- }
101
  }
102
  },
103
  {
104
  "$project": {
105
- "_id": 0,
106
  "text": 1,
107
  "metadata": 1,
108
- "score": {"$meta": "searchScore"}
 
 
109
  }
110
  }
111
  ]
@@ -114,33 +121,55 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
114
  if query.filter_metadata:
115
  metadata_filter = {f"metadata.{k}": v for k, v in query.filter_metadata.items()}
116
  pipeline.insert(1, {"$match": metadata_filter})
 
117
 
118
- # Execute the pipeline
119
- results = list(self.mongo_db.vector_collection.aggregate(pipeline, maxTimeMS=30000))
 
 
 
 
 
 
120
 
121
- # Convert to RetreivedContext objects directly
122
- context_results = []
123
- for result in results:
124
- # Normalize score to [0,1] range
125
- normalized_score = result.get("score", 0)
 
 
 
 
 
 
126
 
127
- # Skip if below threshold
128
- if normalized_score < query.score_threshold:
129
- continue
130
-
131
- context_results.append(
132
- RetreivedContext(
133
- contextual_text=result["text"],
134
- metadata_source=result["metadata"].get("source", "unknown"),
135
- similarity_score=normalized_score,
136
- said_by=result["metadata"].get("speaker", None),
137
- in_reation_to_question=result["metadata"].get("related_question", None)
138
- )
 
 
 
139
  )
 
140
 
141
- logger.debug(f"Found {len(context_results)} similar documents")
142
- return context_results
143
 
144
  except Exception as e:
145
- logger.error(f"Error in similarity search: {str(e)}")
146
- raise
 
 
 
 
 
 
1
  from loguru import logger
2
+ from pydantic import BaseModel
3
+ from typing import Any, Collection, Dict, List, Optional, Self, Sequence
4
 
5
  from ctp_slack_bot.core import Settings
6
  from ctp_slack_bot.db import MongoDB
7
+ from ctp_slack_bot.models import Chunk, VectorizedChunk, VectorQuery
8
 
9
  class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
10
  """
11
  Service for storing and retrieving vector embeddings from MongoDB.
12
  """
 
13
  settings: Settings
14
  mongo_db: MongoDB
15
 
16
+ class Config:
17
+ frozen=True
18
+
19
+ def __init__(self: Self, **data) -> None:
20
+ super().__init__(**data)
21
  logger.debug("Created {}", self.__class__.__name__)
 
22
 
23
+ async def store(self: Self, chunks: Collection[VectorizedChunk]) -> None:
24
  """
25
+ Stores vectorized chunks and their embedding vectors in the database.
26
 
27
  Args:
28
+ chunks: Collection of VectorizedChunk objects to store
29
 
30
+ Returns: None
 
31
  """
32
+ if not chunks:
33
+ logger.debug("No chunks to store")
34
+ return
35
 
36
  try:
37
+ # Get the vector collection - this will create it if it doesn't exist
38
+ logger.debug("Getting vectors collection for storing {} chunks", len(chunks))
39
+ vector_collection = await self.mongo_db.get_collection("vectors")
40
+
41
+ # Ensure vector search index exists
42
+ logger.debug("Creating vector search index for vectors collection")
43
+ await self.mongo_db.create_indexes("vectors")
44
+
45
+ # Create documents to store, ensuring compatibility with BSON
46
+ documents = []
47
+ for chunk in chunks:
48
+ # Convert embedding to standard list format (important for BSON compatibility)
49
+ embedding = list(chunk.embedding) if not isinstance(chunk.embedding, list) else chunk.embedding
50
+
51
+ # Build document with proper structure
52
+ document = {
53
+ "text": chunk.text,
54
+ "embedding": embedding,
55
+ "metadata": chunk.metadata,
56
+ "parent_id": chunk.parent_id,
57
+ "chunk_id": chunk.chunk_id
58
+ }
59
+ documents.append(document)
60
+
61
+ # Insert into collection as a batch
62
+ logger.debug("Inserting {} documents into vectors collection", len(documents))
63
+ result = await vector_collection.insert_many(documents)
64
+ logger.info("Stored {} vector chunks in database", len(result.inserted_ids))
65
 
 
66
  except Exception as e:
67
+ logger.error("Error storing vector embeddings: {}", str(e))
68
+ # Include more diagnostic information
69
+ logger.debug("MongoDB connection info: URI defined: {}, DB name: {}",
70
+ bool(self.settings.MONGODB_URI), self.settings.MONGODB_NAME)
71
  raise
72
 
73
+ async def content_exists(self: Self, key: str)-> bool: # TODO: implement this.
74
  """
75
+ Check if content exists in the database.
76
 
77
  Args:
78
+ key: The key to check for content existence
 
 
 
 
 
79
  """
80
+ pass
81
+
82
+ async def search_by_similarity(self: Self, query: VectorQuery) -> Sequence[Chunk]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  """
84
  Query the vector database for similar documents.
85
 
86
  Args:
87
  query: VectorQuery object with search parameters
88
+
 
89
  Returns:
90
+ Sequence[Chunk]: List of similar chunks
91
  """
 
 
 
92
  try:
93
+ # Get the vector collection
94
+ logger.debug("Getting vectors collection for similarity search")
95
+ vector_collection = await self.mongo_db.get_collection("vectors")
96
+
97
+ # Build aggregation pipeline for vector search using official MongoDB format
98
+ logger.debug("Building vector search pipeline with query embedding dimension: {}", len(query.query_embeddings))
99
  pipeline = [
100
  {
101
+ "$vectorSearch": {
102
+ "index": "vectors_vector_index",
103
+ "path": "embedding",
104
+ "queryVector": query.query_embeddings, #list(query.query_embeddings),
105
+ "numCandidates": query.k * 10,
106
+ "limit": query.k
 
107
  }
108
  },
109
  {
110
  "$project": {
 
111
  "text": 1,
112
  "metadata": 1,
113
+ "parent_id": 1,
114
+ "chunk_id": 1,
115
+ "score": { "$meta": "vectorSearchScore" }
116
  }
117
  }
118
  ]
 
121
  if query.filter_metadata:
122
  metadata_filter = {f"metadata.{k}": v for k, v in query.filter_metadata.items()}
123
  pipeline.insert(1, {"$match": metadata_filter})
124
+ logger.debug("Added metadata filters to search: {}", query.filter_metadata)
125
 
126
+ # Add score threshold filter if needed
127
+ if query.score_threshold > 0:
128
+ pipeline.append({
129
+ "$match": {
130
+ "score": { "$gte": query.score_threshold }
131
+ }
132
+ })
133
+ logger.debug("Added score threshold filter: {}", query.score_threshold)
134
 
135
+ try:
136
+ # Execute the vector search pipeline
137
+ logger.debug("Executing vector search pipeline")
138
+ results = await vector_collection.aggregate(pipeline).to_list(length=query.k)
139
+ logger.debug("Vector search returned {} results", len(results))
140
+ except Exception as e:
141
+ logger.warning("Vector search failed: {}. Falling back to basic text search.", str(e))
142
+ # Fall back to basic filtering with limit
143
+ query_filter = {}
144
+ if query.filter_metadata:
145
+ query_filter.update({f"metadata.{k}": v for k, v in query.filter_metadata.items()})
146
 
147
+ logger.debug("Executing fallback basic search with filter: {}", query_filter)
148
+ results = await vector_collection.find(query_filter).limit(query.k).to_list(length=query.k)
149
+ logger.debug("Fallback search returned {} results", len(results))
150
+
151
+ # Convert results to Chunk objects
152
+ chunks = []
153
+ for result in results:
154
+ chunk = Chunk(
155
+ text=result["text"],
156
+ parent_id=result["parent_id"],
157
+ chunk_id=result["chunk_id"],
158
+ metadata={
159
+ **result["metadata"],
160
+ "similarity_score": result.get("score", 0)
161
+ }
162
  )
163
+ chunks.append(chunk)
164
 
165
+ logger.info("Found {} similar chunks with similarity search", len(chunks))
166
+ return chunks
167
 
168
  except Exception as e:
169
+ logger.error("Error in similarity search: {}", str(e))
170
+ # Include additional diagnostic information
171
+ logger.debug("MongoDB connection info: URI defined: {}, DB name: {}",
172
+ bool(self.settings.MONGODB_URI), self.settings.MONGODB_NAME)
173
+ logger.debug("Query details: k={}, dimension={}",
174
+ query.k, len(query.query_embeddings) if query.query_embeddings else "None")
175
+ raise
src/ctp_slack_bot/services/vectorization_service.py CHANGED
@@ -1,10 +1,10 @@
1
  from loguru import logger
2
- import numpy as np
3
- from openai import OpenAI
4
- from pydantic import BaseModel, model_validator
5
- from typing import List, Optional, Self
6
 
7
  from ctp_slack_bot.core import Settings
 
 
8
 
9
  class VectorizationService(BaseModel):
10
  """
@@ -12,57 +12,23 @@ class VectorizationService(BaseModel):
12
  """
13
 
14
  settings: Settings
15
- client: OpenAI # TODO: this should separate the OpenAI backend out into its own service, one that is agnostic.
16
 
17
  class Config:
18
- arbitrary_types_allowed = True
19
 
20
- @model_validator(mode='after')
21
- def post_init(self: Self) -> Self:
22
  logger.debug("Created {}", self.__class__.__name__)
23
- return self
24
-
25
- def get_embeddings(self, texts: List[str]) -> np.ndarray:
26
- """Get embeddings for a list of texts using OpenAI's API.
27
-
28
- Args:
29
- texts (List[str]): List of text chunks to embed
30
-
31
- Returns:
32
- np.ndarray: Array of embeddings with shape (n_texts, VECTOR_DIMENSION)
33
-
34
- Raises:
35
- ValueError: If the embedding dimensions don't match expected size
36
- """
37
- try:
38
- # Use the initialized client instead of the global openai module
39
- response = self.client.embeddings.create(
40
- model=self.settings.EMBEDDING_MODEL,
41
- input=texts,
42
- encoding_format="float" # Ensure we get raw float values
43
- )
44
-
45
- # Extract embeddings and verify dimensions
46
- embeddings = np.array([data.embedding for data in response.data])
47
-
48
- if embeddings.shape[1] != self.settings.VECTOR_DIMENSION:
49
- raise ValueError(
50
- f"Embedding dimension mismatch. Expected {self.settings.VECTOR_DIMENSION}, "
51
- f"but got {embeddings.shape[1]}. Please update VECTOR_DIMENSION "
52
- f"in config.py to match the model's output."
53
- )
54
-
55
- return embeddings
56
-
57
- except Exception as e:
58
- print(f"Error getting embeddings: {str(e)}")
59
- pass
60
 
61
- def _test(self, list_of_strings: List[str] = ['Hello my sweet Svetlana.', 'You mean the world to me.']):
62
- """
63
- Test the vectorization service.
64
- """
65
- print('embedding list', list_of_strings)
66
- embeddings = self.get_embeddings(list_of_strings)
67
- print(embeddings)
68
- return embeddings
 
 
 
 
1
  from loguru import logger
2
+ from pydantic import BaseModel
3
+ from typing import Self, Sequence
 
 
4
 
5
  from ctp_slack_bot.core import Settings
6
+ from ctp_slack_bot.models import Chunk, VectorizedChunk
7
+ from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
8
 
9
  class VectorizationService(BaseModel):
10
  """
 
12
  """
13
 
14
  settings: Settings
15
+ embeddings_model_service: EmbeddingsModelService
16
 
17
  class Config:
18
+ frozen=True
19
 
20
+ def __init__(self: Self, **data) -> None:
21
+ super().__init__(**data)
22
  logger.debug("Created {}", self.__class__.__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ def vectorize(self: Self, chunks: Sequence[Chunk]) -> Sequence[VectorizedChunk]:
25
+ embeddings = self.embeddings_model_service.get_embeddings([chunk.text for chunk in chunks])
26
+ return tuple(VectorizedChunk(
27
+ text=chunk.text,
28
+ parent_id=chunk.parent_id,
29
+ chunk_id=chunk.chunk_id,
30
+ metadata=chunk.metadata,
31
+ embedding=embedding
32
+ )
33
+ for chunk, embedding
34
+ in zip(chunks, embeddings))
src/ctp_slack_bot/tasks/__init__.py CHANGED
@@ -1 +0,0 @@
1
- from ctp_slack_bot.tasks.scheduler import start_scheduler, stop_scheduler