Merge pull request #5 from CUNYTechPrep/refactor-2
This view is limited to 50 files because it contains too many changes.
- .env.template +19 -19
- .github/workflows/main.yml +19 -0
- Dockerfile +5 -2
- README.MD → README.md +53 -41
- notebooks/container.ipynb +102 -0
- notebooks/google_drive.ipynb +0 -0
- notebooks/google_drive_web_vtt_vectorizer_and_storer.ipynb +585 -0
- notebooks/web_vtt.ipynb +355 -0
- pyproject.toml +15 -14
- scripts/run-dev.sh +1 -1
- src/ctp_slack_bot/__init__.py +0 -1
- src/ctp_slack_bot/api/__init__.py +0 -1
- src/ctp_slack_bot/api/main.py +0 -70
- src/ctp_slack_bot/api/routes.py +0 -67
- src/ctp_slack_bot/app.py +53 -0
- src/ctp_slack_bot/containers.py +21 -25
- src/ctp_slack_bot/core/__init__.py +0 -1
- src/ctp_slack_bot/core/config.py +37 -14
- src/ctp_slack_bot/core/logging.py +34 -30
- src/ctp_slack_bot/core/response_rendering.py +0 -13
- src/ctp_slack_bot/db/mongo_db.py +167 -94
- src/ctp_slack_bot/db/repositories/__init__.py +2 -0
- src/ctp_slack_bot/db/repositories/mongo_db_vectorized_chunk_repository.py +65 -0
- src/ctp_slack_bot/db/repositories/vectorized_chunk_repository.py +30 -0
- src/ctp_slack_bot/enums.py +6 -0
- src/ctp_slack_bot/models/__init__.py +4 -4
- src/ctp_slack_bot/models/base.py +44 -47
- src/ctp_slack_bot/models/content.py +0 -19
- src/ctp_slack_bot/models/google_drive.py +25 -0
- src/ctp_slack_bot/models/slack.py +80 -12
- src/ctp_slack_bot/models/vector_query.py +0 -16
- src/ctp_slack_bot/models/webvtt.py +73 -0
- src/ctp_slack_bot/services/GOOGLE_DRIVE_README.md +0 -228
- src/ctp_slack_bot/services/__init__.py +3 -0
- src/ctp_slack_bot/services/answer_retrieval_service.py +18 -49
- src/ctp_slack_bot/services/application_database_service.py +29 -0
- src/ctp_slack_bot/services/content_ingestion_service.py +33 -5
- src/ctp_slack_bot/services/context_retrieval_service.py +38 -44
- src/ctp_slack_bot/services/embeddings_model_service.py +47 -0
- src/ctp_slack_bot/services/event_brokerage_service.py +31 -22
- src/ctp_slack_bot/services/google_drive_access.py +0 -623
- src/ctp_slack_bot/services/google_drive_basic_usage.py +0 -178
- src/ctp_slack_bot/services/google_drive_service.py +142 -0
- src/ctp_slack_bot/services/language_model_service.py +55 -0
- src/ctp_slack_bot/services/question_dispatch_service.py +15 -10
- src/ctp_slack_bot/services/schedule_service.py +68 -0
- src/ctp_slack_bot/services/slack_service.py +55 -9
- src/ctp_slack_bot/services/vector_database_service.py +115 -86
- src/ctp_slack_bot/services/vectorization_service.py +19 -53
- src/ctp_slack_bot/tasks/__init__.py +0 -1
.env.template
CHANGED
@@ -1,41 +1,41 @@
 # Copy this file and modify. Do not save or commit the secrets!
 
-# Application Configuration
-DEBUG=false
-
-# Logging Configuration
-LOG_LEVEL=INFO
-LOG_FORMAT=text
-
 # APScheduler Configuration
 SCHEDULER_TIMEZONE=UTC
 
-# API Configuration
-API_HOST=0.0.0.0
-API_PORT=8000
-
 # Slack Configuration
 SLACK_BOT_TOKEN=🪙
-SLACK_SIGNING_SECRET=🔏
 SLACK_APP_TOKEN=🦥
 
 # Vectorization Configuration
 EMBEDDING_MODEL=🌮
-VECTOR_DIMENSION=
-CHUNK_SIZE=
-CHUNK_OVERLAP=
-TOP_K_MATCHES=
+VECTOR_DIMENSION=1536
+CHUNK_SIZE=1000
+CHUNK_OVERLAP=200
+TOP_K_MATCHES=5
 
 # MongoDB Configuration
 MONGODB_URI=mongodb+srv://username:[email protected]/database?retryWrites=true&w=majority
 MONGODB_NAME=ctp_slack_bot
+SCORE_THRESHOLD=0.5
 
 # Hugging Face Configuration
 HF_API_TOKEN=🤗
 
 # OpenAI Configuration
 OPENAI_API_KEY=😐
-CHAT_MODEL
-MAX_TOKENS=
-TEMPERATURE=0.
+CHAT_MODEL=gpt-3.5-turbo
+MAX_TOKENS=150
+TEMPERATURE=0.8
 SYSTEM_PROMPT="You are a helpful teaching assistant for a data science class.\nBased on the students question, you will be given context retreived from class transcripts and materials to answer their question.\nYour responses should be:\n\n1. Accurate and based on the class content\n2. Clear and educational\n3. Concise but complete\nIf you're unsure about something, acknowledge it and suggest asking the professor."
+
+# Google Drive Configuration
+GOOGLE_DRIVE_ROOT_ID=1NB91EcIUXbOVcdCkXOAHdmWrDfgoh9fQ
+GOOGLE_PROJECT_ID=insufferable-slacker-123456
+GOOGLE_PRIVATE_KEY_ID=1a2b3c4d5e6f748891091d21304e506674829507
+GOOGLE_PRIVATE_KEY="-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASC...\n-----END PRIVATE KEY-----\n"
+GOOGLE_CLIENT_EMAIL=botty-bot@insufferable-slacker-123456.iam.gserviceaccount.com
+GOOGLE_CLIENT_ID=123456789012345678901
+
+# File Monitoring Configuration
+FILE_MONITOR_ROOT_PATH=Transcripts/Friday
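
These keys map one-to-one onto the application's pydantic settings: the `container.settings()` output in `notebooks/container.ipynb` further down this diff shows the same fields, with secrets wrapped in `SecretStr`. A minimal sketch of how such a settings class can be declared — illustrative only, not the repository's actual `core/config.py`:

```python
# Illustrative sketch, not the repository's actual core/config.py.
from pydantic import SecretStr
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    # Assumption: values come from the environment or a local .env file.
    model_config = SettingsConfigDict(env_file=".env")

    SCHEDULER_TIMEZONE: str = "UTC"
    SLACK_BOT_TOKEN: SecretStr          # SecretStr keeps tokens out of logs and reprs
    SLACK_APP_TOKEN: SecretStr
    EMBEDDING_MODEL: str
    VECTOR_DIMENSION: int = 1536        # must match the embedding model's output size
    CHUNK_SIZE: int = 1000              # size of each chunk when splitting content
    CHUNK_OVERLAP: int = 200            # overlap shared between neighboring chunks
    TOP_K_MATCHES: int = 5              # how many chunks a vector query returns
    MONGODB_URI: SecretStr
    MONGODB_NAME: str = "ctp_slack_bot"
    SCORE_THRESHOLD: float = 0.5        # minimum similarity for a chunk to count as context
```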
.github/workflows/main.yml
ADDED
@@ -0,0 +1,19 @@
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          lfs: true
+      - name: Push to hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: git push https://KingZack:$HF_TOKEN@huggingface.co/spaces/KingZack/ctp-slack-bot main
Dockerfile
CHANGED
@@ -5,7 +5,7 @@ WORKDIR /app
 # Set environment variables.
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1 \
-    PYTHONPATH=/app
+    PYTHONPATH=/app/src
 
 # Install system dependencies.
 RUN apt-get update \
@@ -25,5 +25,8 @@ RUN pip install --no-cache-dir .
 RUN useradd -m appuser
 USER appuser
 
+# Expose a volume mount for logs ― Hugging Face Spaces requires specifically /data.
+VOLUME /data
+
 # Run the application.
-CMD ["
+CMD ["python", "-m", "ctp_slack_bot.app"]
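
With `PYTHONPATH=/app/src` on the import path, the new `CMD ["python", "-m", "ctp_slack_bot.app"]` executes `src/ctp_slack_bot/app.py` as a module. The actual `app.py` (+53 lines in this PR) is not rendered in this view; a hypothetical sketch of the shape such an entry point takes, with the wiring call mirroring the one used in `notebooks/container.ipynb` below:

```python
# Hypothetical sketch of src/ctp_slack_bot/app.py — the real file is not shown
# in this view; only the container wiring is attested elsewhere in this PR.
from ctp_slack_bot.containers import Container


def main() -> None:
    container = Container()                      # build the dependency injection container
    container.wire(packages=["ctp_slack_bot"])   # resolve injected dependencies package-wide
    # ...start the Slack client and scheduler, then block until shutdown...


if __name__ == "__main__":  # reached when run as `python -m ctp_slack_bot.app`
    main()
```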
README.MD → README.md
RENAMED
@@ -1,42 +1,22 @@
+---
+title: CTP Slack Bot
+emoji: 🦥
+colorFrom: red
+colorTo: green
+sdk: docker
+pinned: false
+license: mit
+short_description: Spring 2025 CTP Slack Bot RAG system
+---
+
+
 # CTP Slack Bot
 
 ## _Modus Operandi_ in a Nutshell
 
-* Intelligently responds to Slack messages based on a repository of data.
+* Intelligently responds to Slack messages (when mentioned) based on a repository of data.
 * Periodically checks for new content to add to its repository.
 
-## Tech Stack
-
-* Hugging Face Spaces for hosting and serverless API
-* Google Drive for reference data (i.e., the material to be incorporated into the bot’s knowledge base)
-* MongoDB for data persistence
-* Docker for containerization
-* Python
-* FastAPI for serving HTTP requests
-* httpx for making HTTP requests
-* APScheduler for running periodic tasks in the background
-* See `pyproject.toml` for additional Python packages.
-
-## General Project Structure
-
-* `src/`
-  * `ctp_slack_bot/`
-    * `api/`: FastAPI application structure
-      * `routes.py`: API endpoint definitions
-    * `core/`: fundamental components like configuration (using pydantic), logging setup (loguru), and custom exceptions
-    * `db/`: database connection
-      * `repositories/`: repository pattern implementation
-    * `models/`: Pydantic models for data validation and serialization
-    * `services/`: business logic
-    * `tasks/`: background scheduled jobs
-    * `utils/`: reusable utilities
-  * `tests/`: unit tests
-* `scripts/`: utility scripts for development, deployment, etc.
-  * `run-dev.sh`: script to run the application locally
-* `notebooks/`: Jupyter notebooks for exploration and model development
-* `.env`: local environment variables for development purposes (to be created for local use only from `.env.template`)
-* `Dockerfile`: Docker container build definition
-
 ## How to Run the Application
 
 ### Normally
@@ -52,7 +32,7 @@ docker build . -t ctp-slack-bot
 Run it with:
 
 ```sh
-docker run --env-file=.env -p 8000:8000 --name my-ctp-slack-bot-instance ctp-slack-bot
+docker run --volume ./logs:/app/logs/ --env-file=.env -p 8000:8000 --name my-ctp-slack-bot-instance ctp-slack-bot
 ```
 
 ### For Development
@@ -73,13 +53,45 @@ If `localhost` port `8000` is free, running the following will make the applicat
 scripts/run-dev.sh
 ```
 
-
-
-
-
-
-
-
-
-
-
+## Tech Stack
+
+* Hugging Face Spaces for hosting
+* OpenAI for embeddings and language models
+* Google Drive for reference data (i.e., the material to be incorporated into the bot’s knowledge base)
+* MongoDB for data persistence
+* Docker for containerization
+* Python
+* Slack Bolt client for interfacing with Slack
+* See `pyproject.toml` for additional Python packages.
+
+## General Project Structure
+
+* `src/`
+  * `ctp_slack_bot/`
+    * `core/`: fundamental components like configuration (using pydantic), logging setup (loguru), and custom exceptions
+    * `db/`: database connection
+      * `repositories/`: repository pattern implementation
+    * `models/`: Pydantic models for data validation and serialization
+    * `services/`: business logic
+      * `answer_retrieval_service.py`: obtains an answer to a question from a language model using relevant context
+      * `content_ingestion_service.py`: converts content into chunks and stores them into the database
+      * `context_retrieval_service.py`: queries for relevant context from the database to answer a question
+      * `embeddings_model_service.py`: converts text to embeddings
+      * `event_brokerage_service.py`: brokers events between decoupled components
+      * `language_model_service.py`: answers questions using relevant context
+      * `question_dispatch_service.py`: listens for questions and retrieves relevant context to get answers
+      * `schedule_service.py`: runs background jobs
+      * `slack_service.py`: handles events from Slack and sends back responses
+      * `vector_database_service.py`: stores and queries chunks
+      * `vectorization_service.py`: converts chunks into chunks with embeddings
+    * `tasks/`: background scheduled jobs
+    * `utils/`: reusable utilities
+    * `app.py`: application entry point
+    * `containers.py`: the dependency injection container
+  * `tests/`: unit tests
+* `scripts/`: utility scripts for development, deployment, etc.
+  * `run-dev.sh`: script to run the application locally
+* `notebooks/`: Jupyter notebooks for exploration and model development
+* `.env`: local environment variables for development purposes (to be created for local use only from `.env.template`)
+* `Dockerfile`: Docker container build definition
+* `pyproject.toml`: project definition and dependencies
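
Taken together, the services the new README lists describe one question-answering pass: embed the question, query the vector store for nearby chunks, then ask the chat model for an answer grounded in those chunks. A hedged sketch of that composition — method names other than `get_embeddings` (which appears in this PR's notebook logs) are assumptions:

```python
# Illustrative composition of the README's services; not the repository's code.
def answer_question(question_text, embeddings_model_service,
                    vector_database_service, language_model_service):
    # 1. Turn the question into an embedding vector (embeddings_model_service).
    question_vector = embeddings_model_service.get_embeddings([question_text])[0]
    # 2. Fetch the TOP_K_MATCHES chunks whose similarity clears SCORE_THRESHOLD
    #    (vector_database_service; method name assumed).
    context_chunks = vector_database_service.search(question_vector)
    # 3. Ask the chat model for an answer grounded in that context
    #    (language_model_service; method name assumed).
    return language_model_service.answer(question_text, context_chunks)
```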
notebooks/container.ipynb
ADDED
@@ -0,0 +1,102 @@

# Loading Dependency Injection Container in Jupyter Notebook

In [4]:
from ctp_slack_bot.containers import Container
from ctp_slack_bot.services import VectorDatabaseService

container = Container()
container.wire(packages=['ctp_slack_bot'])

In [2]:
container.settings()

2025-04-19 16:43:46.927 | DEBUG | ctp_slack_bot.core.config:__init__:14 - Created Settings

Out[2]:
Settings(LOG_LEVEL='INFO', LOG_FORMAT='json', SCHEDULER_TIMEZONE='America/New_York', SLACK_BOT_TOKEN=SecretStr('**********'), SLACK_APP_TOKEN=SecretStr('**********'), EMBEDDING_MODEL='text-embedding-3-small', VECTOR_DIMENSION=1536, CHUNK_SIZE=1000, CHUNK_OVERLAP=200, TOP_K_MATCHES=5, MONGODB_URI=SecretStr('**********'), MONGODB_NAME='ctp_slack_bot', SCORE_THRESHOLD=0.5, HF_API_TOKEN=SecretStr('**********'), OPENAI_API_KEY=SecretStr('**********'), CHAT_MODEL='gpt-3.5-turbo', MAX_TOKENS=150, TEMPERATURE=0.8, SYSTEM_PROMPT="You are a helpful teaching assistant for a data science class.\nBased on the students question, you will be given context retreived from class transcripts and materials to answer their question.\nYour responses should be:\n\n1. Accurate and based on the class content\n2. Clear and educational\n3. Concise but complete\nIf you're unsure about something, acknowledge it and suggest asking the professor.", GOOGLE_PROJECT_ID='voltaic-reducer-294821', GOOGLE_PRIVATE_KEY_ID=SecretStr('**********'), GOOGLE_PRIVATE_KEY=SecretStr('**********'), GOOGLE_CLIENT_ID='102943207835073856980', GOOGLE_CLIENT_EMAIL='ctp-slack-bot-714@voltaic-reducer-294821.iam.gserviceaccount.com', GOOGLE_AUTH_URI='https://accounts.google.com/o/oauth2/auth', GOOGLE_TOKEN_URI='https://oauth2.googleapis.com/token', GOOGLE_AUTH_PROVIDER_CERT_URL='https://www.googleapis.com/oauth2/v1/certs', GOOGLE_CLIENT_CERT_URL='https://www.googleapis.com/robot/v1/metadata/x509/ctp-slack-bot-714%40voltaic-reducer-294821.iam.gserviceaccount.com', GOOGLE_UNIVERSE_DOMAIN='googleapis.com', FILE_MONITOR_ROOT_PATH='Transcripts/Friday Building AI Applications Session')

In [ ]:
vector_database_service: VectorDatabaseService = container.vector_database_service()

2025-04-19 16:45:25.997 | DEBUG | ctp_slack_bot.core.config:__init__:14 - Created Settings
2025-04-19 16:45:25.999 | INFO  | ctp_slack_bot.db.mongo_db:init:175 - Initializing MongoDB connection for database: ctp_slack_bot
2025-04-19 16:45:25.999 | DEBUG | ctp_slack_bot.db.mongo_db:__init__:26 - Created MongoDB
2025-04-19 16:45:25.999 | DEBUG | ctp_slack_bot.db.mongo_db:connect:32 - Connecting to MongoDB using URI: mongodb+srv://ctp-slack-bot.xkipuvm.mongodb.net/?retryWrites=true&w=majority&appName=ctp-slack-bot
2025-04-19 16:45:26.000 | DEBUG | ctp_slack_bot.db.mongo_db:connect:49 - MongoDB client initialized for database: ctp_slack_bot
2025-04-19 16:45:26.279 | DEBUG | ctp_slack_bot.db.mongo_db:ping:85 - MongoDB connection is active!
2025-04-19 16:45:26.280 | INFO  | ctp_slack_bot.db.mongo_db:_test_connection:186 - MongoDB connection test successful!
2025-04-19 16:45:26.280 | DEBUG | ctp_slack_bot.services.vector_database_service:__init__:21 - Created VectorDatabaseService
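
The `container.wire(packages=['ctp_slack_bot'])` call is dependency-injector's wiring API, which also makes providers overridable — handy for notebook experiments. A small sketch, assuming `containers.py` is built on dependency-injector (which the `wire` call suggests):

```python
# Sketch: temporarily replace a provider while experimenting; assumes
# dependency-injector semantics, which container.wire(packages=...) suggests.
from unittest.mock import Mock

with container.vector_database_service.override(Mock()):
    service = container.vector_database_service()  # yields the mock inside the block
# Outside the block, the original provider is restored.
```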
notebooks/google_drive.ipynb
ADDED
The diff for this file is too large to render.
notebooks/google_drive_web_vtt_vectorizer_and_storer.ipynb
ADDED
@@ -0,0 +1,585 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"# Google Drive WebVTT Vectorizer and Storer"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": null,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [
|
15 |
+
{
|
16 |
+
"name": "stderr",
|
17 |
+
"output_type": "stream",
|
18 |
+
"text": [
|
19 |
+
"\u001b[32m2025-04-19 19:21:27.333\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n",
|
20 |
+
"\u001b[32m2025-04-19 19:21:27.334\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n",
|
21 |
+
"\u001b[32m2025-04-19 19:21:27.337\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.google_drive_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreated GoogleDriveService\u001b[0m\n",
|
22 |
+
"\u001b[32m2025-04-19 19:21:27.361\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m22\u001b[0m - \u001b[34m\u001b[1mCreated EmbeddingsModelService\u001b[0m\n",
|
23 |
+
"\u001b[32m2025-04-19 19:21:27.362\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vectorization_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m22\u001b[0m - \u001b[34m\u001b[1mCreated VectorizationService\u001b[0m\n"
|
24 |
+
]
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"name": "stderr",
|
28 |
+
"output_type": "stream",
|
29 |
+
"text": [
|
30 |
+
"\u001b[32m2025-04-19 19:21:27.364\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36minit\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1mInitializing MongoDB connection for database: ctp_slack_bot\u001b[0m\n",
|
31 |
+
"\u001b[32m2025-04-19 19:21:27.364\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m26\u001b[0m - \u001b[34m\u001b[1mCreated MongoDB\u001b[0m\n",
|
32 |
+
"\u001b[32m2025-04-19 19:21:27.364\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mconnect\u001b[0m:\u001b[36m32\u001b[0m - \u001b[34m\u001b[1mConnecting to MongoDB using URI: mongodb+srv://ctp-slack-bot.xkipuvm.mongodb.net/?retryWrites=true&w=majority&appName=ctp-slack-bot\u001b[0m\n",
|
33 |
+
"\u001b[32m2025-04-19 19:21:27.365\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mconnect\u001b[0m:\u001b[36m49\u001b[0m - \u001b[34m\u001b[1mMongoDB client initialized for database: ctp_slack_bot\u001b[0m\n",
|
34 |
+
"\u001b[32m2025-04-19 19:21:27.825\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
35 |
+
"\u001b[32m2025-04-19 19:21:27.825\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36m_test_connection\u001b[0m:\u001b[36m186\u001b[0m - \u001b[1mMongoDB connection test successful!\u001b[0m\n",
|
36 |
+
"\u001b[32m2025-04-19 19:21:27.825\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m21\u001b[0m - \u001b[34m\u001b[1mCreated VectorDatabaseService\u001b[0m\n"
|
37 |
+
]
|
38 |
+
}
|
39 |
+
],
|
40 |
+
"source": [
|
41 |
+
"from datetime import datetime\n",
|
42 |
+
"from functools import partial\n",
|
43 |
+
"from html import escape\n",
|
44 |
+
"from IPython.display import display_html\n",
|
45 |
+
"from itertools import chain\n",
|
46 |
+
"from textwrap import wrap\n",
|
47 |
+
"from zoneinfo import ZoneInfo\n",
|
48 |
+
"\n",
|
49 |
+
"from ctp_slack_bot.containers import Container\n",
|
50 |
+
"from ctp_slack_bot.models import WebVTTContent\n",
|
51 |
+
"\n",
|
52 |
+
"display_html = partial(display_html, raw=True)\n",
|
53 |
+
"\n",
|
54 |
+
"container = Container()\n",
|
55 |
+
"google_drive_service = container.google_drive_service()\n",
|
56 |
+
"vectorization_service = container.vectorization_service()\n",
|
57 |
+
"vector_database_service = container.vector_database_service()"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"cell_type": "markdown",
|
62 |
+
"metadata": {},
|
63 |
+
"source": [
|
64 |
+
"## Configuration\n",
|
65 |
+
"\n",
|
66 |
+
"⚠️ Configure before running the code to avoid processing the wrong file type or re-uploading past files which were already uploaded."
|
67 |
+
]
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"cell_type": "code",
|
71 |
+
"execution_count": 2,
|
72 |
+
"metadata": {},
|
73 |
+
"outputs": [],
|
74 |
+
"source": [
|
75 |
+
"MIME_TYPE = \"text/vtt\" # This should probably not be changed.\n",
|
76 |
+
"\n",
|
77 |
+
"MODIFICATION_TIME_CUTOFF = datetime(2024, 8, 30, tzinfo=ZoneInfo(\"UTC\"))"
|
78 |
+
]
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"cell_type": "markdown",
|
82 |
+
"metadata": {},
|
83 |
+
"source": [
|
84 |
+
"## Upload"
|
85 |
+
]
|
86 |
+
},
|
87 |
+
{
|
88 |
+
"cell_type": "code",
|
89 |
+
"execution_count": 3,
|
90 |
+
"metadata": {},
|
91 |
+
"outputs": [
|
92 |
+
{
|
93 |
+
"data": {
|
94 |
+
"text/html": [
|
95 |
+
"<p>Found 7 files/folders.</p>"
|
96 |
+
]
|
97 |
+
},
|
98 |
+
"metadata": {},
|
99 |
+
"output_type": "display_data"
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"data": {
|
103 |
+
"text/html": [
|
104 |
+
"<ul><li>Week-03-Analytics-Friday-2024-09-13.cc.vtt</li><li>Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt</li><li>Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt</li><li>Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt</li><li>Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt</li><li>Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt</li><li>Week-01-Setup-Pandas-Friday-2024-08-30.vtt</li></ul>"
|
105 |
+
]
|
106 |
+
},
|
107 |
+
"metadata": {},
|
108 |
+
"output_type": "display_data"
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"data": {
|
112 |
+
"text/html": [
|
113 |
+
"<p>7 files/folders pass the modification time (<em>2024-08-30 00:00:00+00:00</em>) cut-off.</p>"
|
114 |
+
]
|
115 |
+
},
|
116 |
+
"metadata": {},
|
117 |
+
"output_type": "display_data"
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"data": {
|
121 |
+
"text/html": [
|
122 |
+
"<ul><li>Week-03-Analytics-Friday-2024-09-13.cc.vtt</li><li>Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt</li><li>Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt</li><li>Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt</li><li>Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt</li><li>Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt</li><li>Week-01-Setup-Pandas-Friday-2024-08-30.vtt</li></ul>"
|
123 |
+
]
|
124 |
+
},
|
125 |
+
"metadata": {},
|
126 |
+
"output_type": "display_data"
|
127 |
+
},
|
128 |
+
{
|
129 |
+
"data": {
|
130 |
+
"text/html": [
|
131 |
+
"<p>7 files/folders pass the modification time (<em>2024-08-30 00:00:00+00:00</em>) cut-off and MIME type (<em>text/vtt</em>) criterion.</p>"
|
132 |
+
]
|
133 |
+
},
|
134 |
+
"metadata": {},
|
135 |
+
"output_type": "display_data"
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"data": {
|
139 |
+
"text/html": [
|
140 |
+
"<ul><li>Week-03-Analytics-Friday-2024-09-13.cc.vtt</li><li>Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt</li><li>Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt</li><li>Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt</li><li>Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt</li><li>Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt</li><li>Week-01-Setup-Pandas-Friday-2024-08-30.vtt</li></ul>"
|
141 |
+
]
|
142 |
+
},
|
143 |
+
"metadata": {},
|
144 |
+
"output_type": "display_data"
|
145 |
+
}
|
146 |
+
],
|
147 |
+
"source": [
|
148 |
+
"item_metadata = google_drive_service.list_directory(\"\")\n",
|
149 |
+
"display_html(f\"<p>Found {len(item_metadata)} files/folders.</p>\")\n",
|
150 |
+
"display_html(\"\".join(chain(\"<ul>\", (f\"<li>{escape(metadata.name)}</li>\" for metadata in item_metadata), \"</ul>\")))\n",
|
151 |
+
"\n",
|
152 |
+
"recent_metadata = tuple(filter(lambda metadata: MODIFICATION_TIME_CUTOFF <= metadata.modified_time, item_metadata))\n",
|
153 |
+
"display_html(f\"<p>{len(item_metadata)} files/folders pass the modification time (<em>{MODIFICATION_TIME_CUTOFF}</em>) cut-off.</p>\")\n",
|
154 |
+
"display_html(\"\".join(chain(\"<ul>\", (f\"<li>{escape(metadata.name)}</li>\" for metadata in recent_metadata), \"</ul>\")))\n",
|
155 |
+
"\n",
|
156 |
+
"metadata_to_process = tuple(filter(lambda metadata: metadata.mime_type == MIME_TYPE, recent_metadata))\n",
|
157 |
+
"display_html(f\"<p>{len(item_metadata)} files/folders pass the modification time (<em>{MODIFICATION_TIME_CUTOFF}</em>) cut-off and MIME type (<em>{MIME_TYPE}</em>) criterion.</p>\")\n",
|
158 |
+
"display_html(\"\".join(chain(\"<ul>\", (f\"<li>{escape(metadata.name)}</li>\" for metadata in metadata_to_process), \"</ul>\")))"
|
159 |
+
]
|
160 |
+
},
|
161 |
+
{
|
162 |
+
"cell_type": "code",
|
163 |
+
"execution_count": 4,
|
164 |
+
"metadata": {},
|
165 |
+
"outputs": [
|
166 |
+
{
|
167 |
+
"data": {
|
168 |
+
"text/html": [
|
169 |
+
"Processed 7 files."
|
170 |
+
]
|
171 |
+
},
|
172 |
+
"metadata": {},
|
173 |
+
"output_type": "display_data"
|
174 |
+
}
|
175 |
+
],
|
176 |
+
"source": [
|
177 |
+
"web_vtts = tuple(WebVTTContent.from_bytes(f\"googledrive:{metadata.folder_path}/{metadata.name}\",\n",
|
178 |
+
" {\n",
|
179 |
+
" \"filename\": metadata.name,\n",
|
180 |
+
" \"mimeType\": metadata.mime_type,\n",
|
181 |
+
" \"modificationTime\": metadata.modified_time\n",
|
182 |
+
" },\n",
|
183 |
+
" google_drive_service.read_file_by_id(metadata.id))\n",
|
184 |
+
" for metadata\n",
|
185 |
+
" in metadata_to_process)\n",
|
186 |
+
"\n",
|
187 |
+
"display_html(f\"Processed {len(web_vtts)} files.\")"
|
188 |
+
]
|
189 |
+
},
|
190 |
+
{
|
191 |
+
"cell_type": "code",
|
192 |
+
"execution_count": 5,
|
193 |
+
"metadata": {},
|
194 |
+
"outputs": [
|
195 |
+
{
|
196 |
+
"data": {
|
197 |
+
"text/html": [
|
198 |
+
"Chunked Week-03-Analytics-Friday-2024-09-13.cc.vtt into 496 chunks."
|
199 |
+
]
|
200 |
+
},
|
201 |
+
"metadata": {},
|
202 |
+
"output_type": "display_data"
|
203 |
+
},
|
204 |
+
{
|
205 |
+
"name": "stderr",
|
206 |
+
"output_type": "stream",
|
207 |
+
"text": [
|
208 |
+
"\u001b[32m2025-04-19 19:21:37.826\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 496 text string(s)…\u001b[0m\n"
|
209 |
+
]
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"data": {
|
213 |
+
"text/html": [
|
214 |
+
"Vectorized Week-03-Analytics-Friday-2024-09-13.cc.vtt’s 496 chunks."
|
215 |
+
]
|
216 |
+
},
|
217 |
+
"metadata": {},
|
218 |
+
"output_type": "display_data"
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"name": "stderr",
|
222 |
+
"output_type": "stream",
|
223 |
+
"text": [
|
224 |
+
"\u001b[32m2025-04-19 19:21:42.297\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 496 chunks\u001b[0m\n",
|
225 |
+
"\u001b[32m2025-04-19 19:21:42.319\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
226 |
+
"\u001b[32m2025-04-19 19:21:42.320\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
227 |
+
"\u001b[32m2025-04-19 19:21:42.340\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
228 |
+
"\u001b[32m2025-04-19 19:21:42.341\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
|
229 |
+
"\u001b[32m2025-04-19 19:21:42.360\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
230 |
+
"\u001b[32m2025-04-19 19:21:42.360\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
231 |
+
"\u001b[32m2025-04-19 19:21:42.380\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
232 |
+
"\u001b[32m2025-04-19 19:21:42.500\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
|
233 |
+
"\u001b[32m2025-04-19 19:21:42.505\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 496 documents into vectors collection\u001b[0m\n",
|
234 |
+
"\u001b[32m2025-04-19 19:21:48.862\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 496 vector chunks in database\u001b[0m\n"
|
235 |
+
]
|
236 |
+
},
|
237 |
+
{
|
238 |
+
"data": {
|
239 |
+
"text/html": [
|
240 |
+
"Stored Week-03-Analytics-Friday-2024-09-13.cc.vtt’s 496 vectorized chunks to the database."
|
241 |
+
]
|
242 |
+
},
|
243 |
+
"metadata": {},
|
244 |
+
"output_type": "display_data"
|
245 |
+
},
|
246 |
+
{
|
247 |
+
"data": {
|
248 |
+
"text/html": [
|
249 |
+
"Chunked Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt into 321 chunks."
|
250 |
+
]
|
251 |
+
},
|
252 |
+
"metadata": {},
|
253 |
+
"output_type": "display_data"
|
254 |
+
},
|
255 |
+
{
|
256 |
+
"name": "stderr",
|
257 |
+
"output_type": "stream",
|
258 |
+
"text": [
|
259 |
+
"\u001b[32m2025-04-19 19:21:48.866\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 321 text string(s)…\u001b[0m\n"
|
260 |
+
]
|
261 |
+
},
|
262 |
+
{
|
263 |
+
"data": {
|
264 |
+
"text/html": [
|
265 |
+
"Vectorized Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt’s 321 chunks."
|
266 |
+
]
|
267 |
+
},
|
268 |
+
"metadata": {},
|
269 |
+
"output_type": "display_data"
|
270 |
+
},
|
271 |
+
{
|
272 |
+
"name": "stderr",
|
273 |
+
"output_type": "stream",
|
274 |
+
"text": [
|
275 |
+
"\u001b[32m2025-04-19 19:21:52.629\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 321 chunks\u001b[0m\n",
|
276 |
+
"\u001b[32m2025-04-19 19:21:52.652\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
277 |
+
"\u001b[32m2025-04-19 19:21:52.652\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
278 |
+
"\u001b[32m2025-04-19 19:21:52.671\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
279 |
+
"\u001b[32m2025-04-19 19:21:52.672\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
|
280 |
+
"\u001b[32m2025-04-19 19:21:52.691\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
281 |
+
"\u001b[32m2025-04-19 19:21:52.691\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
282 |
+
"\u001b[32m2025-04-19 19:21:52.712\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
283 |
+
"\u001b[32m2025-04-19 19:21:52.829\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
|
284 |
+
"\u001b[32m2025-04-19 19:21:52.831\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 321 documents into vectors collection\u001b[0m\n",
|
285 |
+
"\u001b[32m2025-04-19 19:21:58.227\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 321 vector chunks in database\u001b[0m\n"
|
286 |
+
]
|
287 |
+
},
|
288 |
+
{
|
289 |
+
"data": {
|
290 |
+
"text/html": [
|
291 |
+
"Stored Week-07-Regressors-via-Linear-Regression-Friday-2024-10-18.transcript.vtt’s 321 vectorized chunks to the database."
|
292 |
+
]
|
293 |
+
},
|
294 |
+
"metadata": {},
|
295 |
+
"output_type": "display_data"
|
296 |
+
},
|
297 |
+
{
|
298 |
+
"data": {
|
299 |
+
"text/html": [
|
300 |
+
"Chunked Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt into 337 chunks."
|
301 |
+
]
|
302 |
+
},
|
303 |
+
"metadata": {},
|
304 |
+
"output_type": "display_data"
|
305 |
+
},
|
306 |
+
{
|
307 |
+
"name": "stderr",
|
308 |
+
"output_type": "stream",
|
309 |
+
"text": [
|
310 |
+
"\u001b[32m2025-04-19 19:21:58.231\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 337 text string(s)…\u001b[0m\n"
|
311 |
+
]
|
312 |
+
},
|
313 |
+
{
|
314 |
+
"data": {
|
315 |
+
"text/html": [
|
316 |
+
"Vectorized Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt’s 337 chunks."
|
317 |
+
]
|
318 |
+
},
|
319 |
+
"metadata": {},
|
320 |
+
"output_type": "display_data"
|
321 |
+
},
|
322 |
+
{
|
323 |
+
"name": "stderr",
|
324 |
+
"output_type": "stream",
|
325 |
+
"text": [
|
326 |
+
"\u001b[32m2025-04-19 19:22:02.126\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 337 chunks\u001b[0m\n",
|
327 |
+
"\u001b[32m2025-04-19 19:22:02.147\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
328 |
+
"\u001b[32m2025-04-19 19:22:02.147\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
329 |
+
"\u001b[32m2025-04-19 19:22:02.167\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
330 |
+
"\u001b[32m2025-04-19 19:22:02.167\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
|
331 |
+
"\u001b[32m2025-04-19 19:22:02.186\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
332 |
+
"\u001b[32m2025-04-19 19:22:02.187\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
333 |
+
"\u001b[32m2025-04-19 19:22:02.207\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
334 |
+
"\u001b[32m2025-04-19 19:22:02.352\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
|
335 |
+
"\u001b[32m2025-04-19 19:22:02.354\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 337 documents into vectors collection\u001b[0m\n",
|
336 |
+
"\u001b[32m2025-04-19 19:22:08.520\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 337 vector chunks in database\u001b[0m\n"
|
337 |
+
]
|
338 |
+
},
|
339 |
+
{
|
340 |
+
"data": {
|
341 |
+
"text/html": [
|
342 |
+
"Stored Week-06-Classifiers-via-Logistic-Regression-Friday-2024-10-11.transcript.vtt’s 337 vectorized chunks to the database."
|
343 |
+
]
|
344 |
+
},
|
345 |
+
"metadata": {},
|
346 |
+
"output_type": "display_data"
|
347 |
+
},
|
348 |
+
{
|
349 |
+
"data": {
|
350 |
+
"text/html": [
|
351 |
+
"Chunked Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt into 341 chunks."
|
352 |
+
]
|
353 |
+
},
|
354 |
+
"metadata": {},
|
355 |
+
"output_type": "display_data"
|
356 |
+
},
|
357 |
+
{
|
358 |
+
"name": "stderr",
|
359 |
+
"output_type": "stream",
|
360 |
+
"text": [
|
361 |
+
"\u001b[32m2025-04-19 19:22:08.524\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 341 text string(s)…\u001b[0m\n"
|
362 |
+
]
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"data": {
|
366 |
+
"text/html": [
|
367 |
+
"Vectorized Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt’s 341 chunks."
|
368 |
+
]
|
369 |
+
},
|
370 |
+
"metadata": {},
|
371 |
+
"output_type": "display_data"
|
372 |
+
},
|
373 |
+
{
|
374 |
+
"name": "stderr",
|
375 |
+
"output_type": "stream",
|
376 |
+
"text": [
|
377 |
+
"\u001b[32m2025-04-19 19:22:12.675\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 341 chunks\u001b[0m\n",
|
378 |
+
"\u001b[32m2025-04-19 19:22:12.712\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
379 |
+
"\u001b[32m2025-04-19 19:22:12.712\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
380 |
+
"\u001b[32m2025-04-19 19:22:12.731\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
381 |
+
"\u001b[32m2025-04-19 19:22:12.731\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
|
382 |
+
"\u001b[32m2025-04-19 19:22:12.750\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
383 |
+
"\u001b[32m2025-04-19 19:22:12.751\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
384 |
+
"\u001b[32m2025-04-19 19:22:12.773\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
385 |
+
"\u001b[32m2025-04-19 19:22:12.924\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
|
386 |
+
"\u001b[32m2025-04-19 19:22:12.926\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 341 documents into vectors collection\u001b[0m\n",
|
387 |
+
"\u001b[32m2025-04-19 19:22:18.356\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 341 vector chunks in database\u001b[0m\n"
|
388 |
+
]
|
389 |
+
},
|
390 |
+
{
|
391 |
+
"data": {
|
392 |
+
"text/html": [
|
393 |
+
"Stored Week-09-AI-Part-1-Neural-Networks-Intro-to-HuggingFace-Friday-2024-11-01.cc.vtt’s 341 vectorized chunks to the database."
|
394 |
+
]
|
395 |
+
},
|
396 |
+
"metadata": {},
|
397 |
+
"output_type": "display_data"
|
398 |
+
},
|
399 |
+
{
|
400 |
+
"data": {
|
401 |
+
"text/html": [
|
402 |
+
"Chunked Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt into 378 chunks."
|
403 |
+
]
|
404 |
+
},
|
405 |
+
"metadata": {},
|
406 |
+
"output_type": "display_data"
|
407 |
+
},
|
408 |
+
{
|
409 |
+
"name": "stderr",
|
410 |
+
"output_type": "stream",
|
411 |
+
"text": [
|
412 |
+
"\u001b[32m2025-04-19 19:22:18.360\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 378 text string(s)…\u001b[0m\n"
|
413 |
+
]
|
414 |
+
},
|
415 |
+
{
|
416 |
+
"data": {
|
417 |
+
"text/html": [
|
418 |
+
"Vectorized Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt’s 378 chunks."
|
419 |
+
]
|
420 |
+
},
|
421 |
+
"metadata": {},
|
422 |
+
"output_type": "display_data"
|
423 |
+
},
|
424 |
+
{
|
425 |
+
"name": "stderr",
|
426 |
+
"output_type": "stream",
|
427 |
+
"text": [
|
428 |
+
"\u001b[32m2025-04-19 19:22:21.808\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 378 chunks\u001b[0m\n",
|
429 |
+
"\u001b[32m2025-04-19 19:22:21.841\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
430 |
+
"\u001b[32m2025-04-19 19:22:21.841\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
431 |
+
"\u001b[32m2025-04-19 19:22:21.873\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
432 |
+
"\u001b[32m2025-04-19 19:22:21.874\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
|
433 |
+
"\u001b[32m2025-04-19 19:22:21.894\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
434 |
+
"\u001b[32m2025-04-19 19:22:21.894\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
|
435 |
+
"\u001b[32m2025-04-19 19:22:21.914\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
|
436 |
+
"\u001b[32m2025-04-19 19:22:22.029\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
|
437 |
+
"\u001b[32m2025-04-19 19:22:22.035\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 378 documents into vectors collection\u001b[0m\n",
|
438 |
+
"\u001b[32m2025-04-19 19:22:28.108\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 378 vector chunks in database\u001b[0m\n"
|
439 |
+
]
|
440 |
+
},
|
441 |
+
{
|
442 |
+
"data": {
|
443 |
+
"text/html": [
|
444 |
+
"Stored Week-08-Decision-Trees-Random-Forest-Tuesday-2024-10-22.cc.vtt’s 378 vectorized chunks to the database."
|
445 |
+
]
|
446 |
+
},
|
447 |
+
"metadata": {},
|
448 |
+
"output_type": "display_data"
|
449 |
+
},
|
450 |
+
{
|
451 |
+
"data": {
|
452 |
+
"text/html": [
|
453 |
+
"Chunked Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt into 680 chunks."
|
454 |
+
]
|
455 |
+
},
|
456 |
+
"metadata": {},
|
457 |
+
"output_type": "display_data"
|
458 |
+
},
|
459 |
+
{
|
460 |
+
"name": "stderr",
|
461 |
+
"output_type": "stream",
|
462 |
+
"text": [
|
463 |
+
"\u001b[32m2025-04-19 19:22:28.113\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 680 text string(s)…\u001b[0m\n"
|
464 |
+
]
|
465 |
+
},
|
466 |
+
{
|
467 |
+
"data": {
|
468 |
+
"text/html": [
|
469 |
+
"Vectorized Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt’s 680 chunks."
|
470 |
+
]
|
471 |
+
},
|
472 |
+
"metadata": {},
|
473 |
+
"output_type": "display_data"
|
474 |
+
},
|
475 |
+
{
|
476 |
+
"name": "stderr",
|
477 |
+
"output_type": "stream",
|
478 |
+
"text": [
|
479 |
+
"\u001b[32m2025-04-19 19:22:34.652\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 680 chunks\u001b[0m\n",
|
480 |
+
"\u001b[32m2025-04-19 19:22:34.671\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
|
481 |
+
"\u001b[32m2025-04-19 19:22:34.671\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:34.705\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:34.705\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:34.720\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:34.720\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:34.740\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:34.859\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:34.866\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 680 documents into vectors collection\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:43.431\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 680 vector chunks in database\u001b[0m\n"
]
},
{
"data": {
"text/html": [
"Stored Week-02-Finding-Cleaning-Data-Friday-2024-09-06.vtt’s 680 vectorized chunks to the database."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Chunked Week-01-Setup-Pandas-Friday-2024-08-30.vtt into 742 chunks."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-04-19 19:22:43.438\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.embeddings_model_service\u001b[0m:\u001b[36mget_embeddings\u001b[0m:\u001b[36m36\u001b[0m - \u001b[34m\u001b[1mCreating embeddings for 742 text string(s)…\u001b[0m\n"
]
},
{
"data": {
"text/html": [
"Vectorized Week-01-Setup-Pandas-Friday-2024-08-30.vtt’s 742 chunks."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-04-19 19:22:50.402\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m38\u001b[0m - \u001b[34m\u001b[1mGetting vectors collection for storing 742 chunks\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:50.426\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:50.426\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:50.452\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:50.452\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreating vector search index for vectors collection\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:50.475\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mping\u001b[0m:\u001b[36m85\u001b[0m - \u001b[34m\u001b[1mMongoDB connection is active!\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:50.475\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m106\u001b[0m - \u001b[34m\u001b[1mChecking if collection 'vectors' exists…\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:50.508\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mget_collection\u001b[0m:\u001b[36m115\u001b[0m - \u001b[34m\u001b[1mCollection 'vectors' already exists!\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:50.617\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.db.mongo_db\u001b[0m:\u001b[36mcreate_indexes\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mVector search index 'vectors_vector_index' created for collection vectors.\u001b[0m\n",
"\u001b[32m2025-04-19 19:22:50.626\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mInserting 742 documents into vectors collection\u001b[0m\n",
"\u001b[32m2025-04-19 19:23:01.166\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mctp_slack_bot.services.vector_database_service\u001b[0m:\u001b[36mstore\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mStored 742 vector chunks in database\u001b[0m\n"
]
},
{
"data": {
"text/html": [
"Stored Week-01-Setup-Pandas-Friday-2024-08-30.vtt’s 742 vectorized chunks to the database."
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for web_vtt in web_vtts:\n",
"    chunks = web_vtt.get_chunks()\n",
"    display_html(f\"Chunked {web_vtt.get_metadata().get(\"filename\")} into {len(chunks)} chunks.\")\n",
"    vectorized_chunks = vectorization_service.vectorize(chunks)\n",
"    display_html(f\"Vectorized {web_vtt.get_metadata().get(\"filename\")}’s {len(vectorized_chunks)} chunks.\")\n",
"    await (await vector_database_service).store(vectorized_chunks)\n",
"    display_html(f\"Stored {web_vtt.get_metadata().get(\"filename\")}’s {len(vectorized_chunks)} vectorized chunks to the database.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
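The final code cell of this notebook is the whole ingestion pipeline in miniature: chunk each transcript, embed the chunks, and persist them. A minimal sketch of the same flow as a reusable coroutine, assuming an already-resolved `vector_database_service` (the notebook awaits a provider first) and the chunk/vectorize/store interfaces shown in that cell; the `loguru` logging line is illustrative only:

```python
from loguru import logger

async def ingest(content, vectorization_service, vector_database_service) -> int:
    chunks = content.get_chunks()                         # split captions into overlapping windows
    vectorized = vectorization_service.vectorize(chunks)  # attach an embedding to each chunk
    await vector_database_service.store(vectorized)       # insert into the MongoDB 'vectors' collection
    logger.info("Ingested {} chunks from {}.", len(vectorized), content.get_metadata().get("filename"))
    return len(vectorized)
```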
notebooks/web_vtt.ipynb
ADDED
@@ -0,0 +1,355 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# WebVTT Reading and Chunking Test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pure `webvtt-py` as Proof-of-concept"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime, timedelta\n",
"from functools import partial\n",
"from html import escape\n",
"from io import BytesIO\n",
"from IPython.display import display_html\n",
"from itertools import chain\n",
"import re\n",
"from webvtt import Caption, WebVTT\n",
"from webvtt.models import Timestamp\n",
"from zoneinfo import ZoneInfo\n",
"\n",
"display_html = partial(display_html, raw=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"FILE_PATH = \"GMT20250411-223535_Recording.transcript.vtt\"\n",
"TIME_ZONE = ZoneInfo(\"America/New_York\")\n",
"BASE_TIME = datetime(2025, 4, 11, hour=22, minute=35, second=35, tzinfo=ZoneInfo(\"GMT\")).astimezone(TIME_ZONE)"
]
},
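`BASE_TIME` above is hard-coded to match the timestamp embedded in the Zoom-style file name (`GMT20250411-223535_…`). A hedged alternative, assuming every transcript name follows that `GMT%Y%m%d-%H%M%S` prefix convention, would be to parse it:

```python
from datetime import datetime
from zoneinfo import ZoneInfo

def base_time_from_filename(name: str, tz: ZoneInfo) -> datetime:
    stamp = name.split("_", 1)[0]  # e.g. "GMT20250411-223535"
    return (datetime.strptime(stamp, "GMT%Y%m%d-%H%M%S")
            .replace(tzinfo=ZoneInfo("GMT"))
            .astimezone(tz))

assert base_time_from_filename(FILE_PATH, TIME_ZONE) == BASE_TIME
```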
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"with open(FILE_PATH, \"rb\") as file:\n",
"    web_vtt = WebVTT.from_buffer(BytesIO(file.read()))"
]
},
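`from_buffer` parses from a binary buffer; the member listing below also shows `from_string`, `from_srt`, and `from_sbv` constructors. For an already-decoded text file, something like the following should be equivalent (an assumed usage based on that listing, not exercised in this notebook):

```python
# Parse from text rather than a binary buffer.
with open(FILE_PATH, encoding="utf-8") as file:
    web_vtt_from_text = WebVTT.from_string(file.read())
```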
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<ul><li>__class__</li><li>__delattr__</li><li>__dict__</li><li>__dir__</li><li>__doc__</li><li>__eq__</li><li>__format__</li><li>__ge__</li><li>__getattribute__</li><li>__getitem__</li><li>__getstate__</li><li>__gt__</li><li>__hash__</li><li>__init__</li><li>__init_subclass__</li><li>__le__</li><li>__len__</li><li>__lt__</li><li>__module__</li><li>__ne__</li><li>__new__</li><li>__reduce__</li><li>__reduce_ex__</li><li>__repr__</li><li>__setattr__</li><li>__sizeof__</li><li>__str__</li><li>__subclasshook__</li><li>__weakref__</li><li>_get_destination_file</li><li>_get_lines</li><li>_has_bom</li><li>captions</li><li>content</li><li>encoding</li><li>file</li><li>footer_comments</li><li>from_buffer</li><li>from_sbv</li><li>from_srt</li><li>from_string</li><li>header_comments</li><li>iter_slice</li><li>read</li><li>read_buffer</li><li>save</li><li>save_as_srt</li><li>styles</li><li>total_length</li><li>write</li></ul>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_html(\"\".join(chain(\"<ul>\", (f\"<li>{escape(member)}</li>\" for member in dir(web_vtt)), \"</ul>\")))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <strong>Caption</strong> #344\n",
" <ul>\n",
" <li><strong>Start:</strong> Friday, April 11, 2025, 07:36:54 PM EDT</li>\n",
" <li><strong>Speaker:</strong> CUNY Tech Prep (CTP)</li>\n",
" <li><strong>Speech:</strong> Alright. You can pick the rooms. Now go into your rooms.</li>\n",
" <li><strong>End:</strong> Friday, April 11, 2025, 07:36:57 PM EDT</li>\n",
" </ul>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"speaker_speech_pattern = re.compile(\"(?:([^:]+): )?(.*)\")\n",
"\n",
"match web_vtt.captions[343]:\n",
"    case Caption(identifier=identifier, start_time=start_time, end_time=end_time, text=text):\n",
"        match speaker_speech_pattern.search(text).groups():\n",
"            case (speaker, speech):\n",
"                display_html(f\"\"\"\n",
" <strong>Caption</strong> #{identifier}\n",
" <ul>\n",
" <li><strong>Start:</strong> {BASE_TIME + timedelta(**start_time.__dict__):%A, %B %d, %Y, %I:%M:%S %p %Z}</li>\n",
" <li><strong>Speaker:</strong> {escape(speaker)}</li>\n",
" <li><strong>Speech:</strong> {escape(speech)}</li>\n",
" <li><strong>End:</strong> {BASE_TIME + timedelta(**end_time.__dict__):%A, %B %d, %Y, %I:%M:%S %p %Z}</li>\n",
" </ul>\n",
" \"\"\")"
]
},
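The `(?:([^:]+): )?(.*)` pattern above splits a caption into an optional speaker prefix and the speech itself; a caption with no `Speaker: ` prefix yields `None` for the first group. A quick self-contained check (the second input string is invented for illustration):

```python
import re

speaker_speech_pattern = re.compile("(?:([^:]+): )?(.*)")
assert speaker_speech_pattern.search("CUNY Tech Prep (CTP): Alright.").groups() == ("CUNY Tech Prep (CTP)", "Alright.")
assert speaker_speech_pattern.search("no speaker prefix here").groups() == (None, "no speaker prefix here")
```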
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Chunking\n",
"\n",
"For chunking to produce pieces with useful context, we must not use each caption (frame) in isolation, but bundle it with its surrounding frames (the captions immediately before and after it)."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from more_itertools import windowed"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"CHUNK_FRAMES_OVERLAP = 1\n",
"CHUNK_FRAMES_WINDOW = 5"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table><tr><td>A</td></tr><tr><td>B</td></tr><tr><td>C</td></tr><tr><td>D</td></tr><tr><td>E</td></tr><tr><td>F</td></tr><tr><td>G</td></tr><tr><td>H</td></tr><tr><td>I</td></tr><tr><td>J</td></tr><tr><td>K</td></tr><tr><td>L</td></tr><tr><td>M</td></tr><tr><td>N</td></tr><tr><td>O</td></tr><tr><td>P</td></tr><tr><td>Q</td></tr><tr><td>R</td></tr><tr><td>S</td></tr><tr><td>T</td></tr><tr><td>U</td></tr><tr><td>V</td></tr><tr><td>W</td></tr><tr><td>X</td></tr><tr><td>Y</td></tr><tr><td>Z</td></tr></table>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"items = tuple(chr(code_point) for code_point in range(ord('A'), ord('[')))\n",
"display_html(f\"<table>{\"\".join(map(\"<tr><td>{}</td></tr>\".format, items))}</table>\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table><tr><td>A</td><td>B</td><td>C</td><td>D</td><td>E</td></tr><tr><td>E</td><td>F</td><td>G</td><td>H</td><td>I</td></tr><tr><td>I</td><td>J</td><td>K</td><td>L</td><td>M</td></tr><tr><td>M</td><td>N</td><td>O</td><td>P</td><td>Q</td></tr><tr><td>Q</td><td>R</td><td>S</td><td>T</td><td>U</td></tr><tr><td>U</td><td>V</td><td>W</td><td>X</td><td>Y</td></tr><tr><td>Y</td><td>Z</td><td></td><td></td><td></td></tr></table>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"chunks = tuple(windowed(items, CHUNK_FRAMES_WINDOW, step=(CHUNK_FRAMES_WINDOW - CHUNK_FRAMES_OVERLAP)))\n",
"display_html(f\"<table>{\"\".join(f\"<tr>{\"\".join(f\"<td>{item if item else \"\"}</td>\" for item in chunk)}</tr>\" for chunk in chunks)}</table>\")"
]
},
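With a window of w frames advancing by a step of s = w − overlap, n items produce ⌈(n − w) / s⌉ + 1 windows, the last one padded with `None`; for n = 26, w = 5, s = 4 that is ⌈21/4⌉ + 1 = 7, matching the seven table rows above. A one-line check against the cell's variables (the closed-form count is our own derivation, not from the notebook):

```python
from math import ceil

n, w, s = len(items), CHUNK_FRAMES_WINDOW, CHUNK_FRAMES_WINDOW - CHUNK_FRAMES_OVERLAP
assert len(chunks) == ceil((n - w) / s) + 1 == 7
```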
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using the `WebVTTContent` Class"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"from hashlib import sha256\n",
"from zoneinfo import ZoneInfo\n",
"\n",
"from ctp_slack_bot.models import WebVTTContent"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"FILE_PATH = \"GMT20250411-223535_Recording.transcript.vtt\"\n",
"TIME_ZONE = ZoneInfo(\"America/New_York\")\n",
"MODIFICATION_TIME = datetime(2025, 4, 11, hour=22, minute=35, second=35, tzinfo=ZoneInfo(\"GMT\")).astimezone(TIME_ZONE)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"with open(FILE_PATH, \"rb\") as file:\n",
"    bytes = file.read()\n",
"    web_vtt_content = WebVTTContent.from_bytes(sha256(bytes).hexdigest(), {\"modification_time\": MODIFICATION_TIME}, bytes)"
]
},
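The first argument to `from_bytes` is the file's SHA-256 hex digest, and it resurfaces as every chunk's `parent_id` in the output below ('38b3a5ac…'), so the same bytes always map to the same parent document. A hedged check, assuming `WebVTTContent` exposes `get_chunks()` as in the vectorizer notebook and that chunks carry a `parent_id` attribute:

```python
digest = sha256(bytes).hexdigest()  # note: `bytes` shadows the builtin here, as in the cell above
assert all(chunk.parent_id == digest for chunk in web_vtt_content.get_chunks())
```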
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(Chunk(text=\"iyeshia: For the workshop. We want to set you up.\\n\\niyeshia: Thank you, Kevin, for a question. We want to set you up for success in year one. And so this workshop is to help you kind of like\\n\\niyeshia: figure out, or how to adjust, as you're coming into your careers what to expect like your 30 days of work, 60 days of work, 90 days of work when you are starting your full time roles. So with that, said, let us get started.\\n\\niyeshia: So the topic, of course, is going to be discussing things of like the onboarding process of what it looks like when you start your jobs. How to maneuver or move around in your workplace environments. We'll discuss negotiating raises, because last time we didn't negotiating offers. So now we pass that you already got the offer. So now we'd be at the\\n\\niyeshia: the race card after that year. Don't try to come into your job already. 5 days in somebody to raise. Wait, and then from there we'll do activity on asking for feedback when you have, like your supervisor or manager, and you want to discuss things like that.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='1-5', metadata={'start': datetime.timedelta(0), 'end': datetime.timedelta(seconds=60, microseconds=379000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: the race card after that year. Don't try to come into your job already. 5 days in somebody to raise. Wait, and then from there we'll do activity on asking for feedback when you have, like your supervisor or manager, and you want to discuss things like that.\\n\\niyeshia: So let's kick it off with the onboarding process.\\n\\niyeshia: So with this, what you can expect ideally when you start your your job. There could be some type of welcome package. They might have a folder. They might have an email electronically or things like that. But it's gonna describe the details of like the company's environment. What your 1st day, or your 1st week or 1st month, a couple of months, might look like. As you're starting your onboarding process and the paperwork they might even show with you on the 1st day\\n\\niyeshia: work. You might be paired up with a Buddy or other people who might be hired at the same day, or maybe someone who was hired a year before, and they might be shadowing you to help you join and to get comfortable with your work environment.\\n\\niyeshia: and then also, your manager will. Hopefully, our supervisor would let you know what to expect. As you're starting your new\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='5-9', metadata={'start': datetime.timedelta(seconds=45, microseconds=930000), 'end': datetime.timedelta(seconds=108, microseconds=640000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: and then also, your manager will. Hopefully, our supervisor would let you know what to expect. As you're starting your new\\n\\niyeshia: job or career, and then from there, if you're unsure about your onboarding process as you're starting off, please ask questions to your manager or supervisor. The best part is to ask as many questions as you can. You're new, you're learning. They understand that. So they want to hear from you and your input\\n\\niyeshia: from there, I would say, I'm just looking at the\\n\\niyeshia: the chat. Yes, prepare for a lot of paperwork. Yes, I mean W. 2 W. Fours. They might have you fill out all those things. And that was 2. Okay, all right, Kevin.\\n\\niyeshia: So from there we'll kick it off. So an idea of what that could look like for you from 30 days to 60 days to 90 days to infinity and beyond like buzz light year, but from there you would hopefully to have intros with your your team, your manager, different departments. When you're starting\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='9-13', metadata={'start': datetime.timedelta(seconds=102, microseconds=82000), 'end': datetime.timedelta(seconds=166, microseconds=199000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: So from there we'll kick it off. So an idea of what that could look like for you from 30 days to 60 days to 90 days to infinity and beyond like buzz light year, but from there you would hopefully to have intros with your your team, your manager, different departments. When you're starting\\n\\niyeshia: they'll go over etiquette with you of like what you can expect. At the job that can include your attire, your desk hygiene communication, checking in with managers or teams.\\n\\niyeshia: Once you, after the 30 days we get to maybe days, 60 days, and then you're able to develop like your needs. Gain a better understanding of the company, develop plans and deliverables and outcomes. And then you go into your 90 days of being on the job where you're kind of learning your role. You're kind of getting adjust, you're being more effective and being becoming more independent.\\n\\niyeshia: And then from there you be able to understand, like, after the 90 days that you're kind of like settled in maybe months 4 to 6, or maybe the whole year. You should be settled into your role, understanding what's going on understanding how different departments move and things like that. So this is just the overview of what that looks like. It's not necessarily concrete, because every job is different.\\n\\niyeshia: But this is just to give an idea of what you can expect of that. And please just be mindful like with every workshop. I'm definitely going to send you the Powerpoint at the end. So if you want to look over that on your own time, you definitely can.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='13-17', metadata={'start': datetime.timedelta(seconds=147, microseconds=8000), 'end': datetime.timedelta(seconds=233, microseconds=730000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: But this is just to give an idea of what you can expect of that. And please just be mindful like with every workshop. I'm definitely going to send you the Powerpoint at the end. So if you want to look over that on your own time, you definitely can.\\n\\niyeshia: And so now that we've got through the onboarding process, this is probably the quickest we've done onboarding process because Kevin did it in 2 weeks. So from there we are going to move to navigating the workplace environment.\\n\\niyeshia: And so with that said, some things that are really important in your workplace environment is building relationships. Whether that's with your peers, your colleagues. Your manager. Trying to have a mentor mentee connection. All relationships are important.\\n\\niyeshia: With that I would say that when it comes to identifying your relationship needs, you want to know what you're expecting like, what? How do you need to show up in your role. What do you need from others? Understanding those type of things can help build better, I would say. Connections with your teammates and things of that nature when it's time to like cover problems or solve projects and things like that.\\n\\niyeshia: Another thing, too, you want to focus on is your Eiq. Emotional intelligence and communication that is basically pretty much helpful on the ability of recognizing your own emotions. Are you adequate enough, or know where your emotions are where you can get things done, what you need, what you don't need? Can you articulate that to your employer when you know those you can be able to identify and handle your emotions.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='17-21', metadata={'start': datetime.timedelta(seconds=220, microseconds=406000), 'end': datetime.timedelta(seconds=315, microseconds=170000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Another thing, too, you want to focus on is your Eiq. Emotional intelligence and communication that is basically pretty much helpful on the ability of recognizing your own emotions. Are you adequate enough, or know where your emotions are where you can get things done, what you need, what you don't need? Can you articulate that to your employer when you know those you can be able to identify and handle your emotions.\\n\\niyeshia: And you can add basically help also to learn how to understand and help others. As well.\\n\\niyeshia: Another thing, as far as building relationships goes, is practicing, mindful listening. So the best way to truly listen is to talk less, and of course to understand more. And so when you learn from your teammates, listen as much as you can gain as much knowledge as you can from others, and that's gonna help you kinda conduct, or, you know, be a better team player. In your work environment.\\n\\niyeshia: And then a few things that you can do is\\n\\niyeshia: another way to help build a relationship is manager boundaries, you know, saying what is for you, scheduling time? With colleagues trying not to go over certain tasks or assignments. So that time management is gonna definitely help when you want to focus on your boundaries and you want to set schedules to maybe build connections with your team, and these are ways that you can go about it. Introduce yourself to people, whether your peers, whether it's\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='21-25', metadata={'start': datetime.timedelta(seconds=288, microseconds=600000), 'end': datetime.timedelta(seconds=376, microseconds=110000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: another way to help build a relationship is manager boundaries, you know, saying what is for you, scheduling time? With colleagues trying not to go over certain tasks or assignments. So that time management is gonna definitely help when you want to focus on your boundaries and you want to set schedules to maybe build connections with your team, and these are ways that you can go about it. Introduce yourself to people, whether your peers, whether it's\\n\\niyeshia: I don't care if it's a janitor security. The Cfo treat everybody equal and the same. And get to know. Get to know people because you just never know when you're going to need someone or work with someone. During that time.\\n\\niyeshia: And so those are the ways you can go about it. Greet people. You can invite people to coffee breaks, do quick message, check-in, and things of that nature, and then from there the 6 or 7 1, i think, are really important in the workplace environment. Some of the things you want to do is show gratitude, embrace others, give.\\n\\niyeshia: you know, credit where credit is due. Don't try to take anybody's ideas. If it comes to projects and things like that, that is a serious no-no show gratitude, and by any means necessary, try to avoid any gossip, any issues with office politics stay out of it. This is your first.st\\n\\niyeshia: This might be your 1st real like role, as far as like full time. In your career. So you just want to make sure you just keep in the peace and be respectful from there. Gossiping is kind of a big deal and a big no-no as well. So just be mindful of that.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='25-29', metadata={'start': datetime.timedelta(seconds=351, microseconds=10000), 'end': datetime.timedelta(seconds=438, microseconds=590000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: This might be your 1st real like role, as far as like full time. In your career. So you just want to make sure you just keep in the peace and be respectful from there. Gossiping is kind of a big deal and a big no-no as well. So just be mindful of that.\\n\\niyeshia: So the next thing, as far as we're talking about building relationship goals, you definitely want to also build those relationships, as I stated, with your peers. And things like that. Your coworkers? But you want to make sure you build a relationship with your manager. And just remember that it's important to have a relationship with your manager. But that's not the only relationship that's like you should focus on, you know. Like, I said before, you want to be a team play. You want to treat everybody equally because you just never know who you connect with.\\n\\niyeshia: But when it comes to that manager time, or asking for I would say, supervisions or meetings with them. You can ask questions. Those are always encouraged. You can ask them about their you know, supervisor style. Are they transformative? Are they hands on?\\n\\niyeshia: Do they like feedback directly towards them? Is everything written email? How are they? What's their work? Style? You can even ask them for the expectations of what is this like in a role like, what are your expectations, as far as how you show up in your role to them? And what are they looking for like with the measurements of success. Of course we always tell fellows to document everything that you do, as far as like when it comes to any goals that you bring any success.\\n\\niyeshia: rate, that you have many tasks that you might have brought to the table any of your accomplishments I know some people carry, or they write down like a accomplishment form of all the things that they've done, which, while they were at work to help with the ideas of what they bring to the table when it's time to come up for that, raise negotiation process. So just make sure you also update your resume as we go along, too.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='29-33', metadata={'start': datetime.timedelta(seconds=424, microseconds=830000), 'end': datetime.timedelta(seconds=536, microseconds=219000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: rate, that you have many tasks that you might have brought to the table any of your accomplishments I know some people carry, or they write down like a accomplishment form of all the things that they've done, which, while they were at work to help with the ideas of what they bring to the table when it's time to come up for that, raise negotiation process. So just make sure you also update your resume as we go along, too.\\n\\niyeshia: and then to talk with your manager about not only your successes and what you accomplish, but maybe areas of where you can grow and what you've been struggling to focus on so they can help support you with that as well.\\n\\niyeshia: Be observant in meetings when you're meeting with your team and other people. So that way you could learn about what else is going on, or whatever what everybody else is doing. So you can see how things work together. If you want to connect and socialize, you can ask people to lunch or coffee chats and things like that, and then always just remain proactive. You know it's always a good gesture to ask for teammate. It's like, Hey, is there anything you need before you know the end of the day? Or before I'm about to leave. You know things like that. It's always\\n\\niyeshia: helpful, too, because you never know when it's like your time, and someone is asking or offering help to you. And you're like, Oh, yeah, definitely need help with this. So it's always great to return their favor.\\n\\niyeshia: And so\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='33-37', metadata={'start': datetime.timedelta(seconds=511, microseconds=850000), 'end': datetime.timedelta(seconds=589, microseconds=330000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: And so\\n\\niyeshia: from there I would say, overall in regards of meeting with your supervisor, depending on how they do it. It could be quarterly it could be every other month. It could be 3 times throughout the year. They have a performance review. And so some companies like to start with, maybe January, you start, or maybe June, you started\\n\\niyeshia: working with them, and you track goals and what you could accomplish. With your manager until, like the next meeting, you have to go over just to make sure that you're on track with your goals throughout the throughout the year, as you've been working with your with your company.\\n\\niyeshia: That you got hired by, and so sometimes they'll do like a mid year review report to see your progress. If there's any touch points they could assist you with or support you with. You can meet with them with one on one meetings. If you feel like that's too long, and you want to make suggestions to meet with them sooner. Maybe you want to do every 3 months\\n\\niyeshia: just to see what's going on and how you can stay on track, and so I would say. Performance reviews, I guess, could be nerve wracking if it's like your 1st time, because you don't know what to expect.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='37-41', metadata={'start': datetime.timedelta(seconds=587, microseconds=800000), 'end': datetime.timedelta(seconds=654, microseconds=640000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: just to see what's going on and how you can stay on track, and so I would say. Performance reviews, I guess, could be nerve wracking if it's like your 1st time, because you don't know what to expect.\\n\\niyeshia: but of course you'll get used to it. As it progresses. But then, of course, you're still maintaining those connections with your supervisor, so you can definitely ask them questions of what you can expect from a performance review and things like that.\\n\\niyeshia: I'll pause here. If anybody has any questions about anything that I've mentioned. Anything like that?\\n\\niyeshia: Any questions? Are we all good.\\n\\nCUNY Tech Prep (CTP): Now's your chance before you forget what you wanted to ask.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='41-45', metadata={'start': datetime.timedelta(seconds=645, microseconds=172000), 'end': datetime.timedelta(seconds=682, microseconds=250000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
" Chunk(text=\"CUNY Tech Prep (CTP): Now's your chance before you forget what you wanted to ask.\\n\\nCUNY Tech Prep (CTP): No takers.\\n\\nCUNY Tech Prep (CTP): I have a few comments.\\n\\niyeshia: You want to go ahead, Kevin.\\n\\nCUNY Tech Prep (CTP): Well, self, I see self document as also having a secondary goal, particularly if you find yourself in\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='45-49', metadata={'start': datetime.timedelta(seconds=678, microseconds=110000), 'end': datetime.timedelta(seconds=700, microseconds=910000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
" Chunk(text=\"CUNY Tech Prep (CTP): Well, self, I see self document as also having a secondary goal, particularly if you find yourself in\\n\\nCUNY Tech Prep (CTP): not such a nice work environment.\\n\\nCUNY Tech Prep (CTP): It helps prevent people from gaslighting. You, for example.\\n\\nCUNY Tech Prep (CTP): And like it keeps you out of trouble. Let's say cause if you self document, then\\n\\nCUNY Tech Prep (CTP): you know exactly what was decided on.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='49-53', metadata={'start': datetime.timedelta(seconds=693, microseconds=509000), 'end': datetime.timedelta(seconds=720, microseconds=809000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
" Chunk(text=\"CUNY Tech Prep (CTP): you know exactly what was decided on.\\n\\nCUNY Tech Prep (CTP): And you're just following exactly what was said.\\n\\niyeshia: That is correct.\\n\\nCUNY Tech Prep (CTP): And then the setting boundaries right.\\n\\nCUNY Tech Prep (CTP): and there are some. There are some\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='53-57', metadata={'start': datetime.timedelta(seconds=717, microseconds=970000), 'end': datetime.timedelta(seconds=732, microseconds=590000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
" Chunk(text=\"CUNY Tech Prep (CTP): and there are some. There are some\\n\\nCUNY Tech Prep (CTP): bosses who will push your boundaries. Try to get you to like\\n\\nCUNY Tech Prep (CTP): do overtime. Stay longer than like\\n\\nCUNY Tech Prep (CTP): your stay longer than what's on like the contract, or whatever.\\n\\nCUNY Tech Prep (CTP): If you give an inch sometimes they'll take a mile, so\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='57-61', metadata={'start': datetime.timedelta(seconds=729, microseconds=400000), 'end': datetime.timedelta(seconds=749, microseconds=960000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
" Chunk(text=\"CUNY Tech Prep (CTP): If you give an inch sometimes they'll take a mile, so\\n\\nCUNY Tech Prep (CTP): you should be very clear on\\n\\nCUNY Tech Prep (CTP): your time. Your time limits, like.\\n\\nCUNY Tech Prep (CTP): you know, have always have an out, for\\n\\nCUNY Tech Prep (CTP): when too much is being requested of you.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='61-65', metadata={'start': datetime.timedelta(seconds=745, microseconds=275000), 'end': datetime.timedelta(seconds=767, microseconds=120000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
" Chunk(text=\"CUNY Tech Prep (CTP): when too much is being requested of you.\\n\\nCUNY Tech Prep (CTP): My usual go to is like, Oh, I I have like I have a meeting for Ctp, or like I have class.\\n\\niyeshia: Very good. That's good to good to know. And I know. David. Put in the chat like for an example of documentation. On March 16, th at 4, 35, you said, and I quote that is, that is exactly.\\n\\nCUNY Tech Prep (CTP): Under my lap.\\n\\niyeshia: But if you're in that situation, you definitely, it's so fresh, and it's so like truthful, like someone's like, no, I'm not going to doubt that someone made that.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='65-69', metadata={'start': datetime.timedelta(seconds=764, microseconds=400000), 'end': datetime.timedelta(seconds=803, microseconds=550000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: But if you're in that situation, you definitely, it's so fresh, and it's so like truthful, like someone's like, no, I'm not going to doubt that someone made that.\\n\\nCUNY Tech Prep (CTP): Yeah.\\n\\niyeshia: We wrote that and gave them the time so absolutely documentation goals for the good and for the bad. So definitely. Thank you for sharing that Kevin and David?\\n\\niyeshia: And so with that said, We'll go on to the the next slide. Which is a question of is my manager the same as having a mentor. Does anybody want to come off the come off mute and say yes or no?\\n\\niyeshia: I can just call on Kyle.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='69-73', metadata={'start': datetime.timedelta(seconds=795, microseconds=400000), 'end': datetime.timedelta(seconds=831, microseconds=790000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: I can just call on Kyle.\\n\\nCUNY Tech Prep (CTP): Kyle, you there.\\n\\nKyle Schoenhardt: No, it's not.\\n\\niyeshia: Okay, let's see.\\n\\niyeshia: Yay, good job, PAL. The answer is, no.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='73-77', metadata={'start': datetime.timedelta(seconds=828, microseconds=820000), 'end': datetime.timedelta(seconds=844, microseconds=930000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'Kyle Schoenhardt', 'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Yay, good job, PAL. The answer is, no.\\n\\niyeshia: Did you want to give more input?\\n\\nKyle Schoenhardt: Yeah. Sure.\\n\\niyeshia: Yeah.\\n\\nKyle Schoenhardt: Well, I mean, sometimes you can just have really bad managers who are there to cover their own self, make themselves look good sometimes at your expense, or they micromanage, or you just don't click well with that person. For whatever reason a mentor is akin to a leader, I think they are there to lift you up and show you\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='77-81', metadata={'start': datetime.timedelta(seconds=841, microseconds=340000), 'end': datetime.timedelta(seconds=869, microseconds=440000), 'speakers': frozenset({'Kyle Schoenhardt', 'iyeshia'})}),\n",
" Chunk(text=\"Kyle Schoenhardt: Well, I mean, sometimes you can just have really bad managers who are there to cover their own self, make themselves look good sometimes at your expense, or they micromanage, or you just don't click well with that person. For whatever reason a mentor is akin to a leader, I think they are there to lift you up and show you\\n\\nKyle Schoenhardt: how you can improve on yourself like a coach.\\n\\nKyle Schoenhardt: Constantly giving you feedback, whether positive or negative.\\n\\nKyle Schoenhardt: I would say someone you would\\n\\nKyle Schoenhardt: go to immediately like. If the 1st person you think of that you need help with something is not your manager, then that's\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='81-85', metadata={'start': datetime.timedelta(seconds=850, microseconds=340000), 'end': datetime.timedelta(seconds=885, microseconds=510000), 'speakers': frozenset({'Kyle Schoenhardt'})}),\n",
" Chunk(text=\"Kyle Schoenhardt: go to immediately like. If the 1st person you think of that you need help with something is not your manager, then that's\\n\\nKyle Schoenhardt: a good indicator, that that person is not a mentor, or, if you need help with something, your your 1st go to person to that you think of is\\n\\nKyle Schoenhardt: someone else that is probably who your mentor is most likely to be, could be a coworker. It could be a manager, but it's not always.\\n\\niyeshia: Got it. Thank you, Kevin. I mean. Thank you, Kyle, said Kevin. Thank you. Kyle. Appreciate that. With that, said, I don't feel like I need to add any more. I feel like Kyle took that. So I'm gonna move on to the day.\\n\\niyeshia: So the next question is, should my manager, be my mentor.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='85-89', metadata={'start': datetime.timedelta(seconds=879, microseconds=360000), 'end': datetime.timedelta(seconds=919, microseconds=30000), 'speakers': frozenset({'Kyle Schoenhardt', 'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: So the next question is, should my manager, be my mentor.\\n\\niyeshia: Alison.\\n\\nAllison Lee: Well, you you can't force a mentor mentee relationship if that's not how it's going to work.\\n\\nAllison Lee: But it is possible for your manager to be some kind of mentor figure.\\n\\niyeshia: Thank you.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='89-93', metadata={'start': datetime.timedelta(seconds=914, microseconds=565000), 'end': datetime.timedelta(seconds=945, microseconds=810000), 'speakers': frozenset({'iyeshia', 'Allison Lee'})}),\n",
" Chunk(text=\"iyeshia: Thank you.\\n\\niyeshia: So with that, said.\\n\\niyeshia: that depends. So I appreciate Allison. Your response. It definitely depends. Can't force them. But of course, if you do get along with your supervisor, and you want to ask them that\\n\\niyeshia: by all means. But good, answers everyone.\\n\\niyeshia: So now we go more in depth of what can good mentorship look like? And so from there I would say, mentors, as\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='93-97', metadata={'start': datetime.timedelta(seconds=944, microseconds=920000), 'end': datetime.timedelta(seconds=975, microseconds=362000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: So now we go more in depth of what can good mentorship look like? And so from there I would say, mentors, as\\n\\niyeshia: Kyle touched on was that they provide support, wisdom to help you succeed in certain examples are, this is pretty much sharing any ideas you might have with them from paying program with you on a code base providing feedback, maybe on a slide deck to helping you remind that it's impossible to know everything. So they're kind of reassuring you in your in your role as you're starting your career.\\n\\niyeshia: and then you want to make sure your mentor is a is a safe space for you at the time. Sometimes your mentor. You can talk to your mentor about your manager sometimes if they are difficult or not, and so from there it's a form of trust\\n\\niyeshia: with your with your mentor. So if you have, if you are blessed to have a supervisor who can be both roles, a manager and a mentor. Go for it, if you're like. I'm still learning. I'm only 3, 30 days in 60 days, 90 days. Take your time, then. So that is definitely something to to know from that.\\n\\niyeshia: And then questions of Where can I find? A mentor? And so, before I even answer this question, who can tell me what erg stands for\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='97-101', metadata={'start': datetime.timedelta(seconds=964, microseconds=630000), 'end': datetime.timedelta(seconds=1046, microseconds=430000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: And then questions of Where can I find? A mentor? And so, before I even answer this question, who can tell me what erg stands for\\n\\niyeshia: anyone?\\n\\niyeshia: Go ahead, Devon, please.\\n\\nDevin Xie (no cam): Employee resource groups.\\n\\niyeshia: Thank you so much, Devin. I appreciate you and blouse right there. Next to erg. So the examples of that can be any groups that they have at your job related to Lgbtq. It could be groups related to race and identity. It could be anything from parenthood. I wish they had groups related for auntiehood and things of that nature. But it's all about finding your community and resources for things to help support you while you're working\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='101-105', metadata={'start': datetime.timedelta(seconds=1035, microseconds=839000), 'end': datetime.timedelta(seconds=1085, microseconds=780000), 'speakers': frozenset({'iyeshia', 'Devin Xie (no cam)'})}),\n",
" Chunk(text=\"iyeshia: Thank you so much, Devin. I appreciate you and blouse right there. Next to erg. So the examples of that can be any groups that they have at your job related to Lgbtq. It could be groups related to race and identity. It could be anything from parenthood. I wish they had groups related for auntiehood and things of that nature. But it's all about finding your community and resources for things to help support you while you're working\\n\\niyeshia: in some of your environments. And then, when you have your community, you can always reflect on interests related to tech.\\n\\niyeshia: or maybe research on your company like, who's in your area. And you could always reach out to some people for informational interviews. If you're really trying to seek this mentor Mentee relationship from people who are at your company. So just to keep that in mind.\\n\\niyeshia: I think I saw something.\\n\\niyeshia: Auntie Hood. Yes, and then I think, Mingle, said Manager supervisors are not your friend. Their one and only job is to find a person that can get the job done. Okay, come on, now, very good. And so\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='105-109', metadata={'start': datetime.timedelta(seconds=1057, microseconds=780000), 'end': datetime.timedelta(seconds=1131, microseconds=240000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Auntie Hood. Yes, and then I think, Mingle, said Manager supervisors are not your friend. Their one and only job is to find a person that can get the job done. Okay, come on, now, very good. And so\\n\\niyeshia: with that, said, I think y'all know the roles between manager and mentor, and I appreciate that.\\n\\niyeshia: So now the next part is negotiating raises. So the last workshop we did was negotiating offers, as I stated before. So this one's gonna be a little different. You got the job. So now, after that whole success in your 1st year you want to start discussing maybe time for a raise. So let's get into that.\\n\\niyeshia: So you did a great job.\\n\\niyeshia: 1st year you knocked it out. You got outcomes, you got successes. You're amazing. On the 1st year what happens now?\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='109-113', metadata={'start': datetime.timedelta(seconds=1114, microseconds=170000), 'end': datetime.timedelta(seconds=1167, microseconds=119000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: 1st year you knocked it out. You got outcomes, you got successes. You're amazing. On the 1st year what happens now?\\n\\niyeshia: Your success is going to be measured by achievements, contributions into your organization, and that could be rewarded with\\n\\niyeshia: money or something else you value that could be related to time. Things of that nature. You want to go up based off your benefits. As we stated before, in the last workshop, you might wanna negotiate that. But if you want to talk about money first.st That's okay, too.\\n\\niyeshia: And these are gonna help you, too, as well with your I would say. Manager or supervisor. Meetings\\n\\niyeshia: from there.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='113-117', metadata={'start': datetime.timedelta(seconds=1160, microseconds=790000), 'end': datetime.timedelta(seconds=1199, microseconds=450000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: from there.\\n\\niyeshia: So just remember that it's okay when you when you flex those negotiating offers or flex those muscles during conversations around raises. It's not bragging. If you're talking about your achievements and things like that. It's okay to to talk about your successes, you know, especially during a raise time, because you're trying to show your manager or prove what you brought to the to the table. So keep that in mind.\\n\\niyeshia: So how does it look.\\n\\nCUNY Tech Prep (CTP): Comments, sorry.\\n\\niyeshia: Yeah, that is.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='117-121', metadata={'start': datetime.timedelta(seconds=1198, microseconds=703000), 'end': datetime.timedelta(seconds=1228, microseconds=390000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
" Chunk(text='iyeshia: Yeah, that is.\\n\\nCUNY Tech Prep (CTP): Something you would also document. If your manager praises you, you document that.\\n\\niyeshia: That.\\n\\nCUNY Tech Prep (CTP): Is evidence you can use in your negotiations.\\n\\niyeshia: That is such a fact.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='121-125', metadata={'start': datetime.timedelta(seconds=1227, microseconds=350000), 'end': datetime.timedelta(seconds=1240, microseconds=380000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: That is such a fact.\\n\\niyeshia: I literally just copy to paste everything, my manager said. Yep, one of my negotiation days. Yep, so thank you, Kevin, for saying that? So with that said, if you have those those meetings with them, document not only what you say, but what they said, as Kevin mentioned.\\n\\niyeshia: That was great in the negotiating offer. So how else do we prepare for this?\\n\\niyeshia: You're going to research? Yes, you're going to gather all your feedback, whether it's from your colleagues and meetings, whether it's from the success that you hear from your manager or tips from people that you work with, you're going to make sure you learn about your role. What's going on in the market. Just research is going to be your best.\\n\\niyeshia: Put input on this as well. When you're talking about your salary. The next thing you want to do is list the accomplishments. Keep those documents. Don't wait to the last minute you get to the end of the year. You're like, what did I do? It's been 12 months, like.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='125-129', metadata={'start': datetime.timedelta(seconds=1238, microseconds=990000), 'end': datetime.timedelta(seconds=1296, microseconds=189000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Put input on this as well. When you're talking about your salary. The next thing you want to do is list the accomplishments. Keep those documents. Don't wait to the last minute you get to the end of the year. You're like, what did I do? It's been 12 months, like.\\n\\niyeshia: yeah, document everything, because you might forget some stuff. So that's definitely gonna help, too.\\n\\niyeshia: With that, said, you want to make sure you remind everyone. Maybe you save a bunch of money for the company. Oh, maybe you help them with other accomplishments, or maybe you spend off a project that's done really well. For your department. Share it. So please feel free to do that.\\n\\niyeshia: and then that will also help you keep your resume updated as well. So you don't have to worry about trying to\\n\\niyeshia: scatter or get all your thoughts together at the last minute.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='129-133', metadata={'start': datetime.timedelta(seconds=1281, microseconds=940000), 'end': datetime.timedelta(seconds=1331, microseconds=399000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: scatter or get all your thoughts together at the last minute.\\n\\niyeshia: And then with that status also, your manager needs to have the facts, too, to convince their boss to approve you for a raise. So if your manager is giving you the praises already, they're like, yeah, I did say that like\\n\\niyeshia: as well. Even if they make a joke like saying to you like, Hey, you deserve a raise document that you could go right back to like, you know. April 11th at 5, at 6 58 pm. You said, I deserve a raise this time like it. Just everything will just work for you in your favor for that, so please feel free to do that.\\n\\niyeshia: And so now you did the you did the raise. You had the meeting with your your manager. They're proposing it to the Supervisor, or things of that nature. I know different companies work in different ways, so they might have you go directly to your boss's boss to talk about the raise, or whoever is in charge of that\\n\\niyeshia: common, to negotiate that with them. But every company is different. But if they say yes, that's great job all done. Now, what if you get to a conversation where they say, No, what do you do, then? Well, there are alternatives for that. You can ask to work on, maybe towards a promotion. You know what I'm saying as far as if they say based off your level. We can't go any higher than that\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='133-137', metadata={'start': datetime.timedelta(seconds=1328, microseconds=370000), 'end': datetime.timedelta(seconds=1410, microseconds=720000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: common, to negotiate that with them. But every company is different. But if they say yes, that's great job all done. Now, what if you get to a conversation where they say, No, what do you do, then? Well, there are alternatives for that. You can ask to work on, maybe towards a promotion. You know what I'm saying as far as if they say based off your level. We can't go any higher than that\\n\\niyeshia: negotiate for promotion which would include maybe getting a title change, or better money that comes with it. This is why we say research, because you can definitely research what's going on in the market saying, Hey, that's my job. But the title is different.\\n\\niyeshia: Look that up and like definitely propose that if you want to. You can even ask for a faster review cycle. If they say something like, Hey, we can't give that to you. Just yet today. But let's revisit this topic on the 6 months, maybe, like, hey? Can we meet sooner, maybe in 3 months, to discuss more about how I can go about this\\n\\niyeshia: and then you could simply, if they say no. Ask why? Because you don't want to hear anything as far as like knowing that period. No, they should give you an explanation for it. So always ask questions with that to help like what's driving? That? Was it bad timing? Is there a gap? Is there their cap? Is there certain budgets. Did I miss anything that could help? So they can definitely\\n\\niyeshia: share with you and tell you that information of why they might have done. It could be a whole timing thing. It could be a budget thing. But just keep in mind to keep so just to keep in mind you could ask for like. Go around it 3 these ways, let's say 3 different ways. You can go about the answer and no from there. With that, said, does anyone have any questions so far?\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='137-141', metadata={'start': datetime.timedelta(seconds=1386, microseconds=520000), 'end': datetime.timedelta(seconds=1487, microseconds=429000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: share with you and tell you that information of why they might have done. It could be a whole timing thing. It could be a budget thing. But just keep in mind to keep so just to keep in mind you could ask for like. Go around it 3 these ways, let's say 3 different ways. You can go about the answer and no from there. With that, said, does anyone have any questions so far?\\n\\niyeshia: Nobody. Okay. Devin.\\n\\nCUNY Tech Prep (CTP): Devin does Devon.\\n\\nDevin Xie (no cam): Just curious. So like, say, we\\n\\nDevin Xie (no cam): find some opportunity after we graduate from Cuny Tech fair.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='141-145', metadata={'start': datetime.timedelta(seconds=1467, microseconds=260000), 'end': datetime.timedelta(seconds=1503, microseconds=140000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia', 'Devin Xie (no cam)'})}),\n",
" Chunk(text=\"Devin Xie (no cam): find some opportunity after we graduate from Cuny Tech fair.\\n\\nDevin Xie (no cam): And then we have questions about this stuff like.\\n\\nDevin Xie (no cam): let's say we work there for like a year. And we\\n\\nDevin Xie (no cam): we stop. We we want to ask for some advice. Can we still hit you guys up.\\n\\niyeshia: Yeah, but you become alumni. You're not just gonna drop you all off in May and be like, bye. No, you can definitely you'll be invited. May like, after the graduation, I want to say in the summertime you'll get an invite to the alumni slack channel and you can join\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='145-149', metadata={'start': datetime.timedelta(seconds=1499, microseconds=630000), 'end': datetime.timedelta(seconds=1531, microseconds=469000), 'speakers': frozenset({'iyeshia', 'Devin Xie (no cam)'})}),\n",
" Chunk(text=\"iyeshia: Yeah, but you become alumni. You're not just gonna drop you all off in May and be like, bye. No, you can definitely you'll be invited. May like, after the graduation, I want to say in the summertime you'll get an invite to the alumni slack channel and you can join\\n\\niyeshia: that, and I will be gladly to assist you. There. We have a career coach there, but usually all the the staff is on the Ctv team is on the alumni channel. So yeah, definitely. But we also like, I said before, Devin, save the Powerpoint, too.\\n\\niyeshia: Just putting that out there? So yeah, good question.\\n\\niyeshia: Okay?\\n\\niyeshia: And so the next part is after the conversation for the the raise. You want to make sure. The conversation goes well, timing is going to be a part of that. So clarifying the process, asking them like, you know, when should I expect the raise? You know that's not being thirsty. That's that's your money. You can ask questions about it. And what's the next step for that?\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='149-153', metadata={'start': datetime.timedelta(seconds=1513, microseconds=30000), 'end': datetime.timedelta(seconds=1577, microseconds=890000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: And so the next part is after the conversation for the the raise. You want to make sure. The conversation goes well, timing is going to be a part of that. So clarifying the process, asking them like, you know, when should I expect the raise? You know that's not being thirsty. That's that's your money. You can ask questions about it. And what's the next step for that?\\n\\niyeshia: You can always confirm with your manager? Like. If the reason they said no, was it because there's certain maybe I would say physical years of like, how they what deadline they have for the New Year or the new budget. Time or deadline, was it? Did I miss it when I asked for a salary? Or when's the next time I should ask for a salary. Increase, and things like that. Cause your your department, or you would hope the team that you're on will show you throughout the year of like what's coming up and what you can expect.\\n\\niyeshia: So you definitely want to plan ahead next time. If they say no, and then review the work and the feedback asking for feedback. Was it my, the way that I would propose the raise? Is there anything I could do to get? You know better on that? That would help with the mentor, of course.\\n\\niyeshia: Cause the person you're proposing it to might not give the input. But definitely, a mentor is gonna help you with that as well to see what's going on. You could definitely check in with your manager. If they had any feedback they might tell your manager to like, let them know like this is why they might have said No or this? Why, they might have said, Not yet, or they'll say yes later. So keep that in mind.\\n\\niyeshia: and then let's see right\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='153-157', metadata={'start': datetime.timedelta(seconds=1553, microseconds=290000), 'end': datetime.timedelta(seconds=1648, microseconds=679000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: and then let's see right\\n\\niyeshia: from there we'll go to the activity.\\n\\niyeshia: And so from there, this is an activity of asking for feedback.\\n\\niyeshia: And we're gonna do a scenario of you want to ask for feedback from your manager.\\n\\niyeshia: and you previously had passed up for raise and want to learn more about how you can ensure success earning one in the next review cycle.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='157-161', metadata={'start': datetime.timedelta(seconds=1641, microseconds=970000), 'end': datetime.timedelta(seconds=1673, microseconds=539000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: and you previously had passed up for raise and want to learn more about how you can ensure success earning one in the next review cycle.\\n\\niyeshia: So this part is, how would you start that conversation in your weekly check in?\\n\\niyeshia: So since we're virtual, we're gonna have, I'm gonna give you about 30 seconds to come up with your own answer, and then type it in the chat.\\n\\niyeshia: So review the scenario now and then we'll start in 30 seconds.\\n\\niyeshia: So\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='161-165', metadata={'start': datetime.timedelta(seconds=1665, microseconds=550000), 'end': datetime.timedelta(seconds=1692, microseconds=620000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text='iyeshia: So\\n\\niyeshia: we set the timer for 30.\\n\\niyeshia: Okay?\\n\\niyeshia: Goes now\\n\\niyeshia: 10 seconds.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='165-169', metadata={'start': datetime.timedelta(seconds=1691, microseconds=890000), 'end': datetime.timedelta(seconds=1727, microseconds=70000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text='iyeshia: 10 seconds.\\n\\niyeshia: Okay, time is up.\\n\\niyeshia: Okay, nice.\\n\\niyeshia: And look for a raise on to guarantee a raise in this performance. Review. Awesome. Thank you. Ty\\n\\niyeshia: and Mckenzie. Thank you.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='169-173', metadata={'start': datetime.timedelta(seconds=1725, microseconds=970000), 'end': datetime.timedelta(seconds=1767, microseconds=160000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text='iyeshia: and Mckenzie. Thank you.\\n\\niyeshia: 13.\\n\\niyeshia: Some feedback to see what I can build. Awesome.\\n\\niyeshia: Hey, boys!\\n\\niyeshia: Oh, my God this time to reach out a bit. Okay, okay for me.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='173-177', metadata={'start': datetime.timedelta(seconds=1765, microseconds=20000), 'end': datetime.timedelta(seconds=1785, microseconds=509000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text='iyeshia: Oh, my God this time to reach out a bit. Okay, okay for me.\\n\\niyeshia: No.\\n\\niyeshia: Okay.\\n\\niyeshia: Any improvement that you see that I cannot. Okay, thank you.\\n\\niyeshia: Let me check in with you.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='177-181', metadata={'start': datetime.timedelta(seconds=1780, microseconds=400000), 'end': datetime.timedelta(seconds=1810, microseconds=859000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Let me check in with you.\\n\\niyeshia: There we go.\\n\\niyeshia: Okay, perfect.\\n\\niyeshia: So what I can make for the next recycle. Awesome. Thank you all for sharing so far, I'm gonna move on to the the next part. I think I kind of skipped\\n\\niyeshia: ahead.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='181-185', metadata={'start': datetime.timedelta(seconds=1807, microseconds=139000), 'end': datetime.timedelta(seconds=1830, microseconds=670000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: ahead.\\n\\niyeshia: Okay.\\n\\niyeshia: so right now, we have a role play example between a manager and you. Let's say you would.\\n\\niyeshia: it could be data science. Related. Right? So from here, I'm going to\\n\\niyeshia: probably volunteer, because I'm not sure if people will volunteer to be the manager and someone be you\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='185-189', metadata={'start': datetime.timedelta(seconds=1829, microseconds=480000), 'end': datetime.timedelta(seconds=1857, microseconds=657000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: probably volunteer, because I'm not sure if people will volunteer to be the manager and someone be you\\n\\niyeshia: So let me see who I can get.\\n\\niyeshia: Okay, I'll go with David for manager, and I'll go for\\n\\niyeshia: Let's try, Kevin for you.\\n\\niyeshia: If you have to read this role, play example.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='189-193', metadata={'start': datetime.timedelta(seconds=1850, microseconds=520000), 'end': datetime.timedelta(seconds=1877, microseconds=689000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text='iyeshia: If you have to read this role, play example.\\n\\nDavid Rodriguez: Should I start now?\\n\\nCUNY Tech Prep (CTP): Kevin, you there?\\n\\nCUNY Tech Prep (CTP): Kevin? Chen.\\n\\nKevin Zheng: Right, right.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='193-197', metadata={'start': datetime.timedelta(seconds=1874, microseconds=660000), 'end': datetime.timedelta(seconds=1892, microseconds=270000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'David Rodriguez', 'iyeshia', 'Kevin Zheng'})}),\n",
" Chunk(text=\"Kevin Zheng: Right, right.\\n\\nCUNY Tech Prep (CTP): Alright!\\n\\nDavid Rodriguez: Great I'll start.\\n\\nDavid Rodriguez: Is there anything else you'd like to talk about?\\n\\nKevin Zheng: Yes, as you know, I've been taking on additional responsibilities since we used the team, and I'd like to speak to you about my conversation package.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='197-201', metadata={'start': datetime.timedelta(seconds=1891, microseconds=450000), 'end': datetime.timedelta(seconds=1910, microseconds=499000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'David Rodriguez', 'Kevin Zheng'})}),\n",
" Chunk(text=\"Kevin Zheng: Yes, as you know, I've been taking on additional responsibilities since we used the team, and I'd like to speak to you about my conversation package.\\n\\nDavid Rodriguez: We really appreciate your hard work.\\n\\nDavid Rodriguez: but it's still a tough economy, and we're not really in a position to give you anything more than a 2% raise. We can talk about a raise at your next review in about 6 months.\\n\\nKevin Zheng: I do understand that the economy has made things difficult. Can we set a time to discuss my compensation again before my next schedule Review.\\n\\nKevin Zheng: I appreciate an opportunity to talk in more detail on the additional work I've taken on, and its impact.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='201-205', metadata={'start': datetime.timedelta(seconds=1901, microseconds=690000), 'end': datetime.timedelta(seconds=1938, microseconds=959000), 'speakers': frozenset({'David Rodriguez', 'Kevin Zheng'})}),\n",
" Chunk(text=\"Kevin Zheng: I appreciate an opportunity to talk in more detail on the additional work I've taken on, and its impact.\\n\\nDavid Rodriguez: Sure that makes sense.\\n\\nDavid Rodriguez: I want to make sure you heard how about a month.\\n\\nKevin Zheng: Great. Thank you. I'll find some time on your calendar for us to meet.\\n\\niyeshia: Thank you. So with that, said, I. Just want to open up the the floor. To everyone. What did you notice?\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='205-209', metadata={'start': datetime.timedelta(seconds=1933, microseconds=720000), 'end': datetime.timedelta(seconds=1967, microseconds=303000), 'speakers': frozenset({'David Rodriguez', 'iyeshia', 'Kevin Zheng'})}),\n",
" Chunk(text=\"iyeshia: Thank you. So with that, said, I. Just want to open up the the floor. To everyone. What did you notice?\\n\\niyeshia: that during the the role play. That the let's say the data scientists who was played by Kevin,\\n\\niyeshia: did as far as like, maybe something different from your responses that you put in the chat. Did y'all notice anything differently?\\n\\niyeshia: Hey, Devin?\\n\\nDevin Xie (no cam): I don't know if I'm correct. But I think\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='209-213', metadata={'start': datetime.timedelta(seconds=1957, microseconds=300000), 'end': datetime.timedelta(seconds=2005, microseconds=496000), 'speakers': frozenset({'iyeshia', 'Devin Xie (no cam)'})}),\n",
" Chunk(text=\"Devin Xie (no cam): I don't know if I'm correct. But I think\\n\\nDevin Xie (no cam): the data scientists or us in this situation, we try to like Scheduler, a review like\\n\\nDevin Xie (no cam): in a later time.\\n\\niyeshia: absolutely. Thank you. He took initiative and be like, you know, hey, let me, let me get on your calendar for next time, instead of just like waiting around, you know, people be like, Oh, I'll get back to you and things like that. He's like, no, we can. We can discuss later, like, what's your schedule like? So that\\n\\niyeshia: that forwardness of just, you know, following up and seeing it through is definitely helpful.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='213-217', metadata={'start': datetime.timedelta(seconds=2002, microseconds=950000), 'end': datetime.timedelta(seconds=2041, microseconds=590000), 'speakers': frozenset({'iyeshia', 'Devin Xie (no cam)'})}),\n",
" Chunk(text=\"iyeshia: that forwardness of just, you know, following up and seeing it through is definitely helpful.\\n\\niyeshia: So and so, for now I would say this would take about maybe\\n\\niyeshia: so final reflection. We could talk about this for like maybe 3\\xa0min, or anybody could just like popcorn it out unless I just call on them. But for today's learning from the workshop what are some things you can generally expect when you 1st join a company? What is a manager's role in your success? And how do you find out your measures of success? Does anyone want to\\n\\niyeshia: volunteer and answer any of the any of the 3 questions that are of their choice\\n\\niyeshia: before I call on someone.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='217-221', metadata={'start': datetime.timedelta(seconds=2035, microseconds=850000), 'end': datetime.timedelta(seconds=2087, microseconds=550000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: before I call on someone.\\n\\niyeshia: Okay, anybody but Devin.\\n\\niyeshia: See, I'm gonna go with anthony.\\n\\nAnthony Jerez: Yes, I'm here.\\n\\niyeshia: Which question would you like to answer? You had to reflect.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='221-225', metadata={'start': datetime.timedelta(seconds=2086, microseconds=20000), 'end': datetime.timedelta(seconds=2122, microseconds=210000), 'speakers': frozenset({'Anthony Jerez', 'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Which question would you like to answer? You had to reflect.\\n\\nAnthony Jerez: On, I would say the 1st one.\\n\\niyeshia: Okay, go for it.\\n\\nAnthony Jerez: So some major things that I would expect would be we're going through like sessions like orientation, and like onboarding\\n\\nAnthony Jerez: also knowledge about like some some resources resources that we would have access to at any point.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='225-229', metadata={'start': datetime.timedelta(seconds=2119, microseconds=390000), 'end': datetime.timedelta(seconds=2147, microseconds=390000), 'speakers': frozenset({'Anthony Jerez', 'iyeshia'})}),\n",
" Chunk(text=\"Anthony Jerez: also knowledge about like some some resources resources that we would have access to at any point.\\n\\nAnthony Jerez: And yeah, stuff like that. I would say.\\n\\niyeshia: Thank you, Anthony, for sharing.\\n\\niyeshia: and then let me see, trying to see who's not making eye contact. Oh, oh, not everybody looks okay. So let's go with\\n\\niyeshia: Ibrahim.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='229-233', metadata={'start': datetime.timedelta(seconds=2139, microseconds=43000), 'end': datetime.timedelta(seconds=2167, microseconds=810000), 'speakers': frozenset({'Anthony Jerez', 'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Ibrahim.\\n\\nIbrahim Faruquee: Yeah, I'll answer question, too.\\n\\nIbrahim Faruquee: So your manager's role is mainly like for the company to manage like people and make sure that the right persons for the right job, but they can be like a mentor figure for you. So like, if there can be like good mentors who like help you throughout the process and help you with a raise, or they could also like, be difficult and make that like harder for you. But they're kind of. It's not like there's nothing to be, I guess, expected from a manager. It's just like\\n\\nIbrahim Faruquee: what they like. What do you, I guess. What do you end up with.\\n\\nIbrahim Faruquee: or what do you make the most of.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='233-237', metadata={'start': datetime.timedelta(seconds=2166, microseconds=780000), 'end': datetime.timedelta(seconds=2208, microseconds=880000), 'speakers': frozenset({'iyeshia', 'Ibrahim Faruquee'})}),\n",
" Chunk(text=\"Ibrahim Faruquee: or what do you make the most of.\\n\\niyeshia: Awesome. Thank you.\\n\\niyeshia: And then for the 3rd question.\\n\\niyeshia: and we're gonna go for Isabel.\\n\\nIsabel Loçi: Hello!\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='237-241', metadata={'start': datetime.timedelta(seconds=2207, microseconds=390000), 'end': datetime.timedelta(seconds=2223, microseconds=750000), 'speakers': frozenset({'Isabel Loçi', 'iyeshia', 'Ibrahim Faruquee'})}),\n",
" Chunk(text=\"Isabel Loçi: Hello!\\n\\niyeshia: Hello!\\n\\nIsabel Loçi: Sorry. My Internet's horrible, and might I might disconnect?\\n\\nIsabel Loçi: I'll see if I can answer the 3rd one. How do you find your measures of success.\\n\\nIsabel Loçi: I would say, ask for feedback from other people elsewhere, from other colleagues, from your manager.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='241-245', metadata={'start': datetime.timedelta(seconds=2222, microseconds=900000), 'end': datetime.timedelta(seconds=2245, microseconds=189000), 'speakers': frozenset({'Isabel Loçi', 'iyeshia'})}),\n",
" Chunk(text=\"Isabel Loçi: I would say, ask for feedback from other people elsewhere, from other colleagues, from your manager.\\n\\nIsabel Loçi: That way you get a better understanding of where you are right now. And also I would say to also look back on the goals that you've set for yourself, and see if you've reached those goals as well, and that would be a good measure of success.\\n\\niyeshia: Okay, very good. All right.\\n\\niyeshia: So yeah, definitely helped make my life easier with this presentation. So thank you. I'm glad things are sticking and so with that said, We will go and launch Kahoo. But before I do that I definitely want to say just be mindful of these things.\\n\\niyeshia: When you are starting in your 1st year, in your career. As it was stated in one of the slides, you don't have to have it all figured out is the perfect time to ask questions. You're gonna make mistakes, or you're not. But if you do, it's okay. Because it's all gonna be a learning process. For your 1st year, and your managers expect that.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='245-249', metadata={'start': datetime.timedelta(seconds=2238, microseconds=660000), 'end': datetime.timedelta(seconds=2306, microseconds=319000), 'speakers': frozenset({'Isabel Loçi', 'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: When you are starting in your 1st year, in your career. As it was stated in one of the slides, you don't have to have it all figured out is the perfect time to ask questions. You're gonna make mistakes, or you're not. But if you do, it's okay. Because it's all gonna be a learning process. For your 1st year, and your managers expect that.\\n\\niyeshia: So just keep that in mind.\\n\\niyeshia: And then, if you are going to seek, you know, support, I think. It was great that it's a bell, stated asking for feedback from your manager, but you could also ask for feedback from your teammates, too. Cause they, if you work with them closely. If you have a team to see, like what your areas of strengths are your areas of growth.\\n\\niyeshia: and things that you're learning. That could be helpful. Towards that process if you're going up for a raise. But sometimes people could see our strengths stronger or clearer, or even faster than we can, and we don't even realize it.\\n\\niyeshia: And then even asking your mentors, too, as well, can be helpful. And then.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='249-253', metadata={'start': datetime.timedelta(seconds=2282, microseconds=771000), 'end': datetime.timedelta(seconds=2345, microseconds=799000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: And then even asking your mentors, too, as well, can be helpful. And then.\\n\\niyeshia: if you are going to negotiate, remember to keep for raise, to keep that documented focus on your skills. Make sure you do your research on the market and definitely, just try to figure out if you can negotiate other things.\\n\\niyeshia: And when it comes to relationships, at work, you wanna make sure to treat everybody equally so I hope that that helps. If you didn't get anything else. I hope that's what helps you with them\\n\\niyeshia: with your 1st year? As you enter into your careers. And so with that said, we'll go into Kahoot.\\n\\niyeshia: and so I'm going to launch it now.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='253-257', metadata={'start': datetime.timedelta(seconds=2341, microseconds=80000), 'end': datetime.timedelta(seconds=2390, microseconds=330000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: and so I'm going to launch it now.\\n\\niyeshia: Let's get it started.\\n\\niyeshia: I don't think my headphones died so\\n\\niyeshia: got 33 people on here, and only 16.\\n\\niyeshia: Okay.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='257-261', metadata={'start': datetime.timedelta(seconds=2387, microseconds=420000), 'end': datetime.timedelta(seconds=2445, microseconds=90000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Okay.\\n\\niyeshia: sound. Good.\\n\\niyeshia: 33.\\n\\niyeshia: Well, I didn't cut myself. That's Kevin. You're playing too.\\n\\niyeshia: Figure out how to be successful on my own.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='261-265', metadata={'start': datetime.timedelta(seconds=2444, microseconds=230000), 'end': datetime.timedelta(seconds=2550, microseconds=965000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Figure out how to be successful on my own.\\n\\niyeshia: Oh, you do not have to figure that out.\\n\\niyeshia: That's why we tell you, have mentors, extra peers and things of that nature.\\n\\niyeshia: Well, yeah, shout out to the 22. It's okay. One. I'll take the 22 others, you know. Wow!\\n\\niyeshia: Your boss. My goodness, okay, is in the lead.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='265-269', metadata={'start': datetime.timedelta(seconds=2547, microseconds=780000), 'end': datetime.timedelta(seconds=2583, microseconds=779000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Your boss. My goodness, okay, is in the lead.\\n\\niyeshia: So let's go ahead\\n\\niyeshia: who should not go to\\n\\niyeshia: thank you definitely. The worst thing you could do is talk to no one. If you need support with something.\\n\\niyeshia: So I hope.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='269-273', metadata={'start': datetime.timedelta(seconds=2578, microseconds=507000), 'end': datetime.timedelta(seconds=2624, microseconds=130000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: So I hope.\\n\\nCUNY Tech Prep (CTP): I am shocked.\\n\\niyeshia: That one should you not go to? So yeah.\\n\\niyeshia: let's see. Okay, Jamie is in the name.\\n\\niyeshia: Okay, let's go.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='273-277', metadata={'start': datetime.timedelta(seconds=2622, microseconds=675000), 'end': datetime.timedelta(seconds=2641, microseconds=959000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Okay, let's go.\\n\\niyeshia: 3rd question, what are not considerations to mention when providing reasons for a salary increase.\\n\\niyeshia: There aren't enough.\\n\\niyeshia: Okay? 18. Yes, the cost of living. That is correct. You should not consider that\\n\\niyeshia: They don't, they don't. They don't care so definitely the other ones. You could do that on your own when you're doing your negotiating your your budget. But don't come out and say, like, Hey, the cost of living in this city? They're like\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='277-281', metadata={'start': datetime.timedelta(seconds=2640, microseconds=140000), 'end': datetime.timedelta(seconds=2695, microseconds=309000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: They don't, they don't. They don't care so definitely the other ones. You could do that on your own when you're doing your negotiating your your budget. But don't come out and say, like, Hey, the cost of living in this city? They're like\\n\\niyeshia: or virtual.\\n\\niyeshia: our office in California, we have no idea. So yeah, just just keep that in mind. So good job to the the cost of living folks.\\n\\niyeshia: Okay, David Rv is in the lead.\\n\\niyeshia: Okay, let's go to the next question.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='281-285', metadata={'start': datetime.timedelta(seconds=2680, microseconds=250000), 'end': datetime.timedelta(seconds=2715, microseconds=419000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Okay, let's go to the next question.\\n\\niyeshia: what is a thoughtful way to actually negotiate?\\n\\niyeshia: So we can negotiate? Very good. It's a thoughtful way to act\\n\\niyeshia: and I think most of y'all got that in the chat. I saw some other answers. I'm gonna leave that questionable. But for the ones who did shout out to y'all.\\n\\niyeshia: So I think this is the last question.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='285-289', metadata={'start': datetime.timedelta(seconds=2712, microseconds=460000), 'end': datetime.timedelta(seconds=2758, microseconds=389000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: So I think this is the last question.\\n\\niyeshia: But Kyle is in the lead now, and so shouts to Kyle. So here goes the last question.\\n\\niyeshia: The most important relationship at work is with my manager.\\n\\niyeshia: Shout out to the people who said, False I said, it is important, but not the most important. Yeah, there's team this\\n\\niyeshia: Ceos, what about yourself? You know, things like that? So I just want to keep that in mind. So\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='289-293', metadata={'start': datetime.timedelta(seconds=2755, microseconds=680000), 'end': datetime.timedelta(seconds=2795, microseconds=579000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Ceos, what about yourself? You know, things like that? So I just want to keep that in mind. So\\n\\niyeshia: yeah, let's always about that. So let's go to the windows.\\n\\niyeshia: Okay, let's okay.\\n\\niyeshia: Number one.\\n\\niyeshia: Okay, at the bottom.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='293-297', metadata={'start': datetime.timedelta(seconds=2788, microseconds=670000), 'end': datetime.timedelta(seconds=2827, microseconds=966000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Okay, at the bottom.\\n\\niyeshia: Okay, with that, said\\n\\niyeshia: the last thing I will do. These are some follow up questions that you can ask your career coach. If I'm your career coach, you could definitely ask me that.\\n\\niyeshia: But how much of a raise. Can you ask for? When do you? Should you start a retirement fund? I would say, Asap, how long should you take to figure out if your company is a good fit, and how do you approach a conflict with a manager or coworker? So if you have any questions about those, please feel free to reach out to me or your career coach, if you would like to discuss further details, and I do want to be mindful of time.\\n\\niyeshia: And so I want to thank you for your time, and just want to let you know. This is the feedback form that really helps me with this presentation\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='297-301', metadata={'start': datetime.timedelta(seconds=2822, microseconds=600000), 'end': datetime.timedelta(seconds=2879, microseconds=310000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: And so I want to thank you for your time, and just want to let you know. This is the feedback form that really helps me with this presentation\\n\\niyeshia: and help me to deliver it better or worse. So if I did a good job, that's great. But I'm going to put this in the chat.\\n\\niyeshia: So you could fill that out now and then. Also want to invite you all to Rsvp. For Ctp's graduation.\\n\\niyeshia: So I would say, you can do that right now as well\\n\\niyeshia: and please register as a student. For those who can attend. You're more than welcome for the I believe the May 20th ones. If you cannot attend because you have a final, you have an internship. It is okay. There's no pressure. We're not going to be like, Hey, you can't you got to make it? No, we totally get it, I mean, we understand. So blessings on your finals\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='301-305', metadata={'start': datetime.timedelta(seconds=2870, microseconds=460000), 'end': datetime.timedelta(seconds=2919, microseconds=640000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: and please register as a student. For those who can attend. You're more than welcome for the I believe the May 20th ones. If you cannot attend because you have a final, you have an internship. It is okay. There's no pressure. We're not going to be like, Hey, you can't you got to make it? No, we totally get it, I mean, we understand. So blessings on your finals\\n\\niyeshia: and your projects. But for those who can't attend come through. It's going to be great to see your projects to see each other one last time, like Demo Night. And it's gonna be it's going to be a great time as we close out the the cohort in in May. So, and also to Devin's question, just one more time. We won't leave you hanging you will get an invite to be alumni\\n\\niyeshia: for Ctp, and that way you'll be with everybody who did the cohorts before your cohorts, one through 9 and so it'll be one through 10 now. And so that'll be like over a thousand people in that slack channel. So you can definitely network with your peers and the people who came before you. So yeah, just keep that in mind.\\n\\niyeshia: So thank you all. And I will stop sharing.\\n\\niyeshia: And yeah, please. Rsvp for the graduation. And please fill out that feedback form. It is greatly appreciative. I want to thank you for your time lessons on your projects. And yeah, if any of my fellows have any questions about the presentation, you can highlight me on slack. I am there to support you, and other than that. I want to thank you. And, Kevin, I think it's all yours now.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='305-309', metadata={'start': datetime.timedelta(seconds=2901, microseconds=130000), 'end': datetime.timedelta(seconds=2988, microseconds=469000), 'speakers': frozenset({'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: And yeah, please. Rsvp for the graduation. And please fill out that feedback form. It is greatly appreciative. I want to thank you for your time lessons on your projects. And yeah, if any of my fellows have any questions about the presentation, you can highlight me on slack. I am there to support you, and other than that. I want to thank you. And, Kevin, I think it's all yours now.\\n\\nCUNY Tech Prep (CTP): Definitely. Thank you, Aisha, for the valuable tips. I think. A lot of students, a lot of the students I've spoken to, at least are.\\n\\nCUNY Tech Prep (CTP): have got recently gotten jobs or are very close to getting them, and\\n\\nCUNY Tech Prep (CTP): they will find this material very useful. I'm actually kind of glad I remember to click record at the beginning, because some of them are like in traffic right now.\\n\\niyeshia: Got it. Okay.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='309-313', metadata={'start': datetime.timedelta(seconds=2964, microseconds=60000), 'end': datetime.timedelta(seconds=3011, microseconds=947000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
" Chunk(text=\"iyeshia: Got it. Okay.\\n\\niyeshia: I'm glad.\\n\\nCUNY Tech Prep (CTP): Okay, thank you. So I'm gonna give you all 10\\xa0min to fill this out. Since you got 2 things to fill out. One is the inviting yourself to the graduation, and then 2 is the survey.\\n\\nCUNY Tech Prep (CTP): Alright, so we will come back at 7, 35.\\n\\nCUNY Tech Prep (CTP): Oh, yes, there's good news for those of you who missed it.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='313-317', metadata={'start': datetime.timedelta(seconds=3010, microseconds=980000), 'end': datetime.timedelta(seconds=3063, microseconds=720000), 'speakers': frozenset({'CUNY Tech Prep (CTP)', 'iyeshia'})}),\n",
" Chunk(text=\"CUNY Tech Prep (CTP): Oh, yes, there's good news for those of you who missed it.\\n\\nCUNY Tech Prep (CTP): There's no homework for the next 2 weeks, and there's spring break. So which means.\\n\\nCUNY Tech Prep (CTP): after this class, I'll be seeing you the second Friday from now.\\n\\nCUNY Tech Prep (CTP): Not next Friday.\\n\\nCUNY Tech Prep (CTP): No, a break is not exactly a break, so you have projects.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='317-321', metadata={'start': datetime.timedelta(seconds=3060, microseconds=740000), 'end': datetime.timedelta(seconds=3115, microseconds=180000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
" Chunk(text='CUNY Tech Prep (CTP): No, a break is not exactly a break, so you have projects.\\n\\nCUNY Tech Prep (CTP): This is time to do your projects.\\n\\nCUNY Tech Prep (CTP): Alright, so just as a gift to all the people who are in class.\\n\\nCUNY Tech Prep (CTP): If you check the homework sheet.\\n\\nCUNY Tech Prep (CTP): there is actually a column where you can grade yourselves. You can give yourself any emoji you want.', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='321-325', metadata={'start': datetime.timedelta(seconds=3110, microseconds=350000), 'end': datetime.timedelta(seconds=3275, microseconds=10000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
" Chunk(text=\"CUNY Tech Prep (CTP): there is actually a column where you can grade yourselves. You can give yourself any emoji you want.\\n\\nCUNY Tech Prep (CTP): I'll let you figure out which one that is\\n\\nCUNY Tech Prep (CTP): alright. We're back.\\n\\nCUNY Tech Prep (CTP): So go for the rest of this day. So we're gonna I'm gonna put you in breakout rooms\\n\\nCUNY Tech Prep (CTP): for your projects.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='325-329', metadata={'start': datetime.timedelta(seconds=3269, microseconds=390000), 'end': datetime.timedelta(seconds=3591, microseconds=359000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
" Chunk(text='CUNY Tech Prep (CTP): for your projects.\\n\\nCUNY Tech Prep (CTP): And what I want you to do is I need to think about the state of the project. You, the the state the project is in.\\n\\nCUNY Tech Prep (CTP): I will be coming around to check in\\n\\nCUNY Tech Prep (CTP): because you have 2 weeks and no homework.\\n\\nCUNY Tech Prep (CTP): I want you to put your all into the project. So', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='329-333', metadata={'start': datetime.timedelta(seconds=3589, microseconds=600000), 'end': datetime.timedelta(seconds=3613, microseconds=269000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
" Chunk(text='CUNY Tech Prep (CTP): I want you to put your all into the project. So\\n\\nCUNY Tech Prep (CTP): let me make the breakout rooms first.st\\n\\nCUNY Tech Prep (CTP): Basically, what I want you to do is plan out the next 2 weeks. Okay, what do you want? What? What is missing from\\n\\nCUNY Tech Prep (CTP): your project that you need to complete it?\\n\\nCUNY Tech Prep (CTP): And how are you going to get there in the next 2 weeks?', parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='333-337', metadata={'start': datetime.timedelta(seconds=3609, microseconds=440000), 'end': datetime.timedelta(seconds=3646, microseconds=619000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
" Chunk(text=\"CUNY Tech Prep (CTP): And how are you going to get there in the next 2 weeks?\\n\\nCUNY Tech Prep (CTP): Because after the next 2 weeks you literally have only 2 weeks left.\\n\\nCUNY Tech Prep (CTP): There's class. There's week 11, and then there's week 12\\n\\nCUNY Tech Prep (CTP): week. 13 is like May May 10th or May 9, th\\n\\nCUNY Tech Prep (CTP): and then the week after that, I believe, is\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='337-341', metadata={'start': datetime.timedelta(seconds=3643, microseconds=720000), 'end': datetime.timedelta(seconds=3672, microseconds=696000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}),\n",
" Chunk(text=\"CUNY Tech Prep (CTP): and then the week after that, I believe, is\\n\\nCUNY Tech Prep (CTP): when you're going to do Demos.\\n\\nCUNY Tech Prep (CTP): I could be wrong.\\n\\nCUNY Tech Prep (CTP): Alright. You can pick the rooms. Now go into your rooms.\", parent_id='38b3a5ac7de4b38806edbcce9f913d8518ebd2976083d54bad21f6b15fce4313', chunk_id='341-344', metadata={'start': datetime.timedelta(seconds=3670, microseconds=320000), 'end': datetime.timedelta(seconds=3682, microseconds=370000), 'speakers': frozenset({'CUNY Tech Prep (CTP)'})}))"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "web_vtt_content.get_chunks()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
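Reading note on the output above: get_chunks() returns overlapping windows over the WebVTT captions. Each chunk_id is a caption index range ("117-121", "121-125", and so on) that re-opens with the closing caption of the previous chunk, and each metadata dict carries 'start'/'end' offsets as timedelta values plus a frozenset of speakers. A minimal sketch of that record shape, assuming a frozen dataclass (the bot's actual model class is not shown in this hunk and may differ):

from dataclasses import dataclass, field
from typing import Any

@dataclass(frozen=True)
class Chunk:
    # Shape inferred from the printed repr above; the real class may be a
    # Pydantic model with validation and extra fields (an assumption).
    text: str       # a few consecutive captions joined by blank lines
    parent_id: str  # hash identifying the source transcript
    chunk_id: str   # caption index range, e.g. "117-121"
    metadata: dict[str, Any] = field(default_factory=dict)  # observed keys: 'start'/'end' (timedelta), 'speakers' (frozenset)

The one-caption overlap between consecutive chunks is what keeps a sentence that straddles a chunk boundary retrievable from either side.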
pyproject.toml
CHANGED
@@ -7,7 +7,7 @@ name = "ctp-slack-bot"
 version = "0.1.0"
 description = "A Slack bot for processing and analyzing Zoom transcripts using AI"
 readme = "README.md"
-requires-python = ">=3.
+requires-python = ">=3.12"
 license = {text = "MIT"}
 authors = [
     {name = "Your Name", email = "[email protected]"}
@@ -19,26 +19,27 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
-    "dependency-injector>=4.46.0",
     "pydantic>=2.11.2",
     "pydantic-settings>=2.8.1",
-    "
-    "
-    "loguru>=0.7.3",
+    "cachetools>=5.5.2",
+    "more-itertools>=10.6.0",
     "python-dotenv>=1.1.0",
-    "
-    "
-    "pybreaker>=1.3.0",
+    "loguru>=0.7.3",
+    "dependency-injector>=4.46.0",
     "pytz>=2025.2",
     "apscheduler>=3.11.0",
+    # "tenacity>=9.1.2",
+    # "pybreaker>=1.3.0",
+    "aiohttp>=3.11.16",
+    "webvtt-py>=0.5.1",
     "slack-sdk>=3.35.0",
+    "slack_bolt>=1.23.0",
     "pymongo>=4.11.3 ",
-    "
-    "webvtt-py>=0.5.1",
+    "motor>=3.7.0",
     "openai>=1.70.0",
-
-
-
+    "google-api-python-client>=2.167.0",
+    "google-auth>=2.39.0",
+    "google-auth-oauthlib>=1.2.1"
 ]

 [project.optional-dependencies]
@@ -49,7 +50,7 @@ dev = [
     "types-pytz>=2025.2",
     "black>=25.1.0",
     "isort>=6.0.1",
-    "ruff>=0.11.4"
+    "ruff>=0.11.4"
 ]

 [project.urls]
scripts/run-dev.sh
CHANGED
@@ -2,4 +2,4 @@
 
 parent_path=$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd -P)
 
-python3 "${parent_path}/../src/ctp_slack_bot/
+LOG_LEVEL=DEBUG python3 "${parent_path}/../src/ctp_slack_bot/app.py"
src/ctp_slack_bot/__init__.py
CHANGED
@@ -1 +0,0 @@
-from ctp_slack_bot.containers import Container
src/ctp_slack_bot/api/__init__.py
DELETED
@@ -1 +0,0 @@
-from ctp_slack_bot.api.main import app, run
src/ctp_slack_bot/api/main.py
DELETED
@@ -1,70 +0,0 @@
-from contextlib import asynccontextmanager
-from fastapi import FastAPI, HTTPException, Depends
-from loguru import logger
-from typing import AsyncGenerator
-from dependency_injector.wiring import inject, Provide
-
-from ctp_slack_bot import Container
-from ctp_slack_bot.api.routes import router
-from ctp_slack_bot.core import Settings, setup_logging
-from ctp_slack_bot.core.response_rendering import PrettyJSONResponse
-from ctp_slack_bot.tasks import start_scheduler, stop_scheduler
-
-@asynccontextmanager
-async def lifespan(app: FastAPI) -> AsyncGenerator:
-    """
-    Lifespan context manager for FastAPI application.
-    Handles startup and shutdown events.
-    """
-    # Initialize container and wire the container to modules that need dependency injection.
-    container = Container()
-    container.wire(packages=['ctp_slack_bot'])
-    app.container = container
-
-    # Setup logging.
-    setup_logging(container)
-    logger.info("Starting application")
-
-    # Start the scheduler.
-    scheduler = start_scheduler(container)
-    logger.info("Started scheduler")
-
-    yield # control to FastAPI until shutdown.
-
-    # Shutdown.
-    logger.info("Shutting down application")
-    stop_scheduler(scheduler)
-    logger.info("Stopped scheduler")
-
-
-app = FastAPI(
-    title="CTP Slack Bot",
-    description="A Slack bot for processing and analyzing Zoom transcripts using AI",
-    version="0.1.0",
-    lifespan=lifespan,
-)
-
-# Include routers.
-app.include_router(router)
-
-# Provide a minimalist health check endpoint for clients to detect availability.
-@app.get("/health")
-async def get_health() -> dict[str, str]:
-    """Health check"""
-    return {
-        "status": "healthy"
-    }
-
-# Alternate starting path for development
-def run() -> None:
-    import uvicorn
-    settings = Settings() # type: ignore
-    uvicorn.run(
-        "main:app",
-        host=settings.API_HOST,
-        port=settings.API_PORT,
-        reload=settings.DEBUG
-    )
-
-if __name__ == "__main__":
-    run()
src/ctp_slack_bot/api/routes.py
DELETED
@@ -1,67 +0,0 @@
-from fastapi import APIRouter, Depends, HTTPException, status
-from dependency_injector.wiring import inject, Provide
-from loguru import logger
-
-from ctp_slack_bot import Container
-from ctp_slack_bot.core import Settings
-from ctp_slack_bot.services import SlackService
-
-router = APIRouter(prefix="/api/v1")
-
-@router.get("/env", response_model=Settings)
-@inject
-async def get_env(settings: Settings = Depends(Provide[Container.settings])) -> Settings:
-    if not settings.DEBUG:
-        raise HTTPException(status_code=404)
-    return settings
-
-# @router.post("/transcripts/analyze", response_model=TranscriptResponse)
-# async def analyze_transcript(
-#     request: TranscriptRequest,
-#     transcript_service: TranscriptService = Depends(get_transcript_service),
-# ):
-#     """
-#     Analyze a Zoom transcript and return insights.
-#     """
-#     logger.info(f"Analyzing transcript: {request.transcript_id}")
-#     try:
-#         result = await transcript_service.analyze_transcript(request)
-#         return result
-#     except Exception as e:
-#         logger.error(f"Error analyzing transcript: {e}")
-#         raise HTTPException(
-#             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-#             detail="Failed to analyze transcript",
-#         )
-
-
-# @router.post("/slack/message")
-# async def send_slack_message(
-#     channel: str,
-#     message: str,
-#     slack_service: SlackService = Depends(get_slack_service),
-# ):
-#     """
-#     Send a message to a Slack channel.
-#     """
-#     logger.info(f"Sending message to Slack channel: {channel}")
-#     try:
-#         result = await slack_service.send_message(channel, message)
-#         return {"status": "success", "message_ts": result.get("ts")}
-#     except Exception as e:
-#         logger.error(f"Error sending Slack message: {e}")
-#         raise HTTPException(
-#             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-#             detail="Failed to send Slack message",
-#         )
-
-
-# @router.post("/slack/webhook", include_in_schema=False)
-# async def slack_webhook(
-#     slack_service: SlackService = Depends(get_slack_service),
-# ):
-#     """
-#     Webhook endpoint for Slack events.
-#     """
-#     # This would typically handle Slack verification and event processing
-#     return {"challenge": "challenge_token"}
src/ctp_slack_bot/app.py
ADDED
@@ -0,0 +1,53 @@
+from asyncio import all_tasks, CancelledError, create_task, current_task, get_running_loop, run
+from loguru import logger
+from signal import SIGINT, SIGTERM
+from typing import Any, Callable
+
+from ctp_slack_bot.containers import Container
+from ctp_slack_bot.core.logging import setup_logging
+
+async def handle_shutdown_signal() -> None:
+    logger.info("Received shutdown signal.")
+    for task in all_tasks():
+        if task is not current_task() and not task.done():
+            task.cancel()
+            logger.trace("Cancelled task {}.", task.get_name())
+    logger.info("Cancelled all tasks.")
+
+def create_shutdown_signal_handler() -> Callable[[], None]:
+    def shutdown_signal_handler() -> None:
+        create_task(handle_shutdown_signal())
+    return shutdown_signal_handler
+
+async def main() -> None:
+    # Setup logging.
+    setup_logging()
+    logger.info("Starting application…")
+
+    # Set up dependency injection container.
+    container = Container()
+    container.wire(packages=['ctp_slack_bot'])
+
+    # Kick off services which should be active from the start.
+    container.content_ingestion_service()
+    container.question_dispatch_service()
+    container.schedule_service()
+
+    # Start the Slack socket mode handler in the background.
+    socket_mode_handler = container.socket_mode_handler()
+    slack_bolt_task = create_task(socket_mode_handler.start_async())
+    shutdown_signal_handler = create_shutdown_signal_handler()
+    loop = get_running_loop()
+    loop.add_signal_handler(SIGINT, shutdown_signal_handler)
+    loop.add_signal_handler(SIGTERM, shutdown_signal_handler)
+    try:
+        logger.info("Starting Slack Socket Mode handler…")
+        await slack_bolt_task
+    except CancelledError:
+        logger.info("Shutting down application…")
+    finally:
+        await socket_mode_handler.close_async()
+        await container.shutdown_resources()
+
+if __name__ == "__main__":
+    run(main())
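The lifecycle above is the standard asyncio shutdown pattern: register signal handlers on the running loop, cancel every outstanding task, and let the awaited task unwind through `CancelledError`. Below is a minimal, self-contained sketch of the same pattern, with a placeholder `work()` coroutine standing in for the Socket Mode handler; note that `add_signal_handler` is POSIX-only.

```python
from asyncio import CancelledError, all_tasks, create_task, current_task, get_running_loop, run, sleep
from signal import SIGINT, SIGTERM

async def work() -> None:
    # Placeholder for a long-running service such as the Slack Socket Mode handler.
    while True:
        await sleep(1)

def cancel_everything() -> None:
    # Runs inside the event loop when a signal arrives; cancel all pending tasks.
    for task in all_tasks():
        if task is not current_task() and not task.done():
            task.cancel()

async def main() -> None:
    loop = get_running_loop()
    for sig in (SIGINT, SIGTERM):
        loop.add_signal_handler(sig, cancel_everything)
    try:
        await create_task(work())
    except CancelledError:
        pass  # The normal shutdown path.

if __name__ == "__main__":
    run(main())
```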
src/ctp_slack_bot/containers.py
CHANGED
@@ -1,44 +1,40 @@
 from dependency_injector.containers import DeclarativeContainer
-from dependency_injector.providers import …
-from …
+from dependency_injector.providers import Resource, Singleton
+from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler
+from slack_bolt.async_app import AsyncApp
 
 from ctp_slack_bot.core.config import Settings
-from ctp_slack_bot.db.mongo_db import …
+from ctp_slack_bot.db.mongo_db import MongoDBResource
+from ctp_slack_bot.db.repositories import MongoVectorizedChunkRepository
 from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
 from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
 from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
+from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
 from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
+from ctp_slack_bot.services.google_drive_service import GoogleDriveService
+from ctp_slack_bot.services.language_model_service import LanguageModelService
 from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
-from ctp_slack_bot.services.…
+from ctp_slack_bot.services.schedule_service import ScheduleServiceResource
+from ctp_slack_bot.services.slack_service import SlackServiceResource
 from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
 from ctp_slack_bot.services.vectorization_service import VectorizationService
 
 
 class Container(DeclarativeContainer):
     settings = Singleton(Settings)
-
     event_brokerage_service = Singleton(EventBrokerageService)
-
-    mongo_db = …
-
-    # Repositories
-    # transcript_repository = Factory(
-    #     # Your transcript repository class
-    #     db=db
-    # )
-
-    open_ai_client = Factory(OpenAI, api_key=settings.provided.OPENAI_API_KEY) # TODO: poor practice to do it this way; create a LanguageModelService that creates an OpenAI client.
-
+    schedule_service = Resource(ScheduleServiceResource, settings=settings)
+    mongo_db = Resource(MongoDBResource, settings=settings) # TODO: generalize to any database.
+    vectorized_chunk_repository = Singleton(MongoVectorizedChunkRepository, mongo_db=mongo_db)
     vector_database_service = Singleton(VectorDatabaseService, settings=settings, mongo_db=mongo_db)
-
-    vectorization_service = Singleton(VectorizationService, settings=settings, …
-
+    embeddings_model_service = Singleton(EmbeddingsModelService, settings=settings)
+    vectorization_service = Singleton(VectorizationService, settings=settings, embeddings_model_service=embeddings_model_service)
     content_ingestion_service = Singleton(ContentIngestionService, settings=settings, event_brokerage_service=event_brokerage_service, vector_database_service=vector_database_service, vectorization_service=vectorization_service)
-
     context_retrieval_service = Singleton(ContextRetrievalService, settings=settings, vectorization_service=vectorization_service, vector_database_service=vector_database_service)
-
-    answer_retrieval_service = Singleton(AnswerRetrievalService, settings=settings, event_brokerage_service=event_brokerage_service, …
-
+    language_model_service = Singleton(LanguageModelService, settings=settings)
+    answer_retrieval_service = Singleton(AnswerRetrievalService, settings=settings, event_brokerage_service=event_brokerage_service, language_model_service=language_model_service)
     question_dispatch_service = Singleton(QuestionDispatchService, settings=settings, event_brokerage_service=event_brokerage_service, content_ingestion_service=content_ingestion_service, context_retrieval_service=context_retrieval_service, answer_retrieval_service=answer_retrieval_service)
-
-    slack_service = …
+    slack_bolt_app = Singleton(AsyncApp, token=settings.provided.SLACK_BOT_TOKEN().get_secret_value())
+    slack_service = Resource(SlackServiceResource, event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
+    socket_mode_handler = Singleton(lambda _, app, app_token: AsyncSocketModeHandler(app, app_token), slack_service, slack_bolt_app, settings.provided.SLACK_APP_TOKEN().get_secret_value())
+    google_drive_service = Singleton(GoogleDriveService, settings=settings)
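For readers unfamiliar with dependency-injector: providers are resolved by calling them as container attributes, `Singleton` memoizes its instance, and `Resource` ties setup and teardown to the container lifecycle. A toy sketch of the resolution behavior; the `Greeter` class is invented for illustration and is not part of this repository.

```python
from dependency_injector.containers import DeclarativeContainer
from dependency_injector.providers import Singleton

class Greeter:
    def __init__(self, name: str) -> None:
        self.name = name

class DemoContainer(DeclarativeContainer):
    # Registered as a Singleton provider, like the bot's services above.
    greeter = Singleton(Greeter, name="CTP")

container = DemoContainer()
assert container.greeter() is container.greeter()  # Same instance every call.
print(container.greeter().name)
```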
src/ctp_slack_bot/core/__init__.py
CHANGED
@@ -1,2 +1 @@
 from ctp_slack_bot.core.config import Settings
-from ctp_slack_bot.core.logging import logger, setup_logging
src/ctp_slack_bot/core/config.py
CHANGED
@@ -1,28 +1,29 @@
+from loguru import logger
 from pydantic import Field, MongoDsn, NonNegativeFloat, NonNegativeInt, PositiveInt, SecretStr
 from pydantic_settings import BaseSettings, SettingsConfigDict
-from …
+from types import MappingProxyType
+from typing import Literal, Mapping, Optional, Self
 
-class Settings(BaseSettings): # TODO: Strong guarantees of validity, because gar…
+class Settings(BaseSettings):
     """
     Application settings loaded from environment variables.
     """
-    # Application Configuration
-    DEBUG: bool = False
 
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
+        logger.debug("Created {}", self.__class__.__name__)
+        if self.__pydantic_extra__:
+            logger.warning("Extra unrecognized environment variables were provided: {}", ", ".join(self.__pydantic_extra__))
+
+    # Logging Configuration ― not actually used to configure Loguru, but defined to prevent warnings about “unknown” environment variables
     LOG_LEVEL: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = Field(default_factory=lambda data: "DEBUG" if data.get("DEBUG", False) else "INFO")
     LOG_FORMAT: Literal["text", "json"] = "json"
 
     # APScheduler Configuration
-    SCHEDULER_TIMEZONE: str = "UTC"
-
-    # API Configuration
-    API_HOST: str
-    API_PORT: PositiveInt
+    SCHEDULER_TIMEZONE: Optional[str] = "UTC"
 
     # Slack Configuration
     SLACK_BOT_TOKEN: SecretStr
-    SLACK_SIGNING_SECRET: SecretStr
     SLACK_APP_TOKEN: SecretStr
 
     # Vectorization Configuration
@@ -31,23 +32,45 @@ class Settings(BaseSettings): # TODO: Strong guarantees of validity, because gar
     CHUNK_SIZE: PositiveInt
     CHUNK_OVERLAP: NonNegativeInt
     TOP_K_MATCHES: PositiveInt
-
+
     # MongoDB Configuration
     MONGODB_URI: SecretStr # TODO: Contemplate switching to MongoDsn type for the main URL, and separate out the credentials to SecretStr variables.
     MONGODB_NAME: str
+    SCORE_THRESHOLD: NonNegativeFloat
 
     # Hugging Face Configuration
-    HF_API_TOKEN: Optional[SecretStr] = None
+    HF_API_TOKEN: Optional[SecretStr] = None # TODO: Currently, this is unused.
 
     # OpenAI Configuration
-    OPENAI_API_KEY: …
+    OPENAI_API_KEY: SecretStr
     CHAT_MODEL: str
     MAX_TOKENS: PositiveInt
     TEMPERATURE: NonNegativeFloat
     SYSTEM_PROMPT: str
 
+    # Google Drive Configuration
+    GOOGLE_DRIVE_ROOT_ID: str
+    GOOGLE_PROJECT_ID: str
+    GOOGLE_PRIVATE_KEY_ID: SecretStr
+    GOOGLE_PRIVATE_KEY: SecretStr
+    GOOGLE_CLIENT_ID: str
+    GOOGLE_CLIENT_EMAIL: str
+    GOOGLE_AUTH_URI: str = "https://accounts.google.com/o/oauth2/auth"
+    GOOGLE_TOKEN_URI: str = "https://oauth2.googleapis.com/token"
+    GOOGLE_AUTH_PROVIDER_CERT_URL: str = "https://www.googleapis.com/oauth2/v1/certs"
+    GOOGLE_CLIENT_CERT_URL: str = "https://www.googleapis.com/robot/v1/metadata/x509/ctp-slack-bot-714%40voltaic-reducer-294821.iam.gserviceaccount.com"
+    GOOGLE_UNIVERSE_DOMAIN: str = "googleapis.com"
+
+    # File Monitoring Configuration
+    FILE_MONITOR_ROOT_PATH: Optional[str] = None
+
     model_config = SettingsConfigDict(
         env_file=".env",
         env_file_encoding="utf-8",
         case_sensitive=True,
+        extra="allow",
+        frozen=True
     )
+
+    def get_extra_environment_variables(self: Self) -> Mapping[str, str]:
+        return MappingProxyType(self.__pydantic_extra__)
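The combination of `extra="allow"` and the `__init__` override is what makes the warning about unrecognized environment variables work: pydantic-settings collects undeclared variables into `__pydantic_extra__`. A cut-down sketch of that behavior, with the field set reduced to two and all values invented:

```python
from os import environ

from pydantic_settings import BaseSettings, SettingsConfigDict

class DemoSettings(BaseSettings):
    # A reduced stand-in for the Settings class above.
    MONGODB_NAME: str
    LOG_FORMAT: str = "json"

    model_config = SettingsConfigDict(case_sensitive=True, extra="allow", frozen=True)

environ["MONGODB_NAME"] = "ctp_slack_bot"
environ["SOMETHING_ELSE"] = "surprise"  # Undeclared; collected as an extra.

settings = DemoSettings()
print(settings.MONGODB_NAME)                                    # ctp_slack_bot
print("SOMETHING_ELSE" in (settings.__pydantic_extra__ or {}))  # True
```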
src/ctp_slack_bot/core/logging.py
CHANGED
@@ -1,7 +1,8 @@
-from logging import __file__ as logging_file, basicConfig, currentframe, getLogger, Handler, INFO, LogRecord
+from logging import __file__ as logging_file, basicConfig, currentframe, getLogger, Handler, INFO, LogRecord, WARNING
 from loguru import logger
+from os import getenv
 from sys import stderr
-from typing import …
+from typing import Self
 
 class InterceptHandler(Handler):
     """
@@ -11,7 +12,7 @@ class InterceptHandler(Handler):
     to Loguru, allowing unified logging across the application.
     """
 
-    def emit(self, record: LogRecord) -> None:
+    def emit(self: Self, record: LogRecord) -> None:
         # Get corresponding Loguru level if it exists
         try:
             level = logger.level(record.levelname).name
@@ -29,22 +30,23 @@ class InterceptHandler(Handler):
         )
 
 
-def setup_logging(…
+def setup_logging() -> None:
     """
     Configure logging with Loguru.
 
-    This function sets up Loguru as the main logging provider, …
-    standard logging messages.
+    This function sets up Loguru as the main logging provider, configures the log format based on environment variables,
+    and intercepts standard logging messages.
     """
-    from ctp_slack_bot import Container
-    settings = container.settings() if container else Provide[Container.settings]
 
-    # …
+    # Get logger configuration from environment variables.
+    log_level = getenv("LOG_LEVEL", "INFO")
+    log_format = getenv("LOG_FORMAT", "text")
+
+    # Remove default loguru handler.
     logger.remove()
 
-    # Determine log format
-    if …
+    # Determine log format.
+    if log_format == "json":
         log_format = {
             "time": "{time:YYYY-MM-DD HH:mm:ss.SSS}",
             "level": "{level}",
@@ -62,33 +64,35 @@ def setup_logging(container: "Container") -> None:
             "<level>{message}</level>"
         )
 
-    # Add console handler
+    # Add console handler.
     logger.add(
         stderr,
         format=format_string,
-        level=…
-        serialize=(…
+        level=log_level,
+        serialize=(log_format == "json"),
         backtrace=True,
         diagnose=True,
     )
 
-    # Add file handler for non-DEBUG environments
-    if …
+    # Add file handler for non-DEBUG environments.
+    # if log_level != "DEBUG":
+    #     logger.add(
+    #         "/data/app.log",
+    #         rotation="10 MB",
+    #         retention="1 week",
+    #         compression="zip",
+    #         format=format_string,
+    #         level=log_level,
+    #         serialize=(log_format == "json"),
+    #     )
 
-    # Intercept standard logging messages
+    # Intercept standard logging messages.
     basicConfig(handlers=[InterceptHandler()], level=0, force=True)
 
-    # Update logging levels for some noisy libraries
-    for logger_name in ("uvicorn", "uvicorn.error", "fastapi", "httpx", "…
+    # Update logging levels for some noisy libraries.
+    for logger_name in ("uvicorn", "uvicorn.error", "fastapi", "httpx", "pymongo"):
         getLogger(logger_name).setLevel(INFO)
+    for logger_name in ("apscheduler"):
+        getLogger(logger_name).setLevel(WARNING)
 
-    logger.info(f"Logging configured with level {…
+    logger.info(f"Logging configured with level {log_level}")
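Two details worth calling out here. First, the `InterceptHandler` trick routes every stdlib `logging` record through Loguru; a stripped-down sketch, without the frame-depth bookkeeping of the real handler, is below. Second, `for logger_name in ("apscheduler"):` in the new code iterates over the characters of a string, since a one-element tuple needs a trailing comma; the sketch shows the tuple form.

```python
import logging

from loguru import logger

class InterceptHandler(logging.Handler):
    # Forward stdlib records to Loguru, keeping the level name when Loguru knows it.
    def emit(self, record: logging.LogRecord) -> None:
        try:
            level = logger.level(record.levelname).name
        except ValueError:
            level = record.levelno
        logger.opt(exception=record.exc_info).log(level, record.getMessage())

logging.basicConfig(handlers=[InterceptHandler()], level=0, force=True)

# A one-element tuple needs the trailing comma; ("apscheduler") is just a string.
for name in ("apscheduler",):
    logging.getLogger(name).setLevel(logging.WARNING)

logging.getLogger("demo").warning("Routed through Loguru")
```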
src/ctp_slack_bot/core/response_rendering.py
DELETED
@@ -1,13 +0,0 @@
-from json import dumps
-from starlette.responses import JSONResponse
-from typing import Any, Self
-
-class PrettyJSONResponse(JSONResponse):
-    def render(self: Self, content: Any) -> bytes:
-        return dumps(
-            content,
-            ensure_ascii=False,
-            allow_nan=False,
-            indent=4,
-            separators=(", ", ": "),
-        ).encode("utf-8")
src/ctp_slack_bot/db/mongo_db.py
CHANGED
@@ -1,125 +1,198 @@
-from …
+from asyncio import create_task
+from dependency_injector.resources import AsyncResource
+from motor.motor_asyncio import AsyncIOMotorClient
+from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
+from pymongo.operations import SearchIndexModel
 from loguru import logger
-from pydantic import BaseModel, …
-from typing import Optional, Self
+from pydantic import BaseModel, PrivateAttr
+from typing import Any, Dict, Optional, Self
 
 from ctp_slack_bot.core.config import Settings
+from ctp_slack_bot.utils import sanitize_mongo_db_uri
 
 class MongoDB(BaseModel):
     """
-    MongoDB connection …
-    Handles connection to MongoDB, database selection, and index creation.
+    MongoDB connection manager using Motor for async operations.
     """
-
     settings: Settings
-    …
-    vector_collection: Optional[Any] = None
-    initialized: bool = False
-
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    @model_validator(mode='after')
-    def post_init(self: Self) -> Self:
-        logger.debug("Created {}", self.__class__.__name__)
-        return self
-
-    …
-        Connect to MongoDB using connection string from settings.
-        """
-        if self.client is not None:
-            return
-
-    …
+    _client: PrivateAttr = PrivateAttr()
+    _db: PrivateAttr = PrivateAttr()
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def __init__(self: Self, **data: Dict[str, Any]) -> None:
+        super().__init__(**data)
+        logger.debug("Created {}", self.__class__.__name__)
+
+    def connect(self: Self) -> None:
+        """Initialize MongoDB client with settings."""
         try:
-            …
+            connection_string = self.settings.MONGODB_URI.get_secret_value()
+            logger.debug("Connecting to MongoDB using URI: {}", sanitize_mongo_db_uri(connection_string))
+
+            # Create client with appropriate settings
+            self._client = AsyncIOMotorClient(
+                connection_string,
+                serverSelectionTimeoutMS=5000,
+                connectTimeoutMS=10000,
+                socketTimeoutMS=45000,
+                maxPoolSize=100,
+                retryWrites=True,
+                w="majority"
+            )
+
+            # Set database
+            db_name = self.settings.MONGODB_NAME
+
+            self._db = self._client[db_name]
+            logger.debug("MongoDB client initialized for database: {}", db_name)
+
         except Exception as e:
-            logger.error(…
+            logger.error("Failed to initialize MongoDB client: {}", e)
+            self._client = None
+            self._db = None
             raise
 
+    @property
+    def client(self: Self) -> AsyncIOMotorClient:
+        """Get the MongoDB client instance."""
+        if not hasattr(self, '_client') or self._client is None:
+            logger.warning("MongoDB client not initialized. Attempting to initialize…")
+            self.connect()
+            if not hasattr(self, '_client') or self._client is None:
+                raise ConnectionError("Failed to initialize MongoDB client.")
+        return self._client
+
+    @property
+    def db(self: Self) -> Any:
+        """Get the MongoDB database instance."""
+        if not hasattr(self, '_db') or self._db is None:
+            logger.warning("MongoDB database not initialized. Attempting to initialize client…")
+            self.connect()
+            if not hasattr(self, '_db') or self._db is None:
+                raise ConnectionError("Failed to initialize MongoDB database.")
+        return self._db
+
+    async def ping(self: Self) -> bool:
+        """Check if MongoDB connection is alive."""
+        try:
+            # Get client to ensure we're connected
+            client = self.client
+
+            # Try a simple ping command
+            await client.admin.command('ping')
+            logger.debug("MongoDB connection is active!")
+            return True
+        except (ConnectionFailure, ServerSelectionTimeoutError) as e:
+            logger.error("MongoDB connection failed: {}", e)
+            return False
+        except Exception as e:
+            logger.error("Unexpected error during MongoDB ping: {}", e)
+            return False
+
+    async def get_collection(self: Self, name: str) -> Any:
         """
-        …
+        Get a collection by name with validation.
+        Creates the collection if it doesn't exist.
         """
-        …
+        # First ensure we can connect at all
+        if not await self.ping():
+            logger.error("Cannot get collection '{}' because a MongoDB connection is not available.", name)
+            raise ConnectionError("MongoDB connection is not available.")
+
         try:
-            # …
+            # Get all collection names to check if this one exists
+            logger.debug("Checking if collection '{}' exists…", name)
+            collection_names = await self.db.list_collection_names()
+
+            if name not in collection_names:
+                logger.info("Collection '{}' does not exist. Creating it…", name)
+                # Create the collection
+                await self.db.create_collection(name)
+                logger.debug("Successfully created collection: {}", name)
+            else:
+                logger.debug("Collection '{}' already exists!", name)
+
+            # Get and return the collection
+            collection = self.db[name]
+            return collection
         except Exception as e:
-            logger.error(…
+            logger.error("Error accessing collection '{}': {}", name, e)
             raise
 
-    def …
+    async def create_indexes(self: Self, collection_name: str) -> None:
         """
-        Create vector …
+        Create a vector search index on a collection.
+
+        Args:
+            collection_name: Name of the collection
         """
+        collection = await self.get_collection(collection_name)
+
         try:
-            # …
-            if "vector_index" not in index_names:
-                # Create vector search index
-                index_definition = {
-                    "mappings": {
-                        "dynamic": True,
-                        "fields": {
-                            "embedding": {
-                                "dimensions": self.settings.VECTOR_DIMENSION,
-                                "similarity": "cosine",
-                                "type": "knnVector"
-                            }
-                        }
-                    }
-                }
-
-                # Create the index
-                self.db.command({
-                    "createIndexes": self.vector_collection.name,
-                    "indexes": [
-                        {
-                            …
-                        }
-                    ]
-                })
-            else:
-                logger.…
-        except Exception as e:
-            logger.error(…
-            raise
-
-    def …
-        """
-        …
-        """
-        self.client = None
-        self.db = None
-        self.vector_collection = None
-        self.initialized = False
-        logger.info("MongoDB connection closed")
+            # Create search index model using MongoDB's recommended approach
+            search_index_model = SearchIndexModel(
+                definition={
+                    "fields": [
+                        {
+                            "type": "vector",
+                            "path": "embedding",
+                            "numDimensions": self.settings.VECTOR_DIMENSION,
+                            "similarity": "cosine",
+                            "quantization": "scalar"
+                        }
+                    ]
+                },
+                name=f"{collection_name}_vector_index",
+                type="vectorSearch"
+            )
+
+            # Create the search index using the motor collection
+            result = await collection.create_search_index(search_index_model)
+            logger.info("Vector search index '{}' created for collection {}.", result, collection_name)
+
+        except Exception as e:
+            if "command not found" in str(e).lower():
+                logger.warning("Vector search not supported by this MongoDB instance. Some functionality may be limited.")
+                # Create a fallback standard index on embedding field
+                await collection.create_index("embedding")
+                logger.info("Created standard index on 'embedding' field as fallback.")
+            else:
+                logger.error("Failed to create vector index: {}", e)
+                raise
+
+    async def close(self: Self) -> None:
+        """Close MongoDB connection."""
+        if self._client:
+            self._client.close()
+            logger.info("Closed MongoDB connection.")
+        self._client = None
+        self._db = None
+
+class MongoDBResource(AsyncResource):
+    async def init(self: Self, settings: Settings) -> MongoDB:
+        logger.info("Initializing MongoDB connection for database: {}", settings.MONGODB_NAME)
+        mongo_db = MongoDB(settings=settings)
+        mongo_db.connect()
+        await self._test_connection(mongo_db)
+        return mongo_db
+
+    async def _test_connection(self: Self, mongo_db: MongoDB) -> None:
+        """Test MongoDB connection and log the result."""
+        try:
+            is_connected = await mongo_db.ping()
+            if is_connected:
+                logger.info("MongoDB connection test successful!")
+            else:
+                logger.error("MongoDB connection test failed!")
        except Exception as e:
+            logger.error("Error testing MongoDB connection: {}", e)
             raise
 
+    async def shutdown(self: Self, mongo_db: MongoDB) -> None:
+        """Close MongoDB connection on shutdown."""
+        try:
+            await mongo_db.close()
+        except Exception as e:
+            logger.error("Error closing MongoDB connection: {}", e)
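The liveness check used throughout `MongoDB` is the `ping` admin command over Motor. Here is that check in isolation as a minimal sketch; the localhost URI is a placeholder for illustration, since real deployments read `MONGODB_URI` from settings.

```python
from asyncio import run

from motor.motor_asyncio import AsyncIOMotorClient

async def main() -> None:
    # Placeholder URI; substitute your own connection string.
    client = AsyncIOMotorClient("mongodb://localhost:27017", serverSelectionTimeoutMS=5000)
    try:
        await client.admin.command("ping")  # Same liveness probe as MongoDB.ping().
        print("MongoDB connection is active")
    finally:
        client.close()

run(main())
```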
src/ctp_slack_bot/db/repositories/__init__.py
CHANGED
@@ -0,0 +1,2 @@
+from ctp_slack_bot.db.repositories.mongo_db_vectorized_chunk_repository import MongoVectorizedChunkRepository
+from ctp_slack_bot.db.repositories.vectorized_chunk_repository import VectorizedChunkRepository
src/ctp_slack_bot/db/repositories/mongo_db_vectorized_chunk_repository.py
ADDED
@@ -0,0 +1,65 @@
+from typing import List, Optional, Dict, Any
+import pymongo
+from bson import ObjectId
+
+from ctp_slack_bot.db import MongoDB
+from ctp_slack_bot.db.repositories.vectorized_chunk_repository import VectorizedChunkRepository
+from ctp_slack_bot.models.base import VectorizedChunk
+
+class MongoVectorizedChunkRepository(VectorizedChunkRepository):
+    """MongoDB implementation of VectorizedChunkRepository."""
+
+    def __init__(self, mongo_db: MongoDB):
+        self.mongo_db = mongo_db
+        self.collection = self.mongo_db.db.get_collection("vectorized_chunks")
+
+        # Create indexes for efficient queries
+        self.collection.create_index("chunk_id")
+        self.collection.create_index("parent_id")
+
+    async def find_by_id(self, id: str) -> Optional[VectorizedChunk]:
+        doc = await self.collection.find_one({"_id": ObjectId(id)})
+        return self._map_to_entity(doc) if doc else None
+
+    async def find_all(self) -> List[VectorizedChunk]:
+        cursor = self.collection.find({})
+        return [self._map_to_entity(doc) async for doc in cursor]
+
+    async def find_by_parent_id(self, parent_id: str) -> List[VectorizedChunk]:
+        cursor = self.collection.find({"parent_id": parent_id})
+        return [self._map_to_entity(doc) async for doc in cursor]
+
+    async def save(self, chunk: VectorizedChunk) -> VectorizedChunk:
+        doc = self._map_to_document(chunk)
+
+        if "_id" in doc and doc["_id"]:
+            # Update existing document
+            await self.collection.replace_one({"_id": doc["_id"]}, doc)
+        else:
+            # Insert new document
+            result = await self.collection.insert_one(doc)
+            doc["_id"] = result.inserted_id
+
+        return self._map_to_entity(doc)
+
+    async def delete(self, id: str) -> bool:
+        result = await self.collection.delete_one({"_id": ObjectId(id)})
+        return result.deleted_count > 0
+
+    async def find_by_metadata(self, metadata_query: Dict[str, Any]) -> List[VectorizedChunk]:
+        # Convert the metadata query to MongoDB query format
+        query = {f"metadata.{k}": v for k, v in metadata_query.items()}
+        cursor = self.collection.find(query)
+        return [self._map_to_entity(doc) async for doc in cursor]
+
+    def _map_to_document(self, chunk: VectorizedChunk) -> Dict[str, Any]:
+        """Convert a VectorizedChunk to a MongoDB document."""
+        doc = chunk.model_dump()
+        # Handle any special conversions needed
+        return doc
+
+    def _map_to_entity(self, doc: Dict[str, Any]) -> VectorizedChunk:
+        """Convert a MongoDB document to a VectorizedChunk."""
+        if "_id" in doc:
+            doc["id"] = str(doc.pop("_id"))
+        return VectorizedChunk(**doc)
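The `_map_to_document`/`_map_to_entity` pair exists to translate between BSON's `_id` (an `ObjectId`) and the string `id` that the rest of the code expects. A standalone sketch of that translation, reduced to plain dicts:

```python
from bson import ObjectId

def map_to_entity(doc: dict) -> dict:
    # Mirrors _map_to_entity above: the BSON _id becomes a plain string id.
    if "_id" in doc:
        doc["id"] = str(doc.pop("_id"))
    return doc

raw = {"_id": ObjectId(), "text": "hello", "parent_id": "doc:1", "chunk_id": "0"}
entity = map_to_entity(dict(raw))
print(entity["id"])          # Hex string rather than an ObjectId.
assert "_id" not in entity
```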
src/ctp_slack_bot/db/repositories/vectorized_chunk_repository.py
ADDED
@@ -0,0 +1,30 @@
+from typing import List, Optional, Dict, Any
+
+from ctp_slack_bot.models.base import VectorizedChunk
+
+class VectorizedChunkRepository:
+    """Repository interface for VectorizedChunk entities."""
+
+    async def find_by_id(self, id: str) -> Optional[VectorizedChunk]:
+        """Find a chunk by its ID."""
+        pass
+
+    async def find_all(self) -> List[VectorizedChunk]:
+        """Find all chunks."""
+        pass
+
+    async def find_by_parent_id(self, parent_id: str) -> List[VectorizedChunk]:
+        """Find chunks by parent document ID."""
+        pass
+
+    async def save(self, chunk: VectorizedChunk) -> VectorizedChunk:
+        """Save a chunk to the database."""
+        pass
+
+    async def delete(self, id: str) -> bool:
+        """Delete a chunk by its ID."""
+        pass
+
+    async def find_by_metadata(self, metadata_query: Dict[str, Any]) -> List[VectorizedChunk]:
+        """Find chunks by metadata criteria."""
+        pass
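Because the interface's methods have plain `pass` bodies rather than `@abstractmethod` markers, any object exposing matching async methods satisfies it. A toy in-memory substitute, useful in tests, sketched under that assumption with chunks reduced to dicts for brevity:

```python
from asyncio import run
from typing import Any, Dict, List, Optional

class InMemoryVectorizedChunkRepository:
    # A test double for VectorizedChunkRepository, keyed by chunk_id.
    def __init__(self) -> None:
        self._store: Dict[str, Dict[str, Any]] = {}

    async def save(self, chunk: Dict[str, Any]) -> Dict[str, Any]:
        self._store[chunk["chunk_id"]] = chunk
        return chunk

    async def find_by_parent_id(self, parent_id: str) -> List[Dict[str, Any]]:
        return [c for c in self._store.values() if c["parent_id"] == parent_id]

    async def find_by_id(self, id: str) -> Optional[Dict[str, Any]]:
        return self._store.get(id)

async def demo() -> None:
    repo = InMemoryVectorizedChunkRepository()
    await repo.save({"chunk_id": "0", "parent_id": "doc:1", "text": "hi"})
    print(await repo.find_by_parent_id("doc:1"))

run(demo())
```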
src/ctp_slack_bot/enums.py
ADDED
@@ -0,0 +1,6 @@
+from enum import auto, StrEnum
+
+class EventType(StrEnum):
+    INCOMING_CONTENT = auto()
+    INCOMING_SLACK_MESSAGE = auto()
+    OUTGOING_SLACK_RESPONSE = auto()
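`StrEnum` with `auto()` (available from Python 3.11) gives each member its lower-cased name as the value, so event types compare equal to plain strings; a quick sketch:

```python
from enum import StrEnum, auto

class EventType(StrEnum):
    INCOMING_CONTENT = auto()  # auto() yields the lower-cased member name.

print(EventType.INCOMING_CONTENT)                        # incoming_content
print(EventType.INCOMING_CONTENT == "incoming_content")  # True
```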
src/ctp_slack_bot/models/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from ctp_slack_bot.models.base import Content, …
-from ctp_slack_bot.models.…
-from ctp_slack_bot.models.slack import SlackMessage
-from ctp_slack_bot.models.…
+from ctp_slack_bot.models.base import Chunk, Content, VectorizedChunk, VectorQuery
+from ctp_slack_bot.models.google_drive import GoogleDriveMetadata
+from ctp_slack_bot.models.slack import SlackEventPayload, SlackMessage, SlackReaction, SlackResponse, SlackUserTimestampPair
+from ctp_slack_bot.models.webvtt import WebVTTContent, WebVTTFrame
src/ctp_slack_bot/models/base.py
CHANGED
@@ -1,61 +1,58 @@
 from abc import ABC, abstractmethod
-from …
-from …
-from typing import Dict, List, Optional, Union, Any, ClassVar
-import hashlib
-import json
+from pydantic import BaseModel, ConfigDict, Field
+from typing import Any, final, Mapping, Self, Sequence, Optional
 
 
-class …
-    """A class representing …
-
-…
+class Chunk(BaseModel):
+    """A class representing a chunk of content."""
 
-class Content(BaseModel):
-    """A class representing ingested content."""
+    text: str # The text representation
+    parent_id: str # The source content’s identity
+    chunk_id: str # This chunk’s identity—unique within the source content
+    metadata: Mapping[str, Any]
 
-…
+    model_config = ConfigDict(frozen=True)
 
-class Ingestible(ABC, BaseModel):
-    """An abstract base class for ingestible content."""
 
-…
+@final
+class VectorQuery(BaseModel):
+    """Model for vector database similarity search queries.
+
+    Attributes:
+        query_text: The text to be vectorized and used for similarity search
+        k: Number of similar documents to retrieve
+        score_threshold: Minimum similarity score threshold for inclusion in results
+        filter_metadata: Optional filters for metadata fields
+    """
+
+    query_embeddings: Sequence[float]
+    k: int
+    score_threshold: float = Field(default=0.7)
+    filter_metadata: Optional[Mapping[str, Any]] = None
+
+    model_config = ConfigDict(frozen=True)
+
+
+@final
+class VectorizedChunk(Chunk):
+    """A class representing a vectorized chunk of content."""
+
+    embedding: Sequence[float] # The vector representation
 
-    @property
+
+class Content(ABC, BaseModel):
+    """An abstract base class for all types of content."""
+
+    model_config = ConfigDict(frozen=True)
+
     @abstractmethod
-    def …
+    def get_id(self: Self) -> str:
         pass
 
-    def get_chunks(self) -> List[str]:
-        """
-        Split content into chunks suitable for vectorization.
-        Override this in subclasses for specialized chunking logic.
-        """
-        content = self.content
-        if isinstance(content, str):
-            # Simple chunking by character count
-            return [content[i:i+self.chunk_size]
-                    for i in range(0, len(content), self.chunk_size)]
-        elif isinstance(content, list):
-            # Content is already chunked
-            return content
-        else:
-            raise ValueError(f"Unsupported content type: {type(content)}")
-
-    @property
-    def key(self) -> str:
-        """Convenience accessor for the metadata key."""
-        return self.metadata.key
+    @abstractmethod
+    def get_chunks(self: Self) -> Sequence[Chunk]:
+        pass
+
+    @abstractmethod
+    def get_metadata(self: Self) -> Mapping[str, Any]:
+        pass
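`ConfigDict(frozen=True)` makes every chunk immutable after construction, which keeps chunks safe to share across services. A cut-down sketch of the two models above, showing the rejection of mutation:

```python
from typing import Any, Mapping, Sequence

from pydantic import BaseModel, ConfigDict

class Chunk(BaseModel):
    # Reduced copy of the model above, to demonstrate frozen behavior.
    text: str
    parent_id: str
    chunk_id: str
    metadata: Mapping[str, Any]

    model_config = ConfigDict(frozen=True)

class VectorizedChunk(Chunk):
    embedding: Sequence[float]

chunk = VectorizedChunk(text="hi", parent_id="doc:1", chunk_id="0",
                        metadata={}, embedding=[0.1, 0.2])
try:
    chunk.text = "rewritten"  # Frozen models refuse attribute assignment.
except Exception as e:
    print(type(e).__name__)   # ValidationError
```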
src/ctp_slack_bot/models/content.py
DELETED
@@ -1,19 +0,0 @@
-from pydantic import BaseModel, Field
-from typing import Optional, List, Dict, Any
-from ctp_slack_bot.models.slack import SlackMessage
-
-class RetreivedContext(BaseModel):
-    """Represents a the context of a question from Slack returned from the Vector Store Database.
-
-    contextual_text: The text that is relevant to the question.
-    metadata_source: The source of the contextual text.
-    similarity_score: The similarity score of the contextual text to the question.
-
-    in_reation_to_question: OPTINAL: The question that the contextual text is related to.
-    """
-    contextual_text: str
-    metadata_source: str
-    similarity_score: float
-
-    said_by: str = Optional[None]
-    in_reation_to_question: str = Optional[None]
src/ctp_slack_bot/models/google_drive.py
ADDED
@@ -0,0 +1,25 @@
+from datetime import datetime
+from pydantic import BaseModel, ConfigDict
+from typing import Self
+
+from ctp_slack_bot.models import FileContent
+
+
+class GoogleDriveMetadata(BaseModel):
+    """Represents Google Drive file or folder metadata."""
+
+    id: str
+    name: str
+    modified_time: datetime
+    mime_type: str
+    folder_path: str
+
+    model_config = ConfigDict(frozen=True)
+
+    @classmethod
+    def from_folder_path_and_dict(cls: type["GoogleDriveMetadata"], folder_path: str, dict: dict) -> Self:
+        id = dict["id"]
+        name = dict["name"]
+        modified_time = datetime.fromisoformat(dict["modifiedTime"])
+        mime_type = dict["mimeType"]
+        return GoogleDriveMetadata(id=id, name=name, modified_time=modified_time, mime_type=mime_type, folder_path=folder_path)
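`from_folder_path_and_dict` just remaps the keys of a Drive v3 file resource onto the model's fields. A self-contained sketch with the model re-declared locally and an invented response payload:

```python
from datetime import datetime

from pydantic import BaseModel, ConfigDict

class GoogleDriveMetadata(BaseModel):
    # Reduced copy of the model above.
    id: str
    name: str
    modified_time: datetime
    mime_type: str
    folder_path: str

    model_config = ConfigDict(frozen=True)

raw = {  # Shaped like a Drive v3 files.list item; all values invented.
    "id": "1AbC", "name": "lecture.vtt",
    "modifiedTime": "2025-03-01T12:34:56+00:00", "mimeType": "text/vtt",
}
meta = GoogleDriveMetadata(
    id=raw["id"], name=raw["name"],
    modified_time=datetime.fromisoformat(raw["modifiedTime"]),
    mime_type=raw["mimeType"], folder_path="Spring-2025-BAI/transcripts",
)
print(meta.name, meta.modified_time.isoformat())
```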
src/ctp_slack_bot/models/slack.py
CHANGED
@@ -1,16 +1,84 @@
-from …
-from …
+from datetime import datetime
+from json import dumps
+from pydantic import BaseModel, ConfigDict, PositiveInt, PrivateAttr
+from types import MappingProxyType
+from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
 
-…
+from ctp_slack_bot.models.base import Chunk, Content
+
+class SlackEventPayload(BaseModel):
+    """Represents a general event payload from Slack."""
+    type: str
+    event_ts: str
+
+    model_config = ConfigDict(extra='allow', frozen=True)
+
+class SlackEvent(BaseModel):
+    """Represents a general event from Slack."""
+
+    token: str
+    team_id: str
+    api_app_id: str
+    event: SlackEventPayload
+    type: str
+    event_id: str
+    event_time: int
+    authed_users: Sequence[str]
+
+    model_config = ConfigDict(frozen=True)
+
+class SlackUserTimestampPair(BaseModel):
+    """Represents a Slack user-timestamp pair."""
+
+    user: str
+    ts: str
+
+    model_config = ConfigDict(frozen=True)
+
+class SlackReaction(BaseModel):
+    """Represents a Slack reaction information."""
+
+    name: str
+    count: PositiveInt
+    users: Sequence[str]
+
+    model_config = ConfigDict(frozen=True)
+
+class SlackMessage(Content):
     """Represents a message from Slack after adaptation."""
-…
+
+    type: Literal["app_mention", "message"]
+    subtype: Optional[str] = None
+    channel: str
+    channel_type: Optional[str] = None
+    user: Optional[str] = None
+    bot_id: Optional[str] = None
     thread_ts: Optional[str] = None
-…
+    text: str
+    ts: str
+    edited: Optional[SlackUserTimestampPair] = None
+    event_ts: str
+    deleted_ts: Optional[str] = None
+    hidden: bool = False
+    is_starred: Optional[bool] = None
+    pinned_to: Optional[Sequence[str]] = None
+    reactions: Optional[Sequence[SlackReaction]] = None
+
+    def get_id(self: Self) -> str:
         """Unique identifier for this message."""
-        return f"slack:{self.…
+        return f"slack-message:{self.channel}:{self.ts}"
+
+    def get_chunks(self: Self) -> Sequence[Chunk]:
+        return (Chunk(text=self.text, parent_id=self.get_id(), chunk_id="", metadata=self.get_metadata()), )
+
+    def get_metadata(self: Self) -> Mapping[str, Any]:
+        return MappingProxyType({
+            "modificationTime": datetime.fromtimestamp(float(self.ts))
+        })
+
+class SlackResponse(BaseModel): # TODO: This should also be based on Content as it is a SlackMessage―just not one for which we know the identity yet.
+    """Represents a response message to be sent to Slack."""
+
+    text: str
+    channel: Optional[str]
+    thread_ts: Optional[str] = None
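A message's identity is derived from its channel and timestamp, and its metadata timestamp comes from parsing Slack's `ts` string, which encodes epoch seconds. The relevant arithmetic in isolation, with invented identifiers:

```python
from datetime import datetime
from types import MappingProxyType

channel, ts = "C0123456789", "1711234567.000200"  # Invented Slack identifiers.

message_id = f"slack-message:{channel}:{ts}"  # Mirrors SlackMessage.get_id().
metadata = MappingProxyType({"modificationTime": datetime.fromtimestamp(float(ts))})

print(message_id)
print(metadata["modificationTime"].isoformat())
```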
src/ctp_slack_bot/models/vector_query.py
DELETED
@@ -1,16 +0,0 @@
-from pydantic import BaseModel, Field, validator
-from typing import Optional, List, Dict, Any
-
-class VectorQuery(BaseModel):
-    """Model for vector database similarity search queries.
-
-    Attributes:
-        query_text: The text to be vectorized and used for similarity search
-        k: Number of similar documents to retrieve
-        score_threshold: Minimum similarity score threshold for inclusion in results
-        filter_metadata: Optional filters for metadata fields
-    """
-    query_text: str
-    k: int
-    score_threshold: float = Field(default=0.7)
-    filter_metadata: Optional[Dict[str, Any]] = None
src/ctp_slack_bot/models/webvtt.py
ADDED
@@ -0,0 +1,73 @@
+from datetime import datetime, timedelta
+from io import BytesIO
+from itertools import starmap
+from json import dumps
+from more_itertools import windowed
+from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
+from types import MappingProxyType
+from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
+from webvtt import Caption, WebVTT
+
+from ctp_slack_bot.models.base import Chunk, Content
+
+CHUNK_FRAMES_OVERLAP = 1
+CHUNK_FRAMES_WINDOW = 5
+SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
+
+class WebVTTFrame(BaseModel):
+    """Represents a WebVTT frame"""
+
+    identifier: str
+    start: timedelta
+    end: timedelta
+    speaker: Optional[str] = None
+    speech: str
+
+    model_config = ConfigDict(frozen=True)
+
+    @classmethod
+    def from_webvtt_caption(cls: type["WebVTTFrame"], index: int, caption: Caption) -> Self:
+        identifier = caption.identifier if caption.identifier else str(index)
+        start = timedelta(**caption.start_time.__dict__)
+        end = timedelta(**caption.end_time.__dict__)
+        match caption.text.split(SPEAKER_SPEECH_TEXT_SEPARATOR, 1):
+            case [speaker, speech]:
+                return cls(identifier=identifier, start=start, end=end, speaker=speaker, speech=speech)
+            case [speech]:
+                return cls(identifier=identifier, start=start, end=end, speech=speech)
+
+
+class WebVTTContent(Content):
+    """Represents parsed WebVTT content."""
+
+    id: str
+    metadata: Mapping[str, Any] = Field(default_factory=dict)
+    frames: Sequence[WebVTTFrame]
+
+    def get_id(self: Self) -> str:
+        return self.id
+
+    def get_chunks(self: Self) -> Sequence[Chunk]:
+        windows = (tuple(filter(None, window))
+                   for window
+                   in windowed(self.frames, CHUNK_FRAMES_WINDOW, step=CHUNK_FRAMES_WINDOW-CHUNK_FRAMES_OVERLAP))
+        return tuple(Chunk(text="\n\n".join(": ".join(filter(None, (frame.speaker, frame.speech)))
+                                            for frame
+                                            in frames),
+                           parent_id=self.get_id(),
+                           chunk_id=f"{frames[0].identifier}-{frames[-1].identifier}",
+                           metadata={
+                               "start": str(frames[0].start), # TODO: This is a harder problem: to get the offsets to become real datetimes so that they can be queryable using MongoDB.
+                               "end": str(frames[-1].end),
+                               "speakers": [frame.speaker for frame in frames if frame.speaker]
+                           })
+                     for frames
+                     in windows)
+
+    def get_metadata(self: Self) -> Mapping[str, Any]:
+        return MappingProxyType(self.metadata)
+
+    @classmethod
+    def from_bytes(cls: type["WebVTTContent"], id: str, metadata: Mapping[str, Any], buffer: bytes) -> Self:
+        frames = tuple(starmap(WebVTTFrame.from_webvtt_caption, enumerate(WebVTT.from_buffer(BytesIO(buffer)).captions, 1)))
+        return WebVTTContent(id=id, metadata=MappingProxyType(metadata), frames=frames)
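The chunker slides a five-frame window that overlaps its neighbor by one frame: `windowed(frames, 5, step=4)` pads the final window with `None`, which the `filter(None, …)` strips. The windowing in isolation, with stand-in caption labels:

```python
from more_itertools import windowed

CHUNK_FRAMES_WINDOW = 5
CHUNK_FRAMES_OVERLAP = 1

frames = [f"frame-{i}" for i in range(1, 12)]  # Eleven stand-in captions.
for window in windowed(frames, CHUNK_FRAMES_WINDOW,
                       step=CHUNK_FRAMES_WINDOW - CHUNK_FRAMES_OVERLAP):
    print(tuple(filter(None, window)))  # The last window drops its None padding.
```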
src/ctp_slack_bot/services/GOOGLE_DRIVE_README.md
DELETED
@@ -1,228 +0,0 @@
-# Google Drive Access Module
-
-This Python module provides a simplified way to interact with Google Drive, focusing on easy access to files in nested folders using path-like syntax. It handles various Google file formats and provides comprehensive metadata for files and folders.
-
-## Features
-
-- **Path-based folder access**: Access files using simple paths like `folder1/folder2/folder3`
-- **Efficient caching**: Folder IDs are cached to improve performance
-- **Comprehensive metadata**: Get detailed information about files and folders
-- **Read various file types**:
-  - Text files
-  - Google Docs
-  - VTT files
-- **Robust folder finding**: Works with exact and partial name matching
-- **Simple API**: Designed for ease of use with minimal code
-
-## Setup Instructions
-
-### 1. Create a Google Cloud Project
-
-1. Go to the [Google Cloud Console](https://console.cloud.google.com/)
-2. Click on the project dropdown at the top of the page and select "New Project"
-3. Enter a project name and click "Create"
-4. Once created, make sure your new project is selected in the dropdown
-
-### 2. Enable the Google Drive API
-
-1. In the Google Cloud Console, navigate to "APIs & Services" > "Library" in the left sidebar
-2. Search for "Google Drive API" in the search bar
-3. Click on "Google Drive API" in the results
-4. Click the "Enable" button
-
-### 3. Create OAuth Credentials
-
-1. In the Google Cloud Console, go to "APIs & Services" > "Credentials" in the left sidebar
-2. Click "Create Credentials" at the top and select "OAuth client ID"
-3. If prompted to configure the OAuth consent screen:
-   - Choose "External" user type (or "Internal" if you're in a Google Workspace organization)
-   - Fill in the required information (App name, User support email, Developer contact email)
-   - Click "Save and Continue"
-   - Add the following scopes:
-     - `.../auth/drive` (Full access to Google Drive)
-   - Click "Save and Continue" and complete the registration
-4. Return to the "Create OAuth client ID" screen
-5. Select "Desktop application" as the Application type
-6. Enter a name for your OAuth client (e.g., "Google Drive Access Desktop")
-7. Click "Create"
-8. Download the JSON file (this is your `client_secret.json`)
-
-### 4. Project Setup
-
-1. Setup a virtual environment and install dependencies:
-   ```bash
-   python -m venv venv
-   source venv/bin/activate  # On Windows: venv\Scripts\activate
-   pip install -r requirements.txt
-   ```
-
-2. Place your credentials:
-   - Create a `credentials` directory in your project root
-   - Move the downloaded OAuth client JSON file to the `credentials` directory
-   - Rename it to `client_secret.json`
-
-### 5. Authentication Process
-
-When you run the application for the first time:
-1. A browser window will open automatically
-2. You'll be asked to sign in to your Google account
-3. You'll see a consent screen asking for permission to access your Google Drive
-4. After granting permission, the browser will display a success message
-5. The application will save a token file (`token.pickle`) in the credentials directory for future use
-
-## Usage Guide
-
-The `EasyGoogleDrive` class provides several methods to interact with Google Drive. Here's how to use the core functionality:
-
-### Basic Usage
-
-```python
-from google_drive_access import EasyGoogleDrive
-
-# Initialize the Google Drive client
-drive = EasyGoogleDrive()
-
-# Example folder path - replace with your actual folder path
-folder_path = "Spring-2025-BAI"
-subfolder_path = "Spring-2025-BAI/transcripts"
-```
-
-### Listing Folders
-
-```python
-# List folders in a directory
-folders = drive.get_folders_in_folder(folder_path)
-
-# Access folder properties
-for folder in folders:
-    print(f"Folder: {folder['name']}")
-    print(f"  Created: {folder.get('createdTimeFormatted', 'Unknown')}")
-    print(f"  Modified: {folder.get('modifiedTimeFormatted', 'Unknown')}")
-```
-
-### Listing Files
-
-```python
-# List files in a directory
-files = drive.get_files_in_folder(subfolder_path)
-
-# Access file properties
-for file in files:
-    print(f"File: {file['name']}")
-    print(f"  Type: {file.get('fileType', 'Unknown')}")
-    print(f"  Created: {file.get('createdTimeFormatted', 'Unknown')}")
-    print(f"  Modified: {file.get('modifiedTimeFormatted', 'Unknown')}")
-    print(f"  Size: {file.get('sizeFormatted', 'Unknown')}")
-```
-
-### Getting a Specific File
-
-```python
-# Get a specific file with metadata
-file = drive.get_file("example.txt", subfolder_path, include_metadata=True)
-
-if file:
-    print(f"File: {file['name']}")
-    print(f"  Type: {file.get('fileType', 'Unknown')}")
-    print(f"  Created: {file.get('createdTimeFormatted', 'Unknown')}")
-    print(f"  Modified: {file.get('modifiedTimeFormatted', 'Unknown')}")
-    print(f"  Size: {file.get('sizeFormatted', 'Unknown')}")
-```
-
-### Getting All Items in a Folder
-
-```python
-# Get all items (files and folders) in a folder
-all_items = drive.get_all_files_in_folder(folder_path)
-
-# Access item properties
-for item in all_items:
-    item_type = "Folder" if item.get('mimeType') == drive.MIME_TYPES['folder'] else item.get('fileType', 'Unknown')
-    print(f"Item: {item['name']} ({item_type})")
-```
-
-### Checking if a File Exists
-
-```python
-# Check if a file exists
-exists = drive.file_exists("example.txt", subfolder_path)
-print(f"File exists: {exists}")
-```
-
-### Getting File Modified Time
-
-```python
-# Get file modified time
-modified_time = drive.get_file_modified_time("example.txt", subfolder_path)
-if modified_time:
-    print(f"Last modified: {modified_time}")
-```
-
-### Reading File Content
-
-```python
-# Get file with content
-file_with_content = drive.get_file("example.txt", subfolder_path, include_content=True)
-
-if file_with_content and 'file_content' in file_with_content:
-    content = file_with_content['file_content']
-    if content:
-        print(f"Content: {content[:100]}...")  # Print first 100 characters
-```
-
-## Complete Example
-
-For a complete example of how to use the `EasyGoogleDrive` class, see the `basic_usage.py` file included in this package. This file demonstrates all the core functionality with practical examples.
-
-## Key Concepts
-
-### Path-based Folder Access
-
-The module uses a simple path-like syntax to access folders:
-
-```python
-# Access a deeply nested folder
-folder_path = "folder1/folder2/folder3"
-files = drive.get_files_in_folder(folder_path)
-```
-
-This makes it much easier to work with nested folder structures compared to using folder IDs.
-
-### Metadata Fields
-
-The module provides comprehensive metadata for files and folders, including:
-
-- **Creation and modification dates**: Both as datetime objects and formatted strings
-- **File size**: Both in bytes and human-readable format (KB, MB, GB)
-- **File type**: Simplified type based on MIME type
-- **Owner information**: Names and email addresses of file owners
-- **Sharing status**: Whether the file is shared
-- **Web links**: Direct links to view the file in a browser
-
-## Error Handling
-
-The module includes comprehensive error handling:
-
-- **Authentication errors**: Clear messages when credentials are missing or invalid
-- **Folder not found**: Helpful messages when a folder in the path cannot be found
-- **File not found**: Attempts partial name matching before giving up
-- **Decoding errors**: Handles issues with file content encoding
-
-## Dependencies
-
-- **Required**:
-  - google-auth-oauthlib
-  - google-auth-httplib2
-  - google-api-python-client
-  - python-dateutil
-
-## Security Notes
-
-- Never commit your `client_secret.json` or token files to version control
-- Add `credentials/` to your `.gitignore` file
-- Keep your credentials secure and don't share them
-- For production applications, consider using service accounts with the minimum required permissions
-
-## Contributing
-
-Feel free to contribute to this project by submitting issues or pull requests.
src/ctp_slack_bot/services/__init__.py
CHANGED
@@ -1,7 +1,10 @@
 from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
 from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
 from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
+from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
 from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
+from ctp_slack_bot.services.google_drive_service import GoogleDriveService
+from ctp_slack_bot.services.language_model_service import LanguageModelService
 from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
 from ctp_slack_bot.services.slack_service import SlackService
 from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
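With these re-exports, the three added services resolve from the package root alongside the existing ones; hypothetical usage:

```python
# The new services are importable from the package namespace like the old ones.
from ctp_slack_bot.services import (
    EmbeddingsModelService,
    GoogleDriveService,
    LanguageModelService,
)
```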
src/ctp_slack_bot/services/answer_retrieval_service.py
CHANGED
@@ -1,65 +1,34 @@
-# from asyncio import create_task
 from loguru import logger
-from
-from
-from typing import List, Optional, Self, Tuple
+from pydantic import BaseModel
+from typing import Collection, Self
 
 from ctp_slack_bot.core import Settings
-from ctp_slack_bot.
+from ctp_slack_bot.enums import EventType
+from ctp_slack_bot.models import Chunk, SlackMessage, SlackResponse
 from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
+from ctp_slack_bot.services.language_model_service import LanguageModelService
 
 
 class AnswerRetrievalService(BaseModel):
     """
-    Service for language model
+    Service for context-based answer retrieval from a language model.
     """
 
     settings: Settings
     event_brokerage_service: EventBrokerageService
+    language_model_service: LanguageModelService
 
     class Config:
+        frozen=True
 
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
         logger.debug("Created {}", self.__class__.__name__)
-        return self
 
-    def
-        Returns:
-            str: Generated answer
-        """
-        # Prepare context string from retrieved chunks
-        context_str = ""
-        for c in context:
-            context_str += f"{c.contextual_text}\n"
-
-        # logger.info(f"Generating response for question: {question}")
-        # logger.info(f"Using {len(context)} context chunks")
-
-        # Create messages for the chat completion
-        messages = [
-            {"role": "system", "content": settings.SYSTEM_PROMPT},
-            {"role": "user", "content":
-                f"""Student Question: {question.text}
-                Context from class materials and transcripts: {context_str}
-                Please answer the Student Question based on the Context from class materials and transcripts. If the context doesn't contain relevant information, acknowledge that and suggest asking the professor."""}
-        ]
-
-        # Generate response
-        response = self.client.chat.completions.create(
-            model=settings.CHAT_MODEL,
-            messages=messages,
-            max_tokens=settings.MAX_TOKENS,
-            temperature=settings.TEMPERATURE
-        )
-
-        return response.choices[0].message.content
+    async def push(self: Self, question: SlackMessage, context: Collection[Chunk]) -> None:
+        channel_to_respond_to = question.channel
+        thread_to_respond_to = question.thread_ts if question.thread_ts else question.ts
+        answer = self.language_model_service.answer_question(question.text, context)
+        logger.debug("Pushing response to channel {} and thread {}: {}", channel_to_respond_to, thread_to_respond_to, answer)
+        slack_response = SlackResponse(text=answer, channel=channel_to_respond_to, thread_ts=thread_to_respond_to)
+        await self.event_brokerage_service.publish(EventType.OUTGOING_SLACK_RESPONSE, slack_response)
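To make the new control flow concrete, a minimal sketch of driving `push` by hand; the `SlackMessage` field values and the assumption that the services arrive pre-wired from the dependency-injection container are illustrative, not part of this diff:

```python
# Sketch only: values are made up; in the running app these objects come from
# the container and the Slack event stream.
from ctp_slack_bot.models import SlackMessage

async def demo(answer_retrieval_service, context_chunks) -> None:
    question = SlackMessage(text="When is the midterm?", channel="C123", ts="1700000000.000100")
    # push() asks the language model for an answer grounded in context_chunks,
    # then publishes an OUTGOING_SLACK_RESPONSE event aimed at the question's thread.
    await answer_retrieval_service.push(question, context_chunks)
```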
src/ctp_slack_bot/services/application_database_service.py
ADDED
@@ -0,0 +1,29 @@
+from datetime import datetime
+from loguru import logger
+from pydantic import BaseModel, PrivateAttr
+from typing import Iterable, Mapping, Self
+
+from ctp_slack_bot.core import Settings
+from ctp_slack_bot.db import MongoDB
+
+
+class ApplicationDatabaseService(BaseModel):
+    """Service for retrieving and persisting application state."""
+
+    settings: Settings
+    mongo_db: MongoDB  # TODO: This should be replaced following the repository pattern―one repository class per collection.
+
+    class Config:
+        frozen=True
+
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
+        logger.debug("Created {}", self.__class__.__name__)
+
+    async def get_last_modification_times_by_file_paths(self: Self, file_paths: Iterable[str]) -> Mapping[str, datetime]:
+        """Retrieve the last modification time for each file path."""
+        raise NotImplementedError()  # TODO
+
+    async def set_last_modification_time_by_file_path(self: Self, file_path: str, modification_time: datetime) -> None:
+        """Set the last modification time for a file path."""
+        raise NotImplementedError()  # TODO
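Both methods are stubs for now. As a rough sketch of where they seem headed, a Mongo-backed version might look like the following; the collection name, the document shape, and a `get_collection` helper on `MongoDB` are assumptions, not anything this PR defines:

```python
# Hypothetical sketch -- collection name, document shape, and the
# get_collection() helper are assumed, not taken from this PR.
async def get_last_modification_times_by_file_paths(self, file_paths):
    collection = self.mongo_db.get_collection("file_modification_times")  # assumed helper
    cursor = collection.find({"file_path": {"$in": list(file_paths)}})
    return {doc["file_path"]: doc["modification_time"] async for doc in cursor}

async def set_last_modification_time_by_file_path(self, file_path, modification_time):
    collection = self.mongo_db.get_collection("file_modification_times")  # assumed helper
    await collection.update_one(
        {"file_path": file_path},
        {"$set": {"modification_time": modification_time}},
        upsert=True,  # create the tracking document on first sight of a file
    )
```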
src/ctp_slack_bot/services/content_ingestion_service.py
CHANGED
@@ -1,8 +1,11 @@
 from loguru import logger
-from pydantic import BaseModel
-from typing import Self
+from pydantic import BaseModel
+from typing import Self, Sequence
 
 from ctp_slack_bot.core import Settings
+from ctp_slack_bot.enums import EventType
+from ctp_slack_bot.models import Chunk, Content, SlackMessage
+from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
 from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
 from ctp_slack_bot.services.vectorization_service import VectorizationService
 
@@ -12,10 +15,35 @@ class ContentIngestionService(BaseModel):
     """
 
     settings: Settings
+    event_brokerage_service: EventBrokerageService
     vector_database_service: VectorDatabaseService
     vectorization_service: VectorizationService
 
+    class Config:
+        frozen=True
+
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
+        self.event_brokerage_service.subscribe(EventType.INCOMING_CONTENT, self.process_incoming_content)
+        self.event_brokerage_service.subscribe(EventType.INCOMING_SLACK_MESSAGE, self.process_incoming_slack_message)
         logger.debug("Created {}", self.__class__.__name__)
+
+    async def process_incoming_content(self: Self, content: Content) -> None:
+        logger.debug("Content ingestion service received content with metadata: {}", content.get_metadata())
+        # if self.vector_database_service.has_content(content.get_id())  # TODO
+        #     logger.debug("Ignored content with ID {} because it already exists in the database.", content.get_id())
+        #     return
+        chunks = content.get_chunks()
+        await self.__vectorize_and_store_chunks_in_database(chunks)
+        logger.debug("Stored {} vectorized chunk(s) in the database.", len(chunks))
+
+    async def process_incoming_slack_message(self: Self, slack_message: SlackMessage) -> None:
+        logger.debug("Content ingestion service received a Slack message: {}", slack_message.text)
+        chunks = slack_message.get_chunks()
+        await self.__vectorize_and_store_chunks_in_database(chunks)
+        logger.debug("Stored {} vectorized chunk(s) in the database.", len(chunks))
+
+    async def __vectorize_and_store_chunks_in_database(self: Self, chunks: Sequence[Chunk]) -> None:
+        vectorized_chunks = self.vectorization_service.vectorize(chunks)  # TODO
+        await self.vector_database_service.store(vectorized_chunks)  # TODO
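Because the constructor subscribes itself to INCOMING_CONTENT and INCOMING_SLACK_MESSAGE, producers never call the ingestion service directly; they only publish. A sketch of the producer side (the content object stands in for any `Content` implementation, such as the WebVTT model added elsewhere in this PR):

```python
# Sketch: after this change, ingestion is driven purely by broker events.
from ctp_slack_bot.enums import EventType

async def ingest(event_brokerage_service, transcript_content) -> None:
    # The broker fans this out to ContentIngestionService.process_incoming_content,
    # which chunks, vectorizes, and stores the content.
    await event_brokerage_service.publish(EventType.INCOMING_CONTENT, transcript_content)
```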
src/ctp_slack_bot/services/context_retrieval_service.py
CHANGED
@@ -1,9 +1,9 @@
 from loguru import logger
-from pydantic import BaseModel
-from typing import Self,
+from pydantic import BaseModel
+from typing import Self, Sequence
 
 from ctp_slack_bot.core.config import Settings
-from ctp_slack_bot.models import
+from ctp_slack_bot.models import Chunk, SlackMessage, VectorQuery, VectorizedChunk
 from ctp_slack_bot.services.vector_database_service import VectorDatabaseService
 from ctp_slack_bot.services.vectorization_service import VectorizationService
 
@@ -16,57 +16,51 @@ class ContextRetrievalService(BaseModel):
     vectorization_service: VectorizationService
     vector_database_service: VectorDatabaseService
 
+    class Config:
+        frozen=True
+
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
         logger.debug("Created {}", self.__class__.__name__)
 
-    def get_context(self, message: SlackMessage) -> List[RetreivedContext]:
+    async def get_context(self: Self, message: SlackMessage) -> Sequence[Chunk]:
         """
-        Retrieve relevant context for a given
-
-        This function:
-        1. Extracts the question text from the message
-        2. Vectorizes the question using VectorizationService
-        3. Queries VectorDatabaseService for similar context
-        4. Returns the relevant context as a list of RetreivedContext objects
+        Retrieve relevant context for a given SlackMessage by vectorizing the message and
+        querying the vector store.
 
         Args:
             message: The SlackMessage containing the user's question
 
         Returns:
+            Sequence[Chunk]: List of retrieved context items with similarity scores
         """
+        # Extract chunks from the message
+        message_chunks = message.get_chunks()
+
+        # Vectorize the chunks
+        vectorized_chunks = self.vectorization_service.vectorize(message_chunks)
+
+        # Create vector query using the first chunk's embedding (typically there's only one chunk for a message)
+        if not vectorized_chunks:
+            logger.warning("No vectorized chunks were created for message")
             return []
 
+        query = VectorQuery(
+            query_embeddings=vectorized_chunks[0].embedding,
+            k=self.settings.TOP_K_MATCHES,
+            score_threshold=self.settings.SCORE_THRESHOLD,
+            filter_metadata=None  # Can be expanded to include filters based on message metadata
+        )
 
+        # Perform similarity search
         try:
-            logger.error(f"Failed to generate embedding for message: {message.key}")
-            return []
-
-            query_embedding = embeddings[0].tolist()
-
-            # Create vector query
-            vector_query = VectorQuery(
-                query_text=message.text,
-                k=self.settings.TOP_K_MATCHES,
-                score_threshold=0.7  # Minimum similarity threshold
-            )
-
-            # Search for similar content in vector database
-            context_results = self.vector_database_service.search_by_similarity(
-                query=vector_query,
-                query_embedding=query_embedding
-            )
-
-            logger.info(f"Retrieved {len(context_results)} context items for message: {message.key}")
-            return context_results
+            results = await self.vector_database_service.search_by_similarity(query)
+            # logger.info(f"Retrieved {len(results)} context chunks for query")
+            return results
         except Exception as e:
-            logger.error(f"Error retrieving context
+            logger.error(f"Error retrieving context: {str(e)}")
             return []
+
+        # test return statement
+        # return (VectorizedChunk(text="Mock context chunk", parent_id="lol", chunk_id="no", metadata={}, embedding=tuple()),
+        #         VectorizedChunk(text="Moar mock context chunk", parent_id="lol", chunk_id="wut", metadata={}, embedding=tuple()))
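End to end, the retrieval path for a question now reads as two awaits; a sketch, with the service instances assumed to come from the container:

```python
# Sketch: how a dispatcher would combine the two services.
async def answer_flow(context_retrieval_service, answer_retrieval_service, message) -> None:
    chunks = await context_retrieval_service.get_context(message)  # vectorize + similarity search
    await answer_retrieval_service.push(message, chunks)           # LLM answer -> Slack response event
```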
src/ctp_slack_bot/services/embeddings_model_service.py
ADDED
@@ -0,0 +1,47 @@
+from loguru import logger
+from openai import OpenAI
+from pydantic import BaseModel, PrivateAttr
+from typing import Any, Dict, Sequence, Self
+
+from ctp_slack_bot.core import Settings
+
+class EmbeddingsModelService(BaseModel):
+    """
+    Service for embeddings model operations.
+    """
+
+    settings: Settings
+    _open_ai_client: PrivateAttr = PrivateAttr()
+
+    class Config:
+        frozen=True
+
+    def __init__(self: Self, **data: Dict[str, Any]) -> None:
+        super().__init__(**data)
+        self._open_ai_client = OpenAI(api_key=self.settings.OPENAI_API_KEY.get_secret_value())
+        logger.debug("Created {}", self.__class__.__name__)
+
+    def get_embeddings(self: Self, texts: Sequence[str]) -> Sequence[Sequence[float]]:
+        """Get embeddings for a sequence of texts using OpenAI’s API.
+
+        Args:
+            texts (Sequence[str]): Sequence of text chunks to embed
+
+        Returns:
+            Sequence[Sequence[float]]: One embedding of length VECTOR_DIMENSION per input text
+
+        Raises:
+            ValueError: If the embedding dimensions don't match the expected size
+        """
+        logger.debug("Creating embeddings for {} text string(s)…", len(texts))
+        response = self._open_ai_client.embeddings.create(
+            model=self.settings.EMBEDDING_MODEL,
+            input=texts,
+            encoding_format="float"  # Ensure we get raw float values.
+        )
+        embeddings = tuple(tuple(data.embedding) for data in response.data)
+        match embeddings:
+            case (first, *_) if len(first) != self.settings.VECTOR_DIMENSION:  # Match any non-empty batch, not just pairs.
+                logger.error("Embedding dimension mismatch and/or misconfiguration: expected configured dimension {}, but got {}.", self.settings.VECTOR_DIMENSION, len(first))
+                raise ValueError()  # TODO: raise a more specific type.
+        return embeddings
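A quick sketch of calling the service; the input strings are illustrative, and the model and expected dimension come from `EMBEDDING_MODEL` and `VECTOR_DIMENSION` in settings:

```python
# Sketch: embedding two chunks in one call. Inputs are illustrative.
vectors = embeddings_model_service.get_embeddings((
    "WEBVTT cue text from a lecture transcript",
    "What did the professor say about gradient descent?",
))
assert len(vectors) == 2     # one embedding per input text
dimension = len(vectors[0])  # should equal settings.VECTOR_DIMENSION
```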
src/ctp_slack_bot/services/event_brokerage_service.py
CHANGED
@@ -1,38 +1,47 @@
+from asyncio import create_task, iscoroutinefunction, to_thread
+from collections import defaultdict
 from loguru import logger
-from
-from pydantic import BaseModel, model_validator
+from pydantic import BaseModel, PrivateAttr
 from typing import Any, Callable, Dict, List, Self
 
-from ctp_slack_bot.
-from ctp_slack_bot.models import RetreivedContext, SlackMessage
-from ctp_slack_bot.services.content_ingestion_service import ContentIngestionService
-from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
+from ctp_slack_bot.enums import EventType
 
 
 class EventBrokerageService(BaseModel):
     """
     Service for brokering events between services.
     """
 
+    _subscribers: PrivateAttr = PrivateAttr(default_factory=lambda: defaultdict(list))
 
     class Config:
+        frozen=True
 
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
         logger.debug("Created {}", self.__class__.__name__)
-        return self
 
-    def subscribe(self: Self,
+    def subscribe(self: Self, type: EventType, callback: Callable) -> None:
         """Subscribe to an event type with a callback function."""
+        logger.debug("1 new subscriber is listening for {} events.", type)
+        subscribers = self._subscribers[type]
+        subscribers.append(callback)
+        logger.debug("Event type {} has {} subscriber(s).", type, len(subscribers))
+
+    async def publish(self: Self, type: EventType, data: Any = None) -> None:
         """Publish an event with optional data to all subscribers."""
+        subscribers = self._subscribers[type]
+        if not subscribers:
+            logger.debug("No subscribers handle event {}: {}", type, data)
+            return
+        logger.debug("Broadcasting event {} to {} subscriber(s): {}", type, len(subscribers), data)
+        for callback in subscribers:
+            if iscoroutinefunction(callback):
+                task = create_task(callback(data))
+                task.add_done_callback(lambda done_task: logger.error("Error in asynchronous event callback handling event {}: {}", type, done_task.exception())
+                                       if done_task.exception()
+                                       else None)
+            else:
+                try:
+                    create_task(to_thread(callback, data))
+                except Exception as e:
+                    logger.error("Error scheduling synchronous callback to handle event {}: {}", type, e)
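The broker is deliberately small: a `defaultdict` of `EventType` to callbacks, with coroutine callbacks scheduled as tasks and plain functions pushed off the event loop via `to_thread`. A self-contained sketch of the contract (the handlers and payload are made up):

```python
# Sketch: async and sync subscribers both receive the published payload.
import asyncio
from ctp_slack_bot.enums import EventType
from ctp_slack_bot.services import EventBrokerageService

async def main() -> None:
    broker = EventBrokerageService()

    async def on_message(data) -> None:  # will be scheduled with create_task()
        print("async subscriber got", data)

    def log_message(data) -> None:       # will run via to_thread(), off the event loop
        print("sync subscriber got", data)

    broker.subscribe(EventType.INCOMING_SLACK_MESSAGE, on_message)
    broker.subscribe(EventType.INCOMING_SLACK_MESSAGE, log_message)
    await broker.publish(EventType.INCOMING_SLACK_MESSAGE, "hello")
    await asyncio.sleep(0.1)  # let the scheduled tasks run before exiting

asyncio.run(main())
```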
src/ctp_slack_bot/services/google_drive_access.py
DELETED
@@ -1,623 +0,0 @@
-"""
-Easy Google Drive Access
-
-A simplified module for accessing Google Drive files in nested folders.
-Designed to make it as easy as possible to access files using path-like syntax.
-"""
-
-import os
-import pickle
-import io
-import datetime
-from typing import List, Dict, Optional, Any, Union
-
-from google.oauth2.credentials import Credentials
-from google_auth_oauthlib.flow import InstalledAppFlow
-from google.auth.transport.requests import Request
-from googleapiclient.discovery import build
-from googleapiclient.http import MediaIoBaseDownload
-from googleapiclient.errors import HttpError
-
-
-class EasyGoogleDrive:
-    """
-    Simplified Google Drive access focused on accessing files in nested folders.
-    """
-
-    # Define the scopes needed for the application
-    SCOPES = ['https://www.googleapis.com/auth/drive']
-
-    # Define common MIME types
-    MIME_TYPES = {
-        'folder': 'application/vnd.google-apps.folder',
-        'document': 'application/vnd.google-apps.document',
-        'spreadsheet': 'application/vnd.google-apps.spreadsheet',
-        'text': 'text/plain',
-        'pdf': 'application/pdf',
-        'image': 'image/jpeg',
-        'video': 'video/mp4',
-        'audio': 'audio/mpeg',
-    }
-
-    # Define metadata fields to retrieve
-    FILE_FIELDS = 'id, name, mimeType, createdTime, modifiedTime, size, description, webViewLink, thumbnailLink, owners, shared, sharingUser, lastModifyingUser, capabilities, permissions'
-    FOLDER_FIELDS = 'id, name, createdTime, modifiedTime, description, webViewLink, owners, shared, sharingUser, lastModifyingUser, capabilities, permissions'
-
-    def __init__(self, credentials_dir: str = 'credentials'):
-        """Initialize the Google Drive access."""
-        self.credentials_dir = credentials_dir
-        self.credentials_path = os.path.join(credentials_dir, 'client_secret.json')
-        self.token_path = os.path.join(credentials_dir, 'token.pickle')
-
-        # Ensure credentials directory exists
-        os.makedirs(credentials_dir, exist_ok=True)
-
-        # Initialize the Drive API service
-        self.service = build('drive', 'v3', credentials=self._get_credentials())
-
-        # Cache for folder IDs to avoid repeated lookups
-        self.folder_id_cache = {}
-
-    def _get_credentials(self) -> Credentials:
-        """Get and refresh Google Drive API credentials."""
-        creds = None
-
-        # Load existing token if it exists
-        if os.path.exists(self.token_path):
-            with open(self.token_path, 'rb') as token:
-                creds = pickle.load(token)
-
-        # If credentials need refresh or don't exist
-        if not creds or not creds.valid:
-            if creds and creds.expired and creds.refresh_token:
-                creds.refresh(Request())
-            else:
-                if not os.path.exists(self.credentials_path):
-                    raise FileNotFoundError(
-                        f"Client secrets file not found at {self.credentials_path}. "
-                        "Please follow the setup instructions in the README."
-                    )
-
-                flow = InstalledAppFlow.from_client_secrets_file(
-                    self.credentials_path, self.SCOPES)
-                creds = flow.run_local_server(port=0)
-
-            # Save the credentials for future use
-            with open(self.token_path, 'wb') as token:
-                pickle.dump(creds, token)
-
-        return creds
-
-    def _format_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Format metadata for easier reading and usage.
-
-        Args:
-            metadata: Raw metadata from Google Drive API
-
-        Returns:
-            Formatted metadata dictionary
-        """
-        formatted = metadata.copy()
-
-        # Format dates
-        for date_field in ['createdTime', 'modifiedTime']:
-            if date_field in formatted:
-                try:
-                    # Convert ISO 8601 string to datetime object
-                    dt = datetime.datetime.fromisoformat(formatted[date_field].replace('Z', '+00:00'))
-                    formatted[date_field] = dt
-                    # Add a formatted date string for easier reading
-                    formatted[f"{date_field}Formatted"] = dt.strftime('%Y-%m-%d %H:%M:%S')
-                except (ValueError, AttributeError):
-                    pass
-
-        # Format size
-        if 'size' in formatted and formatted['size']:
-            try:
-                size_bytes = int(formatted['size'])
-                # Add human-readable size
-                if size_bytes < 1024:
-                    formatted['sizeFormatted'] = f"{size_bytes} B"
-                elif size_bytes < 1024 * 1024:
-                    formatted['sizeFormatted'] = f"{size_bytes / 1024:.1f} KB"
-                elif size_bytes < 1024 * 1024 * 1024:
-                    formatted['sizeFormatted'] = f"{size_bytes / (1024 * 1024):.1f} MB"
-                else:
-                    formatted['sizeFormatted'] = f"{size_bytes / (1024 * 1024 * 1024):.1f} GB"
-            except (ValueError, TypeError):
-                pass
-
-        # Extract owner names
-        if 'owners' in formatted and formatted['owners']:
-            formatted['ownerNames'] = [owner.get('displayName', 'Unknown') for owner in formatted['owners']]
-            formatted['ownerEmails'] = [owner.get('emailAddress', 'Unknown') for owner in formatted['owners']]
-
-        # Add file type description
-        if 'mimeType' in formatted:
-            mime_type = formatted['mimeType']
-            for key, value in self.MIME_TYPES.items():
-                if mime_type == value:
-                    formatted['fileType'] = key
-                    break
-            else:
-                # If not found in our predefined types
-                formatted['fileType'] = mime_type.split('/')[-1]
-
-        return formatted
-
-    def get_folder_id(self, folder_path: str) -> Optional[str]:
-        """
-        Get a folder ID from a path like 'folder1/folder2/folder3'.
-
-        Args:
-            folder_path: Path to the folder, using '/' as separator
-
-        Returns:
-            The folder ID if found, None otherwise
-        """
-        # Check if we've already resolved this path
-        if folder_path in self.folder_id_cache:
-            return self.folder_id_cache[folder_path]
-
-        # If it looks like an ID already, return it
-        if len(folder_path) > 25 and '/' not in folder_path:
-            return folder_path
-
-        # Split the path into components
-        parts = folder_path.split('/')
-
-        # Start from the root
-        current_folder_id = None
-        current_path = ""
-
-        # Traverse the path one folder at a time
-        for i, folder_name in enumerate(parts):
-            if not folder_name:  # Skip empty parts
-                continue
-
-            # Update the current path for caching
-            if current_path:
-                current_path += f"/{folder_name}"
-            else:
-                current_path = folder_name
-
-            # Check if we've already resolved this subpath
-            if current_path in self.folder_id_cache:
-                current_folder_id = self.folder_id_cache[current_path]
-                continue
-
-            # Search for the folder by name
-            query = f"mimeType='{self.MIME_TYPES['folder']}' and name='{folder_name}'"
-            if current_folder_id:
-                query += f" and '{current_folder_id}' in parents"
-
-            try:
-                results = self.service.files().list(
-                    q=query,
-                    spaces='drive',
-                    fields='files(id, name)',
-                    pageSize=10
-                ).execute()
-
-                files = results.get('files', [])
-                if not files:
-                    # Try a more flexible search if exact match fails
-                    query = query.replace(f"name='{folder_name}'", f"name contains '{folder_name}'")
-                    results = self.service.files().list(
-                        q=query,
-                        spaces='drive',
-                        fields='files(id, name)',
-                        pageSize=10
-                    ).execute()
-
-                    files = results.get('files', [])
-                    if not files:
-                        print(f"Could not find folder '{folder_name}' in path '{folder_path}'")
-                        return None
-
-                # Use the first match
-                current_folder_id = files[0]['id']
-
-                # Cache this result
-                self.folder_id_cache[current_path] = current_folder_id
-
-            except HttpError as error:
-                print(f"Error finding folder: {error}")
-                return None
-
-        return current_folder_id
-
-    def get_folders_in_folder(self, folder_path: str, include_metadata: bool = True) -> List[Dict[str, Any]]:
-        """
-        Get all subfolders in a folder specified by path.
-
-        Args:
-            folder_path: Path to the folder, using '/' as separator
-            include_metadata: Whether to include detailed metadata (default: True)
-
-        Returns:
-            List of folder metadata dictionaries
-        """
-        # Get the folder ID
-        folder_id = self.get_folder_id(folder_path)
-        if not folder_id:
-            print(f"Could not find folder: '{folder_path}'")
-            return []
-
-        # List all folders in this folder
-        query = f"'{folder_id}' in parents and mimeType = '{self.MIME_TYPES['folder']}'"
-
-        try:
-            results = self.service.files().list(
-                q=query,
-                spaces='drive',
-                fields=f'files({self.FOLDER_FIELDS})' if include_metadata else 'files(id, name)',
-                pageSize=1000
-            ).execute()
-
-            folders = results.get('files', [])
-
-            # Format metadata if requested
-            if include_metadata and folders:
-                folders = [self._format_metadata(folder) for folder in folders]
-
-            if folders:
-                print(f"Found {len(folders)} subfolders in '{folder_path}':")
-                for folder in folders:
-                    if include_metadata and 'createdTimeFormatted' in folder:
-                        print(f" - {folder['name']} (Created: {folder['createdTimeFormatted']})")
-                    else:
-                        print(f" - {folder['name']}")
-            else:
-                print(f"No subfolders found in '{folder_path}'")
-
-            return folders
-
-        except HttpError as error:
-            print(f"Error listing folders: {error}")
-            return []
-
-    def get_files_in_folder(self, folder_path: str, include_metadata: bool = True, include_content: bool = False) -> List[Dict[str, Any]]:
-        """
-        Get all files in a folder specified by path.
-
-        Args:
-            folder_path: Path to the folder, using '/' as separator
-            include_metadata: Whether to include detailed metadata (default: True)
-            include_content: Whether to include file content (default: False)
-
-        Returns:
-            List of file metadata dictionaries, optionally including file content
-        """
-        # Get the folder ID
-        folder_id = self.get_folder_id(folder_path)
-        if not folder_id:
-            print(f"Could not find folder: '{folder_path}'")
-            return []
-
-        # List all non-folder files in this folder
-        query = f"'{folder_id}' in parents and mimeType != '{self.MIME_TYPES['folder']}'"
-
-        try:
-            results = self.service.files().list(
-                q=query,
-                spaces='drive',
-                fields=f'files({self.FILE_FIELDS})' if include_metadata else 'files(id, name, mimeType)',
-                pageSize=1000
-            ).execute()
-
-            files = results.get('files', [])
-
-            # Format metadata if requested
-            if include_metadata and files:
-                files = [self._format_metadata(file) for file in files]
-
-            # Add file content if requested
-            if include_content and files:
-                for file in files:
-                    try:
-                        # Skip files that are likely not text-based
-                        if any(ext in file['name'].lower() for ext in ['.jpg', '.png', '.gif', '.mp3', '.mp4']):
-                            print(f"Skipping content for non-text file: {file['name']}")
-                            file['file_content'] = None
-                            continue
-
-                        # Read the file content
-                        content = self.read_file_from_object(file)
-                        file['file_content'] = content
-
-                        if content is not None:
-                            print(f"Successfully read content for: {file['name']} ({len(content)} characters)")
-                        else:
-                            print(f"Unable to read content for: {file['name']}")
-                    except Exception as e:
-                        print(f"Error reading content for {file['name']}: {e}")
-                        file['file_content'] = None
-
-            if files:
-                print(f"Found {len(files)} files in '{folder_path}':")
-                for file in files:
-                    if include_metadata and 'createdTimeFormatted' in file:
-                        print(f" - {file['name']} ({file.get('fileType', 'Unknown')}, Created: {file['createdTimeFormatted']})")
-                    else:
-                        print(f" - {file['name']} ({file.get('mimeType', 'Unknown')})")
-            else:
-                print(f"No files found in '{folder_path}'")
-
-            return files
-
-        except HttpError as error:
-            print(f"Error listing files: {error}")
-            return []
-
-    def get_file(self, file_name: str, folder_path: str, include_metadata: bool = True, include_content: bool = False) -> Optional[Dict[str, Any]]:
-        """
-        Get a specific file by name from a folder.
-
-        Args:
-            file_name: Name of the file to get
-            folder_path: Path to the folder containing the file
-            include_metadata: Whether to include detailed metadata (default: True)
-            include_content: Whether to include file content (default: False)
-
-        Returns:
-            File metadata dictionary, optionally including content, or None if file not found
-        """
-        # Get the folder ID
-        folder_id = self.get_folder_id(folder_path)
-        if not folder_id:
-            print(f"Could not find folder: '{folder_path}'")
-            return None
-
-        # Find the file by name in this folder
-        query = f"'{folder_id}' in parents and name = '{file_name}'"
-
-        try:
-            results = self.service.files().list(
-                q=query,
-                spaces='drive',
-                fields=f'files({self.FILE_FIELDS})' if include_metadata else 'files(id, name, mimeType)',
-                pageSize=1
-            ).execute()
-
-            files = results.get('files', [])
-            if not files:
-                # Try a more flexible search
-                query = query.replace(f"name = '{file_name}'", f"name contains '{file_name}'")
-                results = self.service.files().list(
-                    q=query,
-                    spaces='drive',
-                    fields=f'files({self.FILE_FIELDS})' if include_metadata else 'files(id, name, mimeType)',
-                    pageSize=10
-                ).execute()
-
-                files = results.get('files', [])
-                if not files:
-                    print(f"Could not find file '{file_name}' in '{folder_path}'")
-                    return None
-
-            # Use the first match
-            file = files[0]
-
-            # Format metadata if requested
-            if include_metadata:
-                file = self._format_metadata(file)
-
-            # Add file content if requested
-            if include_content:
-                try:
-                    # Skip files that are likely not text-based
-                    if any(ext in file['name'].lower() for ext in ['.jpg', '.png', '.gif', '.mp3', '.mp4']):
-                        print(f"Skipping content for non-text file: {file['name']}")
-                        file['file_content'] = None
-                    else:
-                        # Read the file content
-                        content = self.read_file_from_object(file)
-                        file['file_content'] = content
-
-                        if content is not None:
-                            print(f"Successfully read content for: {file['name']} ({len(content)} characters)")
-                        else:
-                            print(f"Unable to read content for: {file['name']}")
-                except Exception as e:
-                    print(f"Error reading content for {file['name']}: {e}")
-                    file['file_content'] = None
-
-            print(f"Found file: {file['name']}")
-            return file
-
-        except HttpError as error:
-            print(f"Error getting file: {error}")
-            return None
-
-    def get_all_files_in_folder(self, folder_path: str, include_metadata: bool = True, include_content: bool = False) -> List[Dict[str, Any]]:
-        """
-        Get all items (files and folders) in a folder specified by path.
-
-        Args:
-            folder_path: Path to the folder, using '/' as separator
-            include_metadata: Whether to include detailed metadata (default: True)
-            include_content: Whether to include file content (default: False)
-
-        Returns:
-            List of file and folder metadata dictionaries, optionally including file content
-        """
-        # Get the folder ID
-        folder_id = self.get_folder_id(folder_path)
-        if not folder_id:
-            print(f"Could not find folder: '{folder_path}'")
-            return []
-
-        # List all items in this folder
-        query = f"'{folder_id}' in parents"
-
-        try:
-            results = self.service.files().list(
-                q=query,
-                spaces='drive',
-                fields=f'files({self.FILE_FIELDS})' if include_metadata else 'files(id, name, mimeType)',
-                pageSize=1000
-            ).execute()
-
-            items = results.get('files', [])
-
-            # Format metadata if requested
-            if include_metadata and items:
-                items = [self._format_metadata(item) for item in items]
-
-            # Add file content if requested
-            if include_content and items:
-                for item in items:
-                    # Skip folders and non-text files
-                    if item.get('mimeType') == self.MIME_TYPES['folder'] or any(ext in item['name'].lower() for ext in ['.jpg', '.png', '.gif', '.mp3', '.mp4']):
-                        item['file_content'] = None
-                        continue
-
-                    try:
-                        # Read the file content
-                        content = self.read_file_from_object(item)
-                        item['file_content'] = content
-
-                        if content is not None:
-                            print(f"Successfully read content for: {item['name']} ({len(content)} characters)")
-                        else:
-                            print(f"Unable to read content for: {item['name']}")
-                    except Exception as e:
-                        print(f"Error reading content for {item['name']}: {e}")
-                        item['file_content'] = None
-
-            if items:
-                print(f"Found {len(items)} items in '{folder_path}':")
-                for item in items:
-                    if include_metadata and 'createdTimeFormatted' in item:
-                        item_type = 'Folder' if item.get('mimeType') == self.MIME_TYPES['folder'] else item.get('fileType', 'Unknown')
-                        print(f" - {item['name']} ({item_type}, Created: {item['createdTimeFormatted']})")
-                    else:
-                        item_type = 'Folder' if item.get('mimeType') == self.MIME_TYPES['folder'] else item.get('mimeType', 'Unknown')
-                        print(f" - {item['name']} ({item_type})")
-            else:
-                print(f"No items found in '{folder_path}'")
-
-            return items
-
-        except HttpError as error:
-            print(f"Error listing items: {error}")
-            return []
-
-    def file_exists(self, file_name: str, folder_path: str) -> bool:
-        """
-        Check if a file exists at the specified path in Google Drive.
-
-        Args:
-            file_name: Name of the file to check
-            folder_path: Path to the folder containing the file
-
-        Returns:
-            True if the file exists, False otherwise
-        """
-        # Get the folder ID
-        folder_id = self.get_folder_id(folder_path)
-        if not folder_id:
-            print(f"Could not find folder: '{folder_path}'")
-            return False
-
-        # Check if the file exists in this folder
-        query = f"'{folder_id}' in parents and name = '{file_name}'"
-
-        try:
-            results = self.service.files().list(
-                q=query,
-                spaces='drive',
-                fields='files(id, name)',
-                pageSize=1
-            ).execute()
-
-            files = results.get('files', [])
-            if not files:
-                # Try a more flexible search
-                query = query.replace(f"name = '{file_name}'", f"name contains '{file_name}'")
-                results = self.service.files().list(
-                    q=query,
-                    spaces='drive',
-                    fields='files(id, name)',
-                    pageSize=10
-                ).execute()
-
-                files = results.get('files', [])
-                if not files:
-                    print(f"File '{file_name}' does not exist in '{folder_path}'")
-                    return False
-
-            # File exists
-            print(f"File '{file_name}' exists in '{folder_path}'")
-            return True
-
-        except HttpError as error:
-            print(f"Error checking if file exists: {error}")
-            return False
-
-    def get_file_modified_time(self, file_name: str, folder_path: str) -> Optional[datetime.datetime]:
-        """
-        Get the last modified time of a file.
-
-        Args:
-            file_name: Name of the file
-            folder_path: Path to the folder containing the file
-
-        Returns:
-            The last modified time as a datetime object, or None if the file doesn't exist
-        """
-        # Get the file metadata
-        file = self.get_file(file_name, folder_path, include_metadata=True)
-        if not file:
-            return None
-
-        # Return the modified time
-        return file.get('modifiedTime')
-
-    def read_file_from_object(self, file_object: Dict[str, Any]) -> Optional[str]:
-        """
-        Read the contents of a file using a file object.
-
-        Args:
-            file_object: A Google file object with at least 'id' and 'mimeType' fields
-
-        Returns:
-            The file contents as a string, or None if the file couldn't be read
-        """
-        file_id = file_object.get('id')
-        mime_type = file_object.get('mimeType')
-
-        if not file_id or not mime_type:
-            print("File object is missing 'id' or 'mimeType' fields.")
-            return None
-
-        try:
-            # Read the file based on its type
-            if mime_type == self.MIME_TYPES['document']:
-                # Export Google Doc as plain text
-                response = self.service.files().export(
-                    fileId=file_id,
-                    mimeType='text/plain'
-                ).execute()
-                return response.decode('utf-8')
-
-            else:
-                # Download regular files
-                request = self.service.files().get_media(fileId=file_id)
-                fh = io.BytesIO()
-                downloader = MediaIoBaseDownload(fh, request)
-
-                done = False
-                while not done:
-                    _, done = downloader.next_chunk()
-
-                return fh.getvalue().decode('utf-8')
-
-        except HttpError as error:
-            print(f"Error reading file: {error}")
-            return None
-        except Exception as e:
-            print(f"Error decoding file content: {e}")
-            return None
src/ctp_slack_bot/services/google_drive_basic_usage.py
DELETED
@@ -1,178 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Basic Usage Examples for EasyGoogleDrive
|
3 |
-
|
4 |
-
This file demonstrates how to use the EasyGoogleDrive class to interact with Google Drive.
|
5 |
-
It provides examples of the main functionality without printing all results to keep the output clean.
|
6 |
-
"""
|
7 |
-
|
8 |
-
from google_drive_access import EasyGoogleDrive
|
9 |
-
import datetime
|
10 |
-
|
11 |
-
def main():
|
12 |
-
"""
|
13 |
-
Main function demonstrating the basic usage of EasyGoogleDrive.
|
14 |
-
"""
|
15 |
-
# Initialize the Google Drive client
|
16 |
-
# This will prompt for authentication the first time it's run
|
17 |
-
drive = EasyGoogleDrive()
|
18 |
-
|
19 |
-
# Example folder path - replace with your actual folder path
|
20 |
-
folder_path = "Spring-2025-BAI"
|
21 |
-
subfolder_path = "Spring-2025-BAI/transcripts"
|
22 |
-
|
23 |
-
print("=== Basic Usage Examples for EasyGoogleDrive ===\n")
|
24 |
-
|
25 |
-
# Example 1: List folders in a directory
|
26 |
-
print("Example 1: Listing folders in a directory")
|
27 |
-
print("----------------------------------------")
|
28 |
-
folders = drive.get_folders_in_folder(folder_path)
|
29 |
-
|
30 |
-
# Print only the first 3 folders (if any exist)
|
31 |
-
if folders:
|
32 |
-
print(f"Found {len(folders)} folders. Showing first 3:")
|
33 |
-
for i, folder in enumerate(folders[:3]):
|
34 |
-
print(f" - {folder['name']} (Created: {folder.get('createdTimeFormatted', 'Unknown')})")
|
35 |
-
if len(folders) > 3:
|
36 |
-
print(f" ... and {len(folders) - 3} more folders")
|
37 |
-
else:
|
38 |
-
print("No folders found.")
|
39 |
-
print()
|
40 |
-
|
41 |
-
# Example 2: List files in a directory
|
42 |
-
print("Example 2: Listing files in a directory")
|
43 |
-
print("--------------------------------------")
|
44 |
-
files = drive.get_files_in_folder(subfolder_path)
|
45 |
-
|
46 |
-
# Print only the first 3 files (if any exist)
|
47 |
-
if files:
|
48 |
-
print(f"Found {len(files)} files. Showing first 3:")
|
49 |
-
for i, file in enumerate(files[:3]):
|
50 |
-
file_type = file.get('fileType', 'Unknown')
|
51 |
-
created_time = file.get('createdTimeFormatted', 'Unknown')
|
52 |
-
print(f" - {file['name']} ({file_type}, Created: {created_time})")
|
53 |
-
if len(files) > 3:
|
54 |
-
print(f" ... and {len(files) - 3} more files")
|
55 |
-
else:
|
56 |
-
print("No files found.")
|
57 |
-
print()
|
58 |
-
|
59 |
-
# Example 3: Get a specific file
|
60 |
-
print("Example 3: Getting a specific file")
|
61 |
-
print("--------------------------------")
|
62 |
-
# Use the first file found in the previous example, or a default if none were found
|
63 |
-
file_name = files[-1]['name'] if files and len(files) > 0 else "example.txt"
|
64 |
-
|
65 |
-
file = drive.get_file(file_name, subfolder_path, include_metadata=True)
|
66 |
-
if file:
|
67 |
-
print(f"File found: {file['name']}")
|
68 |
-
print(f" Type: {file.get('fileType', 'Unknown')}")
|
69 |
-
print(f" Created: {file.get('createdTimeFormatted', 'Unknown')}")
|
70 |
-
print(f" Modified: {file.get('modifiedTimeFormatted', 'Unknown')}")
|
71 |
-
print(f" Size: {file.get('sizeFormatted', 'Unknown')}")
|
72 |
-
else:
|
73 |
-
print(f"File '{file_name}' not found.")
|
74 |
-
print()
|
75 |
-
|
76 |
-
# Example 4: Get all items in a folder (files and folders)
|
77 |
-
print("Example 4: Getting all items in a folder")
|
78 |
-
print("--------------------------------------")
|
79 |
-
all_items = drive.get_all_files_in_folder(folder_path)
|
80 |
-
|
81 |
-
# Print only the first 3 items (if any exist)
|
82 |
-
if all_items:
|
83 |
-
print(f"Found {len(all_items)} items. Showing first 3:")
|
84 |
-
for i, item in enumerate(all_items[:3]):
|
85 |
-
item_type = "Folder" if item.get('mimeType') == drive.MIME_TYPES['folder'] else item.get('fileType', 'Unknown')
|
86 |
-
created_time = item.get('createdTimeFormatted', 'Unknown')
|
87 |
-
print(f" - {item['name']} ({item_type}, Created: {created_time})")
|
88 |
-
if len(all_items) > 3:
|
89 |
-
print(f" ... and {len(all_items) - 3} more items")
|
90 |
-
else:
|
91 |
-
print("No items found.")
|
92 |
-
print()
|
93 |
-
|
94 |
-
# Example 5: Check if a file exists
|
95 |
-
print("Example 5: Checking if a file exists")
|
96 |
-
print("----------------------------------")
|
97 |
-
# Use the same file name from Example 3
|
98 |
-
-    file_to_check = file_name
-
-    exists = drive.file_exists(file_to_check, subfolder_path)
-    print(f"File '{file_to_check}' {'exists' if exists else 'does not exist'} in '{subfolder_path}'.")
-    print()
-
-    # Example 6: Get file modified time
-    print("Example 6: Getting file modified time")
-    print("-----------------------------------")
-    # Use the same file name from Example 3
-    file_to_check_time = file_name
-
-    modified_time = drive.get_file_modified_time(file_to_check_time, subfolder_path)
-    if modified_time:
-        print(f"File '{file_to_check_time}' was last modified on: {modified_time}")
-    else:
-        print(f"Could not get modified time for '{file_to_check_time}'.")
-    print()
-
-    # Example 7: Get file with content
-    print("Example 7: Getting file with content")
-    print("----------------------------------")
-    # Use the same file name from Example 3
-    file_with_content = file_name
-
-    file_with_content_obj = drive.get_file(file_with_content, subfolder_path, include_content=True)
-    if file_with_content_obj and 'file_content' in file_with_content_obj:
-        content = file_with_content_obj['file_content']
-        if content:
-            print(f"File '{file_with_content}' content (first 100 chars):")
-            print(f"  {content[:100]}...")
-        else:
-            print(f"File '{file_with_content}' has no content or content could not be read.")
-    else:
-        print(f"File '{file_with_content}' not found or content could not be retrieved.")
-    print()
-
-    # Example 8: Get contents of all files in a folder
-    print("Example 8: Getting contents of all files in a folder")
-    print("------------------------------------------------")
-    # Get all files with content
-    all_files_with_content = drive.get_files_in_folder(subfolder_path, include_content=True)
-
-    if all_files_with_content:
-        print(f"Found {len(all_files_with_content)} files. Showing content preview for first 3:")
-        for i, file in enumerate(all_files_with_content[:3]):
-            print(f"  File: {file['name']}")
-            if 'file_content' in file and file['file_content']:
-                content = file['file_content']
-                print(f"  Content preview: {content[:50]}...")
-            else:
-                print(f"  No content available or file is not text-based.")
-
-        if len(all_files_with_content) > 3:
-            print(f"  ... and {len(all_files_with_content) - 3} more files with content")
-    else:
-        print("No files found or no content could be retrieved.")
-    print()
-
-    # Example 9: Get content from a specific file using read_file_from_object
-    print("Example 9: Getting content from a specific file using read_file_from_object")
-    print("------------------------------------------------------------------------")
-    # Get a file object first
-    file_obj = drive.get_file(file_name, subfolder_path)
-
-    if file_obj:
-        # Read the content directly from the file object
-        content = drive.read_file_from_object(file_obj)
-        if content:
-            print(f"File '{file_obj['name']}' content (first 100 chars):")
-            print(f"  {content[:100]}...")
-        else:
-            print(f"File '{file_obj['name']}' has no content or content could not be read.")
-    else:
-        print(f"File '{file_name}' not found.")
-    print()
-
-    print("=== End of Examples ===")
-
-if __name__ == "__main__":
-    main()
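These usage examples disappear with this file; the new GoogleDriveService added below covers the same ground with a path-based API. A rough sketch of what Examples 5 through 7 become under the new service, assuming `drive` is a GoogleDriveService instance and assuming (not confirmed by this diff) that GoogleDriveMetadata exposes `id` and `modified_time` fields:

# Hypothetical port of the removed examples; `drive` is a GoogleDriveService.
meta = drive.get_metadata("CTP_Test_Folder/report.txt")  # hypothetical path; replaces file_exists + get_file_modified_time
if meta is not None:
    print(meta.modified_time)                 # field name assumed
    content = drive.read_file_by_id(meta.id)  # field name assumed; returns bytes or None
    if content:
        print(content[:100])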
src/ctp_slack_bot/services/google_drive_service.py
ADDED
@@ -0,0 +1,142 @@
+from datetime import datetime
+from cachetools import TTLCache
+from google.oauth2 import service_account
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaIoBaseDownload
+from googleapiclient.errors import HttpError
+from io import BytesIO
+from loguru import logger
+from pydantic import BaseModel, PrivateAttr
+from typing import Collection, Dict, List, Optional, Self
+
+from ctp_slack_bot.core import Settings
+from ctp_slack_bot.models import GoogleDriveMetadata
+
+
+FOLDER_MIME_TYPE: str = "application/vnd.google-apps.folder"
+PATH_SEPARATOR: str = "/"
+
+
+class GoogleDriveService(BaseModel):
+    """Service for interacting with Google Drive."""
+
+    settings: Settings
+    _google_drive_client: PrivateAttr = PrivateAttr()
+    _folder_cache: PrivateAttr = PrivateAttr(default_factory=lambda: TTLCache(maxsize=256, ttl=60))
+
+    class Config:
+        frozen = True
+
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
+        credentials = service_account.Credentials.from_service_account_info({
+            "type": "service_account",
+            "project_id": self.settings.GOOGLE_PROJECT_ID,
+            "private_key_id": self.settings.GOOGLE_PRIVATE_KEY_ID.get_secret_value(),
+            "private_key": self.settings.GOOGLE_PRIVATE_KEY.get_secret_value(),
+            "client_email": self.settings.GOOGLE_CLIENT_EMAIL,
+            "client_id": self.settings.GOOGLE_CLIENT_ID,
+            "token_uri": self.settings.GOOGLE_TOKEN_URI,
+        }, scopes=["https://www.googleapis.com/auth/drive"])
+        self._google_drive_client = build('drive', 'v3', credentials=credentials)
+        logger.debug("Created {}", self.__class__.__name__)
+
+    def _resolve_folder_id(self: Self, folder_path: str) -> Optional[str]:
+        """Resolve a folder path to a Google Drive ID."""
+
+        if not folder_path:
+            return self.settings.GOOGLE_DRIVE_ROOT_ID
+
+        if folder_path in self._folder_cache:
+            return self._folder_cache[folder_path]
+
+        current_id = self.settings.GOOGLE_DRIVE_ROOT_ID
+        try:
+            for part in folder_path.split(PATH_SEPARATOR):
+                results = self._google_drive_client.files().list(
+                    q=f"name='{part.replace("\\", "\\\\").replace("'", "\\'")}' and mimeType='{FOLDER_MIME_TYPE}' and '{current_id}' in parents",
+                    fields="files(id,name)",
+                    supportsAllDrives=True,
+                    includeItemsFromAllDrives=True
+                ).execute()
+                match results:
+                    case {"files": [{"id": id}]}:
+                        current_id = id
+                    case _:
+                        logger.debug("Folder not found by path: {}", folder_path)
+                        return None
+        except HttpError as e:
+            logger.error("Error resolving folder path, {}: {}", folder_path, e)
+            return None
+
+        self._folder_cache[folder_path] = current_id
+        return current_id
+
+    def list_directory(self: Self, folder_path: str) -> Collection[GoogleDriveMetadata]:
+        """List contents of a directory with basic metadata."""
+
+        folder_id = self._resolve_folder_id(folder_path)
+        if not folder_id:
+            logger.debug("Folder not found by path: {}", folder_path)
+            return ()
+
+        try:
+            results = self._google_drive_client.files().list(
+                q=f"'{folder_id}' in parents",
+                fields="files(id,name,mimeType,modifiedTime)",
+                supportsAllDrives=True,
+                includeItemsFromAllDrives=True,
+                pageSize=1000
+            ).execute()
+            return tuple(GoogleDriveMetadata.from_folder_path_and_dict(folder_path, result)
+                         for result
+                         in results.get('files', ()))
+        except HttpError as e:
+            logger.error("Error listing folder by path, {}: {}", folder_path, e)
+            return ()
+
+    def get_metadata(self: Self, item_path: str) -> Optional[GoogleDriveMetadata]:
+        """Get metadata for a specific file/folder by path."""
+
+        match item_path.rsplit(PATH_SEPARATOR, 1):
+            case [item_name]:
+                folder_path = ""
+                folder_id = self.settings.GOOGLE_DRIVE_ROOT_ID
+            case [folder_path, item_name]:
+                folder_id = self._resolve_folder_id(folder_path)
+
+        if not folder_id:
+            logger.debug("Folder not found by path: {}", folder_path)
+            return None
+
+        try:
+            results = self._google_drive_client.files().list(
+                q=f"name='{item_name}' and '{folder_id}' in parents",
+                fields="files(id,name,mimeType,modifiedTime)",
+                supportsAllDrives=True,
+                includeItemsFromAllDrives=True,
+                pageSize=1
+            ).execute()
+            match results:
+                case {"files": [result]}:
+                    return GoogleDriveMetadata.from_folder_path_and_dict(folder_path, result)
+        except HttpError as e:
+            logger.error("Error getting metadata for item by path, {}: {}", item_path, e)
+
+        logger.debug("Item not found by path: {}", item_path)
+        return None
+
+    def read_file_by_id(self: Self, file_id: str) -> Optional[bytes]:
+        """Read contents of a file by its unique identifier."""
+
+        try:
+            request = self._google_drive_client.files().get_media(fileId=file_id)
+            buffer = BytesIO()
+            downloader = MediaIoBaseDownload(buffer, request)
+            done = False
+            while not done:
+                _, done = downloader.next_chunk()
+            return buffer.getvalue()
+        except HttpError as e:
+            logger.error("Error reading file by ID, {}: {}", file_id, e)
+            return None
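For orientation, a minimal sketch of exercising the new service directly, assuming a Settings instance whose GOOGLE_* fields are populated from the environment (the `.id` attribute on GoogleDriveMetadata is an assumption inferred from the `files(id,...)` field lists requested above; the paths are placeholders):

from ctp_slack_bot.core import Settings
from ctp_slack_bot.services.google_drive_service import GoogleDriveService

drive = GoogleDriveService(settings=Settings())

# Paths are resolved segment by segment under GOOGLE_DRIVE_ROOT_ID,
# with resolved folder IDs cached for 60 seconds via the TTLCache above.
for item in drive.list_directory("transcripts/2024"):
    print(item)

meta = drive.get_metadata("transcripts/2024/lecture-01.vtt")  # hypothetical path
if meta is not None:
    raw = drive.read_file_by_id(meta.id)  # bytes, or None on HttpError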
src/ctp_slack_bot/services/language_model_service.py
ADDED
@@ -0,0 +1,55 @@
+from loguru import logger
+from openai import OpenAI
+from openai.types.chat import ChatCompletion
+from pydantic import BaseModel, PrivateAttr
+from typing import Collection, Self
+
+from ctp_slack_bot.core import Settings
+from ctp_slack_bot.models import Chunk
+
+class LanguageModelService(BaseModel):
+    """
+    Service for language model operations.
+    """
+
+    settings: Settings
+    _open_ai_client: PrivateAttr = PrivateAttr()
+
+    class Config:
+        frozen = True
+
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
+        self._open_ai_client = OpenAI(api_key=self.settings.OPENAI_API_KEY.get_secret_value())
+        logger.debug("Created {}", self.__class__.__name__)
+
+    def answer_question(self, question: str, context: Collection[Chunk]) -> str:
+        """Generate a response using OpenAI’s API with retrieved context.
+
+        Args:
+            question (str): The user’s question
+            context (Collection[Chunk]): The context retrieved for answering the question
+
+        Returns:
+            str: Generated answer
+        """
+        logger.debug("Generating response for question “{}” using {} context chunks…", question, len(context))
+        messages = [
+            {"role": "system", "content": self.settings.SYSTEM_PROMPT},
+            {"role": "user", "content":
+                f"""Student Question: {question}
+
+Context from class materials and transcripts:
+{'\n'.join(chunk.text for chunk in context)}
+
+Please answer the Student Question based on the Context from class materials and transcripts. If the context doesn’t contain relevant information, acknowledge that and suggest asking the professor."""}
+        ]
+        response: ChatCompletion = self._open_ai_client.chat.completions.create(
+            model=self.settings.CHAT_MODEL,
+            messages=messages,
+            max_tokens=self.settings.MAX_TOKENS,
+            temperature=self.settings.TEMPERATURE
+        )
+
+        return response.choices[0].message.content
+        # return f"Mock response to “{question}”"
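For reference, a minimal sketch of calling the new service. The Chunk construction here is purely illustrative (in the running bot, chunks come from vector search), and Settings is assumed to supply OPENAI_API_KEY, CHAT_MODEL, SYSTEM_PROMPT, MAX_TOKENS, and TEMPERATURE:

from ctp_slack_bot.core import Settings
from ctp_slack_bot.models import Chunk
from ctp_slack_bot.services.language_model_service import LanguageModelService

llm = LanguageModelService(settings=Settings())
context = [Chunk(text="Week 3 covered pandas DataFrames.",  # hypothetical chunk
                 parent_id="lecture-03", chunk_id="0", metadata={})]
print(llm.answer_question("What did we cover in week 3?", context))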
src/ctp_slack_bot/services/question_dispatch_service.py
CHANGED
@@ -1,11 +1,11 @@
 # from asyncio import create_task
 from loguru import logger
-from
-from
-from typing import List, Optional, Self, Tuple
+from pydantic import BaseModel
+from typing import Self
 
 from ctp_slack_bot.core import Settings
-from ctp_slack_bot.
+from ctp_slack_bot.enums import EventType
+from ctp_slack_bot.models import Chunk, SlackMessage
 from ctp_slack_bot.services.answer_retrieval_service import AnswerRetrievalService
 from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
 from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
@@ -21,11 +21,16 @@ class QuestionDispatchService(BaseModel):
     context_retrieval_service: ContextRetrievalService
     answer_retrieval_service: AnswerRetrievalService
 
-
-
+    class Config:
+        frozen = True
+
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
+        self.event_brokerage_service.subscribe(EventType.INCOMING_SLACK_MESSAGE, self.__process_incoming_slack_message)
         logger.debug("Created {}", self.__class__.__name__)
-        return self
 
-    def
-
-
+    async def __process_incoming_slack_message(self: Self, message: SlackMessage) -> None:
+        if message.subtype != 'bot_message':
+            logger.debug("Question dispatch service received an answerable question: {}", message.text)
+            context = await self.context_retrieval_service.get_context(message)
+            await self.answer_retrieval_service.push(message, context)
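The dispatch service is now wired entirely through the event broker: it subscribes to INCOMING_SLACK_MESSAGE at construction, skips bot messages to avoid answering itself, and chains context retrieval into answer retrieval. A sketch of the pub/sub contract it relies on, with signatures as used in this diff (the SlackMessage field values are placeholders, and whether every field is Optional is an assumption):

from ctp_slack_bot.enums import EventType
from ctp_slack_bot.models import SlackMessage

async def demo(broker) -> None:
    message = SlackMessage(type="message", subtype=None, channel="C123", channel_type="channel",
                           user="U123", bot_id=None, thread_ts=None,
                           text="What is a DataFrame?", ts="1.0", event_ts="1.0")
    # Publishing fans out to every subscriber, including __process_incoming_slack_message:
    await broker.publish(EventType.INCOMING_SLACK_MESSAGE, message)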
src/ctp_slack_bot/services/schedule_service.py
ADDED
@@ -0,0 +1,68 @@
+from apscheduler.schedulers.asyncio import AsyncIOScheduler
+from apscheduler.triggers.cron import CronTrigger
+from asyncio import create_task, iscoroutinefunction, to_thread
+from datetime import datetime
+from dependency_injector.resources import Resource
+from loguru import logger
+from pydantic import BaseModel, PrivateAttr
+from pytz import timezone
+from typing import Optional, Self
+
+from ctp_slack_bot.core import Settings
+
+class ScheduleService(BaseModel):
+    """
+    Service for running scheduled tasks.
+    """
+
+    settings: Settings
+    _scheduler: PrivateAttr
+
+    class Config:
+        frozen = True
+
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
+        zone = self.settings.SCHEDULER_TIMEZONE
+        self._scheduler = AsyncIOScheduler(timezone=timezone(zone))
+        self._configure_jobs()  # Must run after the scheduler exists, since jobs register on it.
+        logger.debug("Created {}", self.__class__.__name__)
+
+    def _configure_jobs(self: Self) -> None:
+        # Example jobs (uncomment and implement as needed)
+        # self._scheduler.add_job(
+        #     send_error_report,
+        #     CronTrigger(hour=7, minute=0),
+        #     id="daily_error_report",
+        #     name="Daily Error Report",
+        #     replace_existing=True,
+        # )
+        # self._scheduler.add_job(
+        #     cleanup_old_transcripts,
+        #     CronTrigger(day_of_week="sun", hour=1, minute=0),
+        #     id="weekly_transcript_cleanup",
+        #     name="Weekly Transcript Cleanup",
+        #     replace_existing=True,
+        # )
+        pass
+
+    def start(self: Self) -> None:
+        self._scheduler.start()
+
+    def stop(self: Self) -> None:
+        if self._scheduler.running:
+            self._scheduler.shutdown()
+        else:
+            logger.debug("The scheduler is not running. There is no scheduler to shut down.")
+
+class ScheduleServiceResource(Resource):
+    def init(self: Self, settings: Settings) -> ScheduleService:
+        logger.info("Starting scheduler…")
+        schedule_service = ScheduleService(settings=settings)
+        schedule_service.start()
+        return schedule_service
+
+    def shutdown(self: Self, schedule_service: ScheduleService) -> None:
+        """Stop scheduler on shutdown."""
+        schedule_service.stop()
+        logger.info("Stopped scheduler.")
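Registering a real job later follows the standard APScheduler pattern shown in the commented examples. A minimal sketch with a hypothetical job function (the job name and schedule are illustrative only):

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger

async def reindex_transcripts() -> None:  # hypothetical job body
    ...

def configure_jobs(scheduler: AsyncIOScheduler) -> None:
    # Same pattern as the commented examples in _configure_jobs above.
    scheduler.add_job(
        reindex_transcripts,              # AsyncIOScheduler runs coroutine functions natively
        CronTrigger(hour=2, minute=30),   # daily at 02:30 in SCHEDULER_TIMEZONE
        id="nightly_reindex",
        name="Nightly Transcript Reindex",
        replace_existing=True,
    )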
src/ctp_slack_bot/services/slack_service.py
CHANGED
@@ -1,11 +1,12 @@
-
+from dependency_injector.resources import Resource
 from loguru import logger
 from openai import OpenAI
-from pydantic import BaseModel
-from
+from pydantic import BaseModel
+from slack_bolt.async_app import AsyncApp
+from typing import Any, Mapping, Self
 
-from ctp_slack_bot.
-from ctp_slack_bot.models import
+from ctp_slack_bot.enums import EventType
+from ctp_slack_bot.models import SlackMessage, SlackResponse
 from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
 
 
@@ -14,10 +15,55 @@ class SlackService(BaseModel):
     Service for interfacing with Slack.
     """
 
-    settings: Settings
     event_brokerage_service: EventBrokerageService
+    slack_bolt_app: AsyncApp
 
+    class Config:
+        arbitrary_types_allowed = True
+        frozen = True
+
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
+        self.event_brokerage_service.subscribe(EventType.OUTGOING_SLACK_RESPONSE, self.send_message)
         logger.debug("Created {}", self.__class__.__name__)
-
+
+    def adapt_event_payload(self: Self, event: Mapping[str, Any]) -> SlackMessage:
+        return SlackMessage(
+            type=event.get("type"),
+            subtype=event.get("subtype"),
+            channel=event.get("channel"),
+            channel_type=event.get("channel_type"),
+            user=event.get("user"),
+            bot_id=event.get("bot_id"),
+            thread_ts=event.get("thread_ts"),
+            text=event.get("text", ""),
+            ts=event.get("ts"),
+            event_ts=event.get("event_ts")
+        )
+
+    async def process_message(self: Self, event: Mapping[str, Any]) -> None:
+        slack_message = self.adapt_event_payload(event.get("event", {}))
+        logger.debug("Received message from Slack: {}", slack_message)
+        await self.event_brokerage_service.publish(EventType.INCOMING_SLACK_MESSAGE, slack_message)
+
+    async def send_message(self: Self, message: SlackResponse) -> None:
+        await self.slack_bolt_app.client.chat_postMessage(channel=message.channel, text=message.text, thread_ts=message.thread_ts)
+
+    async def handle_message_event(self: Self, body: Mapping[str, Any]) -> None:
+        logger.debug("Ignored regular message: {}", body.get("event", {}).get("text"))
+        # await self.process_message(body)
+
+    async def handle_app_mention_event(self: Self, body: Mapping[str, Any]) -> None:
+        logger.debug("Received app mention for processing: {}", body.get("event", {}).get("text"))
+        await self.process_message(body)
+
+    def register(self: Self) -> None:
+        self.slack_bolt_app.event("message")(self.handle_message_event)
+        self.slack_bolt_app.event("app_mention")(self.handle_app_mention_event)
+        logger.debug("Registered 2 handlers for Slack Bolt message and app mention events.")
+
+class SlackServiceResource(Resource):
+    def init(self: Self, event_brokerage_service: EventBrokerageService, slack_bolt_app: AsyncApp) -> SlackService:
+        slack_service = SlackService(event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
+        slack_service.register()
+        return slack_service
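To see what adapt_event_payload consumes, here is a trimmed app_mention payload in the shape Slack's Events API delivers to Bolt handlers (IDs and text are placeholders):

body = {
    "event": {
        "type": "app_mention",
        "channel": "C0123456789",
        "user": "U0123456789",
        "text": "<@U_BOT> what did we cover in week 3?",
        "ts": "1712345678.000100",
        "event_ts": "1712345678.000100",
    }
}
# handle_app_mention_event(body) adapts body["event"] into a SlackMessage and
# publishes it on the broker as EventType.INCOMING_SLACK_MESSAGE; fields absent
# from the payload (subtype, bot_id, thread_ts, channel_type) fall back to None.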
src/ctp_slack_bot/services/vector_database_service.py
CHANGED
@@ -1,111 +1,118 @@
 from loguru import logger
-from pydantic import BaseModel
-from typing import Any, Dict, List, Self
+from pydantic import BaseModel
+from typing import Any, Collection, Dict, List, Optional, Self, Sequence
 
 from ctp_slack_bot.core import Settings
 from ctp_slack_bot.db import MongoDB
-from ctp_slack_bot.models import
+from ctp_slack_bot.models import Chunk, VectorizedChunk, VectorQuery
 
 class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
     """
     Service for storing and retrieving vector embeddings from MongoDB.
     """
-
     settings: Settings
     mongo_db: MongoDB
 
+    class Config:
+        frozen = True
+
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
         logger.debug("Created {}", self.__class__.__name__)
-        return self
 
-    def
+    async def store(self: Self, chunks: Collection[VectorizedChunk]) -> None:
         """
+        Stores vectorized chunks and their embedding vectors in the database.
 
         Args:
+            chunks: Collection of VectorizedChunk objects to store
 
-        Returns:
-            bool: True if the content exists, False otherwise
+        Returns: None
         """
-        if not
+        if not chunks:
+            logger.debug("No chunks to store")
+            return
 
         try:
-            #
+            # Get the vector collection - this will create it if it doesn't exist
+            logger.debug("Getting vectors collection for storing {} chunks", len(chunks))
+            vector_collection = await self.mongo_db.get_collection("vectors")
+
+            # Ensure vector search index exists
+            logger.debug("Creating vector search index for vectors collection")
+            await self.mongo_db.create_indexes("vectors")
+
+            # Create documents to store, ensuring compatibility with BSON
+            documents = []
+            for chunk in chunks:
+                # Convert embedding to standard list format (important for BSON compatibility)
+                embedding = list(chunk.embedding) if not isinstance(chunk.embedding, list) else chunk.embedding
+
+                # Build document with proper structure
+                document = {
+                    "text": chunk.text,
+                    "embedding": embedding,
+                    "metadata": chunk.metadata,
+                    "parent_id": chunk.parent_id,
+                    "chunk_id": chunk.chunk_id
+                }
+                documents.append(document)
+
+            # Insert into collection as a batch
+            logger.debug("Inserting {} documents into vectors collection", len(documents))
+            result = await vector_collection.insert_many(documents)
+            logger.info("Stored {} vector chunks in database", len(result.inserted_ids))
 
-            return result is not None
         except Exception as e:
-            logger.error(
+            logger.error("Error storing vector embeddings: {}", str(e))
+            # Include more diagnostic information
+            logger.debug("MongoDB connection info: URI defined: {}, DB name: {}",
+                         bool(self.settings.MONGODB_URI), self.settings.MONGODB_NAME)
             raise
 
-    def
+    async def content_exists(self: Self, key: str) -> bool:  # TODO: implement this.
         """
-
+        Check if content exists in the database.
 
         Args:
-
-        embedding: The vector embedding of the text
-        metadata: Additional metadata about the text (source, timestamp, etc.)
-
-        Returns:
-            str: The ID of the stored document
+            key: The key to check for content existence
         """
-
-        try:
-            # Create document to store
-            document = {
-                "text": text,
-                "embedding": embedding,
-                "metadata": metadata
-            }
-
-            # Insert into collection
-            result = self.mongo_db.vector_collection.insert_one(document)
-            logger.debug(f"Stored document with ID: {result.inserted_id}")
-
-            return str(result.inserted_id)
-        except Exception as e:
-            logger.error(f"Error storing embedding: {str(e)}")
-            raise
-
-    def search_by_similarity(self, query: VectorQuery, query_embedding: List[float]) -> List[RetreivedContext]:
+        pass
+
+    async def search_by_similarity(self: Self, query: VectorQuery) -> Sequence[Chunk]:
         """
         Query the vector database for similar documents.
 
         Args:
             query: VectorQuery object with search parameters
-
+
         Returns:
-
+            Sequence[Chunk]: List of similar chunks
         """
-        if not self.mongo_db.initialized:
-            self.mongo_db.initialize()
-
         try:
-            #
+            # Get the vector collection
+            logger.debug("Getting vectors collection for similarity search")
+            vector_collection = await self.mongo_db.get_collection("vectors")
+
+            # Build aggregation pipeline for vector search using official MongoDB format
+            logger.debug("Building vector search pipeline with query embedding dimension: {}", len(query.query_embeddings))
             pipeline = [
                 {
-                    "$
-                        "index": "
-                        "
+                    "$vectorSearch": {
+                        "index": "vectors_vector_index",
+                        "path": "embedding",
+                        "queryVector": query.query_embeddings,  # list(query.query_embeddings)
+                        "numCandidates": query.k * 10,
+                        "limit": query.k
                     }
                 },
                 {
                     "$project": {
-                        "_id": 0,
                         "text": 1,
                         "metadata": 1,
-                        "
+                        "parent_id": 1,
+                        "chunk_id": 1,
+                        "score": { "$meta": "vectorSearchScore" }
                     }
                 }
             ]
@@ -114,33 +121,55 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
             if query.filter_metadata:
                 metadata_filter = {f"metadata.{k}": v for k, v in query.filter_metadata.items()}
                 pipeline.insert(1, {"$match": metadata_filter})
+                logger.debug("Added metadata filters to search: {}", query.filter_metadata)
 
-            #
+            # Add score threshold filter if needed
+            if query.score_threshold > 0:
+                pipeline.append({
+                    "$match": {
+                        "score": { "$gte": query.score_threshold }
+                    }
+                })
+                logger.debug("Added score threshold filter: {}", query.score_threshold)
 
+            try:
+                # Execute the vector search pipeline
+                logger.debug("Executing vector search pipeline")
+                results = await vector_collection.aggregate(pipeline).to_list(length=query.k)
+                logger.debug("Vector search returned {} results", len(results))
+            except Exception as e:
+                logger.warning("Vector search failed: {}. Falling back to basic text search.", str(e))
+                # Fall back to basic filtering with limit
+                query_filter = {}
+                if query.filter_metadata:
+                    query_filter.update({f"metadata.{k}": v for k, v in query.filter_metadata.items()})
 
+                logger.debug("Executing fallback basic search with filter: {}", query_filter)
+                results = await vector_collection.find(query_filter).limit(query.k).to_list(length=query.k)
+                logger.debug("Fallback search returned {} results", len(results))
+
+            # Convert results to Chunk objects
+            chunks = []
+            for result in results:
+                chunk = Chunk(
+                    text=result["text"],
+                    parent_id=result["parent_id"],
+                    chunk_id=result["chunk_id"],
+                    metadata={
+                        **result["metadata"],
+                        "similarity_score": result.get("score", 0)
+                    }
                 )
+                chunks.append(chunk)
 
-            logger.
-            return
+            logger.info("Found {} similar chunks with similarity search", len(chunks))
+            return chunks
 
         except Exception as e:
-            logger.error(
+            logger.error("Error in similarity search: {}", str(e))
+            # Include additional diagnostic information
+            logger.debug("MongoDB connection info: URI defined: {}, DB name: {}",
+                         bool(self.settings.MONGODB_URI), self.settings.MONGODB_NAME)
+            logger.debug("Query details: k={}, dimension={}",
+                         query.k, len(query.query_embeddings) if query.query_embeddings else "None")
+            raise
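A minimal round trip through the service (VectorQuery field names follow the attributes referenced above: query_embeddings, k, score_threshold, filter_metadata; the Atlas Search index vectors_vector_index must exist for the $vectorSearch stage to succeed, otherwise the method quietly falls back to a plain find):

from ctp_slack_bot.models import VectorQuery

async def find_context(vector_database_service, embedding: list[float]) -> None:
    query = VectorQuery(
        query_embeddings=embedding,            # e.g. one vector from EmbeddingsModelService
        k=5,                                   # numCandidates becomes 5 * 10
        score_threshold=0.75,                  # post-filter on vectorSearchScore
        filter_metadata={"source": "webvtt"},  # hypothetical metadata key
    )
    for chunk in await vector_database_service.search_by_similarity(query):
        print(chunk.metadata["similarity_score"], chunk.text[:80])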
src/ctp_slack_bot/services/vectorization_service.py
CHANGED
@@ -1,10 +1,10 @@
 from loguru import logger
-
-from
-from pydantic import BaseModel, model_validator
-from typing import List, Optional, Self
+from pydantic import BaseModel
+from typing import Self, Sequence
 
 from ctp_slack_bot.core import Settings
+from ctp_slack_bot.models import Chunk, VectorizedChunk
+from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
 
 class VectorizationService(BaseModel):
     """
@@ -12,57 +12,23 @@ class VectorizationService(BaseModel):
     """
 
     settings: Settings
-
+    embeddings_model_service: EmbeddingsModelService
 
     class Config:
-
+        frozen = True
 
-
-
+    def __init__(self: Self, **data) -> None:
+        super().__init__(**data)
         logger.debug("Created {}", self.__class__.__name__)
-        return self
-
-    def get_embeddings(self, texts: List[str]) -> np.ndarray:
-        """Get embeddings for a list of texts using OpenAI's API.
-
-        Args:
-            texts (List[str]): List of text chunks to embed
-
-        Returns:
-            np.ndarray: Array of embeddings with shape (n_texts, VECTOR_DIMENSION)
-
-        Raises:
-            ValueError: If the embedding dimensions don't match expected size
-        """
-        try:
-            # Use the initialized client instead of the global openai module
-            response = self.client.embeddings.create(
-                model=self.settings.EMBEDDING_MODEL,
-                input=texts,
-                encoding_format="float"  # Ensure we get raw float values
-            )
-
-            # Extract embeddings and verify dimensions
-            embeddings = np.array([data.embedding for data in response.data])
-
-            if embeddings.shape[1] != self.settings.VECTOR_DIMENSION:
-                raise ValueError(
-                    f"Embedding dimension mismatch. Expected {self.settings.VECTOR_DIMENSION}, "
-                    f"but got {embeddings.shape[1]}. Please update VECTOR_DIMENSION "
-                    f"in config.py to match the model's output."
-                )
-
-            return embeddings
-
-        except Exception as e:
-            print(f"Error getting embeddings: {str(e)}")
-            pass
 
-    def
+    def vectorize(self: Self, chunks: Sequence[Chunk]) -> Sequence[VectorizedChunk]:
+        embeddings = self.embeddings_model_service.get_embeddings([chunk.text for chunk in chunks])
+        return tuple(VectorizedChunk(
+                         text=chunk.text,
+                         parent_id=chunk.parent_id,
+                         chunk_id=chunk.chunk_id,
+                         metadata=chunk.metadata,
+                         embedding=embedding
+                     )
+                     for chunk, embedding
+                     in zip(chunks, embeddings))
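Taken together with the vector database service above, ingestion reduces to two calls. A sketch assuming `chunks` came from one of this PR's content parsers (for example the new WebVTT model) and that the services were resolved from the DI container:

async def ingest(vectorization_service, vector_database_service, chunks) -> None:
    vectorized = vectorization_service.vectorize(chunks)  # one embedding per chunk, order-aligned via zip
    await vector_database_service.store(vectorized)       # batch insert into the "vectors" collection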
src/ctp_slack_bot/tasks/__init__.py
CHANGED
@@ -1 +0,0 @@
-from ctp_slack_bot.tasks.scheduler import start_scheduler, stop_scheduler