Spaces:
Runtime error
Runtime error
Update `GoogleDriveService` and add example usage notebook
Browse files- .env.template +1 -0
- notebooks/google_drive.ipynb +0 -0
- notebooks/{web-vtt.ipynb → web_vtt.ipynb} +0 -0
- pyproject.toml +1 -0
- src/ctp_slack_bot/containers.py +2 -0
- src/ctp_slack_bot/core/config.py +1 -0
- src/ctp_slack_bot/models/__init__.py +1 -0
- src/ctp_slack_bot/models/google_drive.py +4 -4
- src/ctp_slack_bot/services/__init__.py +1 -0
- src/ctp_slack_bot/services/google_drive_service.py +48 -50
.env.template
CHANGED
@@ -30,6 +30,7 @@ TEMPERATURE=0.8
|
|
30 |
SYSTEM_PROMPT="You are a helpful teaching assistant for a data science class.\nBased on the students question, you will be given context retreived from class transcripts and materials to answer their question.\nYour responses should be:\n\n1. Accurate and based on the class content\n2. Clear and educational\n3. Concise but complete\nIf you're unsure about something, acknowledge it and suggest asking the professor."
|
31 |
|
32 |
# Google Drive Configuration
|
|
|
33 |
GOOGLE_PROJECT_ID=insufferable-slacker-123456
|
34 |
GOOGLE_PRIVATE_KEY_ID=1a2b3c4d5e6f748891091d21304e506674829507
|
35 |
GOOGLE_PRIVATE_KEY="-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASC...\n-----END PRIVATE KEY-----\n"
|
|
|
30 |
SYSTEM_PROMPT="You are a helpful teaching assistant for a data science class.\nBased on the students question, you will be given context retreived from class transcripts and materials to answer their question.\nYour responses should be:\n\n1. Accurate and based on the class content\n2. Clear and educational\n3. Concise but complete\nIf you're unsure about something, acknowledge it and suggest asking the professor."
|
31 |
|
32 |
# Google Drive Configuration
|
33 |
+
GOOGLE_DRIVE_ROOT_ID=1NB91EcIUXbOVcdCkXOAHdmWrDfgoh9fQ
|
34 |
GOOGLE_PROJECT_ID=insufferable-slacker-123456
|
35 |
GOOGLE_PRIVATE_KEY_ID=1a2b3c4d5e6f748891091d21304e506674829507
|
36 |
GOOGLE_PRIVATE_KEY="-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASC...\n-----END PRIVATE KEY-----\n"
|
notebooks/google_drive.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/{web-vtt.ipynb → web_vtt.ipynb}
RENAMED
File without changes
|
pyproject.toml
CHANGED
@@ -21,6 +21,7 @@ classifiers = [
|
|
21 |
dependencies = [
|
22 |
"pydantic>=2.11.2",
|
23 |
"pydantic-settings>=2.8.1",
|
|
|
24 |
"more-itertools>=10.6.0",
|
25 |
"python-dotenv>=1.1.0",
|
26 |
"loguru>=0.7.3",
|
|
|
21 |
dependencies = [
|
22 |
"pydantic>=2.11.2",
|
23 |
"pydantic-settings>=2.8.1",
|
24 |
+
"cachetools>=5.5.2",
|
25 |
"more-itertools>=10.6.0",
|
26 |
"python-dotenv>=1.1.0",
|
27 |
"loguru>=0.7.3",
|
src/ctp_slack_bot/containers.py
CHANGED
@@ -11,6 +11,7 @@ from ctp_slack_bot.services.content_ingestion_service import ContentIngestionSer
|
|
11 |
from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
|
12 |
from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
|
13 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
|
|
14 |
from ctp_slack_bot.services.language_model_service import LanguageModelService
|
15 |
from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
|
16 |
from ctp_slack_bot.services.schedule_service import ScheduleServiceResource
|
@@ -36,3 +37,4 @@ class Container(DeclarativeContainer):
|
|
36 |
slack_bolt_app = Singleton(AsyncApp, token=settings.provided.SLACK_BOT_TOKEN().get_secret_value())
|
37 |
slack_service = Resource(SlackServiceResource, event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
|
38 |
socket_mode_handler = Singleton(lambda _, app, app_token: AsyncSocketModeHandler(app, app_token), slack_service, slack_bolt_app, settings.provided.SLACK_APP_TOKEN().get_secret_value())
|
|
|
|
11 |
from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
|
12 |
from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
|
13 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
14 |
+
from ctp_slack_bot.services.google_drive_service import GoogleDriveService
|
15 |
from ctp_slack_bot.services.language_model_service import LanguageModelService
|
16 |
from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
|
17 |
from ctp_slack_bot.services.schedule_service import ScheduleServiceResource
|
|
|
37 |
slack_bolt_app = Singleton(AsyncApp, token=settings.provided.SLACK_BOT_TOKEN().get_secret_value())
|
38 |
slack_service = Resource(SlackServiceResource, event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
|
39 |
socket_mode_handler = Singleton(lambda _, app, app_token: AsyncSocketModeHandler(app, app_token), slack_service, slack_bolt_app, settings.provided.SLACK_APP_TOKEN().get_secret_value())
|
40 |
+
google_drive_service = Singleton(GoogleDriveService, settings=settings)
|
src/ctp_slack_bot/core/config.py
CHANGED
@@ -49,6 +49,7 @@ class Settings(BaseSettings):
|
|
49 |
SYSTEM_PROMPT: str
|
50 |
|
51 |
# Google Drive Configuration
|
|
|
52 |
GOOGLE_PROJECT_ID: str
|
53 |
GOOGLE_PRIVATE_KEY_ID: SecretStr
|
54 |
GOOGLE_PRIVATE_KEY: SecretStr
|
|
|
49 |
SYSTEM_PROMPT: str
|
50 |
|
51 |
# Google Drive Configuration
|
52 |
+
GOOGLE_DRIVE_ROOT_ID: str
|
53 |
GOOGLE_PROJECT_ID: str
|
54 |
GOOGLE_PRIVATE_KEY_ID: SecretStr
|
55 |
GOOGLE_PRIVATE_KEY: SecretStr
|
src/ctp_slack_bot/models/__init__.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
from ctp_slack_bot.models.base import Chunk, Content, VectorizedChunk, VectorQuery
|
|
|
2 |
from ctp_slack_bot.models.slack import SlackEventPayload, SlackMessage, SlackReaction, SlackResponse, SlackUserTimestampPair
|
3 |
from ctp_slack_bot.models.webvtt import WebVTTContent, WebVTTFrame
|
|
|
1 |
from ctp_slack_bot.models.base import Chunk, Content, VectorizedChunk, VectorQuery
|
2 |
+
from ctp_slack_bot.models.google_drive import GoogleDriveMetadata
|
3 |
from ctp_slack_bot.models.slack import SlackEventPayload, SlackMessage, SlackReaction, SlackResponse, SlackUserTimestampPair
|
4 |
from ctp_slack_bot.models.webvtt import WebVTTContent, WebVTTFrame
|
src/ctp_slack_bot/models/google_drive.py
CHANGED
@@ -18,8 +18,8 @@ class GoogleDriveMetadata(BaseModel):
|
|
18 |
|
19 |
@classmethod
|
20 |
def from_folder_path_and_dict(cls: type["GoogleDriveMetadata"], folder_path: str, dict: dict) -> Self:
|
21 |
-
id =
|
22 |
-
name =
|
23 |
-
modified_time = datetime.fromisoformat(
|
24 |
-
mime_type =
|
25 |
return GoogleDriveMetadata(id=id, name=name, modified_time=modified_time, mime_type=mime_type, folder_path=folder_path)
|
|
|
18 |
|
19 |
@classmethod
|
20 |
def from_folder_path_and_dict(cls: type["GoogleDriveMetadata"], folder_path: str, dict: dict) -> Self:
|
21 |
+
id = dict["id"]
|
22 |
+
name = dict["name"]
|
23 |
+
modified_time = datetime.fromisoformat(dict["modifiedTime"])
|
24 |
+
mime_type = dict["mimeType"]
|
25 |
return GoogleDriveMetadata(id=id, name=name, modified_time=modified_time, mime_type=mime_type, folder_path=folder_path)
|
src/ctp_slack_bot/services/__init__.py
CHANGED
@@ -3,6 +3,7 @@ from ctp_slack_bot.services.content_ingestion_service import ContentIngestionSer
|
|
3 |
from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
|
4 |
from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
|
5 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
|
|
6 |
from ctp_slack_bot.services.language_model_service import LanguageModelService
|
7 |
from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
|
8 |
from ctp_slack_bot.services.slack_service import SlackService
|
|
|
3 |
from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
|
4 |
from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
|
5 |
from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
|
6 |
+
from ctp_slack_bot.services.google_drive_service import GoogleDriveService
|
7 |
from ctp_slack_bot.services.language_model_service import LanguageModelService
|
8 |
from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
|
9 |
from ctp_slack_bot.services.slack_service import SlackService
|
src/ctp_slack_bot/services/google_drive_service.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from datetime import datetime
|
|
|
2 |
from google.oauth2 import service_account
|
3 |
from googleapiclient.discovery import build
|
4 |
from googleapiclient.http import MediaIoBaseDownload
|
@@ -9,16 +10,19 @@ from pydantic import BaseModel, PrivateAttr
|
|
9 |
from typing import Collection, Dict, List, Optional, Self
|
10 |
|
11 |
from ctp_slack_bot.core import Settings
|
|
|
|
|
12 |
|
13 |
FOLDER_MIME_TYPE: str = "application/vnd.google-apps.folder"
|
14 |
-
|
|
|
15 |
|
16 |
class GoogleDriveService(BaseModel):
|
17 |
"""Service for interacting with Google Drive."""
|
18 |
|
19 |
settings: Settings
|
20 |
_google_drive_client: PrivateAttr = PrivateAttr()
|
21 |
-
_folder_cache: PrivateAttr = PrivateAttr(default_factory=
|
22 |
|
23 |
class Config:
|
24 |
frozen=True
|
@@ -28,12 +32,12 @@ class GoogleDriveService(BaseModel):
|
|
28 |
credentials = service_account.Credentials.from_service_account_info({
|
29 |
"type": "service_account",
|
30 |
"project_id": self.settings.GOOGLE_PROJECT_ID,
|
31 |
-
"private_key_id": self.settings.GOOGLE_PRIVATE_KEY_ID,
|
32 |
-
"private_key": self.settings.GOOGLE_PRIVATE_KEY,
|
33 |
"client_email": self.settings.GOOGLE_CLIENT_EMAIL,
|
34 |
"client_id": self.settings.GOOGLE_CLIENT_ID,
|
35 |
"token_uri": self.settings.GOOGLE_TOKEN_URI,
|
36 |
-
}, scopes=[
|
37 |
self._google_drive_client = build('drive', 'v3', credentials=credentials)
|
38 |
logger.debug("Created {}", self.__class__.__name__)
|
39 |
|
@@ -41,98 +45,92 @@ class GoogleDriveService(BaseModel):
|
|
41 |
"""Resolve a folder path to a Google Drive ID."""
|
42 |
|
43 |
if not folder_path:
|
44 |
-
return
|
45 |
|
46 |
if folder_path in self._folder_cache:
|
47 |
return self._folder_cache[folder_path]
|
48 |
|
49 |
-
current_id =
|
50 |
-
|
51 |
-
|
52 |
-
continue
|
53 |
-
try:
|
54 |
results = self._google_drive_client.files().list(
|
55 |
-
q=f"name='{part}' and mimeType='{FOLDER_MIME_TYPE}' and '{current_id}' in parents",
|
56 |
-
fields=
|
57 |
supportsAllDrives=True,
|
58 |
includeItemsFromAllDrives=True
|
59 |
).execute()
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
66 |
|
67 |
self._folder_cache[folder_path] = current_id
|
68 |
return current_id
|
69 |
|
70 |
-
def list_directory(self: Self, folder_path: str) ->
|
71 |
"""List contents of a directory with basic metadata."""
|
72 |
|
73 |
folder_id = self._resolve_folder_id(folder_path)
|
74 |
if not folder_id:
|
75 |
-
|
|
|
|
|
76 |
try:
|
77 |
results = self._google_drive_client.files().list(
|
78 |
q=f"'{folder_id}' in parents",
|
79 |
-
fields=
|
80 |
supportsAllDrives=True,
|
81 |
includeItemsFromAllDrives=True,
|
82 |
pageSize=1000
|
83 |
).execute()
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
'name': f['name'],
|
88 |
-
'modified': f['modifiedTime'],
|
89 |
-
'mime_type': f['mimeType']
|
90 |
-
} for f in results.get('files', [])]
|
91 |
except HttpError as e:
|
92 |
logger.error("Error listing folder by path, {}: {}", folder_path, e)
|
93 |
-
return
|
94 |
|
95 |
-
def get_metadata(self: Self, item_path: str) -> Optional[
|
96 |
"""Get metadata for a specific file/folder by path."""
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
folder_id = self._resolve_folder_id(folder_path)
|
106 |
-
|
107 |
if not folder_id:
|
|
|
108 |
return None
|
109 |
|
110 |
try:
|
111 |
results = self._google_drive_client.files().list(
|
112 |
q=f"name='{item_name}' and '{folder_id}' in parents",
|
113 |
-
fields=
|
114 |
supportsAllDrives=True,
|
115 |
includeItemsFromAllDrives=True,
|
116 |
pageSize=1
|
117 |
).execute()
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
'id': files[0]['id'],
|
122 |
-
'name': files[0]['name'],
|
123 |
-
'modified': files[0]['modifiedTime'],
|
124 |
-
'mime_type': files[0]['mimeType']
|
125 |
-
}
|
126 |
except HttpError as e:
|
127 |
logger.error("Error getting metadata for item by path, {}: {}", item_path, e)
|
128 |
|
|
|
129 |
return None
|
130 |
|
131 |
def read_file_by_id(self: Self, file_id: str) -> Optional[bytes]:
|
132 |
"""Read contents of a file by its unique identifier."""
|
133 |
|
134 |
try:
|
135 |
-
request = self.
|
136 |
buffer = BytesIO()
|
137 |
downloader = MediaIoBaseDownload(buffer, request)
|
138 |
done = False
|
|
|
1 |
from datetime import datetime
|
2 |
+
from cachetools import TTLCache
|
3 |
from google.oauth2 import service_account
|
4 |
from googleapiclient.discovery import build
|
5 |
from googleapiclient.http import MediaIoBaseDownload
|
|
|
10 |
from typing import Collection, Dict, List, Optional, Self
|
11 |
|
12 |
from ctp_slack_bot.core import Settings
|
13 |
+
from ctp_slack_bot.models import GoogleDriveMetadata
|
14 |
+
|
15 |
|
16 |
FOLDER_MIME_TYPE: str = "application/vnd.google-apps.folder"
|
17 |
+
PATH_SEPARATOR: str = "/"
|
18 |
+
|
19 |
|
20 |
class GoogleDriveService(BaseModel):
|
21 |
"""Service for interacting with Google Drive."""
|
22 |
|
23 |
settings: Settings
|
24 |
_google_drive_client: PrivateAttr = PrivateAttr()
|
25 |
+
_folder_cache: PrivateAttr = PrivateAttr(default_factory=lambda: TTLCache(maxsize=256, ttl=60))
|
26 |
|
27 |
class Config:
|
28 |
frozen=True
|
|
|
32 |
credentials = service_account.Credentials.from_service_account_info({
|
33 |
"type": "service_account",
|
34 |
"project_id": self.settings.GOOGLE_PROJECT_ID,
|
35 |
+
"private_key_id": self.settings.GOOGLE_PRIVATE_KEY_ID.get_secret_value(),
|
36 |
+
"private_key": self.settings.GOOGLE_PRIVATE_KEY.get_secret_value(),
|
37 |
"client_email": self.settings.GOOGLE_CLIENT_EMAIL,
|
38 |
"client_id": self.settings.GOOGLE_CLIENT_ID,
|
39 |
"token_uri": self.settings.GOOGLE_TOKEN_URI,
|
40 |
+
}, scopes=["https://www.googleapis.com/auth/drive"])
|
41 |
self._google_drive_client = build('drive', 'v3', credentials=credentials)
|
42 |
logger.debug("Created {}", self.__class__.__name__)
|
43 |
|
|
|
45 |
"""Resolve a folder path to a Google Drive ID."""
|
46 |
|
47 |
if not folder_path:
|
48 |
+
return self.settings.GOOGLE_DRIVE_ROOT_ID
|
49 |
|
50 |
if folder_path in self._folder_cache:
|
51 |
return self._folder_cache[folder_path]
|
52 |
|
53 |
+
current_id = self.settings.GOOGLE_DRIVE_ROOT_ID
|
54 |
+
try:
|
55 |
+
for part in folder_path.split(PATH_SEPARATOR):
|
|
|
|
|
56 |
results = self._google_drive_client.files().list(
|
57 |
+
q=f"name='{part.replace("\\", "\\\\").replace("'", "\\'")}' and mimeType='{FOLDER_MIME_TYPE}' and '{current_id}' in parents",
|
58 |
+
fields="files(id,name)",
|
59 |
supportsAllDrives=True,
|
60 |
includeItemsFromAllDrives=True
|
61 |
).execute()
|
62 |
+
match results:
|
63 |
+
case {"files": [ {"id": id} ]}:
|
64 |
+
current_id = id
|
65 |
+
case _:
|
66 |
+
logger.debug("Folder not found by path: {}", folder_path)
|
67 |
+
return None
|
68 |
+
except HttpError as e:
|
69 |
+
logger.error("Error resolving folder path: {}", folder_path)
|
70 |
+
return None
|
71 |
|
72 |
self._folder_cache[folder_path] = current_id
|
73 |
return current_id
|
74 |
|
75 |
+
def list_directory(self: Self, folder_path: str) -> Collection[GoogleDriveMetadata]:
|
76 |
"""List contents of a directory with basic metadata."""
|
77 |
|
78 |
folder_id = self._resolve_folder_id(folder_path)
|
79 |
if not folder_id:
|
80 |
+
logger.debug("Folder not found by path: {}", folder_path)
|
81 |
+
return ()
|
82 |
+
|
83 |
try:
|
84 |
results = self._google_drive_client.files().list(
|
85 |
q=f"'{folder_id}' in parents",
|
86 |
+
fields="files(id,name,mimeType,modifiedTime)",
|
87 |
supportsAllDrives=True,
|
88 |
includeItemsFromAllDrives=True,
|
89 |
pageSize=1000
|
90 |
).execute()
|
91 |
+
return tuple(GoogleDriveMetadata.from_folder_path_and_dict(folder_path, result)
|
92 |
+
for result
|
93 |
+
in results.get('files', ()))
|
|
|
|
|
|
|
|
|
94 |
except HttpError as e:
|
95 |
logger.error("Error listing folder by path, {}: {}", folder_path, e)
|
96 |
+
return ()
|
97 |
|
98 |
+
def get_metadata(self: Self, item_path: str) -> Optional[GoogleDriveMetadata]:
|
99 |
"""Get metadata for a specific file/folder by path."""
|
100 |
|
101 |
+
match item_path.rsplit(PATH_SEPARATOR, 1):
|
102 |
+
case [item_name]:
|
103 |
+
folder_path = ""
|
104 |
+
folder_id = self.settings.GOOGLE_DRIVE_ROOT_ID
|
105 |
+
case [folder_path, item_name]:
|
106 |
+
folder_id = self._resolve_folder_id(folder_path)
|
107 |
+
|
|
|
|
|
108 |
if not folder_id:
|
109 |
+
logger.debug("Folder not found by path: {}", folder_path)
|
110 |
return None
|
111 |
|
112 |
try:
|
113 |
results = self._google_drive_client.files().list(
|
114 |
q=f"name='{item_name}' and '{folder_id}' in parents",
|
115 |
+
fields="files(id,name,mimeType,modifiedTime)",
|
116 |
supportsAllDrives=True,
|
117 |
includeItemsFromAllDrives=True,
|
118 |
pageSize=1
|
119 |
).execute()
|
120 |
+
match results:
|
121 |
+
case {"files": [result]}:
|
122 |
+
return GoogleDriveMetadata.from_folder_path_and_dict(folder_path, result)
|
|
|
|
|
|
|
|
|
|
|
123 |
except HttpError as e:
|
124 |
logger.error("Error getting metadata for item by path, {}: {}", item_path, e)
|
125 |
|
126 |
+
logger.debug("Item not found by path: {}", item_path)
|
127 |
return None
|
128 |
|
129 |
def read_file_by_id(self: Self, file_id: str) -> Optional[bytes]:
|
130 |
"""Read contents of a file by its unique identifier."""
|
131 |
|
132 |
try:
|
133 |
+
request = self._google_drive_client.files().get_media(fileId=file_id)
|
134 |
buffer = BytesIO()
|
135 |
downloader = MediaIoBaseDownload(buffer, request)
|
136 |
done = False
|