LiKenun committed on
Commit
c5fe6f5
·
1 Parent(s): 059c10d

Update `GoogleDriveService` and add example usage notebook

Browse files
.env.template CHANGED
@@ -30,6 +30,7 @@ TEMPERATURE=0.8
30
  SYSTEM_PROMPT="You are a helpful teaching assistant for a data science class.\nBased on the students question, you will be given context retreived from class transcripts and materials to answer their question.\nYour responses should be:\n\n1. Accurate and based on the class content\n2. Clear and educational\n3. Concise but complete\nIf you're unsure about something, acknowledge it and suggest asking the professor."
31
 
32
  # Google Drive Configuration
 
33
  GOOGLE_PROJECT_ID=insufferable-slacker-123456
34
  GOOGLE_PRIVATE_KEY_ID=1a2b3c4d5e6f748891091d21304e506674829507
35
  GOOGLE_PRIVATE_KEY="-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASC...\n-----END PRIVATE KEY-----\n"
 
30
  SYSTEM_PROMPT="You are a helpful teaching assistant for a data science class.\nBased on the students question, you will be given context retreived from class transcripts and materials to answer their question.\nYour responses should be:\n\n1. Accurate and based on the class content\n2. Clear and educational\n3. Concise but complete\nIf you're unsure about something, acknowledge it and suggest asking the professor."
31
 
32
  # Google Drive Configuration
33
+ GOOGLE_DRIVE_ROOT_ID=1NB91EcIUXbOVcdCkXOAHdmWrDfgoh9fQ
34
  GOOGLE_PROJECT_ID=insufferable-slacker-123456
35
  GOOGLE_PRIVATE_KEY_ID=1a2b3c4d5e6f748891091d21304e506674829507
36
  GOOGLE_PRIVATE_KEY="-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASC...\n-----END PRIVATE KEY-----\n"
notebooks/google_drive.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/{web-vtt.ipynb → web_vtt.ipynb} RENAMED
File without changes
pyproject.toml CHANGED
@@ -21,6 +21,7 @@ classifiers = [
21
  dependencies = [
22
  "pydantic>=2.11.2",
23
  "pydantic-settings>=2.8.1",
 
24
  "more-itertools>=10.6.0",
25
  "python-dotenv>=1.1.0",
26
  "loguru>=0.7.3",
 
21
  dependencies = [
22
  "pydantic>=2.11.2",
23
  "pydantic-settings>=2.8.1",
24
+ "cachetools>=5.5.2",
25
  "more-itertools>=10.6.0",
26
  "python-dotenv>=1.1.0",
27
  "loguru>=0.7.3",
src/ctp_slack_bot/containers.py CHANGED
@@ -11,6 +11,7 @@ from ctp_slack_bot.services.content_ingestion_service import ContentIngestionSer
11
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
12
  from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
13
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
 
14
  from ctp_slack_bot.services.language_model_service import LanguageModelService
15
  from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
16
  from ctp_slack_bot.services.schedule_service import ScheduleServiceResource
@@ -36,3 +37,4 @@ class Container(DeclarativeContainer):
36
  slack_bolt_app = Singleton(AsyncApp, token=settings.provided.SLACK_BOT_TOKEN().get_secret_value())
37
  slack_service = Resource(SlackServiceResource, event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
38
  socket_mode_handler = Singleton(lambda _, app, app_token: AsyncSocketModeHandler(app, app_token), slack_service, slack_bolt_app, settings.provided.SLACK_APP_TOKEN().get_secret_value())
 
 
11
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
12
  from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
13
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
14
+ from ctp_slack_bot.services.google_drive_service import GoogleDriveService
15
  from ctp_slack_bot.services.language_model_service import LanguageModelService
16
  from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
17
  from ctp_slack_bot.services.schedule_service import ScheduleServiceResource
 
37
  slack_bolt_app = Singleton(AsyncApp, token=settings.provided.SLACK_BOT_TOKEN().get_secret_value())
38
  slack_service = Resource(SlackServiceResource, event_brokerage_service=event_brokerage_service, slack_bolt_app=slack_bolt_app)
39
  socket_mode_handler = Singleton(lambda _, app, app_token: AsyncSocketModeHandler(app, app_token), slack_service, slack_bolt_app, settings.provided.SLACK_APP_TOKEN().get_secret_value())
40
+ google_drive_service = Singleton(GoogleDriveService, settings=settings)
src/ctp_slack_bot/core/config.py CHANGED
@@ -49,6 +49,7 @@ class Settings(BaseSettings):
49
  SYSTEM_PROMPT: str
50
 
51
  # Google Drive Configuration
 
52
  GOOGLE_PROJECT_ID: str
53
  GOOGLE_PRIVATE_KEY_ID: SecretStr
54
  GOOGLE_PRIVATE_KEY: SecretStr
 
49
  SYSTEM_PROMPT: str
50
 
51
  # Google Drive Configuration
52
+ GOOGLE_DRIVE_ROOT_ID: str
53
  GOOGLE_PROJECT_ID: str
54
  GOOGLE_PRIVATE_KEY_ID: SecretStr
55
  GOOGLE_PRIVATE_KEY: SecretStr
src/ctp_slack_bot/models/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
  from ctp_slack_bot.models.base import Chunk, Content, VectorizedChunk, VectorQuery
 
2
  from ctp_slack_bot.models.slack import SlackEventPayload, SlackMessage, SlackReaction, SlackResponse, SlackUserTimestampPair
3
  from ctp_slack_bot.models.webvtt import WebVTTContent, WebVTTFrame
 
1
  from ctp_slack_bot.models.base import Chunk, Content, VectorizedChunk, VectorQuery
2
+ from ctp_slack_bot.models.google_drive import GoogleDriveMetadata
3
  from ctp_slack_bot.models.slack import SlackEventPayload, SlackMessage, SlackReaction, SlackResponse, SlackUserTimestampPair
4
  from ctp_slack_bot.models.webvtt import WebVTTContent, WebVTTFrame
src/ctp_slack_bot/models/google_drive.py CHANGED
@@ -18,8 +18,8 @@ class GoogleDriveMetadata(BaseModel):
18
 
19
  @classmethod
20
  def from_folder_path_and_dict(cls: type["GoogleDriveMetadata"], folder_path: str, dict: dict) -> Self:
21
- id = result["id"]
22
- name = result["name"]
23
- modified_time = datetime.fromisoformat(result["modifiedTime"])
24
- mime_type = result["mimeType"]
25
  return GoogleDriveMetadata(id=id, name=name, modified_time=modified_time, mime_type=mime_type, folder_path=folder_path)
 
18
 
19
  @classmethod
20
  def from_folder_path_and_dict(cls: type["GoogleDriveMetadata"], folder_path: str, dict: dict) -> Self:
21
+ id = dict["id"]
22
+ name = dict["name"]
23
+ modified_time = datetime.fromisoformat(dict["modifiedTime"])
24
+ mime_type = dict["mimeType"]
25
  return GoogleDriveMetadata(id=id, name=name, modified_time=modified_time, mime_type=mime_type, folder_path=folder_path)
src/ctp_slack_bot/services/__init__.py CHANGED
@@ -3,6 +3,7 @@ from ctp_slack_bot.services.content_ingestion_service import ContentIngestionSer
3
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
4
  from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
5
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
 
6
  from ctp_slack_bot.services.language_model_service import LanguageModelService
7
  from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
8
  from ctp_slack_bot.services.slack_service import SlackService
 
3
  from ctp_slack_bot.services.context_retrieval_service import ContextRetrievalService
4
  from ctp_slack_bot.services.embeddings_model_service import EmbeddingsModelService
5
  from ctp_slack_bot.services.event_brokerage_service import EventBrokerageService
6
+ from ctp_slack_bot.services.google_drive_service import GoogleDriveService
7
  from ctp_slack_bot.services.language_model_service import LanguageModelService
8
  from ctp_slack_bot.services.question_dispatch_service import QuestionDispatchService
9
  from ctp_slack_bot.services.slack_service import SlackService
src/ctp_slack_bot/services/google_drive_service.py CHANGED
@@ -1,4 +1,5 @@
1
  from datetime import datetime
 
2
  from google.oauth2 import service_account
3
  from googleapiclient.discovery import build
4
  from googleapiclient.http import MediaIoBaseDownload
@@ -9,16 +10,19 @@ from pydantic import BaseModel, PrivateAttr
9
  from typing import Collection, Dict, List, Optional, Self
10
 
11
  from ctp_slack_bot.core import Settings
 
 
12
 
13
  FOLDER_MIME_TYPE: str = "application/vnd.google-apps.folder"
14
- ROOT_FOLDER_NAME: str = "root"
 
15
 
16
  class GoogleDriveService(BaseModel):
17
  """Service for interacting with Google Drive."""
18
 
19
  settings: Settings
20
  _google_drive_client: PrivateAttr = PrivateAttr()
21
- _folder_cache: PrivateAttr = PrivateAttr(default_factory=dict)
22
 
23
  class Config:
24
  frozen=True
@@ -28,12 +32,12 @@ class GoogleDriveService(BaseModel):
28
  credentials = service_account.Credentials.from_service_account_info({
29
  "type": "service_account",
30
  "project_id": self.settings.GOOGLE_PROJECT_ID,
31
- "private_key_id": self.settings.GOOGLE_PRIVATE_KEY_ID,
32
- "private_key": self.settings.GOOGLE_PRIVATE_KEY,
33
  "client_email": self.settings.GOOGLE_CLIENT_EMAIL,
34
  "client_id": self.settings.GOOGLE_CLIENT_ID,
35
  "token_uri": self.settings.GOOGLE_TOKEN_URI,
36
- }, scopes=['https://www.googleapis.com/auth/drive'])
37
  self._google_drive_client = build('drive', 'v3', credentials=credentials)
38
  logger.debug("Created {}", self.__class__.__name__)
39
 
@@ -41,98 +45,92 @@ class GoogleDriveService(BaseModel):
41
  """Resolve a folder path to a Google Drive ID."""
42
 
43
  if not folder_path:
44
- return ROOT_FOLDER_NAME
45
 
46
  if folder_path in self._folder_cache:
47
  return self._folder_cache[folder_path]
48
 
49
- current_id = ROOT_FOLDER_NAME
50
- for part in folder_path.split('/'):
51
- if not part:
52
- continue
53
- try:
54
  results = self._google_drive_client.files().list(
55
- q=f"name='{part}' and mimeType='{FOLDER_MIME_TYPE}' and '{current_id}' in parents",
56
- fields='files(id,name)',
57
  supportsAllDrives=True,
58
  includeItemsFromAllDrives=True
59
  ).execute()
60
- if not results.get('files'):
61
- return None
62
- current_id = results['files'][0]['id']
63
- except HttpError as e:
64
- logger.error("Error resolving folder path: {}", folder_path)
65
- return None
 
 
 
66
 
67
  self._folder_cache[folder_path] = current_id
68
  return current_id
69
 
70
- def list_directory(self: Self, folder_path: str) -> List[Dict]:
71
  """List contents of a directory with basic metadata."""
72
 
73
  folder_id = self._resolve_folder_id(folder_path)
74
  if not folder_id:
75
- return []
 
 
76
  try:
77
  results = self._google_drive_client.files().list(
78
  q=f"'{folder_id}' in parents",
79
- fields='files(id,name,mimeType,modifiedTime)',
80
  supportsAllDrives=True,
81
  includeItemsFromAllDrives=True,
82
  pageSize=1000
83
  ).execute()
84
-
85
- return [{
86
- 'id': f['id'],
87
- 'name': f['name'],
88
- 'modified': f['modifiedTime'],
89
- 'mime_type': f['mimeType']
90
- } for f in results.get('files', [])]
91
  except HttpError as e:
92
  logger.error("Error listing folder by path, {}: {}", folder_path, e)
93
- return []
94
 
95
- def get_metadata(self: Self, item_path: str) -> Optional[Dict]:
96
  """Get metadata for a specific file/folder by path."""
97
 
98
- if '/' not in item_path:
99
- folder_id = ROOT_FOLDER_NAME
100
- item_name = item_path
101
- else:
102
- parts = item_path.split('/')
103
- item_name = parts[-1]
104
- folder_path = '/'.join(parts[:-1])
105
- folder_id = self._resolve_folder_id(folder_path)
106
-
107
  if not folder_id:
 
108
  return None
109
 
110
  try:
111
  results = self._google_drive_client.files().list(
112
  q=f"name='{item_name}' and '{folder_id}' in parents",
113
- fields='files(id,name,mimeType,modifiedTime)',
114
  supportsAllDrives=True,
115
  includeItemsFromAllDrives=True,
116
  pageSize=1
117
  ).execute()
118
-
119
- if files := results.get('files'):
120
- return {
121
- 'id': files[0]['id'],
122
- 'name': files[0]['name'],
123
- 'modified': files[0]['modifiedTime'],
124
- 'mime_type': files[0]['mimeType']
125
- }
126
  except HttpError as e:
127
  logger.error("Error getting metadata for item by path, {}: {}", item_path, e)
128
 
 
129
  return None
130
 
131
  def read_file_by_id(self: Self, file_id: str) -> Optional[bytes]:
132
  """Read contents of a file by its unique identifier."""
133
 
134
  try:
135
- request = self.service.files().get_media(fileId=file_id)
136
  buffer = BytesIO()
137
  downloader = MediaIoBaseDownload(buffer, request)
138
  done = False
 
1
  from datetime import datetime
2
+ from cachetools import TTLCache
3
  from google.oauth2 import service_account
4
  from googleapiclient.discovery import build
5
  from googleapiclient.http import MediaIoBaseDownload
 
10
  from typing import Collection, Dict, List, Optional, Self
11
 
12
  from ctp_slack_bot.core import Settings
13
+ from ctp_slack_bot.models import GoogleDriveMetadata
14
+
15
 
16
  FOLDER_MIME_TYPE: str = "application/vnd.google-apps.folder"
17
+ PATH_SEPARATOR: str = "/"
18
+
19
 
20
  class GoogleDriveService(BaseModel):
21
  """Service for interacting with Google Drive."""
22
 
23
  settings: Settings
24
  _google_drive_client: PrivateAttr = PrivateAttr()
25
+ _folder_cache: PrivateAttr = PrivateAttr(default_factory=lambda: TTLCache(maxsize=256, ttl=60))
26
 
27
  class Config:
28
  frozen=True
 
32
  credentials = service_account.Credentials.from_service_account_info({
33
  "type": "service_account",
34
  "project_id": self.settings.GOOGLE_PROJECT_ID,
35
+ "private_key_id": self.settings.GOOGLE_PRIVATE_KEY_ID.get_secret_value(),
36
+ "private_key": self.settings.GOOGLE_PRIVATE_KEY.get_secret_value(),
37
  "client_email": self.settings.GOOGLE_CLIENT_EMAIL,
38
  "client_id": self.settings.GOOGLE_CLIENT_ID,
39
  "token_uri": self.settings.GOOGLE_TOKEN_URI,
40
+ }, scopes=["https://www.googleapis.com/auth/drive"])
41
  self._google_drive_client = build('drive', 'v3', credentials=credentials)
42
  logger.debug("Created {}", self.__class__.__name__)
43
 
 
45
  """Resolve a folder path to a Google Drive ID."""
46
 
47
  if not folder_path:
48
+ return self.settings.GOOGLE_DRIVE_ROOT_ID
49
 
50
  if folder_path in self._folder_cache:
51
  return self._folder_cache[folder_path]
52
 
53
+ current_id = self.settings.GOOGLE_DRIVE_ROOT_ID
54
+ try:
55
+ for part in folder_path.split(PATH_SEPARATOR):
 
 
56
  results = self._google_drive_client.files().list(
57
+ q=f"name='{part.replace("\\", "\\\\").replace("'", "\\'")}' and mimeType='{FOLDER_MIME_TYPE}' and '{current_id}' in parents",
58
+ fields="files(id,name)",
59
  supportsAllDrives=True,
60
  includeItemsFromAllDrives=True
61
  ).execute()
62
+ match results:
63
+ case {"files": [ {"id": id} ]}:
64
+ current_id = id
65
+ case _:
66
+ logger.debug("Folder not found by path: {}", folder_path)
67
+ return None
68
+ except HttpError as e:
69
+ logger.error("Error resolving folder path: {}", folder_path)
70
+ return None
71
 
72
  self._folder_cache[folder_path] = current_id
73
  return current_id
74
 
75
+ def list_directory(self: Self, folder_path: str) -> Collection[GoogleDriveMetadata]:
76
  """List contents of a directory with basic metadata."""
77
 
78
  folder_id = self._resolve_folder_id(folder_path)
79
  if not folder_id:
80
+ logger.debug("Folder not found by path: {}", folder_path)
81
+ return ()
82
+
83
  try:
84
  results = self._google_drive_client.files().list(
85
  q=f"'{folder_id}' in parents",
86
+ fields="files(id,name,mimeType,modifiedTime)",
87
  supportsAllDrives=True,
88
  includeItemsFromAllDrives=True,
89
  pageSize=1000
90
  ).execute()
91
+ return tuple(GoogleDriveMetadata.from_folder_path_and_dict(folder_path, result)
92
+ for result
93
+ in results.get('files', ()))
 
 
 
 
94
  except HttpError as e:
95
  logger.error("Error listing folder by path, {}: {}", folder_path, e)
96
+ return ()
97
 
98
+ def get_metadata(self: Self, item_path: str) -> Optional[GoogleDriveMetadata]:
99
  """Get metadata for a specific file/folder by path."""
100
 
101
+ match item_path.rsplit(PATH_SEPARATOR, 1):
102
+ case [item_name]:
103
+ folder_path = ""
104
+ folder_id = self.settings.GOOGLE_DRIVE_ROOT_ID
105
+ case [folder_path, item_name]:
106
+ folder_id = self._resolve_folder_id(folder_path)
107
+
 
 
108
  if not folder_id:
109
+ logger.debug("Folder not found by path: {}", folder_path)
110
  return None
111
 
112
  try:
113
  results = self._google_drive_client.files().list(
114
  q=f"name='{item_name}' and '{folder_id}' in parents",
115
+ fields="files(id,name,mimeType,modifiedTime)",
116
  supportsAllDrives=True,
117
  includeItemsFromAllDrives=True,
118
  pageSize=1
119
  ).execute()
120
+ match results:
121
+ case {"files": [result]}:
122
+ return GoogleDriveMetadata.from_folder_path_and_dict(folder_path, result)
 
 
 
 
 
123
  except HttpError as e:
124
  logger.error("Error getting metadata for item by path, {}: {}", item_path, e)
125
 
126
+ logger.debug("Item not found by path: {}", item_path)
127
  return None
128
 
129
  def read_file_by_id(self: Self, file_id: str) -> Optional[bytes]:
130
  """Read contents of a file by its unique identifier."""
131
 
132
  try:
133
+ request = self._google_drive_client.files().get_media(fileId=file_id)
134
  buffer = BytesIO()
135
  downloader = MediaIoBaseDownload(buffer, request)
136
  done = False