LiKenun committed on
Commit
af0a2bd
·
1 Parent(s): c1b84d6

Bug fixes; remove references to uncommitted work-in-progress

Browse files
notebooks/google_drive.ipynb CHANGED
@@ -9,17 +9,9 @@
9
  },
10
  {
11
  "cell_type": "code",
12
- "execution_count": null,
13
  "metadata": {},
14
- "outputs": [
15
- {
16
- "name": "stderr",
17
- "output_type": "stream",
18
- "text": [
19
- "\u001b[32m2025-04-19 18:17:19.845\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n"
20
- ]
21
- }
22
- ],
23
  "source": [
24
  "from functools import partial\n",
25
  "from html import escape\n",
@@ -43,15 +35,15 @@
43
  },
44
  {
45
  "cell_type": "code",
46
- "execution_count": 2,
47
  "metadata": {},
48
  "outputs": [
49
  {
50
  "name": "stderr",
51
  "output_type": "stream",
52
  "text": [
53
- "\u001b[32m2025-04-19 18:17:19.850\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n",
54
- "\u001b[32m2025-04-19 18:17:19.853\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.google_drive_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreated GoogleDriveService\u001b[0m\n"
55
  ]
56
  }
57
  ],
@@ -61,7 +53,7 @@
61
  },
62
  {
63
  "cell_type": "code",
64
- "execution_count": 3,
65
  "metadata": {},
66
  "outputs": [
67
  {
@@ -82,7 +74,7 @@
82
  },
83
  {
84
  "cell_type": "code",
85
- "execution_count": 4,
86
  "metadata": {},
87
  "outputs": [
88
  {
 
9
  },
10
  {
11
  "cell_type": "code",
12
+ "execution_count": 2,
13
  "metadata": {},
14
+ "outputs": [],
 
 
 
 
 
 
 
 
15
  "source": [
16
  "from functools import partial\n",
17
  "from html import escape\n",
 
35
  },
36
  {
37
  "cell_type": "code",
38
+ "execution_count": 3,
39
  "metadata": {},
40
  "outputs": [
41
  {
42
  "name": "stderr",
43
  "output_type": "stream",
44
  "text": [
45
+ "\u001b[32m2025-04-19 22:51:18.841\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n",
46
+ "\u001b[32m2025-04-19 22:51:18.844\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.google_drive_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreated GoogleDriveService\u001b[0m\n"
47
  ]
48
  }
49
  ],
 
53
  },
54
  {
55
  "cell_type": "code",
56
+ "execution_count": 4,
57
  "metadata": {},
58
  "outputs": [
59
  {
 
74
  },
75
  {
76
  "cell_type": "code",
77
+ "execution_count": 5,
78
  "metadata": {},
79
  "outputs": [
80
  {
notebooks/web_vtt.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
src/ctp_slack_bot/core/config.py CHANGED
@@ -62,7 +62,7 @@ class Settings(BaseSettings):
62
  GOOGLE_UNIVERSE_DOMAIN: str = "googleapis.com"
63
 
64
  # File Monitoring Configuration
65
- FILE_MONITOR_ROOT_PATH: Optional[str] = None
66
 
67
  model_config = SettingsConfigDict(
68
  env_file=".env",
 
62
  GOOGLE_UNIVERSE_DOMAIN: str = "googleapis.com"
63
 
64
  # File Monitoring Configuration
65
+ FILE_MONITOR_ROOT_PATH: str = ""
66
 
67
  model_config = SettingsConfigDict(
68
  env_file=".env",
src/ctp_slack_bot/models/google_drive.py CHANGED
@@ -2,8 +2,6 @@ from datetime import datetime
2
  from pydantic import BaseModel, ConfigDict
3
  from typing import Self
4
 
5
- from ctp_slack_bot.models import FileContent
6
-
7
 
8
  class GoogleDriveMetadata(BaseModel):
9
  """Represents Google Drive file or folder metadata."""
@@ -17,7 +15,7 @@ class GoogleDriveMetadata(BaseModel):
17
  model_config = ConfigDict(frozen=True)
18
 
19
  @classmethod
20
- def from_folder_path_and_dict(cls: type["GoogleDriveMetadata"], folder_path: str, dict: dict) -> Self:
21
  id = dict["id"]
22
  name = dict["name"]
23
  modified_time = datetime.fromisoformat(dict["modifiedTime"])
 
2
  from pydantic import BaseModel, ConfigDict
3
  from typing import Self
4
 
 
 
5
 
6
  class GoogleDriveMetadata(BaseModel):
7
  """Represents Google Drive file or folder metadata."""
 
15
  model_config = ConfigDict(frozen=True)
16
 
17
  @classmethod
18
+ def from_folder_path_and_dict(cls, folder_path: str, dict: dict) -> Self:
19
  id = dict["id"]
20
  name = dict["name"]
21
  modified_time = datetime.fromisoformat(dict["modifiedTime"])
src/ctp_slack_bot/models/webvtt.py CHANGED
@@ -4,15 +4,19 @@ from itertools import starmap
4
  from json import dumps
5
  from more_itertools import windowed
6
  from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
 
7
  from types import MappingProxyType
8
  from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
9
  from webvtt import Caption, WebVTT
10
 
11
  from ctp_slack_bot.models.base import Chunk, Content
12
 
 
13
  CHUNK_FRAMES_OVERLAP = 1
14
  CHUNK_FRAMES_WINDOW = 5
15
  SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
 
 
16
 
17
  class WebVTTFrame(BaseModel):
18
  """Represents a WebVTT frame"""
@@ -26,7 +30,7 @@ class WebVTTFrame(BaseModel):
26
  model_config = ConfigDict(frozen=True)
27
 
28
  @classmethod
29
- def from_webvtt_caption(cls: type["WebVTTFrame"], index: int, caption: Caption) -> Self:
30
  identifier = caption.identifier if caption.identifier else str(index)
31
  start = timedelta(**caption.start_time.__dict__)
32
  end = timedelta(**caption.end_time.__dict__)
@@ -42,6 +46,7 @@ class WebVTTContent(Content):
42
 
43
  id: str
44
  metadata: Mapping[str, Any] = Field(default_factory=dict)
 
45
  frames: Sequence[WebVTTFrame]
46
 
47
  def get_id(self: Self) -> str:
@@ -57,9 +62,9 @@ class WebVTTContent(Content):
57
  parent_id=self.get_id(),
58
  chunk_id=f"{frames[0].identifier}-{frames[-1].identifier}",
59
  metadata={
60
- "start": str(frames[0].start), # TODO: This is a harder problem: to get the offsets to become real datetimes so that they can be queryable using MongoDB.
61
- "end": str(frames[-1].end),
62
- "speakers": [frame.speaker for frame in frames if frame.speaker]
63
  })
64
  for frames
65
  in windows)
@@ -68,6 +73,19 @@ class WebVTTContent(Content):
68
  return MappingProxyType(self.metadata)
69
 
70
  @classmethod
71
- def from_bytes(cls: type["WebVTTContent"], id: str, metadata: Mapping[str, Any], buffer: bytes) -> Self:
72
- frames = tuple(starmap(WebVTTFrame.from_webvtt_caption, enumerate(WebVTT.from_buffer(BytesIO(buffer)).captions, 1)))
73
- return WebVTTContent(id=id, metadata=MappingProxyType(metadata), frames=frames)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from json import dumps
5
  from more_itertools import windowed
6
  from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
7
+ from re import compile as compile_re
8
  from types import MappingProxyType
9
  from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
10
  from webvtt import Caption, WebVTT
11
 
12
  from ctp_slack_bot.models.base import Chunk, Content
13
 
14
+
15
  CHUNK_FRAMES_OVERLAP = 1
16
  CHUNK_FRAMES_WINDOW = 5
17
  SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
18
+ ISO_DATE_TIME_PATTERN = compile_re(r"Start time: (\d{4}-\d{2}-\d{2}(?: \d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:\d{2})?)?)")
19
+
20
 
21
  class WebVTTFrame(BaseModel):
22
  """Represents a WebVTT frame"""
 
30
  model_config = ConfigDict(frozen=True)
31
 
32
  @classmethod
33
+ def from_webvtt_caption(cls, caption: Caption, index: int) -> Self:
34
  identifier = caption.identifier if caption.identifier else str(index)
35
  start = timedelta(**caption.start_time.__dict__)
36
  end = timedelta(**caption.end_time.__dict__)
 
46
 
47
  id: str
48
  metadata: Mapping[str, Any] = Field(default_factory=dict)
49
+ start_time: Optional[datetime]
50
  frames: Sequence[WebVTTFrame]
51
 
52
  def get_id(self: Self) -> str:
 
62
  parent_id=self.get_id(),
63
  chunk_id=f"{frames[0].identifier}-{frames[-1].identifier}",
64
  metadata={
65
+ "start": self.start_time + frames[0].start if self.start_time else None,
66
+ "end": self.start_time + frames[-1].end if self.start_time else None,
67
+ "speakers": tuple(frame.speaker for frame in frames if frame.speaker)
68
  })
69
  for frames
70
  in windows)
 
73
  return MappingProxyType(self.metadata)
74
 
75
  @classmethod
76
+ def __get_start_time(cls, web_vtt: WebVTT) -> Optional[datetime]:
77
+ try:
78
+ return next(datetime.fromisoformat(result[0])
79
+ for result
80
+ in map(ISO_DATE_TIME_PATTERN.findall, web_vtt.header_comments)
81
+ if result)
82
+ except ValueError:
83
+ return None
84
+
85
+ @classmethod
86
+ def from_bytes(cls, id: str, metadata: Mapping[str, Any], buffer: bytes) -> Self:
87
+ web_vtt = WebVTT.from_buffer(BytesIO(buffer))
88
+ frames = tuple(WebVTTFrame.from_webvtt_caption(caption, index)
89
+ for index, caption
90
+ in enumerate(web_vtt.captions, 1))
91
+ return WebVTTContent(id=id, metadata=MappingProxyType(metadata), start_time=cls.__get_start_time(web_vtt), frames=frames)