Spaces:
Runtime error
Runtime error
Bug fixes; remove references to uncommitted work-in-progress
Browse files
notebooks/google_drive.ipynb
CHANGED
@@ -9,17 +9,9 @@
|
|
9 |
},
|
10 |
{
|
11 |
"cell_type": "code",
|
12 |
-
"execution_count":
|
13 |
"metadata": {},
|
14 |
-
"outputs": [
|
15 |
-
{
|
16 |
-
"name": "stderr",
|
17 |
-
"output_type": "stream",
|
18 |
-
"text": [
|
19 |
-
"\u001b[32m2025-04-19 18:17:19.845\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n"
|
20 |
-
]
|
21 |
-
}
|
22 |
-
],
|
23 |
"source": [
|
24 |
"from functools import partial\n",
|
25 |
"from html import escape\n",
|
@@ -43,15 +35,15 @@
|
|
43 |
},
|
44 |
{
|
45 |
"cell_type": "code",
|
46 |
-
"execution_count":
|
47 |
"metadata": {},
|
48 |
"outputs": [
|
49 |
{
|
50 |
"name": "stderr",
|
51 |
"output_type": "stream",
|
52 |
"text": [
|
53 |
-
"\u001b[32m2025-04-19
|
54 |
-
"\u001b[32m2025-04-19
|
55 |
]
|
56 |
}
|
57 |
],
|
@@ -61,7 +53,7 @@
|
|
61 |
},
|
62 |
{
|
63 |
"cell_type": "code",
|
64 |
-
"execution_count":
|
65 |
"metadata": {},
|
66 |
"outputs": [
|
67 |
{
|
@@ -82,7 +74,7 @@
|
|
82 |
},
|
83 |
{
|
84 |
"cell_type": "code",
|
85 |
-
"execution_count":
|
86 |
"metadata": {},
|
87 |
"outputs": [
|
88 |
{
|
|
|
9 |
},
|
10 |
{
|
11 |
"cell_type": "code",
|
12 |
+
"execution_count": 2,
|
13 |
"metadata": {},
|
14 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
"source": [
|
16 |
"from functools import partial\n",
|
17 |
"from html import escape\n",
|
|
|
35 |
},
|
36 |
{
|
37 |
"cell_type": "code",
|
38 |
+
"execution_count": 3,
|
39 |
"metadata": {},
|
40 |
"outputs": [
|
41 |
{
|
42 |
"name": "stderr",
|
43 |
"output_type": "stream",
|
44 |
"text": [
|
45 |
+
"\u001b[32m2025-04-19 22:51:18.841\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n",
|
46 |
+
"\u001b[32m2025-04-19 22:51:18.844\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mctp_slack_bot.services.google_drive_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreated GoogleDriveService\u001b[0m\n"
|
47 |
]
|
48 |
}
|
49 |
],
|
|
|
53 |
},
|
54 |
{
|
55 |
"cell_type": "code",
|
56 |
+
"execution_count": 4,
|
57 |
"metadata": {},
|
58 |
"outputs": [
|
59 |
{
|
|
|
74 |
},
|
75 |
{
|
76 |
"cell_type": "code",
|
77 |
+
"execution_count": 5,
|
78 |
"metadata": {},
|
79 |
"outputs": [
|
80 |
{
|
notebooks/web_vtt.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
src/ctp_slack_bot/core/config.py
CHANGED
@@ -62,7 +62,7 @@ class Settings(BaseSettings):
|
|
62 |
GOOGLE_UNIVERSE_DOMAIN: str = "googleapis.com"
|
63 |
|
64 |
# File Monitoring Configuration
|
65 |
-
FILE_MONITOR_ROOT_PATH:
|
66 |
|
67 |
model_config = SettingsConfigDict(
|
68 |
env_file=".env",
|
|
|
62 |
GOOGLE_UNIVERSE_DOMAIN: str = "googleapis.com"
|
63 |
|
64 |
# File Monitoring Configuration
|
65 |
+
FILE_MONITOR_ROOT_PATH: str = ""
|
66 |
|
67 |
model_config = SettingsConfigDict(
|
68 |
env_file=".env",
|
src/ctp_slack_bot/models/google_drive.py
CHANGED
@@ -2,8 +2,6 @@ from datetime import datetime
|
|
2 |
from pydantic import BaseModel, ConfigDict
|
3 |
from typing import Self
|
4 |
|
5 |
-
from ctp_slack_bot.models import FileContent
|
6 |
-
|
7 |
|
8 |
class GoogleDriveMetadata(BaseModel):
|
9 |
"""Represents Google Drive file or folder metadata."""
|
@@ -17,7 +15,7 @@ class GoogleDriveMetadata(BaseModel):
|
|
17 |
model_config = ConfigDict(frozen=True)
|
18 |
|
19 |
@classmethod
|
20 |
-
def from_folder_path_and_dict(cls
|
21 |
id = dict["id"]
|
22 |
name = dict["name"]
|
23 |
modified_time = datetime.fromisoformat(dict["modifiedTime"])
|
|
|
2 |
from pydantic import BaseModel, ConfigDict
|
3 |
from typing import Self
|
4 |
|
|
|
|
|
5 |
|
6 |
class GoogleDriveMetadata(BaseModel):
|
7 |
"""Represents Google Drive file or folder metadata."""
|
|
|
15 |
model_config = ConfigDict(frozen=True)
|
16 |
|
17 |
@classmethod
|
18 |
+
def from_folder_path_and_dict(cls, folder_path: str, dict: dict) -> Self:
|
19 |
id = dict["id"]
|
20 |
name = dict["name"]
|
21 |
modified_time = datetime.fromisoformat(dict["modifiedTime"])
|
src/ctp_slack_bot/models/webvtt.py
CHANGED
@@ -4,15 +4,19 @@ from itertools import starmap
|
|
4 |
from json import dumps
|
5 |
from more_itertools import windowed
|
6 |
from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
|
|
|
7 |
from types import MappingProxyType
|
8 |
from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
|
9 |
from webvtt import Caption, WebVTT
|
10 |
|
11 |
from ctp_slack_bot.models.base import Chunk, Content
|
12 |
|
|
|
13 |
CHUNK_FRAMES_OVERLAP = 1
|
14 |
CHUNK_FRAMES_WINDOW = 5
|
15 |
SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
|
|
|
|
|
16 |
|
17 |
class WebVTTFrame(BaseModel):
|
18 |
"""Represents a WebVTT frame"""
|
@@ -26,7 +30,7 @@ class WebVTTFrame(BaseModel):
|
|
26 |
model_config = ConfigDict(frozen=True)
|
27 |
|
28 |
@classmethod
|
29 |
-
def from_webvtt_caption(cls:
|
30 |
identifier = caption.identifier if caption.identifier else str(index)
|
31 |
start = timedelta(**caption.start_time.__dict__)
|
32 |
end = timedelta(**caption.end_time.__dict__)
|
@@ -42,6 +46,7 @@ class WebVTTContent(Content):
|
|
42 |
|
43 |
id: str
|
44 |
metadata: Mapping[str, Any] = Field(default_factory=dict)
|
|
|
45 |
frames: Sequence[WebVTTFrame]
|
46 |
|
47 |
def get_id(self: Self) -> str:
|
@@ -57,9 +62,9 @@ class WebVTTContent(Content):
|
|
57 |
parent_id=self.get_id(),
|
58 |
chunk_id=f"{frames[0].identifier}-{frames[-1].identifier}",
|
59 |
metadata={
|
60 |
-
"start":
|
61 |
-
"end":
|
62 |
-
"speakers":
|
63 |
})
|
64 |
for frames
|
65 |
in windows)
|
@@ -68,6 +73,19 @@ class WebVTTContent(Content):
|
|
68 |
return MappingProxyType(self.metadata)
|
69 |
|
70 |
@classmethod
|
71 |
-
def
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
from json import dumps
|
5 |
from more_itertools import windowed
|
6 |
from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
|
7 |
+
from re import compile as compile_re
|
8 |
from types import MappingProxyType
|
9 |
from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
|
10 |
from webvtt import Caption, WebVTT
|
11 |
|
12 |
from ctp_slack_bot.models.base import Chunk, Content
|
13 |
|
14 |
+
|
15 |
CHUNK_FRAMES_OVERLAP = 1
|
16 |
CHUNK_FRAMES_WINDOW = 5
|
17 |
SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
|
18 |
+
ISO_DATE_TIME_PATTERN = compile_re(r"Start time: (\d{4}-\d{2}-\d{2}(?: \d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:\d{2})?)?)")
|
19 |
+
|
20 |
|
21 |
class WebVTTFrame(BaseModel):
|
22 |
"""Represents a WebVTT frame"""
|
|
|
30 |
model_config = ConfigDict(frozen=True)
|
31 |
|
32 |
@classmethod
|
33 |
+
def from_webvtt_caption(cls, caption: Caption, index: int) -> Self:
|
34 |
identifier = caption.identifier if caption.identifier else str(index)
|
35 |
start = timedelta(**caption.start_time.__dict__)
|
36 |
end = timedelta(**caption.end_time.__dict__)
|
|
|
46 |
|
47 |
id: str
|
48 |
metadata: Mapping[str, Any] = Field(default_factory=dict)
|
49 |
+
start_time: Optional[datetime]
|
50 |
frames: Sequence[WebVTTFrame]
|
51 |
|
52 |
def get_id(self: Self) -> str:
|
|
|
62 |
parent_id=self.get_id(),
|
63 |
chunk_id=f"{frames[0].identifier}-{frames[-1].identifier}",
|
64 |
metadata={
|
65 |
+
"start": self.start_time + frames[0].start if self.start_time else None,
|
66 |
+
"end": self.start_time + frames[-1].end if self.start_time else None,
|
67 |
+
"speakers": tuple(frame.speaker for frame in frames if frame.speaker)
|
68 |
})
|
69 |
for frames
|
70 |
in windows)
|
|
|
73 |
return MappingProxyType(self.metadata)
|
74 |
|
75 |
@classmethod
|
76 |
+
def __get_start_time(cls, web_vtt: WebVTT) -> Optional[datetime]:
    """Extract the recording start time from the WebVTT header comments.

    Scans ``web_vtt.header_comments`` in order and parses the timestamp
    captured by ``ISO_DATE_TIME_PATTERN`` from the first comment that
    matches.

    Args:
        web_vtt: The parsed WebVTT document whose header comments are searched.

    Returns:
        The parsed start time, or ``None`` when no header comment matches the
        pattern or the first matching timestamp is not a valid ISO value.
    """
    # findall returns the captured timestamp strings for each comment;
    # comments without a match produce an empty list and are skipped.
    timestamps = (found[0]
                  for found
                  in map(ISO_DATE_TIME_PATTERN.findall, web_vtt.header_comments)
                  if found)
    # Supply a default: a bare next() on an exhausted generator raises
    # StopIteration, which the ValueError handler below would not catch.
    timestamp = next(timestamps, None)
    if timestamp is None:
        return None
    try:
        return datetime.fromisoformat(timestamp)
    except ValueError:
        return None
|
84 |
+
|
85 |
+
@classmethod
|
86 |
+
def from_bytes(cls, id: str, metadata: Mapping[str, Any], buffer: bytes) -> Self:
    """Build a content object from the raw bytes of a WebVTT file.

    Args:
        id: Identifier stored on the resulting content object.
        metadata: Metadata mapping; it is wrapped in a read-only
            ``MappingProxyType`` before being stored.
        buffer: Raw WebVTT file contents.

    Returns:
        A new instance of ``cls`` holding one frame per caption and the start
        time found in the header comments (``None`` if there is none).
    """
    web_vtt = WebVTT.from_buffer(BytesIO(buffer))
    # Captions are numbered from 1; the index is handed to the frame factory
    # together with each caption.
    frames = tuple(WebVTTFrame.from_webvtt_caption(caption, index)
                   for index, caption
                   in enumerate(web_vtt.captions, 1))
    # Construct through cls (not the hard-coded class name) so subclasses get
    # an instance of their own type, as the Self return annotation promises.
    return cls(id=id,
               metadata=MappingProxyType(metadata),
               start_time=cls.__get_start_time(web_vtt),
               frames=frames)
|