Spaces:

KingZack
/

ctp-slack-bot

Runtime error

App Files Files Community

LiKenun committed on Apr 20

Commit

af0a2bd

1 Parent(s): c1b84d6

Bug fixes; remove references to uncommitted work-in-progress

Browse files

Files changed (5) hide show

notebooks/google_drive.ipynb +7 -15
notebooks/web_vtt.ipynb +0 -0
src/ctp_slack_bot/core/config.py +1 -1
src/ctp_slack_bot/models/google_drive.py +1 -3
src/ctp_slack_bot/models/webvtt.py +25 -7

notebooks/google_drive.ipynb CHANGED Viewed

@@ -9,17 +9,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[32m2025-04-19 18:17:19.845\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n"
-     ]
-    }
-   ],
    "source": [
     "from functools import partial\n",
     "from html import escape\n",
@@ -43,15 +35,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2025-04-19 18:17:19.850\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n",
-      "\u001b[32m2025-04-19 18:17:19.853\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mctp_slack_bot.services.google_drive_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreated GoogleDriveService\u001b[0m\n"
      ]
     }
    ],
@@ -61,7 +53,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -82,7 +74,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {

   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
+   "outputs": [],
    "source": [
     "from functools import partial\n",
     "from html import escape\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "\u001b[32m2025-04-19 22:51:18.841\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mctp_slack_bot.core.config\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m14\u001b[0m - \u001b[34m\u001b[1mCreated Settings\u001b[0m\n",
+      "\u001b[32m2025-04-19 22:51:18.844\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mctp_slack_bot.services.google_drive_service\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m42\u001b[0m - \u001b[34m\u001b[1mCreated GoogleDriveService\u001b[0m\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {

notebooks/web_vtt.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

src/ctp_slack_bot/core/config.py CHANGED Viewed

@@ -62,7 +62,7 @@ class Settings(BaseSettings):
     GOOGLE_UNIVERSE_DOMAIN: str = "googleapis.com"
     # File Monitoring Configuration
-    FILE_MONITOR_ROOT_PATH: Optional[str] = None
     model_config = SettingsConfigDict(
         env_file=".env",

     GOOGLE_UNIVERSE_DOMAIN: str = "googleapis.com"
     # File Monitoring Configuration
+    FILE_MONITOR_ROOT_PATH: str = ""
     model_config = SettingsConfigDict(
         env_file=".env",

src/ctp_slack_bot/models/google_drive.py CHANGED Viewed

@@ -2,8 +2,6 @@ from datetime import datetime
 from pydantic import BaseModel, ConfigDict
 from typing import Self
-from ctp_slack_bot.models import FileContent
 class GoogleDriveMetadata(BaseModel):
     """Represents Google Drive file or folder metadata."""
@@ -17,7 +15,7 @@ class GoogleDriveMetadata(BaseModel):
     model_config = ConfigDict(frozen=True)
     @classmethod
-    def from_folder_path_and_dict(cls: type["GoogleDriveMetadata"], folder_path: str, dict: dict) -> Self:
         id = dict["id"]
         name = dict["name"]
         modified_time = datetime.fromisoformat(dict["modifiedTime"])

 from pydantic import BaseModel, ConfigDict
 from typing import Self
 class GoogleDriveMetadata(BaseModel):
     """Represents Google Drive file or folder metadata."""
     model_config = ConfigDict(frozen=True)
     @classmethod
+    def from_folder_path_and_dict(cls, folder_path: str, dict: dict) -> Self:
         id = dict["id"]
         name = dict["name"]
         modified_time = datetime.fromisoformat(dict["modifiedTime"])

src/ctp_slack_bot/models/webvtt.py CHANGED Viewed

@@ -4,15 +4,19 @@ from itertools import starmap
 from json import dumps
 from more_itertools import windowed
 from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
 from types import MappingProxyType
 from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
 from webvtt import Caption, WebVTT
 from ctp_slack_bot.models.base import Chunk, Content
 CHUNK_FRAMES_OVERLAP = 1
 CHUNK_FRAMES_WINDOW = 5
 SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
 class WebVTTFrame(BaseModel):
     """Represents a WebVTT frame"""
@@ -26,7 +30,7 @@ class WebVTTFrame(BaseModel):
     model_config = ConfigDict(frozen=True)
     @classmethod
-    def from_webvtt_caption(cls: type["WebVTTFrame"], index: int, caption: Caption) -> Self:
         identifier = caption.identifier if caption.identifier else str(index)
         start = timedelta(**caption.start_time.__dict__)
         end = timedelta(**caption.end_time.__dict__)
@@ -42,6 +46,7 @@ class WebVTTContent(Content):
     id: str
     metadata: Mapping[str, Any] = Field(default_factory=dict)
     frames: Sequence[WebVTTFrame]
     def get_id(self: Self) -> str:
@@ -57,9 +62,9 @@ class WebVTTContent(Content):
                            parent_id=self.get_id(),
                            chunk_id=f"{frames[0].identifier}-{frames[-1].identifier}",
                            metadata={
-                               "start": str(frames[0].start), # TODO: This is a harder problem: to get the offsets to become real datetimes so that they can be queryable using MongoDB.
-                               "end": str(frames[-1].end),
-                               "speakers": [frame.speaker for frame in frames if frame.speaker]
                            })
                      for frames
                      in windows)
@@ -68,6 +73,19 @@ class WebVTTContent(Content):
         return MappingProxyType(self.metadata)
     @classmethod
-    def from_bytes(cls: type["WebVTTContent"], id: str, metadata: Mapping[str, Any], buffer: bytes) -> Self:
-        frames = tuple(starmap(WebVTTFrame.from_webvtt_caption, enumerate(WebVTT.from_buffer(BytesIO(buffer)).captions, 1)))
-        return WebVTTContent(id=id, metadata=MappingProxyType(metadata), frames=frames)

 from json import dumps
 from more_itertools import windowed
 from pydantic import BaseModel, ConfigDict, Field, PositiveInt, PrivateAttr
+from re import compile as compile_re
 from types import MappingProxyType
 from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
 from webvtt import Caption, WebVTT
 from ctp_slack_bot.models.base import Chunk, Content
 CHUNK_FRAMES_OVERLAP = 1
 CHUNK_FRAMES_WINDOW = 5
 SPEAKER_SPEECH_TEXT_SEPARATOR = ": "
+ISO_DATE_TIME_PATTERN = compile_re(r"Start time: (\d{4}-\d{2}-\d{2}(?: \d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:\d{2})?)?)")
 class WebVTTFrame(BaseModel):
     """Represents a WebVTT frame"""
     model_config = ConfigDict(frozen=True)
     @classmethod
+    def from_webvtt_caption(cls, caption: Caption, index: int) -> Self:
         identifier = caption.identifier if caption.identifier else str(index)
         start = timedelta(**caption.start_time.__dict__)
         end = timedelta(**caption.end_time.__dict__)
     id: str
     metadata: Mapping[str, Any] = Field(default_factory=dict)
+    start_time: Optional[datetime]
     frames: Sequence[WebVTTFrame]
     def get_id(self: Self) -> str:
                            parent_id=self.get_id(),
                            chunk_id=f"{frames[0].identifier}-{frames[-1].identifier}",
                            metadata={
+                               "start": self.start_time + frames[0].start if self.start_time else None,
+                               "end": self.start_time + frames[-1].end if self.start_time else None,
+                               "speakers": tuple(frame.speaker for frame in frames if frame.speaker)
                            })
                      for frames
                      in windows)
         return MappingProxyType(self.metadata)
     @classmethod
     def __get_start_time(cls, web_vtt: WebVTT) -> Optional[datetime]:
         """Return the start time announced in the WebVTT header comments, if any.
         Each entry of ``web_vtt.header_comments`` is searched with ``ISO_DATE_TIME_PATTERN``
         ("Start time: <date>[ <time>[<UTC offset>]]"); the first captured value of the first
         matching comment is parsed with ``datetime.fromisoformat``.
         Returns:
             The parsed datetime, or None when no header comment matches or the
             captured text is not a valid ISO date/time.
         """
         try:
             # The None default matters: without it, a file that has no "Start time:" comment makes
             # next() raise StopIteration, which the ValueError handler below does not catch.
             return next((datetime.fromisoformat(result[0])
                          for result
                          in map(ISO_DATE_TIME_PATTERN.findall, web_vtt.header_comments)
                          if result),
                         None)
         except ValueError:
             # The pattern matched but the captured text is not a real date/time (e.g. month 13).
             return None
+    @classmethod
+    def from_bytes(cls, id: str, metadata: Mapping[str, Any], buffer: bytes) -> Self:
+        web_vtt = WebVTT.from_buffer(BytesIO(buffer))
+        frames = tuple(WebVTTFrame.from_webvtt_caption(caption, index)
+                       for index, caption
+                       in enumerate(web_vtt.captions, 1))
+        return WebVTTContent(id=id, metadata=MappingProxyType(metadata), start_time=cls.__get_start_time(web_vtt), frames=frames)