LiKenun committed on
Commit
a60b3fc
·
1 Parent(s): 3300601

WebVTT models and changes to support it

Browse files
src/ctp_slack_bot/models/base.py CHANGED
@@ -1,6 +1,7 @@
1
  from abc import ABC, abstractmethod
2
  from pydantic import BaseModel, ConfigDict, Field
3
- from typing import Any, Dict, final, Self, Sequence, Optional
 
4
 
5
 
6
  class Chunk(BaseModel):
@@ -9,7 +10,7 @@ class Chunk(BaseModel):
9
  text: str # The text representation
10
  parent_id: str # The source content’s identity
11
  chunk_id: str # This chunk’s identity—unique within the source content
12
- metadata: Dict[str, Any]
13
 
14
  model_config = ConfigDict(frozen=True)
15
 
@@ -27,7 +28,7 @@ class VectorQuery(BaseModel):
27
  query_embeddings: Sequence[float]
28
  k: int
29
  score_threshold: float = Field(default=0.7)
30
- filter_metadata: Optional[Dict[str, Any]] = None
31
 
32
 
33
  @final
@@ -47,17 +48,15 @@ class Content(ABC, BaseModel):
47
  pass
48
 
49
  @abstractmethod
50
- def get_metadata(self: Self) -> Dict[str, Any]:
51
  pass
52
 
53
- @property
54
  @abstractmethod
55
  def get_text(self: Self) -> str:
56
  pass
57
 
58
- @property
59
  @abstractmethod
60
- def bytes(self: Self) -> bytes:
61
  pass
62
 
63
  @property
 
1
  from abc import ABC, abstractmethod
2
  from pydantic import BaseModel, ConfigDict, Field
3
+ from types import MappingProxyType
4
+ from typing import Any, Dict, final, Mapping, Self, Sequence, Optional
5
 
6
 
7
  class Chunk(BaseModel):
 
10
  text: str # The text representation
11
  parent_id: str # The source content’s identity
12
  chunk_id: str # This chunk’s identity—unique within the source content
13
+ metadata: Mapping[str, Any]
14
 
15
  model_config = ConfigDict(frozen=True)
16
 
 
28
  query_embeddings: Sequence[float]
29
  k: int
30
  score_threshold: float = Field(default=0.7)
31
+ filter_metadata: Optional[Mapping[str, Any]] = None
32
 
33
 
34
  @final
 
48
  pass
49
 
50
  @abstractmethod
51
+ def get_metadata(self: Self) -> Mapping[str, Any]:
52
  pass
53
 
 
54
  @abstractmethod
55
  def get_text(self: Self) -> str:
56
  pass
57
 
 
58
  @abstractmethod
59
+ def get_bytes(self: Self) -> bytes:
60
  pass
61
 
62
  @property
src/ctp_slack_bot/models/slack.py CHANGED
@@ -1,7 +1,8 @@
1
  from datetime import datetime
2
  from json import dumps
3
  from pydantic import BaseModel, ConfigDict, PositiveInt, PrivateAttr
4
- from typing import Any, Dict, Literal, Optional, Self, Sequence
 
5
 
6
  from ctp_slack_bot.models.base import Chunk, Content
7
 
@@ -71,16 +72,15 @@ class SlackMessage(Content):
71
  def get_chunks(self: Self) -> Sequence[Chunk]:
72
  return (Chunk(text=self.text, parent_id=self.id, chunk_id="", metadata=self.get_metadata()), )
73
 
74
- def get_metadata(self: Self) -> Dict[str, Any]:
75
- return {
76
  "modificationTime": datetime.fromtimestamp(float(self.ts))
77
- }
78
 
79
  def get_text(self: Self) -> str:
80
  return self.text
81
 
82
- @property
83
- def bytes(self: Self) -> bytes:
84
  return self._canonical_json
85
 
86
  @property
 
1
  from datetime import datetime
2
  from json import dumps
3
  from pydantic import BaseModel, ConfigDict, PositiveInt, PrivateAttr
4
+ from types import MappingProxyType
5
+ from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
6
 
7
  from ctp_slack_bot.models.base import Chunk, Content
8
 
 
72
  def get_chunks(self: Self) -> Sequence[Chunk]:
73
  return (Chunk(text=self.text, parent_id=self.id, chunk_id="", metadata=self.get_metadata()), )
74
 
75
+ def get_metadata(self: Self) -> Mapping[str, Any]:
76
+ return MappingProxyType({
77
  "modificationTime": datetime.fromtimestamp(float(self.ts))
78
+ })
79
 
80
  def get_text(self: Self) -> str:
81
  return self.text
82
 
83
+ def get_bytes(self: Self) -> bytes:
 
84
  return self._canonical_json
85
 
86
  @property
src/ctp_slack_bot/models/webvtt.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime, timedelta
2
+ from io import BytesIO
3
+ from json import dumps
4
+ from pydantic import BaseModel, ConfigDict, PositiveInt, PrivateAttr
5
+ import re
6
+ from types import MappingProxyType
7
+ from typing import Any, Dict, Literal, Mapping, Optional, Self, Sequence
8
+ from webvtt import Caption, WebVTT
9
+
10
+ from ctp_slack_bot.models.base import Chunk, Content
11
+
12
# Splits caption text into an optional speaker and the speech itself.
#   group 1: the text before the first colon, captured only when that colon is
#            immediately followed by a space ("Alice: hello" -> "Alice");
#            otherwise None.
#   group 2: the remaining text up to the first newline ("." does not match
#            newlines because no DOTALL flag is set).
# The pattern can match the empty string, so search() never returns None.
# NOTE(review): for multi-line caption text, everything after the first newline
# of the speech is dropped -- confirm whether that is intended.
SPEAKER_SPEECH_CAPTION_TEXT_PATTERN = re.compile('(?:([^:]+): )?(.*)')


class WebVTTFrame(BaseModel):
    """Represents a WebVTT frame"""

    identifier: str
    start: timedelta  # Offset of the frame's start, built from the caption's start_time
    end: timedelta  # Offset of the frame's end, built from the caption's end_time
    speaker: Optional[str] = None  # None when the caption text has no "speaker: " prefix
    speech: str

    model_config = ConfigDict(frozen=True)

    @classmethod
    def from_webvtt_caption(cls: type["WebVTTFrame"], caption: Caption) -> Self:
        """Build a frame from a parsed ``webvtt`` caption.

        The caption text is split by SPEAKER_SPEECH_CAPTION_TEXT_PATTERN into an
        optional speaker and the speech.

        Args:
            caption: The caption to convert; its ``identifier``, ``start_time``,
                ``end_time`` and ``text`` attributes are read.

        Returns:
            A new frame populated from the caption.
        """
        # groups() always yields exactly two items for this two-group pattern
        # (an unmatched optional group is None), so plain unpacking is enough;
        # the former fallback branch could never be reached.
        speaker, speech = SPEAKER_SPEECH_CAPTION_TEXT_PATTERN.search(caption.text).groups()
        return cls(
            identifier=caption.identifier,
            # The timestamp objects' attributes are forwarded verbatim as
            # timedelta keyword arguments.
            start=timedelta(**caption.start_time.__dict__),
            end=timedelta(**caption.end_time.__dict__),
            speaker=speaker,
            speech=speech,
        )
36
+
37
+
38
class WebVTTFile(Content): # TODO: insert a FileContent class in the object inheritance hierarchy.
    """Represents a WebVTT file: its name, modification time and raw contents."""

    filename: str
    modification_time: datetime
    bytes: bytes  # Raw file contents; parsed on demand by get_frames()

    def get_chunks(self: Self) -> Sequence[Chunk]:
        """Return one chunk per frame of the file.

        Each chunk carries the frame's speech as text, the frame identifier as
        chunk ID, and metadata with the filename, the speaker ("user") and the
        frame's start/end computed as modification_time plus the frame offset.
        """
        return tuple(Chunk(text=frame.speech,
                           parent_id=self.id,
                           chunk_id=frame.identifier,
                           metadata={
                               "filename": self.filename,
                               "start": self.modification_time + frame.start,
                               "end": self.modification_time + frame.end,
                               "user": frame.speaker
                           })
                     for frame
                     in self.get_frames())

    def get_metadata(self: Self) -> Mapping[str, Any]:
        """Return a read-only mapping with the filename and modification time."""
        return MappingProxyType({
            "filename": self.filename,
            "modificationTime": self.modification_time
        })

    def get_text(self: Self) -> str: # TODO
        """Not implemented yet.

        Raises:
            NotImplementedError: Always.
        """
        # NotImplemented is a comparison sentinel, not an exception: calling it
        # raised TypeError. NotImplementedError is the intended exception.
        raise NotImplementedError("WebVTTFile.get_text is not implemented yet.")

    def get_bytes(self: Self) -> bytes:
        """Return the raw file contents."""
        return self.bytes

    def get_frames(self: Self) -> Sequence[WebVTTFrame]:
        """Parse the raw file contents and return its captions as frames."""
        # Previously referenced an undefined name (`buffer`), which raised
        # NameError; the data to parse is this file's own bytes.
        captions = WebVTT.from_buffer(BytesIO(self.bytes)).captions
        return tuple(map(WebVTTFrame.from_webvtt_caption, captions))

    @property
    def id(self: Self) -> str:
        """Identity of this content, derived from the filename."""
        return f"file:{self.filename}"
src/ctp_slack_bot/services/context_retrieval_service.py CHANGED
@@ -53,8 +53,8 @@ class ContextRetrievalService(BaseModel):
53
 
54
  # Perform similarity search
55
  try:
56
- results = await self.vector_db_service.search_by_similarity(query)
57
- #logger.info(f"Retrieved {len(results)} context chunks for query")
58
  return results
59
  except Exception as e:
60
  logger.error(f"Error retrieving context: {str(e)}")
 
53
 
54
  # Perform similarity search
55
  try:
56
+ results = await self.vector_database_service.search_by_similarity(query)
57
+ logger.info(f"Retrieved {len(results)} context chunks for query")
58
  return results
59
  except Exception as e:
60
  logger.error(f"Error retrieving context: {str(e)}")