LiKenun's picture
Provisional project structure
005a292
raw
history blame
1.86 kB
from abc import ABC, abstractmethod
from datetime import datetime
from pydantic import BaseModel, Field, validator
from typing import Dict, List, Optional, Union, Any, ClassVar
import hashlib
import json
class Metadata(BaseModel):
    """Metadata describing a piece of content.

    Carries a stable identity together with two change indicators
    (modification time and hash) used to detect alterations to the content.
    """

    # Identity of the content; stays consistent across modifications.
    id: str
    # The content's modification time, used to detect alterations.
    modification_time: datetime
    # The content's hash, used to detect alterations.
    # NOTE(review): the hash algorithm/encoding is not fixed by this model -
    # confirm with whatever produces these values.
    hash: str
class Content(BaseModel):
    """A class representing ingested content."""

    # Metadata (identity, modification time, hash) describing this content.
    metadata: Metadata
class Ingestible(ABC, BaseModel):
    """An abstract base class for ingestible content.

    Subclasses implement the ``content`` property; ``get_chunks`` turns that
    content into a list of strings suitable for vectorization.
    """

    # Fallback chunk length (in characters) used by get_chunks() when the
    # instance does not define its own ``chunk_size`` attribute. Declared as a
    # ClassVar so it is not treated as a model field.
    DEFAULT_CHUNK_SIZE: ClassVar[int] = 1000

    metadata: Metadata

    @property
    @abstractmethod
    def content(self) -> Union[str, List[str], Content]:
        """
        Return content ready for vectorization.

        This could be:
        - A single string
        - A list of strings (pre-chunked)
        - A more complex structure that can be recursively processed

        Note that the default ``get_chunks`` only understands the first two
        forms; subclasses returning anything else must override it.
        """
        ...

    def get_chunks(self) -> List[str]:
        """
        Split content into chunks suitable for vectorization.

        Override this in subclasses for specialized chunking logic.

        Returns:
            The content itself when it is already a list (assumed to be
            pre-chunked), otherwise consecutive slices of the string content,
            each at most ``chunk_size`` characters long. An empty string
            yields an empty list.

        Raises:
            ValueError: If the content is neither a string nor a list, or if
                the effective chunk size is not positive.
        """
        content = self.content

        if isinstance(content, list):
            # Content is already chunked.
            return content

        if not isinstance(content, str):
            raise ValueError(f"Unsupported content type: {type(content)}")

        # The base class declares no ``chunk_size``; use the instance's or
        # subclass's value when present and fall back to the class default.
        chunk_size = getattr(self, "chunk_size", self.DEFAULT_CHUNK_SIZE)
        if chunk_size <= 0:
            # A zero step makes range() fail obscurely and a negative step
            # would silently drop all content, so reject both explicitly.
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size!r}"
            )

        # Simple chunking by character count.
        return [
            content[start:start + chunk_size]
            for start in range(0, len(content), chunk_size)
        ]

    @property
    def key(self) -> str:
        """Convenience accessor for the metadata key (the content's ``id``)."""
        # Metadata exposes the stable identity as ``id``; it has no ``key``.
        return self.metadata.id