from abc import ABC, abstractmethod
from datetime import datetime
from typing import Dict, List, Optional, Union, Any, ClassVar
import hashlib
import json

from pydantic import BaseModel, Field, validator

# Fallback number of characters per chunk, used by ``Ingestible.get_chunks``
# when the concrete class does not define its own ``chunk_size`` attribute.
DEFAULT_CHUNK_SIZE = 1000


class Metadata(BaseModel):
    """A class representing metadata about content."""

    # The content's identity, consistent across modifications.
    id: str
    # The content's modification time, for detection of alterations.
    modification_time: datetime
    # The content's hash, for detection of alterations.
    hash: str


class Content(BaseModel):
    """A class representing ingested content."""

    # Metadata describing this piece of content.
    metadata: Metadata


class Ingestible(ABC, BaseModel):
    """An abstract base class for ingestible content.

    Subclasses must implement the ``content`` property. They may define a
    ``chunk_size`` attribute (a positive number of characters) to control
    how ``get_chunks`` splits string content; when it is absent,
    ``DEFAULT_CHUNK_SIZE`` is used.
    """

    # Metadata describing this ingestible item.
    metadata: Metadata

    @property
    @abstractmethod
    def content(self) -> Content:
        """
        Return content ready for vectorization.

        This could be:
        - A single string
        - A list of strings (pre-chunked)
        - A more complex structure that can be recursively processed
        """
        # NOTE(review): the return annotation says ``Content`` but
        # ``get_chunks`` only accepts ``str`` or ``list`` values -- confirm
        # the intended contract and align the annotation.
        pass

    def get_chunks(self) -> List[str]:
        """
        Split content into chunks suitable for vectorization.

        Override this in subclasses for specialized chunking logic.

        Returns:
            A list of chunks. String content is cut into consecutive
            slices of at most ``chunk_size`` characters (an empty string
            yields an empty list); list content is returned unchanged.

        Raises:
            ValueError: If the content is neither a ``str`` nor a ``list``,
                or if ``chunk_size`` is not positive.
        """
        content = self.content

        if isinstance(content, list):
            # Content is already chunked
            return content

        if not isinstance(content, str):
            raise ValueError(f"Unsupported content type: {type(content)}")

        # ``chunk_size`` is not declared on this base class, so fall back to
        # a default instead of failing with AttributeError when a subclass
        # does not provide one.
        chunk_size = getattr(self, "chunk_size", DEFAULT_CHUNK_SIZE)
        if chunk_size <= 0:
            # A zero step makes range() raise an opaque error and a negative
            # step silently produces no chunks; reject both explicitly.
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")

        # Simple chunking by character count
        return [
            content[start:start + chunk_size]
            for start in range(0, len(content), chunk_size)
        ]

    @property
    def key(self) -> str:
        """Convenience accessor for the metadata identity (``Metadata.id``)."""
        # ``Metadata`` has no ``key`` field; ``id`` is its identity field.
        return self.metadata.id