|
from typing import Any, Literal |
|
|
|
from llama_index import Document |
|
from pydantic import BaseModel, Field |
|
|
|
|
|
class IngestedDoc(BaseModel):
    """Pydantic model describing one ingested document.

    Instances carry a fixed ``object`` tag, the document identifier and an
    optional metadata dictionary. Use :meth:`from_document` to build an
    instance from a ``Document``.
    """

    # Constant tag: the Literal annotation only admits "ingest.document".
    object: Literal["ingest.document"]
    doc_id: str = Field(examples=["c202d5e6-7b69-4869-81cc-dd574ee8ee11"])
    # Free-form metadata; `from_document` stores the curated metadata here.
    doc_metadata: dict[str, Any] | None = Field(
        examples=[
            {
                "page_label": "2",
                "file_name": "Sales Report Q3 2023.pdf",
            }
        ]
    )

    @staticmethod
    def curate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
        """Return a copy of *metadata* without the unwanted keys.

        The keys ``"doc_id"``, ``"window"`` and ``"original_text"`` are
        dropped; keys that are absent are simply ignored.

        Args:
            metadata: The metadata mapping to filter. It is NOT modified.

        Returns:
            A new dictionary holding every remaining key/value pair.
        """
        # Build a new dict instead of popping from the argument: callers
        # (e.g. `from_document`) pass in a dict owned by another object, and
        # deleting keys from it would silently alter that object.
        # Kept local on purpose: a class-level attribute on a pydantic model
        # would be treated as a model field.
        unwanted = ("doc_id", "window", "original_text")
        return {
            key: value for key, value in metadata.items() if key not in unwanted
        }

    @staticmethod
    def from_document(document: Document) -> "IngestedDoc":
        """Build an ``IngestedDoc`` from *document*.

        Args:
            document: Source document; its ``doc_id`` and ``metadata``
                attributes are read and left unchanged.

        Returns:
            A new ``IngestedDoc`` tagged ``"ingest.document"`` whose
            ``doc_metadata`` is the curated copy of ``document.metadata``.
        """
        return IngestedDoc(
            object="ingest.document",
            doc_id=document.doc_id,
            doc_metadata=IngestedDoc.curate_metadata(document.metadata),
        )
|
|