Spaces:
Sleeping
Sleeping
File size: 5,317 Bytes
bf6d237 7d09ea9 bf6d237 7d09ea9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import logging
import tempfile
from pathlib import Path
from typing import BinaryIO, List
from injector import inject, singleton
from llama_index import (
ServiceContext,
StorageContext,
)
from llama_index.node_parser import SentenceWindowNodeParser
from private_gpt.components.embedding.embedding_component import EmbeddingComponent
from private_gpt.components.ingest.ingest_component import get_ingestion_component
from private_gpt.components.llm.llm_component import LLMComponent
from private_gpt.components.node_store.node_store_component import NodeStoreComponent
from private_gpt.components.vector_store.vector_store_component import (
VectorStoreComponent,
)
from private_gpt.server.ingest.model import IngestedDoc
from private_gpt.settings.settings import settings
logger = logging.getLogger(__name__)
@singleton
class IngestService:
@inject
def __init__(
self,
llm_component: LLMComponent,
vector_store_component: VectorStoreComponent,
embedding_component: EmbeddingComponent,
node_store_component: NodeStoreComponent,
) -> None:
self.llm_service = llm_component
self.storage_context = StorageContext.from_defaults(
vector_store=vector_store_component.vector_store,
docstore=node_store_component.doc_store,
index_store=node_store_component.index_store,
)
node_parser = SentenceWindowNodeParser.from_defaults()
self.ingest_service_context = ServiceContext.from_defaults(
llm=self.llm_service.llm,
embed_model=embedding_component.embedding_model,
node_parser=node_parser,
# Embeddings done early in the pipeline of node transformations, right
# after the node parsing
transformations=[node_parser, embedding_component.embedding_model],
)
self.ingest_component = get_ingestion_component(
self.storage_context, self.ingest_service_context, settings=settings()
)
def ingest(self, file_name: str, file_data: Path) -> list[IngestedDoc]:
logger.info("Ingesting file_name=%s", file_name)
documents = self.ingest_component.ingest(file_name, file_data)
return [IngestedDoc.from_document(document) for document in documents]
def ingest_bin_data(
self, file_name: str, raw_file_data: BinaryIO
) -> list[IngestedDoc]:
logger.debug("Ingesting binary data with file_name=%s", file_name)
file_data = raw_file_data.read()
logger.debug("Got file data of size=%s to ingest", len(file_data))
# llama-index mainly supports reading from files, so
# we have to create a tmp file to read for it to work
# delete=False to avoid a Windows 11 permission error.
with tempfile.NamedTemporaryFile(delete=False) as tmp:
try:
path_to_tmp = Path(tmp.name)
if isinstance(file_data, bytes):
path_to_tmp.write_bytes(file_data)
else:
path_to_tmp.write_text(str(file_data))
return self.ingest(file_name, path_to_tmp)
finally:
tmp.close()
path_to_tmp.unlink()
def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[IngestedDoc]:
logger.info("Ingesting file_names=%s", [f[0] for f in files])
documents = self.ingest_component.bulk_ingest(files)
return [IngestedDoc.from_document(document) for document in documents]
def list_ingested(self) -> list[IngestedDoc]:
ingested_docs = []
try:
docstore = self.storage_context.docstore
ingested_docs_ids: set[str] = set()
for node in docstore.docs.values():
if node.ref_doc_id is not None:
ingested_docs_ids.add(node.ref_doc_id)
for doc_id in ingested_docs_ids:
ref_doc_info = docstore.get_ref_doc_info(ref_doc_id=doc_id)
doc_metadata = None
if ref_doc_info is not None and ref_doc_info.metadata is not None:
doc_metadata = IngestedDoc.curate_metadata(ref_doc_info.metadata)
ingested_docs.append(
IngestedDoc(
object="ingest.document",
doc_id=doc_id,
doc_metadata=doc_metadata,
)
)
except ValueError:
logger.warning("Got an exception when getting list of docs", exc_info=True)
pass
logger.debug("Found count=%s ingested documents", len(ingested_docs))
return ingested_docs
def delete(self, doc_id: str) -> None:
"""Delete an ingested document.
:raises ValueError: if the document does not exist
"""
logger.info(
"Deleting the ingested document=%s in the doc and index store", doc_id
)
self.ingest_component.delete(doc_id)
def list_ingested_filenames(self) -> List[str]:
"""Lists the filenames of ingested documents."""
ingested_documents = self.list_ingested()
unique_filenames = set(doc.doc_metadata.get("file_name", "") for doc in ingested_documents)
return list(unique_filenames)
|