Project / core /services /document /add_document.py
puzan789's picture
updated
ad87194
raw
history blame
1.51 kB
import string
from langchain.docstore.document import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from core.services.vector_db.qdrent.upload_document import upload_document_existing_collection
class AddDocument:
    """Clean raw texts, split them into chunks and hand them to the uploader.

    The two embedding objects are stored as given and forwarded unchanged to
    ``upload_document_existing_collection``; this class does not call them
    itself.
    """

    # Translation table deleting every ASCII punctuation character except ".".
    # Built once at class creation instead of on every clean_text() call.
    _PUNCTUATION_TABLE = str.maketrans(
        "", "", string.punctuation.replace(".", "")
    )

    def __init__(self, vector_embedding, sparse_embedding):
        """Store the dense and sparse embedding objects for later uploads."""
        self.vector_embed = vector_embedding
        self.sparse_embed = sparse_embedding

    def clean_text(self, text: str) -> str:
        """Return *text* normalised for chunking.

        Newlines become single spaces, ASCII punctuation other than "." is
        removed, and characters that cannot be encoded as UTF-8 (for example
        lone surrogates) are dropped.
        """
        # Use a space rather than "" so the last word of one line is not
        # fused with the first word of the next ("hello\nworld").
        text = text.replace("\n", " ")
        text = text.translate(self._PUNCTUATION_TABLE)
        # Round-trip through UTF-8 to discard unencodable code points.
        return text.encode("utf-8", errors="ignore").decode("utf-8")

    def add_documents(self, texts: list[tuple[str, str]], vectorstore: str):
        """Clean, chunk and upload ``(content, source)`` pairs.

        Args:
            texts: Pairs of raw text and the source label stored in each
                document's ``"source"`` metadata entry.
            vectorstore: Forwarded as the ``vectorstore`` argument of
                ``upload_document_existing_collection``.

        Returns:
            None. When *texts* is empty nothing is split or uploaded.
        """
        docs = [
            Document(
                page_content=self.clean_text(content),
                metadata={"source": source},
            )
            for content, source in texts
        ]
        # Nothing to index: avoid calling the uploader with an empty batch
        # (the previous zip(*texts) unpacking raised ValueError here).
        if not docs:
            return

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=400,
            chunk_overlap=100,
            add_start_index=True,
        )
        documents = splitter.split_documents(docs)
        upload_document_existing_collection(
            vector_embed=self.vector_embed,
            sparse_embed=self.sparse_embed,
            vectorstore=vectorstore,
            documents=documents,
        )