import string

from langchain.docstore.document import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from core.services.vector_db.qdrent.upload_document import (
    upload_document_existing_collection,
)


class AddDocument:
    """Clean, chunk, and upload documents into an existing vector collection."""

    def __init__(self, vector_embedding, sparse_embedding):
        # Embedding models/callables; forwarded verbatim to the uploader.
        self.vector_embed = vector_embedding
        self.sparse_embed = sparse_embedding

    def clean_text(self, text: str) -> str:
        """Normalize raw text before chunking.

        Removes newlines, strips all punctuation except periods, and drops
        code points that cannot be encoded as UTF-8 (e.g. lone surrogates).

        Args:
            text: Raw page content.

        Returns:
            The cleaned text.
        """
        text = text.replace("\n", "")
        # Keep "." so sentence boundaries survive for the splitter.
        text = text.translate(
            str.maketrans("", "", string.punctuation.replace(".", ""))
        )
        # Round-trip through UTF-8, ignoring errors, to discard
        # un-encodable code points.
        return text.encode("utf-8", errors="ignore").decode("utf-8")

    def add_documents(self, texts: list[tuple[str, str]], vectorstore: str) -> None:
        """Clean, split, and upload (content, source) pairs.

        Args:
            texts: Pairs of (page content, source identifier).
                NOTE: the original annotation ``list[tuple[str]]`` was wrong —
                each element is unpacked into two values.
            vectorstore: Name of the existing collection to upload into.
        """
        if not texts:
            # zip(*[]) would raise ValueError on unpacking; nothing to do.
            return

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=400, chunk_overlap=100, add_start_index=True
        )
        contents, sources = zip(*texts)
        docs = [
            Document(
                page_content=self.clean_text(content),
                metadata={"source": source},
            )
            for content, source in zip(contents, sources)
        ]
        # Split into overlapping chunks, then push to the existing collection.
        upload_document_existing_collection(
            vector_embed=self.vector_embed,
            sparse_embed=self.sparse_embed,
            vectorstore=vectorstore,
            documents=splitter.split_documents(docs),
        )