File size: 1,510 Bytes
ad87194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

import string
from langchain.docstore.document import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from core.services.vector_db.qdrent.upload_document import upload_document_existing_collection


class AddDocument:
    """Clean raw texts, split them into chunks and upload them to a vector store.

    The two embedding objects supplied at construction time are forwarded
    unchanged to ``upload_document_existing_collection`` on every upload.
    """

    # Built once rather than on every ``clean_text`` call. Newlines become a
    # space (so words on adjacent lines are not glued together) and every
    # ASCII punctuation character except the full stop is deleted.
    _CLEAN_TABLE = str.maketrans("\n", " ", string.punctuation.replace(".", ""))

    def __init__(self, vector_embedding, sparse_embedding):
        """Store the embedding objects used by later uploads.

        Args:
            vector_embedding: Forwarded as ``vector_embed`` when uploading.
            sparse_embedding: Forwarded as ``sparse_embed`` when uploading.
        """
        self.vector_embed = vector_embedding
        self.sparse_embed = sparse_embedding

    def clean_text(self, text: str) -> str:
        """Return *text* normalised for chunking.

        Newlines are replaced by a single space, ASCII punctuation other than
        ``.`` is removed, and characters that cannot be encoded as UTF-8
        (e.g. lone surrogates) are dropped.

        Args:
            text: Raw text to clean.

        Returns:
            The cleaned text.
        """
        text = text.translate(self._CLEAN_TABLE)
        # Round-trip through UTF-8 to silently drop unencodable code points.
        return text.encode("utf-8", errors="ignore").decode("utf-8")

    def add_documents(
        self,
        texts: list[tuple[str, str]],
        vectorstore: str,
        *,
        chunk_size: int = 400,
        chunk_overlap: int = 100,
    ) -> None:
        """Clean, chunk and upload ``(content, source)`` pairs.

        Each content string is cleaned with :meth:`clean_text`, wrapped in a
        ``Document`` whose metadata carries ``{"source": source}``, split with
        a ``RecursiveCharacterTextSplitter`` and handed to
        ``upload_document_existing_collection``.

        Args:
            texts: ``(content, source)`` pairs. An empty list is a no-op.
            vectorstore: Passed through as ``vectorstore`` to the upload
                helper (presumably the target collection name -- confirm
                against that helper).
            chunk_size: Maximum chunk size given to the splitter.
            chunk_overlap: Overlap between consecutive chunks given to the
                splitter.
        """
        docs = [
            Document(page_content=self.clean_text(content), metadata={"source": source})
            for content, source in texts
        ]
        # ``zip(*texts)`` used to raise ValueError here on empty input; with
        # nothing to upload there is no reason to call the vector store.
        if not docs:
            return

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
        )
        upload_document_existing_collection(
            vector_embed=self.vector_embed,
            sparse_embed=self.sparse_embed,
            vectorstore=vectorstore,
            documents=splitter.split_documents(docs),
        )