|
|
|
import string |
|
from langchain.docstore.document import Document |
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
from core.services.vector_db.qdrent.upload_document import upload_document_existing_collection |
|
|
|
|
|
class AddDocument:
    """Clean, chunk, and upload (content, source) text pairs into an
    existing Qdrant collection via hybrid (dense + sparse) embeddings."""

    def __init__(self, vector_embedding, sparse_embedding):
        # Dense and sparse embedding models handed through to the upload step.
        self.vector_embed = vector_embedding
        self.sparse_embed = sparse_embedding

    def clean_text(self, text: str) -> str:
        """Normalize raw text before chunking.

        Replaces newlines with spaces, strips every punctuation character
        except '.', and drops any lone surrogates that cannot round-trip
        through UTF-8.
        """
        # Bug fix: replacing "\n" with "" glued the last word of one line
        # onto the first word of the next; substitute a space instead.
        text = text.replace("\n", " ")
        # Remove all punctuation except '.', preserving sentence boundaries
        # for the recursive splitter.
        text = text.translate(str.maketrans('', '', string.punctuation.replace(".", "")))
        # errors='ignore' silently drops unpaired surrogates; all other
        # characters survive the UTF-8 round-trip unchanged.
        text = text.encode('utf-8', errors='ignore').decode('utf-8')
        return text

    def add_documents(self, texts: list[tuple[str, str]], vectorstore: str):
        """Clean, chunk, and upload documents to *vectorstore*.

        Args:
            texts: (page_content, source) pairs; source is stored in each
                chunk's metadata under the "source" key.
            vectorstore: name of the existing collection to upload into.
        """
        # Guard: zip(*[]) would raise ValueError ("not enough values to
        # unpack") on an empty input list.
        if not texts:
            return

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=400,
            chunk_overlap=100,
            add_start_index=True,  # record each chunk's offset in its parent doc
        )

        contents, sources = zip(*texts)
        cleaned_texts = [self.clean_text(text) for text in contents]

        docs = [
            Document(page_content=text, metadata={"source": source})
            for text, source in zip(cleaned_texts, sources)
        ]

        documents = splitter.split_documents(docs)
        upload_document_existing_collection(vector_embed=self.vector_embed,
                                            sparse_embed=self.sparse_embed,
                                            vectorstore=vectorstore,
                                            documents=documents)
|
|