|
|
|
import string |
|
from langchain.docstore.document import Document |
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
from core.services.vector_db.qdrent.upload_document import upload_document_existing_collection |
|
|
|
|
|
class AddDocument:
    """Clean, chunk, and upload (content, source) text pairs into an
    existing Qdrant collection via hybrid (dense + sparse) embeddings."""

    def __init__(self, vector_embedding, sparse_embedding):
        # Dense and sparse embedding models handed through to the upload step.
        self.vector_embed = vector_embedding
        self.sparse_embed = sparse_embedding

    def clean_text(self, text: str) -> str:
        """Normalize raw text before chunking.

        Replaces newlines with spaces, strips every punctuation character
        except '.', and drops any lone surrogates that cannot round-trip
        through UTF-8.
        """
        # Bug fix: replacing "\n" with "" glued the last word of one line
        # onto the first word of the next; substitute a space instead.
        text = text.replace("\n", " ")
        # Remove all punctuation except '.', preserving sentence boundaries
        # for the recursive splitter.
        text = text.translate(str.maketrans('', '', string.punctuation.replace(".", "")))
        # errors='ignore' silently drops unpaired surrogates; all other
        # characters survive the UTF-8 round-trip unchanged.
        text = text.encode('utf-8', errors='ignore').decode('utf-8')
        return text

    def add_documents(self, texts: list[tuple[str, str]], vectorstore: str):
        """Clean, chunk, and upload documents to *vectorstore*.

        Args:
            texts: (page_content, source) pairs; source is stored in each
                chunk's metadata under the "source" key.
            vectorstore: name of the existing collection to upload into.
        """
        # Guard: zip(*[]) would raise ValueError ("not enough values to
        # unpack") on an empty input list.
        if not texts:
            return

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=400,
            chunk_overlap=100,
            add_start_index=True,  # record each chunk's offset in its parent doc
        )

        contents, sources = zip(*texts)
        cleaned_texts = [self.clean_text(text) for text in contents]

        docs = [
            Document(page_content=text, metadata={"source": source})
            for text, source in zip(cleaned_texts, sources)
        ]

        documents = splitter.split_documents(docs)
        upload_document_existing_collection(vector_embed=self.vector_embed,
                                            sparse_embed=self.sparse_embed,
                                            vectorstore=vectorstore,
                                            documents=documents)
|
|