Spaces:

sabazo
/

insurance_advisor_wb

Sleeping

App Files Files Community

insurance_advisor_wb / rag_app /knowledge_base /utils.py

isayahc

updating constants and refactoring functions

5ffaf21 unverified over 1 year ago

raw

history blame

3.07 kB

	from langchain_core.documents import Document
	from chains import generate_document_summary_prompt
	from config import SEVEN_B_LLM_MODEL
	# embeddings functions
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_community.embeddings.sentence_transformer import (
	SentenceTransformerEmbeddings,
	)
	import time
	from langchain_core.documents import Document
	from config import EMBEDDING_MODEL
	from langchain.retrievers import VectorStoreRetriever
	from langchain_core.vectorstores import VectorStoreRetriever

	def create_embeddings(
	    docs: list[Document],
	    chunk_size: int = 500,
	    chunk_overlap: int = 50,
	) -> tuple[SentenceTransformerEmbeddings, list[Document]]:
	    """Split documents into chunks and build the embedding model for them.

	    Note that this function does not run the embedding model on the chunks;
	    it instantiates the model configured by ``EMBEDDING_MODEL`` and returns
	    it together with the chunks so the caller can embed/index them.

	    ## argument
	    :params docs (list[Document]) -> documents to split; each chunk inherits
	        the metadata of the document it was cut from
	    :params chunk_size (int) -> maximum chunk length in characters
	        (``len`` is the length function), defaults to 500
	    :params chunk_overlap (int) -> number of characters shared between
	        neighbouring chunks, defaults to 50

	    ## Return
	    Tuple ``(embeddings, chunks)``: the ``SentenceTransformerEmbeddings``
	    instance and the list of chunked ``Document`` objects.
	    """
	    text_splitter = RecursiveCharacterTextSplitter(
	        # Tried in order: paragraphs, lines, sentence ends, words, characters.
	        # The sentence separator is a zero-width lookbehind, so it must be a
	        # raw string and the splitter must be told to treat separators as
	        # regular expressions; otherwise it is escaped and never matches.
	        separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
	        is_separator_regex=True,
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        length_function=len,
	    )

	    # Stage one: split every document into chunks.
	    started = time.perf_counter()
	    print('Loading documents and creating chunks ...')

	    chunks = text_splitter.create_documents(
	        [doc.page_content for doc in docs],
	        metadatas=[doc.metadata for doc in docs],
	    )
	    et = time.perf_counter() - started
	    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')

	    # Stage two: instantiate the embedding model the caller will use.
	    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
	    print(f"created a total of {len(chunks)} chunks")

	    return embeddings, chunks


	def generate_document_summaries(
	    docs: list[Document]
	) -> list[Document]:
	    """
	    Generates a summary for each Document and stores it in the metadata of a copy.

	    The input documents are left untouched: every document is deep-copied and
	    the summary is written to the copy's ``metadata["summary"]``.

	    Args:
	        docs (List[Document]): A list of Document objects to generate summaries for.

	    Returns:
	        List[Document]: A new list of new Document objects, in the same order as
	        ``docs``, whose metadata additionally contains the ``"summary"`` key.

	    Example:
	        docs = [
	            Document(page_content="...", metadata={"title": "Doc1"}),
	            Document(page_content="...", metadata={"title": "Doc2"}),
	        ]
	        updated_docs = generate_document_summaries(docs)
	        for doc in updated_docs:
	            print(doc.metadata["summary"])

	    """
	    # Local import keeps this block self-contained (stdlib only).
	    import copy

	    # The chain does not depend on the document, so build it once.
	    summary_chain = generate_document_summary_prompt | SEVEN_B_LLM_MODEL

	    summarized_docs = []
	    for doc in docs:
	        # NOTE(review): the prompt is fed the stringified metadata, not
	        # page_content -- confirm this is the intended summary source.
	        summary = summary_chain.invoke(
	            {"document": str(doc.metadata)}
	        )

	        # Deep copy so the caller's Document (and its metadata dict) is not
	        # mutated; a shallow list copy would share the same objects.
	        summarized_doc = copy.deepcopy(doc)
	        summarized_doc.metadata.update(
	            {"summary": summary}
	        )
	        summarized_docs.append(summarized_doc)

	    return summarized_docs