"""Utilities for chunking documents, generating summaries, building vector stores,
and re-ranking retrieved documents."""

import time
from copy import deepcopy

import requests

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# embedding functions
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# vector store and retriever functions
from langchain_community.vectorstores import FAISS, Chroma
from langchain_community.retrievers import BM25Retriever

from chains import generate_document_summary_prompt
from config import SEVEN_B_LLM_MODEL, EMBEDDING_MODEL, FAISS_INDEX_PATH

def create_embeddings(
    docs: list[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    embedding_model: str = EMBEDDING_MODEL,
):
    """Given a sequence of `Document` objects, generate an embedding function and chunked documents.

    Args:
        docs (list[Document]): Documents to split into chunks.
        chunk_size (int): Size of each chunk, defaults to 500.
        chunk_overlap (int): Number of characters overlapped between consecutive chunks, defaults to 50.
        embedding_model (str): Hugging Face model used to embed the documents, defaults to `EMBEDDING_MODEL`.

    Returns:
        Tuple of the embedding function and the list of chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=True,
    )

    # Stage one: read all the docs and split them into chunks.
    st = time.time()
    print('Loading documents and creating chunks ...')
    chunks = text_splitter.create_documents(
        [doc.page_content for doc in docs],
        metadatas=[doc.metadata for doc in docs],
    )
    et = time.time() - st
    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')

    # Stage two: build the embedding function used to embed the chunks.
    embeddings = SentenceTransformerEmbeddings(model_name=embedding_model)
    print(f'created a total of {len(chunks)} chunks')

    return embeddings, chunks
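
# Example usage (illustrative sketch, kept as a comment so it is not run on import):
# `raw_docs` is an assumed list of `Document` objects loaded elsewhere, e.g. by a
# LangChain document loader.
#
#     embeddings, chunks = create_embeddings(raw_docs, chunk_size=500, chunk_overlap=50)
#     print(f"{len(chunks)} chunks ready for indexing")
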
def generate_document_summaries(
    docs: list[Document]
) -> list[Document]:
    """
    Generates summaries for a list of Document objects and updates their metadata with the summaries.

    Args:
        docs (list[Document]): A list of Document objects to generate summaries for.

    Returns:
        list[Document]: A new list of Document objects with updated metadata containing the summaries.

    Example:
        docs = [Document(metadata={"title": "Doc1"}), Document(metadata={"title": "Doc2"})]
        updated_docs = generate_document_summaries(docs)
        for doc in updated_docs:
            print(doc.metadata["summary"])
    """
    # Deep-copy so the summaries do not mutate the caller's documents.
    new_docs = deepcopy(docs)

    # Build the summarization chain once and reuse it for every document.
    generate_summary_chain = generate_document_summary_prompt | SEVEN_B_LLM_MODEL

    for doc in new_docs:
        summary = generate_summary_chain.invoke(
            {"document": str(doc.metadata)}
        )
        doc.metadata.update({"summary": summary})

    return new_docs
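
# Example usage (illustrative sketch, not executed on import): assumes `raw_docs`
# carry a "title" in their metadata, as in the docstring example above.
#
#     summarized_docs = generate_document_summaries(raw_docs)
#     print(summarized_docs[0].metadata["summary"])
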
def build_vector_store(
    docs: list,
    embedding_model: str,
    new_db: bool = False,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
):
    """
    Chunks the given documents, embeds them, and loads the chunks into FAISS and Chroma vector stores.

    Args:
        docs (list): Documents to index.
        embedding_model (str): Hugging Face model used to embed the documents.
        new_db (bool): If True, build a new FAISS index; otherwise add to the existing one.
        chunk_size (int): Size of each chunk, defaults to 500.
        chunk_overlap (int): Number of characters overlapped between consecutive chunks, defaults to 50.

    Returns:
        A status message naming the FAISS index location.
    """
    embeddings, chunks = create_embeddings(
        docs,
        chunk_size,
        chunk_overlap,
        embedding_model,
    )

    # Load chunks into the FAISS vector store.
    print('Loading chunks into faiss vector store ...')
    st = time.time()
    if new_db:
        db_faiss = FAISS.from_documents(chunks, embeddings)
    else:
        # `add_documents` is an instance method, so load the existing index first.
        db_faiss = FAISS.load_local(
            FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True
        )
        db_faiss.add_documents(chunks)
    # BM25 has no incremental update; rebuild the keyword retriever from the chunks.
    bm25_retriever = BM25Retriever.from_documents(chunks)
    db_faiss.save_local(FAISS_INDEX_PATH)
    et = time.time() - st
    print(f'Time taken: {et} seconds.')

    # Load chunks into the Chroma vector store.
    print('Loading chunks into chroma vector store ...')
    st = time.time()
    persist_directory = './vectorstore/chroma-insurance-agent-1500'
    db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
    et = time.time() - st
    print(f'Time taken: {et} seconds.')

    result = f"built vector store at {FAISS_INDEX_PATH}"
    return result
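
# Example usage (illustrative sketch, not executed on import): `EMBEDDING_MODEL`
# and `FAISS_INDEX_PATH` come from `config`; `raw_docs` is an assumed list of
# documents loaded elsewhere. The Chroma store is persisted to the local
# `./vectorstore/chroma-insurance-agent-1500` directory used above.
#
#     status = build_vector_store(raw_docs, EMBEDDING_MODEL, new_db=True)
#     print(status)
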
def get_reranked_docs_faiss(
    query: str,
    path_to_db: str,
    embedding_model: str,
    hf_api_key: str,
    num_docs: int = 5
) -> list:
    """Re-ranks the similarity search results and returns the top-k highest ranked docs.

    Args:
        query (str): The search query
        path_to_db (str): Path to the vectorstore database
        embedding_model (str): Embedding model used in the vector store
        hf_api_key (str): Hugging Face Inference API key used for embedding and re-ranking
        num_docs (int): Number of documents to return

    Returns: A list of documents with the highest rank
    """
    assert num_docs <= 10, "num_docs must not exceed the number of similarity search results (10)"

    embeddings = HuggingFaceInferenceAPIEmbeddings(
        api_key=hf_api_key,
        model_name=embedding_model
    )
    # Load the vectorstore database
    db = FAISS.load_local(
        folder_path=path_to_db,
        embeddings=embeddings,
        allow_dangerous_deserialization=True
    )

    # Get 10 documents based on similarity search
    docs = db.similarity_search(query=query, k=10)

    # Add the page_content, title and description together
    passages = [
        doc.page_content + "\n" + doc.metadata.get('title', "") + "\n" + doc.metadata.get('description', "")
        for doc in docs
    ]

    # Prepare the payload for the re-ranking model
    inputs = [{"text": query, "text_pair": passage} for passage in passages]

    API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
    headers = {"Authorization": f"Bearer {hf_api_key}"}

    response = requests.post(API_URL, headers=headers, json=inputs)
    if response.status_code != 200:
        print('Something went wrong with the re-ranking request')
        return []
    scores = response.json()

    try:
        relevance_scores = [item[1]['score'] for item in scores]
    except (KeyError, IndexError, TypeError):
        print('Could not get the relevance_scores -> something might be wrong with the json output')
        return []

    ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
    top_k_results = ranked_results[:num_docs]
    return [doc for doc, _, _ in top_k_results]
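
# Example usage (illustrative sketch, not executed on import): assumes a FAISS index
# was previously saved to `FAISS_INDEX_PATH` and that the `HF_API_KEY` environment
# variable holds a Hugging Face Inference API token.
#
#     top_docs = get_reranked_docs_faiss(
#         query="What does the policy cover?",
#         path_to_db=FAISS_INDEX_PATH,
#         embedding_model=EMBEDDING_MODEL,
#         hf_api_key=os.environ["HF_API_KEY"],
#         num_docs=5,
#     )
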
def get_reranked_docs_chroma(
    query: str,
    path_to_db: str,
    embedding_model: str,
    hf_api_key: str,
    reranking_hf_url: str = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2",
    num_docs: int = 5
) -> list:
    """Re-ranks the similarity search results and returns the top-k highest ranked docs.

    Args:
        query (str): The search query
        path_to_db (str): Path to the vectorstore database
        embedding_model (str): Embedding model used in the vector store
        hf_api_key (str): Hugging Face Inference API key used for embedding and re-ranking
        reranking_hf_url (str): Inference API endpoint of the sentence-similarity model used for re-ranking
        num_docs (int): Number of documents to return

    Returns: A list of documents with the highest rank
    """
    embeddings = HuggingFaceInferenceAPIEmbeddings(
        api_key=hf_api_key,
        model_name=embedding_model
    )
    # Load the vectorstore database
    db = Chroma(persist_directory=path_to_db, embedding_function=embeddings)

    # Get 10 documents based on similarity search
    sim_docs = db.similarity_search(query=query, k=10)
    passages = [doc.page_content for doc in sim_docs]

    # Prepare the payload for the sentence-similarity model
    payload = {
        "inputs": {
            "source_sentence": query,
            "sentences": passages,
        }
    }
    headers = {"Authorization": f"Bearer {hf_api_key}"}

    response = requests.post(url=reranking_hf_url, headers=headers, json=payload)
    print(f'{response = }')
    if response.status_code != 200:
        print('Something went wrong with the response')
        return []
    similarity_scores = response.json()

    ranked_results = sorted(zip(sim_docs, passages, similarity_scores), key=lambda x: x[2], reverse=True)
    top_k_results = ranked_results[:num_docs]
    return [doc for doc, _, _ in top_k_results]
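
# Example usage (illustrative sketch, not executed on import): assumes a Chroma store
# was built at the persist directory used by `build_vector_store` above and that the
# `HF_API_KEY` environment variable holds a Hugging Face Inference API token.
#
#     top_docs = get_reranked_docs_chroma(
#         query="What does the policy cover?",
#         path_to_db="./vectorstore/chroma-insurance-agent-1500",
#         embedding_model=EMBEDDING_MODEL,
#         hf_api_key=os.environ["HF_API_KEY"],
#     )
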