Spaces commit: general refactoring
rag_app/knowledge_base/build_vector_store.py
CHANGED
@@ -1,18 +1,13 @@
 # vectorization functions
 from langchain_community.vectorstores import FAISS
 from langchain_community.vectorstores import Chroma
-#from langchain_community.document_loaders import DirectoryLoader
-#from langchain_text_splitters import RecursiveCharacterTextSplitter
-#from langchain_community.embeddings.sentence_transformer import (
-#    SentenceTransformerEmbeddings,
-#)
-#from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.retrievers import BM25Retriever
+
 from rag_app.knowledge_base.create_embedding import create_embeddings
 from rag_app.utils.generate_summary import generate_description, generate_keywords
+
 import time
 import os
-#from dotenv import load_dotenv
 
 def build_vector_store(
     docs: list,
@@ -32,11 +27,6 @@ def build_vector_store(
     FAISS_INDEX_PATH = db_path
 
     embeddings,chunks = create_embeddings(docs, chunk_size, chunk_overlap, embedding_model)
-    # for chunk in chunks:
-    #     keywords=generate_keywords(chunk)
-    #     description=generate_description(chunk)
-    #     chunk.metadata['keywords']=keywords
-    #     chunk.metadata['description']=description
 
     #load chunks into vector store
     print(f'Loading chunks into faiss vector store ...')
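The hunk ends right where the index build begins. For context, a minimal sketch of how such a function typically finishes with LangChain's FAISS wrapper; these lines are assumed, not part of this commit:

    # Sketch only: not shown in the diff; names follow the hunk above.
    db = FAISS.from_documents(chunks, embeddings)  # embed the chunks and build the index
    db.save_local(FAISS_INDEX_PATH)                # persist the index to db_path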
rag_app/knowledge_base/create_embedding.py
CHANGED
@@ -1,21 +1,16 @@
 # embeddings functions
-#from langchain_community.vectorstores import FAISS
-#from langchain_community.document_loaders import ReadTheDocsLoader
-#from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-# from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.embeddings.sentence_transformer import (
     SentenceTransformerEmbeddings,
 )
 import time
 from langchain_core.documents import Document
-
+from config import EMBEDDING_MODEL
 
 def create_embeddings(
     docs: list[Document],
     chunk_size:int = 500,
     chunk_overlap:int = 50,
-    embedding_model: str = "sentence-transformers/multi-qa-mpnet-base-dot-v1",
 ):
     """given a sequence of `Document` objects this fucntion will
     generate embeddings for it.
@@ -47,8 +42,7 @@ def create_embeddings(
     print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
 
     #Stage two: embed the docs.
-
-    embeddings = SentenceTransformerEmbeddings(model_name=embedding_model)
+    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
     print(f"created a total of {len(chunks)} chunks")
 
     return embeddings,chunks
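The refactor drops the per-call embedding_model parameter in favor of a module-level constant imported from config. That module is not part of this diff; judging by the default value removed from the signature, it presumably contains something like:

    # config.py (assumed; mirrors the default removed from create_embeddings)
    EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

Centralizing the model name this way keeps index-time and query-time embeddings consistent, since every caller now reads the same constant instead of passing its own value.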
rag_app/knowledge_base/reranking.py
CHANGED
@@ -1,4 +1,3 @@
-# from get_db_retriever import get_db_retriever
 from pathlib import Path
 from langchain_community.vectorstores import FAISS
 from dotenv import load_dotenv
@@ -11,29 +10,36 @@ from langchain_community.vectorstores import Chroma
 load_dotenv()
 
 
-def get_reranked_docs_faiss(
-    …
+def get_reranked_docs_faiss(
+    query:str,
+    path_to_db:str,
+    embedding_model:str,
+    hf_api_key:str,
+    num_docs:int=5
+) -> list:
     """ Re-ranks the similarity search results and returns top-k highest ranked docs
 
-    …
+    Args:
+        query (str): The search query
+        path_to_db (str): Path to the vectorstore database
+        embedding_model (str): Embedding model used in the vector store
+        num_docs (int): Number of documents to return
+
+    Returns: A list of documents with the highest rank
     """
     assert num_docs <= 10, "num_docs should be less than similarity search results"
 
-    embeddings = HuggingFaceInferenceAPIEmbeddings(
-        …
+    embeddings = HuggingFaceInferenceAPIEmbeddings(
+        api_key=hf_api_key,
+        model_name=embedding_model
+    )
+
     # Load the vectorstore database
-    db = FAISS.load_local(
-        …
+    db = FAISS.load_local(
+        folder_path=path_to_db,
+        embeddings=embeddings,
+        allow_dangerous_deserialization=True
+    )
 
     # Get 10 documents based on similarity search
     docs = db.similarity_search(query=query, k=10)
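The new load_local call passes allow_dangerous_deserialization=True, which recent langchain-community releases require because loading a saved FAISS store unpickles its docstore; it should only be enabled for indexes you built yourself. A hypothetical call to the refactored function, where the query, path, model name, and environment variable name are illustrative placeholders rather than values from this repo:

    import os

    top_docs = get_reranked_docs_faiss(
        query="What does the policy cover?",   # placeholder query
        path_to_db="vectorstore/faiss_index",  # placeholder path
        embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",  # assumed to match the store
        hf_api_key=os.environ["HF_API_KEY"],   # placeholder env-var name
        num_docs=5,                            # must be <= 10 per the assert
    )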