isayahc committed
Commit 5439651 · unverified · 1 Parent(s): dc0ebaa

general refactoring
rag_app/knowledge_base/build_vector_store.py CHANGED
@@ -1,18 +1,13 @@
 # vectorization functions
 from langchain_community.vectorstores import FAISS
 from langchain_community.vectorstores import Chroma
-#from langchain_community.document_loaders import DirectoryLoader
-#from langchain_text_splitters import RecursiveCharacterTextSplitter
-#from langchain_community.embeddings.sentence_transformer import (
-#    SentenceTransformerEmbeddings,
-#)
-#from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.retrievers import BM25Retriever
+
 from rag_app.knowledge_base.create_embedding import create_embeddings
 from rag_app.utils.generate_summary import generate_description, generate_keywords
+
 import time
 import os
-#from dotenv import load_dotenv
 
 def build_vector_store(
     docs: list,
@@ -32,11 +27,6 @@ def build_vector_store(
     FAISS_INDEX_PATH = db_path
 
     embeddings,chunks = create_embeddings(docs, chunk_size, chunk_overlap, embedding_model)
-    # for chunk in chunks:
-    #     keywords=generate_keywords(chunk)
-    #     description=generate_description(chunk)
-    #     chunk.metadata['keywords']=keywords
-    #     chunk.metadata['description']=description
 
     #load chunks into vector store
     print(f'Loading chunks into faiss vector store ...')
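Note on the second hunk: the create_embeddings call kept there still passes embedding_model as a fourth positional argument, while the refactored create_embeddings in the next file drops that parameter, so the call as committed would raise a TypeError. A minimal sketch of the call site matched to the new three-parameter signature (not part of this commit):

    # Hypothetical follow-up, assuming create_embeddings now reads the model
    # name from config.EMBEDDING_MODEL and takes only three parameters.
    embeddings, chunks = create_embeddings(docs, chunk_size, chunk_overlap)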
rag_app/knowledge_base/create_embedding.py CHANGED
@@ -1,21 +1,16 @@
 # embeddings functions
-#from langchain_community.vectorstores import FAISS
-#from langchain_community.document_loaders import ReadTheDocsLoader
-#from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-# from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.embeddings.sentence_transformer import (
     SentenceTransformerEmbeddings,
 )
 import time
 from langchain_core.documents import Document
-
+from config import EMBEDDING_MODEL
 
 def create_embeddings(
     docs: list[Document],
     chunk_size:int = 500,
     chunk_overlap:int = 50,
-    embedding_model: str = "sentence-transformers/multi-qa-mpnet-base-dot-v1",
 ):
     """given a sequence of `Document` objects this function will
     generate embeddings for it.
@@ -47,8 +42,7 @@ def create_embeddings(
     print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
 
     #Stage two: embed the docs.
-    #embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
-    embeddings = SentenceTransformerEmbeddings(model_name=embedding_model)
+    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
     print(f"created a total of {len(chunks)} chunks")
 
     return embeddings,chunks
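The config module imported above is not touched by this commit, so its contents are an assumption. A minimal sketch of what create_embedding.py now relies on, reusing the default model name that was removed from the old signature:

    # config.py -- hypothetical sketch; the real module is not shown in this
    # commit. EMBEDDING_MODEL presumably carries the name that used to be the
    # default of create_embeddings' embedding_model parameter.
    EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"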
rag_app/knowledge_base/reranking.py CHANGED
@@ -1,4 +1,3 @@
-# from get_db_retriever import get_db_retriever
 from pathlib import Path
 from langchain_community.vectorstores import FAISS
 from dotenv import load_dotenv
@@ -11,29 +10,36 @@ from langchain_community.vectorstores import Chroma
 load_dotenv()
 
 
-def get_reranked_docs_faiss(query:str,
-                            path_to_db:str,
-                            embedding_model:str,
-                            hf_api_key:str,
-                            num_docs:int=5) -> list:
+def get_reranked_docs_faiss(
+    query:str,
+    path_to_db:str,
+    embedding_model:str,
+    hf_api_key:str,
+    num_docs:int=5
+) -> list:
     """ Re-ranks the similarity search results and returns top-k highest ranked docs
 
-    Args:
-        query (str): The search query
-        path_to_db (str): Path to the vectorstore database
-        embedding_model (str): Embedding model used in the vector store
-        num_docs (int): Number of documents to return
-
-    Returns: A list of documents with the highest rank
+    Args:
+        query (str): The search query
+        path_to_db (str): Path to the vectorstore database
+        embedding_model (str): Embedding model used in the vector store
+        num_docs (int): Number of documents to return
+
+    Returns: A list of documents with the highest rank
     """
     assert num_docs <= 10, "num_docs should be less than similarity search results"
 
-    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
-                                                   model_name=embedding_model)
+    embeddings = HuggingFaceInferenceAPIEmbeddings(
+        api_key=hf_api_key,
+        model_name=embedding_model
+    )
+
     # Load the vectorstore database
-    db = FAISS.load_local(folder_path=path_to_db,
-                          embeddings=embeddings,
-                          allow_dangerous_deserialization=True)
+    db = FAISS.load_local(
+        folder_path=path_to_db,
+        embeddings=embeddings,
+        allow_dangerous_deserialization=True
+    )
 
     # Get 10 documents based on similarity search
     docs = db.similarity_search(query=query, k=10)
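The changes here are behavior-preserving: the signature, the HuggingFaceInferenceAPIEmbeddings constructor, and the FAISS.load_local call are only re-wrapped with one argument per line. A usage sketch of the function as it stands after this commit; the query, database path, model name, and environment variable are illustrative assumptions, not values from the repository:

    import os
    from rag_app.knowledge_base.reranking import get_reranked_docs_faiss

    # All argument values below are hypothetical examples.
    top_docs = get_reranked_docs_faiss(
        query="What does the knowledge base cover?",
        path_to_db="path/to/faiss_index",
        embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
        hf_api_key=os.environ["HF_API_KEY"],  # assumed variable name
        num_docs=5,
    )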