isayahc committed on
Commit
5ffaf21
·
unverified ·
1 Parent(s): 5e5ca32

updating constants and refactoring functions

Browse files
config.py CHANGED
@@ -10,6 +10,7 @@ VECTOR_DATABASE_LOCATION = os.getenv('VECTOR_DATABASE_LOCATION')
10
  EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
11
  SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
12
  BERT_MODEL = os.getenv("BERT_MODEL")
 
13
 
14
 
15
  db = DataBaseHandler()
 
10
  EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
11
  SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
12
  BERT_MODEL = os.getenv("BERT_MODEL")
13
+ FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
14
 
15
 
16
  db = DataBaseHandler()
rag_app/knowledge_base/create_embedding.py DELETED
@@ -1,48 +0,0 @@
1
- # embeddings functions
2
- from langchain_text_splitters import RecursiveCharacterTextSplitter
3
- from langchain_community.embeddings.sentence_transformer import (
4
- SentenceTransformerEmbeddings,
5
- )
6
- import time
7
- from langchain_core.documents import Document
8
- from config import EMBEDDING_MODEL
9
-
10
def create_embeddings(
    docs: list[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
):
    """Split a sequence of `Document` objects into chunks and build the
    embedding function used to embed them.

    ## Arguments
    :param docs (list[Document]): documents to chunk and embed
    :param chunk_size (int): maximum chunk size used by the splitter, defaults to 500
    :param chunk_overlap (int): number of characters overlapped between
        consecutive chunks, defaults to 50

    ## Return
    Tuple of (embeddings, chunks)
    """
    text_splitter = RecursiveCharacterTextSplitter(
        # r-string: "(?<=\. )" is a regex lookbehind; without the raw prefix
        # "\." raises an invalid-escape SyntaxWarning on modern Python.
        separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    # Stage one: read all the docs, split them into chunks.
    st = time.time()
    print('Loading documents and creating chunks ...')

    # Split each document into chunks, carrying each document's metadata
    # over onto the chunks produced from it.
    chunks = text_splitter.create_documents(
        [doc.page_content for doc in docs],
        metadatas=[doc.metadata for doc in docs],
    )
    et = time.time() - st
    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')

    # Stage two: embed the docs.
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
    print(f"created a total of {len(chunks)} chunks")

    return embeddings, chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
rag_app/knowledge_base/utils.py CHANGED
@@ -1,6 +1,56 @@
1
  from langchain_core.documents import Document
2
  from chains import generate_document_summary_prompt
3
  from config import SEVEN_B_LLM_MODEL
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
 
6
  def generate_document_summaries(
 
1
  from langchain_core.documents import Document
2
  from chains import generate_document_summary_prompt
3
  from config import SEVEN_B_LLM_MODEL
4
+ # embeddings functions
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+ from langchain_community.embeddings.sentence_transformer import (
7
+ SentenceTransformerEmbeddings,
8
+ )
9
+ import time
10
+ from langchain_core.documents import Document
11
+ from config import EMBEDDING_MODEL
12
+ from langchain.retrievers import VectorStoreRetriever
13
+ from langchain_core.vectorstores import VectorStoreRetriever
14
+
15
def create_embeddings(
    docs: list[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
):
    """Split a sequence of `Document` objects into chunks and build the
    embedding function used to embed them.

    ## Arguments
    :param docs (list[Document]): documents to chunk and embed
    :param chunk_size (int): maximum chunk size used by the splitter, defaults to 500
    :param chunk_overlap (int): number of characters overlapped between
        consecutive chunks, defaults to 50

    ## Return
    Tuple of (embeddings, chunks)
    """
    text_splitter = RecursiveCharacterTextSplitter(
        # r-string: "(?<=\. )" is a regex lookbehind; without the raw prefix
        # "\." raises an invalid-escape SyntaxWarning on modern Python.
        separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    # Stage one: read all the docs, split them into chunks.
    st = time.time()
    print('Loading documents and creating chunks ...')

    # Split each document into chunks, carrying each document's metadata
    # over onto the chunks produced from it.
    chunks = text_splitter.create_documents(
        [doc.page_content for doc in docs],
        metadatas=[doc.metadata for doc in docs],
    )
    et = time.time() - st
    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')

    # Stage two: embed the docs.
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
    print(f"created a total of {len(chunks)} chunks")

    return embeddings, chunks
54
 
55
 
56
  def generate_document_summaries(