updating constants and refactoring functions
- config.py +1 -0
- rag_app/knowledge_base/create_embedding.py +0 -48
- rag_app/knowledge_base/utils.py +50 -0
config.py
CHANGED
@@ -10,6 +10,7 @@ VECTOR_DATABASE_LOCATION = os.getenv('VECTOR_DATABASE_LOCATION')
 EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
 SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
 BERT_MODEL = os.getenv("BERT_MODEL")
+FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
 
 
 db = DataBaseHandler()
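The commit only introduces the constant, so for context here is a minimal, hypothetical sketch of how FAISS_INDEX_PATH might be consumed when reloading a persisted index. The load_vector_store helper is an assumption, not code from this Space; FAISS.load_local and its allow_dangerous_deserialization flag are standard langchain_community APIs.

# Hypothetical consumer of the new constant; not part of this commit.
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import FAISS

from config import EMBEDDING_MODEL, FAISS_INDEX_PATH


def load_vector_store() -> FAISS:
    """Load the FAISS index persisted under FAISS_INDEX_PATH."""
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
    # Recent LangChain releases require opting in to pickle deserialization
    # when reading an index back from local disk.
    return FAISS.load_local(
        FAISS_INDEX_PATH,
        embeddings,
        allow_dangerous_deserialization=True,
    )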
rag_app/knowledge_base/create_embedding.py
DELETED
@@ -1,48 +0,0 @@
-# embeddings functions
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.embeddings.sentence_transformer import (
-    SentenceTransformerEmbeddings,
-)
-import time
-from langchain_core.documents import Document
-from config import EMBEDDING_MODEL
-
-def create_embeddings(
-    docs: list[Document],
-    chunk_size: int = 500,
-    chunk_overlap: int = 50,
-):
-    """given a sequence of `Document` objects this function will
-    generate embeddings for it.
-
-    ## argument
-    :params docs (list[Document]) -> the list of `Document` objects to embed
-    :params chunk_size (int) -> the chunk size into which documents are split, defaults to 500
-    :params chunk_overlap (int) -> the number of tokens that overlap between chunks, defaults to 50
-    :params embedding_model (str) -> the Hugging Face model that will embed the documents
-    ## Return
-    Tuple of embedding and chunks
-    """
-
-
-    text_splitter = RecursiveCharacterTextSplitter(
-        separators=["\n\n", "\n", "(?<=\. )", " ", ""],
-        chunk_size=chunk_size,
-        chunk_overlap=chunk_overlap,
-        length_function=len,
-    )
-
-    # Stage one: read all the docs, split them into chunks.
-    st = time.time()
-    print('Loading documents and creating chunks ...')
-
-    # Split each document into chunks using the configured text splitter
-    chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
-    et = time.time() - st
-    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
-
-    # Stage two: embed the docs.
-    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
-    print(f"created a total of {len(chunks)} chunks")
-
-    return embeddings, chunks
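The deleted helper reappears unchanged in rag_app/knowledge_base/utils.py below. One detail worth illustrating before the moved code: RecursiveCharacterTextSplitter treats its separators as literal strings unless is_separator_regex=True is passed, so the lookbehind pattern "(?<=\. )" in the list above is matched literally rather than as a regex. A small self-contained demonstration, with made-up sample text and sizes:

# Demonstration only; the sample text and chunk sizes are made up.
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    # ". " is used here as a literal sentence separator; pass
    # is_separator_regex=True to make patterns like "(?<=\. )" effective.
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

text = (
    "Embeddings map text to vectors.\n\n"
    "This second paragraph is deliberately long enough that the splitter "
    "has to break it into more than one chunk of at most one hundred characters."
)

# Each chunk is at most 100 characters, with up to 20 characters of overlap.
for i, chunk in enumerate(splitter.split_text(text)):
    print(i, len(chunk), repr(chunk))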
rag_app/knowledge_base/utils.py
CHANGED
@@ -1,6 +1,56 @@
 from langchain_core.documents import Document
 from chains import generate_document_summary_prompt
 from config import SEVEN_B_LLM_MODEL
+# embeddings functions
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+import time
+from langchain_core.documents import Document
+from config import EMBEDDING_MODEL
+from langchain.retrievers import VectorStoreRetriever
+from langchain_core.vectorstores import VectorStoreRetriever
+
+def create_embeddings(
+    docs: list[Document],
+    chunk_size: int = 500,
+    chunk_overlap: int = 50,
+):
+    """given a sequence of `Document` objects this function will
+    generate embeddings for it.
+
+    ## argument
+    :params docs (list[Document]) -> the list of `Document` objects to embed
+    :params chunk_size (int) -> the chunk size into which documents are split, defaults to 500
+    :params chunk_overlap (int) -> the number of tokens that overlap between chunks, defaults to 50
+    :params embedding_model (str) -> the Hugging Face model that will embed the documents
+    ## Return
+    Tuple of embedding and chunks
+    """
+
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        separators=["\n\n", "\n", "(?<=\. )", " ", ""],
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len,
+    )
+
+    # Stage one: read all the docs, split them into chunks.
+    st = time.time()
+    print('Loading documents and creating chunks ...')
+
+    # Split each document into chunks using the configured text splitter
+    chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
+    et = time.time() - st
+    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
+
+    # Stage two: embed the docs.
+    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
+    print(f"created a total of {len(chunks)} chunks")
+
+    return embeddings, chunks
 
 
 def generate_document_summaries(
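Finally, a hedged end-to-end sketch of how the relocated create_embeddings and the new FAISS_INDEX_PATH constant could fit together. The sample Document and the k=2 retriever setting are illustrative assumptions, while FAISS.from_documents, save_local, and as_retriever are standard langchain_community APIs. Note that the two VectorStoreRetriever imports added to utils.py bind the same name, so the second shadows the first; neither is used by create_embeddings itself.

# Hypothetical end-to-end usage; not part of this commit.
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS

from config import FAISS_INDEX_PATH
from rag_app.knowledge_base.utils import create_embeddings

docs = [
    Document(
        page_content="FAISS is a library for efficient similarity search.",
        metadata={"source": "notes.md"},
    ),
]

# create_embeddings returns the embedding function plus the chunked
# documents; the chunks still have to be indexed before retrieval.
embeddings, chunks = create_embeddings(docs, chunk_size=500, chunk_overlap=50)

vector_store = FAISS.from_documents(chunks, embeddings)
vector_store.save_local(FAISS_INDEX_PATH)

# Wrap the index in a retriever and run a query against it.
retriever = vector_store.as_retriever(search_kwargs={"k": 2})
print(retriever.invoke("What is FAISS?"))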