updating constants and refactoring functions
- config.py +1 -0
- rag_app/knowledge_base/create_embedding.py +0 -48
- rag_app/knowledge_base/utils.py +50 -0
config.py
CHANGED
@@ -10,6 +10,7 @@ VECTOR_DATABASE_LOCATION = os.getenv('VECTOR_DATABASE_LOCATION')
 EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
 SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
 BERT_MODEL = os.getenv("BERT_MODEL")
+FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
 
 
 db = DataBaseHandler()
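The commit only introduces the constant, so for context here is a minimal, hypothetical sketch of how FAISS_INDEX_PATH might be consumed when reloading a persisted index. The load_vector_store helper is an assumption, not code from this Space; FAISS.load_local and its allow_dangerous_deserialization flag are standard langchain_community APIs.

# Hypothetical consumer of the new constant; not part of this commit.
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import FAISS

from config import EMBEDDING_MODEL, FAISS_INDEX_PATH


def load_vector_store() -> FAISS:
    """Load the FAISS index persisted under FAISS_INDEX_PATH."""
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
    # Recent LangChain releases require opting in to pickle deserialization
    # when reading an index back from local disk.
    return FAISS.load_local(
        FAISS_INDEX_PATH,
        embeddings,
        allow_dangerous_deserialization=True,
    )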
rag_app/knowledge_base/create_embedding.py
DELETED
@@ -1,48 +0,0 @@
-# embeddings functions
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.embeddings.sentence_transformer import (
-    SentenceTransformerEmbeddings,
-)
-import time
-from langchain_core.documents import Document
-from config import EMBEDDING_MODEL
-
-def create_embeddings(
-    docs: list[Document],
-    chunk_size: int = 500,
-    chunk_overlap: int = 50,
-):
-    """given a sequence of `Document` objects this function will
-    generate embeddings for it.
-
-    ## argument
-    :params docs (list[Document]) -> the list of `Document` objects to embed
-    :params chunk_size (int) -> the chunk size into which documents are split, defaults to 500
-    :params chunk_overlap (int) -> the number of tokens that overlap between chunks, defaults to 50
-    :params embedding_model (str) -> the Hugging Face model that will embed the documents
-    ## Return
-    Tuple of embedding and chunks
-    """
-
-
-    text_splitter = RecursiveCharacterTextSplitter(
-        separators=["\n\n", "\n", "(?<=\. )", " ", ""],
-        chunk_size=chunk_size,
-        chunk_overlap=chunk_overlap,
-        length_function=len,
-    )
-
-    # Stage one: read all the docs, split them into chunks.
-    st = time.time()
-    print('Loading documents and creating chunks ...')
-
-    # Split each document into chunks using the configured text splitter
-    chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
-    et = time.time() - st
-    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
-
-    # Stage two: embed the docs.
-    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
-    print(f"created a total of {len(chunks)} chunks")
-
-    return embeddings, chunks
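The deleted helper reappears unchanged in rag_app/knowledge_base/utils.py below. One detail worth illustrating before the moved code: RecursiveCharacterTextSplitter treats its separators as literal strings unless is_separator_regex=True is passed, so the lookbehind pattern "(?<=\. )" in the list above is matched literally rather than as a regex. A small self-contained demonstration, with made-up sample text and sizes:

# Demonstration only; the sample text and chunk sizes are made up.
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    # ". " is used here as a literal sentence separator; pass
    # is_separator_regex=True to make patterns like "(?<=\. )" effective.
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

text = (
    "Embeddings map text to vectors.\n\n"
    "This second paragraph is deliberately long enough that the splitter "
    "has to break it into more than one chunk of at most one hundred characters."
)

# Each chunk is at most 100 characters, with up to 20 characters of overlap.
for i, chunk in enumerate(splitter.split_text(text)):
    print(i, len(chunk), repr(chunk))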
rag_app/knowledge_base/utils.py
CHANGED
@@ -1,6 +1,56 @@
 from langchain_core.documents import Document
 from chains import generate_document_summary_prompt
 from config import SEVEN_B_LLM_MODEL
+# embeddings functions
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+import time
+from langchain_core.documents import Document
+from config import EMBEDDING_MODEL
+from langchain.retrievers import VectorStoreRetriever
+from langchain_core.vectorstores import VectorStoreRetriever
+
+def create_embeddings(
+    docs: list[Document],
+    chunk_size: int = 500,
+    chunk_overlap: int = 50,
+):
+    """given a sequence of `Document` objects this function will
+    generate embeddings for it.
+
+    ## argument
+    :params docs (list[Document]) -> the list of `Document` objects to embed
+    :params chunk_size (int) -> the chunk size into which documents are split, defaults to 500
+    :params chunk_overlap (int) -> the number of tokens that overlap between chunks, defaults to 50
+    :params embedding_model (str) -> the Hugging Face model that will embed the documents
+    ## Return
+    Tuple of embedding and chunks
+    """
+
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        separators=["\n\n", "\n", "(?<=\. )", " ", ""],
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len,
+    )
+
+    # Stage one: read all the docs, split them into chunks.
+    st = time.time()
+    print('Loading documents and creating chunks ...')
+
+    # Split each document into chunks using the configured text splitter
+    chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
+    et = time.time() - st
+    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
+
+    # Stage two: embed the docs.
+    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
+    print(f"created a total of {len(chunks)} chunks")
+
+    return embeddings, chunks
 
 
 def generate_document_summaries(
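Finally, a hedged end-to-end sketch of how the relocated create_embeddings and the new FAISS_INDEX_PATH constant could fit together. The sample Document and the k=2 retriever setting are illustrative assumptions, while FAISS.from_documents, save_local, and as_retriever are standard langchain_community APIs. Note that the two VectorStoreRetriever imports added to utils.py bind the same name, so the second shadows the first; neither is used by create_embeddings itself.

# Hypothetical end-to-end usage; not part of this commit.
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS

from config import FAISS_INDEX_PATH
from rag_app.knowledge_base.utils import create_embeddings

docs = [
    Document(
        page_content="FAISS is a library for efficient similarity search.",
        metadata={"source": "notes.md"},
    ),
]

# create_embeddings returns the embedding function plus the chunked
# documents; the chunks still have to be indexed before retrieval.
embeddings, chunks = create_embeddings(docs, chunk_size=500, chunk_overlap=50)

vector_store = FAISS.from_documents(chunks, embeddings)
vector_store.save_local(FAISS_INDEX_PATH)

# Wrap the index in a retriever and run a query against it.
retriever = vector_store.as_retriever(search_kwargs={"k": 2})
print(retriever.invoke("What is FAISS?"))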