WebashalarForML committed · verified
Commit 894171e · 1 Parent(s): c51231e

Update retrival.py
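(This revision makes save_to_chroma a synchronous function, adds import asyncio, and has generate_data_store invoke it through asyncio.run(save_to_chroma(chunks, db_name)).)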

Files changed (1)
  1. retrival.py +128 -127
retrival.py CHANGED
@@ -1,127 +1,128 @@
- from langchain_community.document_loaders import DirectoryLoader
- from langchain.embeddings import HuggingFaceInstructEmbeddings,HuggingFaceEmbeddings # for embedding task
- from langchain.text_splitter import RecursiveCharacterTextSplitter # for converting the large documents into smaller chunks
- from langchain.schema import Document
- from langchain_core.documents import Document
- from langchain_openai import OpenAIEmbeddings
- from langchain_community.vectorstores import Chroma
- import openai
- import openai
- import os
- import shutil
- import uuid
-
-
- # Configurations
- UPLOAD_FOLDER = "./uploads"
- VECTOR_DB_FOLDER = "./VectorDB"
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
- os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
-
-
- def load_document(data_path):
-
-     # Load documents
-     loader = DirectoryLoader(data_path, glob="*.*")
-     print("loader",loader)
-     document = loader.load()
-     return document
-
- # Creating the chunks of Data from the knowledge
- def split_text(documents: list[Document]):
-     text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size = 1000,
-         chunk_overlap = 500,
-         length_function = len,
-         add_start_index=True,
-     )
-     chunks = text_splitter.split_documents(documents)
-     print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
-
-     return chunks
-
- # # Chroma for creating the vector db whcch we will use for the searching relvant data.
- # def save_to_chroma(chunks: list[Document],name: str):
- #     print
- #     CHROMA_PATH = f"./VectorDB/chroma_{name}"
- #     # Clear out the database first.
- #     if os.path.exists(CHROMA_PATH):
- #         shutil.rmtree(CHROMA_PATH)
-
- #     # Initialize SBERT embedding function
- #     embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
- #     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
-
- #     # Add documents and persist the database
- #     db.add_documents(chunks)
- #     db.persist()
- #     # Return the database instance or a success status
- #     return db
-
- async def save_to_chroma(chunks: list[Document], name: str):
-     CHROMA_PATH = f"./VectorDB/chroma_{name}"
-
-     # Clear out the database first
-     if os.path.exists(CHROMA_PATH):
-         shutil.rmtree(CHROMA_PATH)
-
-     try:
-         # Initialize SBERT embedding function
-         embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
-
-         # Add documents and persist the database
-         print("Adding documents to the database...")
-         db.add_documents(chunks)
-         print("Persisting the database...")
-         db.persist()
-         print("Database successfully saved.")
-
-         return db
-     except Exception as e:
-         print("Error while saving to Chroma:", e)
-         return None
-
- def get_unique_sources(chroma_path):
-     # Load the Chroma database
-     db = Chroma(persist_directory=chroma_path)
-
-     # Retrieve all metadata from the database
-     metadata_list = db.get()['metadatas']
-
-     # Extract unique sources from metadata
-     unique_sources = {metadata['source'] for metadata in metadata_list if 'source' in metadata}
-     return list(unique_sources)
-
- def generate_data_store(file_path,db_name):
-     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
-     print(f"filepath===>{file_path} db_name =====>{db_name}")
-     try:
-         documents = load_document(file_path)
-         print("Documents loaded successfully.")
-     except Exception as e:
-         print(f"Error loading documents: {e}")
-         return
-
-     try:
-         chunks = split_text(documents)
-         print(f"Text split into {len(chunks)} chunks.")
-     except Exception as e:
-         print(f"Error splitting text: {e}")
-         return
-
-     try:
-         save_to_chroma(chunks, db_name)
-         print(f"Data saved to Chroma for database {db_name}.")
-     except Exception as e:
-         print(f"Error saving to Chroma: {e}")
-         return
- # def main():
- #     data_path = "H:\\DEV PATEL\\RAG Project\\data1"
- #     db_name = "Product_data"
- #     generate_data_store(data_path,db_name)
-
- # if __name__ == "__main__":
- #     main()
-
-
+ from langchain_community.document_loaders import DirectoryLoader
+ from langchain.embeddings import HuggingFaceInstructEmbeddings,HuggingFaceEmbeddings # for embedding task
+ from langchain.text_splitter import RecursiveCharacterTextSplitter # for converting the large documents into smaller chunks
+ from langchain.schema import Document
+ from langchain_core.documents import Document
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_community.vectorstores import Chroma
+ import openai
+ import openai
+ import os
+ import shutil
+ import uuid
+ import asyncio # async
+
+
+ # Configurations
+ UPLOAD_FOLDER = "./uploads"
+ VECTOR_DB_FOLDER = "./VectorDB"
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+ os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
+
+
+ def load_document(data_path):
+
+     # Load documents
+     loader = DirectoryLoader(data_path, glob="*.*")
+     print("loader",loader)
+     document = loader.load()
+     return document
+
+ # Creating the chunks of Data from the knowledge
+ def split_text(documents: list[Document]):
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size = 1000,
+         chunk_overlap = 500,
+         length_function = len,
+         add_start_index=True,
+     )
+     chunks = text_splitter.split_documents(documents)
+     print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
+
+     return chunks
+
+ # # Chroma for creating the vector db whcch we will use for the searching relvant data.
+ # def save_to_chroma(chunks: list[Document],name: str):
+ #     print
+ #     CHROMA_PATH = f"./VectorDB/chroma_{name}"
+ #     # Clear out the database first.
+ #     if os.path.exists(CHROMA_PATH):
+ #         shutil.rmtree(CHROMA_PATH)
+
+ #     # Initialize SBERT embedding function
+ #     embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+ #     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
+
+ #     # Add documents and persist the database
+ #     db.add_documents(chunks)
+ #     db.persist()
+ #     # Return the database instance or a success status
+ #     return db
+
+ def save_to_chroma(chunks: list[Document], name: str):
+     CHROMA_PATH = f"./VectorDB/chroma_{name}"
+
+     # Clear out the database first
+     if os.path.exists(CHROMA_PATH):
+         shutil.rmtree(CHROMA_PATH)
+
+     try:
+         # Initialize SBERT embedding function
+         embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
+
+         # Add documents and persist the database
+         print("Adding documents to the database...")
+         db.add_documents(chunks)
+         print("Persisting the database...")
+         db.persist()
+         print("Database successfully saved.")
+
+         return db
+     except Exception as e:
+         print("Error while saving to Chroma:", e)
+         return None
+
+ def get_unique_sources(chroma_path):
+     # Load the Chroma database
+     db = Chroma(persist_directory=chroma_path)
+
+     # Retrieve all metadata from the database
+     metadata_list = db.get()['metadatas']
+
+     # Extract unique sources from metadata
+     unique_sources = {metadata['source'] for metadata in metadata_list if 'source' in metadata}
+     return list(unique_sources)
+
+ def generate_data_store(file_path,db_name):
+     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+     print(f"filepath===>{file_path} db_name =====>{db_name}")
+     try:
+         documents = load_document(file_path)
+         print("Documents loaded successfully.")
+     except Exception as e:
+         print(f"Error loading documents: {e}")
+         return
+
+     try:
+         chunks = split_text(documents)
+         print(f"Text split into {len(chunks)} chunks.")
+     except Exception as e:
+         print(f"Error splitting text: {e}")
+         return
+
+     try:
+         asyncio.run(save_to_chroma(chunks, db_name))
+         print(f"Data saved to Chroma for database {db_name}.")
+     except Exception as e:
+         print(f"Error saving to Chroma: {e}")
+         return
+ # def main():
+ #     data_path = "H:\\DEV PATEL\\RAG Project\\data1"
+ #     db_name = "Product_data"
+ #     generate_data_store(data_path,db_name)
+
+ # if __name__ == "__main__":
+ #     main()
+
+
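For context, a minimal driver for the updated module, in the spirit of the commented-out main() at the end of the file (the data path and DB name below are placeholders, not part of the commit):

    # Hypothetical driver sketch for retrival.py; not part of this commit.
    from retrival import generate_data_store, get_unique_sources

    data_path = "./data1"      # placeholder: any folder of documents to index
    db_name = "Product_data"   # store is written to ./VectorDB/chroma_Product_data
    generate_data_store(data_path, db_name)

    # List the source files recorded in the new store's metadata.
    print(get_unique_sources(f"./VectorDB/chroma_{db_name}"))

One caveat about this revision: save_to_chroma is no longer declared async, so asyncio.run(save_to_chroma(chunks, db_name)) first executes the save synchronously and then hands its return value to asyncio.run, which expects a coroutine and will raise; the surrounding except in generate_data_store then reports a save error even though the documents were already written. If a non-blocking save is the goal, one option (an assumption about intent, not what the commit implements) is to call the synchronous function from async code via asyncio.to_thread(save_to_chroma, chunks, db_name) and await the result.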