WebashalarForML committed · verified
Commit 263ccc7 · 1 Parent(s): 5f2ea7c

Update retrival.py

Files changed (1):
  1. retrival.py +358 -75
retrival.py CHANGED
@@ -1,17 +1,19 @@
  from langchain_community.document_loaders import DirectoryLoader
- from langchain.embeddings import HuggingFaceInstructEmbeddings,HuggingFaceEmbeddings # for embedding task
- from langchain.text_splitter import RecursiveCharacterTextSplitter # for converting the large documents into smaller chunks
- from langchain.schema import Document
  from langchain_core.documents import Document
- from langchain_openai import OpenAIEmbeddings
  from langchain_community.vectorstores import Chroma
- import openai
- import openai
  import os
  import shutil
  import uuid
- import asyncio # async
-

  # Configurations
  UPLOAD_FOLDER = "./uploads"
@@ -19,86 +21,375 @@ VECTOR_DB_FOLDER = "./VectorDB"
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
  os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)

-
  def load_document(data_path):

-     # Load documents
      loader = DirectoryLoader(data_path, glob="*.*")
-     print("loader",loader)
-     document = loader.load()
-     return document

- # Creating the chunks of Data from the knowledge
  def split_text(documents: list[Document]):
      text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size = 1000,
-         chunk_overlap = 500,
-         length_function = len,
          add_start_index=True,
-     )
-     chunks = text_splitter.split_documents(documents)
      print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
-
      return chunks

- # # Chroma for creating the vector db whcch we will use for the searching relvant data.
- # def save_to_chroma(chunks: list[Document],name: str):
- #     print
- #     CHROMA_PATH = f"./VectorDB/chroma_{name}"
- #     # Clear out the database first.
- #     if os.path.exists(CHROMA_PATH):
- #         shutil.rmtree(CHROMA_PATH)
-
- #     # Initialize SBERT embedding function
- #     embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
- #     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
-
- #     # Add documents and persist the database
- #     db.add_documents(chunks)
- #     db.persist()
- #     # Return the database instance or a success status
- #     return db

- def save_to_chroma(chunks: list[Document], name: str):
      CHROMA_PATH = f"./VectorDB/chroma_{name}"
-
-     # Clear out the database first
      if os.path.exists(CHROMA_PATH):
          shutil.rmtree(CHROMA_PATH)
-
      try:
-         # Initialize SBERT embedding function
          embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
          db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
-
-         # Add documents and persist the database
-         print("Adding documents to the database...")
          db.add_documents(chunks)
-         print("Persisting the database...")
          db.persist()
-         print("Database successfully saved.")
-
-         return db
      except Exception as e:
-         print("Error while saving to Chroma:", e)
-         return None
-
- def get_unique_sources(chroma_path):
-     # Load the Chroma database
-     db = Chroma(persist_directory=chroma_path)
-
-     # Retrieve all metadata from the database
-     metadata_list = db.get()['metadatas']
-
-     # Extract unique sources from metadata
-     unique_sources = {metadata['source'] for metadata in metadata_list if 'source' in metadata}
-     return list(unique_sources)
-
- def generate_data_store(file_path,db_name):
      CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
-     print(f"filepath===>{file_path} db_name =====>{db_name}")
      try:
-         documents = load_document(file_path)
          print("Documents loaded successfully.")
      except Exception as e:
          print(f"Error loading documents: {e}")
@@ -112,17 +403,9 @@ def generate_data_store(file_path,db_name):
          return

      try:
-         asyncio.run(save_to_chroma(chunks, db_name))
          print(f"Data saved to Chroma for database {db_name}.")
      except Exception as e:
          print(f"Error saving to Chroma: {e}")
          return
- # def main():
- #     data_path = "H:\\DEV PATEL\\RAG Project\\data1"
- #     db_name = "Product_data"
- #     generate_data_store(data_path,db_name)
-
- # if __name__ == "__main__":
- #     main()
-
 
  from langchain_community.document_loaders import DirectoryLoader
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.schema import Document
  from langchain_core.documents import Document
  from langchain_community.vectorstores import Chroma
  import os
  import shutil
+ import asyncio
+ from unstructured.partition.pdf import partition_pdf
+ from unstructured.partition.auto import partition
+ import pytesseract
+ import re
+ from collections import defaultdict  # needed for grouping text by doc_id in load_document
  import uuid
+ pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

  # Configurations
  UPLOAD_FOLDER = "./uploads"
  VECTOR_DB_FOLDER = "./VectorDB"
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
  os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)

+ ########################################################################################################################################################
+ ####-------------------------------------------------------------- Document Loader ----------------------------------------------------------------####
+ ########################################################################################################################################################
+ # Loader for extracting document text, tables, and images from any supported file format.
+ #data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
  def load_document(data_path):
+     processed_documents = []
+     element_content = []
+     table_document = []
+     # PDFs get their own processing path; every other format falls back to auto partitioning
+     for root, _, files in os.walk(data_path):
+         for file in files:
+             file_path = os.path.join(root, file)
+             doc_id = str(uuid.uuid4())  # Generate a unique ID for the document
+
+             print(f"Processing document ID: {doc_id}, Path: {file_path}")
+
+             try:
+                 # Determine the file type based on extension
+                 filename, file_extension = os.path.splitext(file.lower())
+                 image_output = f"H:/DEV PATEL/2025/RAG Project/Images/{filename}/"
+                 # Use specific partition techniques based on file extension
+                 if file_extension == ".pdf":
+                     elements = partition_pdf(
+                         filename=file_path,
+                         strategy="hi_res",  # Use layout detection
+                         infer_table_structure=True,
+                         hi_res_model_name="yolox",
+                         extract_images_in_pdf=True,
+                         extract_image_block_types=["Image", "Table"],
+                         extract_image_block_output_dir=image_output,
+                         show_progress=True,
+                         #chunking_strategy="by_title",
+                     )
+                 else:
+                     # Default to auto partition if no specific handler is found
+                     elements = partition(
+                         filename=file_path,
+                         strategy="hi_res",
+                         infer_table_structure=True,
+                         show_progress=True,
+                         #chunking_strategy="by_title"
+                     )
+             except Exception as e:
+                 print(f"Failed to process document {file_path}: {e}")
+                 continue
+             categorized_content = {
+                 "tables": {"content": [], "Metadata": []},
+                 "images": {"content": [], "Metadata": []},
+                 "text": {"content": [], "Metadata": []},
+                 "text2": {"content": [], "Metadata": []}
+             }
+             element_content.append(elements)
+             CNT = 1  # running table counter for this document
+             for chunk in elements:
+                 # Safely extract metadata and text
+                 chunk_type = str(type(chunk))
+                 chunk_metadata = chunk.metadata.to_dict() if chunk.metadata else {}
+                 chunk_text = getattr(chunk, "text", None)
+
+                 # Separate content into categories
+                 #if "Table" in chunk_type:
+                 if any(
+                     keyword in chunk_type
+                     for keyword in [
+                         "Table",
+                         "TableChunk"]):
+                     categorized_content["tables"]["content"].append(chunk_text)
+                     categorized_content["tables"]["Metadata"].append(chunk_metadata)
+
+                     TABLE_DATA = f"Table number {CNT} " + chunk_metadata.get("text_as_html", "") + " "
+                     CNT += 1
+                     categorized_content["text"]["content"].append(TABLE_DATA)
+                     categorized_content["text"]["Metadata"].append(chunk_metadata)
+
+                 elif "Image" in chunk_type:
+                     categorized_content["images"]["content"].append(chunk_text)
+                     categorized_content["images"]["Metadata"].append(chunk_metadata)
+                 elif any(
+                     keyword in chunk_type
+                     for keyword in [
+                         "CompositeElement",
+                         "Text",
+                         "NarrativeText",
+                         "Title",
+                         "Header",
+                         "Footer",
+                         "FigureCaption",
+                         "ListItem",
+                         "UncategorizedText",
+                         "Formula",
+                         "CodeSnippet",
+                         "Address",
+                         "EmailAddress",
+                         "PageBreak",
+                     ]
+                 ):
+                     categorized_content["text"]["content"].append(chunk_text)
+                     categorized_content["text"]["Metadata"].append(chunk_metadata)
+
+                 else:
+                     continue
+             # Append processed document
+             processed_documents.append({
+                 "doc_id": doc_id,
+                 "source": file_path,
+                 **categorized_content,
+             })
+
+     # Loop over tables and match text from the same document and page
+     for doc in processed_documents:
+         cnt = 1  # running count of the tables in this document
+         for table_metadata in doc.get("tables", {}).get("Metadata", []):
+             page_number = table_metadata.get("page_number")
+             source = doc.get("source")
+             page_content = ""
+
+             for text_metadata, text_content in zip(
+                 doc.get("text", {}).get("Metadata", []),
+                 doc.get("text", {}).get("content", [])
+             ):
+                 page_number2 = text_metadata.get("page_number")
+                 source2 = doc.get("source")
+
+                 if source == source2 and page_number == page_number2:
+                     print(f"Matching text found for source: {source}, page: {page_number}")
+                     page_content += f"{text_content} "  # Concatenate text with a space
+
+             # Add the matched content to the table metadata
+             table_metadata["page_content"] = f"Table number {cnt} " + table_metadata.get("text_as_html", "") + " " + page_content.strip()  # table HTML plus the text of the same page, trailing spaces removed
+             table_metadata["text_as_html"] = table_metadata.get("text_as_html", "")  # the raw HTML is also stored separately
+             table_metadata["Table_number"] = cnt  # the table number is used later during retrieval
+             cnt += 1
+
+             # Build one Document per table, storing the table together with the text from the same page
+             unique_id = str(uuid.uuid4())
+             table_document.append(
+                 Document(
+                     id=unique_id,  # unique ID for this table document
+                     page_content=table_metadata.get("page_content", ""),  # default to empty string if missing
+                     metadata={
+                         "source": doc["source"],
+                         "text_as_html": table_metadata.get("text_as_html", ""),
+                         "filetype": table_metadata.get("filetype", ""),
+                         "page_number": str(table_metadata.get("page_number", 0)),  # Default to 0 if missing
+                         "image_path": table_metadata.get("image_path", ""),
+                         "file_directory": table_metadata.get("file_directory", ""),
+                         "filename": table_metadata.get("filename", ""),
+                         "Table_number": str(table_metadata.get("Table_number", 0))  # Default to 0 if missing
+                     }
+                 )
+             )
+
+     # Initialize a structure to group content by doc_id
+     grouped_by_doc_id = defaultdict(lambda: {
+         "text_content": [],
+         "metadata": None,  # Metadata will only be set once per doc_id
+     })
+
+     for doc in processed_documents:
+         doc_id = doc.get("doc_id")
+         source = doc.get("source")
+         text_content = doc.get("text", {}).get("content", [])
+         metadata_list = doc.get("text", {}).get("Metadata", [])
+
+         # Merge text content
+         grouped_by_doc_id[doc_id]["text_content"].extend(text_content)
+
+         # Set metadata (if not already set)
+         if grouped_by_doc_id[doc_id]["metadata"] is None and metadata_list:
+             metadata = metadata_list[0]  # Assuming metadata is consistent
+             grouped_by_doc_id[doc_id]["metadata"] = {
+                 "source": source,
+                 "filetype": metadata.get("filetype"),
+                 "file_directory": metadata.get("file_directory"),
+                 "filename": metadata.get("filename"),
+                 "languages": str(metadata.get("languages")),
+             }
+
+     # Convert grouped content into Document objects
+     grouped_documents = []
+     for doc_id, data in grouped_by_doc_id.items():
+         grouped_documents.append(
+             Document(
+                 id=doc_id,
+                 page_content=" ".join(data["text_content"]).strip(),
+                 metadata=data["metadata"],
+             )
+         )
+
+     # Output the grouped documents
+     for document in grouped_documents:
+         print(document)

+     # DirectoryLoader for loading the plain text data into the main DB
      loader = DirectoryLoader(data_path, glob="*.*")
+     documents = loader.load()
+
+     # update the metadata, adding the filename
+     for doc in documents:
+         unique_id = str(uuid.uuid4())
+         doc.id = unique_id
+         path = doc.metadata.get("source")
+         match = re.search(r'([^\\]+\.[^\\]+)$', path)
+         doc.metadata.update({"filename": match.group(1)})
+
+     return documents, grouped_documents, table_document  # table_document feeds the table vector DB in save_to_chroma
+ #documents,processed_documents,table_document = load_document(data_path)
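# A minimal usage sketch, assuming a folder of uploaded files at "./uploads/demo"; the helper
# name and path below are placeholders, not names defined elsewhere in this file.
def _demo_load_document(data_path="./uploads/demo"):
    documents, grouped_documents, table_documents = load_document(data_path)
    print(f"{len(documents)} raw documents, {len(grouped_documents)} grouped documents, "
          f"{len(table_documents)} table documents")
    return documents, grouped_documents, table_documents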
+
+
+ ########################################################################################################################################################
+ ####-------------------------------------------------------------- Chunking the Text --------------------------------------------------------------####
+ ########################################################################################################################################################

  def split_text(documents: list[Document]):
      text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=500,
+         length_function=len,
          add_start_index=True,
+     )
+     chunks = text_splitter.split_documents(documents)  # split the documents into chunks
+     for index in chunks:
+         index.metadata["start_index"] = str(index.metadata["start_index"])  # start_index is converted from int to str so it can be stored in sqlite3
      print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
      return chunks
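# A small sketch of the splitter in isolation, using a throwaway Document; it shows that the
# source metadata is kept and that start_index ends up stored as a string. The helper name and
# sample text are placeholders.
def _demo_split_text():
    sample = [Document(page_content="word " * 600, metadata={"source": "sample.txt"})]
    sample_chunks = split_text(sample)
    print(sample_chunks[0].metadata["source"], type(sample_chunks[0].metadata["start_index"]))
    return sample_chunks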

+ ########################################################################################################################################################
+ ####---------------------------------------------------- Creating and Storing Data in Vector DB ---------------------------------------------------####
+ ########################################################################################################################################################

+ def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
      CHROMA_PATH = f"./VectorDB/chroma_{name}"
+     TABLE_PATH = f"./TableDB/chroma_{name}"
      if os.path.exists(CHROMA_PATH):
          shutil.rmtree(CHROMA_PATH)
+     if os.path.exists(TABLE_PATH):
+         shutil.rmtree(TABLE_PATH)
+
+     try:
+         # Load the embedding model
+         #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+         embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+         # Create the Chroma DB for documents using from_documents [NOTE: some metadata is stored as strings because int and float values show up as null]
+         print("Creating document vector database...")
+         db = Chroma.from_documents(
+             documents=chunks,
+             embedding=embedding_function,
+             persist_directory=CHROMA_PATH,
+         )
+         print("Document database successfully saved.")
+
+         # Create the Chroma DB for tables, if any [same NOTE as above]
+         if tables:
+             print("Creating table vector database...")
+             tdb = Chroma.from_documents(
+                 documents=tables,
+                 embedding=embedding_function,
+                 persist_directory=TABLE_PATH,
+             )
+             print("Table database successfully saved.")
+         else:
+             tdb = None
+
+         return db, tdb
+     except Exception as e:
+         print("Error while saving to Chroma:", e)
+         return None, None
+
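# A minimal end-to-end sketch for the save step; the folder path, DB name, and query string are
# placeholders. split_text() feeds the document store and the table documents feed the table store.
def _demo_build_db(data_path="./uploads/demo", db_name="demo"):
    documents, grouped_documents, table_documents = load_document(data_path)
    chunks = split_text(documents)
    db, tdb = save_to_chroma(chunks, db_name, table_documents)
    if db is not None:
        print(db.similarity_search("What is this document about?", k=2))
    return db, tdb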
+ # def get_unique_sources(chroma_path):
+ #     db = Chroma(persist_directory=chroma_path)
+ #     metadata_list = db.get()["metadatas"]
+ #     unique_sources = {metadata["source"] for metadata in metadata_list if "source" in metadata}
+ #     return list(unique_sources)
+
+ ########################################################################################################################################################
+ ####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
+ ########################################################################################################################################################
+
+ def add_document_to_existing_db(new_documents: list[Document], db_name: str):
+     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+
+     if not os.path.exists(CHROMA_PATH):
+         print(f"Database '{db_name}' does not exist. Please create it first.")
+         return
+
      try:
          embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+         #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
          db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
+
+         print("Adding new documents to the existing database...")
+         chunks = split_text(new_documents)
          db.add_documents(chunks)
          db.persist()
+         print("New documents added and database updated successfully.")
      except Exception as e:
+         print("Error while adding documents to existing database:", e)
+
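# A sketch of extending an existing store; the paths and DB name are placeholders. Note that this
# function embeds with all-MiniLM-L6-v2 while save_to_chroma() uses mixedbread-ai/mxbai-embed-large-v1,
# so additions would land in a different embedding space unless the two models are aligned.
def _demo_extend_db(data_path="./uploads/extra", db_name="demo"):
    new_documents, _grouped, _tables = load_document(data_path)
    add_document_to_existing_db(new_documents, db_name)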
+ def delete_chunks_by_source(chroma_path, source_to_delete):
+     if not os.path.exists(chroma_path):
+         print(f"Database at path '{chroma_path}' does not exist.")
+         return
+
+     try:
+         #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+         embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+         db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)
+
+         print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
+         stored = db.get()
+         metadata_list = stored["metadatas"]
+
+         # Identify the ids of chunks to delete (Chroma deletes by id, not by positional index)
+         ids_to_delete = [
+             chunk_id for chunk_id, metadata in zip(stored["ids"], metadata_list) if metadata.get("source") == source_to_delete
+         ]
+
+         if not ids_to_delete:
+             print(f"No chunks found with source '{source_to_delete}'.")
+             return
+
+         print(f"Deleting {len(ids_to_delete)} chunks with source '{source_to_delete}'...")
+         db.delete(ids=ids_to_delete)
+         db.persist()
+         print("Chunks deleted and database updated successfully.")
+     except Exception as e:
+         print(f"Error while deleting chunks by source: {e}")
+
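# A sketch of removing one file's chunks from a store; both arguments are placeholders, and the
# source string must match the "source" value recorded in the chunk metadata at indexing time.
def _demo_forget_file(db_name="demo", source="./uploads/demo/report.pdf"):
    delete_chunks_by_source(f"./VectorDB/chroma_{db_name}", source)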
+ # # update a data store
+ # def update_data_store(file_path, db_name):
+ #     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+ #     print(f"Filepath ===> {file_path} DB Name ====> {db_name}")
+
+ #     try:
+ #         documents, table_document = load_document(file_path)
+ #         print("Documents loaded successfully.")
+ #     except Exception as e:
+ #         print(f"Error loading documents: {e}")
+ #         return
+
+ #     try:
+ #         chunks = split_text(documents)
+ #         print(f"Text split into {len(chunks)} chunks.")
+ #     except Exception as e:
+ #         print(f"Error splitting text: {e}")
+ #         return
+
+ #     try:
+ #         save_to_chroma(chunks, db_name, table_document)
+ #         print(f"Data saved to Chroma for database {db_name}.")
+ #     except Exception as e:
+ #         print(f"Error saving to Chroma: {e}")
+ #         return
+
+ ########################################################################################################################################################
+ ####------------------------------------------------------ Combined Process of Load, Chunk and Store ----------------------------------------------####
+ ########################################################################################################################################################
+
+ def generate_data_store(file_path, db_name):
      CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+     print(f"Filepath ===> {file_path} DB Name ====> {db_name}")
+
      try:
+         documents, processed_documents, table_document = load_document(file_path)
          print("Documents loaded successfully.")
      except Exception as e:
          print(f"Error loading documents: {e}")

          return

      try:
+         save_to_chroma(chunks, db_name, table_document)
          print(f"Data saved to Chroma for database {db_name}.")
      except Exception as e:
          print(f"Error saving to Chroma: {e}")
          return
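A minimal driver in the spirit of the main() block that the previous version kept commented out; the data path below is a placeholder, and "Product_data" reuses the old example name.

if __name__ == "__main__":
    data_path = "./uploads/demo"
    db_name = "Product_data"
    generate_data_store(data_path, db_name)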