WebashalarForML committed
Commit 8ce0321 · verified · 1 Parent(s): 5eb0b04

Update retrival.py

Files changed (1):
  1. retrival.py +53 -51
retrival.py CHANGED
@@ -13,14 +13,14 @@ import pytesseract
 import os
 import re
 import uuid
+from langchain.schema import Document
 from collections import defaultdict
-
+pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
 pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
 
 # Configurations
 UPLOAD_FOLDER = "./uploads"
 VECTOR_DB_FOLDER = "./VectorDB"
-IMAGE_DB_FOLDER = "./Images"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
 
@@ -31,7 +31,7 @@ os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
 #data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
 def load_document(data_path):
     processed_documents = []
-    element_content = []
+    #element_content = []
     table_document = []
     #having different process for the pdf
     for root, _, files in os.walk(data_path):
@@ -44,7 +44,7 @@ def load_document(data_path):
             try:
                 # Determine the file type based on extension
                 filename, file_extension = os.path.splitext(file.lower())
-                image_output = f"./Images/{filename}/"
+                image_output = f"H:/DEV PATEL/2025/RAG Project/Images/{filename}/"
                 # Use specific partition techniques based on file extension
                 if file_extension == ".pdf":
                     elements = partition_pdf(
@@ -73,10 +73,9 @@ def load_document(data_path):
                 categorized_content = {
                     "tables": {"content": [], "Metadata": []},
                     "images": {"content": [], "Metadata": []},
-                    "text": {"content": [], "Metadata": []},
-                    "text2": {"content": [], "Metadata": []}
+                    "text": {"content": [], "Metadata": []},
                 }
-                element_content.append(elements)
+                #element_content.append(elements)
                 CNT=1
                 for chunk in elements:
                     # Safely extract metadata and text
@@ -136,7 +135,6 @@ def load_document(data_path):
 
     # Loop over tables and match text from the same document and page
 
-    '''
     for doc in processed_documents:
         cnt=1 # count for storing number of the table
         for table_metadata in doc.get("tables", {}).get("Metadata", []):
@@ -181,7 +179,6 @@ def load_document(data_path):
                     }
                 )
             )
-    '''
 
     # Initialize a structure to group content by doc_id
    grouped_by_doc_id = defaultdict(lambda: {
@@ -203,10 +200,10 @@ def load_document(data_path):
         metadata = metadata_list[0] # Assuming metadata is consistent
         grouped_by_doc_id[doc_id]["metadata"] = {
             "source": source,
-            "filetype": metadata.get("filetype"),
+            #"filetype": metadata.get("filetype"),
             "file_directory": metadata.get("file_directory"),
             "filename": metadata.get("filename"),
-            "languages": str(metadata.get("languages")),
+            #"languages": str(metadata.get("languages")),
         }
 
     # Convert grouped content into Document objects
@@ -221,12 +218,11 @@ def load_document(data_path):
         )
 
     # Output the grouped documents
-    for document in grouped_documents:
-        print(document)
+    # for document in grouped_documents:
+    #     print(document)
 
 
     #Dirctory loader for loading the text data only to specific db
-    '''
     loader = DirectoryLoader(data_path, glob="*.*")
     documents = loader.load()
 
@@ -237,9 +233,9 @@ def load_document(data_path):
         path=doc.metadata.get("source")
         match = re.search(r'([^\\]+\.[^\\]+)$', path)
         doc.metadata.update({"filename":match.group(1)})
-        return documents,
-    '''
-    return grouped_documents
+
+    return grouped_documents,documents,table_document
+    #grouped_documents = load_document(data_path)
 #documents,processed_documents,table_document = load_document(data_path)
 
 
@@ -249,8 +245,8 @@ def load_document(data_path):
 
 def split_text(documents: list[Document]):
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=1000,
-        chunk_overlap=500,
+        chunk_size=2000,
+        chunk_overlap=600,
         length_function=len,
         add_start_index=True,
     )
@@ -265,41 +261,47 @@ def split_text(documents: list[Document]):
 ########################################################################################################################################################
 
 #def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
-def save_to_chroma(chunks: list[Document], name: str):
+async def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
     CHROMA_PATH = f"./VectorDB/chroma_{name}"
-    #TABLE_PATH = f"./TableDB/chroma_{name}"
+    TABLE_PATH = f"./TableDB/chroma_{name}"
     if os.path.exists(CHROMA_PATH):
         shutil.rmtree(CHROMA_PATH)
-    # if os.path.exists(TABLE_PATH):
-    #     shutil.rmtree(TABLE_PATH)
+    if os.path.exists(TABLE_PATH):
+        shutil.rmtree(TABLE_PATH)
 
     try:
         # Load the embedding model
-        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2",show_progress=True)
         #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
         # Create Chroma DB for documents using from_documents [NOTE: Some of the data is converted to string because int and float show null if added]
         print("Creating document vector database...")
-        db = Chroma.from_documents(
-            documents=chunks,
-            embedding=embedding_function,
-            persist_directory=CHROMA_PATH,
-        )
+        db =Chroma.from_documents(
+            documents=chunks,
+            embedding=embedding_function,
+            persist_directory=CHROMA_PATH,
+
+        )
+
+        print("Persisting the document database...")
+        db.persist()
         print("Document database successfully saved.")
-
-        # # Create Chroma DB for tables if available [NOTE: Some of the data is converted to string because int and float show null if added]
-        # if tables:
-        #     print("Creating table vector database...")
-        #     tdb = Chroma.from_documents(
-        #         documents=tables,
-        #         embedding=embedding_function,
-        #         persist_directory=TABLE_PATH,
-        #     )
-        #     print("Table database successfully saved.")
-        # else:
-        #     tdb = None
-
-        #return db, tdb
-        return db
+
+        # Create Chroma DB for tables if available [NOTE: Some of the data is converted to string because int and float show null if added]
+        if tables !=[]:
+            print("Creating table vector database...")
+            tdb =Chroma.from_documents(
+                documents=tables,
+                embedding=embedding_function,
+                persist_directory=TABLE_PATH,
+            )
+            print("Persisting the table database...")
+            db.persist()
+            print("Table database successfully saved.")
+        else:
+            tdb = None
+
+        return db, tdb
+        #return db
 
     except Exception as e:
         print("Error while saving to Chroma:", e)
@@ -394,30 +396,30 @@ def save_to_chroma(chunks: list[Document], name: str):
 ####------------------------------------------------------- Combine Process of Load, Chunk and Store ----------------------------------------------####
 ########################################################################################################################################################
 
-def generate_data_store(file_path, db_name):
+async def generate_data_store(file_path, db_name):
     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
     print(f"Filepath ===> {file_path} DB Name ====> {db_name}")
 
     try:
-        #documents,grouped_documents = load_document(file_path)
-        grouped_documents = load_document(file_path)
+        documents,processed_documents,table_document = load_document(file_path)
+        #grouped_document,document = load_document(file_path)
        print("Documents loaded successfully.")
     except Exception as e:
         print(f"Error loading documents: {e}")
         return
 
     try:
-        chunks = split_text(grouped_documents)
+        chunks = split_text(documents)
         print(f"Text split into {len(chunks)} chunks.")
     except Exception as e:
         print(f"Error splitting text: {e}")
         return
 
     try:
-        #asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
-        asyncio.run(save_to_chroma(chunks, db_name))
+        await save_to_chroma(chunks, db_name, table_document)
+        #await asyncio.run(save_to_chroma(chunks, db_name,table_document))
         print(f"Data saved to Chroma for database {db_name}.")
     except Exception as e:
         print(f"Error saving to Chroma: {e}")
         return
-
+
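
Note: after this commit, save_to_chroma and generate_data_store are both coroutines, so the asyncio.run call moves out of the module and into the caller. A minimal driver sketch under that assumption, assuming retrival.py is importable; the upload folder and database name below are hypothetical placeholders, not values from the commit:

    import asyncio
    from retrival import generate_data_store

    # Hypothetical inputs: a folder of uploaded documents and a DB name suffix.
    # On success this writes ./VectorDB/chroma_sample and, if any tables were
    # extracted, ./TableDB/chroma_sample.
    asyncio.run(generate_data_store("./uploads/sample_docs", "sample"))

Reading a persisted store back later needs the same embedding model. A sketch, assuming langchain-community import paths (the file's own import block sits above the diffed region and is not shown here):

    from langchain_community.vectorstores import Chroma
    from langchain_community.embeddings import HuggingFaceEmbeddings

    # Reopen the persisted document DB created by the commit's save_to_chroma.
    embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory="./VectorDB/chroma_sample",
                embedding_function=embedding_function)
    print(db.similarity_search("example query", k=4))  # placeholder query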