WebashalarForML committed on
Commit
f33b573
·
verified ·
1 Parent(s): dfd51c8

Update retrival.py

Browse files
Files changed (1) hide show
  1. retrival.py +5 -13
retrival.py CHANGED
@@ -21,14 +21,16 @@ pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
21
  # Configurations
22
  UPLOAD_FOLDER = "./uploads"
23
  VECTOR_DB_FOLDER = "./VectorDB"
 
24
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
25
  os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
 
26
 
27
  ########################################################################################################################################################
28
  ####-------------------------------------------------------------- Document Loader ---------------------------------------------------------------####
29
  ########################################################################################################################################################
30
  # Loaders for loading Document text, tables and images from any file format.
31
- #data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
32
  def load_document(data_path):
33
  processed_documents = []
34
  #element_content = []
@@ -44,7 +46,7 @@ def load_document(data_path):
44
  try:
45
  # Determine the file type based on extension
46
  filename, file_extension = os.path.splitext(file.lower())
47
- image_output = f"H:/DEV PATEL/2025/RAG Project/Images/{filename}/"
48
  # Use specific partition techniques based on file extension
49
  if file_extension == ".pdf":
50
  elements = partition_pdf(
@@ -217,11 +219,6 @@ def load_document(data_path):
217
  )
218
  )
219
 
220
- # Output the grouped documents
221
- # for document in grouped_documents:
222
- # print(document)
223
-
224
-
225
  #Directory loader for loading the text data only to specific db
226
  loader = DirectoryLoader(data_path, glob="*.*")
227
  documents = loader.load()
@@ -235,6 +232,7 @@ def load_document(data_path):
235
  doc.metadata.update({"filename":match.group(1)})
236
 
237
  return grouped_documents,documents,table_document
 
238
  #grouped_documents = load_document(data_path)
239
  #documents,processed_documents,table_document = load_document(data_path)
240
 
@@ -307,12 +305,6 @@ async def save_to_chroma(chunks: list[Document], name: str, tables: list[Documen
307
  print("Error while saving to Chroma:", e)
308
  return None
309
 
310
- # def get_unique_sources(chroma_path):
311
- # db = Chroma(persist_directory=chroma_path)
312
- # metadata_list = db.get()["metadatas"]
313
- # unique_sources = {metadata["source"] for metadata in metadata_list if "source" in metadata}
314
- # return list(unique_sources)
315
-
316
  ########################################################################################################################################################
317
  ####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
318
  ########################################################################################################################################################
 
21
  # Configurations
22
  UPLOAD_FOLDER = "./uploads"
23
  VECTOR_DB_FOLDER = "./VectorDB"
24
+ IMAGE_DB_FOLDER = "./ImageDB"
25
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
26
  os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
27
+ os.makedirs(IMAGE_DB_FOLDER, exist_ok=True)
28
 
29
  ########################################################################################################################################################
30
  ####-------------------------------------------------------------- Document Loader ---------------------------------------------------------------####
31
  ########################################################################################################################################################
32
  # Loaders for loading Document text, tables and images from any file format.
33
+
34
  def load_document(data_path):
35
  processed_documents = []
36
  #element_content = []
 
46
  try:
47
  # Determine the file type based on extension
48
  filename, file_extension = os.path.splitext(file.lower())
49
+ image_output = f"./ImageDB/{filename}/"
50
  # Use specific partition techniques based on file extension
51
  if file_extension == ".pdf":
52
  elements = partition_pdf(
 
219
  )
220
  )
221
 
 
 
 
 
 
222
  #Directory loader for loading the text data only to specific db
223
  loader = DirectoryLoader(data_path, glob="*.*")
224
  documents = loader.load()
 
232
  doc.metadata.update({"filename":match.group(1)})
233
 
234
  return grouped_documents,documents,table_document
235
+
236
  #grouped_documents = load_document(data_path)
237
  #documents,processed_documents,table_document = load_document(data_path)
238
 
 
305
  print("Error while saving to Chroma:", e)
306
  return None
307
 
 
 
 
 
 
 
308
  ########################################################################################################################################################
309
  ####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
310
  ########################################################################################################################################################