Spaces:
Running
Running
Update retrival.py
Browse files- retrival.py +5 -13
retrival.py
CHANGED
@@ -21,14 +21,16 @@ pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
|
|
21 |
# Configurations
|
22 |
UPLOAD_FOLDER = "./uploads"
|
23 |
VECTOR_DB_FOLDER = "./VectorDB"
|
|
|
24 |
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
25 |
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
|
|
|
26 |
|
27 |
########################################################################################################################################################
|
28 |
####-------------------------------------------------------------- Document Loader ---------------------------------------------------------------####
|
29 |
########################################################################################################################################################
|
30 |
# Loaders for loading Document text, tables and images from any file format.
|
31 |
-
|
32 |
def load_document(data_path):
|
33 |
processed_documents = []
|
34 |
#element_content = []
|
@@ -44,7 +46,7 @@ def load_document(data_path):
|
|
44 |
try:
|
45 |
# Determine the file type based on extension
|
46 |
filename, file_extension = os.path.splitext(file.lower())
|
47 |
-
image_output = f"
|
48 |
# Use specific partition techniques based on file extension
|
49 |
if file_extension == ".pdf":
|
50 |
elements = partition_pdf(
|
@@ -217,11 +219,6 @@ def load_document(data_path):
|
|
217 |
)
|
218 |
)
|
219 |
|
220 |
-
# Output the grouped documents
|
221 |
-
# for document in grouped_documents:
|
222 |
-
# print(document)
|
223 |
-
|
224 |
-
|
225 |
#Directory loader for loading the text data only to specific db
|
226 |
loader = DirectoryLoader(data_path, glob="*.*")
|
227 |
documents = loader.load()
|
@@ -235,6 +232,7 @@ def load_document(data_path):
|
|
235 |
doc.metadata.update({"filename":match.group(1)})
|
236 |
|
237 |
return grouped_documents,documents,table_document
|
|
|
238 |
#grouped_documents = load_document(data_path)
|
239 |
#documents,processed_documents,table_document = load_document(data_path)
|
240 |
|
@@ -307,12 +305,6 @@ async def save_to_chroma(chunks: list[Document], name: str, tables: list[Documen
|
|
307 |
print("Error while saving to Chroma:", e)
|
308 |
return None
|
309 |
|
310 |
-
# def get_unique_sources(chroma_path):
|
311 |
-
# db = Chroma(persist_directory=chroma_path)
|
312 |
-
# metadata_list = db.get()["metadatas"]
|
313 |
-
# unique_sources = {metadata["source"] for metadata in metadata_list if "source" in metadata}
|
314 |
-
# return list(unique_sources)
|
315 |
-
|
316 |
########################################################################################################################################################
|
317 |
####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
|
318 |
########################################################################################################################################################
|
|
|
21 |
# Configurations
|
22 |
UPLOAD_FOLDER = "./uploads"
|
23 |
VECTOR_DB_FOLDER = "./VectorDB"
|
24 |
+
IMAGE_DB_FOLDER = "./ImageDB"
|
25 |
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
26 |
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
|
27 |
+
os.makedirs(IMAGE_DB_FOLDER, exist_ok=True)
|
28 |
|
29 |
########################################################################################################################################################
|
30 |
####-------------------------------------------------------------- Document Loader ---------------------------------------------------------------####
|
31 |
########################################################################################################################################################
|
32 |
# Loaders for loading Document text, tables and images from any file format.
|
33 |
+
|
34 |
def load_document(data_path):
|
35 |
processed_documents = []
|
36 |
#element_content = []
|
|
|
46 |
try:
|
47 |
# Determine the file type based on extension
|
48 |
filename, file_extension = os.path.splitext(file.lower())
|
49 |
+
image_output = f"./ImageDB/{filename}/"
|
50 |
# Use specific partition techniques based on file extension
|
51 |
if file_extension == ".pdf":
|
52 |
elements = partition_pdf(
|
|
|
219 |
)
|
220 |
)
|
221 |
|
|
|
|
|
|
|
|
|
|
|
222 |
#Directory loader for loading the text data only to specific db
|
223 |
loader = DirectoryLoader(data_path, glob="*.*")
|
224 |
documents = loader.load()
|
|
|
232 |
doc.metadata.update({"filename":match.group(1)})
|
233 |
|
234 |
return grouped_documents,documents,table_document
|
235 |
+
|
236 |
#grouped_documents = load_document(data_path)
|
237 |
#documents,processed_documents,table_document = load_document(data_path)
|
238 |
|
|
|
305 |
print("Error while saving to Chroma:", e)
|
306 |
return None
|
307 |
|
|
|
|
|
|
|
|
|
|
|
|
|
308 |
########################################################################################################################################################
|
309 |
####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
|
310 |
########################################################################################################################################################
|