Update retrival.py

retrival.py CHANGED (+53 -51)
@@ -13,14 +13,14 @@ import pytesseract
 import os
 import re
 import uuid
+from langchain.schema import Document
 from collections import defaultdict
-
+pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
 pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
 
 # Configurations
 UPLOAD_FOLDER = "./uploads"
 VECTOR_DB_FOLDER = "./VectorDB"
-IMAGE_DB_FOLDER = "./Images"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
 
@@ -31,7 +31,7 @@ os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
 #data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
 def load_document(data_path):
     processed_documents = []
-    element_content = []
+    #element_content = []
     table_document = []
     #having different process for the pdf
     for root, _, files in os.walk(data_path):
@@ -44,7 +44,7 @@ def load_document(data_path):
         try:
             # Determine the file type based on extension
             filename, file_extension = os.path.splitext(file.lower())
-            image_output = f"
+            image_output = f"H:/DEV PATEL/2025/RAG Project/Images/{filename}/"
             # Use specific partition techniques based on file extension
             if file_extension == ".pdf":
                 elements = partition_pdf(
@@ -73,10 +73,9 @@ def load_document(data_path):
         categorized_content = {
             "tables": {"content": [], "Metadata": []},
             "images": {"content": [], "Metadata": []},
-            "text": {"content": [], "Metadata": []},
-            "text2": {"content": [], "Metadata": []}
+            "text": {"content": [], "Metadata": []},
         }
-        element_content.append(elements)
+        #element_content.append(elements)
         CNT=1
         for chunk in elements:
             # Safely extract metadata and text
@@ -136,7 +135,6 @@ def load_document(data_path):
 
     # Loop over tables and match text from the same document and page
 
-    '''
     for doc in processed_documents:
        cnt=1 # count for storing number of the table
        for table_metadata in doc.get("tables", {}).get("Metadata", []):
@@ -181,7 +179,6 @@ def load_document(data_path):
                 }
             )
         )
-    '''
 
     # Initialize a structure to group content by doc_id
     grouped_by_doc_id = defaultdict(lambda: {
@@ -203,10 +200,10 @@ def load_document(data_path):
         metadata = metadata_list[0] # Assuming metadata is consistent
         grouped_by_doc_id[doc_id]["metadata"] = {
             "source": source,
-            "filetype": metadata.get("filetype"),
+            #"filetype": metadata.get("filetype"),
             "file_directory": metadata.get("file_directory"),
             "filename": metadata.get("filename"),
-            "languages": str(metadata.get("languages")),
+            #"languages": str(metadata.get("languages")),
         }
 
     # Convert grouped content into Document objects
@@ -221,12 +218,11 @@ def load_document(data_path):
         )
 
     # Output the grouped documents
-    for document in grouped_documents:
-        print(document)
+    # for document in grouped_documents:
+    #     print(document)
 
 
     #Dirctory loader for loading the text data only to specific db
-    '''
     loader = DirectoryLoader(data_path, glob="*.*")
     documents = loader.load()
 
@@ -237,9 +233,9 @@ def load_document(data_path):
         path=doc.metadata.get("source")
         match = re.search(r'([^\\]+\.[^\\]+)$', path)
         doc.metadata.update({"filename":match.group(1)})
-
-
-
+
+    return grouped_documents,documents,table_document
+#grouped_documents = load_document(data_path)
 #documents,processed_documents,table_document = load_document(data_path)
 
 
@@ -249,8 +245,8 @@ def load_document(data_path):
 
 def split_text(documents: list[Document]):
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=
-        chunk_overlap=
+        chunk_size=2000,
+        chunk_overlap=600,
         length_function=len,
         add_start_index=True,
     )
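For context, a minimal sketch of what the new splitter settings do, assuming the classic langchain import paths this file already uses; the sample document is hypothetical:

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Same settings as the updated split_text(): chunks of up to 2000
# characters, with consecutive chunks sharing 600 characters of overlap.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=600,
    length_function=len,
    add_start_index=True,
)

# A 5000-character document splits into roughly four overlapping chunks;
# add_start_index records each chunk's offset in doc.metadata["start_index"].
chunks = splitter.split_documents([Document(page_content="x" * 5000)])
print(len(chunks), chunks[0].metadata["start_index"])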
@@ -265,41 +261,47 @@ def split_text(documents: list[Document]):
 ########################################################################################################################################################
 
 #def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
-def save_to_chroma(chunks: list[Document], name: str):
+async def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
     CHROMA_PATH = f"./VectorDB/chroma_{name}"
-
+    TABLE_PATH = f"./TableDB/chroma_{name}"
     if os.path.exists(CHROMA_PATH):
         shutil.rmtree(CHROMA_PATH)
-
-
+    if os.path.exists(TABLE_PATH):
+        shutil.rmtree(TABLE_PATH)
 
     try:
         # Load the embedding model
-        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2",show_progress=True)
         #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
         # Create Chroma DB for documents using from_documents [NOTE: Some of the data is converted to string because int and float show null if added]
         print("Creating document vector database...")
-        db =
-
-
-
-
+        db =Chroma.from_documents(
+            documents=chunks,
+            embedding=embedding_function,
+            persist_directory=CHROMA_PATH,
+
+        )
+
+        print("Persisting the document database...")
+        db.persist()
         print("Document database successfully saved.")
-
-        #
-
-
-
-
-
-
-
-
-
-
-
-
+
+        # Create Chroma DB for tables if available [NOTE: Some of the data is converted to string because int and float show null if added]
+        if tables !=[]:
+            print("Creating table vector database...")
+            tdb =Chroma.from_documents(
+                documents=tables,
+                embedding=embedding_function,
+                persist_directory=TABLE_PATH,
+            )
+            print("Persisting the table database...")
+            db.persist()
+            print("Table database successfully saved.")
+        else:
+            tdb = None
+
+        return db, tdb
+        #return db
 
     except Exception as e:
         print("Error while saving to Chroma:", e)
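For reference, a minimal sketch of reopening one of the persisted stores, assuming the same langchain Chroma and HuggingFaceEmbeddings wrappers used above; the database name and query are hypothetical:

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Reopen a persisted document store by pointing Chroma at its directory;
# the same embedding model must be used at query time as at build time.
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma(
    persist_directory="./VectorDB/chroma_demo",  # hypothetical db name "demo"
    embedding_function=embedding_function,
)

# Fetch the three chunks most similar to the query.
for doc in db.similarity_search("summarize the uploaded report", k=3):
    print(doc.metadata.get("filename"), doc.page_content[:80])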
@@ -394,30 +396,30 @@ def save_to_chroma(chunks: list[Document], name: str):
 ####------------------------------------------------------- Combine Process of Load, Chunk and Store ----------------------------------------------####
 ########################################################################################################################################################
 
-def generate_data_store(file_path, db_name):
+async def generate_data_store(file_path, db_name):
     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
     print(f"Filepath ===> {file_path} DB Name ====> {db_name}")
 
     try:
-
-
+        documents,processed_documents,table_document = load_document(file_path)
+        #grouped_document,document = load_document(file_path)
         print("Documents loaded successfully.")
     except Exception as e:
         print(f"Error loading documents: {e}")
         return
 
     try:
-        chunks = split_text(
+        chunks = split_text(documents)
         print(f"Text split into {len(chunks)} chunks.")
     except Exception as e:
         print(f"Error splitting text: {e}")
         return
 
     try:
-
-        asyncio.run(save_to_chroma(chunks, db_name))
+        await save_to_chroma(chunks, db_name, table_document)
+        #await asyncio.run(save_to_chroma(chunks, db_name,table_document))
         print(f"Data saved to Chroma for database {db_name}.")
     except Exception as e:
         print(f"Error saving to Chroma: {e}")
         return
-
+
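Because generate_data_store is now a coroutine, whatever calls it (not shown in this diff) must await it or drive it with an event loop. A minimal sketch, assuming a plain-script entry point; the upload path and database name are hypothetical:

import asyncio

# generate_data_store is async: calling it directly only creates a
# coroutine object, so it has to be awaited or handed to an event loop.
if __name__ == "__main__":
    asyncio.run(generate_data_store("./uploads/sample_docs", "sample_db"))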