Spaces:
Sleeping
Sleeping
Commit
·
204d47b
1
Parent(s):
f53cb7b
Add detailed logging and error handling for PDF processing and vector store initialization
Browse files
- pdf_processor.py +26 -2
- rag_engine.py +21 -2
pdf_processor.py
CHANGED
@@ -37,16 +37,30 @@ class PDFProcessor:
|
|
37 |
pages = loader.load()
|
38 |
print(f"Successfully loaded {len(pages)} pages with PyPDFLoader")
|
39 |
|
|
|
|
|
|
|
40 |
# Split the text into chunks
|
41 |
chunks = []
|
42 |
for page in pages:
|
|
|
|
|
|
|
|
|
43 |
page_chunks = self.text_splitter.split_text(page.page_content)
|
|
|
|
|
44 |
for chunk in page_chunks:
|
45 |
chunks.append({
|
46 |
'text': chunk,
|
47 |
'metadata': {'page': page.metadata['page']}
|
48 |
})
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
50 |
return chunks
|
51 |
|
52 |
except Exception as e:
|
@@ -63,14 +77,24 @@ class PDFProcessor:
|
|
63 |
|
64 |
for page_num in range(len(pdf.pages)):
|
65 |
text = pdf.pages[page_num].extract_text()
|
|
|
|
|
|
|
|
|
66 |
page_chunks = self.text_splitter.split_text(text)
|
|
|
67 |
|
68 |
for chunk in page_chunks:
|
69 |
chunks.append({
|
70 |
'text': chunk,
|
71 |
'metadata': {'page': page_num + 1}
|
72 |
})
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
74 |
return chunks
|
75 |
|
76 |
except Exception as e2:
|
|
|
37 |
pages = loader.load()
|
38 |
print(f"Successfully loaded {len(pages)} pages with PyPDFLoader")
|
39 |
|
40 |
+
if not pages:
|
41 |
+
raise ValueError("No pages extracted from PDF")
|
42 |
+
|
43 |
# Split the text into chunks
|
44 |
chunks = []
|
45 |
for page in pages:
|
46 |
+
if not page.page_content.strip():
|
47 |
+
print(f"Warning: Empty content on page {page.metadata.get('page', 'unknown')}")
|
48 |
+
continue
|
49 |
+
|
50 |
page_chunks = self.text_splitter.split_text(page.page_content)
|
51 |
+
print(f"Created {len(page_chunks)} chunks from page {page.metadata.get('page', 'unknown')}")
|
52 |
+
|
53 |
for chunk in page_chunks:
|
54 |
chunks.append({
|
55 |
'text': chunk,
|
56 |
'metadata': {'page': page.metadata['page']}
|
57 |
})
|
58 |
+
|
59 |
+
if not chunks:
|
60 |
+
raise ValueError("No text chunks created from PDF")
|
61 |
+
|
62 |
+
print(f"Created total of {len(chunks)} chunks from PyPDFLoader method")
|
63 |
+
print(f"First chunk preview: {chunks[0]['text'][:200]}...")
|
64 |
return chunks
|
65 |
|
66 |
except Exception as e:
|
|
|
77 |
|
78 |
for page_num in range(len(pdf.pages)):
|
79 |
text = pdf.pages[page_num].extract_text()
|
80 |
+
if not text.strip():
|
81 |
+
print(f"Warning: Empty content on page {page_num + 1}")
|
82 |
+
continue
|
83 |
+
|
84 |
page_chunks = self.text_splitter.split_text(text)
|
85 |
+
print(f"Created {len(page_chunks)} chunks from page {page_num + 1}")
|
86 |
|
87 |
for chunk in page_chunks:
|
88 |
chunks.append({
|
89 |
'text': chunk,
|
90 |
'metadata': {'page': page_num + 1}
|
91 |
})
|
92 |
+
|
93 |
+
if not chunks:
|
94 |
+
raise ValueError("No text chunks created from PDF")
|
95 |
+
|
96 |
+
print(f"Created total of {len(chunks)} chunks from direct pypdf method")
|
97 |
+
print(f"First chunk preview: {chunks[0]['text'][:200]}...")
|
98 |
return chunks
|
99 |
|
100 |
except Exception as e2:
|
rag_engine.py
CHANGED
@@ -59,18 +59,36 @@ class RAGEngine:
|
|
59 |
Args:
|
60 |
chunks (List[Dict]): List of dictionaries containing text and metadata
|
61 |
"""
|
|
|
|
|
|
|
|
|
|
|
62 |
texts = [chunk['text'] for chunk in chunks]
|
63 |
metadatas = [chunk['metadata'] for chunk in chunks]
|
64 |
|
|
|
|
|
|
|
65 |
# Create vector store
|
|
|
66 |
self.vector_store = Chroma.from_texts(
|
67 |
texts=texts,
|
68 |
embedding=self.embeddings,
|
69 |
-
metadatas=metadatas
|
|
|
70 |
)
|
|
|
71 |
|
72 |
# Initialize QA chain
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
self.qa_chain = RetrievalQA.from_chain_type(
|
75 |
llm=llm,
|
76 |
chain_type="stuff",
|
@@ -78,6 +96,7 @@ class RAGEngine:
|
|
78 |
search_kwargs={"k": 3}
|
79 |
)
|
80 |
)
|
|
|
81 |
|
82 |
def answer_question(self, question: str) -> Dict:
|
83 |
"""
|
|
|
59 |
Args:
|
60 |
chunks (List[Dict]): List of dictionaries containing text and metadata
|
61 |
"""
|
62 |
+
print(f"Initializing vector store with {len(chunks)} chunks")
|
63 |
+
|
64 |
+
if not chunks:
|
65 |
+
raise ValueError("No text chunks provided. PDF processing may have failed.")
|
66 |
+
|
67 |
texts = [chunk['text'] for chunk in chunks]
|
68 |
metadatas = [chunk['metadata'] for chunk in chunks]
|
69 |
|
70 |
+
print(f"First chunk preview: {texts[0][:200]}...")
|
71 |
+
print(f"First chunk metadata: {metadatas[0]}")
|
72 |
+
|
73 |
# Create vector store
|
74 |
+
print("Creating Chroma vector store...")
|
75 |
self.vector_store = Chroma.from_texts(
|
76 |
texts=texts,
|
77 |
embedding=self.embeddings,
|
78 |
+
metadatas=metadatas,
|
79 |
+
persist_directory="./chroma_db" # Add persistence
|
80 |
)
|
81 |
+
print("Vector store created successfully")
|
82 |
|
83 |
# Initialize QA chain
|
84 |
+
print("Initializing QA chain...")
|
85 |
+
llm = AzureChatOpenAI(
|
86 |
+
temperature=0,
|
87 |
+
model_name="gpt-3.5-turbo",
|
88 |
+
azure_deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'),
|
89 |
+
azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
|
90 |
+
api_key=os.getenv('AZURE_OPENAI_KEY')
|
91 |
+
)
|
92 |
self.qa_chain = RetrievalQA.from_chain_type(
|
93 |
llm=llm,
|
94 |
chain_type="stuff",
|
|
|
96 |
search_kwargs={"k": 3}
|
97 |
)
|
98 |
)
|
99 |
+
print("QA chain initialized successfully")
|
100 |
|
101 |
def answer_question(self, question: str) -> Dict:
|
102 |
"""
|