tony-42069 committed
Commit 204d47b · 1 Parent(s): f53cb7b

Add detailed logging and error handling for PDF processing and vector store initialization

Files changed (2)
  1. pdf_processor.py +26 -2
  2. rag_engine.py +21 -2
pdf_processor.py CHANGED
@@ -37,16 +37,30 @@ class PDFProcessor:
             pages = loader.load()
             print(f"Successfully loaded {len(pages)} pages with PyPDFLoader")
 
+            if not pages:
+                raise ValueError("No pages extracted from PDF")
+
             # Split the text into chunks
             chunks = []
             for page in pages:
+                if not page.page_content.strip():
+                    print(f"Warning: Empty content on page {page.metadata.get('page', 'unknown')}")
+                    continue
+
                 page_chunks = self.text_splitter.split_text(page.page_content)
+                print(f"Created {len(page_chunks)} chunks from page {page.metadata.get('page', 'unknown')}")
+
                 for chunk in page_chunks:
                     chunks.append({
                         'text': chunk,
                         'metadata': {'page': page.metadata['page']}
                     })
-            print(f"Created {len(chunks)} chunks from PyPDFLoader method")
+
+            if not chunks:
+                raise ValueError("No text chunks created from PDF")
+
+            print(f"Created total of {len(chunks)} chunks from PyPDFLoader method")
+            print(f"First chunk preview: {chunks[0]['text'][:200]}...")
             return chunks
 
         except Exception as e:
@@ -63,14 +77,24 @@ class PDFProcessor:
 
                 for page_num in range(len(pdf.pages)):
                     text = pdf.pages[page_num].extract_text()
+                    if not text.strip():
+                        print(f"Warning: Empty content on page {page_num + 1}")
+                        continue
+
                     page_chunks = self.text_splitter.split_text(text)
+                    print(f"Created {len(page_chunks)} chunks from page {page_num + 1}")
 
                     for chunk in page_chunks:
                         chunks.append({
                             'text': chunk,
                             'metadata': {'page': page_num + 1}
                         })
-                print(f"Created {len(chunks)} chunks from direct pypdf method")
+
+                if not chunks:
+                    raise ValueError("No text chunks created from PDF")
+
+                print(f"Created total of {len(chunks)} chunks from direct pypdf method")
+                print(f"First chunk preview: {chunks[0]['text'][:200]}...")
                 return chunks
 
             except Exception as e2:
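
The guards added above follow a straightforward extract-validate-chunk pattern: skip pages with no extractable text, log per-page chunk counts, and fail loudly if nothing usable comes out of the PDF. The sketch below shows that pattern in isolation; it assumes pypdf and LangChain's RecursiveCharacterTextSplitter, and the chunk_pdf helper name and chunk_size/chunk_overlap values are illustrative rather than taken from this repo.

    # Minimal sketch of the extract -> validate -> chunk pattern used above.
    # Assumes pypdf and langchain are installed; splitter settings are illustrative.
    from pypdf import PdfReader
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    def chunk_pdf(path: str) -> list[dict]:
        reader = PdfReader(path)
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

        chunks = []
        for page_num, page in enumerate(reader.pages, start=1):
            text = page.extract_text() or ""
            if not text.strip():
                # Skip scanned/empty pages instead of feeding blank text to the splitter
                print(f"Warning: Empty content on page {page_num}")
                continue
            for piece in splitter.split_text(text):
                chunks.append({'text': piece, 'metadata': {'page': page_num}})

        if not chunks:
            # Fail loudly so the caller knows extraction produced nothing usable
            raise ValueError("No text chunks created from PDF")
        return chunks

In the committed code the same checks run inside PDFProcessor, with the direct pypdf path used only as a fallback when PyPDFLoader fails.
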
rag_engine.py CHANGED
@@ -59,18 +59,36 @@ class RAGEngine:
         Args:
             chunks (List[Dict]): List of dictionaries containing text and metadata
         """
+        print(f"Initializing vector store with {len(chunks)} chunks")
+
+        if not chunks:
+            raise ValueError("No text chunks provided. PDF processing may have failed.")
+
         texts = [chunk['text'] for chunk in chunks]
         metadatas = [chunk['metadata'] for chunk in chunks]
 
+        print(f"First chunk preview: {texts[0][:200]}...")
+        print(f"First chunk metadata: {metadatas[0]}")
+
         # Create vector store
+        print("Creating Chroma vector store...")
         self.vector_store = Chroma.from_texts(
             texts=texts,
             embedding=self.embeddings,
-            metadatas=metadatas
+            metadatas=metadatas,
+            persist_directory="./chroma_db"  # Add persistence
         )
+        print("Vector store created successfully")
 
         # Initialize QA chain
-        llm = AzureChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", azure_deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'), azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'), api_key=os.getenv('AZURE_OPENAI_KEY'))
+        print("Initializing QA chain...")
+        llm = AzureChatOpenAI(
+            temperature=0,
+            model_name="gpt-3.5-turbo",
+            azure_deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'),
+            azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
+            api_key=os.getenv('AZURE_OPENAI_KEY')
+        )
         self.qa_chain = RetrievalQA.from_chain_type(
             llm=llm,
             chain_type="stuff",
@@ -78,6 +96,7 @@ class RAGEngine:
                 search_kwargs={"k": 3}
             )
         )
+        print("QA chain initialized successfully")
 
     def answer_question(self, question: str) -> Dict:
         """