Spaces:

tony-42069
/

cre-chatbot-rag

Sleeping

App Files Files Community

tony-42069 committed on Nov 27, 2024

Commit

204d47b

1 Parent(s): f53cb7b

Add detailed logging and error handling for PDF processing and vector store initialization

Browse files

Files changed (2) hide show

pdf_processor.py +26 -2
rag_engine.py +21 -2

pdf_processor.py CHANGED Viewed

@@ -37,16 +37,30 @@ class PDFProcessor:
             pages = loader.load()
             print(f"Successfully loaded {len(pages)} pages with PyPDFLoader")
             # Split the text into chunks
             chunks = []
             for page in pages:
                 page_chunks = self.text_splitter.split_text(page.page_content)
                 for chunk in page_chunks:
                     chunks.append({
                         'text': chunk,
                         'metadata': {'page': page.metadata['page']}
                     })
-            print(f"Created {len(chunks)} chunks from PyPDFLoader method")
             return chunks
         except Exception as e:
@@ -63,14 +77,24 @@ class PDFProcessor:
                     for page_num in range(len(pdf.pages)):
                         text = pdf.pages[page_num].extract_text()
                         page_chunks = self.text_splitter.split_text(text)
                         for chunk in page_chunks:
                             chunks.append({
                                 'text': chunk,
                                 'metadata': {'page': page_num + 1}
                             })
-                    print(f"Created {len(chunks)} chunks from direct pypdf method")
                     return chunks
             except Exception as e2:

             pages = loader.load()
             print(f"Successfully loaded {len(pages)} pages with PyPDFLoader")
+            if not pages:
+                raise ValueError("No pages extracted from PDF")
             # Split the text into chunks
             chunks = []
             for page in pages:
+                if not page.page_content.strip():
+                    print(f"Warning: Empty content on page {page.metadata.get('page', 'unknown')}")
+                    continue
                 page_chunks = self.text_splitter.split_text(page.page_content)
+                print(f"Created {len(page_chunks)} chunks from page {page.metadata.get('page', 'unknown')}")
                 for chunk in page_chunks:
                     chunks.append({
                         'text': chunk,
                         'metadata': {'page': page.metadata['page']}
                     })
+            if not chunks:
+                raise ValueError("No text chunks created from PDF")
+            print(f"Created total of {len(chunks)} chunks from PyPDFLoader method")
+            print(f"First chunk preview: {chunks[0]['text'][:200]}...")
             return chunks
         except Exception as e:
                     for page_num in range(len(pdf.pages)):
                         text = pdf.pages[page_num].extract_text()
+                        if not text.strip():
+                            print(f"Warning: Empty content on page {page_num + 1}")
+                            continue
                         page_chunks = self.text_splitter.split_text(text)
+                        print(f"Created {len(page_chunks)} chunks from page {page_num + 1}")
                         for chunk in page_chunks:
                             chunks.append({
                                 'text': chunk,
                                 'metadata': {'page': page_num + 1}
                             })
+                    if not chunks:
+                        raise ValueError("No text chunks created from PDF")
+                    print(f"Created total of {len(chunks)} chunks from direct pypdf method")
+                    print(f"First chunk preview: {chunks[0]['text'][:200]}...")
                     return chunks
             except Exception as e2:

rag_engine.py CHANGED Viewed

@@ -59,18 +59,36 @@ class RAGEngine:
         Args:
             chunks (List[Dict]): List of dictionaries containing text and metadata
         """
         texts = [chunk['text'] for chunk in chunks]
         metadatas = [chunk['metadata'] for chunk in chunks]
         # Create vector store
         self.vector_store = Chroma.from_texts(
             texts=texts,
             embedding=self.embeddings,
-            metadatas=metadatas
         )
         # Initialize QA chain
-        llm = AzureChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", azure_deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'), azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'), api_key=os.getenv('AZURE_OPENAI_KEY'))
         self.qa_chain = RetrievalQA.from_chain_type(
             llm=llm,
             chain_type="stuff",
@@ -78,6 +96,7 @@ class RAGEngine:
                 search_kwargs={"k": 3}
             )
         )
     def answer_question(self, question: str) -> Dict:
         """

         Args:
             chunks (List[Dict]): List of dictionaries containing text and metadata
         """
+        print(f"Initializing vector store with {len(chunks)} chunks")
+        if not chunks:
+            raise ValueError("No text chunks provided. PDF processing may have failed.")
         texts = [chunk['text'] for chunk in chunks]
         metadatas = [chunk['metadata'] for chunk in chunks]
+        print(f"First chunk preview: {texts[0][:200]}...")
+        print(f"First chunk metadata: {metadatas[0]}")
         # Create vector store
+        print("Creating Chroma vector store...")
         self.vector_store = Chroma.from_texts(
             texts=texts,
             embedding=self.embeddings,
+            metadatas=metadatas,
+            persist_directory="./chroma_db"  # Add persistence
         )
+        print("Vector store created successfully")
         # Initialize QA chain
+        print("Initializing QA chain...")
+        llm = AzureChatOpenAI(
+            temperature=0,
+            model_name="gpt-3.5-turbo",
+            azure_deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'),
+            azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
+            api_key=os.getenv('AZURE_OPENAI_KEY')
+        )
         self.qa_chain = RetrievalQA.from_chain_type(
             llm=llm,
             chain_type="stuff",
                 search_kwargs={"k": 3}
             )
         )
+        print("QA chain initialized successfully")
     def answer_question(self, question: str) -> Dict:
         """