tony-42069 committed on
Commit
e0eceb3
·
1 Parent(s): ae615e3

Major overhaul of PDF processing and Docker build

Browse files
Files changed (2) hide show
  1. Dockerfile +11 -9
  2. pdf_processor.py +86 -93
Dockerfile CHANGED
@@ -2,27 +2,29 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /home/user/app
4
 
5
- # Install git and git-lfs
6
  RUN apt-get update && \
7
- apt-get install -y git git-lfs && \
8
- rm -rf /var/lib/apt/lists/*
 
9
 
10
  # Copy requirements first for better caching
11
  COPY requirements.txt .
12
  RUN pip install -r requirements.txt
13
 
 
 
 
 
 
 
14
  # Copy the rest of the application
15
  COPY . .
16
 
17
- # Explicitly copy and verify the PDF file
18
- COPY Dataset/Commercial\ Lending\ 101.pdf /home/user/app/Dataset/
19
- RUN ls -l /home/user/app/Dataset/Commercial\ Lending\ 101.pdf && \
20
- echo "PDF file size: $(stat -f%z /home/user/app/Dataset/Commercial\ Lending\ 101.pdf) bytes"
21
-
22
  # Make port configurable via environment variable
23
  ENV PORT=8501
24
 
25
  EXPOSE ${PORT}
26
 
27
  # Use the correct path to app.py and make port configurable
28
- ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=${PORT}", "--server.address=0.0.0.0"]
 
2
 
3
  WORKDIR /home/user/app
4
 
5
+ # Install git-lfs and other dependencies
6
  RUN apt-get update && \
7
+ apt-get install -y git git-lfs poppler-utils && \
8
+ rm -rf /var/lib/apt/lists/* && \
9
+ git lfs install
10
 
11
  # Copy requirements first for better caching
12
  COPY requirements.txt .
13
  RUN pip install -r requirements.txt
14
 
15
+ # Initialize git-lfs and copy the application
16
+ COPY .gitattributes .
17
+ COPY Dataset/Commercial\ Lending\ 101.pdf Dataset/
18
+ RUN ls -la Dataset && \
19
+ stat Dataset/Commercial\ Lending\ 101.pdf
20
+
21
  # Copy the rest of the application
22
  COPY . .
23
 
 
 
 
 
 
24
  # Make port configurable via environment variable
25
  ENV PORT=8501
26
 
27
  EXPOSE ${PORT}
28
 
29
  # Use the correct path to app.py and make port configurable
30
+ CMD ["streamlit", "run", "app.py", "--server.port=${PORT}", "--server.address=0.0.0.0"]
pdf_processor.py CHANGED
@@ -1,19 +1,34 @@
1
  from typing import List, Dict
2
  import os
 
 
3
  import pypdf
4
  from langchain.document_loaders import PyPDFLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
 
7
  class PDFProcessor:
8
  def __init__(self):
9
- # Adjust text splitter settings for more lenient chunking
10
  self.text_splitter = RecursiveCharacterTextSplitter(
11
- chunk_size=500, # Smaller chunks
12
- chunk_overlap=50, # Less overlap
13
  length_function=len,
14
- separators=["\n\n", "\n", ".", " ", ""] # More granular separators
15
  )
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def process_pdf(self, pdf_path: str) -> List[Dict]:
18
  """
19
  Process a PDF file and return chunks of text with metadata.
@@ -29,102 +44,80 @@ class PDFProcessor:
29
  if not os.path.exists(pdf_path):
30
  raise FileNotFoundError(f"PDF file not found at {pdf_path}")
31
 
32
- print(f"PDF file exists, size: {os.path.getsize(pdf_path)} bytes")
 
33
 
34
- try:
35
- print("Attempting to use PyPDFLoader...")
36
- # Try using PyPDFLoader from langchain
37
- loader = PyPDFLoader(pdf_path)
38
- pages = loader.load()
39
- print(f"Successfully loaded {len(pages)} pages with PyPDFLoader")
40
-
41
- if not pages:
42
- raise ValueError("No pages extracted from PDF")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- # Debug: Print raw content from first few pages
45
- for i, page in enumerate(pages[:2]):
46
- print(f"\nPage {i+1} preview (first 200 chars):")
47
- print(page.page_content[:200])
48
-
49
- # Split the text into chunks
50
  chunks = []
51
- for page in pages:
52
- if not page.page_content.strip():
53
- print(f"Warning: Empty content on page {page.metadata.get('page', 'unknown')}")
54
- continue
55
-
56
- # Debug: Print content length
57
- content = page.page_content.strip()
58
- print(f"Page {page.metadata.get('page', 'unknown')} content length: {len(content)} chars")
59
-
60
- try:
61
  page_chunks = self.text_splitter.split_text(content)
62
- print(f"Created {len(page_chunks)} chunks from page {page.metadata.get('page', 'unknown')}")
63
-
64
  for chunk in page_chunks:
65
- if chunk.strip(): # Only add non-empty chunks
66
  chunks.append({
67
  'text': chunk,
68
- 'metadata': {'page': page.metadata['page']}
69
  })
70
- except Exception as chunk_error:
71
- print(f"Error splitting page {page.metadata.get('page', 'unknown')}: {str(chunk_error)}")
72
-
73
- if not chunks:
74
- raise ValueError("No text chunks created from PDF")
75
-
76
- print(f"Created total of {len(chunks)} chunks from PyPDFLoader method")
77
- print(f"First chunk preview: {chunks[0]['text'][:200]}...")
78
  return chunks
 
 
 
 
 
79
 
80
- except Exception as e:
81
- print(f"Error with PyPDFLoader: {str(e)}")
82
- print("Trying alternative PDF processing method...")
83
-
84
- # Fallback to direct pypdf usage
85
- try:
86
- print("Attempting to use pypdf directly...")
87
- with open(pdf_path, 'rb') as file:
88
- pdf = pypdf.PdfReader(file)
89
- print(f"Successfully opened PDF with {len(pdf.pages)} pages")
90
-
91
- # Debug: Print raw content from first few pages
92
- for i in range(min(2, len(pdf.pages))):
93
- print(f"\nPage {i+1} preview (first 200 chars):")
94
- print(pdf.pages[i].extract_text()[:200])
95
-
96
- chunks = []
97
- for page_num in range(len(pdf.pages)):
98
- text = pdf.pages[page_num].extract_text()
99
- if not text.strip():
100
- print(f"Warning: Empty content on page {page_num + 1}")
101
- continue
102
-
103
- # Debug: Print content length
104
- content = text.strip()
105
- print(f"Page {page_num + 1} content length: {len(content)} chars")
106
-
107
- try:
108
- page_chunks = self.text_splitter.split_text(content)
109
- print(f"Created {len(page_chunks)} chunks from page {page_num + 1}")
110
-
111
- for chunk in page_chunks:
112
- if chunk.strip(): # Only add non-empty chunks
113
- chunks.append({
114
- 'text': chunk,
115
- 'metadata': {'page': page_num + 1}
116
- })
117
- except Exception as chunk_error:
118
- print(f"Error splitting page {page_num + 1}: {str(chunk_error)}")
119
-
120
- if not chunks:
121
- raise ValueError("No text chunks created from PDF")
122
-
123
- print(f"Created total of {len(chunks)} chunks from direct pypdf method")
124
- print(f"First chunk preview: {chunks[0]['text'][:200]}...")
125
- return chunks
126
-
127
- except Exception as e2:
128
- error_msg = f"Failed to process PDF with both methods.\nPyPDFLoader error: {str(e)}\npypdf error: {str(e2)}"
129
- print(error_msg)
130
- raise Exception(error_msg)
 
1
  from typing import List, Dict
2
  import os
3
+ import subprocess
4
+ import tempfile
5
  import pypdf
6
  from langchain.document_loaders import PyPDFLoader
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
 
9
class PDFProcessor:
    """Extract text from a PDF and split it into overlapping chunks.

    Three extraction backends are tried in order of fidelity
    (PyPDFLoader, pypdf, pdftotext); the first one that yields any
    chunks wins. Each chunk is a dict: {'text': str,
    'metadata': {'page': int}}.
    """

    def __init__(self):
        # Small chunks with modest overlap; the granular separator list
        # lets the splitter fall back from paragraphs to sentences to
        # words when a paragraph exceeds chunk_size.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len,
            separators=["\n\n", "\n", ".", " ", ""]
        )

    def extract_text_with_pdftotext(self, pdf_path: str) -> str:
        """Use pdftotext (from poppler-utils) to extract text.

        Returns the whole document as one string (pdftotext separates
        pages with form-feed characters), or "" on any failure — this
        is a best-effort fallback and must never propagate an error.
        """
        try:
            result = subprocess.run(
                ['pdftotext', pdf_path, '-'],
                capture_output=True,
                text=True,
                check=True
            )
            return result.stdout
        except Exception as e:
            # Broad catch is deliberate: covers missing binary
            # (FileNotFoundError) and non-zero exit (CalledProcessError).
            print(f"pdftotext extraction failed: {str(e)}")
            return ""

    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """
        Process a PDF file and return chunks of text with metadata.

        Args:
            pdf_path: Path to the PDF file on disk.

        Returns:
            List of {'text': chunk, 'metadata': {'page': page_number}}.

        Raises:
            FileNotFoundError: If pdf_path does not exist.
            ValueError: If the file is implausibly small — most likely a
                git-lfs pointer file rather than the real PDF.
            Exception: If every extraction method fails (chained from
                the last underlying error).
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        file_size = os.path.getsize(pdf_path)
        print(f"PDF file exists, size: {file_size} bytes")

        if file_size < 1000:  # Less than 1KB
            raise ValueError(f"PDF file seems too small ({file_size} bytes). Might be corrupted or a pointer file.")

        # Backends in order of fidelity; first one to produce chunks wins.
        methods = [
            ("PyPDFLoader", self._try_pypdf_loader),
            ("pypdf", self._try_pypdf_direct),
            ("pdftotext", self._try_pdftotext)
        ]

        last_error = None
        for method_name, method in methods:
            try:
                print(f"\nTrying {method_name} method...")
                chunks = method(pdf_path)
                if chunks:
                    print(f"Successfully extracted {len(chunks)} chunks using {method_name}")
                    return chunks
            except Exception as e:
                print(f"Error with {method_name}: {str(e)}")
                last_error = e

        # Chain the last backend error so its traceback is preserved.
        raise Exception(f"All PDF processing methods failed. Last error: {str(last_error)}") from last_error

    def _append_page_chunks(self, content: str, page: int, chunks: List[Dict]) -> None:
        """Split one page's text and append its non-empty chunks to `chunks`."""
        for chunk in self.text_splitter.split_text(content):
            if chunk.strip():
                chunks.append({
                    'text': chunk,
                    'metadata': {'page': page}
                })

    def _try_pypdf_loader(self, pdf_path: str) -> List[Dict]:
        """Extract chunks via langchain's PyPDFLoader."""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        print(f"Loaded {len(pages)} pages")

        chunks = []
        for page in pages:
            content = page.page_content.strip()
            if content:
                self._append_page_chunks(content, page.metadata['page'], chunks)
        return chunks

    def _try_pypdf_direct(self, pdf_path: str) -> List[Dict]:
        """Extract chunks by reading the PDF directly with pypdf."""
        with open(pdf_path, 'rb') as file:
            pdf = pypdf.PdfReader(file)
            print(f"Opened PDF with {len(pdf.pages)} pages")

            chunks = []
            for page_num in range(len(pdf.pages)):
                content = pdf.pages[page_num].extract_text().strip()
                if content:
                    self._append_page_chunks(content, page_num + 1, chunks)
            return chunks

    def _try_pdftotext(self, pdf_path: str) -> List[Dict]:
        """Extract chunks via the pdftotext CLI (poppler-utils).

        pdftotext delimits pages with form-feed characters, so real
        page numbers are recovered here instead of labelling every
        chunk as page 1 (the previous behavior).
        """
        text = self.extract_text_with_pdftotext(pdf_path)
        if not text.strip():
            return []

        chunks = []
        for page_num, page_text in enumerate(text.split('\f'), start=1):
            content = page_text.strip()
            if content:
                self._append_page_chunks(content, page_num, chunks)
        return chunks