tony-42069 committed on
Commit
e0eceb3
·
1 Parent(s): ae615e3

Major overhaul of PDF processing and Docker build

Browse files
Files changed (2) hide show
  1. Dockerfile +11 -9
  2. pdf_processor.py +86 -93
Dockerfile CHANGED
@@ -2,27 +2,29 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /home/user/app
4
 
5
- # Install git and git-lfs
6
  RUN apt-get update && \
7
- apt-get install -y git git-lfs && \
8
- rm -rf /var/lib/apt/lists/*
 
9
 
10
  # Copy requirements first for better caching
11
  COPY requirements.txt .
12
  RUN pip install -r requirements.txt
13
 
 
 
 
 
 
 
14
  # Copy the rest of the application
15
  COPY . .
16
 
17
- # Explicitly copy and verify the PDF file
18
- COPY Dataset/Commercial\ Lending\ 101.pdf /home/user/app/Dataset/
19
- RUN ls -l /home/user/app/Dataset/Commercial\ Lending\ 101.pdf && \
20
- echo "PDF file size: $(stat -f%z /home/user/app/Dataset/Commercial\ Lending\ 101.pdf) bytes"
21
-
22
  # Make port configurable via environment variable
23
  ENV PORT=8501
24
 
25
  EXPOSE ${PORT}
26
 
27
  # Use the correct path to app.py and make port configurable
28
- ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=${PORT}", "--server.address=0.0.0.0"]
 
2
 
3
  WORKDIR /home/user/app
4
 
5
+ # Install git-lfs and other dependencies
6
  RUN apt-get update && \
7
+ apt-get install -y git git-lfs poppler-utils && \
8
+ rm -rf /var/lib/apt/lists/* && \
9
+ git lfs install
10
 
11
  # Copy requirements first for better caching
12
  COPY requirements.txt .
13
  RUN pip install -r requirements.txt
14
 
15
+ # Initialize git-lfs and copy the application
16
+ COPY .gitattributes .
17
+ COPY Dataset/Commercial\ Lending\ 101.pdf Dataset/
18
+ RUN ls -la Dataset && \
19
+ stat Dataset/Commercial\ Lending\ 101.pdf
20
+
21
  # Copy the rest of the application
22
  COPY . .
23
 
 
 
 
 
 
24
  # Make port configurable via environment variable
25
  ENV PORT=8501
26
 
27
  EXPOSE ${PORT}
28
 
29
  # Use the correct path to app.py and make port configurable
30
+ CMD ["streamlit", "run", "app.py", "--server.port=${PORT}", "--server.address=0.0.0.0"]
pdf_processor.py CHANGED
@@ -1,19 +1,34 @@
1
  from typing import List, Dict
2
  import os
 
 
3
  import pypdf
4
  from langchain.document_loaders import PyPDFLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
 
7
  class PDFProcessor:
8
  def __init__(self):
9
- # Adjust text splitter settings for more lenient chunking
10
  self.text_splitter = RecursiveCharacterTextSplitter(
11
- chunk_size=500, # Smaller chunks
12
- chunk_overlap=50, # Less overlap
13
  length_function=len,
14
- separators=["\n\n", "\n", ".", " ", ""] # More granular separators
15
  )
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def process_pdf(self, pdf_path: str) -> List[Dict]:
18
  """
19
  Process a PDF file and return chunks of text with metadata.
@@ -29,102 +44,80 @@ class PDFProcessor:
29
  if not os.path.exists(pdf_path):
30
  raise FileNotFoundError(f"PDF file not found at {pdf_path}")
31
 
32
- print(f"PDF file exists, size: {os.path.getsize(pdf_path)} bytes")
 
33
 
34
- try:
35
- print("Attempting to use PyPDFLoader...")
36
- # Try using PyPDFLoader from langchain
37
- loader = PyPDFLoader(pdf_path)
38
- pages = loader.load()
39
- print(f"Successfully loaded {len(pages)} pages with PyPDFLoader")
40
-
41
- if not pages:
42
- raise ValueError("No pages extracted from PDF")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- # Debug: Print raw content from first few pages
45
- for i, page in enumerate(pages[:2]):
46
- print(f"\nPage {i+1} preview (first 200 chars):")
47
- print(page.page_content[:200])
48
-
49
- # Split the text into chunks
50
  chunks = []
51
- for page in pages:
52
- if not page.page_content.strip():
53
- print(f"Warning: Empty content on page {page.metadata.get('page', 'unknown')}")
54
- continue
55
-
56
- # Debug: Print content length
57
- content = page.page_content.strip()
58
- print(f"Page {page.metadata.get('page', 'unknown')} content length: {len(content)} chars")
59
-
60
- try:
61
  page_chunks = self.text_splitter.split_text(content)
62
- print(f"Created {len(page_chunks)} chunks from page {page.metadata.get('page', 'unknown')}")
63
-
64
  for chunk in page_chunks:
65
- if chunk.strip(): # Only add non-empty chunks
66
  chunks.append({
67
  'text': chunk,
68
- 'metadata': {'page': page.metadata['page']}
69
  })
70
- except Exception as chunk_error:
71
- print(f"Error splitting page {page.metadata.get('page', 'unknown')}: {str(chunk_error)}")
72
-
73
- if not chunks:
74
- raise ValueError("No text chunks created from PDF")
75
-
76
- print(f"Created total of {len(chunks)} chunks from PyPDFLoader method")
77
- print(f"First chunk preview: {chunks[0]['text'][:200]}...")
78
  return chunks
 
 
 
 
 
79
 
80
- except Exception as e:
81
- print(f"Error with PyPDFLoader: {str(e)}")
82
- print("Trying alternative PDF processing method...")
83
-
84
- # Fallback to direct pypdf usage
85
- try:
86
- print("Attempting to use pypdf directly...")
87
- with open(pdf_path, 'rb') as file:
88
- pdf = pypdf.PdfReader(file)
89
- print(f"Successfully opened PDF with {len(pdf.pages)} pages")
90
-
91
- # Debug: Print raw content from first few pages
92
- for i in range(min(2, len(pdf.pages))):
93
- print(f"\nPage {i+1} preview (first 200 chars):")
94
- print(pdf.pages[i].extract_text()[:200])
95
-
96
- chunks = []
97
- for page_num in range(len(pdf.pages)):
98
- text = pdf.pages[page_num].extract_text()
99
- if not text.strip():
100
- print(f"Warning: Empty content on page {page_num + 1}")
101
- continue
102
-
103
- # Debug: Print content length
104
- content = text.strip()
105
- print(f"Page {page_num + 1} content length: {len(content)} chars")
106
-
107
- try:
108
- page_chunks = self.text_splitter.split_text(content)
109
- print(f"Created {len(page_chunks)} chunks from page {page_num + 1}")
110
-
111
- for chunk in page_chunks:
112
- if chunk.strip(): # Only add non-empty chunks
113
- chunks.append({
114
- 'text': chunk,
115
- 'metadata': {'page': page_num + 1}
116
- })
117
- except Exception as chunk_error:
118
- print(f"Error splitting page {page_num + 1}: {str(chunk_error)}")
119
-
120
- if not chunks:
121
- raise ValueError("No text chunks created from PDF")
122
-
123
- print(f"Created total of {len(chunks)} chunks from direct pypdf method")
124
- print(f"First chunk preview: {chunks[0]['text'][:200]}...")
125
- return chunks
126
-
127
- except Exception as e2:
128
- error_msg = f"Failed to process PDF with both methods.\nPyPDFLoader error: {str(e)}\npypdf error: {str(e2)}"
129
- print(error_msg)
130
- raise Exception(error_msg)
 
1
  from typing import List, Dict
2
  import os
3
+ import subprocess
4
+ import tempfile
5
  import pypdf
6
  from langchain.document_loaders import PyPDFLoader
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
 
9
class PDFProcessor:
    """Extract text from a PDF and split it into overlapping chunks.

    Three extraction backends are tried in order of fidelity
    (PyPDFLoader, pypdf, pdftotext); the first one that yields any
    chunks wins. Each chunk is a dict: {'text': str,
    'metadata': {'page': int}}.
    """

    def __init__(self):
        # Small chunks with modest overlap; the granular separator list
        # lets the splitter fall back from paragraphs to sentences to
        # words when a paragraph exceeds chunk_size.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len,
            separators=["\n\n", "\n", ".", " ", ""]
        )

    def extract_text_with_pdftotext(self, pdf_path: str) -> str:
        """Use pdftotext (from poppler-utils) to extract text.

        Returns the whole document as one string (pdftotext separates
        pages with form-feed characters), or "" on any failure — this
        is a best-effort fallback and must never propagate an error.
        """
        try:
            result = subprocess.run(
                ['pdftotext', pdf_path, '-'],
                capture_output=True,
                text=True,
                check=True
            )
            return result.stdout
        except Exception as e:
            # Broad catch is deliberate: covers missing binary
            # (FileNotFoundError) and non-zero exit (CalledProcessError).
            print(f"pdftotext extraction failed: {str(e)}")
            return ""

    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """
        Process a PDF file and return chunks of text with metadata.

        Args:
            pdf_path: Path to the PDF file on disk.

        Returns:
            List of {'text': chunk, 'metadata': {'page': page_number}}.

        Raises:
            FileNotFoundError: If pdf_path does not exist.
            ValueError: If the file is implausibly small — most likely a
                git-lfs pointer file rather than the real PDF.
            Exception: If every extraction method fails (chained from
                the last underlying error).
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        file_size = os.path.getsize(pdf_path)
        print(f"PDF file exists, size: {file_size} bytes")

        if file_size < 1000:  # Less than 1KB
            raise ValueError(f"PDF file seems too small ({file_size} bytes). Might be corrupted or a pointer file.")

        # Backends in order of fidelity; first one to produce chunks wins.
        methods = [
            ("PyPDFLoader", self._try_pypdf_loader),
            ("pypdf", self._try_pypdf_direct),
            ("pdftotext", self._try_pdftotext)
        ]

        last_error = None
        for method_name, method in methods:
            try:
                print(f"\nTrying {method_name} method...")
                chunks = method(pdf_path)
                if chunks:
                    print(f"Successfully extracted {len(chunks)} chunks using {method_name}")
                    return chunks
            except Exception as e:
                print(f"Error with {method_name}: {str(e)}")
                last_error = e

        # Chain the last backend error so its traceback is preserved.
        raise Exception(f"All PDF processing methods failed. Last error: {str(last_error)}") from last_error

    def _append_page_chunks(self, content: str, page: int, chunks: List[Dict]) -> None:
        """Split one page's text and append its non-empty chunks to `chunks`."""
        for chunk in self.text_splitter.split_text(content):
            if chunk.strip():
                chunks.append({
                    'text': chunk,
                    'metadata': {'page': page}
                })

    def _try_pypdf_loader(self, pdf_path: str) -> List[Dict]:
        """Extract chunks via langchain's PyPDFLoader."""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        print(f"Loaded {len(pages)} pages")

        chunks = []
        for page in pages:
            content = page.page_content.strip()
            if content:
                self._append_page_chunks(content, page.metadata['page'], chunks)
        return chunks

    def _try_pypdf_direct(self, pdf_path: str) -> List[Dict]:
        """Extract chunks by reading the PDF directly with pypdf."""
        with open(pdf_path, 'rb') as file:
            pdf = pypdf.PdfReader(file)
            print(f"Opened PDF with {len(pdf.pages)} pages")

            chunks = []
            for page_num in range(len(pdf.pages)):
                content = pdf.pages[page_num].extract_text().strip()
                if content:
                    self._append_page_chunks(content, page_num + 1, chunks)
            return chunks

    def _try_pdftotext(self, pdf_path: str) -> List[Dict]:
        """Extract chunks via the pdftotext CLI (poppler-utils).

        pdftotext delimits pages with form-feed characters, so real
        page numbers are recovered here instead of labelling every
        chunk as page 1 (the previous behavior).
        """
        text = self.extract_text_with_pdftotext(pdf_path)
        if not text.strip():
            return []

        chunks = []
        for page_num, page_text in enumerate(text.split('\f'), start=1):
            content = page_text.strip()
            if content:
                self._append_page_chunks(content, page_num, chunks)
        return chunks