Commit e0eceb3 · 1 parent: ae615e3

Major overhaul of PDF processing and Docker build

Files changed:
- Dockerfile (+11 -9)
- pdf_processor.py (+86 -93)
Dockerfile
CHANGED

```diff
@@ -2,27 +2,29 @@ FROM python:3.10-slim
 
 WORKDIR /home/user/app
 
-# Install git and git-lfs
+# Install git-lfs and other dependencies
 RUN apt-get update && \
-    apt-get install -y git git-lfs && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get install -y git git-lfs poppler-utils && \
+    rm -rf /var/lib/apt/lists/* && \
+    git lfs install
 
 # Copy requirements first for better caching
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 
+# Initialize git-lfs and copy the application
+COPY .gitattributes .
+COPY Dataset/Commercial\ Lending\ 101.pdf Dataset/
+RUN ls -la Dataset && \
+    stat Dataset/Commercial\ Lending\ 101.pdf
+
 # Copy the rest of the application
 COPY . .
 
-# Explicitly copy and verify the PDF file
-COPY Dataset/Commercial\ Lending\ 101.pdf /home/user/app/Dataset/
-RUN ls -l /home/user/app/Dataset/Commercial\ Lending\ 101.pdf && \
-    echo "PDF file size: $(stat -f%z /home/user/app/Dataset/Commercial\ Lending\ 101.pdf) bytes"
-
 # Make port configurable via environment variable
 ENV PORT=8501
 
 EXPOSE ${PORT}
 
 # Use the correct path to app.py and make port configurable
+CMD ["streamlit", "run", "app.py", "--server.port=${PORT}", "--server.address=0.0.0.0"]
```
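The `git lfs install` step and the `.gitattributes` copy matter because the PDF is stored in Git LFS: without the LFS hooks, a checkout contains a small text pointer file rather than the real binary, which is exactly the failure mode the size check added to pdf_processor.py below guards against. As a minimal sketch of that idea (the `looks_like_lfs_pointer` helper is illustrative, not part of this repo), a pointer can be recognized by its spec header:

```python
import os

LFS_POINTER_HEADER = b"version https://git-lfs.github.com/spec/v1"

def looks_like_lfs_pointer(path: str) -> bool:
    """Heuristic check: LFS pointers are tiny text files starting with the spec header."""
    if os.path.getsize(path) > 1024:
        return False  # real PDFs are orders of magnitude larger than a pointer
    with open(path, "rb") as f:
        return f.read(len(LFS_POINTER_HEADER)) == LFS_POINTER_HEADER
```

Two asides on the dropped and added lines: the removed `stat -f%z` is BSD/macOS syntax, while GNU coreutils in python:3.10-slim expects `stat -c%s`, so the old size echo would have failed at build time and the plain `stat` call sidesteps that. Also, exec-form (JSON array) CMD does not expand environment variables, so `--server.port=${PORT}` reaches Streamlit as a literal string; shell form (`CMD streamlit run app.py --server.port=$PORT --server.address=0.0.0.0`) would be needed for the substitution to actually happen.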
pdf_processor.py
CHANGED

```diff
@@ -1,19 +1,34 @@
 from typing import List, Dict
 import os
+import subprocess
+import tempfile
 import pypdf
 from langchain.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 class PDFProcessor:
     def __init__(self):
-        # Adjust text splitter settings for more lenient chunking
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=500,
             chunk_overlap=50,
             length_function=len,
             separators=["\n\n", "\n", ".", " ", ""]
         )
 
+    def extract_text_with_pdftotext(self, pdf_path: str) -> str:
+        """Use pdftotext (from poppler-utils) to extract text."""
+        try:
+            result = subprocess.run(
+                ['pdftotext', pdf_path, '-'],
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            return result.stdout
+        except Exception as e:
+            print(f"pdftotext extraction failed: {str(e)}")
+            return ""
+
     def process_pdf(self, pdf_path: str) -> List[Dict]:
         """
         Process a PDF file and return chunks of text with metadata.
```
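A note on the helper added above: passing `-` as the output file makes `pdftotext` write to stdout, so `result.stdout` holds the whole document's text in one string, with pagination lost. (The added `import tempfile` is not referenced anywhere in the new code.) If per-page text were ever needed from this path, poppler's `-f`/`-l` flags select a page range; a hedged sketch, not part of this commit:

```python
import subprocess

def pdftotext_page(pdf_path: str, page: int) -> str:
    """Extract text for a single page via pdftotext; -f/-l bound the page range."""
    result = subprocess.run(
        ["pdftotext", "-f", str(page), "-l", str(page), pdf_path, "-"],
        capture_output=True,
        text=True,
        check=True,
    )
    return result.stdout
```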
```diff
@@ -29,102 +44,80 @@
         if not os.path.exists(pdf_path):
             raise FileNotFoundError(f"PDF file not found at {pdf_path}")
 
-        try:
-            # First try PyPDFLoader
-            loader = PyPDFLoader(pdf_path)
-            pages = loader.load()
-
-            # Debug: Print raw content from first few pages
-            for i, page in enumerate(pages[:2]):
-                print(f"\nPage {i+1} preview (first 200 chars):")
-                print(page.page_content[:200])
-
-            # Split the text into chunks
-            chunks = []
-            for page in pages:
-                if not page.page_content.strip():
-                    continue
-
-                # Debug: Print content length
-                content = page.page_content.strip()
-                print(f"Page {page.metadata.get('page', 'unknown')} content length: {len(content)} chars")
-
-                try:
-                    page_chunks = self.text_splitter.split_text(content)
-                    print(f"Created {len(page_chunks)} chunks from page {page.metadata.get('page', 'unknown')}")
-
-                    for chunk in page_chunks:
-                        if chunk.strip():
-                            chunks.append({
-                                'text': chunk,
-                                'metadata': {'page': page.metadata.get('page', 'unknown')}
-                            })
-                except Exception as chunk_error:
-                    print(f"Error splitting page {page.metadata.get('page', 'unknown')}: {str(chunk_error)}")
-
-            if not chunks:
-                raise ValueError("No text chunks created from PDF")
-
-            print(f"Created total of {len(chunks)} chunks from PyPDFLoader method")
-            print(f"First chunk preview: {chunks[0]['text'][:200]}...")
-            return chunks
-
-        except Exception as e:
-            # Fall back to reading the PDF directly with pypdf
-            try:
-                with open(pdf_path, 'rb') as file:
-                    pdf = pypdf.PdfReader(file)
-                    print(f"Successfully opened PDF with {len(pdf.pages)} pages")
-
-                    # Debug: Print raw content from first few pages
-                    for i in range(min(2, len(pdf.pages))):
-                        print(f"\nPage {i+1} preview (first 200 chars):")
-                        print(pdf.pages[i].extract_text()[:200])
-
-                    chunks = []
-                    for page_num in range(len(pdf.pages)):
-                        text = pdf.pages[page_num].extract_text()
-                        if not text.strip():
-                            print(f"Warning: Empty content on page {page_num + 1}")
-                            continue
-
-                        # Debug: Print content length
-                        content = text.strip()
-                        print(f"Page {page_num + 1} content length: {len(content)} chars")
-
-                        try:
-                            page_chunks = self.text_splitter.split_text(content)
-                            print(f"Created {len(page_chunks)} chunks from page {page_num + 1}")
-
-                            for chunk in page_chunks:
-                                if chunk.strip():  # Only add non-empty chunks
-                                    chunks.append({
-                                        'text': chunk,
-                                        'metadata': {'page': page_num + 1}
-                                    })
-                        except Exception as chunk_error:
-                            print(f"Error splitting page {page_num + 1}: {str(chunk_error)}")
-
-                    if not chunks:
-                        raise ValueError("No text chunks created from PDF")
-
-                    print(f"Created total of {len(chunks)} chunks from direct pypdf method")
-                    print(f"First chunk preview: {chunks[0]['text'][:200]}...")
-                    return chunks
-
-            except Exception as e2:
-                error_msg = f"Failed to process PDF with both methods.\nPyPDFLoader error: {str(e)}\npypdf error: {str(e2)}"
-                print(error_msg)
-                raise Exception(error_msg)
+        file_size = os.path.getsize(pdf_path)
+        print(f"PDF file exists, size: {file_size} bytes")
+
+        if file_size < 1000:  # Less than 1KB
+            raise ValueError(f"PDF file seems too small ({file_size} bytes). Might be corrupted or a pointer file.")
+
+        # Try all three methods
+        methods = [
+            ("PyPDFLoader", self._try_pypdf_loader),
+            ("pypdf", self._try_pypdf_direct),
+            ("pdftotext", self._try_pdftotext)
+        ]
+
+        last_error = None
+        for method_name, method in methods:
+            try:
+                print(f"\nTrying {method_name} method...")
+                chunks = method(pdf_path)
+                if chunks:
+                    print(f"Successfully extracted {len(chunks)} chunks using {method_name}")
+                    return chunks
+            except Exception as e:
+                print(f"Error with {method_name}: {str(e)}")
+                last_error = e
+
+        raise Exception(f"All PDF processing methods failed. Last error: {str(last_error)}")
+
+    def _try_pypdf_loader(self, pdf_path: str) -> List[Dict]:
+        loader = PyPDFLoader(pdf_path)
+        pages = loader.load()
+        print(f"Loaded {len(pages)} pages")
+
+        chunks = []
+        for page in pages:
+            content = page.page_content.strip()
+            if content:
+                page_chunks = self.text_splitter.split_text(content)
+                for chunk in page_chunks:
+                    if chunk.strip():
+                        chunks.append({
+                            'text': chunk,
+                            'metadata': {'page': page.metadata['page']}
+                        })
+        return chunks
+
+    def _try_pypdf_direct(self, pdf_path: str) -> List[Dict]:
+        with open(pdf_path, 'rb') as file:
+            pdf = pypdf.PdfReader(file)
+            print(f"Opened PDF with {len(pdf.pages)} pages")
+
+            chunks = []
+            for page_num in range(len(pdf.pages)):
+                content = pdf.pages[page_num].extract_text().strip()
+                if content:
+                    page_chunks = self.text_splitter.split_text(content)
+                    for chunk in page_chunks:
+                        if chunk.strip():
+                            chunks.append({
+                                'text': chunk,
+                                'metadata': {'page': page_num + 1}
+                            })
+            return chunks
+
+    def _try_pdftotext(self, pdf_path: str) -> List[Dict]:
+        text = self.extract_text_with_pdftotext(pdf_path)
+        if not text.strip():
+            return []
+
+        chunks = []
+        page_chunks = self.text_splitter.split_text(text)
+        for i, chunk in enumerate(page_chunks):
+            if chunk.strip():
+                chunks.append({
+                    'text': chunk,
+                    'metadata': {'page': 1}  # Page info not available with this method
+                })
+        return chunks
```
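The splitter configuration carried over in `__init__` determines what all three `_try_*` methods return: with `chunk_size=500` and `chunk_overlap=50`, text breaks preferentially at paragraph boundaries, then newlines, then sentence ends, before falling back to words and characters. A quick sketch of that behavior, assuming the langchain version the Space pins exposes the same `split_text` API:

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ".", " ", ""],
)

# Repeated sentences force several chunks; each stays under 500 characters,
# and consecutive chunks share up to ~50 characters of overlap.
sample = "Commercial lending covers loans made to businesses. " * 30
chunks = splitter.split_text(sample)
print(len(chunks), max(len(c) for c in chunks))
```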
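End to end, the Streamlit app launched by the new CMD would consume the processor roughly as below. This is a usage sketch, since app.py is not touched by this commit; the path and the chunk shape come from the diffs above:

```python
from pdf_processor import PDFProcessor

processor = PDFProcessor()
chunks = processor.process_pdf("Dataset/Commercial Lending 101.pdf")

# Each chunk is {'text': str, 'metadata': {'page': int}}; pdftotext-derived
# chunks all report page 1 because that fallback loses pagination.
print(f"Got {len(chunks)} chunks; first from page {chunks[0]['metadata']['page']}")
```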