Spaces:
Sleeping
Sleeping
Commit
·
ae615e3
1
Parent(s):
204d47b
Improve PDF processing with more lenient text splitting and detailed debugging
Browse files- pdf_processor.py +47 -20
pdf_processor.py
CHANGED
@@ -6,11 +6,12 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
6 |
|
7 |
class PDFProcessor:
|
8 |
def __init__(self):
|
|
|
9 |
self.text_splitter = RecursiveCharacterTextSplitter(
|
10 |
-
chunk_size=
|
11 |
-
chunk_overlap=
|
12 |
length_function=len,
|
13 |
-
separators=["\n\n", "\n", " ", ""]
|
14 |
)
|
15 |
|
16 |
def process_pdf(self, pdf_path: str) -> List[Dict]:
|
@@ -39,6 +40,11 @@ class PDFProcessor:
|
|
39 |
|
40 |
if not pages:
|
41 |
raise ValueError("No pages extracted from PDF")
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
# Split the text into chunks
|
44 |
chunks = []
|
@@ -46,15 +52,23 @@ class PDFProcessor:
|
|
46 |
if not page.page_content.strip():
|
47 |
print(f"Warning: Empty content on page {page.metadata.get('page', 'unknown')}")
|
48 |
continue
|
49 |
-
|
50 |
-
page_chunks = self.text_splitter.split_text(page.page_content)
|
51 |
-
print(f"Created {len(page_chunks)} chunks from page {page.metadata.get('page', 'unknown')}")
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
if not chunks:
|
60 |
raise ValueError("No text chunks created from PDF")
|
@@ -73,22 +87,35 @@ class PDFProcessor:
|
|
73 |
with open(pdf_path, 'rb') as file:
|
74 |
pdf = pypdf.PdfReader(file)
|
75 |
print(f"Successfully opened PDF with {len(pdf.pages)} pages")
|
76 |
-
chunks = []
|
77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
for page_num in range(len(pdf.pages)):
|
79 |
text = pdf.pages[page_num].extract_text()
|
80 |
if not text.strip():
|
81 |
print(f"Warning: Empty content on page {page_num + 1}")
|
82 |
continue
|
83 |
-
|
84 |
-
page_chunks = self.text_splitter.split_text(text)
|
85 |
-
print(f"Created {len(page_chunks)} chunks from page {page_num + 1}")
|
86 |
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
if not chunks:
|
94 |
raise ValueError("No text chunks created from PDF")
|
|
|
6 |
|
7 |
class PDFProcessor:
|
8 |
def __init__(self):
|
9 |
+
# Adjust text splitter settings for more lenient chunking
|
10 |
self.text_splitter = RecursiveCharacterTextSplitter(
|
11 |
+
chunk_size=500, # Smaller chunks
|
12 |
+
chunk_overlap=50, # Less overlap
|
13 |
length_function=len,
|
14 |
+
separators=["\n\n", "\n", ".", " ", ""] # More granular separators
|
15 |
)
|
16 |
|
17 |
def process_pdf(self, pdf_path: str) -> List[Dict]:
|
|
|
40 |
|
41 |
if not pages:
|
42 |
raise ValueError("No pages extracted from PDF")
|
43 |
+
|
44 |
+
# Debug: Print raw content from first few pages
|
45 |
+
for i, page in enumerate(pages[:2]):
|
46 |
+
print(f"\nPage {i+1} preview (first 200 chars):")
|
47 |
+
print(page.page_content[:200])
|
48 |
|
49 |
# Split the text into chunks
|
50 |
chunks = []
|
|
|
52 |
if not page.page_content.strip():
|
53 |
print(f"Warning: Empty content on page {page.metadata.get('page', 'unknown')}")
|
54 |
continue
|
|
|
|
|
|
|
55 |
|
56 |
+
# Debug: Print content length
|
57 |
+
content = page.page_content.strip()
|
58 |
+
print(f"Page {page.metadata.get('page', 'unknown')} content length: {len(content)} chars")
|
59 |
+
|
60 |
+
try:
|
61 |
+
page_chunks = self.text_splitter.split_text(content)
|
62 |
+
print(f"Created {len(page_chunks)} chunks from page {page.metadata.get('page', 'unknown')}")
|
63 |
+
|
64 |
+
for chunk in page_chunks:
|
65 |
+
if chunk.strip(): # Only add non-empty chunks
|
66 |
+
chunks.append({
|
67 |
+
'text': chunk,
|
68 |
+
'metadata': {'page': page.metadata['page']}
|
69 |
+
})
|
70 |
+
except Exception as chunk_error:
|
71 |
+
print(f"Error splitting page {page.metadata.get('page', 'unknown')}: {str(chunk_error)}")
|
72 |
|
73 |
if not chunks:
|
74 |
raise ValueError("No text chunks created from PDF")
|
|
|
87 |
with open(pdf_path, 'rb') as file:
|
88 |
pdf = pypdf.PdfReader(file)
|
89 |
print(f"Successfully opened PDF with {len(pdf.pages)} pages")
|
|
|
90 |
|
91 |
+
# Debug: Print raw content from first few pages
|
92 |
+
for i in range(min(2, len(pdf.pages))):
|
93 |
+
print(f"\nPage {i+1} preview (first 200 chars):")
|
94 |
+
print(pdf.pages[i].extract_text()[:200])
|
95 |
+
|
96 |
+
chunks = []
|
97 |
for page_num in range(len(pdf.pages)):
|
98 |
text = pdf.pages[page_num].extract_text()
|
99 |
if not text.strip():
|
100 |
print(f"Warning: Empty content on page {page_num + 1}")
|
101 |
continue
|
|
|
|
|
|
|
102 |
|
103 |
+
# Debug: Print content length
|
104 |
+
content = text.strip()
|
105 |
+
print(f"Page {page_num + 1} content length: {len(content)} chars")
|
106 |
+
|
107 |
+
try:
|
108 |
+
page_chunks = self.text_splitter.split_text(content)
|
109 |
+
print(f"Created {len(page_chunks)} chunks from page {page_num + 1}")
|
110 |
+
|
111 |
+
for chunk in page_chunks:
|
112 |
+
if chunk.strip(): # Only add non-empty chunks
|
113 |
+
chunks.append({
|
114 |
+
'text': chunk,
|
115 |
+
'metadata': {'page': page_num + 1}
|
116 |
+
})
|
117 |
+
except Exception as chunk_error:
|
118 |
+
print(f"Error splitting page {page_num + 1}: {str(chunk_error)}")
|
119 |
|
120 |
if not chunks:
|
121 |
raise ValueError("No text chunks created from PDF")
|