tony-42069 committed on
Commit
ae615e3
·
1 Parent(s): 204d47b

Improve PDF processing with more lenient text splitting and detailed debugging

Browse files
Files changed (1) hide show
  1. pdf_processor.py +47 -20
pdf_processor.py CHANGED
@@ -6,11 +6,12 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
 
7
  class PDFProcessor:
8
  def __init__(self):
 
9
  self.text_splitter = RecursiveCharacterTextSplitter(
10
- chunk_size=1000,
11
- chunk_overlap=200,
12
  length_function=len,
13
- separators=["\n\n", "\n", " ", ""]
14
  )
15
 
16
  def process_pdf(self, pdf_path: str) -> List[Dict]:
@@ -39,6 +40,11 @@ class PDFProcessor:
39
 
40
  if not pages:
41
  raise ValueError("No pages extracted from PDF")
 
 
 
 
 
42
 
43
  # Split the text into chunks
44
  chunks = []
@@ -46,15 +52,23 @@ class PDFProcessor:
46
  if not page.page_content.strip():
47
  print(f"Warning: Empty content on page {page.metadata.get('page', 'unknown')}")
48
  continue
49
-
50
- page_chunks = self.text_splitter.split_text(page.page_content)
51
- print(f"Created {len(page_chunks)} chunks from page {page.metadata.get('page', 'unknown')}")
52
 
53
- for chunk in page_chunks:
54
- chunks.append({
55
- 'text': chunk,
56
- 'metadata': {'page': page.metadata['page']}
57
- })
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  if not chunks:
60
  raise ValueError("No text chunks created from PDF")
@@ -73,22 +87,35 @@ class PDFProcessor:
73
  with open(pdf_path, 'rb') as file:
74
  pdf = pypdf.PdfReader(file)
75
  print(f"Successfully opened PDF with {len(pdf.pages)} pages")
76
- chunks = []
77
 
 
 
 
 
 
 
78
  for page_num in range(len(pdf.pages)):
79
  text = pdf.pages[page_num].extract_text()
80
  if not text.strip():
81
  print(f"Warning: Empty content on page {page_num + 1}")
82
  continue
83
-
84
- page_chunks = self.text_splitter.split_text(text)
85
- print(f"Created {len(page_chunks)} chunks from page {page_num + 1}")
86
 
87
- for chunk in page_chunks:
88
- chunks.append({
89
- 'text': chunk,
90
- 'metadata': {'page': page_num + 1}
91
- })
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  if not chunks:
94
  raise ValueError("No text chunks created from PDF")
 
6
 
7
  class PDFProcessor:
8
  def __init__(self):
9
+ # Adjust text splitter settings for more lenient chunking
10
  self.text_splitter = RecursiveCharacterTextSplitter(
11
+ chunk_size=500, # Smaller chunks
12
+ chunk_overlap=50, # Less overlap
13
  length_function=len,
14
+ separators=["\n\n", "\n", ".", " ", ""] # More granular separators
15
  )
16
 
17
  def process_pdf(self, pdf_path: str) -> List[Dict]:
 
40
 
41
  if not pages:
42
  raise ValueError("No pages extracted from PDF")
43
+
44
+ # Debug: Print raw content from first few pages
45
+ for i, page in enumerate(pages[:2]):
46
+ print(f"\nPage {i+1} preview (first 200 chars):")
47
+ print(page.page_content[:200])
48
 
49
  # Split the text into chunks
50
  chunks = []
 
52
  if not page.page_content.strip():
53
  print(f"Warning: Empty content on page {page.metadata.get('page', 'unknown')}")
54
  continue
 
 
 
55
 
56
+ # Debug: Print content length
57
+ content = page.page_content.strip()
58
+ print(f"Page {page.metadata.get('page', 'unknown')} content length: {len(content)} chars")
59
+
60
+ try:
61
+ page_chunks = self.text_splitter.split_text(content)
62
+ print(f"Created {len(page_chunks)} chunks from page {page.metadata.get('page', 'unknown')}")
63
+
64
+ for chunk in page_chunks:
65
+ if chunk.strip(): # Only add non-empty chunks
66
+ chunks.append({
67
+ 'text': chunk,
68
+ 'metadata': {'page': page.metadata['page']}
69
+ })
70
+ except Exception as chunk_error:
71
+ print(f"Error splitting page {page.metadata.get('page', 'unknown')}: {str(chunk_error)}")
72
 
73
  if not chunks:
74
  raise ValueError("No text chunks created from PDF")
 
87
  with open(pdf_path, 'rb') as file:
88
  pdf = pypdf.PdfReader(file)
89
  print(f"Successfully opened PDF with {len(pdf.pages)} pages")
 
90
 
91
+ # Debug: Print raw content from first few pages
92
+ for i in range(min(2, len(pdf.pages))):
93
+ print(f"\nPage {i+1} preview (first 200 chars):")
94
+ print(pdf.pages[i].extract_text()[:200])
95
+
96
+ chunks = []
97
  for page_num in range(len(pdf.pages)):
98
  text = pdf.pages[page_num].extract_text()
99
  if not text.strip():
100
  print(f"Warning: Empty content on page {page_num + 1}")
101
  continue
 
 
 
102
 
103
+ # Debug: Print content length
104
+ content = text.strip()
105
+ print(f"Page {page_num + 1} content length: {len(content)} chars")
106
+
107
+ try:
108
+ page_chunks = self.text_splitter.split_text(content)
109
+ print(f"Created {len(page_chunks)} chunks from page {page_num + 1}")
110
+
111
+ for chunk in page_chunks:
112
+ if chunk.strip(): # Only add non-empty chunks
113
+ chunks.append({
114
+ 'text': chunk,
115
+ 'metadata': {'page': page_num + 1}
116
+ })
117
+ except Exception as chunk_error:
118
+ print(f"Error splitting page {page_num + 1}: {str(chunk_error)}")
119
 
120
  if not chunks:
121
  raise ValueError("No text chunks created from PDF")