tony-42069 committed on
Commit
0881f45
·
1 Parent(s): 8b012a5

Update PDF processing with fallback methods and add both PDF packages

Browse files
Files changed (2) hide show
  1. pdf_processor.py +41 -20
  2. requirements.txt +1 -0
pdf_processor.py CHANGED
@@ -1,4 +1,5 @@
1
  from typing import List, Dict
 
2
  from langchain.document_loaders import PyPDFLoader
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
 
@@ -19,24 +20,44 @@ class PDFProcessor:
19
  pdf_path (str): Path to the PDF file
20
 
21
  Returns:
22
- List[Dict]: List of dictionaries containing text chunks and metadata
23
  """
24
- # Load PDF
25
- loader = PyPDFLoader(pdf_path)
26
- pages = loader.load()
27
-
28
- # Split text into chunks
29
- chunks = self.text_splitter.split_documents(pages)
30
-
31
- # Format chunks with metadata
32
- processed_chunks = []
33
- for chunk in chunks:
34
- processed_chunks.append({
35
- 'text': chunk.page_content,
36
- 'metadata': {
37
- 'page': chunk.metadata.get('page', 0) + 1,
38
- 'source': pdf_path
39
- }
40
- })
41
-
42
- return processed_chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from typing import List, Dict
2
+ import pypdf
3
  from langchain.document_loaders import PyPDFLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
 
 
20
  pdf_path (str): Path to the PDF file
21
 
22
  Returns:
23
+ List[Dict]: List of text chunks with metadata
24
  """
25
+ try:
26
+ # Try using PyPDFLoader from langchain
27
+ loader = PyPDFLoader(pdf_path)
28
+ pages = loader.load()
29
+
30
+ # Split the text into chunks
31
+ chunks = []
32
+ for page in pages:
33
+ page_chunks = self.text_splitter.split_text(page.page_content)
34
+ for chunk in page_chunks:
35
+ chunks.append({
36
+ 'text': chunk,
37
+ 'metadata': {'page': page.metadata['page']}
38
+ })
39
+ return chunks
40
+
41
+ except Exception as e:
42
+ print(f"Error with PyPDFLoader: {str(e)}")
43
+ print("Trying alternative PDF processing method...")
44
+
45
+ # Fallback to direct pypdf usage
46
+ try:
47
+ with open(pdf_path, 'rb') as file:
48
+ pdf = pypdf.PdfReader(file)
49
+ chunks = []
50
+
51
+ for page_num in range(len(pdf.pages)):
52
+ text = pdf.pages[page_num].extract_text()
53
+ page_chunks = self.text_splitter.split_text(text)
54
+
55
+ for chunk in page_chunks:
56
+ chunks.append({
57
+ 'text': chunk,
58
+ 'metadata': {'page': page_num + 1}
59
+ })
60
+ return chunks
61
+
62
+ except Exception as e2:
63
+ raise Exception(f"Failed to process PDF with both methods. Error: {str(e2)}")
requirements.txt CHANGED
@@ -2,6 +2,7 @@ streamlit==1.29.0
2
  openai==1.6.1
3
  python-dotenv==1.0.0
4
  pypdf==3.17.1
 
5
  langchain==0.0.352
6
  chromadb==0.4.18
7
  pydantic==2.5.2
 
2
  openai==1.6.1
3
  python-dotenv==1.0.0
4
  pypdf==3.17.1
5
+ PyPDF2==3.0.1
6
  langchain==0.0.352
7
  chromadb==0.4.18
8
  pydantic==2.5.2