genaibeauty committed on
Commit
98d9183
·
verified ·
1 Parent(s): 42f6bbe

Create pdf_processing.py

Browse files
Files changed (1) hide show
  1. pdf_processing.py +35 -0
pdf_processing.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import pdfplumber
3
+ import pytesseract
4
+ from PIL import Image
5
+ import io
6
+
7
def extract_text_from_pdf(pdf_path):
    """Extract the embedded text layer of a PDF.

    PyMuPDF is tried first; if it yields nothing but whitespace, the file is
    re-read with pdfplumber. No OCR is performed here, so an image-only
    (scanned) PDF produces an empty string; use
    ``extract_text_from_scanned_pdf`` for those.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, one page per line group separated by
        newlines, with leading/trailing whitespace stripped. Empty string
        when neither library finds any text.
    """
    # Try PyMuPDF first. The context manager closes the document (and its
    # underlying file handle) even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)

    # If no text is extracted, use pdfplumber.
    if not text.strip():
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may return None for a page without characters
            # (older pdfplumber releases); treat that as an empty page
            # instead of failing on `None + str`.
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)

    return text.strip()
23
+
24
def extract_text_from_scanned_pdf(pdf_path, dpi=300):
    """Extract text from a scanned PDF by rendering each page and running OCR.

    Every page is rasterised with PyMuPDF, encoded as PNG, and passed to
    Tesseract through ``pytesseract``.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Resolution used to render each page before OCR. PyMuPDF's
            default of 72 DPI is usually too coarse for Tesseract, so a
            higher value is used by default; pass ``72`` to get the
            library-default rendering.

    Returns:
        The recognised text of all pages joined by newlines, with
        leading/trailing whitespace stripped.
    """
    # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
    zoom = dpi / 72
    render_matrix = fitz.Matrix(zoom, zoom)

    page_texts = []
    # The context managers release the document and each decoded image even
    # if rendering or OCR raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pixmap = page.get_pixmap(matrix=render_matrix)
            png_bytes = pixmap.tobytes("png")
            with Image.open(io.BytesIO(png_bytes)) as page_image:
                page_texts.append(pytesseract.image_to_string(page_image))

    return "\n".join(page_texts).strip()