Spaces:
Runtime error
Runtime error
File size: 920 Bytes
98d9183 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import fitz # PyMuPDF
import pdfplumber
import pytesseract
from PIL import Image
import io
def extract_text_from_pdf(pdf_path):
    """Extract the embedded text layer of a PDF.

    PyMuPDF is tried first. If it yields nothing but whitespace, pdfplumber
    is used as a fallback. No OCR is performed here, so a PDF that contains
    only page images yields an empty string; use
    ``extract_text_from_scanned_pdf`` for those.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, one newline after each page, with leading
        and trailing whitespace stripped.
    """
    # The context manager closes the document even if a page raises.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text("text") + "\n" for page in doc)

    # If no text is extracted, use pdfplumber.
    if not text.strip():
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may return None for a page without text;
            # substitute "" so the concatenation cannot raise TypeError.
            text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )

    return text.strip()
def extract_text_from_scanned_pdf(pdf_path):
    """Extract text from a scanned PDF by running OCR on every page.

    Each page is rendered to a PNG image with PyMuPDF, loaded with PIL and
    passed to Tesseract through ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The recognised text of all pages separated by newlines, with
        leading and trailing whitespace stripped.
    """
    page_texts = []
    # The context manager closes the document even if rendering or OCR fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # NOTE(review): the page is rendered with get_pixmap() defaults;
            # a higher resolution may improve OCR accuracy -- confirm.
            pixmap = page.get_pixmap()
            png_bytes = pixmap.tobytes("png")
            # Close the PIL image as soon as OCR for this page is done.
            with Image.open(io.BytesIO(png_bytes)) as page_image:
                page_texts.append(pytesseract.image_to_string(page_image))

    # Joining with "\n" and stripping gives the same result as appending
    # "\n" after every page and stripping the trailing one.
    return "\n".join(page_texts).strip()
|