Spaces:
Runtime error
Runtime error
import fitz # PyMuPDF | |
import pdfplumber | |
import pytesseract | |
from PIL import Image | |
import io | |
def extract_text_from_pdf(pdf_path):
    """Extract the embedded text of a PDF, trying PyMuPDF then pdfplumber.

    PyMuPDF is tried first. If it yields only whitespace, the file is
    re-read with pdfplumber. This function performs no OCR itself; for
    image-only (scanned) PDFs it may return an empty string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, one page per newline-terminated chunk,
        with leading and trailing whitespace stripped.
    """
    # Try PyMuPDF first. The context manager closes the document even if
    # reading a page raises.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text("text") + "\n" for page in doc)

    # If no text is extracted, use PDFPlumber.
    if not text.strip():
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() can return None for a page without a text
            # layer (seen in older pdfplumber releases); treat it as empty
            # instead of failing on None + str.
            text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )

    return text.strip()
def extract_text_from_scanned_pdf(pdf_path):
    """Extract text from scanned PDFs using OCR.

    Each page is rendered to a PNG image with PyMuPDF, loaded into PIL,
    and passed to Tesseract through pytesseract.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The OCR output of all pages joined by newlines, with leading and
        trailing whitespace stripped.
    """
    page_texts = []
    # The context manager closes the document even if rendering or OCR
    # raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pixmap = page.get_pixmap()
            png_bytes = pixmap.tobytes("png")
            # Release the decoded image as soon as OCR for this page is done.
            with Image.open(io.BytesIO(png_bytes)) as page_image:
                page_texts.append(pytesseract.image_to_string(page_image))
    return "\n".join(page_texts).strip()