import io

import fitz  # PyMuPDF
import pdfplumber
import pytesseract
from PIL import Image


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the embedded text layer of a PDF.

    PyMuPDF is tried first. If it yields nothing but whitespace, the file
    is re-read with pdfplumber as a second opinion. This function does not
    run OCR: for image-only (scanned) PDFs, call
    ``extract_text_from_scanned_pdf`` instead.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        The text of all pages, one page after another separated by
        newlines, with leading and trailing whitespace removed. An empty
        string is returned when neither library finds any text.
    """
    # Try PyMuPDF first; close the document even if a page fails to parse.
    doc = fitz.open(pdf_path)
    try:
        page_texts = [page.get_text("text") for page in doc]
    finally:
        doc.close()

    text = "\n".join(page_texts)

    # If no text is extracted, fall back to pdfplumber.
    if not text.strip():
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may return None for a page without a text
            # layer; treat that as an empty page rather than crashing on
            # ``None + "\n"``.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        text = "\n".join(page_texts)

    return text.strip()


def extract_text_from_scanned_pdf(pdf_path: str) -> str:
    """Extract text from a scanned PDF using OCR.

    Every page is rendered to a PNG image with PyMuPDF and passed to
    Tesseract through ``pytesseract.image_to_string``.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        The recognised text of all pages, separated by newlines, with
        leading and trailing whitespace removed.
    """
    page_texts = []
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            # Render with PyMuPDF's default settings, as before.
            # NOTE(review): a higher render resolution usually improves OCR
            # accuracy - consider exposing it as an optional parameter.
            png_bytes = page.get_pixmap().tobytes("png")
            # The context manager releases the decoded image promptly
            # instead of holding one bitmap per page until GC runs.
            with Image.open(io.BytesIO(png_bytes)) as page_image:
                page_texts.append(pytesseract.image_to_string(page_image))
    finally:
        doc.close()

    return "\n".join(page_texts).strip()