File size: 920 Bytes
98d9183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import fitz  # PyMuPDF
import pdfplumber
import pytesseract
from PIL import Image
import io

def extract_text_from_pdf(pdf_path):
    """Extract the embedded text layer of a PDF.

    PyMuPDF is tried first. If it yields nothing but whitespace, pdfplumber
    is used as a second attempt. No OCR is performed by this function, so a
    PDF that contains only page images will typically produce an empty string.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``fitz.open`` and
            ``pdfplumber.open``.

    Returns:
        The text of all pages, each page followed by a newline, with leading
        and trailing whitespace stripped from the combined result.
    """
    # The context manager closes the document (and its file handle) even if
    # a page raises while being read.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text("text") + "\n" for page in doc)

    if text.strip():
        return text.strip()

    # Second attempt: PyMuPDF found no text, so try pdfplumber.
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page without text (older
        # pdfplumber releases); treat that as an empty page instead of
        # failing on None + str.
        text += "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

    return text.strip()

def extract_text_from_scanned_pdf(pdf_path, dpi=None):
    """Extract text from a PDF by rendering each page and running OCR on it.

    Every page is rasterized with PyMuPDF, encoded as PNG, loaded with
    Pillow and passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``fitz.open``.
        dpi: Optional rendering resolution. When ``None`` (the default) the
            page is rendered with PyMuPDF's default settings, exactly as
            before this parameter existed. A higher value such as 300
            usually gives the OCR engine more detail to work with.

    Returns:
        The OCR output of all pages, each page followed by a newline, with
        leading and trailing whitespace stripped from the combined result.

    Raises:
        ValueError: If ``dpi`` is given but is not a positive number.
    """
    if dpi is not None and dpi <= 0:
        raise ValueError("dpi must be a positive number")

    render_kwargs = {}
    if dpi is not None:
        # PDF user space is 72 units per inch, so dpi / 72 is the zoom
        # factor that yields the requested resolution.
        zoom = dpi / 72
        render_kwargs["matrix"] = fitz.Matrix(zoom, zoom)

    page_texts = []
    # The context manager closes the document even if rendering or OCR fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            png_bytes = page.get_pixmap(**render_kwargs).tobytes("png")
            # Close the decoded image as soon as OCR for this page is done.
            with Image.open(io.BytesIO(png_bytes)) as page_image:
                page_texts.append(pytesseract.image_to_string(page_image))

    return "".join(page_text + "\n" for page_text in page_texts).strip()