import os
import tempfile

import camelot
import gradio as gr
import ocrmypdf
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
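# Beyond the Python packages above, this pipeline assumes a few system tools are
# installed (exact package names vary by platform); this is a best-effort note,
# not an exhaustive requirements spec:
#   - Tesseract OCR : used by both OCRmyPDF and pytesseract
#   - Ghostscript   : used by OCRmyPDF and by Camelot's "lattice" flavor
#   - unpaper       : needed because ocrmypdf.ocr() is called with clean=True
#   - Poppler       : needed by pdf2image.convert_from_path()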

def extract_text_from_pdf(file):
    extracted = []
    # gr.File may pass a plain path string or an object exposing .name,
    # depending on the Gradio version
    pdf_path = file.name if hasattr(file, "name") else str(file)
    
    # Create temporary paths for OCR'd PDF and text output
    temp_dir = tempfile.gettempdir()
    ocr_pdf_path = os.path.join(temp_dir, "ocr_searchable.pdf")
    output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
    
    try:
        # Step 1: Use OCRmyPDF to create a searchable PDF
        print("Processing PDF with OCRmyPDF...")
        ocrmypdf.ocr(
            pdf_path,
            ocr_pdf_path,
            deskew=True,
            clean=True,        # requires the unpaper binary
            force_ocr=False,   # don't re-rasterize pages that already contain text
            skip_text=True,    # skip such pages instead of raising PriorOcrFoundError
            optimize=1,
        )
        
        # Step 2: Extract text from the OCR'd searchable PDF using pdfplumber
        print("Extracting text from OCR'd PDF...")
        with pdfplumber.open(ocr_pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                text = page.extract_text(layout=True)
                if text:
                    extracted.append(f"--- Page {page_num + 1} ---\n{text}")
                
                # Extract tables if any
                tables = page.extract_tables()
                for table_num, table in enumerate(tables):
                    if table:
                        table_text = f"TABLE {table_num + 1} (Page {page_num + 1}):\n"
                        table_text += "\n".join([", ".join([str(cell) if cell else "" for cell in row]) for row in table])
                        extracted.append(table_text)
        
        # Step 3: Try Camelot for additional table extraction
        try:
            tables = camelot.read_pdf(ocr_pdf_path, pages="all", flavor="lattice")
            for i, table in enumerate(tables):
                extracted.append(f"CAMELOT TABLE {i + 1}:\n{table.df.to_csv(index=False)}")
        except Exception as e:
            print(f"Camelot extraction failed: {e}")
        
        # Combine all extracted text
        combined_text = "\n\n".join(extracted).strip()
        
        # If still no text, fallback to direct OCR
        if len(combined_text) < 50:
            print("Fallback to direct OCR...")
            images = convert_from_path(pdf_path, dpi=300)
            ocr_text = []
            for i, img in enumerate(images):
                # Grayscale helps Tesseract; keep dark text on a light background
                # (inverting a normal scan tends to hurt recognition)
                img = img.convert("L")
                page_text = pytesseract.image_to_string(img, config="--psm 6")
                if page_text.strip():
                    ocr_text.append(f"--- Page {i + 1} (Direct OCR) ---\n{page_text}")
            combined_text = "\n\n".join(ocr_text)
        
        # Save the extracted text
        with open(output_txt_path, "w", encoding="utf-8") as f:
            f.write(combined_text)
        
        return combined_text, output_txt_path, ocr_pdf_path
        
    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}\n\nFalling back to original extraction methods..."
        print(error_msg)
        
        # Fallback to original method if OCRmyPDF fails
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text(layout=True)
                    if text:
                        extracted.append(text)
                    tables = page.extract_tables()
                    for table in tables:
                        extracted.append("TABLE:\n" + "\n".join([", ".join(row) for row in table]))
        except Exception as e2:
            print("pdfplumber error:", e2)

        # OCR fallback if text is too short
        combined = "\n".join(extracted).strip()
        if len(combined) < 100:
            images = convert_from_path(pdf_path, dpi=300)
            for img in images:
                img = img.convert("L")  # grayscale only; see note in the main OCR path
                combined += pytesseract.image_to_string(img, config="--psm 6") + "\n"

        # Save fallback output
        with open(output_txt_path, "w", encoding="utf-8") as f:
            f.write(combined)

        return combined, output_txt_path, pdf_path  # Return original PDF if OCR failed

# Create Gradio interface
app = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(label="πŸ“€ Upload PDF", file_types=[".pdf"]),
    outputs=[
        gr.Textbox(label="πŸ“„ Extracted Text", lines=25, show_copy_button=True),
        gr.File(label="πŸ“₯ Download Extracted Text (.txt)"),
        gr.File(label="πŸ“₯ Download OCR'd Searchable PDF")
    ],
    title="Advanced PDF OCR Extractor with OCRmyPDF",
    description="Upload a PDF to get: 1) Extracted text displayed and downloadable as .txt, 2) OCR'd searchable PDF download. Uses OCRmyPDF for superior OCR quality.",
    allow_flagging="never",
)

if __name__ == "__main__":
    app.launch()
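
# To try the app locally (assuming the system dependencies noted at the top are
# installed and this file is saved as, say, app.py):
#   python app.py
# Gradio then prints a local URL (typically http://127.0.0.1:7860) to open in a browser.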