import os
import tempfile

import camelot
import gradio as gr
import ocrmypdf
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
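# Beyond the Python packages above, this pipeline assumes a few system tools are
# installed (exact package names vary by platform); this is a best-effort note,
# not an exhaustive requirements spec:
#   - Tesseract OCR : used by both OCRmyPDF and pytesseract
#   - Ghostscript   : used by OCRmyPDF and by Camelot's "lattice" flavor
#   - unpaper       : needed because ocrmypdf.ocr() is called with clean=True
#   - Poppler       : needed by pdf2image.convert_from_path()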

def extract_text_from_pdf(file):
    extracted = []
    # gr.File may pass a plain path string or an object exposing .name,
    # depending on the Gradio version
    pdf_path = file.name if hasattr(file, "name") else str(file)
    
    # Create temporary paths for OCR'd PDF and text output
    temp_dir = tempfile.gettempdir()
    ocr_pdf_path = os.path.join(temp_dir, "ocr_searchable.pdf")
    output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
    
    try:
        # Step 1: Use OCRmyPDF to create a searchable PDF
        print("Processing PDF with OCRmyPDF...")
        ocrmypdf.ocr(
            pdf_path,
            ocr_pdf_path,
            deskew=True,
            clean=True,        # requires the unpaper binary
            force_ocr=False,   # don't re-rasterize pages that already contain text
            skip_text=True,    # skip such pages instead of raising PriorOcrFoundError
            optimize=1,
        )
        
        # Step 2: Extract text from the OCR'd searchable PDF using pdfplumber
        print("Extracting text from OCR'd PDF...")
        with pdfplumber.open(ocr_pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                text = page.extract_text(layout=True)
                if text:
                    extracted.append(f"--- Page {page_num + 1} ---\n{text}")
                
                # Extract tables if any
                tables = page.extract_tables()
                for table_num, table in enumerate(tables):
                    if table:
                        table_text = f"TABLE {table_num + 1} (Page {page_num + 1}):\n"
                        table_text += "\n".join([", ".join([str(cell) if cell else "" for cell in row]) for row in table])
                        extracted.append(table_text)
        
        # Step 3: Try Camelot for additional table extraction
        try:
            tables = camelot.read_pdf(ocr_pdf_path, pages="all", flavor="lattice")
            for i, table in enumerate(tables):
                extracted.append(f"CAMELOT TABLE {i + 1}:\n{table.df.to_csv(index=False)}")
        except Exception as e:
            print(f"Camelot extraction failed: {e}")
        
        # Combine all extracted text
        combined_text = "\n\n".join(extracted).strip()
        
        # If still no text, fallback to direct OCR
        if len(combined_text) < 50:
            print("Fallback to direct OCR...")
            images = convert_from_path(pdf_path, dpi=300)
            ocr_text = []
            for i, img in enumerate(images):
                # Grayscale helps Tesseract; keep dark text on a light background
                # (inverting a normal scan tends to hurt recognition)
                img = img.convert("L")
                page_text = pytesseract.image_to_string(img, config="--psm 6")
                if page_text.strip():
                    ocr_text.append(f"--- Page {i + 1} (Direct OCR) ---\n{page_text}")
            combined_text = "\n\n".join(ocr_text)
        
        # Save the extracted text
        with open(output_txt_path, "w", encoding="utf-8") as f:
            f.write(combined_text)
        
        return combined_text, output_txt_path, ocr_pdf_path
        
    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}\n\nFalling back to original extraction methods..."
        print(error_msg)
        
        # Fallback to original method if OCRmyPDF fails
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text(layout=True)
                    if text:
                        extracted.append(text)
                    tables = page.extract_tables()
                    for table in tables:
                        extracted.append("TABLE:\n" + "\n".join([", ".join(row) for row in table]))
        except Exception as e2:
            print("pdfplumber error:", e2)

        # OCR fallback if text is too short
        combined = "\n".join(extracted).strip()
        if len(combined) < 100:
            images = convert_from_path(pdf_path, dpi=300)
            for img in images:
                img = img.convert("L")  # grayscale only; see note in the main OCR path
                combined += pytesseract.image_to_string(img, config="--psm 6") + "\n"

        # Save fallback output
        with open(output_txt_path, "w", encoding="utf-8") as f:
            f.write(combined)

        return combined, output_txt_path, pdf_path  # Return original PDF if OCR failed

# Create Gradio interface
app = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(label="πŸ“€ Upload PDF", file_types=[".pdf"]),
    outputs=[
        gr.Textbox(label="πŸ“„ Extracted Text", lines=25, show_copy_button=True),
        gr.File(label="πŸ“₯ Download Extracted Text (.txt)"),
        gr.File(label="πŸ“₯ Download OCR'd Searchable PDF")
    ],
    title="Advanced PDF OCR Extractor with OCRmyPDF",
    description="Upload a PDF to get: 1) Extracted text displayed and downloadable as .txt, 2) OCR'd searchable PDF download. Uses OCRmyPDF for superior OCR quality.",
    allow_flagging="never",
)

if __name__ == "__main__":
    app.launch()
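
# To try the app locally (assuming the system dependencies noted at the top are
# installed and this file is saved as, say, app.py):
#   python app.py
# Gradio then prints a local URL (typically http://127.0.0.1:7860) to open in a browser.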