# NOTE: the lines below are residue from the web page this file was copied
# from (not program text); they are commented out so the module can be parsed.
# Spaces:
# Running
# Running
# File size: 5,013 Bytes
# fda9a03 83d2c4f 2b19583 52020a7 83d2c4f 2b19583 a4ef596 2b19583 83d2c4f 2b19583 83d2c4f 2b19583 83d2c4f 2b19583 83d2c4f 2b19583 83d2c4f 2b19583 83d2c4f 2b19583 83d2c4f 2b19583 83d2c4f a4ef596 83d2c4f 2b19583 a4ef596 2b19583 83d2c4f fda9a03 2b19583
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
import gradio as gr
import tempfile, os
from pdf2image import convert_from_path
import pytesseract, pdfplumber, camelot
from PIL import Image, ImageOps
import ocrmypdf
import subprocess
def _format_table_rows(table):
    """Render a table (a list of rows of cells) as comma-separated lines.

    Cells that are ``None`` (or otherwise falsy) become empty strings, so a
    table containing blank cells never raises while being joined.
    """
    return "\n".join(
        ", ".join(str(cell) if cell else "" for cell in row) for row in table
    )


def _ocr_page_images(pdf_path):
    """Rasterize ``pdf_path`` at 300 dpi and return one Tesseract string per page."""
    page_texts = []
    for img in convert_from_path(pdf_path, dpi=300):
        # NOTE(review): the grayscale page is inverted before OCR, exactly as
        # the original code did; confirm this helps on the expected inputs.
        img = ImageOps.invert(img.convert("L"))
        page_texts.append(pytesseract.image_to_string(img, config="--psm 6"))
    return page_texts


def _extract_via_ocrmypdf(pdf_path, ocr_pdf_path):
    """Write a searchable copy of the PDF to ``ocr_pdf_path`` and return its text.

    Any exception raised by OCRmyPDF, pdfplumber or the direct-OCR step
    propagates to the caller, which switches to the fallback extraction.
    """
    # Step 1: use OCRmyPDF to create a searchable PDF.
    print("Processing PDF with OCRmyPDF...")
    ocrmypdf.ocr(
        pdf_path,
        ocr_pdf_path,
        deskew=True,
        clean=True,
        force_ocr=False,
        # Only OCR pages that need it. Without skip_text (or force_ocr /
        # redo_ocr) OCRmyPDF refuses input that already has a text layer,
        # which sent every digital PDF down the error fallback.
        skip_text=True,
        optimize=1,
    )

    # Step 2: extract text and tables from the searchable PDF.
    extracted = []
    print("Extracting text from OCR'd PDF...")
    with pdfplumber.open(ocr_pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text(layout=True)
            if text:
                extracted.append(f"--- Page {page_num} ---\n{text}")
            for table_num, table in enumerate(page.extract_tables(), start=1):
                if table:
                    extracted.append(
                        f"TABLE {table_num} (Page {page_num}):\n"
                        + _format_table_rows(table)
                    )

    # Step 3: Camelot is a best-effort extra; its failure must not discard
    # what pdfplumber already produced.
    try:
        camelot_tables = camelot.read_pdf(ocr_pdf_path, pages="all", flavor="lattice")
        for i, table in enumerate(camelot_tables, start=1):
            extracted.append(f"CAMELOT TABLE {i}:\n{table.df.to_csv(index=False)}")
    except Exception as e:
        print(f"Camelot extraction failed: {e}")

    combined_text = "\n\n".join(extracted).strip()

    # If there is still (almost) no text, OCR the page images directly.
    if len(combined_text) < 50:
        print("Fallback to direct OCR...")
        ocr_text = [
            f"--- Page {i} (Direct OCR) ---\n{page_text}"
            for i, page_text in enumerate(_ocr_page_images(pdf_path), start=1)
            if page_text.strip()
        ]
        # Keep the short text already found when direct OCR yields nothing.
        if ocr_text:
            combined_text = "\n\n".join(ocr_text)
    return combined_text


def _extract_without_ocrmypdf(pdf_path):
    """Extract text from the original PDF with pdfplumber, then OCR if too short."""
    extracted = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text(layout=True)
                if text:
                    extracted.append(text)
                for table in page.extract_tables():
                    extracted.append("TABLE:\n" + _format_table_rows(table))
    except Exception as e2:
        # Best effort: keep whatever was collected before the failure.
        print("pdfplumber error:", e2)

    combined = "\n".join(extracted).strip()
    # OCR fallback if the text is too short.
    if len(combined) < 100:
        combined += "".join(text + "\n" for text in _ocr_page_images(pdf_path))
    return combined


def extract_text_from_pdf(file):
    """Extract text and tables from an uploaded PDF.

    The PDF is first run through OCRmyPDF to obtain a searchable copy, which
    is then read with pdfplumber and Camelot. If any part of that fails, the
    original PDF is read with pdfplumber, and page images are OCR'd with
    Tesseract when too little text was found.

    Args:
        file: The uploaded PDF, either as a filesystem path (``str`` or
            ``os.PathLike``) or as an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        A 3-tuple ``(text, txt_path, pdf_path)``: the extracted text, the path
        of a UTF-8 ``.txt`` file containing it, and the path of the searchable
        PDF (or of the original PDF when OCRmyPDF failed). When ``file`` is
        ``None`` the tuple is ``("No PDF file was provided.", None, None)``.
    """
    if file is None:
        return "No PDF file was provided.", None, None

    pdf_path = file if isinstance(file, (str, os.PathLike)) else file.name

    # A private directory per call, so concurrent requests cannot overwrite
    # each other's output files.
    work_dir = tempfile.mkdtemp(prefix="pdf_ocr_")
    ocr_pdf_path = os.path.join(work_dir, "ocr_searchable.pdf")
    output_txt_path = os.path.join(work_dir, "extracted_text.txt")

    try:
        combined_text = _extract_via_ocrmypdf(pdf_path, ocr_pdf_path)
        result_pdf_path = ocr_pdf_path
    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}\n\nFalling back to original extraction methods..."
        print(error_msg)
        # The fallback starts from scratch, so partial results from the failed
        # attempt are not duplicated in the output.
        combined_text = _extract_without_ocrmypdf(pdf_path)
        result_pdf_path = pdf_path  # Return original PDF if OCR failed

    # Save the extracted text for download.
    with open(output_txt_path, "w", encoding="utf-8") as f:
        f.write(combined_text)
    return combined_text, output_txt_path, result_pdf_path
# Create the Gradio interface: one PDF upload in; extracted text, a .txt
# download and a PDF download out (matching the 3-tuple returned by
# extract_text_from_pdf).
# NOTE(review): the emoji in the labels were mis-encoded in the source this
# was recovered from. The upload/download glyphs could be restored exactly;
# the "Extracted Text" glyph was not recoverable and 📄 was chosen — confirm.
app = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(label="📤 Upload PDF", file_types=[".pdf"]),
    outputs=[
        gr.Textbox(label="📄 Extracted Text", lines=25, show_copy_button=True),
        gr.File(label="📥 Download Extracted Text (.txt)"),
        gr.File(label="📥 Download OCR'd Searchable PDF"),
    ],
    title="Advanced PDF OCR Extractor with OCRmyPDF",
    description="Upload a PDF to get: 1) Extracted text displayed and downloadable as .txt, 2) OCR'd searchable PDF download. Uses OCRmyPDF for superior OCR quality.",
    allow_flagging="never",
)

# Launch the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    app.launch()