# Spaces: Running (hosting-page header text captured with the source; not part of the program)
import gradio as gr | |
import tempfile, os | |
from pdf2image import convert_from_path | |
import pytesseract, pdfplumber, camelot | |
from PIL import Image, ImageOps | |
import ocrmypdf | |
import subprocess | |
def _format_table_rows(table):
    """Render a table (a list of rows of cells) as comma-separated lines.

    Empty/None cells become empty strings so that ``str.join`` never sees a
    non-string value.
    """
    return "\n".join(
        ", ".join(str(cell) if cell else "" for cell in row)
        for row in table
    )


def _ocr_page_images(pdf_path):
    """Rasterize each page at 300 dpi and OCR it with Tesseract.

    Returns a list of ``(page_number, text)`` tuples, 1-based.
    """
    results = []
    for page_number, image in enumerate(convert_from_path(pdf_path, dpi=300), start=1):
        # Grayscale + invert is kept from the original pipeline.
        # NOTE(review): Tesseract 4+ generally expects dark text on a light
        # background; inverting ordinary scans may hurt accuracy -- confirm
        # that inversion is really intended for the documents handled here.
        prepared = ImageOps.invert(image.convert("L"))
        text = pytesseract.image_to_string(prepared, config="--psm 6")
        results.append((page_number, text))
    return results


def _plumber_sections(pdf_path, labelled):
    """Collect page text and tables from *pdf_path* using pdfplumber.

    When *labelled* is true, sections carry page/table numbers (primary
    path); otherwise the plain fallback format is produced.
    """
    sections = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text(layout=True)
            if text:
                if labelled:
                    sections.append(f"--- Page {page_number} ---\n{text}")
                else:
                    sections.append(text)
            for table_number, table in enumerate(page.extract_tables(), start=1):
                if not table:
                    continue
                if labelled:
                    header = f"TABLE {table_number} (Page {page_number}):\n"
                else:
                    header = "TABLE:\n"
                sections.append(header + _format_table_rows(table))
    return sections


def extract_text_from_pdf(file):
    """Extract text and tables from an uploaded PDF, OCR'ing it when needed.

    Pipeline:
      1. OCRmyPDF produces a searchable copy of the PDF (pages that already
         contain text are left untouched).
      2. pdfplumber extracts page text and tables from that copy.
      3. Camelot (lattice flavor) is tried for additional tables.
      4. If fewer than 50 characters were found, pages are rasterized and
         OCR'd directly with Tesseract.
    If anything in that pipeline raises, a simpler fallback runs against the
    original PDF (pdfplumber, then direct OCR if under 100 characters).

    Args:
        file: The uploaded file as handed over by Gradio -- either an object
            with a ``.name`` path attribute or a plain path string. ``None``
            means nothing was uploaded.

    Returns:
        A 3-tuple ``(text, txt_path, pdf_path)``: the extracted text, the path
        of a UTF-8 ``.txt`` file containing it, and the path of the searchable
        PDF (or of the original PDF when OCRmyPDF failed). When no file was
        provided the two paths are ``None``.
    """
    if file is None:
        return "No PDF file was provided.", None, None

    # Accept both file-like uploads (with .name) and plain path strings.
    pdf_path = getattr(file, "name", file)

    # Each request gets its own directory: fixed names in the shared temp dir
    # would let concurrent requests overwrite each other's outputs. The
    # directory is intentionally left in place because the returned paths
    # must still exist after this function returns.
    work_dir = tempfile.mkdtemp(prefix="pdf_ocr_")
    ocr_pdf_path = os.path.join(work_dir, "ocr_searchable.pdf")
    output_txt_path = os.path.join(work_dir, "extracted_text.txt")

    try:
        # Step 1: Use OCRmyPDF to create a searchable PDF
        print("Processing PDF with OCRmyPDF...")
        ocrmypdf.ocr(
            pdf_path,
            ocr_pdf_path,
            deskew=True,
            clean=True,
            force_ocr=False,
            # Only OCR pages that have no text yet; with both force_ocr and
            # skip_text off, OCRmyPDF refuses PDFs that already contain text.
            skip_text=True,
            optimize=1,
        )

        # Step 2: Extract text and tables from the searchable PDF
        print("Extracting text from OCR'd PDF...")
        extracted = _plumber_sections(ocr_pdf_path, labelled=True)

        # Step 3: Try Camelot for additional table extraction (best effort)
        try:
            camelot_tables = camelot.read_pdf(ocr_pdf_path, pages="all", flavor="lattice")
            for index, table in enumerate(camelot_tables, start=1):
                extracted.append(f"CAMELOT TABLE {index}:\n{table.df.to_csv(index=False)}")
        except Exception as e:
            print(f"Camelot extraction failed: {e}")

        combined_text = "\n\n".join(extracted).strip()

        # Step 4: If there is still (almost) no text, OCR the page images
        if len(combined_text) < 50:
            print("Fallback to direct OCR...")
            ocr_sections = [
                f"--- Page {page_number} (Direct OCR) ---\n{page_text}"
                for page_number, page_text in _ocr_page_images(pdf_path)
                if page_text.strip()
            ]
            # Keep the little text already found if OCR yields nothing.
            if ocr_sections:
                combined_text = "\n\n".join(ocr_sections)

        with open(output_txt_path, "w", encoding="utf-8") as f:
            f.write(combined_text)
        return combined_text, output_txt_path, ocr_pdf_path

    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}\n\nFalling back to original extraction methods..."
        print(error_msg)

        # Start from a fresh list so partial results of the failed primary
        # path are not duplicated in the fallback output.
        fallback_sections = []
        try:
            fallback_sections = _plumber_sections(pdf_path, labelled=False)
        except Exception as e2:
            print("pdfplumber error:", e2)

        combined = "\n".join(fallback_sections).strip()

        # OCR fallback if text is too short
        if len(combined) < 100:
            try:
                combined += "".join(
                    page_text + "\n" for _, page_text in _ocr_page_images(pdf_path)
                )
            except Exception as e3:
                # Still return whatever text was collected instead of failing.
                print("Direct OCR error:", e3)

        with open(output_txt_path, "w", encoding="utf-8") as f:
            f.write(combined)
        # Return the original PDF since no searchable copy could be produced.
        return combined, output_txt_path, pdf_path
# Build the Gradio UI: one PDF upload in; extracted text, a .txt download and
# the searchable PDF download out (matching the 3-tuple returned by
# extract_text_from_pdf).
# The emoji in the labels were restored from mis-decoded UTF-8 bytes; the
# upload/download icons are unambiguous, the textbox icon is a best guess.
app = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(label="📤 Upload PDF", file_types=[".pdf"]),
    outputs=[
        gr.Textbox(label="📄 Extracted Text", lines=25, show_copy_button=True),
        gr.File(label="📥 Download Extracted Text (.txt)"),
        gr.File(label="📥 Download OCR'd Searchable PDF"),
    ],
    title="Advanced PDF OCR Extractor with OCRmyPDF",
    description="Upload a PDF to get: 1) Extracted text displayed and downloadable as .txt, 2) OCR'd searchable PDF download. Uses OCRmyPDF for superior OCR quality.",
    allow_flagging="never",
)

# Launch the web server only when run as a script, not on import.
if __name__ == "__main__":
    app.launch()