Spaces:

drewThomasson
/

PDF-to-TXT-OCR

Running

App Files Files Community

PDF-to-TXT-OCR / app.py

drewThomasson

Update app.py

2b19583 verified 20 days ago

raw

history blame contribute delete

5.01 kB

	import gradio as gr
	import tempfile, os
	from pdf2image import convert_from_path
	import pytesseract, pdfplumber, camelot
	from PIL import Image, ImageOps
	import ocrmypdf
	import subprocess

	def _format_table(rows):
	    """Render table rows as newline-separated lines of comma-separated cells.

	    Falsy cells (``None`` or ``""``) become empty strings, so joining never
	    fails on the ``None`` placeholders used for empty cells.
	    """
	    return "\n".join(
	        ", ".join(str(cell) if cell else "" for cell in row) for row in rows
	    )


	def _ocr_page_texts(pdf_path):
	    """Rasterise each page of *pdf_path* at 300 dpi and return one OCR string per page."""
	    page_texts = []
	    for image in convert_from_path(pdf_path, dpi=300):
	        # NOTE(review): the grayscale image is inverted before OCR, which turns
	        # ordinary dark-on-light scans into light-on-dark. Tesseract's quality
	        # guide recommends dark text on a light background - confirm that this
	        # inversion is intentional.
	        prepared = ImageOps.invert(image.convert("L"))
	        page_texts.append(pytesseract.image_to_string(prepared, config="--psm 6"))
	    return page_texts


	def extract_text_from_pdf(file):
	    """Extract text and tables from an uploaded PDF, OCR-ing it when needed.

	    Primary path: OCRmyPDF produces a searchable PDF, pdfplumber reads its
	    text and tables, Camelot adds lattice tables, and a direct Tesseract pass
	    is used when fewer than 50 characters were recovered. If any step of the
	    primary path raises, the function falls back to running pdfplumber (and,
	    for results under 100 characters, Tesseract) on the original file.

	    Args:
	        file: The upload handed over by the Gradio ``File`` input. Either an
	            object exposing the path as ``.name``, a plain path string, or
	            ``None`` when nothing was uploaded.

	    Returns:
	        A ``(text, txt_path, pdf_path)`` tuple: the extracted text, the path
	        of a UTF-8 ``.txt`` file containing it, and the path of the searchable
	        PDF (the original PDF when the primary path failed). Returns
	        ``("No PDF uploaded.", None, None)`` when *file* is ``None``.
	    """
	    if file is None:
	        return "No PDF uploaded.", None, None

	    # Accept both file-like uploads (path in ``.name``) and plain path strings.
	    pdf_path = getattr(file, "name", file)

	    # A private directory per call: fixed file names in the shared temp dir
	    # would let concurrent requests overwrite each other's outputs. The
	    # directory is not removed here because the returned paths must stay
	    # readable after this function returns.
	    work_dir = tempfile.mkdtemp(prefix="pdf_ocr_")
	    ocr_pdf_path = os.path.join(work_dir, "ocr_searchable.pdf")
	    output_txt_path = os.path.join(work_dir, "extracted_text.txt")

	    try:
	        # Step 1: Use OCRmyPDF to create a searchable PDF.
	        print("Processing PDF with OCRmyPDF...")
	        ocrmypdf.ocr(
	            pdf_path,
	            ocr_pdf_path,
	            deskew=True,
	            clean=True,
	            force_ocr=False,
	            # Only OCR pages that lack text. With skip_text=False OCRmyPDF
	            # aborts on PDFs that already carry a text layer, which sent every
	            # such file down the fallback path.
	            skip_text=True,
	            optimize=1,
	        )

	        # Step 2: Extract text and tables from the searchable PDF.
	        print("Extracting text from OCR'd PDF...")
	        extracted = []
	        with pdfplumber.open(ocr_pdf_path) as pdf:
	            for page_num, page in enumerate(pdf.pages, start=1):
	                text = page.extract_text(layout=True)
	                if text:
	                    extracted.append(f"--- Page {page_num} ---\n{text}")

	                for table_num, table in enumerate(page.extract_tables(), start=1):
	                    if table:
	                        extracted.append(
	                            f"TABLE {table_num} (Page {page_num}):\n"
	                            f"{_format_table(table)}"
	                        )

	        # Step 3: Camelot is best-effort; a failure must not discard the text
	        # already collected above.
	        try:
	            camelot_tables = camelot.read_pdf(
	                ocr_pdf_path, pages="all", flavor="lattice"
	            )
	            for index, table in enumerate(camelot_tables, start=1):
	                extracted.append(
	                    f"CAMELOT TABLE {index}:\n{table.df.to_csv(index=False)}"
	                )
	        except Exception as e:
	            print(f"Camelot extraction failed: {e}")

	        combined_text = "\n\n".join(extracted).strip()

	        # Almost nothing recovered: OCR the rasterised pages directly.
	        if len(combined_text) < 50:
	            print("Fallback to direct OCR...")
	            combined_text = "\n\n".join(
	                f"--- Page {page_num} (Direct OCR) ---\n{page_text}"
	                for page_num, page_text in enumerate(
	                    _ocr_page_texts(pdf_path), start=1
	                )
	                if page_text.strip()
	            )

	        with open(output_txt_path, "w", encoding="utf-8") as f:
	            f.write(combined_text)

	        return combined_text, output_txt_path, ocr_pdf_path

	    except Exception as e:
	        error_msg = f"Error processing PDF: {str(e)}\n\nFalling back to original extraction methods..."
	        print(error_msg)

	        # Start from an empty list: reusing the primary path's partial results
	        # would duplicate every page extracted before the failure.
	        fallback_parts = []
	        try:
	            with pdfplumber.open(pdf_path) as pdf:
	                for page in pdf.pages:
	                    text = page.extract_text(layout=True)
	                    if text:
	                        fallback_parts.append(text)
	                    for table in page.extract_tables():
	                        fallback_parts.append("TABLE:\n" + _format_table(table))
	        except Exception as e2:
	            print("pdfplumber error:", e2)

	        # OCR fallback if the recovered text is too short.
	        combined = "\n".join(fallback_parts).strip()
	        if len(combined) < 100:
	            combined += "".join(
	                f"{page_text}\n" for page_text in _ocr_page_texts(pdf_path)
	            )

	        with open(output_txt_path, "w", encoding="utf-8") as f:
	            f.write(combined)

	        # No usable searchable PDF on this path, so hand back the original.
	        return combined, output_txt_path, pdf_path

	# Gradio UI: one PDF upload in; extracted text, a .txt download and a PDF
	# download out, all produced by extract_text_from_pdf.
	app = gr.Interface(
	    title="Advanced PDF OCR Extractor with OCRmyPDF",
	    description=(
	        "Upload a PDF to get: 1) Extracted text displayed and downloadable as .txt, "
	        "2) OCR'd searchable PDF download. Uses OCRmyPDF for superior OCR quality."
	    ),
	    fn=extract_text_from_pdf,
	    inputs=gr.File(file_types=[".pdf"], label="📤 Upload PDF"),
	    # Order matches the (text, txt_path, pdf_path) tuple returned by fn.
	    outputs=[
	        gr.Textbox(lines=25, show_copy_button=True, label="📄 Extracted Text"),
	        gr.File(label="📥 Download Extracted Text (.txt)"),
	        gr.File(label="📥 Download OCR'd Searchable PDF"),
	    ],
	    allow_flagging="never",
	)

	# Launch the app only when this file is executed directly.
	if __name__ == "__main__":
	    app.launch()