Spaces:

drewThomasson
/

PDF-to-TXT-OCR

Running

App Files Files Community

PDF-to-TXT-OCR / app.py

drewThomasson

Update app.py

a4ef596 verified 25 days ago

raw

history blame

3.22 kB

	import gradio as gr
	import fitz # PyMuPDF
	from PIL import Image
	import pytesseract
	import io
	import tempfile
	import os

	def _open_uploaded_pdf(pdf_file):
	    """Open an uploaded PDF as a PyMuPDF document, whatever form it arrives in.

	    Accepts, in order of preference: raw ``bytes``; a filesystem path
	    (``str`` or ``os.PathLike``); an object whose ``.name`` is the path of an
	    existing file; and finally an object exposing a readable ``.file``
	    attribute (or that is itself readable).
	    """
	    if isinstance(pdf_file, (bytes, bytearray)):
	        return fitz.open(stream=bytes(pdf_file), filetype="pdf")
	    if isinstance(pdf_file, (str, os.PathLike)):
	        return fitz.open(os.fspath(pdf_file), filetype="pdf")
	    path = getattr(pdf_file, "name", None)
	    if isinstance(path, str) and os.path.isfile(path):
	        # Opening by path does not depend on the handle's current read position.
	        return fitz.open(path, filetype="pdf")
	    stream = getattr(pdf_file, "file", pdf_file)
	    return fitz.open(stream=stream.read(), filetype="pdf")


	def pdf_to_text_ocr(pdf_file):
	    """Extract text from a PDF via OCR and write it to a downloadable .txt file.

	    Each page is rendered to a 300 DPI image, run through Tesseract, and the
	    per-page results are concatenated under ``--- Page N ---`` headers. The
	    combined text is also written to a temporary UTF-8 ``.txt`` file.

	    Args:
	        pdf_file: The uploaded PDF. May be a path string, raw bytes, or a
	            file-like upload object (see ``_open_uploaded_pdf``). ``None``
	            means nothing was uploaded.

	    Returns:
	        tuple[str, str | None]: ``(extracted_text, txt_filepath)`` on success.
	        ``(message, None)`` when no file was given, when OCR found no text on
	        any page, or when any error occurred (the message then starts with
	        ``"An error occurred: "``).
	    """
	    if pdf_file is None:
	        return "Please upload a PDF file.", None

	    try:
	        pdf_document = _open_uploaded_pdf(pdf_file)
	        try:
	            page_texts = []
	            for page in pdf_document:
	                # Render at a higher DPI for better OCR quality.
	                pix = page.get_pixmap(dpi=300)
	                # Round-trip through PNG bytes to obtain a PIL image.
	                with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
	                    page_texts.append(pytesseract.image_to_string(image))
	        finally:
	            # Release the document even if rendering or OCR fails mid-way.
	            pdf_document.close()

	        # Check the OCR output itself: the page headers added below are never
	        # blank, so testing the assembled string would always pass.
	        if not any(text.strip() for text in page_texts):
	            return "No text could be extracted from the PDF.", None

	        extracted_text = "".join(
	            f"--- Page {page_num} ---\n{text}\n\n"
	            for page_num, text in enumerate(page_texts, start=1)
	        )

	        # delete=False keeps the file on disk after the handle closes so that
	        # Gradio can serve it for download.
	        with tempfile.NamedTemporaryFile(
	            delete=False, mode="w", suffix=".txt", encoding="utf-8"
	        ) as temp_file:
	            temp_file.write(extracted_text)
	            temp_filepath = temp_file.name

	        return extracted_text, temp_filepath

	    except Exception as e:
	        # UI boundary: surface the failure in the textbox instead of raising.
	        return f"An error occurred: {e}", None

	# Build the UI pieces first, then wire them into a single Interface:
	# one PDF upload as input, a text preview plus a downloadable file as outputs.
	_pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
	_text_preview = gr.Textbox(
	    label="Extracted Text (Scrollable)",
	    lines=20,
	    placeholder="Text from your PDF will appear here...",
	)
	_text_download = gr.File(label="Download Extracted Text")

	iface = gr.Interface(
	    fn=pdf_to_text_ocr,
	    inputs=_pdf_upload,
	    outputs=[_text_preview, _text_download],
	    title="PDF OCR Extractor with Download",
	    description="Upload a PDF to extract its text. The text will be displayed below and a download link for a .txt file will be provided.",
	    article="Powered by PyMuPDF, Tesseract, and Gradio.",
	    # A sample PDF can be placed in the same directory as this script.
	    # NOTE(review): presumably sample.pdf must exist there - confirm.
	    examples=[["sample.pdf"]],
	)

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()