PDF-to-TXT-OCR / app.py
drewThomasson's picture
Update app.py
a4ef596 verified
raw
history blame
3.22 kB
import gradio as gr
import fitz # PyMuPDF
from PIL import Image
import pytesseract
import io
import tempfile
import os
def _read_pdf_bytes(pdf_file):
    """Return the raw bytes of an uploaded PDF, whichever form the upload takes.

    Accepts raw bytes, a filesystem path (``str`` / ``os.PathLike``), an object
    whose ``.name`` is the path of an existing file, or a file-like object
    exposing ``.file.read()`` / ``.read()``.

    Raises:
        TypeError: If the upload is none of the supported forms.
    """
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    if isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as handle:
            return handle.read()

    # Prefer the on-disk copy when the wrapper names one: reading the path is
    # independent of the wrapper's current file position.
    path = getattr(pdf_file, "name", None)
    if isinstance(path, str) and os.path.isfile(path):
        with open(path, "rb") as handle:
            return handle.read()

    # Fall back to the stream-style access the original implementation used.
    inner = getattr(pdf_file, "file", None)
    if inner is not None and hasattr(inner, "read"):
        return inner.read()
    if hasattr(pdf_file, "read"):
        return pdf_file.read()

    raise TypeError(f"Unsupported upload type: {type(pdf_file).__name__}")


def pdf_to_text_ocr(pdf_file):
    """Extract text from an uploaded PDF with OCR and prepare a .txt download.

    Each page is rendered to a 300-DPI PNG with PyMuPDF, passed through
    Tesseract, and the per-page results are concatenated under
    ``--- Page N ---`` headers. The combined text is also written to a
    temporary ``.txt`` file whose path is returned for the download component.

    Args:
        pdf_file: The uploaded PDF. May be a filepath string, raw bytes, or a
            file-like wrapper (see ``_read_pdf_bytes``); ``None`` means that
            nothing was uploaded.

    Returns:
        tuple[str, str | None]: ``(extracted_text, txt_path)`` on success, or
        ``(message, None)`` when nothing was uploaded, OCR found no text on
        any page, or an error occurred.
    """
    if pdf_file is None:
        return "Please upload a PDF file.", None
    try:
        pdf_bytes = _read_pdf_bytes(pdf_file)

        page_chunks = []
        found_text = False
        # The context manager closes the document even if rendering/OCR fails.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
            for page_num, page in enumerate(pdf_document, start=1):
                # Render at 300 DPI: higher resolution improves OCR quality.
                pix = page.get_pixmap(dpi=300)
                with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
                    text = pytesseract.image_to_string(image)
                # Track real OCR output separately: the page headers alone
                # would otherwise make the result look non-empty.
                found_text = found_text or bool(text.strip())
                page_chunks.append(f"--- Page {page_num} ---\n{text}\n\n")

        if not found_text:
            return "No text could be extracted from the PDF.", None

        extracted_text = "".join(page_chunks)

        # delete=False keeps the file on disk after the handle is closed so
        # the path can still be served for download afterwards.
        with tempfile.NamedTemporaryFile(
            delete=False, mode="w", suffix=".txt", encoding="utf-8"
        ) as temp_file:
            temp_file.write(extracted_text)
            temp_filepath = temp_file.name

        return extracted_text, temp_filepath
    except Exception as e:
        # UI boundary: report the failure in the textbox instead of crashing.
        return f"An error occurred: {str(e)}", None
# Gradio interface: one PDF upload in; the extracted text and a downloadable
# .txt file out (matching the two values returned by pdf_to_text_ocr).
# The bundled example is optional, so it is only registered when the file is
# actually present -- NOTE(review): a missing example file can make Gradio fail
# while building the interface; confirm against the deployed Gradio version.
_sample_pdf = "sample.pdf"

iface = gr.Interface(
    fn=pdf_to_text_ocr,
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs=[
        gr.Textbox(
            label="Extracted Text (Scrollable)",
            lines=20,
            placeholder="Text from your PDF will appear here...",
        ),
        gr.File(label="Download Extracted Text"),
    ],
    title="PDF OCR Extractor with Download",
    description="Upload a PDF to extract its text. The text will be displayed below and a download link for a .txt file will be provided.",
    article="Powered by PyMuPDF, Tesseract, and Gradio.",
    examples=[[_sample_pdf]] if os.path.isfile(_sample_pdf) else None,
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()