Spaces:
Running
Running
import gradio as gr | |
import fitz # PyMuPDF | |
from PIL import Image | |
import pytesseract | |
import io | |
import tempfile | |
import os | |
def _read_pdf_bytes(pdf_file):
    """Return the raw bytes of an uploaded PDF, whatever form the upload takes.

    Supported inputs, checked in this order:
      * ``bytes`` / ``bytearray`` -- returned as ``bytes``.
      * ``str`` / ``os.PathLike`` -- treated as a path and read from disk.
      * an object with a readable ``.file`` attribute (the access pattern the
        original code relied on).
      * an object with a ``.name`` attribute -- treated as a path on disk.
      * any object with a ``.read()`` method.

    Raises:
        TypeError: If the object matches none of the forms above.
        OSError: If a path is given but cannot be opened.
    """
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)

    if isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as handle:
            return handle.read()

    inner = getattr(pdf_file, "file", None)
    if inner is not None and hasattr(inner, "read"):
        return inner.read()

    path = getattr(pdf_file, "name", None)
    if isinstance(path, (str, os.PathLike)):
        with open(path, "rb") as handle:
            return handle.read()

    if hasattr(pdf_file, "read"):
        return pdf_file.read()

    raise TypeError(f"Unsupported upload object: {type(pdf_file).__name__}")


def pdf_to_text_ocr(pdf_file):
    """Extract text from an uploaded PDF with OCR and write it to a .txt file.

    Each page is rendered to a 300-DPI PNG, passed through Tesseract, and the
    per-page results are concatenated under ``--- Page N ---`` headers. The
    combined text is also written to a temporary UTF-8 ``.txt`` file so the
    UI can offer it for download.

    Args:
        pdf_file: The uploaded PDF. May be ``None`` (nothing uploaded), a
            filesystem path, raw bytes, or a file-like/tempfile wrapper
            object; see ``_read_pdf_bytes`` for the accepted forms.

    Returns:
        tuple[str, str | None]: ``(extracted_text, txt_path)`` on success, or
        ``(message, None)`` when nothing was uploaded, when OCR found no text
        on any page, or when an error occurred.
    """
    if pdf_file is None:
        return "Please upload a PDF file.", None

    try:
        pdf_document = fitz.open(stream=_read_pdf_bytes(pdf_file), filetype="pdf")
        page_texts = []
        try:
            for page_num in range(len(pdf_document)):
                page = pdf_document.load_page(page_num)
                # Render at 300 DPI: higher resolution improves OCR quality.
                pix = page.get_pixmap(dpi=300)
                image = Image.open(io.BytesIO(pix.tobytes("png")))
                page_texts.append(pytesseract.image_to_string(image))
        finally:
            # Close the document even if rendering or OCR fails part-way.
            pdf_document.close()

        # Check the OCR output itself: the page headers added below are never
        # empty, so testing the combined string would always look non-empty.
        if not any(text.strip() for text in page_texts):
            return "No text could be extracted from the PDF.", None

        extracted_text = "".join(
            f"--- Page {number} ---\n{text}\n\n"
            for number, text in enumerate(page_texts, start=1)
        )

        # delete=False keeps the file on disk after the handle is closed so
        # the returned path is still valid for the download component.
        with tempfile.NamedTemporaryFile(
            delete=False, mode="w", suffix=".txt", encoding="utf-8"
        ) as temp_file:
            temp_file.write(extracted_text)
            temp_filepath = temp_file.name

        return extracted_text, temp_filepath
    except Exception as e:
        # UI boundary: surface the failure in the textbox instead of crashing.
        return f"An error occurred: {str(e)}", None
# Define the Gradio interface: one PDF upload in, and two outputs -- the
# extracted text shown in a textbox plus a downloadable .txt file.
# The bundled example is optional, so it is only registered when
# "sample.pdf" is actually present; Gradio may otherwise fail at startup
# while preparing an example that points at a missing file.
iface = gr.Interface(
    fn=pdf_to_text_ocr,
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs=[
        gr.Textbox(
            label="Extracted Text (Scrollable)",
            lines=20,
            placeholder="Text from your PDF will appear here...",
        ),
        gr.File(label="Download Extracted Text"),
    ],
    title="PDF OCR Extractor with Download",
    description="Upload a PDF to extract its text. The text will be displayed below and a download link for a .txt file will be provided.",
    article="Powered by PyMuPDF, Tesseract, and Gradio.",
    examples=[["sample.pdf"]] if os.path.exists("sample.pdf") else None,
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()