File size: 3,218 Bytes
fda9a03
a4ef596
 
fda9a03
a4ef596
fda9a03
 
52020a7
a4ef596
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52020a7
a4ef596
 
 
 
 
 
 
 
 
 
 
 
 
52020a7
a4ef596
 
 
52020a7
a4ef596
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52020a7
a4ef596
fda9a03
a4ef596
 
 
 
 
 
 
 
 
 
fda9a03
 
a4ef596
fda9a03
a4ef596
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import gradio as gr
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import io
import tempfile
import os

def _read_pdf_bytes(pdf_file):
    """Return the raw bytes of an upload, whatever form it was delivered in."""
    # Raw bytes can be passed straight through.
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)

    # A plain filesystem path.
    if isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as handle:
            return handle.read()

    # A tempfile-like wrapper: prefer re-reading from its path, because the
    # read position of the wrapped handle is unknown at this point.
    path = getattr(pdf_file, "name", None)
    if isinstance(path, (str, os.PathLike)) and os.path.isfile(path):
        with open(path, "rb") as handle:
            return handle.read()

    # Fall back to the wrapped handle (``.file``) or the object itself.
    stream = getattr(pdf_file, "file", pdf_file)
    return stream.read()


def pdf_to_text_ocr(pdf_file):
    """
    Extract text from a PDF via OCR and write it to a downloadable .txt file.

    Every page is rendered to a 300 DPI PNG with PyMuPDF, passed through
    Tesseract, and the per-page results are concatenated under
    ``--- Page N ---`` headers. The combined text is also written to a
    temporary UTF-8 ``.txt`` file that is left on disk so the caller can
    serve it.

    Args:
        pdf_file: The uploaded PDF. Accepted forms are a filesystem path
            (``str`` / ``os.PathLike``), raw ``bytes``, or a file-like object
            exposing ``.name``, ``.file`` or ``.read()``. Which form Gradio
            passes depends on the installed version and the ``gr.File``
            ``type`` setting (NOTE(review): confirm for the deployed version).

    Returns:
        tuple[str, str | None]: ``(extracted_text, txt_path)`` on success.
        On failure, or when OCR finds no text on any page, the first element
        is a human-readable message and the second is ``None``.
    """
    if pdf_file is None:
        return "Please upload a PDF file.", None

    try:
        pdf_bytes = _read_pdf_bytes(pdf_file)

        page_texts = []
        # The context manager closes the document even if rendering/OCR fails.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
            for page in pdf_document:
                # Higher DPI than the default gives Tesseract more detail.
                pix = page.get_pixmap(dpi=300)
                with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
                    page_texts.append(pytesseract.image_to_string(image))

        # Check the OCR output itself: the page headers added below are never
        # blank, so testing the assembled string would always report success.
        if not any(text.strip() for text in page_texts):
            return "No text could be extracted from the PDF.", None

        extracted_text = "".join(
            f"--- Page {page_num} ---\n{text}\n\n"
            for page_num, text in enumerate(page_texts, start=1)
        )

        # delete=False keeps the file on disk after the handle is closed so
        # that Gradio can read it back for the download component.
        with tempfile.NamedTemporaryFile(
            delete=False, mode="w", suffix=".txt", encoding="utf-8"
        ) as temp_file:
            temp_file.write(extracted_text)
            temp_filepath = temp_file.name

        return extracted_text, temp_filepath

    except Exception as e:
        # UI boundary: surface any failure in the textbox instead of crashing
        # the request; no file is offered for download.
        return f"An error occurred: {e}", None

# Optional example shown under the upload widget. The relative path is looked
# up against the current working directory.
_SAMPLE_PDF = "sample.pdf"

# Define the Gradio interface with two output components: a textbox showing
# the OCR result and a file component offering the same text as a download.
iface = gr.Interface(
    fn=pdf_to_text_ocr,
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs=[
        gr.Textbox(label="Extracted Text (Scrollable)", lines=20, placeholder="Text from your PDF will appear here..."),
        gr.File(label="Download Extracted Text")
    ],
    title="PDF OCR Extractor with Download",
    description="Upload a PDF to extract its text. The text will be displayed below and a download link for a .txt file will be provided.",
    article="Powered by PyMuPDF, Tesseract, and Gradio.",
    # Only register the example when the file is actually present, so a
    # missing sample is never handed to Gradio.
    # NOTE(review): a missing example file is expected to fail when the
    # interface is built - confirm against the installed Gradio version.
    examples=[[_SAMPLE_PDF]] if os.path.isfile(_SAMPLE_PDF) else None,
)

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()