Spaces:

drewThomasson
/

PDF-to-TXT-OCR

Running

App Files Files Community

drewThomasson committed 20 days ago

Commit

a4ef596

verified ·

1 Parent(s): b2c785b

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -27

app.py CHANGED Viewed

@@ -1,40 +1,82 @@
 import gradio as gr
 import pytesseract
-from pdf2image import convert_from_path
 import tempfile
 import os
-import shutil
-def ocr_pdf(file_path):
-    # Temporary directory for processing
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Convert PDF to images
-        images = convert_from_path(file_path, output_folder=temp_dir)
-        # Extract text from each page image
         extracted_text = ""
-        for i, image in enumerate(images):
             text = pytesseract.image_to_string(image)
-            extracted_text += f"\n{text}\n\n"
-        # Save the extracted text to a .txt file in a persistent location
-        output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
-        with open(output_txt_path, "w") as f:
-            f.write(extracted_text)
-        # Create a persistent file to serve for download
-        final_output_path = "/tmp/extracted_text.txt"
-        shutil.copy(output_txt_path, final_output_path)  # Copy to a persistent location
-        return final_output_path
-# Gradio Interface
 iface = gr.Interface(
-    fn=lambda file: ocr_pdf(file.name),  # Pass file path instead of file object
-    inputs=gr.File(label="Upload PDF File"),
-    outputs=gr.File(label="Download Extracted Text (.txt)"),  # Outputs a downloadable .txt file
-    title="PDF to Text OCR"
 )
 if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
+import fitz  # PyMuPDF
+from PIL import Image
 import pytesseract
+import io
 import tempfile
 import os
def _open_pdf(pdf_file):
    """Open the uploaded PDF with PyMuPDF, whichever form the upload arrives in.

    Supported inputs: raw bytes, a filesystem path (``str`` / ``os.PathLike``),
    an object exposing an open stream as ``.file``, or an object exposing a
    path as ``.name``.
    """
    if isinstance(pdf_file, (bytes, bytearray)):
        return fitz.open(stream=bytes(pdf_file), filetype="pdf")
    if isinstance(pdf_file, (str, os.PathLike)):
        return fitz.open(os.fspath(pdf_file))
    stream = getattr(pdf_file, "file", None)
    if stream is not None:
        return fitz.open(stream=stream.read(), filetype="pdf")
    return fitz.open(pdf_file.name)


def pdf_to_text_ocr(pdf_file):
    """
    Extract text from a PDF using OCR and write it to a downloadable .txt file.

    Each page is rendered to an image, passed through Tesseract, and the
    per-page results are concatenated under ``--- Page N ---`` headers.

    Args:
        pdf_file: The uploaded PDF. May be ``None`` (nothing uploaded), a file
            path, raw bytes, or a file-like wrapper exposing ``.file`` or
            ``.name``.

    Returns:
        tuple: ``(extracted_text, txt_path)`` on success. On failure, or when
        OCR finds no text on any page, ``(message, None)`` where ``message``
        explains what happened.
    """
    if pdf_file is None:
        return "Please upload a PDF file.", None

    try:
        page_texts = []
        # The context manager closes the document even if rendering or OCR
        # raises part-way through.
        with _open_pdf(pdf_file) as pdf_document:
            for page in pdf_document:
                # Render at 300 DPI: higher resolution gives better OCR quality.
                pix = page.get_pixmap(dpi=300)
                image = Image.open(io.BytesIO(pix.tobytes("png")))
                page_texts.append(pytesseract.image_to_string(image))

        # Test the raw OCR output, not the formatted result: the page headers
        # are never blank, so checking the final string would always pass.
        if not any(text.strip() for text in page_texts):
            return "No text could be extracted from the PDF.", None

        extracted_text = "".join(
            f"--- Page {page_num} ---\n{text}\n\n"
            for page_num, text in enumerate(page_texts, start=1)
        )

        # delete=False keeps the file on disk after the handle is closed so it
        # can still be served for download.
        with tempfile.NamedTemporaryFile(
            delete=False, mode="w", suffix=".txt", encoding="utf-8"
        ) as temp_file:
            temp_file.write(extracted_text)
            temp_filepath = temp_file.name

        return extracted_text, temp_filepath
    except Exception as e:
        # UI boundary: report the failure in the textbox instead of raising.
        return f"An error occurred: {str(e)}", None
# Build the web UI: a single PDF upload feeds pdf_to_text_ocr, whose two return
# values populate a textbox and a file-download component, in that order.
iface = gr.Interface(
    title="PDF OCR Extractor with Download",
    description="Upload a PDF to extract its text. The text will be displayed below and a download link for a .txt file will be provided.",
    article="Powered by PyMuPDF, Tesseract, and Gradio.",
    fn=pdf_to_text_ocr,
    inputs=gr.File(
        label="Upload PDF",
        file_types=[".pdf"],
    ),
    outputs=[
        gr.Textbox(
            label="Extracted Text (Scrollable)",
            lines=20,
            placeholder="Text from your PDF will appear here...",
        ),
        gr.File(label="Download Extracted Text"),
    ],
    # NOTE(review): "sample.pdf" is a relative path; confirm the file is
    # actually shipped next to the app.
    examples=[["sample.pdf"]],
)

# Start the server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()