Spaces:

drewThomasson
/

PDF-to-TXT-OCR

Running

App Files Files Community

drewThomasson committed 20 days ago

Commit

83d2c4f

verified ·

1 Parent(s): e19d42c

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -72

app.py CHANGED Viewed

@@ -1,82 +1,62 @@
 import gradio as gr
-import fitz  # PyMuPDF
-from PIL import Image
-import pytesseract
-import io
-import tempfile
-import os
-def pdf_to_text_ocr(pdf_file):
-    """
-    Extracts text from a PDF file using OCR, displays it, and provides a download link.
-    This function takes an uploaded PDF, converts each page to an image, uses
-    Tesseract OCR to extract text, and then returns both the concatenated text
-    for display and a path to a temporary .txt file for download.
-    Args:
-        pdf_file (gradio.File): The uploaded PDF file object from Gradio.
-    Returns:
-        tuple[str, str | None]: A tuple containing the extracted text and the
-                                 filepath for the downloadable text file.
-                                 Returns (error_message, None) on failure.
-    """
-    if pdf_file is None:
-        return "Please upload a PDF file.", None
     try:
-        # Open the PDF file from the uploaded file's temporary path
-        pdf_document = fitz.open(stream=pdf_file.file.read(), filetype="pdf")
-        extracted_text = ""
-        # Iterate through each page of the PDF
-        for page_num in range(len(pdf_document)):
-            page = pdf_document.load_page(page_num)
-            # Convert the page to an image (pixmap)
-            pix = page.get_pixmap(dpi=300) # Increase DPI for better OCR quality
-            # Convert the pixmap to a PIL Image
-            img_data = pix.tobytes("png")
-            image = Image.open(io.BytesIO(img_data))
-            # Use Tesseract to do OCR on the image
-            text = pytesseract.image_to_string(image)
-            extracted_text += f"--- Page {page_num + 1} ---\n{text}\n\n"
-        pdf_document.close()
-        if not extracted_text.strip():
-            return "No text could be extracted from the PDF.", None
-        # Create a temporary file to store the extracted text
-        # delete=False is important so Gradio can access the file
-        with tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".txt", encoding="utf-8") as temp_file:
-            temp_file.write(extracted_text)
-            temp_filepath = temp_file.name
-        # Return the text for the textbox and the filepath for the download button
-        return extracted_text, temp_filepath
     except Exception as e:
-        # Return the error message to the textbox and None for the file output
-        return f"An error occurred: {str(e)}", None
-# Define the Gradio interface with two output components
-iface = gr.Interface(
-    fn=pdf_to_text_ocr,
-    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
     outputs=[
-        gr.Textbox(label="Extracted Text (Scrollable)", lines=20, placeholder="Text from your PDF will appear here..."),
-        gr.File(label="Download Extracted Text")
     ],
-    title="PDF OCR Extractor with Download",
-    description="Upload a PDF to extract its text. The text will be displayed below and a download link for a .txt file will be provided.",
-    article="Powered by PyMuPDF, Tesseract, and Gradio.",
-    examples=[["sample.pdf"]] # You can add a sample pdf file in the same directory
 )
-# Launch the app
-if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
+import tempfile, os
+from pdf2image import convert_from_path
+import pytesseract, pdfplumber, camelot
+from PIL import Image, ImageOps
+# ✅ Must be named "file" for Gradio API to detect correctly
def _table_to_text(table):
    """Render one extracted table (rows of cells) as comma-separated lines."""
    # Cells may be None (e.g. empty cells); str.join() would raise TypeError
    # on them, so they are rendered as empty strings instead.
    return "\n".join(
        ", ".join("" if cell is None else str(cell) for cell in row)
        for row in table
    )


def extract_text_from_pdf(file):
    """Extract text and tables from an uploaded PDF, with an OCR fallback.

    Three stages run in order:

    1. pdfplumber: layout-preserving page text plus tables.
    2. Camelot (lattice flavor): tables, appended as CSV.
    3. OCR via pdf2image + Tesseract, only when stages 1-2 produced fewer
       than 100 characters in total.

    Stages 1 and 2 are best-effort: any exception is printed and the stage
    is skipped. An exception in the OCR stage propagates to the caller.

    Args:
        file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.

    Returns:
        A ``(text, path)`` tuple: the combined extracted text and the path of
        a ``extracted_text.txt`` file containing it. When ``file`` is ``None``
        the result is ``("Please upload a PDF file.", None)``.
    """
    if file is None:
        return "Please upload a PDF file.", None

    # Accept both an object with ``.name`` (what the code originally assumed)
    # and a bare path string.
    pdf_path = getattr(file, "name", file)
    extracted = []

    # 1. Extract using pdfplumber
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text(layout=True)
                if text:
                    extracted.append(text)
                for table in page.extract_tables():
                    extracted.append("TABLE:\n" + _table_to_text(table))
    except Exception as e:
        print("pdfplumber error:", e)

    # 2. Table extraction with Camelot
    try:
        tables = camelot.read_pdf(pdf_path, pages="all", flavor="lattice")
        for table in tables:
            extracted.append("CAMELOT TABLE:\n" + table.df.to_csv(index=False))
    except Exception as e:
        print("Camelot error:", e)

    # 3. OCR fallback if text is too short
    combined = "\n".join(extracted).strip()
    if len(combined) < 100:
        ocr_pages = []
        for img in convert_from_path(pdf_path, dpi=300):
            img = img.convert("L")
            # NOTE(review): inversion turns dark-on-light scans into
            # light-on-dark before OCR; confirm this is intended, since
            # Tesseract is usually fed dark text on a light background.
            img = ImageOps.invert(img)
            ocr_pages.append(pytesseract.image_to_string(img, config="--psm 6") + "\n")
        ocr_text = "".join(ocr_pages)
        # Keep OCR output on its own line instead of gluing it onto the last
        # line of whatever little text the earlier stages found.
        combined = f"{combined}\n{ocr_text}" if combined else ocr_text

    # Save output in a fresh per-call directory: a single fixed path in the
    # shared temp dir would let concurrent requests overwrite each other.
    output_path = os.path.join(tempfile.mkdtemp(), "extracted_text.txt")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(combined)
    return combined, output_path
# The UI is a plain gr.Interface (not gr.Blocks); no explicit api_name is
# passed here. Components are created in the same order as before:
# input file, text box, download file.
_pdf_input = gr.File(label="📤 Upload PDF", file_types=[".pdf"])
_text_output = gr.Textbox(label="📄 Extracted Text", lines=25, show_copy_button=True)
_download_output = gr.File(label="📥 Download .txt")

app = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=_pdf_input,
    outputs=[_text_output, _download_output],
    title="Advanced PDF Extractor",
    description="Extract text + tables + OCR from scanned/digital PDFs.",
    allow_flagging="never",
)

# Start the server as soon as the module is executed.
app.launch()