Spaces:

drewThomasson
/

PDF-to-TXT-OCR

Running

drewThomasson committed on Oct 9, 2024

Commit

fda9a03

verified ·

1 Parent(s): 71f9e38

Create app.py

Files changed (1) hide show

app.py ADDED Viewed

+import gradio as gr
+import pytesseract
+from pdf2image import convert_from_path
+import tempfile
+import os
+def ocr_pdf(file_path):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Convert PDF to images
+        images = convert_from_path(file_path, output_folder=temp_dir)
+        # Extract text from each page image
+        extracted_text = ""
+        for i, image in enumerate(images):
+            text = pytesseract.image_to_string(image)
+            extracted_text += f"--- Page {i+1} ---\n{text}\n\n"
+        # Save the extracted text to a .txt file
+        output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
+        with open(output_txt_path, "w") as f:
+            f.write(extracted_text)
+        return output_txt_path
+# Gradio Interface
+iface = gr.Interface(
+    fn=lambda file: ocr_pdf(file.name),  # Pass file path instead of file object
+    inputs=gr.File(label="Upload PDF File"),
+    outputs=gr.File(label="Download Extracted Text (.txt)"),  # Outputs a downloadable .txt file
+    title="PDF to Text OCR"
+)
+if __name__ == "__main__":
+    iface.launch()