PDF-Editor

Build error

AzizWazir committed on Dec 12, 2024

Commit

4f0d3b9

verified ·

1 Parent(s): d40eb05

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,9 +2,11 @@ import streamlit as st
 from PyPDF2 import PdfReader
 from docx import Document
 from io import BytesIO
 def pdf_to_word(pdf_file, password=None):
-    """Convert a PDF file to a Word file with optional decryption."""
     reader = PdfReader(pdf_file)
     # Decrypt the PDF if it's encrypted
@@ -18,12 +20,19 @@ def pdf_to_word(pdf_file, password=None):
             raise ValueError("The PDF is encrypted. Please provide a password.")
     document = Document()
     for page in reader.pages:
-        if page.extract_text():  # Ensure text is extracted
             text = page.extract_text()
             document.add_paragraph(text)
         else:
-            document.add_paragraph("[This page contains non-extractable content or images]")
     word_file = BytesIO()
     document.save(word_file)
@@ -53,5 +62,4 @@ if uploaded_file is not None:
                 mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
             )
         except ValueError as ve:
-            st.error(str(ve))
-        ex

 from PyPDF2 import PdfReader
 from docx import Document
 from io import BytesIO
+from pdf2image import convert_from_bytes
+import pytesseract
 def pdf_to_word(pdf_file, password=None):
+    """Convert a PDF file to a Word file with optional decryption and OCR support."""
     reader = PdfReader(pdf_file)
     # Decrypt the PDF if it's encrypted
             raise ValueError("The PDF is encrypted. Please provide a password.")
     document = Document()
+    # Extract text from each page
     for page in reader.pages:
+        if page.extract_text():  # Use PyPDF2 for text extraction
             text = page.extract_text()
             document.add_paragraph(text)
         else:
+            # Convert the page to an image and use OCR
+            pdf_bytes = pdf_file.read()
+            images = convert_from_bytes(pdf_bytes)
+            for image in images:
+                text = pytesseract.image_to_string(image)
+                document.add_paragraph(text)
     word_file = BytesIO()
     document.save(word_file)
                 mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
             )
         except ValueError as ve:
+            st.error(str(ve