PDF-Editor

Sleeping

App Files Community

AzizWazir committed on Dec 12, 2024

Commit

576dfa7

verified ·

1 Parent(s): 972cb11

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -24

app.py CHANGED Viewed

@@ -2,9 +2,11 @@ import streamlit as st
 from PyPDF2 import PdfReader
 from docx import Document
 from io import BytesIO
 def pdf_to_word(pdf_file, password=None):
-    """Convert a PDF file to a Word file with optional decryption."""
     reader = PdfReader(pdf_file)
     # Decrypt the PDF if it's encrypted
@@ -18,12 +20,22 @@ def pdf_to_word(pdf_file, password=None):
             raise ValueError("The PDF is encrypted. Please provide a password.")
     document = Document()
     for page in reader.pages:
-        if page.extract_text():  # Ensure text is extracted
             text = page.extract_text()
             document.add_paragraph(text)
         else:
-            document.add_paragraph("[This page contains non-extractable content or images]")
     word_file = BytesIO()
     document.save(word_file)
@@ -35,24 +47,4 @@ st.set_page_config(page_title="PDF to Word Converter", page_icon="🖋", layout=
 # App header
 st.title("PDF to Word Converter")
-st.write("Upload a PDF file, and we will convert it into a Word document for you.")
-# File uploader
-uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
-password = st.text_input("Enter password (if the PDF is encrypted):", type="password")
-if uploaded_file is not None:
-    with st.spinner("Converting PDF to Word..."):
-        try:
-            word_file = pdf_to_word(uploaded_file, password)
-            st.success("Conversion successful!")
-            st.download_button(
-                label="Download Word file",
-                data=word_file,
-                file_name="converted.docx",
-                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-            )
-        except ValueError as ve:
-            st.error(str(ve))
-        except Exception as e:
-            st.error(f"An error occurred: {str(e)}")

 from PyPDF2 import PdfReader
 from docx import Document
 from io import BytesIO
+from pdf2image import convert_from_bytes
+import pytesseract
 def pdf_to_word(pdf_file, password=None):
+    """Convert a PDF file to a Word file with optional decryption and OCR."""
     reader = PdfReader(pdf_file)
     # Decrypt the PDF if it's encrypted
             raise ValueError("The PDF is encrypted. Please provide a password.")
     document = Document()
+    # Extract text from each page
+    pdf_bytes = pdf_file.read()
     for page in reader.pages:
+        if page.extract_text():  # Use PyPDF2 for text extraction
             text = page.extract_text()
             document.add_paragraph(text)
         else:
+            # Use OCR for non-extractable pages
+            images = convert_from_bytes(pdf_bytes)
+            for image in images:
+                ocr_text = pytesseract.image_to_string(image)
+                if ocr_text.strip():
+                    document.add_paragraph(ocr_text)
+                else:
+                    document.add_paragraph("[This page contains non-extractable content or images]")
     word_file = BytesIO()
     document.save(word_file)
 # App header
 st.title("PDF to Word Converter")
+st.write("Upload a PDF file,