PDF-Editor

Running

File size: 2,291 Bytes

9a256f9
 
 
 
576dfa7
 
9a256f9
d40eb05
576dfa7
b695c33
9a256f9
 
d40eb05
 
 
 
 
 
 
 
 
 
 
576dfa7
 
 
9a256f9
576dfa7
ecb38ba
 
 
576dfa7
 
 
 
 
 
 
 
9a256f9
 
 
 
 
 
 
 
 
 
 
b695c33

import streamlit as st
from PyPDF2 import PdfReader
from docx import Document
from io import BytesIO
from pdf2image import convert_from_bytes
import pytesseract

# Control characters that XML 1.0 (and therefore a .docx) cannot store.
# Tab, line feed and carriage return are legal and are kept.
_XML_ILLEGAL_CONTROL_CHARS = dict.fromkeys(
    code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
)


def _xml_safe(text):
    """Return *text* without control characters that python-docx rejects."""
    return text.translate(_XML_ILLEGAL_CONTROL_CHARS)


def pdf_to_word(pdf_file, password=None):
    """Convert a PDF file to a Word file with optional decryption and OCR.

    Each page contributes its extracted text as one paragraph. A page with
    no extractable text (or only whitespace) is rendered to an image and
    run through Tesseract OCR instead.

    Args:
        pdf_file: Seekable binary file-like object holding the PDF.
        password: Password for an encrypted PDF; ignored when the PDF is
            not encrypted.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the .docx document.

    Raises:
        ValueError: If the PDF is encrypted and no password was given, or
            the password does not decrypt it.
    """
    # Read the raw bytes up front: PdfReader seeks around the stream, so a
    # later read() would start from an arbitrary position.
    pdf_file.seek(0)
    pdf_bytes = pdf_file.read()
    reader = PdfReader(BytesIO(pdf_bytes))

    # Password keyword forwarded to pdf2image so OCR can open the same PDF.
    ocr_password_kwargs = {}

    # Decrypt the PDF if it's encrypted
    if reader.is_encrypted:
        if not password:
            raise ValueError("The PDF is encrypted. Please provide a password.")
        try:
            decrypt_result = reader.decrypt(password)
        except Exception as e:
            raise ValueError("Failed to decrypt the PDF. Check the password.") from e
        # decrypt() signals a wrong password with a falsy result (0), not
        # an exception, so the result has to be checked explicitly.
        if not decrypt_result:
            raise ValueError("Failed to decrypt the PDF. Check the password.")
        # A result of 2 means the owner password matched; otherwise the
        # user password did.
        password_kind = "ownerpw" if decrypt_result == 2 else "userpw"
        ocr_password_kwargs[password_kind] = password

    document = Document()

    # Extract text from each page
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if text and text.strip():
            document.add_paragraph(_xml_safe(text))
            continue

        # OCR only this page; converting the whole PDF here would repeat
        # every page's text once per text-less page.
        images = convert_from_bytes(
            pdf_bytes,
            first_page=page_number,
            last_page=page_number,
            **ocr_password_kwargs,
        )
        for image in images:
            # Tesseract output commonly ends with a form feed, which
            # python-docx refuses to store.
            ocr_text = _xml_safe(pytesseract.image_to_string(image))
            if ocr_text.strip():
                document.add_paragraph(ocr_text)
            else:
                document.add_paragraph("[This page contains non-extractable content or images]")

    word_file = BytesIO()
    document.save(word_file)
    word_file.seek(0)
    return word_file

# Streamlit app configuration
st.set_page_config(page_title="PDF to Word Converter", page_icon="🖋", layout="centered")

# App header
st.title("PDF to Word Converter")
st.write("Upload a PDF file")

# Upload PDF file widget
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Optionally ask for a password if the PDF is encrypted
    password = st.text_input("Enter PDF password (if encrypted)", type="password")

    try:
        # Convert the PDF to Word; only the conversion is guarded so that
        # its failures are reported to the user as a readable message.
        word_file = pdf_to_word(uploaded_file, password)
    except Exception as e:
        st.error(f"Error: {e}")
    else:
        # Provide a download button for the Word file. The explicit MIME
        # type lets the browser treat it as a Word document, not as
        # generic binary data.
        st.download_button(
            "Download Word file",
            word_file,
            file_name="converted.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )