PDF-Editor

Running

File size: 1,673 Bytes

9a256f9
 
 
 
576dfa7
 
9a256f9
d40eb05
576dfa7
9a256f9
 
d40eb05
 
 
 
 
 
 
 
 
 
 
576dfa7
 
 
9a256f9
576dfa7
ecb38ba
 
 
576dfa7
 
 
 
 
 
 
 
9a256f9
 
 
 
 
 
 
 
 
 
 
576dfa7

import streamlit as st
from PyPDF2 import PdfReader
from docx import Document
from io import BytesIO
from pdf2image import convert_from_bytes
import pytesseract

# Control characters that XML 1.0 forbids (everything below 0x20 except tab,
# newline and carriage return), mapped to None so str.translate() drops them.
_XML_ILLEGAL_CONTROLS = dict.fromkeys(
    code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
)


def _xml_safe(text):
    """Return *text* without the control characters a .docx (XML) body cannot hold."""
    return text.translate(_XML_ILLEGAL_CONTROLS)


def pdf_to_word(pdf_file, password=None):
    """Convert a PDF file to a Word file with optional decryption and OCR.

    Each page's text layer is copied into its own paragraph. A page without
    extractable text is rendered to an image and run through OCR instead; if
    OCR finds nothing either, a placeholder paragraph is written for it.

    Args:
        pdf_file: Seekable binary file-like object holding the PDF.
        password: Password used when the PDF is encrypted. Ignored for
            unencrypted PDFs.

    Returns:
        A BytesIO positioned at offset 0 containing the generated .docx file.

    Raises:
        ValueError: If the PDF is encrypted and no password was given, or the
            given password does not decrypt it.
    """
    reader = PdfReader(pdf_file)

    # Decrypt the PDF if it's encrypted
    if reader.is_encrypted:
        if not password:
            raise ValueError("The PDF is encrypted. Please provide a password.")
        try:
            decrypt_result = reader.decrypt(password)
        except Exception as e:
            raise ValueError("Failed to decrypt the PDF. Check the password.") from e
        # decrypt() reports a wrong password through a falsy return value
        # (0 / NOT_DECRYPTED) instead of raising, so check it explicitly.
        if not decrypt_result:
            raise ValueError("Failed to decrypt the PDF. Check the password.")

    document = Document()

    # PdfReader has already moved the stream position while parsing, so rewind
    # before grabbing the raw bytes that the OCR fallback needs.
    pdf_file.seek(0)
    pdf_bytes = pdf_file.read()
    # The rasteriser opens the PDF on its own and needs the password as well.
    ocr_password = password if reader.is_encrypted else None

    # Extract text from each page
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()  # Use PyPDF2 for text extraction
        if text:
            document.add_paragraph(_xml_safe(text))
            continue

        # Use OCR for non-extractable pages. Render only the current page:
        # converting the whole PDF here would append every page's OCR text
        # once per unreadable page.
        images = convert_from_bytes(
            pdf_bytes,
            first_page=page_number,
            last_page=page_number,
            userpw=ocr_password,
        )
        for image in images:
            # OCR output may carry control characters (such as a form feed)
            # that cannot be stored in the document, so strip them first.
            ocr_text = _xml_safe(pytesseract.image_to_string(image))
            if ocr_text.strip():
                document.add_paragraph(ocr_text)
            else:
                document.add_paragraph("[This page contains non-extractable content or images]")

    word_file = BytesIO()
    document.save(word_file)
    word_file.seek(0)
    return word_file

# Configure the page: title, icon, and layout.
st.set_page_config(
    layout="centered",
    page_icon="🖋",
    page_title="PDF to Word Converter",
)

# Main heading of the app.
st.title("PDF to Word Converter")
st.write("Upload a PDF file,