def _xml_safe_text(text):
    """Return *text* without characters that cannot be stored in a .docx.

    python-docx writes XML through lxml, which raises ``ValueError`` for
    NULL bytes and other control characters. Tesseract output typically
    ends with a form feed (``\\x0c``), so OCR text must be cleaned before it
    is added to the document.
    """
    return "".join(
        ch
        for ch in text
        if ch in "\t\n\r"
        or (
            ord(ch) >= 0x20
            and not 0xD800 <= ord(ch) <= 0xDFFF
            and ord(ch) not in (0xFFFE, 0xFFFF)
        )
    )


def pdf_to_word(pdf_file, password=None):
    """Convert a PDF file to a Word file with optional decryption and OCR.

    Each page becomes one paragraph. Pages with an extractable text layer
    are read with PyPDF2; pages without one are rendered to an image and
    passed through Tesseract OCR.

    Args:
        pdf_file: Binary file-like object holding the PDF (must support
            ``read()``; it is rewound first when it supports ``seek()``).
        password: Password for an encrypted PDF, or ``None``.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the ``.docx`` data.

    Raises:
        ValueError: If the PDF is encrypted and no password was given, or
            if the given password does not decrypt it.
    """
    # Grab the raw bytes *before* PdfReader touches the stream: the reader
    # seeks around while parsing, so a later read() would return only
    # whatever happens to follow the final stream position.
    if hasattr(pdf_file, "seek"):
        pdf_file.seek(0)
    pdf_bytes = pdf_file.read()
    reader = PdfReader(BytesIO(pdf_bytes))

    # Extra keyword arguments so the OCR renderer can open encrypted PDFs.
    render_auth = {}

    # Decrypt the PDF if it's encrypted
    if reader.is_encrypted:
        if not password:
            raise ValueError("The PDF is encrypted. Please provide a password.")
        try:
            decrypt_result = reader.decrypt(password)
        except Exception as e:
            raise ValueError("Failed to decrypt the PDF. Check the password.") from e
        # decrypt() reports a wrong password through a falsy return value
        # (0 / NOT_DECRYPTED) rather than by raising.
        if not decrypt_result:
            raise ValueError("Failed to decrypt the PDF. Check the password.")
        # A result of 2 means the owner password matched; otherwise it was
        # the user password.
        auth_key = "ownerpw" if int(decrypt_result) == 2 else "userpw"
        render_auth[auth_key] = password

    document = Document()

    # Extract text from each page (pdf2image page numbers are 1-based)
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if text and text.strip():
            # Use the embedded text layer when the page has one
            document.add_paragraph(_xml_safe_text(text))
            continue

        # Use OCR for non-extractable pages, rendering only this page
        # instead of the whole document.
        images = convert_from_bytes(
            pdf_bytes,
            first_page=page_number,
            last_page=page_number,
            **render_auth,
        )
        for image in images:
            ocr_text = _xml_safe_text(pytesseract.image_to_string(image))
            if ocr_text.strip():
                document.add_paragraph(ocr_text)
            else:
                document.add_paragraph(
                    "[This page contains non-extractable content or images]"
                )

    word_file = BytesIO()
    document.save(word_file)
    word_file.seek(0)
    return word_file