from io import BytesIO

import pytesseract
import streamlit as st
from docx import Document
from pdf2image import convert_from_bytes
from PyPDF2 import PdfReader


def _read_pdf_bytes(pdf_file):
    """Return the complete contents of *pdf_file* (a binary stream or a path) as bytes."""
    if hasattr(pdf_file, "read"):
        # The caller may already have consumed part of the stream; rewind so
        # the whole document is captured.
        if hasattr(pdf_file, "seek"):
            pdf_file.seek(0)
        return pdf_file.read()
    with open(pdf_file, "rb") as handle:
        return handle.read()


def pdf_to_word(pdf_file, password=None):
    """Convert a PDF file to a Word file with optional decryption and OCR support.

    Pages with an extractable text layer are copied as text. Pages without
    one are rendered to an image and run through Tesseract OCR, one page at
    a time.

    Args:
        pdf_file: Binary file-like object (or filesystem path) of the PDF.
        password: Password for an encrypted PDF; ``None`` or ``""`` if none.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding the generated ``.docx``.

    Raises:
        ValueError: If the PDF is encrypted and no password (or a wrong
            password) was supplied.
    """
    # Read the bytes once: both PyPDF2 and the OCR fallback need the full
    # document, and re-reading a stream that has already been consumed
    # yields empty or partial data.
    pdf_bytes = _read_pdf_bytes(pdf_file)
    reader = PdfReader(BytesIO(pdf_bytes))

    # Decrypt the PDF if it's encrypted
    if reader.is_encrypted:
        failure = (
            "Failed to decrypt the PDF. Check the password."
            if password
            else "The PDF is encrypted. Please provide a password."
        )
        try:
            # Trying "" lets documents protected only against editing open
            # without asking the user for a password.
            unlocked = reader.decrypt(password or "")
        except Exception as e:
            raise ValueError(failure) from e
        # decrypt() reports a wrong password through a falsy return value
        # rather than by raising, so the result must be checked explicitly.
        if not unlocked:
            raise ValueError(failure)

    document = Document()

    # Extract text from each page
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if text and text.strip():
            document.add_paragraph(text)
            continue

        # No usable text layer: rasterise only this page and OCR it, so the
        # output keeps page order and other pages are not duplicated.
        # NOTE(review): the password is forwarded as the *user* password;
        # confirm behaviour for files opened with an owner password.
        images = convert_from_bytes(
            pdf_bytes,
            first_page=page_number,
            last_page=page_number,
            userpw=password or None,
        )
        for image in images:
            document.add_paragraph(pytesseract.image_to_string(image))

    word_file = BytesIO()
    document.save(word_file)
    word_file.seek(0)  # rewind so callers can read the document from the start
    return word_file


# Streamlit app configuration
st.set_page_config(page_title="PDF to Word Converter", page_icon="🖋", layout="centered")

# App header
st.title("PDF to Word Converter")
st.write("Upload a PDF file, and we will convert it into a Word document for you.")

# File uploader
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
password = st.text_input("Enter password (if the PDF is encrypted):", type="password")

if uploaded_file is not None:
    with st.spinner("Converting PDF to Word..."):
        try:
            word_file = pdf_to_word(uploaded_file, password)
            st.success("Conversion successful!")
            st.download_button(
                label="Download Word file",
                data=word_file,
                file_name="converted.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            )
        except ValueError as ve:
            st.error(str(ve))