import re
import time
from io import BytesIO

import pytesseract
import streamlit as st
from docx import Document
from pdf2image import convert_from_bytes
from PyPDF2 import PdfReader

# Configure Tesseract path (if needed)
# pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract'

# Matches every character that XML 1.0 cannot represent. python-docx stores
# text as XML and rejects such characters; Tesseract output commonly ends
# with a form feed, and PDF text layers may contain stray control bytes.
_XML_ILLEGAL_CHARS = re.compile(
    r"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]"
)


def _xml_safe(text):
    """Return *text* with characters that XML 1.0 forbids removed."""
    return _XML_ILLEGAL_CHARS.sub("", text)


def _unlock(reader, password):
    """Decrypt *reader* in place, raising ValueError if it stays locked."""
    failure = (
        "Failed to decrypt the PDF. Check the password."
        if password
        else "The PDF is encrypted. Please provide a password."
    )
    try:
        # Many PDFs carry only an owner password and open with an empty user
        # password, so try "" when the caller supplied nothing.
        unlocked = reader.decrypt(password or "")
    except Exception as exc:
        raise ValueError(failure) from exc
    # decrypt() signals a wrong password through a falsy return value rather
    # than an exception, so the result has to be checked explicitly.
    if not unlocked:
        raise ValueError(failure)


def pdf_to_word(pdf_file, password=None):
    """Convert a PDF file to a Word file with optional decryption and OCR.

    Each page's text layer is extracted first; pages without usable text are
    rendered to images and passed through Tesseract OCR. Progress is reported
    through Streamlit widgets created inside this function.

    Args:
        pdf_file: Seekable binary file-like object exposing a ``type``
            attribute holding its MIME type (as the value returned by
            ``st.file_uploader`` is used here).
        password: Password for an encrypted PDF. ``None`` or ``""`` means no
            password was supplied.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the .docx data.

    Raises:
        ValueError: For a non-PDF upload, a PDF that cannot be decrypted, or
            any other failure during conversion (the original exception is
            chained as the cause).
    """
    try:
        # Ensure the file is a valid PDF
        if pdf_file.type != "application/pdf":
            raise ValueError("Invalid file type. Please upload a PDF file.")

        # Read the raw bytes once, from the start of the stream, and hand the
        # parser its own buffer. Reading after PdfReader has consumed the
        # stream (or after an earlier run left the cursor at EOF) would give
        # the OCR fallback empty or truncated data.
        pdf_file.seek(0)
        pdf_bytes = pdf_file.read()
        reader = PdfReader(BytesIO(pdf_bytes))

        # Decrypt the PDF if it's encrypted
        if reader.is_encrypted:
            _unlock(reader, password)

        # Create a Word document
        document = Document()

        total_pages = len(reader.pages)
        progress_bar = st.progress(0)
        status_text = st.empty()

        for i, page in enumerate(reader.pages):
            status_text.text(f"Processing page {i + 1} of {total_pages}...")
            progress_bar.progress((i + 1) / total_pages)

            # Try extracting text directly; whitespace-only output is treated
            # the same as no text so scanned pages still reach OCR.
            text = _xml_safe(page.extract_text() or "")
            if text.strip():
                document.add_paragraph(text)
                continue

            # Use OCR for non-extractable pages. The password is forwarded
            # because the page is rendered from the original encrypted bytes.
            images = convert_from_bytes(
                pdf_bytes,
                first_page=i + 1,
                last_page=i + 1,
                userpw=password or None,
            )
            for image in images:
                ocr_text = _xml_safe(pytesseract.image_to_string(image))
                if ocr_text.strip():
                    document.add_paragraph(ocr_text)
                else:
                    document.add_paragraph(
                        "[This page contains non-extractable content or images]"
                    )

        # Save the Word document to a BytesIO object
        word_file = BytesIO()
        document.save(word_file)
        word_file.seek(0)
        return word_file

    except Exception as e:
        # Callers only need to handle ValueError; keep the cause chained so
        # the underlying failure remains visible in tracebacks.
        raise ValueError(f"An error occurred: {e}") from e


# Streamlit app configuration
st.set_page_config(page_title="PDF to Word Converter", page_icon="🖋", layout="centered")

# App header
st.title("📄 PDF to Word Converter")
st.write("Upload a PDF file to convert it into an editable Word document.")

# MIME type advertised for the generated .docx download.
DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

# File picker restricted to the "pdf" type.
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # The password field is shown for every upload; its value is passed to
    # the converter as entered.
    password = st.text_input("Enter PDF password (if encrypted)", type="password")
    convert_requested = st.button("Convert to Word")

    if convert_requested:
        try:
            # Run the conversion while a spinner is displayed.
            with st.spinner("Converting PDF to Word..."):
                docx_buffer = pdf_to_word(uploaded_file, password)

            # Report success and offer the converted document for download.
            st.success("Conversion successful!")
            st.download_button(
                label="Download Word file",
                data=docx_buffer,
                file_name="converted.docx",
                mime=DOCX_MIME,
            )
        except Exception as exc:
            # Any failure above is shown in the page instead of a traceback.
            st.error(f"Error: {exc}")