# Spaces: Running  (page-scrape residue, not part of the program)
from io import BytesIO

import pytesseract
import streamlit as st
from docx import Document
from pdf2image import convert_from_bytes
from PyPDF2 import PdfReader
# Code points below U+0020 that XML 1.0 forbids (everything except tab, LF, CR).
_XML_ILLEGAL_CONTROLS = dict.fromkeys(
    code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
)


def _xml_safe(text):
    """Return *text* with XML-illegal C0 control characters removed."""
    return text.translate(_XML_ILLEGAL_CONTROLS)


def pdf_to_word(pdf_file, password=None):
    """Convert a PDF file to a Word file with optional decryption and OCR.

    Pages for which PyPDF2 extracts text are copied as one paragraph each.
    Pages without extractable text are rendered to images and run through
    Tesseract OCR; if OCR finds nothing either, a placeholder paragraph is
    written instead.

    Args:
        pdf_file: Binary file-like object (must provide ``read()``) holding
            the PDF.
        password: Password for an encrypted PDF. ``None`` or an empty string
            means no password was supplied.

    Returns:
        A ``BytesIO`` containing the generated ``.docx``, rewound to offset 0.

    Raises:
        ValueError: If the PDF is encrypted and no password was supplied, or
            if the supplied password does not decrypt it.
    """
    # Read the input exactly once, from the start, and hand PyPDF2 and
    # pdf2image their own views of the same bytes. Reading the stream after
    # PdfReader has already seeked around in it yields whatever happens to be
    # left after the reader's last position, not the whole document.
    seekable = getattr(pdf_file, "seekable", None)
    if callable(seekable) and seekable():
        pdf_file.seek(0)
    pdf_bytes = pdf_file.read()
    reader = PdfReader(BytesIO(pdf_bytes))

    # Extra keyword arguments pdf2image needs to open an encrypted document.
    ocr_password = {}

    # Decrypt the PDF if it's encrypted
    if reader.is_encrypted:
        if not password:
            raise ValueError("The PDF is encrypted. Please provide a password.")
        try:
            outcome = reader.decrypt(password)
        except Exception as e:
            raise ValueError("Failed to decrypt the PDF. Check the password.") from e
        # decrypt() signals a wrong password through a falsy return value
        # (0 / NOT_DECRYPTED) rather than by raising.
        if not outcome:
            raise ValueError("Failed to decrypt the PDF. Check the password.")
        # A return value of 2 means the owner password matched; otherwise it
        # was the user password. Forward it under the matching pdf2image key.
        ocr_password = {"ownerpw" if outcome == 2 else "userpw": password}

    document = Document()

    # Extract text from each page (page numbers are 1-based for pdf2image)
    for page_number, page in enumerate(reader.pages, start=1):
        text = _xml_safe(page.extract_text() or "")
        if text:
            document.add_paragraph(text)
            continue

        # Use OCR for non-extractable pages. Render only this page:
        # converting the whole document here would repeat every page's OCR
        # output once per text-less page.
        images = convert_from_bytes(
            pdf_bytes,
            first_page=page_number,
            last_page=page_number,
            **ocr_password,
        )
        for image in images:
            # OCR output may carry control characters (e.g. a trailing form
            # feed) that cannot be stored in the document's XML.
            ocr_text = _xml_safe(pytesseract.image_to_string(image))
            if ocr_text.strip():
                document.add_paragraph(ocr_text)
            else:
                document.add_paragraph("[This page contains non-extractable content or images]")

    word_file = BytesIO()
    document.save(word_file)
    word_file.seek(0)  # rewind so the caller can read from the beginning
    return word_file
# Streamlit app configuration
st.set_page_config(page_title="PDF to Word Converter", page_icon="🖋", layout="centered")

# App header
st.title("PDF to Word Converter")
st.write("Upload a PDF file")

# Upload PDF file widget
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Optionally ask for a password if the PDF is encrypted
    password = st.text_input("Enter PDF password (if encrypted)", type="password")

    try:
        # Convert the PDF to Word. An untouched text box yields "", which is
        # normalised to None so "no password" has a single representation.
        word_file = pdf_to_word(uploaded_file, password or None)

        # Provide a download button for the Word file. The explicit MIME type
        # marks the payload as a .docx instead of a generic binary download.
        st.download_button(
            "Download Word file",
            word_file,
            file_name="converted.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
    except Exception as e:
        # Top-level UI boundary: show any conversion failure to the user.
        st.error(f"Error: {e}")