PDF-Editor

Build error

PDF-Editor / app.py

Update app.py

576dfa7 verified 10 months ago

1.67 kB

	import streamlit as st
	from PyPDF2 import PdfReader
	from docx import Document
	from io import BytesIO
	from pdf2image import convert_from_bytes
	import pytesseract

	def pdf_to_word(pdf_file, password=None):
	    """Convert a PDF file to a Word file with optional decryption and OCR.

	    Each page's text is taken from PyPDF2 when available. A page with no
	    extractable text is rendered to an image with pdf2image and passed
	    through Tesseract OCR instead.

	    Args:
	        pdf_file: Seekable binary file-like object holding the PDF.
	        password: Password used to decrypt an encrypted PDF, or None.

	    Returns:
	        A BytesIO, rewound to offset 0, containing the generated .docx.

	    Raises:
	        ValueError: If the PDF is encrypted and no password was given, or
	            if decryption with the given password fails.
	    """
	    # Read the raw bytes once, from the start of the stream, and give the
	    # parser its own buffer. Reading after PdfReader has consumed the stream
	    # would start from wherever the parser left the file position.
	    pdf_file.seek(0)
	    pdf_bytes = pdf_file.read()
	    reader = PdfReader(BytesIO(pdf_bytes))

	    # Decrypt the PDF if it's encrypted
	    if reader.is_encrypted:
	        if not password:
	            raise ValueError("The PDF is encrypted. Please provide a password.")
	        try:
	            decrypt_result = reader.decrypt(password)
	        except Exception as e:
	            raise ValueError("Failed to decrypt the PDF. Check the password.") from e
	        # decrypt() signals a wrong password with a falsy result, not an
	        # exception, so the return value has to be checked explicitly.
	        if not decrypt_result:
	            raise ValueError("Failed to decrypt the PDF. Check the password.")

	    document = Document()

	    # The renderer opens the PDF independently of PyPDF2, so it needs the
	    # password as well when one was supplied.
	    render_kwargs = {"userpw": password} if password else {}

	    for page_number, page in enumerate(reader.pages, start=1):
	        text = page.extract_text()  # Use PyPDF2 for text extraction
	        if text:
	            document.add_paragraph(text)
	            continue

	        # Use OCR for non-extractable pages. Render only the current page;
	        # rendering the whole document here would re-OCR every page each
	        # time a single page lacks a text layer.
	        images = convert_from_bytes(
	            pdf_bytes,
	            first_page=page_number,
	            last_page=page_number,
	            **render_kwargs,
	        )
	        for image in images:
	            ocr_text = pytesseract.image_to_string(image)
	            if ocr_text.strip():
	                document.add_paragraph(ocr_text)
	            else:
	                document.add_paragraph("[This page contains non-extractable content or images]")

	    word_file = BytesIO()
	    document.save(word_file)
	    word_file.seek(0)
	    return word_file

	# Page-level Streamlit settings: browser tab title, icon and layout
	st.set_page_config(
	    page_title="PDF to Word Converter",
	    page_icon="🖋",
	    layout="centered",
	)

	# Main heading shown at the top of the app
	st.title("PDF to Word Converter")
	st.write("Upload a PDF file,