AzizWazir committed on
Commit
576dfa7
·
verified ·
1 Parent(s): 972cb11

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -24
app.py CHANGED
@@ -2,9 +2,11 @@ import streamlit as st
2
  from PyPDF2 import PdfReader
3
  from docx import Document
4
  from io import BytesIO
 
 
5
 
6
  def pdf_to_word(pdf_file, password=None):
7
- """Convert a PDF file to a Word file with optional decryption."""
8
  reader = PdfReader(pdf_file)
9
 
10
  # Decrypt the PDF if it's encrypted
@@ -18,12 +20,22 @@ def pdf_to_word(pdf_file, password=None):
18
  raise ValueError("The PDF is encrypted. Please provide a password.")
19
 
20
  document = Document()
 
 
 
21
  for page in reader.pages:
22
- if page.extract_text(): # Ensure text is extracted
23
  text = page.extract_text()
24
  document.add_paragraph(text)
25
  else:
26
- document.add_paragraph("[This page contains non-extractable content or images]")
 
 
 
 
 
 
 
27
 
28
  word_file = BytesIO()
29
  document.save(word_file)
@@ -35,24 +47,4 @@ st.set_page_config(page_title="PDF to Word Converter", page_icon="🖋", layout=
35
 
36
  # App header
37
  st.title("PDF to Word Converter")
38
- st.write("Upload a PDF file, and we will convert it into a Word document for you.")
39
-
40
- # File uploader
41
- uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
42
- password = st.text_input("Enter password (if the PDF is encrypted):", type="password")
43
-
44
- if uploaded_file is not None:
45
- with st.spinner("Converting PDF to Word..."):
46
- try:
47
- word_file = pdf_to_word(uploaded_file, password)
48
- st.success("Conversion successful!")
49
- st.download_button(
50
- label="Download Word file",
51
- data=word_file,
52
- file_name="converted.docx",
53
- mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
54
- )
55
- except ValueError as ve:
56
- st.error(str(ve))
57
- except Exception as e:
58
- st.error(f"An error occurred: {str(e)}")
 
2
  from PyPDF2 import PdfReader
3
  from docx import Document
4
  from io import BytesIO
5
+ from pdf2image import convert_from_bytes
6
+ import pytesseract
7
 
8
  def pdf_to_word(pdf_file, password=None):
9
+ """Convert a PDF file to a Word file with optional decryption and OCR."""
10
  reader = PdfReader(pdf_file)
11
 
12
  # Decrypt the PDF if it's encrypted
 
20
  raise ValueError("The PDF is encrypted. Please provide a password.")
21
 
22
  document = Document()
23
+
24
+ # Extract text from each page
25
+ pdf_bytes = pdf_file.read()
26
  for page in reader.pages:
27
+ if page.extract_text(): # Use PyPDF2 for text extraction
28
  text = page.extract_text()
29
  document.add_paragraph(text)
30
  else:
31
+ # Use OCR for non-extractable pages
32
+ images = convert_from_bytes(pdf_bytes)
33
+ for image in images:
34
+ ocr_text = pytesseract.image_to_string(image)
35
+ if ocr_text.strip():
36
+ document.add_paragraph(ocr_text)
37
+ else:
38
+ document.add_paragraph("[This page contains non-extractable content or images]")
39
 
40
  word_file = BytesIO()
41
  document.save(word_file)
 
47
 
48
  # App header
49
  st.title("PDF to Word Converter")
50
+ st.write("Upload a PDF file,