AzizWazir committed on
Commit
4f0d3b9
·
verified ·
1 Parent(s): d40eb05

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -5
app.py CHANGED
@@ -2,9 +2,11 @@ import streamlit as st
2
  from PyPDF2 import PdfReader
3
  from docx import Document
4
  from io import BytesIO
 
 
5
 
6
  def pdf_to_word(pdf_file, password=None):
7
- """Convert a PDF file to a Word file with optional decryption."""
8
  reader = PdfReader(pdf_file)
9
 
10
  # Decrypt the PDF if it's encrypted
@@ -18,12 +20,19 @@ def pdf_to_word(pdf_file, password=None):
18
  raise ValueError("The PDF is encrypted. Please provide a password.")
19
 
20
  document = Document()
 
 
21
  for page in reader.pages:
22
- if page.extract_text(): # Ensure text is extracted
23
  text = page.extract_text()
24
  document.add_paragraph(text)
25
  else:
26
- document.add_paragraph("[This page contains non-extractable content or images]")
 
 
 
 
 
27
 
28
  word_file = BytesIO()
29
  document.save(word_file)
@@ -53,5 +62,4 @@ if uploaded_file is not None:
53
  mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
54
  )
55
  except ValueError as ve:
56
- st.error(str(ve))
57
- ex
 
2
  from PyPDF2 import PdfReader
3
  from docx import Document
4
  from io import BytesIO
5
+ from pdf2image import convert_from_bytes
6
+ import pytesseract
7
 
8
  def pdf_to_word(pdf_file, password=None):
9
+ """Convert a PDF file to a Word file with optional decryption and OCR support."""
10
  reader = PdfReader(pdf_file)
11
 
12
  # Decrypt the PDF if it's encrypted
 
20
  raise ValueError("The PDF is encrypted. Please provide a password.")
21
 
22
  document = Document()
23
+
24
+ # Extract text from each page
25
  for page in reader.pages:
26
+ if page.extract_text(): # Use PyPDF2 for text extraction
27
  text = page.extract_text()
28
  document.add_paragraph(text)
29
  else:
30
+ # Convert the page to an image and use OCR
31
+ pdf_bytes = pdf_file.read()
32
+ images = convert_from_bytes(pdf_bytes)
33
+ for image in images:
34
+ text = pytesseract.image_to_string(image)
35
+ document.add_paragraph(text)
36
 
37
  word_file = BytesIO()
38
  document.save(word_file)
 
62
  mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
63
  )
64
  except ValueError as ve:
65
+ st.error(str(ve