Spaces:
Running
Running
File size: 3,838 Bytes
5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 5374321 fb15ef8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
import os
import fitz # PyMuPDF for PDF handling
import pytesseract # OCR for text extraction
from PIL import Image
import tempfile
import streamlit as st
def extract_text_with_tesseract(pdf_path, dpi=300):
    """
    Extract text with bounding box positions using Tesseract OCR.

    Every page is rasterized in memory and handed to Tesseract directly, so
    no intermediate image files are written to the working directory (the
    previous fixed ``temp_page_N.png`` names collided when several sessions
    ran at once and were left behind whenever OCR failed).

    :param pdf_path: Path to the input PDF file.
    :param dpi: Resolution used to rasterize each page before OCR.
    :return: List with one entry per page; each entry is a list of
        dictionaries with the keys "text", "x0", "y0", "x1", "y1" and
        "font_size". All numbers are in pixels of the rendered page image;
        "font_size" is the height of the word's bounding box.
    """
    extracted_data = []
    # The context manager closes the document even when rendering or OCR
    # raises, so the source PDF is not kept open (and can be deleted by the
    # caller afterwards).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # alpha=False guarantees 3 samples per pixel, matching "RGB" below.
            pix = page.get_pixmap(dpi=dpi, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Perform OCR using Tesseract
            ocr_result = pytesseract.image_to_data(
                img, output_type=pytesseract.Output.DICT
            )
            words = zip(
                ocr_result["text"],
                ocr_result["left"],
                ocr_result["top"],
                ocr_result["width"],
                ocr_result["height"],
            )
            page_data = [
                {
                    "text": text,
                    "x0": left,
                    "y0": top,
                    "x1": left + width,
                    "y1": top + height,
                    "font_size": height,
                }
                for text, left, top, width, height in words
                if text.strip()  # Ignore empty text
            ]
            extracted_data.append(page_data)
    return extracted_data
def overlay_text_with_fonts(pdf_path, extracted_data, output_pdf_path, dpi=300):
    """
    Overlay extracted text onto the original PDF using PyMuPDF.

    The OCR boxes in ``extracted_data`` are measured in pixels of a page
    image rendered at ``dpi``, while PDF page coordinates are expressed in
    points (72 per inch). Positions and sizes are therefore scaled by
    ``72 / dpi`` before the text is inserted; without that conversion the
    text lands far away from the words it belongs to.

    :param pdf_path: Path to the input PDF file.
    :param extracted_data: Extracted text and positions, one list of word
        dictionaries per page (keys "text", "x0", "y0", "x1", "y1",
        "font_size").
    :param output_pdf_path: Path to save the output PDF file.
    :param dpi: Resolution the OCR image was rendered at; must match the
        value used during extraction.
    :raises ValueError: If ``dpi`` is not positive.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    default_font = "Helvetica"
    scale = 72.0 / dpi  # image pixels -> PDF points

    # Close the document when done (or on error) so the input file handle
    # is released before the caller deletes the temporary PDF.
    with fitz.open(pdf_path) as doc:
        for page_num, page_data in enumerate(extracted_data):
            page = doc[page_num]
            for item in page_data:
                # The box height is the best available estimate of the
                # glyph size once converted to points.
                fontsize = item["font_size"] * scale
                if fontsize <= 0:
                    continue  # degenerate OCR box, nothing sensible to draw
                # insert_text anchors at the baseline start of the first
                # character, so use the bottom edge of the box (y1), not
                # the top (y0).
                # NOTE(review): rotated pages are not compensated for here.
                page.insert_text(
                    (item["x0"] * scale, item["y1"] * scale),
                    item["text"],
                    fontsize=fontsize,
                    fontname=default_font,
                    color=(0, 0, 0),  # Black text
                )
        doc.save(output_pdf_path)
    print(f"PDF saved to: {output_pdf_path}")
def process_pdf(uploaded_pdf, output_pdf_path):
    """
    Process the uploaded PDF to extract text using Tesseract and overlay it.

    The upload is spooled to a temporary ``.pdf`` file because the OCR and
    overlay steps work on file paths. That file is removed again whether
    processing succeeds or fails.

    :param uploaded_pdf: Uploaded PDF file (any object with a ``read()``
        method returning the PDF bytes).
    :param output_pdf_path: Path to save the output PDF file.
    """
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_pdf_path = temp_pdf.name
    try:
        # Close the handle before the path is reopened by the helpers;
        # some platforms refuse a second open on a still-open temp file.
        with temp_pdf:
            temp_pdf.write(uploaded_pdf.read())
        extracted_data = extract_text_with_tesseract(temp_pdf_path)
        overlay_text_with_fonts(temp_pdf_path, extracted_data, output_pdf_path)
    finally:
        # Always clean up, including when OCR or saving raises.
        os.remove(temp_pdf_path)
# Streamlit App
def main():
    """
    Run the Streamlit UI: upload a PDF, OCR it, and offer the result for download.

    The processed PDF is written to a uniquely named temporary file, read
    into memory and deleted straight away, so concurrent sessions never
    share an output path and no file is left behind when processing fails.
    """
    st.title("PDF OCR and Text Conversion Tool")
    st.write("Upload a PDF to extract and overlay text as editable layers.")
    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if not uploaded_file:
        return

    # Reserve a unique output path; the file is closed immediately so the
    # processing step can write to it by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as output_file:
        output_pdf_path = output_file.name
    try:
        with st.spinner("Processing your PDF..."):
            process_pdf(uploaded_file, output_pdf_path)
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()
    finally:
        os.remove(output_pdf_path)

    st.success("PDF processing complete!")
    # Provide a download button for the processed PDF
    st.download_button(
        label="Download Converted PDF",
        data=pdf_bytes,
        file_name="converted_output.pdf",
        mime="application/pdf",
    )


if __name__ == "__main__":
    main()
|