File size: 3,838 Bytes
5374321
 
fb15ef8
 
5374321
 
 
fb15ef8
5374321
fb15ef8
5374321
 
 
 
 
 
 
 
fb15ef8
5374321
 
 
fb15ef8
 
 
5374321
 
fb15ef8
 
 
 
 
 
 
 
 
 
5374321
 
 
 
 
 
 
 
 
 
 
fb15ef8
5374321
fb15ef8
5374321
 
 
 
fb15ef8
5374321
 
 
 
 
fb15ef8
 
 
 
 
 
 
5374321
 
 
 
 
 
 
fb15ef8
 
5374321
 
 
 
 
 
fb15ef8
5374321
 
fb15ef8
5374321
 
 
 
fb15ef8
 
5374321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb15ef8
5374321
 
 
fb15ef8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import fitz  # PyMuPDF for PDF handling
import pytesseract  # OCR for text extraction
from PIL import Image
import tempfile
import streamlit as st

def extract_text_with_tesseract(pdf_path):
    """
    Extract text with bounding box positions using Tesseract OCR.

    Every page is rendered to a 300-dpi PNG inside a private temporary
    directory, run through Tesseract, and the non-empty words are collected.

    :param pdf_path: Path to the input PDF file.
    :return: List with one entry per page. Each entry is a list of
        dictionaries with the keys ``text``, ``x0``, ``y0``, ``x1``, ``y1``
        and ``font_size``. Coordinates and ``font_size`` (the height of the
        word's bounding box) are in pixels of the 300-dpi page image, not in
        PDF points.
    """
    extracted_data = []

    # A private temporary directory avoids name clashes between concurrent
    # runs in the same working directory and guarantees the page images are
    # removed even when rendering or OCR raises. The document is closed on
    # exit as well.
    with tempfile.TemporaryDirectory() as work_dir, fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            pix = page.get_pixmap(dpi=300)  # Convert PDF page to high-resolution image
            image_path = os.path.join(work_dir, f"temp_page_{page_num}.png")
            pix.save(image_path)

            # Perform OCR using Tesseract; the image handle is closed before
            # the file is deleted (deleting an open file fails on Windows).
            with Image.open(image_path) as img:
                ocr_result = pytesseract.image_to_data(
                    img, output_type=pytesseract.Output.DICT
                )

            words = zip(
                ocr_result["text"],
                ocr_result["left"],
                ocr_result["top"],
                ocr_result["width"],
                ocr_result["height"],
            )
            page_data = [
                {
                    "text": text,
                    "x0": left,
                    "y0": top,
                    "x1": left + width,
                    "y1": top + height,
                    "font_size": height,
                }
                for text, left, top, width, height in words
                if text.strip()  # Ignore empty text
            ]
            extracted_data.append(page_data)

            # Drop the page image right away so disk usage stays at one page.
            os.remove(image_path)

    return extracted_data


def overlay_text_with_fonts(pdf_path, extracted_data, output_pdf_path, ocr_dpi=300):
    """
    Overlay extracted text onto the original PDF using PyMuPDF.

    The OCR boxes are expressed in pixels of the rasterized page, whereas
    PyMuPDF places text in PDF points (72 per inch). All positions and sizes
    are therefore scaled by ``72 / ocr_dpi`` before insertion.

    :param pdf_path: Path to the input PDF file.
    :param extracted_data: Extracted text and positions, one list of word
        dictionaries (``text``, ``x0``, ``y0``, ``x1``, ``y1``,
        ``font_size``) per page, in pixel units.
    :param output_pdf_path: Path to save the output PDF file.
    :param ocr_dpi: Resolution at which the pages were rasterized for OCR.
        The default matches the 300 dpi used by
        ``extract_text_with_tesseract``.
    :raises ValueError: If ``ocr_dpi`` is not positive.
    """
    if ocr_dpi <= 0:
        raise ValueError("ocr_dpi must be positive")

    default_font = "Helvetica"
    px_to_pt = 72 / ocr_dpi  # pixels of the OCR image -> PDF points

    # The context manager closes the document even if insertion or saving fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page_data in enumerate(extracted_data):
            page = doc[page_num]

            for item in page_data:
                page.insert_text(
                    # insert_text anchors the text at the bottom-left of its
                    # first character, so use the bottom edge of the word box
                    # (y1) rather than the top (y0).
                    (item["x0"] * px_to_pt, item["y1"] * px_to_pt),
                    item["text"],
                    # The box height approximates the glyph size once it is
                    # converted from pixels to points.
                    fontsize=item["font_size"] * px_to_pt,
                    fontname=default_font,
                    color=(0, 0, 0)  # Black text
                )

        doc.save(output_pdf_path)

    print(f"PDF saved to: {output_pdf_path}")


def process_pdf(uploaded_pdf, output_pdf_path):
    """
    Process the uploaded PDF to extract text using Tesseract and overlay it.

    The upload is first copied to a temporary ``.pdf`` file, because the
    extraction and overlay steps both work on a filesystem path. That file
    is removed afterwards, whether or not processing succeeds.

    :param uploaded_pdf: Uploaded PDF file; any object with a ``read()``
        method returning the PDF bytes.
    :param output_pdf_path: Path to save the output PDF file.
    """
    # delete=False so the file can be reopened by path after this handle is
    # closed; it is removed explicitly in the ``finally`` below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf_path = temp_pdf.name
        try:
            temp_pdf.write(uploaded_pdf.read())
        except BaseException:
            # Copying failed: close and remove the partial file, then re-raise.
            temp_pdf.close()
            os.remove(temp_pdf_path)
            raise

    try:
        extracted_data = extract_text_with_tesseract(temp_pdf_path)
        overlay_text_with_fonts(temp_pdf_path, extracted_data, output_pdf_path)
    finally:
        # Never leave the copied upload behind, even when OCR or overlay fails.
        os.remove(temp_pdf_path)


# Streamlit App
def main():
    """
    Run the Streamlit page: upload a PDF, process it, offer the result.

    The converted PDF is written into a private temporary directory, so
    concurrent sessions cannot overwrite or delete each other's output, and
    the file is removed even when processing fails. The bytes are read into
    memory before the directory is cleaned up and are then handed to the
    download button.
    """
    st.title("PDF OCR and Text Conversion Tool")
    st.write("Upload a PDF to extract and overlay text as editable layers.")

    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if not uploaded_file:
        return

    with tempfile.TemporaryDirectory() as work_dir:
        output_pdf_path = os.path.join(work_dir, "converted_output.pdf")

        with st.spinner("Processing your PDF..."):
            process_pdf(uploaded_file, output_pdf_path)

        # Read the result before the temporary directory disappears.
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()

    st.success("PDF processing complete!")

    # Provide a download button for the processed PDF
    st.download_button(
        label="Download Converted PDF",
        data=pdf_bytes,
        file_name="converted_output.pdf",
        mime="application/pdf"
    )


# Start the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()