File size: 4,461 Bytes
5374321
 
 
 
 
 
6bc31ab
 
5374321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8290c12
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import fitz  # PyMuPDF for PDF handling
import easyocr  # OCR for text extraction
import tempfile
import streamlit as st

# Set OpenCV's GStreamer video-I/O backend priority to "0" (intended to
# disable it and so prevent OpenCV-related errors).
# NOTE(review): this assignment runs after `import easyocr` above; presumably
# OpenCV reads the variable lazily rather than at import time — confirm, or
# move this line above the third-party imports.
os.environ["OPENCV_VIDEOIO_PRIORITY_GSTREAMER"] = "0"

def extract_text_with_ocr(pdf_path, languages=("en",), dpi=300):
    """
    Extract text with positions from every page of a PDF using OCR.

    Each page is rendered to a PNG at ``dpi`` and passed to EasyOCR. The
    pixel coordinates of the detected boxes are scaled back to PDF points
    (72 per inch) so they can be used directly as positions on the page.

    :param pdf_path: Path to the input PDF file.
    :param languages: EasyOCR language codes to recognise. Defaults to
        English only; pass e.g. ``("en", "ar")`` to also recognise Arabic.
    :param dpi: Resolution used to rasterise each page before OCR.
    :return: List with one entry per page. Each entry is a list of
        dictionaries with the keys ``text``, ``x0``, ``y0`` (bottom edge of
        the detected box), ``font_size`` (box height) and ``confidence``.
    """
    extracted_data = []

    # Rendering at `dpi` yields pixel coordinates, while positions on a PDF
    # page are expressed in points (72 per inch).
    scale = 72.0 / dpi

    doc = fitz.open(pdf_path)
    try:
        # Building the reader loads the OCR models, so do it once for the
        # whole document rather than once per page.
        reader = easyocr.Reader(list(languages))

        # Page images live in a private directory that is removed even when
        # OCR fails, and cannot collide with other runs in the working dir.
        with tempfile.TemporaryDirectory() as work_dir:
            for page_num, page in enumerate(doc):
                pix = page.get_pixmap(dpi=dpi)  # Convert PDF page to image
                image_path = os.path.join(work_dir, f"temp_page_{page_num}.png")
                pix.save(image_path)

                # detail=1 returns (bounding box, text, confidence) triples
                results = reader.readtext(image_path, detail=1)

                page_data = []
                for bbox, text, confidence in results:
                    # Use the extremes of all four corners so that slightly
                    # rotated boxes never produce a negative height.
                    xs = [point[0] for point in bbox]
                    ys = [point[1] for point in bbox]
                    top, bottom = min(ys), max(ys)
                    page_data.append({
                        "text": text,
                        "x0": float(min(xs)) * scale,
                        # Bottom edge of the box: text is anchored at its
                        # lower-left corner when it is inserted later.
                        "y0": float(bottom) * scale,
                        "font_size": float(bottom - top) * scale,  # Approximate font size
                        "confidence": confidence,
                    })

                extracted_data.append(page_data)
    finally:
        # Release the file handle so callers can delete the PDF afterwards.
        doc.close()

    return extracted_data


def overlay_text_with_fonts(pdf_path, extracted_data, output_pdf_path, min_confidence=0.8):
    """
    Overlay extracted text onto the original PDF and save the result.

    :param pdf_path: Path to the input PDF file.
    :param extracted_data: List with one entry per page; each entry is a list
        of dictionaries with the keys ``text``, ``x0``, ``y0``, ``font_size``
        and ``confidence``.
    :param output_pdf_path: Path to save the output PDF file.
    :param min_confidence: Only items whose confidence is strictly greater
        than this value are written to the page.
    """
    # Define default font settings
    default_font = "Helvetica"  # You can replace it with specific fonts like "Arial" or others.

    doc = fitz.open(pdf_path)
    try:
        for page_num, page_data in enumerate(extracted_data):
            page = doc[page_num]

            for item in page_data:
                # Only overlay high-confidence text
                if not item["confidence"] > min_confidence:
                    continue
                # Nothing sensible can be drawn for empty text or a
                # zero/negative font size.
                if not item["text"] or item["font_size"] <= 0:
                    continue

                page.insert_text(
                    (item["x0"], item["y0"]),
                    item["text"],
                    fontsize=item["font_size"],
                    fontname=default_font,
                    color=(0, 0, 0),  # Black text
                    render_mode=0,  # Ensure text is not outlined
                )

        doc.save(output_pdf_path)
    finally:
        # Always release the input file, even if inserting or saving failed,
        # so the caller can remove it afterwards.
        doc.close()

    print(f"PDF saved to: {output_pdf_path}")


def process_pdf(uploaded_pdf, output_pdf_path):
    """
    Process the uploaded PDF to extract text using OCR and overlay it as editable text.

    The upload is first copied to a temporary file on disk, which is removed
    again when processing finishes, whether it succeeded or raised.

    :param uploaded_pdf: The uploaded PDF file (any object with ``read()``
        returning the PDF bytes).
    :param output_pdf_path: Path to save the output PDF file.
    """
    # delete=False: the file is reopened by path below, so it has to outlive
    # this handle; it is removed explicitly in the `finally` clause.
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_pdf_path = temp_pdf.name
    try:
        with temp_pdf:
            temp_pdf.write(uploaded_pdf.read())

        # Step 1: Extract text using OCR
        extracted_data = extract_text_with_ocr(temp_pdf_path)

        # Step 2: Overlay extracted text onto the original PDF
        overlay_text_with_fonts(temp_pdf_path, extracted_data, output_pdf_path)
    finally:
        # Cleanup temporary file, also when a step above failed
        try:
            os.remove(temp_pdf_path)
        except FileNotFoundError:
            pass


# Streamlit App
def main():
    """
    Render the Streamlit page: upload a PDF, process it, offer the result for download.

    The converted PDF is written to a private temporary directory that is
    removed once its bytes have been read, so concurrent sessions never share
    an output file and nothing is left behind if processing fails.
    """
    st.title("PDF Text Conversion Tool")
    st.write("Upload a PDF to convert vector text into regular, editable text.")

    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if not uploaded_file:
        return

    # A fresh directory per run avoids clashes on a fixed file name in the
    # working directory and guarantees cleanup on every exit path.
    with tempfile.TemporaryDirectory() as work_dir:
        output_pdf_path = os.path.join(work_dir, "converted_output.pdf")

        with st.spinner("Processing your PDF..."):
            process_pdf(uploaded_file, output_pdf_path)

        # Read the result into memory so the button does not depend on a file
        # that is deleted when this block exits.
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()

    st.success("PDF processing complete!")

    # Provide a download button for the processed PDF
    st.download_button(
        label="Download Converted PDF",
        data=pdf_bytes,
        file_name="converted_output.pdf",
        mime="application/pdf",
    )


# Run the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()