File size: 3,546 Bytes
5374321
 
2d1a10e
fb15ef8
5374321
 
 
2d1a10e
 
5374321
2d1a10e
5374321
2d1a10e
5374321
2d1a10e
 
 
 
 
5374321
 
 
 
fb15ef8
5374321
 
 
2d1a10e
 
 
 
5374321
2d1a10e
 
5374321
 
 
 
2d1a10e
5374321
 
 
 
2d1a10e
5374321
2d1a10e
5374321
 
 
 
2d1a10e
 
 
5374321
 
 
2d1a10e
 
 
 
 
5374321
 
 
 
 
 
 
2d1a10e
fb15ef8
5374321
 
 
 
 
 
2d1a10e
5374321
 
fb15ef8
5374321
 
 
 
2d1a10e
 
5374321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb15ef8
5374321
 
 
fb15ef8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import fitz  # PyMuPDF for PDF handling
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import tempfile
import streamlit as st


def extract_text_with_donut(pdf_path):
    """
    Extract text from every page of a PDF using the Hugging Face Donut model.

    Each page is rasterised at 300 DPI, passed through the Donut processor and
    model, and the decoded output is collected per page.

    :param pdf_path: Path to the input PDF file.
    :return: List of dicts, one per page, shaped
        ``{"page_num": <0-based page index>, "text": <decoded string>}``.
    """
    # NOTE(review): the checkpoint is loaded on every call; callers that
    # process many PDFs may want to cache the processor/model pair.
    processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
    model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")

    extracted_text = []
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc):
            # Render the page to a high-resolution RGB raster (no alpha
            # channel, so the sample buffer is exactly 3 bytes per pixel).
            pix = page.get_pixmap(dpi=300, alpha=False)

            # Build the PIL image straight from the pixmap buffer instead of
            # round-tripping through a fixed-name PNG in the working
            # directory: that file could collide between concurrent runs and
            # was left behind whenever OCR raised.
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Perform OCR using Donut
            inputs = processor(images=image, return_tensors="pt")
            outputs = model.generate(**inputs)

            page_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
            extracted_text.append({"page_num": page_num, "text": page_text})
    finally:
        # Always release the PDF handle, even if rendering or OCR fails.
        doc.close()

    return extracted_text


def overlay_text_with_fonts(pdf_path, extracted_data, output_pdf_path):
    """
    Overlay extracted text onto the pages of the original PDF and save a copy.

    For every entry in ``extracted_data`` the text is written line by line,
    starting at (50, 50) on the referenced page and advancing 12 units per
    line, in 10pt black Helvetica.

    :param pdf_path: Path to the input PDF file.
    :param extracted_data: Iterable of dicts with ``"page_num"`` (0-based page
        index) and ``"text"`` (string, lines separated by ``"\\n"``).
    :param output_pdf_path: Path to save the output PDF file.
    """
    doc = fitz.open(pdf_path)
    try:
        for item in extracted_data:
            page = doc[item["page_num"]]

            # Add extracted text to the page, one line at a time.
            y = 50  # Starting vertical position
            for line in item["text"].split("\n"):
                page.insert_text(
                    (50, y),
                    line,
                    fontsize=10,
                    fontname="Helvetica",
                    color=(0, 0, 0),
                )
                y += 12  # Line spacing

        doc.save(output_pdf_path)
    finally:
        # Release the source file handle even when insertion or saving fails;
        # a leaked handle can prevent the caller from deleting pdf_path.
        doc.close()

    print(f"PDF saved to: {output_pdf_path}")


def process_pdf(uploaded_pdf, output_pdf_path):
    """
    Process the uploaded PDF to extract text using Hugging Face Donut and overlay it.

    The upload is spooled to a temporary ``.pdf`` file on disk, run through
    ``extract_text_with_donut`` and ``overlay_text_with_fonts``, and the
    temporary file is removed afterwards whether or not processing succeeded.

    :param uploaded_pdf: Uploaded PDF file (any object with a ``read()``
        method returning the PDF bytes).
    :param output_pdf_path: Path to save the output PDF file.
    """
    temp_pdf_path = None
    try:
        # delete=False so the file can be reopened by path after this handle
        # is closed; cleanup is done explicitly in the finally clause.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(uploaded_pdf.read())

        extracted_data = extract_text_with_donut(temp_pdf_path)
        overlay_text_with_fonts(temp_pdf_path, extracted_data, output_pdf_path)
    finally:
        # Do not leak the temporary copy when extraction or overlay raises.
        if temp_pdf_path is not None:
            os.remove(temp_pdf_path)


# Streamlit App
def main():
    """
    Streamlit entry point: upload a PDF, run OCR + overlay, offer a download.

    The processed PDF is written into a per-run temporary directory rather
    than a fixed file in the working directory, so concurrent sessions cannot
    overwrite each other's output and nothing is left behind on failure.
    """
    st.title("Hugging Face OCR Text Extraction Tool")
    st.write("Upload a PDF to extract and overlay text using Hugging Face Donut.")

    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if not uploaded_file:
        return

    with tempfile.TemporaryDirectory() as work_dir:
        output_pdf_path = os.path.join(work_dir, "converted_output.pdf")

        with st.spinner("Processing your PDF..."):
            process_pdf(uploaded_file, output_pdf_path)

        # Read the result into memory before the temporary directory (and
        # the file in it) is removed on leaving this block.
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()

    st.success("PDF processing complete!")

    # Provide a download button for the processed PDF
    st.download_button(
        label="Download Converted PDF",
        data=pdf_bytes,
        file_name="converted_output.pdf",
        mime="application/pdf",
    )


# Run the Streamlit app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()