Spaces:
Running
Running
File size: 3,546 Bytes
5374321 2d1a10e fb15ef8 5374321 2d1a10e 5374321 2d1a10e 5374321 2d1a10e 5374321 2d1a10e 5374321 fb15ef8 5374321 2d1a10e 5374321 2d1a10e 5374321 2d1a10e 5374321 2d1a10e 5374321 2d1a10e 5374321 2d1a10e 5374321 2d1a10e 5374321 2d1a10e fb15ef8 5374321 2d1a10e 5374321 fb15ef8 5374321 2d1a10e 5374321 fb15ef8 5374321 fb15ef8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import os
import fitz # PyMuPDF for PDF handling
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import tempfile
import streamlit as st
def extract_text_with_donut(pdf_path):
    """
    Extract text from every page of a PDF using the Hugging Face Donut model.

    Each page is rasterised at 300 DPI with PyMuPDF and handed to Donut as an
    in-memory RGB image, so no intermediate image files are written to disk.

    :param pdf_path: Path to the input PDF file.
    :return: List with one dict per page, in page order, of the form
        ``{"page_num": <0-based page index>, "text": <decoded model output>}``.
    """
    # Load the model and processor
    processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
    model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")

    extracted_text = []
    # The context manager closes the document even if rendering or inference
    # raises, so the PDF file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            # Convert PDF page to a high-resolution image. alpha=False keeps
            # the sample buffer at 3 bytes per pixel, matching the "RGB" mode
            # used to build the PIL image below.
            pix = page.get_pixmap(dpi=300, alpha=False)
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Perform OCR using Donut
            inputs = processor(images=image, return_tensors="pt")
            outputs = model.generate(**inputs)
            page_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
            extracted_text.append({"page_num": page_num, "text": page_text})
    return extracted_text
def overlay_text_with_fonts(pdf_path, extracted_data, output_pdf_path):
    """
    Overlay extracted text onto the original PDF and save the result.

    For every entry in ``extracted_data`` the text is split on newlines and
    written line by line in 10 pt black Helvetica, starting at (50, 50) and
    moving down 12 units per line. No check against the page height is made,
    so very long texts continue past the bottom of the page.

    :param pdf_path: Path to the input PDF file.
    :param extracted_data: Iterable of dicts with a ``"page_num"`` key
        (0-based page index) and a ``"text"`` key (string to write).
    :param output_pdf_path: Path to save the output PDF file.
    """
    left_margin = 50
    top_margin = 50  # Starting position
    line_spacing = 12

    # The context manager closes the document even if inserting or saving
    # fails, releasing the handle on the source file.
    with fitz.open(pdf_path) as doc:
        for item in extracted_data:
            page = doc[item["page_num"]]
            # Add extracted text to the page
            for line_index, line in enumerate(item["text"].split("\n")):
                y = top_margin + line_spacing * line_index
                page.insert_text(
                    (left_margin, y),
                    line,
                    fontsize=10,
                    fontname="Helvetica",
                    color=(0, 0, 0),
                )
        doc.save(output_pdf_path)
    print(f"PDF saved to: {output_pdf_path}")
def process_pdf(uploaded_pdf, output_pdf_path):
    """
    Process the uploaded PDF to extract text using Hugging Face Donut and overlay it.

    The upload is first copied to a temporary ``.pdf`` file because the
    extraction and overlay steps take a file path. That temporary file is
    always removed afterwards, whether or not processing succeeds.

    :param uploaded_pdf: Uploaded PDF file; any object with a ``read()``
        method returning the PDF bytes.
    :param output_pdf_path: Path to save the output PDF file.
    """
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_pdf_path = temp_pdf.name
    try:
        # Close the temporary file before the steps below reopen it by path.
        with temp_pdf:
            temp_pdf.write(uploaded_pdf.read())
        extracted_data = extract_text_with_donut(temp_pdf_path)
        overlay_text_with_fonts(temp_pdf_path, extracted_data, output_pdf_path)
    finally:
        # delete=False means nothing else will clean this file up.
        os.remove(temp_pdf_path)
# Streamlit App
def main():
    """
    Render the Streamlit page: upload a PDF, process it, offer the result.

    The converted PDF is written into a per-run temporary directory and read
    back into memory before that directory is removed. This keeps runs from
    sharing one output path in the working directory and leaves no file
    behind when processing fails.
    """
    st.title("Hugging Face OCR Text Extraction Tool")
    st.write("Upload a PDF to extract and overlay text using Hugging Face Donut.")
    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if not uploaded_file:
        return

    with tempfile.TemporaryDirectory() as work_dir:
        output_pdf_path = os.path.join(work_dir, "converted_output.pdf")
        with st.spinner("Processing your PDF..."):
            process_pdf(uploaded_file, output_pdf_path)
        # Read the result into memory; the directory is deleted on exit.
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()

    st.success("PDF processing complete!")
    # Provide a download button for the processed PDF
    st.download_button(
        label="Download Converted PDF",
        data=pdf_bytes,
        file_name="converted_output.pdf",
        mime="application/pdf",
    )
# Start the app only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()
|