# Vector_Text / app.py
# DSatishchandra's picture
# Update app.py
# 2d1a10e verified
import os
import fitz # PyMuPDF for PDF handling
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import tempfile
import streamlit as st
def extract_text_with_donut(pdf_path):
    """
    Extract text from every page of a PDF using the Hugging Face Donut model.

    Each page is rendered to a 300 dpi RGB image and passed through the
    ``naver-clova-ix/donut-base`` processor and model.

    :param pdf_path: Path to the input PDF file.
    :return: List of dicts, one per page, with keys ``"page_num"`` (0-based
        page index) and ``"text"`` (decoded model output for that page).
    """
    # Load the model and processor once per call, outside the page loop.
    processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
    model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")

    extracted_text = []
    # The context manager closes the document even if rendering or inference
    # raises, so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            # Render without an alpha channel so the sample buffer is plain RGB.
            pix = page.get_pixmap(dpi=300, alpha=False)

            # Build the PIL image directly from the pixmap samples instead of
            # round-tripping through a fixed-name PNG in the working directory:
            # that file could collide between concurrent sessions and was left
            # behind whenever OCR raised before the cleanup line ran.
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Perform OCR using Donut
            inputs = processor(images=image, return_tensors="pt")
            outputs = model.generate(**inputs)
            page_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
            extracted_text.append({"page_num": page_num, "text": page_text})
    return extracted_text
def overlay_text_with_fonts(pdf_path, extracted_data, output_pdf_path):
    """
    Overlay extracted text onto the original PDF and save the result.

    For each entry, the text is split on newlines and written line by line
    in 10 pt black Helvetica, starting at (50, 50) and advancing 12 units
    per line.

    :param pdf_path: Path to the input PDF file.
    :param extracted_data: Iterable of dicts with keys ``"page_num"`` (page
        index into the document) and ``"text"`` (text to write on that page).
    :param output_pdf_path: Path to save the output PDF file.
    """
    # Use a context manager so the document (and its file handle) is closed
    # even if inserting text or saving fails; an open handle would otherwise
    # prevent the caller from deleting the input file on some platforms.
    with fitz.open(pdf_path) as doc:
        for item in extracted_data:
            page = doc[item["page_num"]]

            # Add extracted text to the page
            y = 50  # Starting position
            for line in item["text"].split("\n"):
                page.insert_text(
                    (50, y),
                    line,
                    fontsize=10,
                    fontname="Helvetica",
                    color=(0, 0, 0),
                )
                y += 12  # Line spacing
        doc.save(output_pdf_path)
    print(f"PDF saved to: {output_pdf_path}")
def process_pdf(uploaded_pdf, output_pdf_path):
    """
    Process the uploaded PDF to extract text using Hugging Face Donut and overlay it.

    The upload is copied to a temporary ``.pdf`` file, run through
    ``extract_text_with_donut`` and ``overlay_text_with_fonts``, and the
    temporary file is removed afterwards.

    :param uploaded_pdf: Uploaded PDF file; any object with a ``read()``
        method returning the PDF bytes.
    :param output_pdf_path: Path to save the output PDF file.
    """
    # delete=False because the file must outlive the handle: it is reopened
    # by path in the extraction and overlay steps.
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_pdf_path = temp_pdf.name
    try:
        # Close (and flush) the handle before anything else reads the path.
        with temp_pdf:
            temp_pdf.write(uploaded_pdf.read())

        extracted_data = extract_text_with_donut(temp_pdf_path)
        overlay_text_with_fonts(temp_pdf_path, extracted_data, output_pdf_path)
    finally:
        # Always remove the temporary copy, including when a step above raises.
        os.remove(temp_pdf_path)
# Streamlit App
def main():
    """
    Streamlit entry point: upload a PDF, process it, and offer the result for download.

    The processed PDF is written to a unique temporary file, read into memory,
    and the temporary file is deleted before the download button is rendered.
    """
    st.title("Hugging Face OCR Text Extraction Tool")
    st.write("Upload a PDF to extract and overlay text using Hugging Face Donut.")
    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if not uploaded_file:
        return

    # Use a unique temporary path rather than a fixed file name in the working
    # directory, so concurrent sessions cannot overwrite each other's output.
    fd, output_pdf_path = tempfile.mkstemp(suffix=".pdf")
    os.close(fd)  # Only the path is needed; the file is rewritten by path.
    try:
        with st.spinner("Processing your PDF..."):
            process_pdf(uploaded_file, output_pdf_path)
            st.success("PDF processing complete!")

        # Read the result into memory so the file can be deleted right away.
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()
    finally:
        # Remove the output file whether or not processing succeeded.
        os.remove(output_pdf_path)

    # Provide a download button for the processed PDF
    st.download_button(
        label="Download Converted PDF",
        data=pdf_bytes,
        file_name="converted_output.pdf",
        mime="application/pdf",
    )
# Script entry guard: call main() only when this file is run as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()