Spaces:

DSatishchandra
/

Vector_Text

Sleeping

App Files

xet

Community

DSatishchandra committed on Jan 8

Commit

2d1a10e

verified ·

1 Parent(s): 43cec96

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -38

app.py CHANGED Viewed

@@ -1,17 +1,22 @@
 import os
 import fitz  # PyMuPDF for PDF handling
-import pytesseract  # OCR for text extraction
 from PIL import Image
 import tempfile
 import streamlit as st
-def extract_text_with_tesseract(pdf_path):
     """
-    Extract text with bounding box positions using Tesseract OCR.
     :param pdf_path: Path to the input PDF file.
-    :return: List of dictionaries containing text and positions for each page.
     """
-    extracted_data = []
     doc = fitz.open(pdf_path)
     for page_num in range(len(doc)):
@@ -20,52 +25,40 @@ def extract_text_with_tesseract(pdf_path):
         image_path = f"temp_page_{page_num}.png"
         pix.save(image_path)
-        # Perform OCR using Tesseract
-        img = Image.open(image_path)
-        ocr_result = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
-        page_data = []
-        for i in range(len(ocr_result["text"])):
-            if ocr_result["text"][i].strip():  # Ignore empty text
-                page_data.append({
-                    "text": ocr_result["text"][i],
-                    "x0": ocr_result["left"][i],
-                    "y0": ocr_result["top"][i],
-                    "x1": ocr_result["left"][i] + ocr_result["width"][i],
-                    "y1": ocr_result["top"][i] + ocr_result["height"][i],
-                    "font_size": ocr_result["height"][i]
-                })
-        extracted_data.append(page_data)
         # Cleanup temporary image
         os.remove(image_path)
-    return extracted_data
 def overlay_text_with_fonts(pdf_path, extracted_data, output_pdf_path):
     """
-    Overlay extracted text onto the original PDF using PyMuPDF.
     :param pdf_path: Path to the input PDF file.
-    :param extracted_data: Extracted text and positions.
     :param output_pdf_path: Path to save the output PDF file.
     """
     doc = fitz.open(pdf_path)
-    default_font = "Helvetica"
-    for page_num, page_data in enumerate(extracted_data):
         page = doc[page_num]
-        for item in page_data:
-            page.insert_text(
-                (item["x0"], item["y0"]),
-                item["text"],
-                fontsize=item["font_size"] / 2,  # Adjust font size for better scaling
-                fontname=default_font,
-                color=(0, 0, 0)  # Black text
-            )
     doc.save(output_pdf_path)
     print(f"PDF saved to: {output_pdf_path}")
@@ -73,7 +66,7 @@ def overlay_text_with_fonts(pdf_path, extracted_data, output_pdf_path):
 def process_pdf(uploaded_pdf, output_pdf_path):
     """
-    Process the uploaded PDF to extract text using Tesseract and overlay it.
     :param uploaded_pdf: Uploaded PDF file.
     :param output_pdf_path: Path to save the output PDF file.
     """
@@ -81,7 +74,7 @@ def process_pdf(uploaded_pdf, output_pdf_path):
         temp_pdf.write(uploaded_pdf.read())
         temp_pdf_path = temp_pdf.name
-    extracted_data = extract_text_with_tesseract(temp_pdf_path)
     overlay_text_with_fonts(temp_pdf_path, extracted_data, output_pdf_path)
     os.remove(temp_pdf_path)
@@ -89,8 +82,8 @@ def process_pdf(uploaded_pdf, output_pdf_path):
 # Streamlit App
 def main():
-    st.title("PDF OCR and Text Conversion Tool")
-    st.write("Upload a PDF to extract and overlay text as editable layers.")
     uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
     if uploaded_file:

 import os
 import fitz  # PyMuPDF for PDF handling
+from transformers import DonutProcessor, VisionEncoderDecoderModel
 from PIL import Image
 import tempfile
 import streamlit as st
+def extract_text_with_donut(pdf_path):
     """
+    Extract text using Hugging Face Donut model for OCR.
     :param pdf_path: Path to the input PDF file.
+    :return: List of extracted text for each page.
     """
+    # Load the model and processor
+    processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
+    model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")
+    extracted_text = []
     doc = fitz.open(pdf_path)
     for page_num in range(len(doc)):
         image_path = f"temp_page_{page_num}.png"
         pix.save(image_path)
+        # Perform OCR using Donut
+        image = Image.open(image_path).convert("RGB")
+        inputs = processor(images=image, return_tensors="pt")
+        outputs = model.generate(**inputs)
+        page_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        extracted_text.append({"page_num": page_num, "text": page_text})
         # Cleanup temporary image
         os.remove(image_path)
+    return extracted_text
 def overlay_text_with_fonts(pdf_path, extracted_data, output_pdf_path):
     """
+    Overlay extracted text onto the original PDF.
     :param pdf_path: Path to the input PDF file.
+    :param extracted_data: Extracted text for each page.
     :param output_pdf_path: Path to save the output PDF file.
     """
     doc = fitz.open(pdf_path)
+    for item in extracted_data:
+        page_num = item["page_num"]
+        text = item["text"]
         page = doc[page_num]
+        # Add extracted text to the page
+        y = 50  # Starting position
+        for line in text.split("\n"):
+            page.insert_text((50, y), line, fontsize=10, fontname="Helvetica", color=(0, 0, 0))
+            y += 12  # Line spacing
     doc.save(output_pdf_path)
     print(f"PDF saved to: {output_pdf_path}")
 def process_pdf(uploaded_pdf, output_pdf_path):
     """
+    Process the uploaded PDF to extract text using Hugging Face Donut and overlay it.
     :param uploaded_pdf: Uploaded PDF file.
     :param output_pdf_path: Path to save the output PDF file.
     """
         temp_pdf.write(uploaded_pdf.read())
         temp_pdf_path = temp_pdf.name
+    extracted_data = extract_text_with_donut(temp_pdf_path)
     overlay_text_with_fonts(temp_pdf_path, extracted_data, output_pdf_path)
     os.remove(temp_pdf_path)
 # Streamlit App
 def main():
+    st.title("Hugging Face OCR Text Extraction Tool")
+    st.write("Upload a PDF to extract and overlay text using Hugging Face Donut.")
     uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
     if uploaded_file: