# Spaces: Running
# File size: 4,461 Bytes
# 5374321 6bc31ab 5374321 8290c12
# (line-number gutter from the page view, 1-130, omitted)
import os
import fitz # PyMuPDF for PDF handling
import easyocr # OCR for text extraction
import tempfile
import streamlit as st
# Disable GStreamer to prevent OpenCV-related errors.
# Sets OPENCV_VIDEOIO_PRIORITY_GSTREAMER to "0" in this process's environment.
# NOTE(review): this runs after `import easyocr` above; if OpenCV reads the
# variable at import time the setting would come too late -- confirm, and move
# it above the imports if so.
os.environ["OPENCV_VIDEOIO_PRIORITY_GSTREAMER"] = "0"
def extract_text_with_ocr(pdf_path, *, dpi=300, languages=("en",)):
    """
    Extract text with bounding box positions from every page of a PDF using OCR.

    Each page is rendered to an image, passed to EasyOCR, and the resulting
    pixel coordinates are converted to PDF points so that the positions can be
    used directly when drawing on the page.

    :param pdf_path: Path to the input PDF file.
    :param dpi: Resolution used to render each page for OCR (default 300).
    :param languages: EasyOCR language codes, e.g. ("en",) or ("ar", "en").
    :return: List with one entry per page; each entry is a list of dictionaries
        with the keys "text", "x0", "y0", "font_size" and "confidence".
        "x0"/"y0" are the left edge and bottom edge of the detected box and
        "font_size" is the box height, all expressed in PDF points.
    """
    # Building a Reader loads the OCR models, so do it once rather than per page.
    reader = easyocr.Reader(list(languages))

    extracted_data = []
    # The context manager closes the document even if OCR fails part-way.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)  # Render the page to an image

            # OCR reports pixel coordinates of the rendered image, while the
            # page is measured in points; derive the conversion from the
            # actual sizes so it stays correct for any dpi.
            scale_x = page.rect.width / pix.width
            scale_y = page.rect.height / pix.height

            # Feed the PNG bytes straight to EasyOCR: no temporary file to
            # collide with other sessions or to leak when an error occurs.
            results = reader.readtext(pix.tobytes("png"), detail=1)

            page_data = []
            for bbox, text, confidence in results:
                # bbox corners are ordered top-left, top-right, bottom-right,
                # bottom-left; corners 0 and 2 span the box.
                (left, top), (_right, bottom) = bbox[0], bbox[2]
                page_data.append({
                    "text": text,
                    "x0": float(left) * scale_x,
                    # Bottom edge of the box approximates the text baseline.
                    "y0": float(bottom) * scale_y,
                    # Box height as an approximate font size.
                    "font_size": float(bottom - top) * scale_y,
                    "confidence": float(confidence),
                })
            extracted_data.append(page_data)

    return extracted_data
def overlay_text_with_fonts(pdf_path, extracted_data, output_pdf_path):
    """
    Overlay extracted text onto the original PDF and save the result.

    Only items whose confidence is above 0.8 are drawn. Each item is written
    in black Helvetica at the position and size stored in the item.

    :param pdf_path: Path to the input PDF file.
    :param extracted_data: List (one entry per page) of lists of dictionaries
        with the keys "text", "x0", "y0", "font_size" and "confidence".
        Positions and sizes are passed unchanged to the page's text insertion
        call, so they are expected in page (PDF point) coordinates.
    :param output_pdf_path: Path to save the output PDF file.
    """
    # Define default font settings
    default_font = "Helvetica"
    min_confidence = 0.8

    # The context manager releases the document even if saving fails.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)
        for page_num, page_data in enumerate(extracted_data):
            if page_num >= page_count:
                # More OCR page entries than pages in the document; nothing
                # left to draw on.
                break
            page = doc[page_num]
            for item in page_data:
                if item["confidence"] <= min_confidence:
                    continue  # Only overlay high-confidence text
                if item["font_size"] <= 0 or not item["text"].strip():
                    # A degenerate OCR box would yield an unusable font size.
                    continue
                page.insert_text(
                    (item["x0"], item["y0"]),
                    item["text"],
                    fontsize=item["font_size"],
                    fontname=default_font,
                    color=(0, 0, 0),  # Black text
                    render_mode=0,  # Filled text, not outlined
                )
        doc.save(output_pdf_path)
    print(f"PDF saved to: {output_pdf_path}")
def process_pdf(uploaded_pdf, output_pdf_path):
    """
    Process the uploaded PDF: extract text using OCR and overlay it as text.

    The upload is copied to a temporary ".pdf" file, run through
    extract_text_with_ocr and overlay_text_with_fonts, and the temporary file
    is removed afterwards whether or not processing succeeded.

    :param uploaded_pdf: The uploaded PDF file (object with a read() method
        returning the PDF bytes).
    :param output_pdf_path: Path to save the output PDF file.
    """
    # delete=False so the file can be reopened by path after it is closed.
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_pdf_path = temp_pdf.name
    try:
        with temp_pdf:
            temp_pdf.write(uploaded_pdf.read())

        # Step 1: Extract text using OCR
        extracted_data = extract_text_with_ocr(temp_pdf_path)

        # Step 2: Overlay extracted text onto the original PDF
        overlay_text_with_fonts(temp_pdf_path, extracted_data, output_pdf_path)
    finally:
        # Always clean up the temporary copy, even when OCR or saving raises.
        try:
            os.remove(temp_pdf_path)
        except FileNotFoundError:
            pass
# Streamlit App
def main():
    """
    Run the Streamlit page: upload a PDF, process it, and offer the download.

    The converted PDF is written inside a per-run temporary directory and read
    back into memory, so concurrent sessions never share an output file and
    nothing is left on disk if processing fails.
    """
    st.title("PDF Text Conversion Tool")
    st.write("Upload a PDF to convert vector text into regular, editable text.")

    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if not uploaded_file:
        return

    # A private directory per run replaces the fixed "converted_output.pdf"
    # in the working directory; it is removed when the block exits.
    with tempfile.TemporaryDirectory() as work_dir:
        output_pdf_path = os.path.join(work_dir, "converted_output.pdf")
        with st.spinner("Processing your PDF..."):
            process_pdf(uploaded_file, output_pdf_path)
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()

    st.success("PDF processing complete!")

    # Provide a download button for the processed PDF
    st.download_button(
        label="Download Converted PDF",
        data=pdf_bytes,
        file_name="converted_output.pdf",
        mime="application/pdf",
    )
# Script entry point: start the Streamlit app when the file is run directly.
if __name__ == "__main__":
    main()