import io
import json

import boto3
import pikepdf
from PIL import Image

# Example: converting this single page to an image
from pdf2image import convert_from_bytes

from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
def analyse_page_with_textract(pdf_page_bytes, json_file_path):
    '''
    Analyse a single page with AWS Textract and write the raw JSON response to file.
    '''
    try:
        client = boto3.client('textract')
    except Exception as e:
        print("Cannot connect to AWS Textract:", e)
        return []

    print("Analysing page with AWS Textract")

    # Alternative approach kept for reference: convert a PIL image to bytes using an
    # in-memory buffer and call detect_document_text instead of analyze_document.
    #image_buffer = io.BytesIO()
    #image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
    #image_bytes = image_buffer.getvalue()
    #response = client.detect_document_text(Document={'Bytes': image_bytes})

    # Analyse the page bytes directly, asking Textract to also detect signatures
    response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
    text_blocks = response['Blocks']

    # Write the full response to a JSON file
    with open(json_file_path, 'w') as json_file:
        json.dump(response, json_file, indent=4)  # indent=4 pretty-prints the JSON file

    print("Response has been written to output:", json_file_path)

    return text_blocks
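
# Example usage (illustrative sketch, not part of the original module): read a
# single-page PDF from disk and send its bytes to Textract. The file names
# "example_page.pdf" and "textract_output.json" are placeholder assumptions.
#
# with open("example_page.pdf", "rb") as f:
#     page_bytes = f.read()
# blocks = analyse_page_with_textract(page_bytes, "textract_output.json")
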
def convert_pike_pdf_page_to_bytes(pdf, page_num):
    '''
    Extract a single page (0-based index) from a pikepdf.Pdf and return it as PDF bytes.
    '''
    # Create a new empty PDF
    new_pdf = pikepdf.Pdf.new()

    # Extract the requested page and add it to the new PDF
    new_pdf.pages.append(pdf.pages[page_num])

    # Save the new PDF to an in-memory bytes buffer
    buffer = io.BytesIO()
    new_pdf.save(buffer)

    # Get the PDF bytes; these can be converted to an image or processed further
    pdf_bytes = buffer.getvalue()
    buffer.close()

    #images = convert_from_bytes(pdf_bytes)
    #image = images[0]

    return pdf_bytes
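
# Example usage (illustrative sketch, not part of the original module): open a
# PDF with pikepdf, extract page 0 as bytes, then render it to an image with
# pdf2image. The path "input.pdf" is a placeholder assumption.
#
# with pikepdf.open("input.pdf") as pdf:
#     page_bytes = convert_pike_pdf_page_to_bytes(pdf, 0)
# page_image = convert_from_bytes(page_bytes)[0]
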
def json_to_ocrresult(json_data, page_width, page_height):
    '''
    Convert the JSON response from Textract to the OCRResult format used elsewhere in the code.
    '''
    all_ocr_results = []
    signature_or_handwriting_recogniser_results = []
    signatures = []
    handwriting = []

    for text_block in json_data:

        is_signature = False
        is_handwriting = False

        if (text_block['BlockType'] == 'WORD') or (text_block['BlockType'] == 'LINE'):
            text = text_block['Text']

            # Extract BoundingBox details
            bbox = text_block["Geometry"]["BoundingBox"]
            left = bbox["Left"]
            top = bbox["Top"]
            width = bbox["Width"]
            height = bbox["Height"]

            # Convert proportional coordinates to absolute coordinates
            left_abs = int(left * page_width)
            top_abs = int(top * page_height)
            width_abs = int(width * page_width)
            height_abs = int(height * page_height)

            # Create OCRResult with absolute coordinates
            ocr_result = OCRResult(text, left_abs, top_abs, width_abs, height_abs)

            # If the block is handwriting, also record it with its bounding box
            confidence = text_block['Confidence']

            if 'TextType' in text_block:
                text_type = text_block["TextType"]

                if text_type == "HANDWRITING":
                    is_handwriting = True
                    entity_name = "HANDWRITING"
                    word_end = len(entity_name)
                    recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text=text, score=confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
                    handwriting.append(recogniser_result)
                    print("Handwriting found:", handwriting[-1])

            all_ocr_results.append(ocr_result)

        elif text_block['BlockType'] == 'SIGNATURE':
            text = "SIGNATURE"
            confidence = text_block['Confidence']

            # Extract BoundingBox details
            bbox = text_block["Geometry"]["BoundingBox"]
            left = bbox["Left"]
            top = bbox["Top"]
            width = bbox["Width"]
            height = bbox["Height"]

            # Convert proportional coordinates to absolute coordinates
            left_abs = int(left * page_width)
            top_abs = int(top * page_height)
            width_abs = int(width * page_width)
            height_abs = int(height * page_height)

            # Create OCRResult with absolute coordinates
            ocr_result = OCRResult(text, left_abs, top_abs, width_abs, height_abs)

            is_signature = True
            entity_name = "Signature"
            word_end = len(entity_name)
            recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text=text, score=confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
            signatures.append(recogniser_result)
            print("Signature found:", signatures[-1])

            all_ocr_results.append(ocr_result)

        is_signature_or_handwriting = is_signature or is_handwriting

        # If the block is a signature or handwriting, it will overwrite the default behaviour of the PII analyser
        if is_signature_or_handwriting:
            signature_or_handwriting_recogniser_results.append(recogniser_result)

    return all_ocr_results, signature_or_handwriting_recogniser_results
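
# End-to-end sketch (illustrative only, assuming AWS credentials are configured
# and "example_page.pdf" is a single-page PDF small enough for the synchronous
# Textract API). Page dimensions are taken from the rendered image so that
# Textract's proportional coordinates can be converted to pixel values.
#
# if __name__ == "__main__":
#     with open("example_page.pdf", "rb") as f:
#         page_bytes = f.read()
#     page_image = convert_from_bytes(page_bytes)[0]
#     page_width, page_height = page_image.size
#     blocks = analyse_page_with_textract(page_bytes, "textract_output.json")
#     ocr_results, recogniser_results = json_to_ocrresult(blocks, page_width, page_height)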