import boto3 |
from PIL import Image |
import io |
import json |
import pikepdf |
from pdf2image import convert_from_bytes |
from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult |
def analyse_page_with_textract(pdf_page_bytes, json_file_path):
    '''
    Analyse a page with AWS Textract and save the raw response as JSON.

    Args:
        pdf_page_bytes: Document bytes, sent to Textract as
            ``Document={'Bytes': pdf_page_bytes}``.
        json_file_path: Path of the file the full Textract response is
            written to (overwritten if it already exists).

    Returns:
        The ``Blocks`` entry of the Textract response. If the Textract client
        cannot be created, a tuple of three empty strings is returned instead;
        that shape is kept so existing callers are not broken.
    '''
    try:
        client = boto3.client('textract')
    except Exception as error:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate, and report why the client failed.
        print("Cannot connect to AWS Textract", error)
        return "", "", ""

    print("Analysing page with AWS Textract")

    # Only the signature-detection feature is requested from Textract.
    response = client.analyze_document(
        Document={'Bytes': pdf_page_bytes},
        FeatureTypes=["SIGNATURES"],
    )
    text_blocks = response['Blocks']

    # Keep the complete response on disk, pretty-printed.
    with open(json_file_path, 'w') as json_file:
        json.dump(response, json_file, indent=4)

    print("Response has been written to output:", json_file_path)

    return text_blocks
def convert_pike_pdf_page_to_bytes(pdf, page_num):
    '''
    Extract one page of a pikepdf document as the bytes of a standalone PDF.

    Args:
        pdf: Source document; the page is read as ``pdf.pages[page_num]``.
        page_num: Index into ``pdf.pages`` of the page to extract
            (0 is the first page).

    Returns:
        The bytes of a new PDF that contains only the requested page.
    '''
    single_page_pdf = pikepdf.Pdf.new()

    # Honour the caller's page index instead of always taking page 0.
    single_page_pdf.pages.append(pdf.pages[page_num])

    # The context manager closes the buffer even if saving fails.
    with io.BytesIO() as buffer:
        single_page_pdf.save(buffer)
        return buffer.getvalue()
def _bounding_box_to_absolute(bounding_box, page_width, page_height):
    '''
    Scale a Textract bounding box by the page size and truncate to integers.

    Returns a ``(left, top, width, height)`` tuple.
    '''
    return (
        int(bounding_box["Left"] * page_width),
        int(bounding_box["Top"] * page_height),
        int(bounding_box["Width"] * page_width),
        int(bounding_box["Height"] * page_height),
    )


def json_to_ocrresult(json_data, page_width, page_height):
    '''
    Convert the json response from textract to the OCRResult format used elsewhere in the code.

    Args:
        json_data: Iterable of Textract block dicts. Each block must have a
            'BlockType'; WORD, LINE and SIGNATURE blocks must also have
            'Geometry' -> 'BoundingBox', and WORD/LINE blocks a 'Text'.
        page_width: Factor applied to the box's 'Left' and 'Width' values.
        page_height: Factor applied to the box's 'Top' and 'Height' values.

    Returns:
        A tuple ``(all_ocr_results, signature_or_handwriting_recogniser_results)``:
        one OCRResult per WORD, LINE or SIGNATURE block, and one
        CustomImageRecognizerResult per handwritten text block or signature.
        Blocks of any other type are skipped.
    '''
    all_ocr_results = []
    signature_or_handwriting_recogniser_results = []

    for text_block in json_data:
        block_type = text_block['BlockType']

        if block_type in ('WORD', 'LINE'):
            text = text_block['Text']
            # Only text that Textract marks as handwritten gets a recogniser result.
            is_handwriting = text_block.get("TextType") == "HANDWRITING"
            entity_name = "HANDWRITING" if is_handwriting else None
            found_message = "Handwriting found:"
        elif block_type == 'SIGNATURE':
            # Signature blocks carry no text, so a placeholder is used.
            text = "SIGNATURE"
            entity_name = "Signature"
            found_message = "Signature found:"
        else:
            continue

        left_abs, top_abs, width_abs, height_abs = _bounding_box_to_absolute(
            text_block["Geometry"]["BoundingBox"], page_width, page_height
        )
        all_ocr_results.append(OCRResult(text, left_abs, top_abs, width_abs, height_abs))

        if entity_name is None:
            continue

        recogniser_result = CustomImageRecognizerResult(
            entity_type=entity_name,
            text=text,
            # Take the confidence from this block itself, so a signature never
            # picks up the score of a previously processed text block.
            score=text_block['Confidence'],
            start=0,
            # The end offset is the length of the entity name, not of the text.
            end=len(entity_name),
            left=left_abs,
            top=top_abs,
            width=width_abs,
            height=height_abs,
        )
        print(found_message, recogniser_result)
        signature_or_handwriting_recogniser_results.append(recogniser_result)

    return all_ocr_results, signature_or_handwriting_recogniser_results