|
import boto3 |
|
from PIL import Image |
|
import io |
|
import json |
|
import pikepdf |
|
|
|
from pdf2image import convert_from_bytes |
|
from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult |
|
|
|
def analyse_page_with_textract(pdf_page_bytes, json_file_path):
    '''
    Analyse a single page with AWS Textract and save the raw response as JSON.

    The page bytes are sent to Textract's analyze_document call with the
    "SIGNATURES" feature type. The full response is written to
    json_file_path and the list stored under the response's 'Blocks' key
    is returned.

    Args:
        pdf_page_bytes: Raw bytes of the page, passed to Textract as
            Document={'Bytes': ...}.
        json_file_path: Path of the file the full response is dumped to.

    Returns:
        The response's 'Blocks' value on success. If the Textract client
        cannot be created, the tuple ("", "", "") is returned instead.
    '''
    try:
        client = boto3.client('textract')
    except Exception as e:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate, and report why the client failed.
        print("Cannot connect to AWS Textract:", e)
        # NOTE(review): this failure value does not match the success
        # return shape (a list of blocks); kept as-is for existing callers.
        return "", "", ""

    print("Analysing page with AWS Textract")

    response = client.analyze_document(
        Document={'Bytes': pdf_page_bytes},
        FeatureTypes=["SIGNATURES"],
    )

    text_blocks = response['Blocks']

    # Keep a pretty-printed copy of the whole response on disk.
    with open(json_file_path, 'w') as json_file:
        json.dump(response, json_file, indent=4)

    print("Response has been written to output:", json_file_path)

    return text_blocks
|
|
|
|
|
def convert_pike_pdf_page_to_bytes(pdf, page_num):
    '''
    Copy one page of a pikepdf document into a new PDF and return its bytes.

    Args:
        pdf: Source document; must expose an indexable `pages` collection
            (presumably a pikepdf.Pdf - confirm against callers).
        page_num: Index into `pdf.pages` of the page to extract.

    Returns:
        The serialised bytes of a new PDF containing only the selected page.
    '''
    new_pdf = pikepdf.Pdf.new()

    # Honour the caller's page_num. It used to be overwritten with 0 here,
    # which meant the first page was returned regardless of the argument.
    new_pdf.pages.append(pdf.pages[page_num])

    # Serialise into an in-memory buffer; the context manager closes it.
    with io.BytesIO() as buffer:
        new_pdf.save(buffer)
        return buffer.getvalue()
|
|
|
|
|
def _bounding_box_to_pixels(text_block, page_width, page_height):
    '''Scale a block's Geometry.BoundingBox by the page size; return int (left, top, width, height).'''
    bbox = text_block["Geometry"]["BoundingBox"]
    return (
        int(bbox["Left"] * page_width),
        int(bbox["Top"] * page_height),
        int(bbox["Width"] * page_width),
        int(bbox["Height"] * page_height),
    )


def json_to_ocrresult(json_data, page_width, page_height):
    '''
    Convert the json response from textract to the OCRResult format used elsewhere in the code.

    Only blocks whose 'BlockType' is 'WORD', 'LINE' or 'SIGNATURE' are
    processed; every other block type is ignored.

    Args:
        json_data: Iterable of Textract block dicts (the 'Blocks' list of a
            response). Each must have a 'BlockType' key.
        page_width: Factor applied to the bounding box 'Left' and 'Width'.
        page_height: Factor applied to the bounding box 'Top' and 'Height'.

    Returns:
        A tuple (all_ocr_results, signature_or_handwriting_recogniser_results):
        an OCRResult for every processed block, and a
        CustomImageRecognizerResult for every signature block and every
        block whose 'TextType' is 'HANDWRITING'.
    '''
    all_ocr_results = []
    signature_or_handwriting_recogniser_results = []

    for text_block in json_data:
        block_type = text_block['BlockType']
        # Set only when this block is a signature or handwriting.
        recogniser_result = None

        if block_type in ('WORD', 'LINE'):
            text = text_block['Text']
            left_abs, top_abs, width_abs, height_abs = _bounding_box_to_pixels(
                text_block, page_width, page_height
            )
            all_ocr_results.append(
                OCRResult(text, left_abs, top_abs, width_abs, height_abs)
            )

            if text_block.get("TextType") == "HANDWRITING":
                entity_name = "HANDWRITING"
                recogniser_result = CustomImageRecognizerResult(
                    entity_type=entity_name,
                    text=text,
                    score=text_block['Confidence'],
                    start=0,
                    # NOTE(review): end is the length of the entity name,
                    # not of the text - kept unchanged; confirm intent.
                    end=len(entity_name),
                    left=left_abs,
                    top=top_abs,
                    width=width_abs,
                    height=height_abs,
                )
                print("Handwriting found:", recogniser_result)

        elif block_type == 'SIGNATURE':
            text = "SIGNATURE"
            left_abs, top_abs, width_abs, height_abs = _bounding_box_to_pixels(
                text_block, page_width, page_height
            )
            all_ocr_results.append(
                OCRResult(text, left_abs, top_abs, width_abs, height_abs)
            )

            entity_name = "Signature"
            recogniser_result = CustomImageRecognizerResult(
                entity_type=entity_name,
                text=text,
                # Use this block's own confidence. Previously the variable
                # was never assigned in this branch, so it was either
                # undefined or left over from an earlier word block.
                score=text_block['Confidence'],
                start=0,
                end=len(entity_name),
                left=left_abs,
                top=top_abs,
                width=width_abs,
                height=height_abs,
            )
            print("Signature found:", recogniser_result)

        if recogniser_result is not None:
            signature_or_handwriting_recogniser_results.append(recogniser_result)

    return all_ocr_results, signature_or_handwriting_recogniser_results