Sean Pedrick-Case committed
Commit 45c751d · unverified · 2 Parent(s): 9de60e6 c3a8cd7

Merge pull request #2 from seanpedrick-case/dev
app.py CHANGED
@@ -13,7 +13,7 @@ from gradio_image_annotation.image_annotator import AnnotatedImageData
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
from tools.file_redaction import choose_and_run_redactor
- from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
+ from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
from tools.data_anonymise import anonymise_data_files
from tools.auth import authenticate_user
@@ -41,6 +41,8 @@ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREET

language = 'en'

+
+
host_name = socket.gethostname()
feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
@@ -84,6 +86,8 @@ with app:
log_files_output_list_state = gr.State([])

review_file_state = gr.State(pd.DataFrame())
+
+ do_not_save_pdf_state = gr.State(False)

# Logging state
log_file_name = 'log.csv'
@@ -117,7 +121,7 @@ with app:


## Annotator zoom value
- annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
+ annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=100, precision=0, visible=False)
zoom_true_bool = gr.State(True)
zoom_false_bool = gr.State(False)

@@ -344,16 +348,18 @@ with app:
# Page controls at top
annotate_current_page.submit(
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
-
-
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])

annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
+
annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])

# Zoom in and out on annotator
annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
@@ -370,18 +376,23 @@ with app:
#annotation_button_get.click(get_boxes_json, annotator, json_boxes)
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)

+ do_not_save_pdf_state
+
# Page controls at bottom
annotate_current_page_bottom.submit(
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])

annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])

annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])

# Review side bar controls
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
@@ -420,13 +431,13 @@ with app:
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])

# If running on AWS, load in the default allow list file from S3
- if RUN_AWS_FUNCTIONS == "1":
- print("default_allow_list_output_folder_location:", default_allow_list_loc)
- if not os.path.exists(default_allow_list_loc):
- app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
- then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
- else:
- app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+ # if RUN_AWS_FUNCTIONS == "1":
+ # print("default_allow_list_output_folder_location:", default_allow_list_loc)
+ # if not os.path.exists(default_allow_list_loc):
+ # app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
+ # then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+ # else:
+ # app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])

# Log usernames and times of access to file (to know who is using the app when running on AWS)
access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
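The notable wiring change in app.py is the new do_not_save_pdf_state flag, threaded as an extra input into apply_redactions at the end of each page-navigation chain. A minimal, self-contained sketch of this Gradio pattern (the component names page_box, save_flag, and status are hypothetical; only gr.State and the .then() chaining mirror the commit):

import gradio as gr

def go_to_page(page):
    # Hypothetical navigation callback: clamp the page number
    return max(1, int(page))

def save_outputs(page, do_not_save):
    # Mirrors how do_not_save_pdf_state gates the expensive save step
    if do_not_save:
        return f"page {page}: save skipped"
    return f"page {page}: saved"

with gr.Blocks() as demo:
    save_flag = gr.State(True)  # analogous to do_not_save_pdf_state
    page_box = gr.Number(value=1, precision=0, label="Page")
    status = gr.Textbox(label="Status")

    # submit() fires the first callback; .then() chains the second and
    # receives the State value as an ordinary input
    page_box.submit(go_to_page, inputs=[page_box], outputs=[page_box]).\
        then(save_outputs, inputs=[page_box, save_flag], outputs=[status])

demo.launch()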
doc_redaction_amplify_app ADDED
@@ -0,0 +1 @@
+ Subproject commit 9585642e4d1f72fc49971789693d5584661084c8
tools/auth.py CHANGED
@@ -7,13 +7,13 @@ import base64
from tools.helper_functions import get_or_create_env_var

client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
- print(f'The value of AWS_CLIENT_ID is {client_id}')
+ #print(f'The value of AWS_CLIENT_ID is {client_id}')

client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
- print(f'The value of AWS_CLIENT_SECRET is {client_secret}')
+ #print(f'The value of AWS_CLIENT_SECRET is {client_secret}')

user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
- print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
+ #print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')

def calculate_secret_hash(client_id, client_secret, username):
message = username + client_id
@@ -46,24 +46,26 @@ def authenticate_user(username:str, password:str, user_pool_id:str=user_pool_id,

try:

- # response = client.initiate_auth(
- # AuthFlow='USER_PASSWORD_AUTH',
- # AuthParameters={
- # 'USERNAME': username,
- # 'PASSWORD': password,
- # },
- # ClientId=client_id
- # )
+ if client_secret == '':
+ response = client.initiate_auth(
+ AuthFlow='USER_PASSWORD_AUTH',
+ AuthParameters={
+ 'USERNAME': username,
+ 'PASSWORD': password,
+ },
+ ClientId=client_id
+ )

- response = client.initiate_auth(
- AuthFlow='USER_PASSWORD_AUTH',
- AuthParameters={
- 'USERNAME': username,
- 'PASSWORD': password,
- 'SECRET_HASH': secret_hash
- },
- ClientId=client_id
- )
+ else:
+ response = client.initiate_auth(
+ AuthFlow='USER_PASSWORD_AUTH',
+ AuthParameters={
+ 'USERNAME': username,
+ 'PASSWORD': password,
+ 'SECRET_HASH': secret_hash
+ },
+ ClientId=client_id
+ )

# If successful, you'll receive an AuthenticationResult in the response
if response.get('AuthenticationResult'):
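For reference, calculate_secret_hash (whose first lines appear in the hunk above) computes the standard AWS Cognito SECRET_HASH: an HMAC-SHA256 of username + client_id keyed with the app client secret, base64-encoded. A sketch assuming that standard construction, which is what the body presumably contains:

import base64
import hashlib
import hmac

def calculate_secret_hash(client_id: str, client_secret: str, username: str) -> str:
    # Standard Cognito SECRET_HASH: HMAC-SHA256 over (username + client_id),
    # keyed with the app client secret
    message = username + client_id
    digest = hmac.new(
        client_secret.encode("utf-8"),
        msg=message.encode("utf-8"),
        digestmod=hashlib.sha256,
    ).digest()
    # Cognito expects the base64-encoded digest as the SECRET_HASH parameter
    return base64.b64encode(digest).decode()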
tools/aws_textract.py CHANGED
@@ -145,8 +145,9 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):

# Extract text and bounding box for the line
line_text = text_block.get('Text', '')
-
words = []
+ current_line_handwriting_results = [] # Track handwriting results for this line
+
if 'Relationships' in text_block:
for relationship in text_block['Relationships']:
if relationship['Type'] == 'CHILD':
@@ -179,35 +180,56 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
if text_type == "HANDWRITING":
is_handwriting = True
entity_name = "HANDWRITING"
- word_end = len(entity_name)
-
- recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= word_text, score= confidence, start=0, end=word_end, left=word_left, top=word_top, width=word_width_abs, height=word_height_abs)
-
- if recogniser_result not in handwriting:
- handwriting.append(recogniser_result)
- #print("Handwriting found:", handwriting[-1])
+ word_end = len(word_text)
+
+ recogniser_result = CustomImageRecognizerResult(
+ entity_type=entity_name,
+ text=word_text,
+ score=confidence,
+ start=0,
+ end=word_end,
+ left=word_left,
+ top=word_top,
+ width=word_width_abs,
+ height=word_height_abs
+ )
+
+ # Add to handwriting collections immediately
+ handwriting.append(recogniser_result)
+ handwriting_recogniser_results.append(recogniser_result)
+ signature_or_handwriting_recogniser_results.append(recogniser_result)
+ current_line_handwriting_results.append(recogniser_result)

# If handwriting or signature, add to bounding box

elif (text_block['BlockType'] == 'SIGNATURE'):
line_text = "SIGNATURE"
-
is_signature = True
entity_name = "SIGNATURE"
- confidence = text_block['Confidence']
- word_end = len(entity_name)
-
- recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=line_left, top=line_top, width=width_abs, height=height_abs)
-
- if recogniser_result not in signatures:
- signatures.append(recogniser_result)
- #print("Signature found:", signatures[-1])
-
- words = []
- words.append({
- 'text': line_text,
- 'bounding_box': (line_left, line_top, line_right, line_bottom)
- })
+ confidence = text_block.get('Confidence', 0)
+ word_end = len(line_text)
+
+ recogniser_result = CustomImageRecognizerResult(
+ entity_type=entity_name,
+ text=line_text,
+ score=confidence,
+ start=0,
+ end=word_end,
+ left=line_left,
+ top=line_top,
+ width=width_abs,
+ height=height_abs
+ )
+
+ # Add to signature collections immediately
+ signatures.append(recogniser_result)
+ signature_recogniser_results.append(recogniser_result)
+ signature_or_handwriting_recogniser_results.append(recogniser_result)
+
+ words = [{
+ 'text': line_text,
+ 'bounding_box': (line_left, line_top, line_right, line_bottom)
+ }]

ocr_results_with_children["text_line_" + str(i)] = {
"line": i,
tools/custom_image_analyser_engine.py CHANGED
@@ -14,6 +14,7 @@ from tools.helper_functions import clean_unicode_text
from tools.presidio_analyzer_custom import recognizer_result_from_dict
from tools.load_spacy_model_custom_recognisers import custom_entities
#import string # Import string to get a list of common punctuation characters
+ import re # Add this import at the top of the file

@dataclass
class OCRResult:
@@ -493,11 +494,12 @@ class CustomImageAnalyzerEngine:

elif pii_identification_method == "AWS Comprehend":

- # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. Comprehend can't pick up Titles, Streetnames, and UKPostcodes specifically
+ # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. This is because Comprehend can't pick up Titles, Streetnames, and UKPostcodes, or a custom deny list specifically
text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]

spacy_analyzer_result = self.analyzer_engine.analyze(
text=line_level_ocr_result.text, **text_analyzer_kwargs)
+
analyzer_results_by_line[i].extend(spacy_analyzer_result)

if len(line_level_ocr_result.text) >= 3:
@@ -573,7 +575,7 @@ class CustomImageAnalyzerEngine:
for result in analyzer_result:
# Extract the relevant portion of text based on start and end
relevant_text = line_level_ocr_results[i].text[result.start:result.end]
-
+
# Find the corresponding entry in ocr_results_with_children
child_words = ocr_results_with_children_line_level['words']
@@ -583,13 +585,23 @@ class CustomImageAnalyzerEngine:
word_num = 0 # Initialize word count
total_width = 0 # Initialize total width

- for word_text in relevant_text.split(): # Iterate through each word in relevant_text
- #print("Looking for word_text:", word_text)
- for word in child_words:
- #if word['text'].strip(string.punctuation).strip() == word_text.strip(string.punctuation).strip(): # Check for exact match
- if word_text in word['text']:
+ split_relevant_text = relevant_text.split()
+
+ loop_child_words = child_words.copy()
+
+ for word_text in split_relevant_text: # Iterate through each word in relevant_text
+
+ quote_str = '"'
+ replace_str = '(?:"|“|”)'
+
+ word_regex = rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'
+
+ for word in loop_child_words:
+ # Check for regex as whole word
+
+ if re.search(word_regex, word['text']):
+ #if re.search(r'\b' + re.escape(word_text) + r'\b', word['text']):
found_word = word
- #print("found_word:", found_word)

if word_num == 0: # First word
left = found_word['bounding_box'][0]
@@ -598,6 +610,10 @@ class CustomImageAnalyzerEngine:
all_words += found_word['text'] + " " # Concatenate words
total_width = found_word['bounding_box'][2] - left # Add each word's width
word_num += 1
+
+ # Drop the first word of child_words
+ loop_child_words = loop_child_words[1:] # Skip the first word
+
break # Move to the next word in relevant_text

width = total_width + horizontal_buffer # Set width to total width of all matched words
@@ -621,9 +637,9 @@ class CustomImageAnalyzerEngine:
result_reset_pos.start = 0
result_reset_pos.end = len(relevant_text)

- #print("result_reset_pos:", result_reset_pos)
- #print("relevant_line_ocr_result:", relevant_line_ocr_result)
- #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
+ print("result_reset_pos:", result_reset_pos)
+ print("relevant_line_ocr_result:", relevant_line_ocr_result)
+ print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)

# Map the analyzer results to bounding boxes for this line
line_results = self.map_analyzer_results_to_bounding_boxes(
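The new word-matching logic above swaps a loose substring test for a whole-word regex that also tolerates typographic double quotes. A standalone rendition of that regex (assuming the quote alternation is meant to cover straight plus left/right curly quotes, as the repaired pattern above shows):

import re

def build_word_regex(word_text: str) -> str:
    # Whole-word match ((?<!\w) ... (?!\w)) with straight and curly
    # double quotes treated as interchangeable
    quote_str = '"'
    replace_str = '(?:"|“|”)'
    return rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'

print(bool(re.search(build_word_regex('"Smith"'), 'He said “Smith” today')))  # True
print(bool(re.search(build_word_regex('cat'), 'concatenate')))                # False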
tools/file_conversion.py CHANGED
@@ -1,5 +1,5 @@
from pdf2image import convert_from_path, pdfinfo_from_path
- from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, read_file
+ from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import os
@@ -48,7 +48,8 @@ def is_pdf(filename):
# %%
## Convert pdf to image if necessary

-
+ CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
+ print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')

def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
try:
@@ -261,7 +262,10 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
else:
out_colour = img_annotation_box["color"]
else:
- out_colour = (0,0,0)
+ if CUSTOM_BOX_COLOUR == "grey":
+ out_colour = (0.5, 0.5, 0.5)
+ else:
+ out_colour = (0,0,0)

shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
@@ -478,11 +482,12 @@ def prepare_image_or_pdf(
annotation["image"] = image_path

all_annotations_object.append(annotation)
-
- #print("all_annotations_object:", all_annotations_object)
-

elif is_pdf_or_image(file_path): # Alternatively, if it's an image
+ # Check if the file is an image type and the user selected text ocr option
+ if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
+ in_redact_method = tesseract_ocr_option
+
# Convert image to a pymupdf document
pymupdf_doc = pymupdf.open() # Create a new empty document

@@ -491,14 +496,16 @@ def prepare_image_or_pdf(
page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
page.insert_image(rect, filename=file_path) # Insert the image into the page

+ file_path_str = str(file_path)
+
+ image_file_paths = process_file(file_path_str, prepare_for_review)
+
+ print("Inserted image into PDF file")

- # Check if the file is an image type and the user selected text ocr option
- elif file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
- in_redact_method = tesseract_ocr_option

elif file_extension in ['.csv']:
review_file_csv = read_file(file)
- all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
+ all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths)
json_from_csv = True
print("Converted CSV review file to json")

@@ -618,12 +625,7 @@ def prepare_image_or_pdf(
out_message.append(out_time)
out_message_out = '\n'.join(out_message)

- #if prepare_for_review == False:
number_of_pages = len(image_file_paths)
- #else:
- # number_of_pages = len(all_annotations_object)
-
- #print("all_annotations_object at end:", all_annotations_object)

return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv

@@ -650,23 +652,6 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],

return out_message, out_file_paths

- # Example DataFrames
- # df1 = pd.DataFrame({
- # 'xmin': [10, 20, 30],
- # 'xmax': [15, 25, 35],
- # 'ymin': [40, 50, 60],
- # 'ymax': [45, 55, 65],
- # 'info1': ['A', 'B', 'C']
- # })
-
- # df2 = pd.DataFrame({
- # 'xmin': [12, 18, 32],
- # 'xmax': [14, 24, 34],
- # 'ymin': [42, 48, 62],
- # 'ymax': [44, 54, 66],
- # 'info2': ['X', 'Y', 'Z']
- # })
-
def join_values_within_threshold(df1, df2):
# Threshold for matching
threshold = 5
@@ -757,25 +742,38 @@ def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFram

return df

- def convert_pandas_df_to_review_json(df: pd.DataFrame) -> List[dict]:
+ def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
+ '''
+ Convert a review csv to a json file for use by the Gradio Annotation object
+ '''
# Keep only necessary columns
df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]

# Group the DataFrame by the 'image' column
- grouped = df.groupby('image')
+ grouped_csv_pages = df.groupby('page')

# Create a list to hold the JSON data
json_data = []

- # Iterate over each group
- for image_path, group in grouped:
- # Convert each group to a list of box dictionaries
- boxes = group.drop(columns=['image', 'page']).to_dict(orient='records')
-
+ for n, pdf_image_path in enumerate(image_paths):
+ reported_page_number = int(n + 1)
+
+ if reported_page_number in df["page"].values:
+
+ # Convert each relevant group to a list of box dictionaries
+ selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
+ annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
+
+ annotation = {
+ "image": pdf_image_path,
+ "boxes": annotation_boxes
+ }
+
+ else:
+ annotation = {}
+ annotation["image"] = pdf_image_path
+
# Append the structured data to the json_data list
- json_data.append({
- "image": image_path,
- "boxes": boxes
- })
+ json_data.append(annotation)

return json_data
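A usage sketch for the reworked convert_pandas_df_to_review_json, which now walks the page images and attaches boxes only to pages present in the CSV (the sample paths and values below are hypothetical; the import assumes the repo layout):

import pandas as pd
from tools.file_conversion import convert_pandas_df_to_review_json

# Two-page document; the review CSV only has a box on page 1
review_df = pd.DataFrame({
    "image": ["input/page_1.png"],
    "page": [1],
    "xmin": [10.0], "ymin": [20.0], "xmax": [110.0], "ymax": [60.0],
    "color": ["(0, 0, 0)"], "label": ["PERSON"],
})
image_paths = ["input/page_1.png", "input/page_2.png"]

annotations = convert_pandas_df_to_review_json(review_df, image_paths)

assert annotations[0]["boxes"][0]["label"] == "PERSON"  # page 1 keeps its box
assert "boxes" not in annotations[1]                    # page 2 is image-only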
tools/file_redaction.py CHANGED
@@ -269,7 +269,7 @@ def choose_and_run_redactor(file_paths:List[str],
print("Redacting file:", file_path_without_ext)

is_a_pdf = is_pdf(file_path) == True
- if is_a_pdf == False:
+ if is_a_pdf == False and in_redact_method == text_ocr_option:
# If user has not submitted a pdf, assume it's an image
print("File is not a pdf, assuming that image analysis needs to be used.")
in_redact_method = tesseract_ocr_option
@@ -708,8 +708,6 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
if image:
img_width, img_height = image.size

- print("annot:", annot)
-
x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)

img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1
@@ -745,16 +743,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
"boxes": all_image_annotation_boxes
}

-
-
-
page.apply_redactions(images=0, graphics=0)
page.clean_contents()

return page, out_annotation_boxes

-
-
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):

all_bboxes = []
@@ -832,7 +825,11 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
for next_box in group[1:]:
if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
new_text = merged_box.text + " " + next_box.text
- new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
+
+ if merged_box.entity_type != next_box.entity_type:
+ new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
+ else:
+ new_entity_type = merged_box.entity_type

new_left = min(merged_box.left, next_box.left)
new_top = min(merged_box.top, next_box.top)
@@ -973,9 +970,6 @@ def redact_image_pdf(file_path:str,
print("Page range:", str(page_min + 1), "to", str(page_max))
#print("Current_loop_page:", current_loop_page)

- if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
- elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
-
# If running Textract, check if file already exists. If it does, load in existing data
# Import results from json and convert
if analysis_type == textract_option:
@@ -984,7 +978,6 @@ def redact_image_pdf(file_path:str,
log_files_output_paths.append(json_file_path)

if not os.path.exists(json_file_path):
- no_textract_file = True
print("No existing Textract results file found.")
existing_data = {}
#text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
@@ -1042,12 +1035,8 @@ def redact_image_pdf(file_path:str,

# Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
if analysis_type == tesseract_ocr_option:
-
word_level_ocr_results = image_analyser.perform_ocr(image)
-
- # Combine OCR results
line_level_ocr_results, line_level_ocr_results_with_children = combine_ocr_results(word_level_ocr_results)
-

# Import results from json and convert
if analysis_type == textract_option:
@@ -1085,44 +1074,6 @@ def redact_image_pdf(file_path:str,
text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)


- # if not os.path.exists(json_file_path):
- # text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
- # log_files_output_paths.append(json_file_path)
- # request_metadata = request_metadata + "\n" + new_request_metadata
-
- # existing_data = {"pages":[text_blocks]}
-
-
- # else:
- # # Open the file and load the JSON data
- # print("Found existing Textract json results file.")
- # with open(json_file_path, 'r') as json_file:
- # existing_data = json.load(json_file)
-
- # # Check if the current reported_page_number exists in the loaded JSON
- # page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
-
- # if not page_exists: # If the page does not exist, analyze again
- # print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
- # text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
-
- # # Check if "pages" key exists, if not, initialize it as an empty list
- # if "pages" not in existing_data:
- # existing_data["pages"] = []
-
- # # Append the new page data
- # existing_data["pages"].append(text_blocks)
-
- # # Write the updated existing_data back to the JSON file
- # with open(json_file_path, 'w') as json_file:
- # json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
-
- # log_files_output_paths.append(json_file_path)
- # request_metadata = request_metadata + "\n" + new_request_metadata
- # else:
- # # If the page exists, retrieve the data
- # text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
-
line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)

# Step 2: Analyze text and identify PII
@@ -1194,7 +1145,8 @@ def redact_image_pdf(file_path:str,
else:
#print("redact_whole_page_list:", redact_whole_page_list)
if redact_whole_page_list:
- if current_loop_page in redact_whole_page_list: redact_whole_page = True
+ int_reported_page_number = int(reported_page_number)
+ if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
else: redact_whole_page = False
else: redact_whole_page = False

@@ -1345,8 +1297,14 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
character_objects_out.append(char) # Collect character objects

if isinstance(char, LTAnno):
+
+ added_text = char.get_text()
+
+ # Handle double quotes
+ added_text = added_text.replace('"', '\\"') # Escape double quotes
+
# Handle space separately by finalizing the word
- full_text += char.get_text() # Adds space or newline
+ full_text += added_text # Adds space or newline

if current_word: # Only finalize if there is a current word
word_bboxes.append((current_word, current_word_bbox))
@@ -1354,7 +1312,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word

# Check for line break (assuming a new line is indicated by a specific character)
- if '\n' in char.get_text():
+ if '\n' in added_text:
#print("char_anno:", char)
# Finalize the current line
if current_word:
@@ -1373,7 +1331,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup

# Concatenate text for LTChar

-
#full_text += char.get_text()
#added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
added_text = char.get_text()
@@ -1382,8 +1339,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
added_text = clean_unicode_text(added_text)
full_text += added_text # Adds space or newline, removing

-
-
# Update overall bounding box
x0, y0, x1, y1 = char.bbox
overall_bbox[0] = min(overall_bbox[0], x0) # x0
@@ -1480,7 +1435,10 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin
merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
merged_result.end = max(current_result.end, result.end) # Extend text range
try:
- merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
+ if current_result.entity_type != result.entity_type:
+ merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
+ else:
+ merged_result.entity_type = current_result.entity_type
except Exception as e:
print("Unable to combine result entity types:", e)
if current_text:
@@ -1877,8 +1835,9 @@ def redact_text_pdf(

# Make pymupdf page redactions
#print("redact_whole_page_list:", redact_whole_page_list)
- if redact_whole_page_list:
- if current_loop_page in redact_whole_page_list: redact_whole_page = True
+ if redact_whole_page_list:
+ int_reported_page_number = int(reported_page_number)
+ if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
else: redact_whole_page = False
else: redact_whole_page = False

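Both entity-merge fixes in this file apply the same rule: only concatenate entity types when they differ, so merging two boxes of the same type no longer produces labels like "PERSON - PERSON". The rule in isolation (function name hypothetical):

def merge_entity_types(current: str, nxt: str) -> str:
    # Concatenate only when the types differ; otherwise keep the single type
    if current != nxt:
        return current + " - " + nxt
    return current

assert merge_entity_types("PERSON", "PERSON") == "PERSON"
assert merge_entity_types("PERSON", "EMAIL_ADDRESS") == "PERSON - EMAIL_ADDRESS"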
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -24,14 +24,22 @@ except:
print("Successfully downloaded and imported spaCy model", model_name)

# #### Custom recognisers
- # Allow user to create their own recogniser
def custom_word_list_recogniser(custom_list:List[str]=[]):
- custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term)}" for term in custom_list) + '\\b'
- custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
+ # Create regex pattern, handling quotes carefully

- #print("custom_pattern:", custom_pattern)
+ quote_str = '"'
+ replace_str = '(?:"|“|”)'
+
+ custom_regex = '|'.join(
+ rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
+ for term in custom_list
+ )
+ print(custom_regex)
+
+ custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
+
custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
- global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
+ global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)

return custom_recogniser

tools/redaction_review.py CHANGED
@@ -49,18 +49,12 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool

return current_zoom_level, annotate_current_page

- def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=80):
'''
Update a gradio_image_annotation object with new annotation data
'''
recogniser_entities = []
recogniser_dataframe = pd.DataFrame()
- #recogniser_entities_drop = gr.Dropdown(value="ALL", allow_custom_value=True)
- #recogniser_dataframe_gr = gr.Dataframe(pd.DataFrame(data={"page":[""], "label":[""]}))
-
- #print("recogniser_dataframe_gr", recogniser_dataframe_gr)
- #print("recogniser_dataframe_gr shape", recogniser_dataframe_gr.shape)
- #print("recogniser_dataframe_gr.iloc[0,0]:", recogniser_dataframe_gr.iloc[0,0])

if recogniser_dataframe_gr.iloc[0,0] == "":
try:
@@ -228,7 +222,7 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_

return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out

- def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, progress=gr.Progress(track_tqdm=True)):
'''
Apply modified redactions to a pymupdf and export review files
'''
@@ -251,75 +245,76 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d

file_paths = [file_paths]

for file_path in file_paths:
- print("file_path:", file_path)
file_base = get_file_path_end(file_path)

file_extension = os.path.splitext(file_path)[1].lower()

- # If working with image docs
- if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
- image = Image.open(file_paths[-1])

- #image = pdf_doc

- draw = ImageDraw.Draw(image)

- for img_annotation_box in image_annotated['boxes']:
- coords = [img_annotation_box["xmin"],
- img_annotation_box["ymin"],
- img_annotation_box["xmax"],
- img_annotation_box["ymax"]]

- fill = img_annotation_box["color"]

- draw.rectangle(coords, fill=fill)

- image.save(output_folder + file_base + "_redacted.png")

- doc = [image]

- elif file_extension in '.csv':
- print("This is a csv")
- pdf_doc = []

- # If working with pdfs
- elif is_pdf(file_path) == True:
- pdf_doc = pymupdf.open(file_path)

- number_of_pages = pdf_doc.page_count

- print("Saving pages to file.")

- for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):

- #print("Saving page", str(i))
-
- image_loc = all_image_annotations[i]['image']
- #print("Image location:", image_loc)

- # Load in image object
- if isinstance(image_loc, np.ndarray):
- image = Image.fromarray(image_loc.astype('uint8'))
- #all_image_annotations[i]['image'] = image_loc.tolist()
- elif isinstance(image_loc, Image.Image):
- image = image_loc
- #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
- #image_loc.save(image_out_folder)
- #all_image_annotations[i]['image'] = image_out_folder
- elif isinstance(image_loc, str):
- image = Image.open(image_loc)

- pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
- pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
314
 
315
- else:
316
- print("File type not recognised.")
317
-
318
- #try:
319
- if pdf_doc:
320
- out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
321
- pdf_doc.save(out_pdf_file_path)
322
- output_files.append(out_pdf_file_path)
323
 
324
  try:
325
  print("Saving annotations to JSON")
@@ -331,7 +326,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
331
 
332
  print("Saving annotations to CSV review file")
333
 
334
- print("review_file_state:", review_file_state)
335
 
336
  # Convert json to csv and also save this
337
  review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
 
49
 
50
  return current_zoom_level, annotate_current_page
51
 
52
+ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
53
  '''
54
  Update a gradio_image_annotation object with new annotation data
55
  '''
56
  recogniser_entities = []
57
  recogniser_dataframe = pd.DataFrame()
 
 
59
  if recogniser_dataframe_gr.iloc[0,0] == "":
60
  try:
 
222
 
223
  return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
224
 
225
+ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
226
  '''
227
  Apply modified redactions to a pymupdf and export review files
228
  '''
 
245
  file_paths = [file_paths]
246
 
247
  for file_path in file_paths:
248
+ #print("file_path:", file_path)
249
  file_base = get_file_path_end(file_path)
250
 
251
  file_extension = os.path.splitext(file_path)[1].lower()
252
 
253
+ if save_pdf == True:
254
+ # If working with image docs
255
+ if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
256
+ image = Image.open(file_paths[-1])
257
 
258
+ #image = pdf_doc
259
 
260
+ draw = ImageDraw.Draw(image)
261
 
262
+ for img_annotation_box in image_annotated['boxes']:
263
+ coords = [img_annotation_box["xmin"],
264
+ img_annotation_box["ymin"],
265
+ img_annotation_box["xmax"],
266
+ img_annotation_box["ymax"]]
267
 
268
+ fill = img_annotation_box["color"]
269
 
270
+ draw.rectangle(coords, fill=fill)
271
 
272
+ image.save(output_folder + file_base + "_redacted.png")
273
 
274
+ doc = [image]
275
 
276
+ elif file_extension in '.csv':
277
+ print("This is a csv")
278
+ pdf_doc = []
279
 
280
+ # If working with pdfs
281
+ elif is_pdf(file_path) == True:
282
+ pdf_doc = pymupdf.open(file_path)
283
 
284
+ number_of_pages = pdf_doc.page_count
285
 
286
+ print("Saving pages to file.")
287
 
288
+ for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
289
 
290
+ #print("Saving page", str(i))
291
+
292
+ image_loc = all_image_annotations[i]['image']
293
+ #print("Image location:", image_loc)
294
 
295
+ # Load in image object
296
+ if isinstance(image_loc, np.ndarray):
297
+ image = Image.fromarray(image_loc.astype('uint8'))
298
+ #all_image_annotations[i]['image'] = image_loc.tolist()
299
+ elif isinstance(image_loc, Image.Image):
300
+ image = image_loc
301
+ #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
302
+ #image_loc.save(image_out_folder)
303
+ #all_image_annotations[i]['image'] = image_out_folder
304
+ elif isinstance(image_loc, str):
305
+ image = Image.open(image_loc)
306
 
307
+ pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
308
+ pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
309
 
310
+ else:
311
+ print("File type not recognised.")
312
+
313
+ #try:
314
+ if pdf_doc:
315
+ out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
316
+ pdf_doc.save(out_pdf_file_path)
317
+ output_files.append(out_pdf_file_path)
318
 
319
  try:
320
  print("Saving annotations to JSON")
 
326
 
327
  print("Saving annotations to CSV review file")
328
 
329
+ #print("review_file_state:", review_file_state)
330
 
331
  # Convert json to csv and also save this
332
  review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
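
The new save_pdf flag (default True, preserving the old behaviour) lets callers refresh the review JSON/CSV outputs without re-rendering the redacted PDF or image. A hedged usage sketch; the argument values are illustrative and the return values are omitted:

    # Sketch: regenerate review files only, skipping the slow PDF export.
    apply_redactions(
        image_annotated=annotator_output,
        file_paths=["example.pdf"],
        doc=pymupdf_doc,
        all_image_annotations=all_image_annotations,
        current_page=1,
        review_file_state=review_df,
        save_pdf=False,  # review outputs only; no redacted PDF/PNG written
    )

One caveat in the retained branching: file_extension in '.csv' is a substring containment test ('.c' would also pass), so an equality check such as file_extension == '.csv' would be stricter.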