seanpedrickcase committed
Commit cb349ad · Parent: 3518b67

Ensured that text OCR outputs have no trailing line breaks. Multi-line custom text searches are now possible. Files for review are now sent from the redact button. Fixed image redaction (the review step is not yet fixed). User pool details can now be read from request headers. Updated Gradio.

.dockerignore CHANGED
@@ -16,4 +16,5 @@ build/*
 dist/*
 build_deps/*
 logs/*
-doc_redaction_amplify_app/*
+doc_redaction_amplify_app/*
+user_guide/*
.gitignore CHANGED
@@ -16,4 +16,5 @@ build/*
 dist/*
 build_deps/*
 logs/*
-doc_redaction_amplify_app/*
+doc_redaction_amplify_app/*
+user_guide/*
app.py CHANGED
@@ -66,26 +66,27 @@ with app:
 
 pdf_doc_state = gr.State([])
 all_image_annotations_state = gr.State([])
-all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
-all_decision_process_table_state = gr.State(pd.DataFrame())
+
+
+all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
+all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
+review_file_state = gr.Dataframe(value=pd.DataFrame(), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
 
 session_hash_state = gr.State()
 s3_output_folder_state = gr.State()
 
 first_loop_state = gr.State(True)
 second_loop_state = gr.State(False)
+do_not_save_pdf_state = gr.State(False)
 
-prepared_pdf_state = gr.State([])
-images_pdf_state = gr.State([]) # List of pdf pages converted to PIL images
+prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
+images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
+
 
-output_image_files_state = gr.State([])
-output_file_list_state = gr.State([])
-text_output_file_list_state = gr.State([])
-log_files_output_list_state = gr.State([])
-
-review_file_state = gr.State(pd.DataFrame())
-
-do_not_save_pdf_state = gr.State(False)
+output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False) #gr.State([])
+output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
+text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
+log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False) #gr.State([])
 
+
 # Logging state
 log_file_name = 'log.csv'
@@ -95,7 +96,7 @@ with app:
 access_logs_state = gr.State(access_logs_folder + log_file_name)
 access_s3_logs_loc_state = gr.State(access_logs_folder)
 usage_logs_state = gr.State(usage_logs_folder + log_file_name)
-usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+usage_s3_logs_loc_state = gr.State(usage_logs_folder)
 
 # Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
 session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
@@ -115,8 +116,7 @@ with app:
 estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
 annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
 
-s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
-
+s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
 ## Annotator zoom value
 annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
@@ -129,16 +129,16 @@ with app:
 ## Settings page variables
 default_allow_list_file_name = "default_allow_list.csv"
 default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
-in_allow_list_state = gr.State(pd.DataFrame())
+in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_allow_list_df", visible=False, type="pandas")
 
 default_deny_list_file_name = "default_deny_list.csv"
 default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
-in_deny_list_state = gr.State([])
+in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_deny_list_df", visible=False, type="pandas")
 in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
 
 fully_redacted_list_file_name = "default_fully_redacted_list.csv"
 fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
-in_fully_redacted_list_state = gr.State([])
+in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_full_redacted_list_df", visible=False, type="pandas")
 in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
 
 # S3 settings for default allow list load
@@ -209,6 +209,8 @@ with app:
 with gr.Row():
     annotate_zoom_in = gr.Button("Zoom in")
     annotate_zoom_out = gr.Button("Zoom out")
+with gr.Row():
+    annotation_button_apply = gr.Button("Apply revised redactions to pdf", variant="secondary")
 with gr.Row():
     clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
 
@@ -237,18 +239,16 @@ with app:
 )
 
 with gr.Row():
-    annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
+    annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
+    annotate_current_page_bottom = gr.Number(value=1, label="Page (press enter to change)", precision=0, interactive=True, scale = 2)
+    annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
+    annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
 
 #with gr.Column(scale=1):
 with gr.Row():
     recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
     recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
-
-with gr.Row():
-    annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
-    annotate_current_page_bottom = gr.Number(value=1, label="Page (press enter to change)", precision=0, interactive=True, scale = 2)
-    annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
-    annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
+
 
 # TEXT / TABULAR DATA TAB
 with gr.Tab(label="Open text or Excel/csv files"):
@@ -322,12 +322,12 @@ with app:
 document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
 then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
 then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
-outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
+outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # If the app has completed a batch of pages, it will run this until the end of all pages in the document
 current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
-outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
+outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # If a file has been completed, the function will continue onto the next document
@@ -394,7 +394,8 @@ with app:
 
 recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
 then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
 
 
 ###
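Note: the recurring change in app.py swaps gr.State holders for invisible gr.Dataframe / gr.Dropdown components, keeping the old constructors in trailing comments. A plausible motivation is that real components, unlike gr.State, can be wired into API-named events such as api_name="redact_doc" and restored alongside other outputs. A minimal sketch of the pattern, with illustrative names that are not from this commit:

import gradio as gr
import pandas as pd

def fill_results() -> pd.DataFrame:
    # The returned value lands in the hidden component instead of a gr.State
    return pd.DataFrame({"page": [1], "label": ["NAME"]})

with gr.Blocks() as demo:
    # Hidden holder: behaves like session state, but is a real component,
    # so it can be an output of an API-named event
    results_df = gr.Dataframe(value=pd.DataFrame(), visible=False, type="pandas")
    run_btn = gr.Button("Run")
    run_btn.click(fn=fill_results, outputs=[results_df], api_name="run")

if __name__ == "__main__":
    demo.launch()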
requirements.txt CHANGED
@@ -10,7 +10,7 @@ pandas==2.2.3
 spacy==3.8.3
 #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.10.0
+gradio==5.12.0
 boto3==1.35.83
 pyarrow==18.1.0
 openpyxl==3.1.2
tools/auth.py CHANGED
@@ -1,10 +1,21 @@
 
+import os
 import boto3
 import gradio as gr
 import hmac
 import hashlib
 import base64
-from tools.helper_functions import get_or_create_env_var
+
+def get_or_create_env_var(var_name, default_value):
+    # Get the environment variable if it exists
+    value = os.environ.get(var_name)
+
+    # If it doesn't exist, set it to the default value
+    if value is None:
+        os.environ[var_name] = default_value
+        value = default_value
+
+    return value
 
 client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
 #print(f'The value of AWS_CLIENT_ID is {client_id}')
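Note: auth.py now defines get_or_create_env_var locally rather than importing it from tools.helper_functions, perhaps to keep the auth module free of app-level imports; the commit does not say. The helper reads an environment variable and writes the default back into os.environ when it is unset, so every later reader sees the same value. A short usage sketch, with an assumed variable name:

import os

def get_or_create_env_var(var_name, default_value):
    # Get the environment variable if it exists
    value = os.environ.get(var_name)
    # If it doesn't exist, set it to the default so later readers agree
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# Illustrative only: AWS_USER_POOL_ID is an assumed name, not shown in this commit
user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
print(f'AWS_USER_POOL_ID set: {bool(user_pool_id)}')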
tools/custom_image_analyser_engine.py CHANGED
@@ -1,20 +1,19 @@
 import pytesseract
 import numpy as np
 from presidio_analyzer import AnalyzerEngine, RecognizerResult
-#from presidio_image_redactor import ImagePreprocessor
 from typing import List, Dict, Optional, Union, Tuple
 from dataclasses import dataclass
 import time
 import cv2
+import copy
+from copy import deepcopy
+from pdfminer.layout import LTChar
 import PIL
-from PIL import ImageDraw, ImageFont, Image
+from PIL import Image
 from typing import Optional, Tuple, Union
-from copy import deepcopy
 from tools.helper_functions import clean_unicode_text
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
 from tools.load_spacy_model_custom_recognisers import custom_entities
-#import string # Import string to get a list of common punctuation characters
-import re # Add this import at the top of the file
 
 @dataclass
 class OCRResult:
@@ -174,7 +173,6 @@ class BilateralFilter(ImagePreprocessor):
 
         return Image.fromarray(filtered_image), metadata
 
-
 class SegmentedAdaptiveThreshold(ImagePreprocessor):
     """SegmentedAdaptiveThreshold class.
 
@@ -252,9 +250,6 @@ class SegmentedAdaptiveThreshold(ImagePreprocessor):
         metadata = {"C": c, "background_color": background_color, "contrast": contrast}
         return Image.fromarray(adaptive_threshold_image), metadata
 
-
-
-
 class ImageRescaling(ImagePreprocessor):
     """ImageRescaling class. Rescales images based on their size."""
 
@@ -302,7 +297,6 @@ class ImageRescaling(ImagePreprocessor):
         metadata = {"scale_factor": scale_factor}
         return Image.fromarray(rescaled_image), metadata
 
-
 class ContrastSegmentedImageEnhancer(ImagePreprocessor):
     """Class containing all logic to perform contrastive segmentation.
 
@@ -409,6 +403,464 @@ def bounding_boxes_overlap(box1, box2):
     """Check if two bounding boxes overlap."""
     return (box1[0] < box2[2] and box2[0] < box1[2] and
             box1[1] < box2[3] and box2[1] < box1[3])
+
+def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
+    for entity in page_analyser_result:
+        entity_start = entity.start
+        entity_end = entity.end
+
+        # Track if the entity has been added to any line
+        added_to_line = False
+
+        for batch_start, line_idx, original_line, chars in page_text_mapping:
+            batch_end = batch_start + len(original_line.text)
+
+            # Check if the entity overlaps with the current line
+            if batch_start < entity_end and batch_end > entity_start: # Overlap condition
+                relative_start = max(0, entity_start - batch_start) # Adjust start relative to the line
+                relative_end = min(entity_end - batch_start, len(original_line.text)) # Adjust end relative to the line
+
+                # Create a new adjusted entity
+                adjusted_entity = copy.deepcopy(entity)
+                adjusted_entity.start = relative_start
+                adjusted_entity.end = relative_end
+
+                # Check if this line already has an entry
+                existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
+
+                if existing_entry is None:
+                    all_text_line_results.append((line_idx, [adjusted_entity]))
+                else:
+                    existing_entry.append(adjusted_entity) # Append to the existing list of entities
+
+                added_to_line = True
+
+        # If the entity spans multiple lines, you may want to handle that here
+        if not added_to_line:
+            # Handle cases where the entity does not fit in any line (optional)
+            print(f"Entity '{entity}' does not fit in any line.")
+
+    return all_text_line_results
+
+def map_back_comprehend_entity_results(response, current_batch_mapping, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
+    if not response or "Entities" not in response:
+        return all_text_line_results
+
+    for entity in response["Entities"]:
+        if entity.get("Type") not in chosen_redact_comprehend_entities:
+            continue
+
+        entity_start = entity["BeginOffset"]
+        entity_end = entity["EndOffset"]
+
+        # Track if the entity has been added to any line
+        added_to_line = False
+
+        # Find the correct line and offset within that line
+        for batch_start, line_idx, original_line, chars, line_offset in current_batch_mapping:
+            batch_end = batch_start + len(original_line.text[line_offset:])
+
+            # Check if the entity overlaps with the current line
+            if batch_start < entity_end and batch_end > entity_start: # Overlap condition
+                # Calculate the absolute position within the line
+                relative_start = max(0, entity_start - batch_start + line_offset)
+                relative_end = min(entity_end - batch_start + line_offset, len(original_line.text))
+
+                result_text = original_line.text[relative_start:relative_end]
+
+                if result_text not in allow_list:
+                    adjusted_entity = entity.copy()
+                    adjusted_entity["BeginOffset"] = relative_start # Now relative to the full line
+                    adjusted_entity["EndOffset"] = relative_end
+
+                    recogniser_entity = recognizer_result_from_dict(adjusted_entity)
+
+                    existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
+                    if existing_entry is None:
+                        all_text_line_results.append((line_idx, [recogniser_entity]))
+                    else:
+                        existing_entry.append(recogniser_entity) # Append to the existing list of entities
+
+                    added_to_line = True
+
+        # Optional: Handle cases where the entity does not fit in any line
+        if not added_to_line:
+            print(f"Entity '{entity}' does not fit in any line.")
+
+    return all_text_line_results
+
+def do_aws_comprehend_call(current_batch, current_batch_mapping, comprehend_client, language, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
+    if not current_batch:
+        return all_text_line_results
+
+    max_retries = 3
+    retry_delay = 3
+
+    for attempt in range(max_retries):
+        try:
+            response = comprehend_client.detect_pii_entities(
+                Text=current_batch.strip(),
+                LanguageCode=language
+            )
+
+            all_text_line_results = map_back_comprehend_entity_results(
+                response,
+                current_batch_mapping,
+                allow_list,
+                chosen_redact_comprehend_entities,
+                all_text_line_results
+            )
+
+            return all_text_line_results
+
+        except Exception as e:
+            if attempt == max_retries - 1:
+                raise
+            time.sleep(retry_delay)
+
+def run_page_text_redaction(
+    language: str,
+    chosen_redact_entities: List[str],
+    chosen_redact_comprehend_entities: List[str],
+    line_level_text_results_list: List[str],
+    line_characters: List,
+    page_analyser_results: List = [],
+    page_analysed_bounding_boxes: List = [],
+    comprehend_client = None,
+    allow_list: List[str] = None,
+    pii_identification_method: str = "Local",
+    nlp_analyser = None,
+    score_threshold: float = 0.0,
+    custom_entities: List[str] = None,
+    comprehend_query_number:int = 0#,
+    #merge_text_bounding_boxes_fn = merge_text_bounding_boxes
+):
+    #if not merge_text_bounding_boxes_fn:
+    #    raise ValueError("merge_text_bounding_boxes_fn is required")
+
+    page_text = ""
+    page_text_mapping = []
+    all_text_line_results = []
+    comprehend_query_number = 0
+
+    # Collect all text from the page
+    for i, text_line in enumerate(line_level_text_results_list):
+        #print("line_level_text_results_list:", line_level_text_results_list)
+        if chosen_redact_entities:
+            if page_text:
+                #page_text += " | "
+                page_text += " "
+
+            start_pos = len(page_text)
+            page_text += text_line.text
+            page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
+
+    # Process based on identification method
+    if pii_identification_method == "Local":
+        if not nlp_analyser:
+            raise ValueError("nlp_analyser is required for Local identification method")
+
+        print("page text:", page_text)
+
+        page_analyser_result = nlp_analyser.analyze(
+            text=page_text,
+            language=language,
+            entities=chosen_redact_entities,
+            score_threshold=score_threshold,
+            return_decision_process=True,
+            allow_list=allow_list
+        )
+
+        #print("page_analyser_result:", page_analyser_result)
+
+        all_text_line_results = map_back_entity_results(
+            page_analyser_result,
+            page_text_mapping,
+            all_text_line_results
+        )
+
+        #print("all_text_line_results:", all_text_line_results)
+
+    elif pii_identification_method == "AWS Comprehend":
+        #print("page text:", page_text)
+
+        # Process custom entities if any
+        if custom_entities:
+            custom_redact_entities = [
+                entity for entity in chosen_redact_comprehend_entities
+                if entity in custom_entities
+            ]
+            if custom_redact_entities:
+                page_analyser_result = nlp_analyser.analyze(
+                    text=page_text,
+                    language=language,
+                    entities=custom_redact_entities,
+                    score_threshold=score_threshold,
+                    return_decision_process=True,
+                    allow_list=allow_list
+                )
+
+                print("page_analyser_result:", page_analyser_result)
+
+                all_text_line_results = map_back_entity_results(
+                    page_analyser_result,
+                    page_text_mapping,
+                    all_text_line_results
+                )
+
+        current_batch = ""
+        current_batch_mapping = []
+        batch_char_count = 0
+        batch_word_count = 0
+
+        for i, text_line in enumerate(line_level_text_results_list):
+            words = text_line.text.split()
+            word_start_positions = []
+
+            # Calculate word start positions within the line
+            current_pos = 0
+            for word in words:
+                word_start_positions.append(current_pos)
+                current_pos += len(word) + 1 # +1 for space
+
+            for word_idx, word in enumerate(words):
+                new_batch_char_count = len(current_batch) + len(word) + 1
+
+                if batch_word_count >= 50 or new_batch_char_count >= 200:
+                    # Process current batch
+                    all_text_line_results = do_aws_comprehend_call(
+                        current_batch,
+                        current_batch_mapping,
+                        comprehend_client,
+                        language,
+                        allow_list,
+                        chosen_redact_comprehend_entities,
+                        all_text_line_results
+                    )
+                    comprehend_query_number += 1
+
+                    # Start new batch
+                    current_batch = word
+                    batch_word_count = 1
+                    batch_char_count = len(word)
+                    current_batch_mapping = [(0, i, text_line, line_characters[i], word_start_positions[word_idx])]
+                else:
+                    if current_batch:
+                        current_batch += " "
+                        batch_char_count += 1
+                    current_batch += word
+                    batch_char_count += len(word)
+                    batch_word_count += 1
+
+                    if not current_batch_mapping or current_batch_mapping[-1][1] != i:
+                        current_batch_mapping.append((
+                            batch_char_count - len(word),
+                            i,
+                            text_line,
+                            line_characters[i],
+                            word_start_positions[word_idx] # Add the word's start position within its line
+                        ))
+
+        # Process final batch
+        if current_batch:
+            all_text_line_results = do_aws_comprehend_call(
+                current_batch,
+                current_batch_mapping,
+                comprehend_client,
+                language,
+                allow_list,
+                chosen_redact_comprehend_entities,
+                all_text_line_results
+            )
+            comprehend_query_number += 1
+
+    # Process results for each line
+    for i, text_line in enumerate(line_level_text_results_list):
+        line_results = next((results for idx, results in all_text_line_results if idx == i), [])
+
+        if line_results:
+            text_line_bounding_boxes = merge_text_bounding_boxes(
+                line_results,
+                line_characters[i]
+            )
+
+            page_analyser_results.extend(line_results)
+            page_analysed_bounding_boxes.extend(text_line_bounding_boxes)
+
+    return page_analysed_bounding_boxes
+
+def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
+    '''
+    Merge identified bounding boxes containing PII that are very close to one another
+    '''
+    analysed_bounding_boxes = []
+    original_bounding_boxes = [] # List to hold original bounding boxes
+
+    if len(analyser_results) > 0 and len(characters) > 0:
+        # Extract bounding box coordinates for sorting
+        bounding_boxes = []
+        for result in analyser_results:
+            #print("Result:", result)
+            char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
+            char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
+            if char_boxes:
+                # Calculate the bounding box that encompasses all characters
+                left = min(box[0] for box in char_boxes)
+                bottom = min(box[1] for box in char_boxes)
+                right = max(box[2] for box in char_boxes)
+                top = max(box[3] for box in char_boxes) + vertical_padding
+                bbox = [left, bottom, right, top]
+                bounding_boxes.append((bottom, left, result, bbox, char_text)) # (y, x, result, bbox, text)
+
+                # Store original bounding boxes
+                original_bounding_boxes.append({"text": "".join(char_text), "boundingBox": bbox, "result": copy.deepcopy(result)})
+                #print("Original bounding boxes:", original_bounding_boxes)
+
+        # Sort the results by y-coordinate and then by x-coordinate
+        bounding_boxes.sort()
+
+        merged_bounding_boxes = []
+        current_box = None
+        current_y = None
+        current_result = None
+        current_text = []
+
+        for y, x, result, next_box, text in bounding_boxes:
+            if current_y is None or current_box is None:
+                # Initialize the first bounding box
+                current_box = next_box
+                current_y = next_box[1]
+                current_result = result
+                current_text = list(text)
+            else:
+                vertical_diff_bboxes = abs(next_box[1] - current_y)
+                horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
+
+                if vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist:
+                    # Merge bounding boxes
+                    #print("Merging boxes")
+                    merged_box = current_box.copy()
+                    merged_result = current_result
+                    merged_text = current_text.copy()
+
+                    merged_box[2] = next_box[2] # Extend horizontally
+                    merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
+                    merged_result.end = max(current_result.end, result.end) # Extend text range
+                    try:
+                        if current_result.entity_type != result.entity_type:
+                            merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
+                        else:
+                            merged_result.entity_type = current_result.entity_type
+                    except Exception as e:
+                        print("Unable to combine result entity types:", e)
+                    if current_text:
+                        merged_text.append(" ") # Add space between texts
+                    merged_text.extend(text)
+
+                    merged_bounding_boxes.append({
+                        "text": "".join(merged_text),
+                        "boundingBox": merged_box,
+                        "result": merged_result
+                    })
+
+                else:
+                    # Start a new bounding box
+                    current_box = next_box
+                    current_y = next_box[1]
+                    current_result = result
+                    current_text = list(text)
+
+        # Combine original and merged bounding boxes
+        analysed_bounding_boxes.extend(original_bounding_boxes)
+        analysed_bounding_boxes.extend(merged_bounding_boxes)
+
+        #print("Analysed bounding boxes:", analysed_bounding_boxes)
+
+    return analysed_bounding_boxes
+
+# Function to combine OCR results into line-level results
+def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
+    # Group OCR results into lines based on y_threshold
+    lines = []
+    current_line = []
+    for result in sorted(ocr_results, key=lambda x: x.top):
+        if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
+            current_line.append(result)
+        else:
+            lines.append(current_line)
+            current_line = [result]
+    if current_line:
+        lines.append(current_line)
+
+    # Sort each line by left position
+    for line in lines:
+        line.sort(key=lambda x: x.left)
+
+    # Flatten the sorted lines back into a single list
+    sorted_results = [result for line in lines for result in line]
+
+    combined_results = []
+    new_format_results = {}
+    current_line = []
+    current_bbox = None
+    line_counter = 1
+
+    def create_ocr_result_with_children(combined_results, i, current_bbox, current_line):
+        combined_results["text_line_" + str(i)] = {
+            "line": i,
+            'text': current_bbox.text,
+            'bounding_box': (current_bbox.left, current_bbox.top,
+                             current_bbox.left + current_bbox.width,
+                             current_bbox.top + current_bbox.height),
+            'words': [{'text': word.text,
+                       'bounding_box': (word.left, word.top,
+                                        word.left + word.width,
+                                        word.top + word.height)}
+                      for word in current_line]
+        }
+        return combined_results["text_line_" + str(i)]
+
+    for result in sorted_results:
+        if not current_line:
+            # Start a new line
+            current_line.append(result)
+            current_bbox = result
+        else:
+            # Check if the result is on the same line (y-axis) and close horizontally (x-axis)
+            last_result = current_line[-1]
+
+            if abs(result.top - last_result.top) <= y_threshold and \
+               (result.left - (last_result.left + last_result.width)) <= x_threshold:
+                # Update the bounding box to include the new word
+                new_right = max(current_bbox.left + current_bbox.width, result.left + result.width)
+                current_bbox = OCRResult(
+                    text=f"{current_bbox.text} {result.text}",
+                    left=current_bbox.left,
+                    top=current_bbox.top,
+                    width=new_right - current_bbox.left,
+                    height=max(current_bbox.height, result.height)
+                )
+                current_line.append(result)
+            else:
+
+
+                # Commit the current line and start a new one
+                combined_results.append(current_bbox)
+
+                new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
+
+                line_counter += 1
+                current_line = [result]
+                current_bbox = result
+
+    # Append the last line
+    if current_bbox:
+        combined_results.append(current_bbox)
+
+        new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
+
+    return combined_results, new_format_results
 
 class CustomImageAnalyzerEngine:
     def __init__(
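Note: run_page_text_redaction above now analyses whole-page text in one pass and relies on map_back_entity_results to translate page-level entity spans into per-line spans. The arithmetic is easy to get wrong, so here is a self-contained sketch of it under the same single-space-join assumption (function and variable names are illustrative, not the app's API):

from typing import Iterator, List, Tuple

def join_lines(lines: List[str]) -> Tuple[str, List[int]]:
    # Join OCR lines with single spaces, recording each line's start offset
    page_text = ""
    starts: List[int] = []
    for line in lines:
        if page_text:
            page_text += " "  # same separator the commit uses
        starts.append(len(page_text))
        page_text += line
    return page_text, starts

def map_span_to_lines(start: int, end: int, lines: List[str],
                      starts: List[int]) -> Iterator[Tuple[int, int, int]]:
    # Yield (line_index, relative_start, relative_end) for each overlapped line
    for i, (line, line_start) in enumerate(zip(lines, starts)):
        line_end = line_start + len(line)
        if line_start < end and line_end > start:  # the commit's overlap test
            yield i, max(0, start - line_start), min(end - line_start, len(line))

lines = ["Call John Smith", "on Monday"]
page_text, starts = join_lines(lines)
start = page_text.find("John")
print(list(map_span_to_lines(start, start + len("John Smith"), lines, starts)))
# -> [(0, 5, 15)]

Because the overlap test fires once per overlapped line, an entity that crosses a line boundary is reported against every line it touches, each with line-relative offsets.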
@@ -463,261 +915,225 @@ class CustomImageAnalyzerEngine:
         self,
         line_level_ocr_results: List[OCRResult],
         ocr_results_with_children: Dict[str, Dict],
-        chosen_redact_comprehend_entities: List[str],
-        pii_identification_method: str = "Local",
-        comprehend_client="",
         **text_analyzer_kwargs
     ) -> List[CustomImageRecognizerResult]:
-        # Define English as default language, if not specified
-        if "language" not in text_analyzer_kwargs:
-            text_analyzer_kwargs["language"] = "en"
 
-        horizontal_buffer = 0 # add pixels to right of width
-        height_buffer = 2 # add pixels to bounding box height
         comprehend_query_number = 0
-
-        allow_list = text_analyzer_kwargs.get('allow_list', [])
-
-        combined_results = []
-        # Initialize variables for batching
-        current_batch = ""
-        current_batch_mapping = [] # List of (start_pos, line_index, original_text) tuples
-        analyzer_results_by_line = [[] for _ in line_level_ocr_results] # Store results for each line
 
-        # Process OCR results in batches
         for i, line_level_ocr_result in enumerate(line_level_ocr_results):
-            if pii_identification_method == "Local":
-                analyzer_result = self.analyzer_engine.analyze(
-                    text=line_level_ocr_result.text, **text_analyzer_kwargs
-                )
-                analyzer_results_by_line[i] = analyzer_result
 
-            elif pii_identification_method == "AWS Comprehend":
 
-                # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. This is because Comprehend can't pick up Titles, Streetnames, and UKPostcodes, or a custom deny list specifically
-                text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
 
-                spacy_analyzer_result = self.analyzer_engine.analyze(
-                    text=line_level_ocr_result.text, **text_analyzer_kwargs)
 
-                analyzer_results_by_line[i].extend(spacy_analyzer_result)
 
-                if len(line_level_ocr_result.text) >= 3:
-                    # Add line to current batch with a separator
-                    if current_batch:
-                        current_batch += " | " # Use a separator that's unlikely to appear in the text
 
-                    start_pos = len(current_batch)
-                    current_batch += line_level_ocr_result.text
-                    current_batch_mapping.append((start_pos, i, line_level_ocr_result.text))
-
-                    # Process batch if it's approaching 300 characters or this is the last line
-                    if len(current_batch) >= 200 or i == len(line_level_ocr_results) - 1:
-                        print("length of text for Comprehend:", len(current_batch))
-
-                        try:
-                            response = comprehend_client.detect_pii_entities(
-                                Text=current_batch,
-                                LanguageCode=text_analyzer_kwargs["language"]
-                            )
-
-                        except Exception as e:
-                            print("AWS Comprehend call failed due to:", e, "waiting three seconds to try again.")
-                            time.sleep(3)
-                            response = comprehend_client.detect_pii_entities(
-                                Text=current_batch,
-                                LanguageCode=text_analyzer_kwargs["language"]
-                            )
-
                         comprehend_query_number += 1
-
-                        # Map results back to original lines
-                        if response and "Entities" in response:
-                            for entity in response["Entities"]:
-                                entity_start = entity["BeginOffset"]
-                                entity_end = entity["EndOffset"]
-
-                                # Find which line this entity belongs to
-                                for batch_start, line_idx, original_text in current_batch_mapping:
-                                    batch_end = batch_start + len(original_text)
-
-                                    # Check if entity belongs to this line
-                                    if batch_start <= entity_start < batch_end:
-                                        # Adjust offsets relative to the original line
-                                        relative_start = entity_start - batch_start
-                                        relative_end = min(entity_end - batch_start, len(original_text))
-
-                                        result_text = original_text[relative_start:relative_end]
-
-                                        if result_text not in allow_list:
-                                            if entity.get("Type") in chosen_redact_comprehend_entities:
-                                                # Create a new entity with adjusted positions
-                                                adjusted_entity = entity.copy()
-                                                adjusted_entity["BeginOffset"] = relative_start
-                                                adjusted_entity["EndOffset"] = relative_end
-
-                                                recogniser_entity = recognizer_result_from_dict(adjusted_entity)
-                                                analyzer_results_by_line[line_idx].append(recogniser_entity)
 
                         # Reset batch
-                        current_batch = ""
-                        current_batch_mapping = []
 
-        # Process results for each line
-        for i, analyzer_result in enumerate(analyzer_results_by_line):
-            if i >= len(ocr_results_with_children):
-                continue
 
            child_level_key = list(ocr_results_with_children.keys())[i]
            ocr_results_with_children_line_level = ocr_results_with_children[child_level_key]
-
-            # Go through results to add bounding boxes
-            for result in analyzer_result:
-                # Extract the relevant portion of text based on start and end
-                relevant_text = line_level_ocr_results[i].text[result.start:result.end]
-
-                # Find the corresponding entry in ocr_results_with_children
-                child_words = ocr_results_with_children_line_level['words']
-
-                # Initialize bounding box values
-                left, top, bottom = float('inf'), float('inf'), float('-inf')
-                all_words = ""
-                word_num = 0 # Initialize word count
-                total_width = 0 # Initialize total width
-
-                split_relevant_text = relevant_text.split()
-
-                loop_child_words = child_words.copy()
-
-                for word_text in split_relevant_text: # Iterate through each word in relevant_text
-
-                    quote_str = '"'
-                    replace_str = '(?:"|"|")'
-
-                    word_regex = rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'
-
-                    for word in loop_child_words:
-                        # Check for regex as whole word
-
-                        if re.search(word_regex, word['text']):
-                            #if re.search(r'\b' + re.escape(word_text) + r'\b', word['text']):
-                            found_word = word
-
-                            if word_num == 0: # First word
-                                left = found_word['bounding_box'][0]
-                                top = found_word['bounding_box'][1]
-                            bottom = max(bottom, found_word['bounding_box'][3]) # Update bottom for all words
-                            all_words += found_word['text'] + " " # Concatenate words
-                            total_width = found_word['bounding_box'][2] - left # Add each word's width
-                            word_num += 1
-
-                            # Drop the first word of child_words
-                            loop_child_words = loop_child_words[1:] # Skip the first word
-
-                            break # Move to the next word in relevant_text
-
-                width = total_width + horizontal_buffer # Set width to total width of all matched words
-                height = bottom - top if word_num > 0 else 0 # Calculate height
-
-                relevant_line_ocr_result = OCRResult(
-                    text=relevant_text,
-                    left=left,
-                    top=top - height_buffer,
-                    width=width,
-                    height=height + height_buffer
-                )
-
-                if not ocr_results_with_children_line_level:
-                    # Fallback to previous method if not found in ocr_results_with_children
-                    print("No child info found")
-                    continue
-
-                # Reset the word positions indicated in the relevant ocr_result - i.e. it starts from 0 and ends at word length
-                result_reset_pos = result
-                result_reset_pos.start = 0
-                result_reset_pos.end = len(relevant_text)
-
-                #print("result_reset_pos:", result_reset_pos)
-                #print("relevant_line_ocr_result:", relevant_line_ocr_result)
-                #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
-
-                # Map the analyzer results to bounding boxes for this line
-                line_results = self.map_analyzer_results_to_bounding_boxes(
-                    [result_reset_pos], [relevant_line_ocr_result], relevant_line_ocr_result.text, allow_list, ocr_results_with_children_line_level
                )
-
-                #print("line_results:", line_results)
-
-                combined_results.extend(line_results)
 
        return combined_results, comprehend_query_number
 
    @staticmethod
    def map_analyzer_results_to_bounding_boxes(
-        text_analyzer_results: List[RecognizerResult],
-        redaction_relevant_ocr_results: List[OCRResult],
-        full_text: str,
-        allow_list: List[str],
-        ocr_results_with_children_child_info: Dict[str, Dict]
-    ) -> List[CustomImageRecognizerResult]:
        redaction_bboxes = []
-        text_position = 0
 
        for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
-            word_end = text_position + len(redaction_relevant_ocr_result.text)
 
-            #print("Checking relevant OCR result:", redaction_relevant_ocr_result)
 
            for redaction_result in text_analyzer_results:
-                max_of_current_text_pos_or_result_start_pos = max(text_position, redaction_result.start)
-                min_of_result_end_pos_or_results_end = min(word_end, redaction_result.end)
-
-                redaction_result_bounding_box = (redaction_relevant_ocr_result.left, redaction_relevant_ocr_result.top,
-                    redaction_relevant_ocr_result.left + redaction_relevant_ocr_result.width,
-                    redaction_relevant_ocr_result.top + redaction_relevant_ocr_result.height)
-
-                if (max_of_current_text_pos_or_result_start_pos < min_of_result_end_pos_or_results_end) and (redaction_relevant_ocr_result.text not in allow_list):
-                    #print("result", redaction_result, "made it through if statement")
-                    # Find the corresponding entry in ocr_results_with_children that overlap with the redaction result
-                    child_info = ocr_results_with_children_child_info#.get(full_text)
-
-                    #print("child_info in sub function:", child_info)
-                    #print("redaction_result_bounding_box:", redaction_result_bounding_box)
-                    #print("Overlaps?", bounding_boxes_overlap(redaction_result_bounding_box, child_info['bounding_box']))
-
-                    if bounding_boxes_overlap(redaction_result_bounding_box, child_info['bounding_box']):
-                        # Use the bounding box from ocr_results_with_children
-                        bbox = redaction_result_bounding_box #child_info['bounding_box']
-                        left, top, right, bottom = bbox
-                        width = right - left
-                        height = bottom - top
-
-                    else:
-                        print("Could not find OCR result")
-                        continue
-
-                    redaction_bboxes.append(
-                        CustomImageRecognizerResult(
-                            entity_type=redaction_result.entity_type,
-                            start=redaction_result.start,
-                            end=redaction_result.end,
-                            score=redaction_result.score,
-                            left=left,
-                            top=top,
-                            width=width,
-                            height=height,
-                            text=redaction_relevant_ocr_result.text
                        )
-                    )
-
-            text_position = word_end + 1 # +1 for the space between words
 
        return redaction_bboxes
 
    @staticmethod
    def remove_space_boxes(ocr_result: dict) -> dict:
        """Remove OCR bboxes that are for spaces.
-
        :param ocr_result: OCR results (raw or thresholded).
        :return: OCR results with empty words removed.
        """
@@ -740,10 +1156,8 @@ class CustomImageAnalyzerEngine:
         ocr_result: Dict[str, List[Union[int, str]]], scale_factor: float
     ) -> Dict[str, float]:
         """Scale down the bounding box results based on a scale percentage.
-
         :param ocr_result: OCR results (raw).
         :param scale_percent: Scale percentage for resizing the bounding box.
-
         :return: OCR results (scaled).
         """
         scaled_results = deepcopy(ocr_result)
@@ -790,173 +1204,3 @@ class CustomImageAnalyzerEngine:
         estimated_width = int(proportion * ocr_result.width)
 
         return estimated_width
-
-
-    # def estimate_width(self, ocr_result: OCRResult, start: int, end: int) -> int:
-    #     # Extract the relevant text portion
-    #     relevant_text = ocr_result.text[start:end]
-
-    #     # Check if the relevant text is the entire text of the OCR result
-    #     if relevant_text == ocr_result.text:
-    #         return ocr_result.width
-
-    #     # Estimate the font size based on the height of the bounding box
-    #     estimated_font_size = ocr_result.height + 4
-
-    #     # Create a blank image with enough width to measure the text
-    #     dummy_image = Image.new('RGB', (1000, 50), color=(255, 255, 255))
-    #     draw = ImageDraw.Draw(dummy_image)
-
-    #     # Specify the font and size
-    #     try:
-    #         font = ImageFont.truetype("arial.ttf", estimated_font_size) # Adjust the font file as needed
-    #     except IOError:
-    #         font = ImageFont.load_default() # Fallback to default font if the specified font is not found
-
-    #     # Draw the relevant text on the image
-    #     draw.text((0, 0), relevant_text, fill=(0, 0, 0), font=font)
-
-    #     # Save the image for debugging purposes
-    #     dummy_image.save("debug_image.png")
-
-    #     # Use pytesseract to get the bounding box of the relevant text
-    #     bbox = pytesseract.image_to_boxes(dummy_image, config=self.tesseract_config)
-
-    #     # Print the bbox for debugging
-    #     print("Bounding box:", bbox)
-
-    #     # Calculate the width from the bounding box
-    #     if bbox:
-    #         try:
-    #             # Initialize min_left and max_right with extreme values
-    #             min_left = float('inf')
-    #             max_right = float('-inf')
-
-    #             # Split the bbox string into lines
-    #             bbox_lines = bbox.splitlines()
-
-    #             for line in bbox_lines:
-    #                 parts = line.split()
-    #                 if len(parts) == 6:
-    #                     _, left, _, right, _, _ = parts
-    #                     left = int(left)
-    #                     right = int(right)
-    #                     min_left = min(min_left, left)
-    #                     max_right = max(max_right, right)
-
-    #             width = max_right - min_left
-    #         except ValueError as e:
-    #             print("Error parsing bounding box:", e)
-    #             width = 0
-    #     else:
-    #         width = 0
-
-    #     print("Estimated width:", width)
-
-    #     return width
-
-
-
-# Function to combine OCR results into line-level results
-def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
-    # Group OCR results into lines based on y_threshold
-    lines = []
-    current_line = []
-    for result in sorted(ocr_results, key=lambda x: x.top):
-        if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
-            current_line.append(result)
-        else:
-            lines.append(current_line)
-            current_line = [result]
-    if current_line:
-        lines.append(current_line)
-
-    # Sort each line by left position
-    for line in lines:
-        line.sort(key=lambda x: x.left)
-
-    # Flatten the sorted lines back into a single list
-    sorted_results = [result for line in lines for result in line]
-
-    combined_results = []
-    new_format_results = {}
-    current_line = []
-    current_bbox = None
-    line_counter = 1
-
-    def create_ocr_result_with_children(combined_results, i, current_bbox, current_line):
-        combined_results["text_line_" + str(i)] = {
-            "line": i,
-            'text': current_bbox.text,
-            'bounding_box': (current_bbox.left, current_bbox.top,
-                             current_bbox.left + current_bbox.width,
-                             current_bbox.top + current_bbox.height),
-            'words': [{'text': word.text,
-                       'bounding_box': (word.left, word.top,
-                                        word.left + word.width,
-                                        word.top + word.height)}
-                      for word in current_line]
-        }
-        return combined_results["text_line_" + str(i)]
-
-    for result in sorted_results:
-        if not current_line:
-            # Start a new line
-            current_line.append(result)
-            current_bbox = result
-        else:
-            # Check if the result is on the same line (y-axis) and close horizontally (x-axis)
-            last_result = current_line[-1]
-
-            if abs(result.top - last_result.top) <= y_threshold and \
-               (result.left - (last_result.left + last_result.width)) <= x_threshold:
-                # Update the bounding box to include the new word
-                new_right = max(current_bbox.left + current_bbox.width, result.left + result.width)
-                current_bbox = OCRResult(
-                    text=f"{current_bbox.text} {result.text}",
-                    left=current_bbox.left,
-                    top=current_bbox.top,
-                    width=new_right - current_bbox.left,
-                    height=max(current_bbox.height, result.height)
-                )
-                current_line.append(result)
-            else:
-
-
-                # Commit the current line and start a new one
-                combined_results.append(current_bbox)
-                # new_format_results[current_bbox.text] = { # f"combined_text_{line_counter}"
-                #     'bounding_box': (current_bbox.left, current_bbox.top,
-                #                      current_bbox.left + current_bbox.width,
-                #                      current_bbox.top + current_bbox.height),
-                #     'words': [{'text': word.text,
-                #                'bounding_box': (word.left, word.top,
-                #                                 word.left + word.width,
-                #                                 word.top + word.height)}
-                #               for word in current_line]
-                # }
-                new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
-
-                line_counter += 1
-                current_line = [result]
-                current_bbox = result
-
-    # Append the last line
-    if current_bbox:
-        combined_results.append(current_bbox)
-        # new_format_results[current_bbox.text] = { # f"combined_text_{line_counter}"
-        #     'bounding_box': (current_bbox.left, current_bbox.top,
-        #                      current_bbox.left + current_bbox.width,
-        #                      current_bbox.top + current_bbox.height),
-        #     'words': [{'text': word.text,
-        #                'bounding_box': (word.left, word.top,
-        #                                 word.left + word.width,
-        #                                 word.top + word.height)}
-        #               for word in current_line]
-        # }
-
-        new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
-
-
-    return combined_results, new_format_results
-
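Note: the rewrite replaces the old per-line, " | "-separated batching (removed above) with a word-budgeted batcher: words accumulate until roughly 50 words or 200 characters, each batch goes to AWS Comprehend once, and do_aws_comprehend_call retries a failed call up to three times with a three-second pause. A standalone sketch of that policy; detect_batch is a stand-in for comprehend_client.detect_pii_entities, not the AWS API:

import time
from typing import Callable, List

MAX_WORDS, MAX_CHARS, MAX_RETRIES, RETRY_DELAY = 50, 200, 3, 3

def call_with_retry(fn: Callable[[str], dict], text: str) -> dict:
    # Retry a flaky call up to MAX_RETRIES times, as do_aws_comprehend_call does
    for attempt in range(MAX_RETRIES):
        try:
            return fn(text)
        except Exception:
            if attempt == MAX_RETRIES - 1:
                raise
            time.sleep(RETRY_DELAY)

def batch_words(words: List[str]) -> List[str]:
    # Accumulate words until the word or character budget is reached
    batches: List[str] = []
    current = ""
    word_count = 0
    for word in words:
        if word_count >= MAX_WORDS or len(current) + len(word) + 1 >= MAX_CHARS:
            batches.append(current)
            current, word_count = word, 1
        else:
            current = f"{current} {word}" if current else word
            word_count += 1
    if current:
        batches.append(current)
    return batches

def detect_batch(text: str) -> dict:
    # Stand-in for comprehend_client.detect_pii_entities(Text=..., LanguageCode=...)
    return {"Entities": []}

for batch in batch_words(("some long ocr page text " * 40).split()):
    response = call_with_retry(detect_batch, batch)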
 
1
  import pytesseract
2
  import numpy as np
3
  from presidio_analyzer import AnalyzerEngine, RecognizerResult
 
4
  from typing import List, Dict, Optional, Union, Tuple
5
  from dataclasses import dataclass
6
  import time
7
  import cv2
8
+ import copy
9
+ from copy import deepcopy
10
+ from pdfminer.layout import LTChar
11
  import PIL
12
+ from PIL import Image
13
  from typing import Optional, Tuple, Union
 
14
  from tools.helper_functions import clean_unicode_text
15
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
16
  from tools.load_spacy_model_custom_recognisers import custom_entities
 
 
17
 
18
  @dataclass
19
  class OCRResult:
 
173
 
174
  return Image.fromarray(filtered_image), metadata
175
 
 
176
  class SegmentedAdaptiveThreshold(ImagePreprocessor):
177
  """SegmentedAdaptiveThreshold class.
178
 
 
250
  metadata = {"C": c, "background_color": background_color, "contrast": contrast}
251
  return Image.fromarray(adaptive_threshold_image), metadata
252
 
 
 
 
253
  class ImageRescaling(ImagePreprocessor):
254
  """ImageRescaling class. Rescales images based on their size."""
255
 
 
297
  metadata = {"scale_factor": scale_factor}
298
  return Image.fromarray(rescaled_image), metadata
299
 
 
300
  class ContrastSegmentedImageEnhancer(ImagePreprocessor):
301
  """Class containing all logic to perform contrastive segmentation.
302
 
 
403
  """Check if two bounding boxes overlap."""
404
  return (box1[0] < box2[2] and box2[0] < box1[2] and
405
  box1[1] < box2[3] and box2[1] < box1[3])
406
+
407
+ def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
408
+ for entity in page_analyser_result:
409
+ entity_start = entity.start
410
+ entity_end = entity.end
411
+
412
+ # Track if the entity has been added to any line
413
+ added_to_line = False
414
+
415
+ for batch_start, line_idx, original_line, chars in page_text_mapping:
416
+ batch_end = batch_start + len(original_line.text)
417
+
418
+ # Check if the entity overlaps with the current line
419
+ if batch_start < entity_end and batch_end > entity_start: # Overlap condition
420
+ relative_start = max(0, entity_start - batch_start) # Adjust start relative to the line
421
+ relative_end = min(entity_end - batch_start, len(original_line.text)) # Adjust end relative to the line
422
+
423
+ # Create a new adjusted entity
424
+ adjusted_entity = copy.deepcopy(entity)
425
+ adjusted_entity.start = relative_start
426
+ adjusted_entity.end = relative_end
427
+
428
+ # Check if this line already has an entry
429
+ existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
430
+
431
+ if existing_entry is None:
432
+ all_text_line_results.append((line_idx, [adjusted_entity]))
433
+ else:
434
+ existing_entry.append(adjusted_entity) # Append to the existing list of entities
435
+
436
+ added_to_line = True
437
+
438
+ # If the entity spans multiple lines, you may want to handle that here
439
+ if not added_to_line:
440
+ # Handle cases where the entity does not fit in any line (optional)
441
+ print(f"Entity '{entity}' does not fit in any line.")
442
+
443
+ return all_text_line_results
444
+
445
+ def map_back_comprehend_entity_results(response, current_batch_mapping, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
446
+ if not response or "Entities" not in response:
447
+ return all_text_line_results
448
+
449
+ for entity in response["Entities"]:
450
+ if entity.get("Type") not in chosen_redact_comprehend_entities:
451
+ continue
452
+
453
+ entity_start = entity["BeginOffset"]
454
+ entity_end = entity["EndOffset"]
455
+
456
+ # Track if the entity has been added to any line
457
+ added_to_line = False
458
+
459
+ # Find the correct line and offset within that line
460
+ for batch_start, line_idx, original_line, chars, line_offset in current_batch_mapping:
461
+ batch_end = batch_start + len(original_line.text[line_offset:])
462
+
463
+ # Check if the entity overlaps with the current line
464
+ if batch_start < entity_end and batch_end > entity_start: # Overlap condition
465
+ # Calculate the absolute position within the line
466
+ relative_start = max(0, entity_start - batch_start + line_offset)
467
+ relative_end = min(entity_end - batch_start + line_offset, len(original_line.text))
468
+
469
+ result_text = original_line.text[relative_start:relative_end]
470
+
471
+ if result_text not in allow_list:
472
+ adjusted_entity = entity.copy()
473
+ adjusted_entity["BeginOffset"] = relative_start # Now relative to the full line
474
+ adjusted_entity["EndOffset"] = relative_end
475
+
476
+ recogniser_entity = recognizer_result_from_dict(adjusted_entity)
477
+
478
+ existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
479
+ if existing_entry is None:
480
+ all_text_line_results.append((line_idx, [recogniser_entity]))
481
+ else:
482
+ existing_entry.append(recogniser_entity) # Append to the existing list of entities
483
+
484
+ added_to_line = True
485
+
486
+ # Optional: Handle cases where the entity does not fit in any line
487
+ if not added_to_line:
488
+ print(f"Entity '{entity}' does not fit in any line.")
489
+
490
+ return all_text_line_results
491
+
492
+ def do_aws_comprehend_call(current_batch, current_batch_mapping, comprehend_client, language, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
493
+ if not current_batch:
494
+ return all_text_line_results
495
+
496
+ max_retries = 3
497
+ retry_delay = 3
498
+
499
+ for attempt in range(max_retries):
500
+ try:
501
+ response = comprehend_client.detect_pii_entities(
502
+ Text=current_batch.strip(),
503
+ LanguageCode=language
504
+ )
505
+
506
+ all_text_line_results = map_back_comprehend_entity_results(
507
+ response,
508
+ current_batch_mapping,
509
+ allow_list,
510
+ chosen_redact_comprehend_entities,
511
+ all_text_line_results
512
+ )
513
+
514
+ return all_text_line_results
515
+
516
+ except Exception as e:
517
+ if attempt == max_retries - 1:
518
+ raise
519
+ time.sleep(retry_delay)
520
+
521
+ def run_page_text_redaction(
522
+ language: str,
523
+ chosen_redact_entities: List[str],
524
+ chosen_redact_comprehend_entities: List[str],
525
+ line_level_text_results_list: List[str],
526
+ line_characters: List,
527
+ page_analyser_results: List = [],
528
+ page_analysed_bounding_boxes: List = [],
529
+ comprehend_client = None,
530
+ allow_list: List[str] = None,
531
+ pii_identification_method: str = "Local",
532
+ nlp_analyser = None,
533
+ score_threshold: float = 0.0,
534
+ custom_entities: List[str] = None,
535
+ comprehend_query_number:int = 0#,
536
+ #merge_text_bounding_boxes_fn = merge_text_bounding_boxes
537
+ ):
538
+ #if not merge_text_bounding_boxes_fn:
539
+ # raise ValueError("merge_text_bounding_boxes_fn is required")
540
+
541
+ page_text = ""
542
+ page_text_mapping = []
543
+ all_text_line_results = []
544
+ comprehend_query_number = 0
545
+
546
+ # Collect all text from the page
547
+ for i, text_line in enumerate(line_level_text_results_list):
548
+ #print("line_level_text_results_list:", line_level_text_results_list)
549
+ if chosen_redact_entities:
550
+ if page_text:
551
+ #page_text += " | "
552
+ page_text += " "
553
+
554
+ start_pos = len(page_text)
555
+ page_text += text_line.text
556
+ page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
557
+
558
+ # Process based on identification method
559
+ if pii_identification_method == "Local":
560
+ if not nlp_analyser:
561
+ raise ValueError("nlp_analyser is required for Local identification method")
562
+
563
+ print("page text:", page_text)
564
+
565
+ page_analyser_result = nlp_analyser.analyze(
566
+ text=page_text,
567
+ language=language,
568
+ entities=chosen_redact_entities,
569
+ score_threshold=score_threshold,
570
+ return_decision_process=True,
571
+ allow_list=allow_list
572
+ )
573
+
574
+ #print("page_analyser_result:", page_analyser_result)
575
+
576
+ all_text_line_results = map_back_entity_results(
577
+ page_analyser_result,
578
+ page_text_mapping,
579
+ all_text_line_results
580
+ )
581
+
582
+ #print("all_text_line_results:", all_text_line_results)
583
+
584
+ elif pii_identification_method == "AWS Comprehend":
585
+ #print("page text:", page_text)
586
+
587
+ # Process custom entities if any
588
+ if custom_entities:
589
+ custom_redact_entities = [
590
+ entity for entity in chosen_redact_comprehend_entities
591
+ if entity in custom_entities
592
+ ]
593
+ if custom_redact_entities:
594
+ page_analyser_result = nlp_analyser.analyze(
595
+ text=page_text,
596
+ language=language,
597
+ entities=custom_redact_entities,
598
+ score_threshold=score_threshold,
599
+ return_decision_process=True,
600
+ allow_list=allow_list
601
+ )
602
+
603
+ print("page_analyser_result:", page_analyser_result)
604
+
605
+ all_text_line_results = map_back_entity_results(
606
+ page_analyser_result,
607
+ page_text_mapping,
608
+ all_text_line_results
609
+ )
610
+
611
+ current_batch = ""
612
+ current_batch_mapping = []
613
+ batch_char_count = 0
614
+ batch_word_count = 0
615
+
616
+ for i, text_line in enumerate(line_level_text_results_list):
617
+ words = text_line.text.split()
618
+ word_start_positions = []
619
+
620
+ # Calculate word start positions within the line
621
+ current_pos = 0
622
+ for word in words:
623
+ word_start_positions.append(current_pos)
624
+ current_pos += len(word) + 1 # +1 for space
625
+
626
+ for word_idx, word in enumerate(words):
627
+ new_batch_char_count = len(current_batch) + len(word) + 1
628
+
629
+ if batch_word_count >= 50 or new_batch_char_count >= 200:
630
+ # Process current batch
631
+ all_text_line_results = do_aws_comprehend_call(
632
+ current_batch,
633
+ current_batch_mapping,
634
+ comprehend_client,
635
+ language,
636
+ allow_list,
637
+ chosen_redact_comprehend_entities,
638
+ all_text_line_results
639
+ )
640
+ comprehend_query_number += 1
641
+
642
+ # Start new batch
643
+ current_batch = word
644
+ batch_word_count = 1
645
+ batch_char_count = len(word)
646
+ current_batch_mapping = [(0, i, text_line, line_characters[i], word_start_positions[word_idx])]
647
+ else:
648
+ if current_batch:
649
+ current_batch += " "
650
+ batch_char_count += 1
651
+ current_batch += word
652
+ batch_char_count += len(word)
653
+ batch_word_count += 1
654
+
655
+ if not current_batch_mapping or current_batch_mapping[-1][1] != i:
656
+ current_batch_mapping.append((
657
+ batch_char_count - len(word),
658
+ i,
659
+ text_line,
660
+ line_characters[i],
661
+ word_start_positions[word_idx] # Add the word's start position within its line
662
+ ))
663
+
664
+ # Process final batch
665
+ if current_batch:
666
+ all_text_line_results = do_aws_comprehend_call(
667
+ current_batch,
668
+ current_batch_mapping,
669
+ comprehend_client,
670
+ language,
671
+ allow_list,
672
+ chosen_redact_comprehend_entities,
673
+ all_text_line_results
674
+ )
675
+ comprehend_query_number += 1
676
+
677
+ # Process results for each line
678
+ for i, text_line in enumerate(line_level_text_results_list):
679
+ line_results = next((results for idx, results in all_text_line_results if idx == i), [])
680
+
681
+ if line_results:
682
+ text_line_bounding_boxes = merge_text_bounding_boxes(
683
+ line_results,
684
+ line_characters[i]
685
+ )
686
+
687
+ page_analyser_results.extend(line_results)
688
+ page_analysed_bounding_boxes.extend(text_line_bounding_boxes)
689
+
690
+ return page_analysed_bounding_boxes
691
+
692
+ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
693
+ '''
694
+ Merge identified bounding boxes containing PII that are very close to one another
695
+ '''
696
+ analysed_bounding_boxes = []
697
+ original_bounding_boxes = [] # List to hold original bounding boxes
698
+
699
+ if len(analyser_results) > 0 and len(characters) > 0:
700
+ # Extract bounding box coordinates for sorting
701
+ bounding_boxes = []
702
+ for result in analyser_results:
703
+ #print("Result:", result)
704
+ char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
705
+ char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
706
+ if char_boxes:
707
+ # Calculate the bounding box that encompasses all characters
708
+ left = min(box[0] for box in char_boxes)
709
+ bottom = min(box[1] for box in char_boxes)
710
+ right = max(box[2] for box in char_boxes)
711
+ top = max(box[3] for box in char_boxes) + vertical_padding
712
+ bbox = [left, bottom, right, top]
713
+ bounding_boxes.append((bottom, left, result, bbox, char_text)) # (y, x, result, bbox, text)
714
+
715
+ # Store original bounding boxes
716
+ original_bounding_boxes.append({"text": "".join(char_text), "boundingBox": bbox, "result": copy.deepcopy(result)})
717
+ #print("Original bounding boxes:", original_bounding_boxes)
718
+
719
+ # Sort the results by y-coordinate and then by x-coordinate
720
+ bounding_boxes.sort()
721
+
722
+ merged_bounding_boxes = []
723
+ current_box = None
724
+ current_y = None
725
+ current_result = None
726
+ current_text = []
727
+
728
+ for y, x, result, next_box, text in bounding_boxes:
729
+ if current_y is None or current_box is None:
730
+ # Initialize the first bounding box
731
+ current_box = next_box
732
+ current_y = next_box[1]
733
+ current_result = result
734
+ current_text = list(text)
735
+ else:
736
+ vertical_diff_bboxes = abs(next_box[1] - current_y)
737
+ horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
738
+
739
+ if vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist:
740
+ # Merge bounding boxes
741
+ #print("Merging boxes")
742
+ merged_box = current_box.copy()
743
+ merged_result = current_result
744
+ merged_text = current_text.copy()
745
+
746
+ merged_box[2] = next_box[2] # Extend horizontally
747
+ merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
748
+ merged_result.end = max(current_result.end, result.end) # Extend text range
749
+ try:
750
+ if current_result.entity_type != result.entity_type:
751
+ merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
752
+ else:
753
+ merged_result.entity_type = current_result.entity_type
754
+ except Exception as e:
755
+ print("Unable to combine result entity types:", e)
756
+ if current_text:
757
+ merged_text.append(" ") # Add space between texts
758
+ merged_text.extend(text)
759
+
760
+ merged_bounding_boxes.append({
761
+ "text": "".join(merged_text),
762
+ "boundingBox": merged_box,
763
+ "result": merged_result
764
+ })
765
+
766
+ else:
767
+ # Start a new bounding box
768
+ current_box = next_box
769
+ current_y = next_box[1]
770
+ current_result = result
771
+ current_text = list(text)
772
+
773
+ # Combine original and merged bounding boxes
774
+ analysed_bounding_boxes.extend(original_bounding_boxes)
775
+ analysed_bounding_boxes.extend(merged_bounding_boxes)
776
+
777
+ #print("Analysed bounding boxes:", analysed_bounding_boxes)
778
+
779
+ return analysed_bounding_boxes
780
+
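To make the merge rule above concrete: two boxes are combined when their bottom edges sit within 5 units of each other and the horizontal gap is at most combine_pixel_dist. A minimal standalone sketch of that criterion, using made-up [left, bottom, right, top] lists rather than the pdfminer LTChar objects the function itself consumes:

# Sketch of the merge criterion only; plain lists stand in for character boxes.
def boxes_should_merge(current_box, next_box, combine_pixel_dist=20):
    vertical_diff = abs(next_box[1] - current_box[1])    # compare bottom edges
    horizontal_gap = abs(next_box[0] - current_box[2])   # next box's left vs current right
    return vertical_diff <= 5 and horizontal_gap <= combine_pixel_dist

print(boxes_should_merge([0, 100, 50, 112], [60, 101, 90, 112]))  # True: same line, 10pt gap
print(boxes_should_merge([0, 100, 50, 112], [0, 80, 50, 92]))     # False: next line down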
781
+ # Function to combine OCR results into line-level results
782
+ def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
783
+ # Group OCR results into lines based on y_threshold
784
+ lines = []
785
+ current_line = []
786
+ for result in sorted(ocr_results, key=lambda x: x.top):
787
+ if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
788
+ current_line.append(result)
789
+ else:
790
+ lines.append(current_line)
791
+ current_line = [result]
792
+ if current_line:
793
+ lines.append(current_line)
794
+
795
+ # Sort each line by left position
796
+ for line in lines:
797
+ line.sort(key=lambda x: x.left)
798
+
799
+ # Flatten the sorted lines back into a single list
800
+ sorted_results = [result for line in lines for result in line]
801
+
802
+ combined_results = []
803
+ new_format_results = {}
804
+ current_line = []
805
+ current_bbox = None
806
+ line_counter = 1
807
+
808
+ def create_ocr_result_with_children(combined_results, i, current_bbox, current_line):
809
+ combined_results["text_line_" + str(i)] = {
810
+ "line": i,
811
+ 'text': current_bbox.text,
812
+ 'bounding_box': (current_bbox.left, current_bbox.top,
813
+ current_bbox.left + current_bbox.width,
814
+ current_bbox.top + current_bbox.height),
815
+ 'words': [{'text': word.text,
816
+ 'bounding_box': (word.left, word.top,
817
+ word.left + word.width,
818
+ word.top + word.height)}
819
+ for word in current_line]
820
+ }
821
+ return combined_results["text_line_" + str(i)]
822
+
823
+ for result in sorted_results:
824
+ if not current_line:
825
+ # Start a new line
826
+ current_line.append(result)
827
+ current_bbox = result
828
+ else:
829
+ # Check if the result is on the same line (y-axis) and close horizontally (x-axis)
830
+ last_result = current_line[-1]
831
+
832
+ if abs(result.top - last_result.top) <= y_threshold and \
833
+ (result.left - (last_result.left + last_result.width)) <= x_threshold:
834
+ # Update the bounding box to include the new word
835
+ new_right = max(current_bbox.left + current_bbox.width, result.left + result.width)
836
+ current_bbox = OCRResult(
837
+ text=f"{current_bbox.text} {result.text}",
838
+ left=current_bbox.left,
839
+ top=current_bbox.top,
840
+ width=new_right - current_bbox.left,
841
+ height=max(current_bbox.height, result.height)
842
+ )
843
+ current_line.append(result)
844
+ else:
845
+
846
+
847
+ # Commit the current line and start a new one
848
+ combined_results.append(current_bbox)
849
+
850
+ new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
851
+
852
+ line_counter += 1
853
+ current_line = [result]
854
+ current_bbox = result
855
+
856
+ # Append the last line
857
+ if current_bbox:
858
+ combined_results.append(current_bbox)
859
+
860
+ new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
861
+
862
+
863
+ return combined_results, new_format_results
864
 
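As a rough illustration of the y-threshold grouping above, with a hypothetical stand-in for OCRResult (the real class also carries width and height):

from dataclasses import dataclass

@dataclass
class Word:  # stand-in for OCRResult; fields assumed from the usage above
    text: str
    left: int
    top: int

def group_into_lines(words, y_threshold=12):
    lines, current = [], []
    for w in sorted(words, key=lambda w: w.top):
        if not current or abs(w.top - current[0].top) <= y_threshold:
            current.append(w)
        else:
            lines.append(current)
            current = [w]
    if current:
        lines.append(current)
    for line in lines:
        line.sort(key=lambda w: w.left)  # left-to-right within each line
    return lines

words = [Word("world", 60, 11), Word("Hello", 0, 10), Word("Next", 0, 40)]
print([[w.text for w in line] for line in group_into_lines(words)])  # [['Hello', 'world'], ['Next']]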
865
  class CustomImageAnalyzerEngine:
866
  def __init__(
 
915
  self,
916
  line_level_ocr_results: List[OCRResult],
917
  ocr_results_with_children: Dict[str, Dict],
918
+ chosen_redact_comprehend_entities: List[str],
919
+ pii_identification_method: str = "Local",
920
+ comprehend_client = "",
921
  **text_analyzer_kwargs
922
  ) -> List[CustomImageRecognizerResult]:
 
923
 
924
+ page_text = ""
925
+ page_text_mapping = []
926
+ all_text_line_results = []
927
  comprehend_query_number = 0
928
 
929
+ # Collect all text and create mapping
930
  for i, line_level_ocr_result in enumerate(line_level_ocr_results):
931
+ if page_text:
932
+ page_text += " "
933
+ start_pos = len(page_text)
934
+ page_text += line_level_ocr_result.text
935
+ # Note: We're not passing line_characters here since it's not needed for this use case
936
+ page_text_mapping.append((start_pos, i, line_level_ocr_result, None))
937
+
938
+ # Process using either Local or AWS Comprehend
939
+ if pii_identification_method == "Local":
940
+ analyzer_result = self.analyzer_engine.analyze(
941
+ text=page_text,
942
+ **text_analyzer_kwargs
943
+ )
944
+ all_text_line_results = map_back_entity_results(
945
+ analyzer_result,
946
+ page_text_mapping,
947
+ all_text_line_results
948
+ )
949
 
950
+ elif pii_identification_method == "AWS Comprehend":
951
+ # Handle custom entities first
952
+ if custom_entities:
953
+ custom_redact_entities = [
954
+ entity for entity in chosen_redact_comprehend_entities
955
+ if entity in custom_entities
956
+ ]
957
+ if custom_redact_entities:
958
+ text_analyzer_kwargs["entities"] = custom_redact_entities
959
+ page_analyser_result = self.analyzer_engine.analyze(
960
+ text=page_text,
961
+ **text_analyzer_kwargs
962
+ )
963
+ all_text_line_results = map_back_entity_results(
964
+ page_analyser_result,
965
+ page_text_mapping,
966
+ all_text_line_results
967
+ )
968
 
969
+ # Process text in batches for AWS Comprehend
970
+ current_batch = ""
971
+ current_batch_mapping = []
972
+ batch_char_count = 0
973
+ batch_word_count = 0
974
 
975
+ for i, text_line in enumerate(line_level_ocr_results):
976
+ words = text_line.text.split()
977
+ word_start_positions = []
978
+ current_pos = 0
979
 
980
+ for word in words:
981
+ word_start_positions.append(current_pos)
982
+ current_pos += len(word) + 1
983
 
984
+ for word_idx, word in enumerate(words):
985
+ new_batch_char_count = len(current_batch) + len(word) + 1
 
 
986
 
987
+ if batch_word_count >= 50 or new_batch_char_count >= 200:
988
+ # Process current batch
989
+ all_text_line_results = do_aws_comprehend_call(
990
+ current_batch,
991
+ current_batch_mapping,
992
+ comprehend_client,
993
+ text_analyzer_kwargs["language"],
994
+ text_analyzer_kwargs.get('allow_list', []),
995
+ chosen_redact_comprehend_entities,
996
+ all_text_line_results
997
+ )
998
  comprehend_query_number += 1
999
 
1000
  # Reset batch
1001
+ current_batch = word
1002
+ batch_word_count = 1
1003
+ batch_char_count = len(word)
1004
+ current_batch_mapping = [(0, i, text_line, None, word_start_positions[word_idx])]
1005
+ else:
1006
+ if current_batch:
1007
+ current_batch += " "
1008
+ batch_char_count += 1
1009
+ current_batch += word
1010
+ batch_char_count += len(word)
1011
+ batch_word_count += 1
1012
+
1013
+ if not current_batch_mapping or current_batch_mapping[-1][1] != i:
1014
+ current_batch_mapping.append((
1015
+ batch_char_count - len(word),
1016
+ i,
1017
+ text_line,
1018
+ None,
1019
+ word_start_positions[word_idx]
1020
+ ))
1021
+
1022
+ # Process final batch if any
1023
+ if current_batch:
1024
+ all_text_line_results = do_aws_comprehend_call(
1025
+ current_batch,
1026
+ current_batch_mapping,
1027
+ comprehend_client,
1028
+ text_analyzer_kwargs["language"],
1029
+ text_analyzer_kwargs.get('allow_list', []),
1030
+ chosen_redact_comprehend_entities,
1031
+ all_text_line_results
1032
+ )
1033
+ comprehend_query_number += 1
1034
 
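The flush rule above (send a batch to Comprehend once it holds 50 words or would pass 200 characters) can be sketched in isolation; process_batch below is a hypothetical stand-in for do_aws_comprehend_call, and the offset bookkeeping is omitted:

def batch_words(lines, process_batch, max_words=50, max_chars=200):
    batch, word_count = "", 0
    for text in lines:
        for word in text.split():
            if word_count >= max_words or len(batch) + len(word) + 1 >= max_chars:
                process_batch(batch)                 # flush before adding this word
                batch, word_count = word, 1
            else:
                batch = word if not batch else batch + " " + word
                word_count += 1
    if batch:
        process_batch(batch)                         # final partial batch, as above

batch_words(["one two three"] * 40, lambda b: print(len(b.split()), "words /", len(b), "chars"))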
1035
+
1036
 
1037
+ # Process results and create bounding boxes
1038
+ combined_results = []
1039
+ for i, text_line in enumerate(line_level_ocr_results):
1040
+ line_results = next((results for idx, results in all_text_line_results if idx == i), [])
1041
+ if line_results and i < len(ocr_results_with_children):
1042
  child_level_key = list(ocr_results_with_children.keys())[i]
1043
  ocr_results_with_children_line_level = ocr_results_with_children[child_level_key]
1044
+
1045
+ for result in line_results:
1046
+ bbox_results = self.map_analyzer_results_to_bounding_boxes(
1047
+ [result],
1048
+ [OCRResult(
1049
+ text=text_line.text[result.start:result.end],
1050
+ left=text_line.left,
1051
+ top=text_line.top,
1052
+ width=text_line.width,
1053
+ height=text_line.height
1054
+ )],
1055
+ text_line.text,
1056
+ text_analyzer_kwargs.get('allow_list', []),
1057
+ ocr_results_with_children_line_level
1058
  )
1059
+ combined_results.extend(bbox_results)
1060
 
1061
  return combined_results, comprehend_query_number
1062
 
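map_back_entity_results itself is not part of this diff; conceptually it walks the (start_pos, line_index, line, ...) tuples built above to translate entity spans found in the concatenated page_text back to per-line offsets. A hedged sketch of that idea:

from collections import namedtuple

Line = namedtuple("Line", "text")  # stand-in for the OCRResult stored in the mapping

def map_span_to_line(span_start, span_end, page_text_mapping):
    # Assumed behaviour: global (start, end) span -> (line index, local start, local end).
    for start_pos, line_idx, line, _ in page_text_mapping:
        line_end = start_pos + len(line.text)
        if start_pos <= span_start < line_end:
            return line_idx, span_start - start_pos, min(span_end, line_end) - start_pos
    return None

mapping = [(0, 0, Line("John Smith"), None), (11, 1, Line("lives here"), None)]
print(map_span_to_line(5, 10, mapping))  # (0, 5, 10): "Smith" inside line 0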
1063
  @staticmethod
1064
  def map_analyzer_results_to_bounding_boxes(
1065
+ text_analyzer_results: List[RecognizerResult],
1066
+ redaction_relevant_ocr_results: List[OCRResult],
1067
+ full_text: str,
1068
+ allow_list: List[str],
1069
+ ocr_results_with_children_child_info: Dict[str, Dict]
1070
+ ) -> List[CustomImageRecognizerResult]:
1071
  redaction_bboxes = []
 
1072
 
1073
  for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
1074
+ #print("ocr_results_with_children_child_info:", ocr_results_with_children_child_info)
1075
 
1076
+ line_text = ocr_results_with_children_child_info['text']
1077
+ line_length = len(line_text)
1078
+ redaction_text = redaction_relevant_ocr_result.text
1079
 
1080
+ # print(f"Processing line: '{line_text}'")
1081
+
1082
  for redaction_result in text_analyzer_results:
1083
+ # print(f"Checking redaction result: {redaction_result}")
1084
+ # print("redaction_text:", redaction_text)
1085
+ # print("line_length:", line_length)
1086
+ # print("line_text:", line_text)
1087
+
1088
+ # Check if the redaction text is not in the allow list
1089
+
1090
+ if redaction_text not in allow_list:
1091
+
1092
+ # Adjust start and end to be within line bounds
1093
+ start_in_line = max(0, redaction_result.start)
1094
+ end_in_line = min(line_length, redaction_result.end)
1095
+
1096
+ # Get the matched text from this line
1097
+ matched_text = line_text[start_in_line:end_in_line]
1098
+ matched_words = matched_text.split()
1099
+
1100
+ # print(f"Found match: '{matched_text}' in line")
1101
+
1102
+ # Find the corresponding words in the OCR results
1103
+ matching_word_boxes = []
1104
+ for word_info in ocr_results_with_children_child_info.get('words', []):
1105
+ # Check if this word is part of our match
1106
+ if any(word.lower() in word_info['text'].lower() for word in matched_words):
1107
+ matching_word_boxes.append(word_info['bounding_box'])
1108
+ # print(f"Matched word: {word_info['text']}")
1109
+
1110
+ if matching_word_boxes:
1111
+ # Calculate the combined bounding box for all matching words
1112
+ left = min(box[0] for box in matching_word_boxes)
1113
+ top = min(box[1] for box in matching_word_boxes)
1114
+ right = max(box[2] for box in matching_word_boxes)
1115
+ bottom = max(box[3] for box in matching_word_boxes)
1116
+
1117
+ redaction_bboxes.append(
1118
+ CustomImageRecognizerResult(
1119
+ entity_type=redaction_result.entity_type,
1120
+ start=start_in_line,
1121
+ end=end_in_line,
1122
+ score=redaction_result.score,
1123
+ left=left,
1124
+ top=top,
1125
+ width=right - left,
1126
+ height=bottom - top,
1127
+ text=matched_text
1128
+ )
1129
  )
1130
+ # print(f"Added bounding box for: '{matched_text}'")
 
 
1131
 
1132
  return redaction_bboxes
1133
 
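The combined redaction box is simply the envelope of the matched word boxes; with made-up [left, top, right, bottom] values:

boxes = [[10, 20, 60, 35], [65, 21, 120, 36]]  # two matched word boxes
left = min(b[0] for b in boxes)
top = min(b[1] for b in boxes)
right = max(b[2] for b in boxes)
bottom = max(b[3] for b in boxes)
print(left, top, right - left, bottom - top)   # 10 20 110 16 -> left, top, width, height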
1134
  @staticmethod
1135
  def remove_space_boxes(ocr_result: dict) -> dict:
1136
  """Remove OCR bboxes that are for spaces.
 
1137
  :param ocr_result: OCR results (raw or thresholded).
1138
  :return: OCR results with empty words removed.
1139
  """
 
1156
  ocr_result: Dict[str, List[Union[int, str]]], scale_factor: float
1157
  ) -> Dict[str, float]:
1158
  """Scale down the bounding box results based on a scale percentage.
 
1159
  :param ocr_result: OCR results (raw).
1160
  :param scale_factor: Scale factor for resizing the bounding box.
 
1161
  :return: OCR results (scaled).
1162
  """
1163
  scaled_results = deepcopy(ocr_result)
 
1204
  estimated_width = int(proportion * ocr_result.width)
1205
 
1206
  return estimated_width
 
tools/file_conversion.py CHANGED
@@ -201,7 +201,7 @@ def process_file(file_path:str, prepare_for_review:bool=False):
201
  if file_extension in ['.jpg', '.jpeg', '.png']:
202
  print(f"{file_path} is an image file.")
203
  # Perform image processing here
204
- img_object = [Image.open(file_path)]
205
  # Load images from the file paths
206
 
207
  # Check if the file is a PDF
@@ -490,6 +490,7 @@ def prepare_image_or_pdf(
490
  else:
491
  file_path = file.name
492
  file_path_without_ext = get_file_path_end(file_path)
 
493
 
494
  if not file_path:
495
  out_message = "Please select a file."
@@ -532,8 +533,13 @@ def prepare_image_or_pdf(
532
 
533
  image_file_paths = process_file(file_path_str, prepare_for_review)
534
 
535
- print("Inserted image into PDF file")
 
 
536
537
 
538
  elif file_extension in ['.csv']:
539
  review_file_csv = read_file(file)
@@ -738,6 +744,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
738
  reported_number = int(number) + 1
739
  else:
740
  print("No number found before .png")
 
741
 
742
  # Check if 'boxes' is in the annotation, if not, add an empty list
743
  if 'boxes' not in annotation:
 
201
  if file_extension in ['.jpg', '.jpeg', '.png']:
202
  print(f"{file_path} is an image file.")
203
  # Perform image processing here
204
+ img_object = [file_path] #[Image.open(file_path)]
205
  # Load images from the file paths
206
 
207
  # Check if the file is a PDF
 
490
  else:
491
  file_path = file.name
492
  file_path_without_ext = get_file_path_end(file_path)
493
+ file_name_with_ext = os.path.basename(file_path)
494
 
495
  if not file_path:
496
  out_message = "Please select a file."
 
533
 
534
  image_file_paths = process_file(file_path_str, prepare_for_review)
535
 
536
+ #print("image_file_paths:", image_file_paths)
537
+
538
+ converted_file_path = output_folder + file_name_with_ext
539
 
540
+ pymupdf_doc.save(converted_file_path)
541
+
542
+ print("Inserted image into PDF file")
543
 
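For context, wrapping an image file in a single-page PyMuPDF document before saving can be done roughly as below; a sketch assuming PyMuPDF is installed and a page.png file exists, not the app's exact conversion code:

import fitz  # PyMuPDF

doc = fitz.open()                 # new, empty PDF
img = fitz.open("page.png")       # PyMuPDF opens common image formats as documents
rect = img[0].rect                # image dimensions
page = doc.new_page(width=rect.width, height=rect.height)
page.insert_image(rect, filename="page.png")
doc.save("page_as_pdf.pdf")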
544
  elif file_extension in ['.csv']:
545
  review_file_csv = read_file(file)
 
744
  reported_number = int(number) + 1
745
  else:
746
  print("No number found before .png")
747
+ reported_number = 1
748
 
749
  # Check if 'boxes' is in the annotation, if not, add an empty list
750
  if 'boxes' not in annotation:
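The reported page number is recovered from the image filename (e.g. "..._3.png" is zero-based page 3, reported as 4), with the new fallback of 1 when no number is found. A hedged sketch of that extraction; the exact pattern used in the source is not shown in this hunk:

import re

def reported_page_number(file_name, default=1):
    match = re.search(r"(\d+)\.png$", file_name)  # digits just before .png (assumed pattern)
    return int(match.group(1)) + 1 if match else default

print(reported_page_number("doc_page_0.png"))  # 2: zero-based index reported one-based
print(reported_page_number("doc.png"))         # 1: fallback added in this commit
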
tools/file_redaction.py CHANGED
@@ -25,13 +25,13 @@ from collections import defaultdict # For efficient grouping
25
 
26
  from presidio_analyzer import RecognizerResult
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
- from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
29
  from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
31
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
33
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
34
- from tools.presidio_analyzer_custom import recognizer_result_from_dict
35
 
36
  # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
37
  page_break_value = get_or_create_env_var('page_break_value', '50000')
@@ -136,6 +136,9 @@ def choose_and_run_redactor(file_paths:List[str],
136
  tic = time.perf_counter()
137
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
138
 
139
  if isinstance(custom_recogniser_word_list, pd.DataFrame):
140
  custom_recogniser_word_list = custom_recogniser_word_list.iloc[:,0].tolist()
141
 
@@ -159,7 +162,6 @@ def choose_and_run_redactor(file_paths:List[str],
159
  elif (first_loop_state == False) & (current_loop_page == 999):
160
  current_loop_page = 0
161
 
162
-
163
  if not out_file_paths:
164
  out_file_paths = []
165
 
@@ -184,21 +186,33 @@ def choose_and_run_redactor(file_paths:List[str],
184
  combined_out_message = '\n'.join(out_message)
185
  else:
186
  combined_out_message = out_message
187
 
188
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
189
  print("Estimated total processing time:", str(estimate_total_processing_time))
190
 
191
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
192
 
193
  # If we have reached the last page, return message
194
  if current_loop_page >= number_of_pages:
195
- print("current_loop_page:", current_loop_page, "is equal to or greater than number of pages in document:", number_of_pages)
196
 
197
  # Set to a very high number so as not to mix up with subsequent file processing by the user
198
  current_loop_page = 999
199
  combined_out_message = out_message
200
 
201
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
202
 
203
  # Create allow list
204
  # If string, assume file path
@@ -221,7 +235,7 @@ def choose_and_run_redactor(file_paths:List[str],
221
  comprehend_client = ""
222
  out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."
223
  print(out_message)
224
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
225
  else:
226
  comprehend_client = ""
227
 
@@ -233,7 +247,7 @@ def choose_and_run_redactor(file_paths:List[str],
233
  textract_client = ""
234
  out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
235
  print(out_message)
236
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
237
  else:
238
  textract_client = ""
239
 
@@ -265,8 +279,9 @@ def choose_and_run_redactor(file_paths:List[str],
265
  file_path = file.name
266
 
267
  if file_path:
268
- file_path_without_ext = get_file_path_end(file_path)
269
- print("Redacting file:", file_path_without_ext)
 
270
 
271
  is_a_pdf = is_pdf(file_path) == True
272
  if is_a_pdf == False and in_redact_method == text_ocr_option:
@@ -277,16 +292,16 @@ def choose_and_run_redactor(file_paths:List[str],
277
  out_message = "No file selected"
278
  print(out_message)
279
 
280
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
281
 
282
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
283
 
284
  # Analyse and redact image-based PDF or image
285
  if is_pdf_or_image(file_path) == False:
286
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
287
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
288
 
289
- print("Redacting file " + file_path_without_ext + " as an image-based file")
290
 
291
  pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
292
  prepared_pdf_image_paths,
@@ -328,7 +343,7 @@ def choose_and_run_redactor(file_paths:List[str],
328
 
329
  if is_pdf(file_path) == False:
330
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
331
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
332
 
333
  # Analyse text-based pdf
334
  print('Redacting file as text-based PDF')
@@ -356,12 +371,12 @@ def choose_and_run_redactor(file_paths:List[str],
356
  else:
357
  out_message = "No redaction method selected"
358
  print(out_message)
359
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
360
 
361
  # If at last page, save to file
362
  if current_loop_page >= number_of_pages:
363
 
364
- print("Current page loop:", current_loop_page, "is greater or equal to number of pages:", number_of_pages)
365
  latest_file_completed += 1
366
  current_loop_page = 999
367
 
@@ -370,36 +385,43 @@ def choose_and_run_redactor(file_paths:List[str],
370
 
371
  # Save file
372
  if is_pdf(file_path) == False:
373
- out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_pdf.pdf"
374
- pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
375
 
376
  else:
377
- out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
378
- pymupdf_doc.save(out_image_file_path)
379
 
380
- out_file_paths.append(out_image_file_path)
381
 
382
  #if log_files_output_paths:
383
  # log_files_output_paths.extend(log_files_output_paths)
384
 
385
- logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
386
  all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
387
  log_files_output_paths.append(logs_output_file_name)
388
 
389
- all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
390
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
391
  out_file_paths.append(all_text_output_file_name)
392
 
393
  # Save the gradio_annotation_boxes to a JSON file
394
  try:
395
- print("Saving annotations to JSON")
396
 
397
- out_annotation_file_path = out_image_file_path + '_review_file.json'
398
  with open(out_annotation_file_path, 'w') as f:
399
  json.dump(annotations_all_pages, f)
400
  log_files_output_paths.append(out_annotation_file_path)
401
 
402
- print("Saving annotations to CSV")
403
 
404
  # Convert json to csv and also save this
405
  #print("annotations_all_pages:", annotations_all_pages)
@@ -407,14 +429,14 @@ def choose_and_run_redactor(file_paths:List[str],
407
 
408
  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
409
 
410
- out_review_file_file_path = out_image_file_path + '_review_file.csv'
411
- review_df.to_csv(out_review_file_file_path, index=None)
412
- out_file_paths.append(out_review_file_file_path)
413
 
414
  print("Saved review file to csv")
415
 
416
  except Exception as e:
417
- print("Could not save annotations to json file:", e)
418
 
419
  # Make a combined message for the file
420
  if isinstance(out_message, list):
@@ -429,7 +451,7 @@ def choose_and_run_redactor(file_paths:List[str],
429
  combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
430
 
431
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
432
- print("Estimated total processing time:", str(estimate_total_processing_time))
433
 
434
  else:
435
  toc = time.perf_counter()
@@ -441,7 +463,7 @@ def choose_and_run_redactor(file_paths:List[str],
441
  if all_request_metadata:
442
  all_request_metadata_str = '\n'.join(all_request_metadata).strip()
443
 
444
- all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"
445
 
446
  with open(all_request_metadata_file_path, "w") as f:
447
  f.write(all_request_metadata_str)
@@ -456,10 +478,15 @@ def choose_and_run_redactor(file_paths:List[str],
456
 
457
  # Ensure no duplicated output files
458
  log_files_output_paths = list(set(log_files_output_paths))
459
- out_file_paths = list(set(out_file_paths))
 
460
 
461
 
462
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
 
463
 
464
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
465
  '''
@@ -930,14 +957,7 @@ def redact_image_pdf(file_path:str,
930
  nlp_analyser.registry.remove_recognizer("CUSTOM")
931
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
932
  #print("new_custom_recogniser:", new_custom_recogniser)
933
- nlp_analyser.registry.add_recognizer(new_custom_recogniser)
934
-
935
- # List all elements currently in the nlp_analyser registry
936
- #print("Current recognizers in nlp_analyser registry:")
937
- for recognizer_name in nlp_analyser.registry.recognizers:
938
- print(recognizer_name)
939
-
940
-
941
 
942
 
943
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
@@ -1031,7 +1051,7 @@ def redact_image_pdf(file_path:str,
1031
 
1032
  #print("Image is in range of pages to redact")
1033
  if isinstance(image, str):
1034
- #print("image is a file path")
1035
  image = Image.open(image)
1036
 
1037
  # Need image size to convert textract OCR outputs to the correct sizes
@@ -1137,7 +1157,7 @@ def redact_image_pdf(file_path:str,
1137
  all_image_annotations_boxes = []
1138
 
1139
  for box in merged_redaction_bboxes:
1140
- print("box:", box)
1141
 
1142
  x0 = box.left
1143
  y0 = box.top
@@ -1299,6 +1319,8 @@ def get_text_container_characters(text_container:LTTextContainer):
1299
  for line in text_container
1300
  if isinstance(line, LTTextLine) or isinstance(line, LTTextLineHorizontal)
1301
  for char in line]
 
 
1302
 
1303
  return characters
1304
  return []
@@ -1312,6 +1334,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1312
  line_level_characters_out = []
1313
  #all_line_level_characters_out = []
1314
  character_objects_out = [] # New list to store character objects
 
1315
 
1316
  # Initialize variables
1317
  full_text = ""
@@ -1326,12 +1349,19 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1326
  for char in char_objects:
1327
  character_objects_out.append(char) # Collect character objects
1328
1329
  if isinstance(char, LTAnno):
1330
1331
  added_text = char.get_text()
1332
 
1333
  # Handle double quotes
1334
- added_text = added_text.replace('"', '\\"') # Escape double quotes
1335
 
1336
  # Handle space separately by finalizing the word
1337
  full_text += added_text # Adds space or newline
@@ -1348,7 +1378,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1348
  if current_word:
1349
  word_bboxes.append((current_word, current_word_bbox))
1350
  # Create an OCRResult for the current line
1351
- line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
1352
  line_level_characters_out.append(character_objects_out)
1353
  # Reset for the next line
1354
  character_objects_out = []
@@ -1396,119 +1426,15 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1396
  # Convert special characters to a human-readable format
1397
  #full_text = full_text.encode('latin1', errors='replace').decode('utf-8')
1398
  full_text = clean_unicode_text(full_text)
 
1399
  #print("full_text:", full_text)
1400
 
1401
- line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
1402
 
1403
  #line_level_characters_out = character_objects_out
1404
 
1405
  return line_level_results_out, line_level_characters_out # Return both results and character objects
1406
 
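The OCRResult rows built in this function derive left/top/width/height from the accumulated (x0, y0, x1, y1) line bbox with two-decimal rounding; numerically, with made-up values:

overall_bbox = (72.0, 701.25, 243.118, 713.4)  # made-up (x0, y0, x1, y1)
left, top = round(overall_bbox[0], 2), round(overall_bbox[1], 2)
width = round(overall_bbox[2] - overall_bbox[0], 2)
height = round(overall_bbox[3] - overall_bbox[1], 2)
print(left, top, width, height)  # 72.0 701.25 171.12 12.15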
1407
- def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
1408
- '''
1409
- Merge identified bounding boxes containing PII that are very close to one another
1410
- '''
1411
- analysed_bounding_boxes = []
1412
- original_bounding_boxes = [] # List to hold original bounding boxes
1413
-
1414
- if len(analyser_results) > 0 and len(characters) > 0:
1415
- # Extract bounding box coordinates for sorting
1416
- bounding_boxes = []
1417
- for result in analyser_results:
1418
- #print("Result:", result)
1419
- char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1420
- char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1421
- if char_boxes:
1422
- # Calculate the bounding box that encompasses all characters
1423
- left = min(box[0] for box in char_boxes)
1424
- bottom = min(box[1] for box in char_boxes)
1425
- right = max(box[2] for box in char_boxes)
1426
- top = max(box[3] for box in char_boxes) + vertical_padding
1427
- bbox = [left, bottom, right, top]
1428
- bounding_boxes.append((bottom, left, result, bbox, char_text)) # (y, x, result, bbox, text)
1429
-
1430
- # Store original bounding boxes
1431
- original_bounding_boxes.append({"text": "".join(char_text), "boundingBox": bbox, "result": copy.deepcopy(result)})
1432
- #print("Original bounding boxes:", original_bounding_boxes)
1433
-
1434
- # Sort the results by y-coordinate and then by x-coordinate
1435
- bounding_boxes.sort()
1436
-
1437
- merged_bounding_boxes = []
1438
- current_box = None
1439
- current_y = None
1440
- current_result = None
1441
- current_text = []
1442
-
1443
- for y, x, result, next_box, text in bounding_boxes:
1444
- if current_y is None or current_box is None:
1445
- # Initialize the first bounding box
1446
- current_box = next_box
1447
- current_y = next_box[1]
1448
- current_result = result
1449
- current_text = list(text)
1450
- else:
1451
- vertical_diff_bboxes = abs(next_box[1] - current_y)
1452
- horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
1453
-
1454
- if vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist:
1455
- # Merge bounding boxes
1456
- #print("Merging boxes")
1457
- merged_box = current_box.copy()
1458
- merged_result = current_result
1459
- merged_text = current_text.copy()
1460
-
1461
- #print("current_box_max_x:", current_box[2])
1462
- #print("char_max_x:", next_box[2])
1463
-
1464
- merged_box[2] = next_box[2] # Extend horizontally
1465
- merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
1466
- merged_result.end = max(current_result.end, result.end) # Extend text range
1467
- try:
1468
- if current_result.entity_type != result.entity_type:
1469
- merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
1470
- else:
1471
- merged_result.entity_type = current_result.entity_type
1472
- except Exception as e:
1473
- print("Unable to combine result entity types:", e)
1474
- if current_text:
1475
- merged_text.append(" ") # Add space between texts
1476
- merged_text.extend(text)
1477
-
1478
- merged_bounding_boxes.append({
1479
- "text": "".join(merged_text),
1480
- "boundingBox": merged_box,
1481
- "result": merged_result
1482
- })
1483
-
1484
- else:
1485
- # Save the current merged box before starting a new one
1486
- # merged_bounding_boxes.append({
1487
- # "text": "".join(current_text),
1488
- # "boundingBox": current_box,
1489
- # "result": current_result
1490
- # })
1491
- # Start a new bounding box
1492
- current_box = next_box
1493
- current_y = next_box[1]
1494
- current_result = result
1495
- current_text = list(text)
1496
-
1497
- # Handle the last box
1498
- # if current_box is not None:
1499
- # merged_bounding_boxes.append({
1500
- # "text": "".join(current_text),
1501
- # "boundingBox": current_box,
1502
- # "result": current_result
1503
- # })
1504
-
1505
- # Combine original and merged bounding boxes
1506
- analysed_bounding_boxes.extend(original_bounding_boxes)
1507
- analysed_bounding_boxes.extend(merged_bounding_boxes)
1508
-
1509
- #print("Analysed bounding boxes:", analysed_bounding_boxes)
1510
-
1511
- return analysed_bounding_boxes
1512
 
1513
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1514
  decision_process_table = pd.DataFrame()
@@ -1559,6 +1485,182 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1559
  pikepdf_annotations_on_page.append(annotation)
1560
  return pikepdf_annotations_on_page
1561
1562
  def redact_text_pdf(
1563
  filename: str, # Path to the PDF file to be redacted
1564
  prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
@@ -1681,173 +1783,64 @@ def redact_text_pdf(
1681
 
1682
  for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
1683
1684
  page_analyser_results = []
1685
  page_analysed_bounding_boxes = []
1686
 
1687
  characters = []
1688
  pikepdf_annotations_on_page = []
1689
  decision_process_table_on_page = pd.DataFrame()
1690
- page_text_outputs = pd.DataFrame()
1691
 
1692
  if analysis_type == text_ocr_option:
1693
  for n, text_container in enumerate(page_layout):
1694
-
1695
- text_container_analyser_results = []
1696
- text_container_analysed_bounding_boxes = []
1697
  characters = []
1698
 
 
 
1699
  if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
1700
  characters = get_text_container_characters(text_container)
1701
 
1702
  # Create dataframe for all the text on the page
1703
  line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)
1704
 
1705
- # Create page_text_outputs (OCR format outputs)
1706
  if line_level_text_results_list:
1707
  # Convert to DataFrame and add to ongoing logging table
1708
  line_level_text_results_df = pd.DataFrame([{
1709
  'page': page_no + 1,
1710
- 'text': result.text,
1711
  'left': result.left,
1712
  'top': result.top,
1713
  'width': result.width,
1714
  'height': result.height
1715
  } for result in line_level_text_results_list])
1716
 
1717
- page_text_outputs = pd.concat([page_text_outputs, line_level_text_results_df])
1718
-
1719
- # Initialize batching variables
1720
- current_batch = ""
1721
- current_batch_mapping = [] # List of (start_pos, line_index, OCRResult) tuples
1722
- all_text_line_results = [] # Store results for all lines
1723
-
1724
- # First pass: collect all lines into batches
1725
- for i, text_line in enumerate(line_level_text_results_list):
1726
- if chosen_redact_entities:
1727
- if pii_identification_method == "Local":
1728
-
1729
- #print("chosen_redact_entities:", chosen_redact_entities)
1730
-
1731
- # Process immediately for local analysis
1732
- text_line_analyser_result = nlp_analyser.analyze(
1733
- text=text_line.text,
1734
- language=language,
1735
- entities=chosen_redact_entities,
1736
- score_threshold=score_threshold,
1737
- return_decision_process=True,
1738
- allow_list=allow_list
1739
- )
1740
- all_text_line_results.append((i, text_line_analyser_result))
1741
-
1742
-
1743
- elif pii_identification_method == "AWS Comprehend":
1744
-
1745
- # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
1746
- custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
1747
-
1748
-
1749
- text_line_analyser_result = nlp_analyser.analyze(
1750
- text=text_line.text,
1751
- language=language,
1752
- entities=custom_redact_entities,
1753
- score_threshold=score_threshold,
1754
- return_decision_process=True,
1755
- allow_list=allow_list
1756
- )
1757
- all_text_line_results.append((i, text_line_analyser_result))
1758
-
1759
-
1760
- if len(text_line.text) >= 3:
1761
- # Add separator between lines
1762
- if current_batch:
1763
- current_batch += " | "
1764
-
1765
- start_pos = len(current_batch)
1766
- current_batch += text_line.text
1767
- current_batch_mapping.append((start_pos, i, text_line))
1768
-
1769
- # Process batch if approaching 300 characters or last line
1770
- if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
1771
- print("length of text for Comprehend:", len(current_batch))
1772
-
1773
- try:
1774
- response = comprehend_client.detect_pii_entities(
1775
- Text=current_batch,
1776
- LanguageCode=language
1777
- )
1778
- except Exception as e:
1779
- print(e)
1780
- time.sleep(3)
1781
- response = comprehend_client.detect_pii_entities(
1782
- Text=current_batch,
1783
- LanguageCode=language
1784
- )
1785
-
1786
- comprehend_query_number += 1
1787
-
1788
- # Process response and map back to original lines
1789
- if response and "Entities" in response:
1790
- for entity in response["Entities"]:
1791
- entity_start = entity["BeginOffset"]
1792
- entity_end = entity["EndOffset"]
1793
-
1794
- # Find which line this entity belongs to
1795
- for batch_start, line_idx, original_line in current_batch_mapping:
1796
- batch_end = batch_start + len(original_line.text)
1797
-
1798
- # Check if entity belongs to this line
1799
- if batch_start <= entity_start < batch_end:
1800
- # Adjust offsets relative to original line
1801
- relative_start = entity_start - batch_start
1802
- relative_end = min(entity_end - batch_start, len(original_line.text))
1803
-
1804
- result_text = original_line.text[relative_start:relative_end]
1805
-
1806
- if result_text not in allow_list:
1807
- if entity.get("Type") in chosen_redact_comprehend_entities:
1808
- # Create adjusted entity
1809
- adjusted_entity = entity.copy()
1810
- adjusted_entity["BeginOffset"] = relative_start
1811
- adjusted_entity["EndOffset"] = relative_end
1812
-
1813
- recogniser_entity = recognizer_result_from_dict(adjusted_entity)
1814
-
1815
- # Add to results for this line
1816
- existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
1817
- if not existing_results:
1818
- all_text_line_results.append((line_idx, [recogniser_entity]))
1819
- else:
1820
- existing_results.append(recogniser_entity)
1821
-
1822
- # Reset batch
1823
- current_batch = ""
1824
- current_batch_mapping = []
1825
-
1826
- # Second pass: process results for each line
1827
- for i, text_line in enumerate(line_level_text_results_list):
1828
- text_line_analyser_result = []
1829
- text_line_bounding_boxes = []
1830
-
1831
- # Get results for this line
1832
- line_results = next((results for idx, results in all_text_line_results if idx == i), [])
1833
-
1834
- if line_results:
1835
- text_line_analyser_result = line_results
1836
-
1837
- #print("Analysed text container, now merging bounding boxes")
1838
-
1839
- # Merge bounding boxes if very close together
1840
- text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
1841
-
1842
- #print("merged bounding boxes")
1843
-
1844
- text_container_analyser_results.extend(text_line_analyser_result)
1845
- text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1846
-
1847
- #print("text_container_analyser_results:", text_container_analyser_results)
1848
-
1849
- page_analyser_results.extend(text_container_analyser_results) # Add this line
1850
- page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
1851
 
1852
 
1853
  #print("page_analyser_results:", page_analyser_results)
@@ -1879,17 +1872,18 @@ def redact_text_pdf(
1879
  reported_page_no = page_no + 1
1880
  print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
1881
 
1882
  # Write logs
1883
  # Create decision process table
1884
  decision_process_table_on_page = create_text_redaction_process_results(page_analyser_results, page_analysed_bounding_boxes, current_loop_page)
1885
 
1886
  if not decision_process_table_on_page.empty:
1887
  all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
1888
- #print("all_decision_process_table:", all_decision_process_table)
1889
-
1890
- if not page_text_outputs.empty:
1891
- page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
1892
- all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, page_text_outputs])
1893
 
1894
  toc = time.perf_counter()
1895
 
 
25
 
26
  from presidio_analyzer import RecognizerResult
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
+ from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
29
  from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
31
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
33
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
34
+ from tools.presidio_analyzer_custom import recognizer_result_from_dict
35
 
36
  # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
37
  page_break_value = get_or_create_env_var('page_break_value', '50000')
 
136
  tic = time.perf_counter()
137
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
138
 
139
+ print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
140
+ review_out_file_paths = [prepared_pdf_file_paths[0]]
141
+
142
  if isinstance(custom_recogniser_word_list, pd.DataFrame):
143
  custom_recogniser_word_list = custom_recogniser_word_list.iloc[:,0].tolist()
144
 
 
162
  elif (first_loop_state == False) & (current_loop_page == 999):
163
  current_loop_page = 0
164
 
 
165
  if not out_file_paths:
166
  out_file_paths = []
167
 
 
186
  combined_out_message = '\n'.join(out_message)
187
  else:
188
  combined_out_message = out_message
189
+
190
+ if len(review_out_file_paths) == 1:
191
+
192
+ out_review_file_path = [x for x in out_file_paths if "review_file" in x]
193
+
194
+ review_out_file_paths.extend(out_review_file_path)
195
 
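The review bundle sent back to the UI from the redact button is the prepared source PDF plus any output whose name contains "review_file"; in miniature, with made-up paths:

out_file_paths = ["out/doc.pdf_ocr_output.csv", "out/doc.pdf_review_file.csv"]
review_out_file_paths = ["out/doc_prepared.pdf"]  # prepared PDF always comes first
review_out_file_paths.extend(x for x in out_file_paths if "review_file" in x)
print(review_out_file_paths)  # ['out/doc_prepared.pdf', 'out/doc.pdf_review_file.csv']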
196
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
197
  print("Estimated total processing time:", str(estimate_total_processing_time))
198
 
199
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
200
 
201
  # If we have reached the last page, return message
202
  if current_loop_page >= number_of_pages:
203
+ print("Reached last page of document:", current_loop_page)
204
 
205
  # Set to a very high number so as not to mix up with subsequent file processing by the user
206
  current_loop_page = 999
207
  combined_out_message = out_message
208
 
209
+ if len(review_out_file_paths) == 1:
210
+
211
+ out_review_file_path = [x for x in out_file_paths if "review_file" in x]
212
+
213
+ review_out_file_paths.extend(out_review_file_path)
214
+
215
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
216
 
217
  # Create allow list
218
  # If string, assume file path
 
235
  comprehend_client = ""
236
  out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."
237
  print(out_message)
238
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
239
  else:
240
  comprehend_client = ""
241
 
 
247
  textract_client = ""
248
  out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
249
  print(out_message)
250
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
251
  else:
252
  textract_client = ""
253
 
 
279
  file_path = file.name
280
 
281
  if file_path:
282
+ pdf_file_name_without_ext = get_file_path_end(file_path)
283
+ pdf_file_name_with_ext = os.path.basename(file_path)
284
+ print("Redacting file:", pdf_file_name_with_ext)
285
 
286
  is_a_pdf = is_pdf(file_path) == True
287
  if is_a_pdf == False and in_redact_method == text_ocr_option:
 
292
  out_message = "No file selected"
293
  print(out_message)
294
 
295
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
296
 
297
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
298
 
299
  # Analyse and redact image-based PDF or image
300
  if is_pdf_or_image(file_path) == False:
301
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
302
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
303
 
304
+ print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
305
 
306
  pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
307
  prepared_pdf_image_paths,
 
343
 
344
  if is_pdf(file_path) == False:
345
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
346
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
347
 
348
  # Analyse text-based pdf
349
  print('Redacting file as text-based PDF')
 
371
  else:
372
  out_message = "No redaction method selected"
373
  print(out_message)
374
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
375
 
376
  # If at last page, save to file
377
  if current_loop_page >= number_of_pages:
378
 
379
+ print("Current page loop:", current_loop_page, "is the last page.")
380
  latest_file_completed += 1
381
  current_loop_page = 999
382
 
 
385
 
386
  # Save file
387
  if is_pdf(file_path) == False:
388
+ out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
389
+ #pymupdf_doc[0].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)
390
+ #print("pymupdf_doc", pymupdf_doc)
391
+ #print("pymupdf_doc[0]", pymupdf_doc[0])
392
+ pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
393
+ out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
394
 
395
  else:
396
+ out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
397
+ pymupdf_doc.save(out_redacted_pdf_file_path)
398
 
399
+ out_file_paths.append(out_redacted_pdf_file_path)
400
 
401
  #if log_files_output_paths:
402
  # log_files_output_paths.extend(log_files_output_paths)
403
 
404
+
405
+ out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
406
+
407
+ logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
408
  all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
409
  log_files_output_paths.append(logs_output_file_name)
410
 
411
+ all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
412
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
413
  out_file_paths.append(all_text_output_file_name)
414
 
415
  # Save the gradio_annotation_boxes to a JSON file
416
  try:
417
+ #print("Saving annotations to JSON")
418
 
419
+ out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
420
  with open(out_annotation_file_path, 'w') as f:
421
  json.dump(annotations_all_pages, f)
422
  log_files_output_paths.append(out_annotation_file_path)
423
 
424
+ #print("Saving annotations to CSV")
425
 
426
  # Convert json to csv and also save this
427
  #print("annotations_all_pages:", annotations_all_pages)
 
429
 
430
  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
431
 
432
+ out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
433
+ review_df.to_csv(out_review_file_path, index=None)
434
+ out_file_paths.append(out_review_file_path)
435
 
436
  print("Saved review file to csv")
437
 
438
  except Exception as e:
439
+ print("Could not save annotations to json or csv file:", e)
440
 
441
  # Make a combined message for the file
442
  if isinstance(out_message, list):
 
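The non-PDF branch earlier in this hunk relies on Pillow's ability to write a PIL image straight to a single-page PDF, which is what the save call with the "PDF" format string is doing. A minimal sketch of that pattern (file names and DPI here are illustrative, not taken from the app):

    from PIL import Image

    # Hypothetical input: one redacted page held as a PIL image.
    page_image = Image.open("redacted_page.png").convert("RGB")

    # Pillow's PDF writer turns the image into a one-page PDF.
    # save_all=False writes just this frame; append_images=[...] would add
    # further pages, mirroring the commented-out append_images argument above.
    page_image.save("redacted_as_pdf.pdf", "PDF", resolution=100.0, save_all=False)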
451
  combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
452
 
453
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
454
+ #print("Estimated total processing time:", str(estimate_total_processing_time))
455
 
456
  else:
457
  toc = time.perf_counter()
 
463
  if all_request_metadata:
464
  all_request_metadata_str = '\n'.join(all_request_metadata).strip()
465
 
466
+ all_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_request_metadata.txt"
467
 
468
  with open(all_request_metadata_file_path, "w") as f:
469
  f.write(all_request_metadata_str)
 
478
 
479
  # Ensure no duplicated output files
480
  log_files_output_paths = list(set(log_files_output_paths))
481
+ out_file_paths = list(set(out_file_paths))
482
+ review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
483
 
484
+ #print("log_files_output_paths:", log_files_output_paths)
485
+ #print("out_file_paths:", out_file_paths)
486
+ #print("review_out_file_paths:", review_out_file_paths)
487
 
488
+
489
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
490
 
491
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
492
  '''
 
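convert_pikepdf_coords_to_pymupdf, declared above, bridges the two coordinate systems in play: pikepdf rectangles are in PDF user space with the origin at the bottom-left, while PyMuPDF measures from the top-left. The core of such a conversion is a vertical flip against the page height; a sketch of the usual formula (not necessarily the function's exact body):

    def flip_bbox_to_pymupdf(page_height, bbox):
        """Map a PDF user-space bbox (origin bottom-left) to PyMuPDF's
        top-left origin by mirroring the y values against the page height."""
        x0, y0, x1, y1 = bbox
        return (x0, page_height - y1, x1, page_height - y0)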
957
  nlp_analyser.registry.remove_recognizer("CUSTOM")
958
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
959
  #print("new_custom_recogniser:", new_custom_recogniser)
960
+ nlp_analyser.registry.add_recognizer(new_custom_recogniser)
961
 
962
 
963
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
 
1051
 
1052
  #print("Image is in range of pages to redact")
1053
  if isinstance(image, str):
1054
+ print("Image is a file path:", image)
1055
  image = Image.open(image)
1056
 
1057
  # Need image size to convert textract OCR outputs to the correct sizes
 
1157
  all_image_annotations_boxes = []
1158
 
1159
  for box in merged_redaction_bboxes:
1160
+ #print("box:", box)
1161
 
1162
  x0 = box.left
1163
  y0 = box.top
 
1319
  for line in text_container
1320
  if isinstance(line, LTTextLine) or isinstance(line, LTTextLineHorizontal)
1321
  for char in line]
1322
+
1323
+ #print("Initial characters:", characters)
1324
 
1325
  return characters
1326
  return []
 
1334
  line_level_characters_out = []
1335
  #all_line_level_characters_out = []
1336
  character_objects_out = [] # New list to store character objects
1337
+ # character_text_objects_out = []
1338
 
1339
  # Initialize variables
1340
  full_text = ""
 
1349
  for char in char_objects:
1350
  character_objects_out.append(char) # Collect character objects
1351
 
1352
+ if not isinstance(char, LTAnno):
1353
+ character_text = char.get_text()
1354
+ # character_text_objects_out.append(character_text)
1355
+
1356
  if isinstance(char, LTAnno):
1357
 
1358
+ # print("Character line:", "".join(character_text_objects_out))
1359
+ # print("Char is an annotation object:", char)
1360
+
1361
  added_text = char.get_text()
1362
 
1363
  # Handle double quotes
1364
+ #added_text = added_text.replace('"', '\\"') # Escape double quotes
1365
 
1366
  # Handle space separately by finalizing the word
1367
  full_text += added_text # Adds space or newline
 
1378
  if current_word:
1379
  word_bboxes.append((current_word, current_word_bbox))
1380
  # Create an OCRResult for the current line
1381
+ line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
1382
  line_level_characters_out.append(character_objects_out)
1383
  # Reset for the next line
1384
  character_objects_out = []
 
1426
  # Convert special characters to a human-readable format
1427
  #full_text = full_text.encode('latin1', errors='replace').decode('utf-8')
1428
  full_text = clean_unicode_text(full_text)
1429
+ full_text = full_text.strip()
1430
  #print("full_text:", full_text)
1431
 
1432
+ line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
1433
 
1434
  #line_level_characters_out = character_objects_out
1435
 
1436
  return line_level_results_out, line_level_characters_out # Return both results and character objects
1437
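The .strip() calls added above are what remove the trailing line breaks from the OCR text outputs. As a rough standalone illustration of the same idea, here is a sketch that walks a page with pdfminer.six and collects line text with end-of-line characters stripped (the function and file names are hypothetical):

    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTAnno, LTChar, LTTextContainer

    def stripped_line_texts(pdf_path):
        """Collect per-line text with trailing newlines removed."""
        lines = []
        for page_layout in extract_pages(pdf_path):
            for container in page_layout:
                if not isinstance(container, LTTextContainer):
                    continue
                for line in container:
                    if not hasattr(line, "__iter__"):
                        continue
                    text = "".join(ch.get_text() for ch in line
                                   if isinstance(ch, (LTChar, LTAnno)))
                    lines.append(text.strip())  # drop the trailing "\n" pdfminer keeps
        return lines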
 
1438
 
1439
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1440
  decision_process_table = pd.DataFrame()
 
1485
  pikepdf_annotations_on_page.append(annotation)
1486
  return pikepdf_annotations_on_page
1487
 
1488
+ # def run_page_text_redaction(language: str, # Language of the PDF content
1489
+ # chosen_redact_entities: List[str], # List of entities to be redacted
1490
+ # chosen_redact_comprehend_entities: List[str],
1491
+ # line_level_text_results_list: List[str],
1492
+ # line_characters: List,
1493
+ # page_analyser_results: List = [],
1494
+ # page_analysed_bounding_boxes: List = [],
1495
+ # comprehend_client = None, # Connection to AWS Comprehend
1496
+ # allow_list: List[str] = None, # Optional list of allowed entities
1497
+ # pii_identification_method: str = "Local"
1498
+ # ):
1499
+
1500
+ # # Initialize batching variables
1501
+ # current_batch = ""
1502
+ # current_batch_mapping = [] # List of (start_pos, line_index, OCRResult) tuples
1503
+ # all_text_line_results = [] # Store results for all lines
1504
+ # text_container_analyser_results = []
1505
+ # text_container_analysed_bounding_boxes = []
1506
+
1507
+ # # First pass: collect all lines into batches
1508
+ # for i, text_line in enumerate(line_level_text_results_list):
1509
+ # if chosen_redact_entities:
1510
+ # if pii_identification_method == "Local":
1511
+
1512
+ # #print("chosen_redact_entities:", chosen_redact_entities)
1513
+
1514
+ # # Process immediately for local analysis
1515
+ # text_line_analyser_result = nlp_analyser.analyze(
1516
+ # text=text_line.text,
1517
+ # language=language,
1518
+ # entities=chosen_redact_entities,
1519
+ # score_threshold=score_threshold,
1520
+ # return_decision_process=True,
1521
+ # allow_list=allow_list
1522
+ # )
1523
+ # all_text_line_results.append((i, text_line_analyser_result))
1524
+
1525
+
1526
+ # elif pii_identification_method == "AWS Comprehend":
1527
+
1528
+ # # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
1529
+ # custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
1530
+
1531
+
1532
+ # text_line_analyser_result = nlp_analyser.analyze(
1533
+ # text=text_line.text,
1534
+ # language=language,
1535
+ # entities=custom_redact_entities,
1536
+ # score_threshold=score_threshold,
1537
+ # return_decision_process=True,
1538
+ # allow_list=allow_list
1539
+ # )
1540
+ # all_text_line_results.append((i, text_line_analyser_result))
1541
+
1542
+
1543
+ # if len(text_line.text) >= 3:
1544
+ # # Add separator between lines
1545
+ # if current_batch:
1546
+ # current_batch += " | "
1547
+
1548
+ # start_pos = len(current_batch)
1549
+ # current_batch += text_line.text
1550
+ # current_batch_mapping.append((start_pos, i, text_line))
1551
+
1552
+ # # Process batch if approaching 300 characters or last line
1553
+ # if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
1554
+ # print("length of text for Comprehend:", len(current_batch))
1555
+
1556
+ # try:
1557
+ # response = comprehend_client.detect_pii_entities(
1558
+ # Text=current_batch,
1559
+ # LanguageCode=language
1560
+ # )
1561
+ # except Exception as e:
1562
+ # print(e)
1563
+ # time.sleep(3)
1564
+ # response = comprehend_client.detect_pii_entities(
1565
+ # Text=current_batch,
1566
+ # LanguageCode=language
1567
+ # )
1568
+
1569
+ # comprehend_query_number += 1
1570
+
1571
+ # # Process response and map back to original lines
1572
+ # if response and "Entities" in response:
1573
+ # for entity in response["Entities"]:
1574
+ # entity_start = entity["BeginOffset"]
1575
+ # entity_end = entity["EndOffset"]
1576
+
1577
+ # # Find which line this entity belongs to
1578
+ # for batch_start, line_idx, original_line in current_batch_mapping:
1579
+ # batch_end = batch_start + len(original_line.text)
1580
+
1581
+ # # Check if entity belongs to this line
1582
+ # if batch_start <= entity_start < batch_end:
1583
+ # # Adjust offsets relative to original line
1584
+ # relative_start = entity_start - batch_start
1585
+ # relative_end = min(entity_end - batch_start, len(original_line.text))
1586
+
1587
+ # result_text = original_line.text[relative_start:relative_end]
1588
+
1589
+ # if result_text not in allow_list:
1590
+ # if entity.get("Type") in chosen_redact_comprehend_entities:
1591
+ # # Create adjusted entity
1592
+ # adjusted_entity = entity.copy()
1593
+ # adjusted_entity["BeginOffset"] = relative_start
1594
+ # adjusted_entity["EndOffset"] = relative_end
1595
+
1596
+ # recogniser_entity = recognizer_result_from_dict(adjusted_entity)
1597
+
1598
+ # # Add to results for this line
1599
+ # existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
1600
+ # if not existing_results:
1601
+ # all_text_line_results.append((line_idx, [recogniser_entity]))
1602
+ # else:
1603
+ # existing_results.append(recogniser_entity)
1604
+
1605
+ # # Reset batch
1606
+ # current_batch = ""
1607
+ # current_batch_mapping = []
1608
+
1609
+ # # Second pass: process results for each line
1610
+ # for i, text_line in enumerate(line_level_text_results_list):
1611
+ # text_line_analyser_result = []
1612
+ # text_line_bounding_boxes = []
1613
+
1614
+ # # Get results for this line
1615
+ # line_results = next((results for idx, results in all_text_line_results if idx == i), [])
1616
+
1617
+ # if line_results:
1618
+ # text_line_analyser_result = line_results
1619
+
1620
+ # #print("Analysed text container, now merging bounding boxes")
1621
+
1622
+ # # Merge bounding boxes if very close together
1623
+ # text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
1624
+
1625
+ # #print("merged bounding boxes")
1626
+
1627
+ # text_container_analyser_results.extend(text_line_analyser_result)
1628
+ # #text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1629
+
1630
+ # #print("text_container_analyser_results:", text_container_analyser_results)
1631
+
1632
+ # page_analyser_results.extend(text_container_analyser_results) # Add this line
1633
+ # page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
1634
+
1635
+ # return page_analysed_bounding_boxes
1636
+
1637
+ # def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
1638
+ # for entity in page_analyser_result:
1639
+ # entity_start = entity.start
1640
+ # entity_end = entity.end
1641
+
1642
+ # for batch_start, line_idx, original_line, chars in page_text_mapping:
1643
+ # batch_end = batch_start + len(original_line.text)
1644
+
1645
+ # if batch_start <= entity_start < batch_end:
1646
+ # relative_start = entity_start - batch_start
1647
+ # relative_end = min(entity_end - batch_start, len(original_line.text))
1648
+
1649
+ # adjusted_entity = copy.deepcopy(entity)
1650
+ # adjusted_entity.start = relative_start
1651
+ # adjusted_entity.end = relative_end
1652
+
1653
+ # existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
1654
+
1655
+ # if existing_entry is None:
1656
+ # all_text_line_results.append((line_idx, [adjusted_entity]))
1657
+ # else:
1658
+ # existing_entry.append(adjusted_entity)
1659
+ # break
1660
+
1661
+ # return all_text_line_results
1662
+
1663
+
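The batching logic commented out above (now handled by run_page_text_redaction, called further down) joins short lines into one Comprehend request and then maps the returned entity offsets back onto the individual lines. A condensed sketch of that offset arithmetic, assuming entities shaped like the detect_pii_entities response:

    def map_entities_to_lines(entities, batch_mapping):
        """batch_mapping holds (start_pos, line_idx, line_text) for each line
        that was concatenated into the batch sent to AWS Comprehend."""
        per_line = {}
        for entity in entities:
            start, end = entity["BeginOffset"], entity["EndOffset"]
            for batch_start, line_idx, line_text in batch_mapping:
                batch_end = batch_start + len(line_text)
                if batch_start <= start < batch_end:
                    adjusted = dict(entity)
                    # Re-express offsets relative to the original line
                    adjusted["BeginOffset"] = start - batch_start
                    adjusted["EndOffset"] = min(end - batch_start, len(line_text))
                    per_line.setdefault(line_idx, []).append(adjusted)
                    break
        return per_line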
1664
  def redact_text_pdf(
1665
  filename: str, # Path to the PDF file to be redacted
1666
  prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
 
1783
 
1784
  for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
1785
 
1786
+ all_line_characters = []
1787
+ all_line_level_text_results_list = []
1788
  page_analyser_results = []
1789
  page_analysed_bounding_boxes = []
1790
 
1791
  characters = []
1792
  pikepdf_annotations_on_page = []
1793
  decision_process_table_on_page = pd.DataFrame()
1794
+ page_text_ocr_outputs = pd.DataFrame()
1795
 
1796
  if analysis_type == text_ocr_option:
1797
  for n, text_container in enumerate(page_layout):
1798
+
1799
  characters = []
1800
 
1801
+ #print("text container:", text_container)
1802
+
1803
  if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
1804
  characters = get_text_container_characters(text_container)
1805
 
1806
  # Create dataframe for all the text on the page
1807
  line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)
1808
 
1809
+ ### Create page_text_ocr_outputs (OCR format outputs)
1810
  if line_level_text_results_list:
1811
  # Convert to DataFrame and add to ongoing logging table
1812
  line_level_text_results_df = pd.DataFrame([{
1813
  'page': page_no + 1,
1814
+ 'text': (result.text).strip(),
1815
  'left': result.left,
1816
  'top': result.top,
1817
  'width': result.width,
1818
  'height': result.height
1819
  } for result in line_level_text_results_list])
1820
 
1821
+ page_text_ocr_outputs = pd.concat([page_text_ocr_outputs, line_level_text_results_df])
1822
+
1823
+ all_line_level_text_results_list.extend(line_level_text_results_list)
1824
+ all_line_characters.extend(line_characters)
1825
+
1826
+ ### REDACTION
1827
+
1828
+ page_analysed_bounding_boxes = run_page_text_redaction(
1829
+ language,
1830
+ chosen_redact_entities,
1831
+ chosen_redact_comprehend_entities,
1832
+ all_line_level_text_results_list, #line_level_text_results_list,
1833
+ all_line_characters,
1834
+ page_analyser_results,
1835
+ page_analysed_bounding_boxes,
1836
+ comprehend_client,
1837
+ allow_list,
1838
+ pii_identification_method,
1839
+ nlp_analyser,
1840
+ score_threshold,
1841
+ custom_entities,
1842
+ comprehend_query_number
1843
+ )
 
1844
 
1845
 
1846
  #print("page_analyser_results:", page_analyser_results)
 
1872
  reported_page_no = page_no + 1
1873
  print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
1874
 
1875
+ # Join extracted text outputs for all lines together
1876
+ if not page_text_ocr_outputs.empty:
1877
+ page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
1878
+ all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, page_text_ocr_outputs])
1879
+
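The sort added above orders each page's OCR rows for the output CSV. Because pdfminer reports coordinates in PDF user space, where the origin is the bottom-left corner, sorting top in descending order puts the lines that appear highest on the page first. A small sketch of the effect (values are made up):

    import pandas as pd

    page_text_ocr_outputs = pd.DataFrame({
        "top":  [700.0, 750.0],         # larger top = nearer the top of the page
        "left": [72.0, 72.0],
        "text": ["second line", "first line"],
    })
    ordered = page_text_ocr_outputs.sort_values(
        ["top", "left"], ascending=[False, False]).reset_index(drop=True)
    print(ordered["text"].tolist())  # ['first line', 'second line']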
1880
  # Write logs
1881
  # Create decision process table
1882
  decision_process_table_on_page = create_text_redaction_process_results(page_analyser_results, page_analysed_bounding_boxes, current_loop_page)
1883
 
1884
  if not decision_process_table_on_page.empty:
1885
  all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
1886
+ #print("all_decision_process_table:", all_decision_process_table)
 
1887
 
1888
  toc = time.perf_counter()
1889
 
tools/helper_functions.py CHANGED
@@ -1,10 +1,13 @@
1
  import os
2
  import re
3
  import gradio as gr
4
  import pandas as pd
5
  import unicodedata
6
  from typing import List
7
  from gradio_image_annotation import image_annotator
 
8
 
9
  def reset_state_vars():
10
  return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
@@ -120,6 +123,8 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
120
  custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
121
  #regex_file_name_no_ext = get_file_path_end(regex_file_name)
122
 
123
  output_text = file_type + " file loaded."
124
 
125
  print(output_text)
@@ -229,10 +234,10 @@ async def get_connection_params(request: gr.Request):
229
  #if 'context' in request_data:
230
  # print("Request context dictionary:", request_data['context'])
231
 
232
- print("Request headers dictionary:", request.headers)
233
- print("All host elements", request.client)
234
- print("IP address:", request.client.host)
235
- print("Query parameters:", dict(request.query_params))
236
  # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
237
  #print("Request dictionary to object:", request.request.body())
238
  print("Session hash:", request.session_hash)
@@ -264,6 +269,23 @@ async def get_connection_params(request: gr.Request):
264
  elif 'x-amzn-oidc-identity' in request.headers:
265
  out_session_hash = request.headers['x-amzn-oidc-identity']
266
  base_folder = "user-files/"
267
  print("Cognito ID found:", out_session_hash)
268
 
269
  else:
 
1
  import os
2
  import re
3
+ import boto3
4
+ from botocore.exceptions import ClientError
5
  import gradio as gr
6
  import pandas as pd
7
  import unicodedata
8
  from typing import List
9
  from gradio_image_annotation import image_annotator
10
+ from tools.auth import user_pool_id
11
 
12
  def reset_state_vars():
13
  return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
 
123
  custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
124
  #regex_file_name_no_ext = get_file_path_end(regex_file_name)
125
 
126
+ custom_regex.columns = custom_regex.columns.astype(str)
127
+
128
  output_text = file_type + " file loaded."
129
 
130
  print(output_text)
 
234
  #if 'context' in request_data:
235
  # print("Request context dictionary:", request_data['context'])
236
 
237
+ # print("Request headers dictionary:", request.headers)
238
+ # print("All host elements", request.client)
239
+ # print("IP address:", request.client.host)
240
+ # print("Query parameters:", dict(request.query_params))
241
  # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
242
  #print("Request dictionary to object:", request.request.body())
243
  print("Session hash:", request.session_hash)
 
269
  elif 'x-amzn-oidc-identity' in request.headers:
270
  out_session_hash = request.headers['x-amzn-oidc-identity']
271
  base_folder = "user-files/"
272
+
273
+ # Fetch email address using Cognito client
274
+ cognito_client = boto3.client('cognito-idp')
275
+ try:
276
+ response = cognito_client.admin_get_user(
277
+ UserPoolId=user_pool_id, # Replace with your User Pool ID
278
+ Username=out_session_hash
279
+ )
280
+ email = next(attr['Value'] for attr in response['UserAttributes'] if attr['Name'] == 'email')
281
+ #print("Email address found:", email)
282
+
283
+ out_session_hash = email
284
+ except ClientError as e:
285
+ print("Error fetching user details:", e)
286
+ email = None
287
+
288
+
289
  print("Cognito ID found:", out_session_hash)
290
 
291
  else:
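The new block above resolves the Cognito identity taken from the x-amzn-oidc-identity header into an email address via admin_get_user. Note that this call needs the cognito-idp:AdminGetUser permission on the user pool. A minimal standalone sketch (the pool ID and username are placeholders):

    import boto3
    from botocore.exceptions import ClientError

    cognito_client = boto3.client('cognito-idp')
    try:
        response = cognito_client.admin_get_user(
            UserPoolId="eu-west-2_examplePool",   # placeholder pool ID
            Username="example-cognito-identity"   # e.g. the OIDC identity from the header
        )
        email = next(attr['Value'] for attr in response['UserAttributes']
                     if attr['Name'] == 'email')
    except ClientError as e:
        print("Error fetching user details:", e)
        email = None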
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -7,7 +7,6 @@ spacy.prefer_gpu()
7
  from spacy.cli.download import download
8
  import re
9
 
10
- # %%
11
  model_name = "en_core_web_sm" #"en_core_web_trf"
12
  score_threshold = 0.001
13
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
@@ -34,7 +33,7 @@ def custom_word_list_recogniser(custom_list:List[str]=[]):
34
  rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
35
  for term in custom_list
36
  )
37
- print(custom_regex)
38
 
39
  custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
40
 
 
7
  from spacy.cli.download import download
8
  import re
9
 
 
10
  model_name = "en_core_web_sm" #"en_core_web_trf"
11
  score_threshold = 0.001
12
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
 
33
  rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
34
  for term in custom_list
35
  )
36
+ #print(custom_regex)
37
 
38
  custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
39
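For context, custom_word_list_recogniser builds one alternation regex over the deny-list terms and hands it to Presidio, which is what makes the custom text searches (including the new multi-line lists) work once each term is escaped. A minimal reconstruction under those assumptions (the quote-replacement step from the module is omitted here):

    import re
    from presidio_analyzer import Pattern, PatternRecognizer

    def custom_word_list_recogniser(custom_list):
        # One alternation regex; the \w look-arounds keep matches on word
        # boundaries, and re.escape makes regex metacharacters literal.
        custom_regex = '|'.join(
            rf'(?<!\w){re.escape(term.strip())}(?!\w)' for term in custom_list)
        custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score=1)
        return PatternRecognizer(supported_entity="CUSTOM", patterns=[custom_pattern])

    # Swapped in and out of the registry as the app code above does:
    # nlp_analyser.registry.remove_recognizer("CUSTOM")
    # nlp_analyser.registry.add_recognizer(custom_word_list_recogniser(["Jane Doe"]))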
 
tools/redaction_review.py CHANGED
@@ -117,13 +117,10 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
117
  recogniser_dataframe_out = gr.Dataframe(review_dataframe)
118
  recogniser_entities_list = recogniser_dataframe_gr["label"].unique().tolist()
119
 
120
- print("recogniser_entities_list all options:", recogniser_entities_list)
121
-
122
  recogniser_entities_list = sorted(recogniser_entities_list)
123
  recogniser_entities_list = [entity for entity in recogniser_entities_list if entity != 'Redaction'] # Remove any existing 'Redaction'
124
  recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
125
 
126
- print("recogniser_entities_list:", recogniser_entities_list)
127
 
128
  zoom_str = str(zoom) + '%'
129
  recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
@@ -248,6 +245,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
248
 
249
  output_files = []
250
  output_log_files = []
 
251
 
252
  #print("File paths in apply_redactions:", file_paths)
253
 
@@ -264,7 +262,8 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
264
 
265
  for file_path in file_paths:
266
  #print("file_path:", file_path)
267
- file_base = get_file_path_end(file_path)
 
268
 
269
  file_extension = os.path.splitext(file_path)[1].lower()
270
 
@@ -287,7 +286,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
287
 
288
  draw.rectangle(coords, fill=fill)
289
 
290
- image.save(output_folder + file_base + "_redacted.png")
291
 
292
  doc = [image]
293
 
@@ -298,6 +297,9 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
298
  # If working with pdfs
299
  elif is_pdf(file_path) == True:
300
  pdf_doc = pymupdf.open(file_path)
301
 
302
  number_of_pages = pdf_doc.page_count
303
 
@@ -316,7 +318,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
316
  #all_image_annotations[i]['image'] = image_loc.tolist()
317
  elif isinstance(image_loc, Image.Image):
318
  image = image_loc
319
- #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
320
  #image_loc.save(image_out_folder)
321
  #all_image_annotations[i]['image'] = image_out_folder
322
  elif isinstance(image_loc, str):
@@ -330,25 +332,34 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
330
 
331
  #try:
332
  if pdf_doc:
333
- out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
334
  pdf_doc.save(out_pdf_file_path)
335
  output_files.append(out_pdf_file_path)
336
 
337
  try:
338
- print("Saving annotations to JSON")
339
 
340
- out_annotation_file_path = output_folder + file_base + '_review_file.json'
341
  with open(out_annotation_file_path, 'w') as f:
342
  json.dump(all_image_annotations, f)
343
  output_log_files.append(out_annotation_file_path)
344
 
345
- print("Saving annotations to CSV review file")
346
 
347
  #print("review_file_state:", review_file_state)
348
 
349
  # Convert json to csv and also save this
350
  review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
351
- out_review_file_file_path = output_folder + file_base + '_review_file.csv'
352
  review_df.to_csv(out_review_file_file_path, index=None)
353
  output_files.append(out_review_file_file_path)
354
 
@@ -367,9 +378,6 @@ def update_entities_df(choice:str, df:pd.DataFrame):
367
  return df.loc[df["label"]==choice,:]
368
 
369
  def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
370
- #print("index", evt.index)
371
- #print("value", evt.value)
372
- #print("row_value", evt.row_value)
373
  row_value_page = evt.row_value[0] # This is the page number value
374
  return row_value_page
375
 
 
117
  recogniser_dataframe_out = gr.Dataframe(review_dataframe)
118
  recogniser_entities_list = recogniser_dataframe_gr["label"].unique().tolist()
119
 
120
  recogniser_entities_list = sorted(recogniser_entities_list)
121
  recogniser_entities_list = [entity for entity in recogniser_entities_list if entity != 'Redaction'] # Remove any existing 'Redaction'
122
  recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
123
 
 
124
 
125
  zoom_str = str(zoom) + '%'
126
  recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
 
245
 
246
  output_files = []
247
  output_log_files = []
248
+ pdf_doc = []
249
 
250
  #print("File paths in apply_redactions:", file_paths)
251
 
 
262
 
263
  for file_path in file_paths:
264
  #print("file_path:", file_path)
265
+ file_name_without_ext = get_file_path_end(file_path)
266
+ file_name_with_ext = os.path.basename(file_path)
267
 
268
  file_extension = os.path.splitext(file_path)[1].lower()
269
 
 
286
 
287
  draw.rectangle(coords, fill=fill)
288
 
289
+ image.save(output_folder + file_name_without_ext + "_redacted.png")
290
 
291
  doc = [image]
292
 
 
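For image inputs, the redaction above is plain rasterisation: each box is drawn as a filled rectangle straight onto the page bitmap before saving, so the underlying pixels are overwritten rather than merely covered. The same pattern in isolation (paths and coordinates are illustrative):

    from PIL import Image, ImageDraw

    image = Image.open("page_1.png").convert("RGB")   # hypothetical page image
    draw = ImageDraw.Draw(image)
    for coords in [(100, 200, 300, 240)]:             # (x0, y0, x1, y1) boxes
        draw.rectangle(coords, fill=(0, 0, 0))        # overwrite pixels in black
    image.save("page_1_redacted.png")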
297
  # If working with pdfs
298
  elif is_pdf(file_path) == True:
299
  pdf_doc = pymupdf.open(file_path)
300
+ orig_pdf_file_path = file_path
301
+
302
+ output_files.append(orig_pdf_file_path)
303
 
304
  number_of_pages = pdf_doc.page_count
305
 
 
318
  #all_image_annotations[i]['image'] = image_loc.tolist()
319
  elif isinstance(image_loc, Image.Image):
320
  image = image_loc
321
+ #image_out_folder = output_folder + file_name_without_ext + "_page_" + str(i) + ".png"
322
  #image_loc.save(image_out_folder)
323
  #all_image_annotations[i]['image'] = image_out_folder
324
  elif isinstance(image_loc, str):
 
332
 
333
  #try:
334
  if pdf_doc:
335
+ out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
336
  pdf_doc.save(out_pdf_file_path)
337
  output_files.append(out_pdf_file_path)
338
 
339
+ else:
340
+ print("PDF input not found.")
341
+
342
+ # If save_pdf is not true, then add the original pdf to the output files
343
+ else:
344
+ if is_pdf(file_path) == True:
345
+ orig_pdf_file_path = file_path
346
+ output_files.append(orig_pdf_file_path)
347
+
348
  try:
349
+ #print("Saving annotations to JSON")
350
 
351
+ out_annotation_file_path = output_folder + file_name_with_ext + '_review_file.json'
352
  with open(out_annotation_file_path, 'w') as f:
353
  json.dump(all_image_annotations, f)
354
  output_log_files.append(out_annotation_file_path)
355
 
356
+ #print("Saving annotations to CSV review file")
357
 
358
  #print("review_file_state:", review_file_state)
359
 
360
  # Convert json to csv and also save this
361
  review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
362
+ out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
363
  review_df.to_csv(out_review_file_file_path, index=None)
364
  output_files.append(out_review_file_file_path)
365
 
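The review-file pipeline above serialises the annotator state to JSON and then flattens it to CSV via convert_review_json_to_pandas_df. A hedged sketch of that flattening, assuming each page entry follows the gradio_image_annotation shape of {"image": ..., "boxes": [{"label": ..., ...}]} (the real function also merges in the review state table, which is omitted here):

    import json
    import pandas as pd

    with open("example_review_file.json") as f:   # hypothetical annotations file
        pages = json.load(f)

    rows = [dict(box, page=page_no + 1)
            for page_no, page in enumerate(pages)
            for box in (page.get("boxes") or [])]
    pd.DataFrame(rows).to_csv("example_review_file.csv", index=None)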
 
378
  return df.loc[df["label"]==choice,:]
379
 
380
 
381
  row_value_page = evt.row_value[0] # This is the page number value
382
  return row_value_page
383