seanpedrickcase committed
Commit 390bef2 · 1 Parent(s): 056204b

When on AWS, now loads in a default allow_list to exclude common words from redaction. Improved checks on AWS Comprehend calls.
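For reference, the allow list the app reads is a single-column, headerless CSV with one case-sensitive word or phrase per row (see the help text and the `header=None` read in the diffs below). A minimal sketch of producing such a file; the terms shown are hypothetical, and the only details taken from the diff are the file name and layout:

```python
# Hypothetical example terms; the assumptions taken from the diff are the file
# name (default_allow_list.csv) and the single-column, headerless layout that
# custom_regex_load reads with pd.read_csv(..., header=None).
import pandas as pd

terms = ["Kind regards", "Yours sincerely", "Page number"]
pd.DataFrame(terms).to_csv("default_allow_list.csv", index=False, header=False)
```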

app.py CHANGED
@@ -9,8 +9,8 @@ import pandas as pd
 from datetime import datetime
 from gradio_image_annotation import image_annotator
 
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load, reset_state_vars
-from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load, reset_state_vars, load_in_default_allow_list
+from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
 from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
@@ -108,6 +108,14 @@ with app:
 
     s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
+    ## S3 default bucket and allow list file state
+    default_allow_list_file_name = "default_allow_list.csv"
+    default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
+
+    s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
+    s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
+    default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
+
 
     ###
     # UI DESIGN
@@ -139,8 +147,8 @@ with app:
         page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
 
         with gr.Row():
-            output_summary = gr.Textbox(label="Output summary")
-            output_file = gr.File(label="Output files")
+            output_summary = gr.Textbox(label="Output summary", scale=1)
+            output_file = gr.File(label="Output files", scale = 2)
             latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
 
         with gr.Row():
@@ -228,13 +236,15 @@ with app:
 
         with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
             with gr.Row():
-                in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
+                in_allow_list = gr.File(label="Import allow list file", file_count="multiple")
+                with gr.Column():
                    gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
                    in_allow_list_text = gr.Textbox(label="Custom allow list load status")
 
-            in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
-
-            in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
+            with gr.Accordion("Add or remove entity types to redact", open = False):
+                in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
+
+                in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
 
            handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
            #with gr.Row():
@@ -247,7 +257,7 @@ with app:
            log_files_output = gr.File(label="Log file output", interactive=False)
 
    # If a custom allow list is uploaded
-    in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
+    in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
 
    ###
    # PDF/IMAGE REDACTION
@@ -317,6 +327,15 @@ with app:
    # Get connection details on app load
    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
+    # If running on AWS, load in the default allow list file from S3
+    if RUN_AWS_FUNCTIONS == "1":
+        print("default_allow_list_output_folder_location:", default_allow_list_output_folder_location)
+        if not os.path.exists(default_allow_list_loc):
+            app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
+            then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+        else:
+            app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+
    # Log usernames and times of access to file (to know who is using the app when running on AWS)
    access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
    access_callback.setup([session_hash_textbox], access_logs_folder)
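Note on the startup wiring above: on AWS the app chains `app.load(download_file_from_s3, ...)` into `load_in_default_allow_list`, so the downloaded CSV pre-populates the `in_allow_list` component. The handler was also moved from `.upload` to `.change`, presumably because `.change` fires when the component is set programmatically by another event as well as on user uploads. A minimal, self-contained sketch of that pattern (not the app's code; the stand-in functions and paths are placeholders):

```python
# Sketch of the load -> then chain used above, under the assumption that the
# "downloaded" file already exists locally at the placeholder path.
import gradio as gr

def fetch_default(path):
    # Stand-in for download_file_from_s3: pretend the file was just downloaded.
    return path

def to_file_value(path):
    # Mirrors load_in_default_allow_list: gr.File expects a list of file paths.
    return [path] if isinstance(path, str) else path

with gr.Blocks() as demo:
    allow_list_path = gr.Textbox(value="output/default_allow_list.csv", visible=False)
    in_allow_list = gr.File(label="Import allow list file", file_count="multiple")

    # On app load, "download" the default list, then feed it into the File
    # component; in the real app, in_allow_list.change then runs custom_regex_load.
    demo.load(fetch_default, inputs=[allow_list_path], outputs=[allow_list_path]).\
        then(to_file_value, inputs=[allow_list_path], outputs=[in_allow_list])

demo.launch()
```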
tools/aws_functions.py CHANGED
@@ -38,9 +38,7 @@ def get_assumed_role_info():
 if RUN_AWS_FUNCTIONS == "1":
     try:
         bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
-        session = boto3.Session()
-        # Initialize the Boto3 client for Comprehend
-
+        session = boto3.Session()
 
     except Exception as e:
         print(e)
@@ -54,15 +52,12 @@ if RUN_AWS_FUNCTIONS == "1":
     except Exception as e:
         print(e)
 
-
-
-
 # Download direct from S3 - requires login credentials
-def download_file_from_s3(bucket_name, key, local_file_path):
+def download_file_from_s3(bucket_name, key, local_file_path_and_name):
 
     s3 = boto3.client('s3')
-    s3.download_file(bucket_name, key, local_file_path)
-    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
+    s3.download_file(bucket_name, key, local_file_path_and_name)
+    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path_and_name}")
 
 def download_folder_from_s3(bucket_name, s3_folder, local_folder):
     """
tools/custom_image_analyser_engine.py CHANGED
@@ -4,6 +4,7 @@ from presidio_analyzer import AnalyzerEngine, RecognizerResult
 #from presidio_image_redactor import ImagePreprocessor
 from typing import List, Dict, Optional, Union, Tuple
 from dataclasses import dataclass
+import time
 import cv2
 import PIL
 from PIL import ImageDraw, ImageFont, Image
@@ -479,6 +480,7 @@ class CustomImageAnalyzerEngine:
        for i, line_level_ocr_result in enumerate(line_level_ocr_results):
 
            analyzer_result = []
+            response = []
 
            # Analyze each OCR result (line) individually
 
@@ -489,23 +491,35 @@
 
            elif pii_identification_method == "AWS Comprehend":
 
-                # Call the detect_pii_entities method
-                response = comprehend_client.detect_pii_entities(
-                    Text=line_level_ocr_result.text,
-                    LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
-                )
+                if len(line_level_ocr_result.text) >= 3:
+
+                    try:
+                        # Call the detect_pii_entities method
+                        response = comprehend_client.detect_pii_entities(
+                            Text=line_level_ocr_result.text,
+                            LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
+                        )
+                    except Exception as e:
+                        print(e)
+                        time.sleep(3)
+
+                        response = comprehend_client.detect_pii_entities(
+                            Text=line_level_ocr_result.text,
+                            LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
+                        )
 
-                comprehend_query_number += 1
+                    comprehend_query_number += 1
 
-                for result in response["Entities"]:
-                    result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
+                if response:
+                    for result in response["Entities"]:
+                        result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
 
-                    if result_text not in allow_list:
+                        if result_text not in allow_list:
 
-                        if result.get("Type") in chosen_redact_comprehend_entities:
+                            if result.get("Type") in chosen_redact_comprehend_entities:
 
-                            recogniser_entity = recognizer_result_from_dict(result)
-                            analyzer_result.append(recogniser_entity)
+                                recogniser_entity = recognizer_result_from_dict(result)
+                                analyzer_result.append(recogniser_entity)
 
 
            if i < len(ocr_results_with_children): # Check if i is a valid index
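The changes above skip Comprehend calls for lines shorter than three characters, pre-initialise `response` so the later `if response:` guard is safe, and retry once after a three-second pause if the first `detect_pii_entities` call raises. A hypothetical helper (not part of this repo) expressing the same retry idea more compactly:

```python
import time

def detect_pii_with_retry(comprehend_client, text, language, max_retries=1, wait_seconds=3):
    """Hypothetical helper capturing the same idea: skip very short text,
    call detect_pii_entities, and on failure wait briefly and try again."""
    if len(text) < 3:
        return {}  # too short to be worth a Comprehend call
    for attempt in range(max_retries + 1):
        try:
            return comprehend_client.detect_pii_entities(Text=text, LanguageCode=language)
        except Exception as e:
            print(e)
            if attempt == max_retries:
                raise  # out of retries, surface the error
            time.sleep(wait_seconds)
```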
tools/file_redaction.py CHANGED
@@ -1306,6 +1306,7 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
     '''
     comprehend_query_number = 0
     analyser_results = []
+    response = []
 
     #text_to_analyse = initial_clean(text_container.text).strip()
 
@@ -1322,24 +1323,39 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
 
    elif pii_identification_method == "AWS Comprehend":
 
-        # Call the detect_pii_entities method
-        response = comprehend_client.detect_pii_entities(
-            Text=text_to_analyse,
-            LanguageCode=language # Specify the language of the text
-        )
+
+        if len(text_to_analyse) >= 3:
+
+            try:
+                # Call the detect_pii_entities method
+                response = comprehend_client.detect_pii_entities(
+                    Text=text_to_analyse,
+                    LanguageCode=language # Specify the language of the text
+                )
+            except Exception as e:
+                print(e)
+                time.sleep(3)
+
+                response = comprehend_client.detect_pii_entities(
+                    Text=text_to_analyse,
+                    LanguageCode=language # Specify the language of the text
+                )
 
            comprehend_query_number += 1
 
-            for result in response["Entities"]:
+            if response:
+                for result in response["Entities"]:
 
-                result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
+                    result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
 
-                if result_text not in allow_list:
-                    if result.get("Type") in chosen_redact_comprehend_entities:
+                    if result_text not in allow_list:
+                        if result.get("Type") in chosen_redact_comprehend_entities:
 
-                        recogniser_entity = recognizer_result_from_dict(result)
+                            recogniser_entity = recognizer_result_from_dict(result)
 
-                        analyser_results.append(recogniser_entity)
+                            analyser_results.append(recogniser_entity)
+        else:
+            analyser_results = []
 
    else:
        analyser_results = []
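Both loops consume the standard `detect_pii_entities` response: a dict whose `Entities` list holds `Score`, `Type`, `BeginOffset` and `EndOffset` for each finding, which is what makes `recognizer_result_from_dict(result)` and the offset slicing work. One thing that may be worth double-checking is the `+1` in the slice, since Comprehend offsets are, as far as I know, end-exclusive. An illustrative shape only (scores and offsets are made up):

```python
# Illustrative detect_pii_entities response; these are the keys the loops above
# rely on when filtering by Type and slicing the analysed text by offset.
response = {
    "Entities": [
        {"Score": 0.99, "Type": "NAME", "BeginOffset": 11, "EndOffset": 21},
        {"Score": 0.97, "Type": "EMAIL", "BeginOffset": 30, "EndOffset": 52},
    ]
}

for result in response.get("Entities", []):
    print(result.get("Type"), result["BeginOffset"], result["EndOffset"])
```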
tools/helper_functions.py CHANGED
@@ -25,6 +25,12 @@ default_value = 'output/'
 output_folder = get_or_create_env_var(env_var_name, default_value)
 print(f'The value of {env_var_name} is {output_folder}')
 
+def load_in_default_allow_list(allow_list_file_path):
+    if isinstance(allow_list_file_path, str):
+        allow_list_file_path = [allow_list_file_path]
+    return allow_list_file_path
+
+
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)
@@ -85,16 +91,18 @@ def custom_regex_load(in_file):
 
    custom_regex = pd.DataFrame()
 
-    file_list = [string.name for string in in_file]
+    if in_file:
+
+        file_list = [string.name for string in in_file]
 
-    regex_file_names = [string for string in file_list if "csv" in string.lower()]
-    if regex_file_names:
-        regex_file_name = regex_file_names[0]
-        custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
-        #regex_file_name_no_ext = get_file_path_end(regex_file_name)
+        regex_file_names = [string for string in file_list if "csv" in string.lower()]
+        if regex_file_names:
+            regex_file_name = regex_file_names[0]
+            custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
+            #regex_file_name_no_ext = get_file_path_end(regex_file_name)
 
-        output_text = "Allow list file loaded."
-        print(output_text)
+            output_text = "Allow list file loaded."
+            print(output_text)
    else:
        error = "No allow list file provided."
        print(error)
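`load_in_default_allow_list` simply normalises a single path string into the list form the `in_allow_list` `gr.File` component expects, and `custom_regex_load` now tolerates an empty input. A quick check of the new helper (the path is a placeholder):

```python
from tools.helper_functions import load_in_default_allow_list

print(load_in_default_allow_list("output/default_allow_list.csv"))
# ['output/default_allow_list.csv']
print(load_in_default_allow_list(["output/default_allow_list.csv"]))
# ['output/default_allow_list.csv']  (lists pass through unchanged)
```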