Commit
·
390bef2
1
Parent(s):
056204b
When running on AWS, the app now loads a default allow list to exclude common words from redaction. Improved checks on AWS Comprehend calls.
Browse files
- app.py +28 -9
- tools/aws_functions.py +4 -9
- tools/custom_image_analyser_engine.py +26 -12
- tools/file_redaction.py +27 -11
- tools/helper_functions.py +16 -8
app.py
CHANGED
@@ -9,8 +9,8 @@ import pandas as pd
|
|
9 |
from datetime import datetime
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
|
12 |
-
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load, reset_state_vars
|
13 |
-
from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
|
14 |
from tools.file_redaction import choose_and_run_redactor
|
15 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
16 |
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
|
@@ -108,6 +108,14 @@ with app:
|
|
108 |
|
109 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
###
|
113 |
# UI DESIGN
|
@@ -139,8 +147,8 @@ with app:
|
|
139 |
page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
|
140 |
|
141 |
with gr.Row():
|
142 |
-
output_summary = gr.Textbox(label="Output summary")
|
143 |
-
output_file = gr.File(label="Output files")
|
144 |
latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
|
145 |
|
146 |
with gr.Row():
|
@@ -228,13 +236,15 @@ with app:
|
|
228 |
|
229 |
with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
|
230 |
with gr.Row():
|
231 |
-
|
|
|
232 |
gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
|
233 |
in_allow_list_text = gr.Textbox(label="Custom allow list load status")
|
234 |
|
235 |
-
|
236 |
-
|
237 |
-
|
|
|
238 |
|
239 |
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
|
240 |
#with gr.Row():
|
@@ -247,7 +257,7 @@ with app:
|
|
247 |
log_files_output = gr.File(label="Log file output", interactive=False)
|
248 |
|
249 |
# If a custom allow list is uploaded
|
250 |
-
in_allow_list.
|
251 |
|
252 |
###
|
253 |
# PDF/IMAGE REDACTION
|
@@ -317,6 +327,15 @@ with app:
|
|
317 |
# Get connection details on app load
|
318 |
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
319 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
320 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
321 |
access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
322 |
access_callback.setup([session_hash_textbox], access_logs_folder)
|
|
|
9 |
from datetime import datetime
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
|
12 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load, reset_state_vars, load_in_default_allow_list
|
13 |
+
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
14 |
from tools.file_redaction import choose_and_run_redactor
|
15 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
16 |
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
|
|
|
108 |
|
109 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
110 |
|
111 |
+
## S3 default bucket and allow list file state
|
112 |
+
default_allow_list_file_name = "default_allow_list.csv"
|
113 |
+
default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
|
114 |
+
|
115 |
+
s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
|
116 |
+
s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
|
117 |
+
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
|
118 |
+
|
119 |
|
120 |
###
|
121 |
# UI DESIGN
|
|
|
147 |
page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
|
148 |
|
149 |
with gr.Row():
|
150 |
+
output_summary = gr.Textbox(label="Output summary", scale=1)
|
151 |
+
output_file = gr.File(label="Output files", scale = 2)
|
152 |
latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
|
153 |
|
154 |
with gr.Row():
|
|
|
236 |
|
237 |
with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
|
238 |
with gr.Row():
|
239 |
+
in_allow_list = gr.File(label="Import allow list file", file_count="multiple")
|
240 |
+
with gr.Column():
|
241 |
gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
|
242 |
in_allow_list_text = gr.Textbox(label="Custom allow list load status")
|
243 |
|
244 |
+
with gr.Accordion("Add or remove entity types to redact", open = False):
|
245 |
+
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
|
246 |
+
|
247 |
+
in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
|
248 |
|
249 |
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
|
250 |
#with gr.Row():
|
|
|
257 |
log_files_output = gr.File(label="Log file output", interactive=False)
|
258 |
|
259 |
# If a custom allow list is uploaded
|
260 |
+
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
261 |
|
262 |
###
|
263 |
# PDF/IMAGE REDACTION
|
|
|
327 |
# Get connection details on app load
|
328 |
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
329 |
|
330 |
+
# If running on AWS, load in the default allow list file from S3
|
331 |
+
if RUN_AWS_FUNCTIONS == "1":
|
332 |
+
print("default_allow_list_output_folder_location:", default_allow_list_output_folder_location)
|
333 |
+
if not os.path.exists(default_allow_list_loc):
|
334 |
+
app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
|
335 |
+
then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
|
336 |
+
else:
|
337 |
+
app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
|
338 |
+
|
339 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
340 |
access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
|
341 |
access_callback.setup([session_hash_textbox], access_logs_folder)
|
tools/aws_functions.py
CHANGED
@@ -38,9 +38,7 @@ def get_assumed_role_info():
|
|
38 |
if RUN_AWS_FUNCTIONS == "1":
|
39 |
try:
|
40 |
bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
|
41 |
-
session = boto3.Session()
|
42 |
-
# Initialize the Boto3 client for Comprehend
|
43 |
-
|
44 |
|
45 |
except Exception as e:
|
46 |
print(e)
|
@@ -54,15 +52,12 @@ if RUN_AWS_FUNCTIONS == "1":
|
|
54 |
except Exception as e:
|
55 |
print(e)
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
# Download direct from S3 - requires login credentials
|
61 |
-
def download_file_from_s3(bucket_name, key,
|
62 |
|
63 |
s3 = boto3.client('s3')
|
64 |
-
s3.download_file(bucket_name, key,
|
65 |
-
print(f"File downloaded from S3: s3://{bucket_name}/{key} to {
|
66 |
|
67 |
def download_folder_from_s3(bucket_name, s3_folder, local_folder):
|
68 |
"""
|
|
|
38 |
if RUN_AWS_FUNCTIONS == "1":
|
39 |
try:
|
40 |
bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
|
41 |
+
session = boto3.Session()
|
|
|
|
|
42 |
|
43 |
except Exception as e:
|
44 |
print(e)
|
|
|
52 |
except Exception as e:
|
53 |
print(e)
|
54 |
|
|
|
|
|
|
|
55 |
# Download direct from S3 - requires login credentials
|
56 |
+
def download_file_from_s3(bucket_name, key, local_file_path_and_name):
    """
    Download a single object from S3 to a local file.

    Requires AWS credentials to be available to boto3.

    Args:
        bucket_name: Name of the S3 bucket to download from.
        key: Key (path inside the bucket) of the object to download.
        local_file_path_and_name: Local path, including file name, that the
            object is written to.
    """
    # The download writes directly to the given local path, so make sure the
    # destination folder exists first instead of failing on a missing directory.
    local_folder = os.path.dirname(local_file_path_and_name)
    if local_folder:
        os.makedirs(local_folder, exist_ok=True)

    s3 = boto3.client('s3')
    s3.download_file(bucket_name, key, local_file_path_and_name)
    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path_and_name}")
|
61 |
|
62 |
def download_folder_from_s3(bucket_name, s3_folder, local_folder):
|
63 |
"""
|
tools/custom_image_analyser_engine.py
CHANGED
@@ -4,6 +4,7 @@ from presidio_analyzer import AnalyzerEngine, RecognizerResult
|
|
4 |
#from presidio_image_redactor import ImagePreprocessor
|
5 |
from typing import List, Dict, Optional, Union, Tuple
|
6 |
from dataclasses import dataclass
|
|
|
7 |
import cv2
|
8 |
import PIL
|
9 |
from PIL import ImageDraw, ImageFont, Image
|
@@ -479,6 +480,7 @@ class CustomImageAnalyzerEngine:
|
|
479 |
for i, line_level_ocr_result in enumerate(line_level_ocr_results):
|
480 |
|
481 |
analyzer_result = []
|
|
|
482 |
|
483 |
# Analyze each OCR result (line) individually
|
484 |
|
@@ -489,23 +491,35 @@ class CustomImageAnalyzerEngine:
|
|
489 |
|
490 |
elif pii_identification_method == "AWS Comprehend":
|
491 |
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
497 |
|
498 |
-
|
499 |
|
500 |
-
|
501 |
-
|
|
|
502 |
|
503 |
-
|
504 |
|
505 |
-
|
506 |
|
507 |
-
|
508 |
-
|
509 |
|
510 |
|
511 |
if i < len(ocr_results_with_children): # Check if i is a valid index
|
|
|
4 |
#from presidio_image_redactor import ImagePreprocessor
|
5 |
from typing import List, Dict, Optional, Union, Tuple
|
6 |
from dataclasses import dataclass
|
7 |
+
import time
|
8 |
import cv2
|
9 |
import PIL
|
10 |
from PIL import ImageDraw, ImageFont, Image
|
|
|
480 |
for i, line_level_ocr_result in enumerate(line_level_ocr_results):
|
481 |
|
482 |
analyzer_result = []
|
483 |
+
response = []
|
484 |
|
485 |
# Analyze each OCR result (line) individually
|
486 |
|
|
|
491 |
|
492 |
elif pii_identification_method == "AWS Comprehend":
|
493 |
|
494 |
+
if len(line_level_ocr_result.text) >= 3:
|
495 |
+
|
496 |
+
try:
|
497 |
+
# Call the detect_pii_entities method
|
498 |
+
response = comprehend_client.detect_pii_entities(
|
499 |
+
Text=line_level_ocr_result.text,
|
500 |
+
LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
|
501 |
+
)
|
502 |
+
except Exception as e:
|
503 |
+
print(e)
|
504 |
+
time.sleep(3)
|
505 |
+
|
506 |
+
response = comprehend_client.detect_pii_entities(
|
507 |
+
Text=line_level_ocr_result.text,
|
508 |
+
LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
|
509 |
+
)
|
510 |
|
511 |
+
comprehend_query_number += 1
|
512 |
|
513 |
+
if response:
|
514 |
+
for result in response["Entities"]:
|
515 |
+
result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
|
516 |
|
517 |
+
if result_text not in allow_list:
|
518 |
|
519 |
+
if result.get("Type") in chosen_redact_comprehend_entities:
|
520 |
|
521 |
+
recogniser_entity = recognizer_result_from_dict(result)
|
522 |
+
analyzer_result.append(recogniser_entity)
|
523 |
|
524 |
|
525 |
if i < len(ocr_results_with_children): # Check if i is a valid index
|
tools/file_redaction.py
CHANGED
@@ -1306,6 +1306,7 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
|
|
1306 |
'''
|
1307 |
comprehend_query_number = 0
|
1308 |
analyser_results = []
|
|
|
1309 |
|
1310 |
#text_to_analyse = initial_clean(text_container.text).strip()
|
1311 |
|
@@ -1322,24 +1323,39 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
|
|
1322 |
|
1323 |
elif pii_identification_method == "AWS Comprehend":
|
1324 |
|
1325 |
-
|
1326 |
-
|
1327 |
-
|
1328 |
-
|
1329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1330 |
|
1331 |
comprehend_query_number += 1
|
1332 |
|
1333 |
-
|
|
|
1334 |
|
1335 |
-
|
1336 |
|
1337 |
-
|
1338 |
-
|
1339 |
|
1340 |
-
|
1341 |
|
1342 |
-
|
|
|
|
|
1343 |
|
1344 |
else:
|
1345 |
analyser_results = []
|
|
|
1306 |
'''
|
1307 |
comprehend_query_number = 0
|
1308 |
analyser_results = []
|
1309 |
+
response = []
|
1310 |
|
1311 |
#text_to_analyse = initial_clean(text_container.text).strip()
|
1312 |
|
|
|
1323 |
|
1324 |
elif pii_identification_method == "AWS Comprehend":
|
1325 |
|
1326 |
+
|
1327 |
+
if len(text_to_analyse) >= 3:
|
1328 |
+
|
1329 |
+
try:
|
1330 |
+
# Call the detect_pii_entities method
|
1331 |
+
response = comprehend_client.detect_pii_entities(
|
1332 |
+
Text=text_to_analyse,
|
1333 |
+
LanguageCode=language # Specify the language of the text
|
1334 |
+
)
|
1335 |
+
except Exception as e:
|
1336 |
+
print(e)
|
1337 |
+
time.sleep(3)
|
1338 |
+
|
1339 |
+
response = comprehend_client.detect_pii_entities(
|
1340 |
+
Text=text_to_analyse,
|
1341 |
+
LanguageCode=language # Specify the language of the text
|
1342 |
+
)
|
1343 |
|
1344 |
comprehend_query_number += 1
|
1345 |
|
1346 |
+
if response:
|
1347 |
+
for result in response["Entities"]:
|
1348 |
|
1349 |
+
result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
|
1350 |
|
1351 |
+
if result_text not in allow_list:
|
1352 |
+
if result.get("Type") in chosen_redact_comprehend_entities:
|
1353 |
|
1354 |
+
recogniser_entity = recognizer_result_from_dict(result)
|
1355 |
|
1356 |
+
analyser_results.append(recogniser_entity)
|
1357 |
+
else:
|
1358 |
+
analyser_results = []
|
1359 |
|
1360 |
else:
|
1361 |
analyser_results = []
|
tools/helper_functions.py
CHANGED
@@ -25,6 +25,12 @@ default_value = 'output/'
|
|
25 |
output_folder = get_or_create_env_var(env_var_name, default_value)
|
26 |
print(f'The value of {env_var_name} is {output_folder}')
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
def get_file_path_end(file_path):
|
29 |
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
|
30 |
basename = os.path.basename(file_path)
|
@@ -85,16 +91,18 @@ def custom_regex_load(in_file):
|
|
85 |
|
86 |
custom_regex = pd.DataFrame()
|
87 |
|
88 |
-
|
|
|
|
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
|
96 |
-
|
97 |
-
|
98 |
else:
|
99 |
error = "No allow list file provided."
|
100 |
print(error)
|
|
|
25 |
output_folder = get_or_create_env_var(env_var_name, default_value)
|
26 |
print(f'The value of {env_var_name} is {output_folder}')
|
27 |
|
28 |
+
def load_in_default_allow_list(allow_list_file_path):
    """
    Return the allow list file location as a list of paths.

    Args:
        allow_list_file_path: A single path (``str`` or ``os.PathLike``), or
            any other value such as an existing list of paths.

    Returns:
        A one-element list holding the path as a string when a single path
        was given; otherwise the input value unchanged.
    """
    if isinstance(allow_list_file_path, (str, os.PathLike)):
        # os.fspath leaves a str untouched and converts path objects to str,
        # so callers always receive a list of plain string paths.
        allow_list_file_path = [os.fspath(allow_list_file_path)]
    return allow_list_file_path
|
32 |
+
|
33 |
+
|
34 |
def get_file_path_end(file_path):
|
35 |
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
|
36 |
basename = os.path.basename(file_path)
|
|
|
91 |
|
92 |
custom_regex = pd.DataFrame()
|
93 |
|
94 |
+
if in_file:
|
95 |
+
|
96 |
+
file_list = [string.name for string in in_file]
|
97 |
|
98 |
+
regex_file_names = [string for string in file_list if "csv" in string.lower()]
|
99 |
+
if regex_file_names:
|
100 |
+
regex_file_name = regex_file_names[0]
|
101 |
+
custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
|
102 |
+
#regex_file_name_no_ext = get_file_path_end(regex_file_name)
|
103 |
|
104 |
+
output_text = "Allow list file loaded."
|
105 |
+
print(output_text)
|
106 |
else:
|
107 |
error = "No allow list file provided."
|
108 |
print(error)
|