seanpedrickcase committed
Commit 390bef2 · 1 Parent(s): 056204b

When on AWS, now loads in a default allow_list to exclude common words from redaction. Improved checks on AWS Comprehend calls.
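For reference, the allow list the app reads is a single-column, headerless CSV with one case-sensitive word or phrase per row (see the help text and the `header=None` read in the diffs below). A minimal sketch of producing such a file; the terms shown are hypothetical, and the only details taken from the diff are the file name and layout:

```python
# Hypothetical example terms; the assumptions taken from the diff are the file
# name (default_allow_list.csv) and the single-column, headerless layout that
# custom_regex_load reads with pd.read_csv(..., header=None).
import pandas as pd

terms = ["Kind regards", "Yours sincerely", "Page number"]
pd.DataFrame(terms).to_csv("default_allow_list.csv", index=False, header=False)
```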

app.py CHANGED
@@ -9,8 +9,8 @@ import pandas as pd
 from datetime import datetime
 from gradio_image_annotation import image_annotator
 
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load, reset_state_vars
-from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load, reset_state_vars, load_in_default_allow_list
+from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
 from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
@@ -108,6 +108,14 @@ with app:
 
     s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
+    ## S3 default bucket and allow list file state
+    default_allow_list_file_name = "default_allow_list.csv"
+    default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
+
+    s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
+    s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
+    default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
+
 
     ###
     # UI DESIGN
@@ -139,8 +147,8 @@ with app:
         page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
 
         with gr.Row():
-            output_summary = gr.Textbox(label="Output summary")
-            output_file = gr.File(label="Output files")
+            output_summary = gr.Textbox(label="Output summary", scale=1)
+            output_file = gr.File(label="Output files", scale = 2)
             latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
 
         with gr.Row():
@@ -228,13 +236,15 @@ with app:
 
         with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
             with gr.Row():
-                in_allow_list = gr.UploadButton(label="Import allow list file", file_count="multiple")
+                in_allow_list = gr.File(label="Import allow list file", file_count="multiple")
+                with gr.Column():
                    gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
                    in_allow_list_text = gr.Textbox(label="Custom allow list load status")
 
-            in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
-
-            in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
+            with gr.Accordion("Add or remove entity types to redact", open = False):
+                in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
+
+                in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
 
            handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
            #with gr.Row():
@@ -247,7 +257,7 @@ with app:
            log_files_output = gr.File(label="Log file output", interactive=False)
 
    # If a custom allow list is uploaded
-    in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
+    in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
 
    ###
    # PDF/IMAGE REDACTION
@@ -317,6 +327,15 @@ with app:
    # Get connection details on app load
    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
+    # If running on AWS, load in the default allow list file from S3
+    if RUN_AWS_FUNCTIONS == "1":
+        print("default_allow_list_output_folder_location:", default_allow_list_output_folder_location)
+        if not os.path.exists(default_allow_list_loc):
+            app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
+            then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+        else:
+            app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+
    # Log usernames and times of access to file (to know who is using the app when running on AWS)
    access_callback = gr.CSVLogger(dataset_file_name=log_file_name)
    access_callback.setup([session_hash_textbox], access_logs_folder)
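Note on the startup wiring above: on AWS the app chains `app.load(download_file_from_s3, ...)` into `load_in_default_allow_list`, so the downloaded CSV pre-populates the `in_allow_list` component. The handler was also moved from `.upload` to `.change`, presumably because `.change` fires when the component is set programmatically by another event as well as on user uploads. A minimal, self-contained sketch of that pattern (not the app's code; the stand-in functions and paths are placeholders):

```python
# Sketch of the load -> then chain used above, under the assumption that the
# "downloaded" file already exists locally at the placeholder path.
import gradio as gr

def fetch_default(path):
    # Stand-in for download_file_from_s3: pretend the file was just downloaded.
    return path

def to_file_value(path):
    # Mirrors load_in_default_allow_list: gr.File expects a list of file paths.
    return [path] if isinstance(path, str) else path

with gr.Blocks() as demo:
    allow_list_path = gr.Textbox(value="output/default_allow_list.csv", visible=False)
    in_allow_list = gr.File(label="Import allow list file", file_count="multiple")

    # On app load, "download" the default list, then feed it into the File
    # component; in the real app, in_allow_list.change then runs custom_regex_load.
    demo.load(fetch_default, inputs=[allow_list_path], outputs=[allow_list_path]).\
        then(to_file_value, inputs=[allow_list_path], outputs=[in_allow_list])

demo.launch()
```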
tools/aws_functions.py CHANGED
@@ -38,9 +38,7 @@ def get_assumed_role_info():
 if RUN_AWS_FUNCTIONS == "1":
     try:
         bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
-        session = boto3.Session()
-        # Initialize the Boto3 client for Comprehend
-
+        session = boto3.Session()
 
     except Exception as e:
         print(e)
@@ -54,15 +52,12 @@ if RUN_AWS_FUNCTIONS == "1":
     except Exception as e:
         print(e)
 
-
-
-
 # Download direct from S3 - requires login credentials
-def download_file_from_s3(bucket_name, key, local_file_path):
+def download_file_from_s3(bucket_name, key, local_file_path_and_name):
 
     s3 = boto3.client('s3')
-    s3.download_file(bucket_name, key, local_file_path)
-    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
+    s3.download_file(bucket_name, key, local_file_path_and_name)
+    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path_and_name}")
 
 def download_folder_from_s3(bucket_name, s3_folder, local_folder):
     """
tools/custom_image_analyser_engine.py CHANGED
@@ -4,6 +4,7 @@ from presidio_analyzer import AnalyzerEngine, RecognizerResult
 #from presidio_image_redactor import ImagePreprocessor
 from typing import List, Dict, Optional, Union, Tuple
 from dataclasses import dataclass
+import time
 import cv2
 import PIL
 from PIL import ImageDraw, ImageFont, Image
@@ -479,6 +480,7 @@ class CustomImageAnalyzerEngine:
        for i, line_level_ocr_result in enumerate(line_level_ocr_results):
 
            analyzer_result = []
+            response = []
 
            # Analyze each OCR result (line) individually
 
@@ -489,23 +491,35 @@
 
            elif pii_identification_method == "AWS Comprehend":
 
-                # Call the detect_pii_entities method
-                response = comprehend_client.detect_pii_entities(
-                    Text=line_level_ocr_result.text,
-                    LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
-                )
+                if len(line_level_ocr_result.text) >= 3:
+
+                    try:
+                        # Call the detect_pii_entities method
+                        response = comprehend_client.detect_pii_entities(
+                            Text=line_level_ocr_result.text,
+                            LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
+                        )
+                    except Exception as e:
+                        print(e)
+                        time.sleep(3)
+
+                        response = comprehend_client.detect_pii_entities(
+                            Text=line_level_ocr_result.text,
+                            LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
+                        )
 
-                comprehend_query_number += 1
+                    comprehend_query_number += 1
 
-                for result in response["Entities"]:
-                    result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
+                if response:
+                    for result in response["Entities"]:
+                        result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
 
-                    if result_text not in allow_list:
+                        if result_text not in allow_list:
 
-                        if result.get("Type") in chosen_redact_comprehend_entities:
+                            if result.get("Type") in chosen_redact_comprehend_entities:
 
-                            recogniser_entity = recognizer_result_from_dict(result)
-                            analyzer_result.append(recogniser_entity)
+                                recogniser_entity = recognizer_result_from_dict(result)
+                                analyzer_result.append(recogniser_entity)
 
 
            if i < len(ocr_results_with_children): # Check if i is a valid index
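The changes above skip Comprehend calls for lines shorter than three characters, pre-initialise `response` so the later `if response:` guard is safe, and retry once after a three-second pause if the first `detect_pii_entities` call raises. A hypothetical helper (not part of this repo) expressing the same retry idea more compactly:

```python
import time

def detect_pii_with_retry(comprehend_client, text, language, max_retries=1, wait_seconds=3):
    """Hypothetical helper capturing the same idea: skip very short text,
    call detect_pii_entities, and on failure wait briefly and try again."""
    if len(text) < 3:
        return {}  # too short to be worth a Comprehend call
    for attempt in range(max_retries + 1):
        try:
            return comprehend_client.detect_pii_entities(Text=text, LanguageCode=language)
        except Exception as e:
            print(e)
            if attempt == max_retries:
                raise  # out of retries, surface the error
            time.sleep(wait_seconds)
```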
tools/file_redaction.py CHANGED
@@ -1306,6 +1306,7 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
     '''
     comprehend_query_number = 0
     analyser_results = []
+    response = []
 
     #text_to_analyse = initial_clean(text_container.text).strip()
 
@@ -1322,24 +1323,39 @@ def identify_pii_in_text_container(text_container:OCRResult, language:str, chose
 
    elif pii_identification_method == "AWS Comprehend":
 
-        # Call the detect_pii_entities method
-        response = comprehend_client.detect_pii_entities(
-            Text=text_to_analyse,
-            LanguageCode=language # Specify the language of the text
-        )
+
+        if len(text_to_analyse) >= 3:
+
+            try:
+                # Call the detect_pii_entities method
+                response = comprehend_client.detect_pii_entities(
+                    Text=text_to_analyse,
+                    LanguageCode=language # Specify the language of the text
+                )
+            except Exception as e:
+                print(e)
+                time.sleep(3)
+
+                response = comprehend_client.detect_pii_entities(
+                    Text=text_to_analyse,
+                    LanguageCode=language # Specify the language of the text
+                )
 
            comprehend_query_number += 1
 
-            for result in response["Entities"]:
+            if response:
+                for result in response["Entities"]:
 
-                result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
+                    result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
 
-                if result_text not in allow_list:
-                    if result.get("Type") in chosen_redact_comprehend_entities:
+                    if result_text not in allow_list:
+                        if result.get("Type") in chosen_redact_comprehend_entities:
 
-                        recogniser_entity = recognizer_result_from_dict(result)
+                            recogniser_entity = recognizer_result_from_dict(result)
 
-                        analyser_results.append(recogniser_entity)
+                            analyser_results.append(recogniser_entity)
+        else:
+            analyser_results = []
 
    else:
        analyser_results = []
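Both loops consume the standard `detect_pii_entities` response: a dict whose `Entities` list holds `Score`, `Type`, `BeginOffset` and `EndOffset` for each finding, which is what makes `recognizer_result_from_dict(result)` and the offset slicing work. One thing that may be worth double-checking is the `+1` in the slice, since Comprehend offsets are, as far as I know, end-exclusive. An illustrative shape only (scores and offsets are made up):

```python
# Illustrative detect_pii_entities response; these are the keys the loops above
# rely on when filtering by Type and slicing the analysed text by offset.
response = {
    "Entities": [
        {"Score": 0.99, "Type": "NAME", "BeginOffset": 11, "EndOffset": 21},
        {"Score": 0.97, "Type": "EMAIL", "BeginOffset": 30, "EndOffset": 52},
    ]
}

for result in response.get("Entities", []):
    print(result.get("Type"), result["BeginOffset"], result["EndOffset"])
```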
tools/helper_functions.py CHANGED
@@ -25,6 +25,12 @@ default_value = 'output/'
 output_folder = get_or_create_env_var(env_var_name, default_value)
 print(f'The value of {env_var_name} is {output_folder}')
 
+def load_in_default_allow_list(allow_list_file_path):
+    if isinstance(allow_list_file_path, str):
+        allow_list_file_path = [allow_list_file_path]
+    return allow_list_file_path
+
+
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)
@@ -85,16 +91,18 @@ def custom_regex_load(in_file):
 
    custom_regex = pd.DataFrame()
 
-    file_list = [string.name for string in in_file]
+    if in_file:
+
+        file_list = [string.name for string in in_file]
 
-    regex_file_names = [string for string in file_list if "csv" in string.lower()]
-    if regex_file_names:
-        regex_file_name = regex_file_names[0]
-        custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
-        #regex_file_name_no_ext = get_file_path_end(regex_file_name)
+        regex_file_names = [string for string in file_list if "csv" in string.lower()]
+        if regex_file_names:
+            regex_file_name = regex_file_names[0]
+            custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
+            #regex_file_name_no_ext = get_file_path_end(regex_file_name)
 
-        output_text = "Allow list file loaded."
-        print(output_text)
+            output_text = "Allow list file loaded."
+            print(output_text)
    else:
        error = "No allow list file provided."
        print(error)
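`load_in_default_allow_list` simply normalises a single path string into the list form the `in_allow_list` `gr.File` component expects, and `custom_regex_load` now tolerates an empty input. A quick check of the new helper (the path is a placeholder):

```python
from tools.helper_functions import load_in_default_allow_list

print(load_in_default_allow_list("output/default_allow_list.csv"))
# ['output/default_allow_list.csv']
print(load_in_default_allow_list(["output/default_allow_list.csv"]))
# ['output/default_allow_list.csv']  (lists pass through unchanged)
```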