Commit 0e1a4a7 · 1 parent: 6319afc

Fixed manual entry for allow, deny, and full page redaction lists

Files changed:
- app.py (+16, -11)
- tools/file_conversion.py (+3, -3)
- tools/file_redaction.py (+57, -53)
- tools/helper_functions.py (+14, -6)
- tools/redaction_review.py (+0, -3)
app.py CHANGED

@@ -11,7 +11,7 @@ from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
 
 from tools.config import output_folder, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, DEFAULT_ALLOW_LIST_PATH
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names

@@ -145,11 +145,11 @@ with app:
     ## Settings page variables
     default_deny_list_file_name = "default_deny_list.csv"
     default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
-    in_deny_list_text_in = gr.Textbox(value="
+    in_deny_list_text_in = gr.Textbox(value="deny_list", visible=False)
 
     fully_redacted_list_file_name = "default_fully_redacted_list.csv"
     fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
-    in_fully_redacted_text_in = gr.Textbox(value="
+    in_fully_redacted_text_in = gr.Textbox(value="fully_redacted_pages_list", visible=False)
 
     # S3 settings for default allow list load
     s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)

@@ -337,19 +337,19 @@ with app:
     with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
         with gr.Row():
             with gr.Column():
-                in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case
+                in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will not be redacted.", file_count="multiple", height=file_input_height)
                 in_allow_list_text = gr.Textbox(label="Custom allow list load status")
             with gr.Column():
-                in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case
+                in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.", file_count="multiple", height=file_input_height)
                 in_deny_list_text = gr.Textbox(label="Custom deny list load status")
             with gr.Column():
                 in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=file_input_height)
                 in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
-        with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists", open = False):
+        with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists (NOTE: you need to press Enter after modifying/adding an entry to the lists to apply them)", open = False):
             with gr.Row():
-                in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=
-                in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=
-                in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=
+                in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Allow list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True)
+                in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["deny_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Deny list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True)
+                in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number')
 
     with gr.Accordion("Select entity types to redact", open = True):
         in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")

@@ -504,10 +504,10 @@ with app:
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
         success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
 
-    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities,
+    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
 
     # If the output file count text box changes, keep going with redacting each data file until done
-    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities,
+    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
         success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
     ###

@@ -523,6 +523,11 @@ with app:
     in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
     in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
 
+    # The following allows for more reliable updates of the data in the custom list dataframes
+    in_allow_list_state.input(update_dataframe, inputs=[in_allow_list_state], outputs=[in_allow_list_state])
+    in_deny_list_state.input(update_dataframe, inputs=[in_deny_list_state], outputs=[in_deny_list_state])
+    in_fully_redacted_list_state.input(update_dataframe, inputs=[in_fully_redacted_list_state], outputs=[in_fully_redacted_list_state])
+
    # Merge multiple review csv files together
    merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
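Note on the `.input()` wiring in the last hunk: a Gradio Dataframe only propagates a manual cell edit into downstream callbacks reliably once an event handler re-emits the component's value, which appears to be the point of `update_dataframe` and of the "press Enter to apply" note in the accordion title. A minimal standalone sketch of the same pattern; the `word_list` and `demo` names are illustrative, not from the app:

import gradio as gr
import pandas as pd

def update_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    # Returning a copy forces Gradio to re-serialise the edited value,
    # so manually entered rows survive into later callbacks.
    return df.copy()

with gr.Blocks() as demo:
    # One-column editable list, mirroring the allow/deny list dataframes above
    word_list = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"],
                             col_count=(1, "fixed"), row_count=(0, "dynamic"),
                             interactive=True)
    # Re-emit the dataframe on every manual edit (press Enter to commit a cell)
    word_list.input(update_dataframe, inputs=[word_list], outputs=[word_list])

if __name__ == "__main__":
    demo.launch()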
tools/file_conversion.py CHANGED

@@ -827,14 +827,14 @@ def convert_annotation_json_to_review_df(all_annotations:List[dict], redaction_d
     # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
     if "xmin" in review_file_df.columns:
         if review_file_df["xmin"].max() >= 1 and review_file_df["xmax"].max() >= 1 and review_file_df["ymin"].max() >= 1 and review_file_df["ymax"].max() >= 1:
-            print("review file df has large coordinates")
+            #print("review file df has large coordinates")
             review_file_df["page"] = review_file_df["page"].astype(int)
 
             if "image_width" not in review_file_df.columns and not page_sizes_df.empty:
                 review_file_df = review_file_df.merge(page_sizes_df, on="page", how="left")
 
             if "image_width" in review_file_df.columns:
-                print("Dividing coordinates in review file")
+                #print("Dividing coordinates in review file")
                 review_file_df["xmin"] = review_file_df["xmin"] / review_file_df["image_width"]
                 review_file_df["xmax"] = review_file_df["xmax"] / review_file_df["image_width"]
                 review_file_df["ymin"] = review_file_df["ymin"] / review_file_df["image_height"]

@@ -896,7 +896,7 @@ def convert_annotation_json_to_review_df(all_annotations:List[dict], redaction_d
 
     # Handle missing matches using a proximity-based approach
     #if merged_df['text'].isnull().sum() > 0:
-    print("Attempting tolerance-based merge for text")
+    #print("Attempting tolerance-based merge for text")
     # Convert coordinates to numpy arrays for KDTree lookup
     tree = cKDTree(df2[['xmin', 'ymin', 'xmax', 'ymax']].values)
     query_coords = df1[['xmin', 'ymin', 'xmax', 'ymax']].values
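Both hunks above sit inside the same normalisation step: annotation boxes may arrive in absolute pixel coordinates, and dividing by the page's image dimensions brings them back to the 0-1 relative range used elsewhere. A small self-contained sketch of that conversion, assuming the review-file column names shown in the diff:

import pandas as pd

def to_relative_coords(review_df: pd.DataFrame) -> pd.DataFrame:
    """Convert absolute pixel bounding boxes to relative (0-1) coordinates."""
    df = review_df.copy()
    # Heuristic from the diff: if every coordinate column's max is >= 1, the
    # boxes are in absolute image space rather than already-relative space.
    if df[["xmin", "xmax", "ymin", "ymax"]].max().min() >= 1:
        df["xmin"] = df["xmin"] / df["image_width"]
        df["xmax"] = df["xmax"] / df["image_width"]
        df["ymin"] = df["ymin"] / df["image_height"]
        df["ymax"] = df["ymax"] / df["image_height"]
    return df

boxes = pd.DataFrame({"xmin": [100], "xmax": [300], "ymin": [50], "ymax": [80],
                      "image_width": [1000], "image_height": [800]})
print(to_relative_coords(boxes))  # xmin=0.1, xmax=0.3, ymin=0.0625, ymax=0.1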
tools/file_redaction.py CHANGED

@@ -141,6 +141,8 @@ def choose_and_run_redactor(file_paths:List[str],
     The function returns a redacted document along with processing logs.
     '''
     combined_out_message = ""
+    out_review_file_path = ""
+    pdf_file_name_with_ext = ""
     tic = time.perf_counter()
     all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
 

@@ -171,22 +173,46 @@ def choose_and_run_redactor(file_paths:List[str],
     #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
     review_out_file_paths = [prepared_pdf_file_paths[0]]
 
+    # Load/create allow list
+    # If string, assume file path
+    if isinstance(in_allow_list, str):
+        in_allow_list = pd.read_csv(in_allow_list)
+    # Now, should be a pandas dataframe format
+    if not in_allow_list.empty:
+        in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
+        print("In allow list after flattening:", in_allow_list_flat)
+    else:
+        in_allow_list_flat = []
+
+    # If string, assume file path
+    if isinstance(custom_recogniser_word_list, str):
+        custom_recogniser_word_list = pd.read_csv(custom_recogniser_word_list)
     if isinstance(custom_recogniser_word_list, pd.DataFrame):
         if not custom_recogniser_word_list.empty:
+            custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
         else:
-            custom_recogniser_word_list = [] # or some default value
+            custom_recogniser_word_list_flat = []
 
     # Sort the strings in order from the longest string to the shortest
+    custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
+
+    #print("custom_recogniser_word_list_flat:", custom_recogniser_word_list_flat)
 
+    # If string, assume file path
+    if isinstance(redact_whole_page_list, str):
+        redact_whole_page_list = pd.read_csv(redact_whole_page_list)
     if isinstance(redact_whole_page_list, pd.DataFrame):
         if not redact_whole_page_list.empty:
-            redact_whole_page_list
+            #print("redact_whole_page_list:", redact_whole_page_list)
+            try:
+                redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].astype(int).tolist()
+            except Exception as e:
+                print("Could not convert whole page redaction data to number list due to:", e)
+                redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
         else:
+            redact_whole_page_list_flat = []
+
+    #print("redact_whole_page_list_flat:", redact_whole_page_list_flat)
 
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:

@@ -250,24 +276,13 @@ def choose_and_run_redactor(file_paths:List[str],
 
         return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
 
-    # Create allow list
-    # If string, assume file path
-    if isinstance(in_allow_list, str):
-        in_allow_list = pd.read_csv(in_allow_list)
 
-    if not in_allow_list.empty:
-        in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
-        #print("In allow list:", in_allow_list_flat)
-    else:
-        in_allow_list_flat = []
 
     # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
     if pii_identification_method == "AWS Comprehend":
         print("Trying to connect to AWS Comprehend service")
         if aws_access_key_textbox and aws_secret_key_textbox:
             print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
-            print("aws_access_key_textbox:", aws_access_key_textbox)
-            print("aws_secret_access_key:", aws_secret_key_textbox)
             comprehend_client = boto3.client('comprehend',
                 aws_access_key_id=aws_access_key_textbox,
                 aws_secret_access_key=aws_secret_key_textbox)

@@ -372,8 +387,8 @@ def choose_and_run_redactor(file_paths:List[str],
                 comprehend_query_number,
                 comprehend_client,
                 textract_client,
+                custom_recogniser_word_list_flat,
+                redact_whole_page_list_flat,
                 max_fuzzy_spelling_mistakes_num,
                 match_fuzzy_whole_phrase_bool,
                 log_files_output_paths=log_files_output_paths,

@@ -409,8 +424,8 @@ def choose_and_run_redactor(file_paths:List[str],
                 pii_identification_method,
                 comprehend_query_number,
                 comprehend_client,
+                custom_recogniser_word_list_flat,
+                redact_whole_page_list_flat,
                 max_fuzzy_spelling_mistakes_num,
                 match_fuzzy_whole_phrase_bool)
 

@@ -444,15 +459,6 @@ def choose_and_run_redactor(file_paths:List[str],
 
             out_file_paths.append(out_redacted_pdf_file_path)
 
-            #logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
-            #all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
-            #log_files_output_paths.append(logs_output_file_name)
-
-            # Convert OCR result bounding boxes to relative values
-            #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
-            #print("page_sizes:", page_sizes)
-            #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
-
             page_sizes_df = pd.DataFrame(page_sizes)
 
             page_sizes_df["page"] = page_sizes_df["page"].astype(int)

@@ -473,33 +479,26 @@ def choose_and_run_redactor(file_paths:List[str],
 
         # Save the gradio_annotation_boxes to a review csv file
         try:
-            #print("annotations_all_pages before in choose and run redactor:", annotations_all_pages)
-            #print("all_decision_process_table before in choose and run redactor:", all_decision_process_table)
-            #print("page_sizes before in choose and run redactor:", page_sizes)
-            review_df["page"] = review_df["page"].astype(int)
-            if "image_height" not in review_df.columns:
-                review_df = review_df.merge(page_sizes_df, on="page", how="left")
+            review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_decision_process_table, page_sizes)
+
+            review_file_state["page"] = review_file_state["page"].astype(int)
+            if "image_height" not in review_file_state.columns:
+                review_file_state = review_file_state.merge(page_sizes_df, on="page", how="left")
 
             # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
-            if
+            if review_file_state["xmin"].max() >= 1 and review_file_state["xmax"].max() >= 1 and review_file_state["ymin"].max() >= 1 and review_file_state["ymax"].max() >= 1:
+                review_file_state["xmin"] = review_file_state["xmin"] / review_file_state["image_width"]
+                review_file_state["xmax"] = review_file_state["xmax"] / review_file_state["image_width"]
+                review_file_state["ymin"] = review_file_state["ymin"] / review_file_state["image_height"]
+                review_file_state["ymax"] = review_file_state["ymax"] / review_file_state["image_height"]
 
             # Don't need page sizes in outputs
+            review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
 
-            #print("
+            #print("review_file_state:", review_file_state)
 
+            review_file_state.to_csv(out_review_file_path, index=None)
             out_file_paths.append(out_review_file_path)
 
             #print("Saved review file to csv")

@@ -550,10 +549,15 @@ def choose_and_run_redactor(file_paths:List[str],
 
     # Ensure no duplicated output files
     log_files_output_paths = list(set(log_files_output_paths))
-    out_file_paths = list(set(out_file_paths))
+    out_file_paths = list(set(out_file_paths))
+
+    # Output file paths
+    if not out_review_file_path:
+        review_out_file_paths = [prepared_pdf_file_paths[0]]
+    else:
+        review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
 
-    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths,
+    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
 
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
     '''
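A pattern repeated through this file: each list argument may arrive either as a CSV file path (from an upload) or as a pandas DataFrame (from the manually edited dataframes), and is flattened to a plain Python list before use; the deny list is additionally sorted longest-first so longer phrases match before their substrings, and page numbers are coerced to integers. A hedged sketch of that normalisation under those assumptions; `flatten_list_input` is an illustrative name, not a function in the repo:

import pandas as pd

def flatten_list_input(value, as_int: bool = False) -> list:
    # If string, assume it is a CSV file path
    if isinstance(value, str):
        value = pd.read_csv(value)
    if isinstance(value, pd.DataFrame) and not value.empty:
        col = value.iloc[:, 0]
        if as_int:
            try:
                return col.astype(int).tolist()
            except (ValueError, TypeError) as e:
                print("Could not convert values to integers:", e)
        return col.tolist()
    return []  # default when nothing was provided

deny_terms = flatten_list_input(pd.DataFrame({"deny_list": ["John Smith", "Jo"]}))
# Longest-first sort so longer phrases are matched before their substrings
deny_terms = sorted(deny_terms, key=len, reverse=True)
pages = flatten_list_input(pd.DataFrame({"fully_redacted_pages_list": ["1", "3"]}), as_int=True)
print(deny_terms, pages)  # ['John Smith', 'Jo'] [1, 3]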
tools/helper_functions.py CHANGED

@@ -40,6 +40,9 @@ def load_in_default_allow_list(allow_list_file_path):
         allow_list_file_path = [allow_list_file_path]
     return allow_list_file_path
 
+def update_dataframe(df:pd.DataFrame):
+    df_copy = df.copy()
+    return df_copy
 
 def get_file_name_without_type(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")

@@ -96,12 +99,12 @@ def ensure_output_folder_exists():
     else:
         print(f"The 'output/' folder already exists.")
 
-def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
+def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
     '''
     When file is loaded, update the column dropdown choices and write to relevant data states.
    '''
 
+    custom_regex_df = pd.DataFrame()
 
     if in_file:
         file_list = [string.name for string in in_file]

@@ -109,20 +112,25 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
         regex_file_names = [string for string in file_list if "csv" in string.lower()]
         if regex_file_names:
             regex_file_name = regex_file_names[0]
+            custom_regex_df = pd.read_csv(regex_file_name, low_memory=False, header=None)
             #regex_file_name_no_ext = get_file_name_without_type(regex_file_name)
 
+            # Select just first columns
+            custom_regex_df = pd.DataFrame(custom_regex_df.iloc[:,[0]])
+            custom_regex_df.rename(columns={0:file_type}, inplace=True)
 
+            custom_regex_df.columns = custom_regex_df.columns.astype(str)
 
             output_text = file_type + " file loaded."
 
+            print("Custom regex df:", custom_regex_df)
             print(output_text)
         else:
             output_text = "No file provided."
             print(output_text)
-            return output_text,
+            return output_text, custom_regex_df
 
-    return output_text,
+    return output_text, custom_regex_df
 
 def put_columns_in_df(in_file):
     new_choices = []
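The reworked `custom_regex_load` reads the uploaded CSV headerless, keeps only the first column, and renames it to the `file_type` string, which is presumably why `in_deny_list_text_in` and `in_fully_redacted_text_in` in app.py now hold "deny_list" and "fully_redacted_pages_list": the loaded column header then matches the headers of the editable dataframes. A small sketch of that load path; `load_first_column` is an illustrative stand-in, not the repo's function:

import io
import pandas as pd

def load_first_column(csv_source, file_type: str = "allow_list") -> pd.DataFrame:
    # Read without headers so the first data row is not swallowed as a header
    df = pd.read_csv(csv_source, header=None, low_memory=False)
    # Keep just the first column and name it after the target list
    df = pd.DataFrame(df.iloc[:, [0]])
    df.rename(columns={0: file_type}, inplace=True)
    df.columns = df.columns.astype(str)
    return df

sample = io.StringIO("John Smith\nAcme Ltd\n")
print(load_first_column(sample, "deny_list"))
#     deny_list
# 0  John Smith
# 1    Acme Ltd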
tools/redaction_review.py CHANGED

@@ -152,7 +152,6 @@ def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, reco
     elif recogniser_dataframe_modified.iloc[0,0] == "":
         recogniser_dataframe_modified, recogniser_dataframe_out, recogniser_entities_dropdown_value, recogniser_entities_list, text_entities_drop, page_entities_drop = get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
     else:
-        print("recogniser dataframe is not empty")
         review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_modified, page_dropdown_value, text_dropdown_value)
         recogniser_dataframe_out = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"])
 

@@ -600,14 +599,12 @@ def reset_dropdowns():
     return gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True)
 
 def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
-    print("evt.row_value[0]:", evt.row_value[0])
 
     row_value_page = evt.row_value[0] # This is the page number value
 
     if isinstance(row_value_page, list):
         row_value_page = row_value_page[0]
 
-    print("row_value_page:", row_value_page)
     return row_value_page
 
 def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):