seanpedrickcase committed
Commit 0e1a4a7 · 1 Parent(s): 6319afc

Fixed manual entry for allow, deny, and full page redaction lists
app.py CHANGED
@@ -11,7 +11,7 @@ from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
 
 from tools.config import output_folder, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, DEFAULT_ALLOW_LIST_PATH
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
@@ -145,11 +145,11 @@ with app:
     ## Settings page variables
     default_deny_list_file_name = "default_deny_list.csv"
     default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
-    in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
+    in_deny_list_text_in = gr.Textbox(value="deny_list", visible=False)
 
     fully_redacted_list_file_name = "default_fully_redacted_list.csv"
     fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
-    in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
+    in_fully_redacted_text_in = gr.Textbox(value="fully_redacted_pages_list", visible=False)
 
     # S3 settings for default allow list load
     s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
@@ -337,19 +337,19 @@ with app:
     with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
         with gr.Row():
             with gr.Column():
-                in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.", file_count="multiple", height=file_input_height)
+                in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will not be redacted.", file_count="multiple", height=file_input_height)
                 in_allow_list_text = gr.Textbox(label="Custom allow list load status")
             with gr.Column():
-                in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will always be redacted.", file_count="multiple", height=file_input_height)
+                in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.", file_count="multiple", height=file_input_height)
                 in_deny_list_text = gr.Textbox(label="Custom deny list load status")
             with gr.Column():
                 in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=file_input_height)
                 in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
-    with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists", open = False):
+    with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists (NOTE: you need to press Enter after modifying/adding an entry to the lists to apply them)", open = False):
         with gr.Row():
-            in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_allow_list_df", visible=True, type="pandas")
-            in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=True, type="pandas")
-            in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_full_redacted_list_df", visible=True, type="pandas")
+            in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Allow list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True)
+            in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["deny_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Deny list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True)
+            in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number')
 
     with gr.Accordion("Select entity types to redact", open = True):
         in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
@@ -504,10 +504,10 @@ with app:
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
         success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
 
-    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
+    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
 
     # If the output file count text box changes, keep going with redacting each data file until done
-    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
+    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
         success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
     ###
@@ -523,6 +523,11 @@ with app:
     in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
     in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
 
+    # The following allows for more reliable updates of the data in the custom list dataframes
+    in_allow_list_state.input(update_dataframe, inputs=[in_allow_list_state], outputs=[in_allow_list_state])
+    in_deny_list_state.input(update_dataframe, inputs=[in_deny_list_state], outputs=[in_deny_list_state])
+    in_fully_redacted_list_state.input(update_dataframe, inputs=[in_fully_redacted_list_state], outputs=[in_fully_redacted_list_state])
+
     # Merge multiple review csv files together
     merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
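The substance of the app.py change is the `.input()` passthrough wired onto each now-editable `gr.Dataframe`: re-emitting the edited frame through `update_dataframe` makes Gradio commit manual cell edits into the component value before a downstream handler reads it. A minimal sketch of the pattern, with illustrative component names rather than the app's own:

```python
import gradio as gr
import pandas as pd

def update_dataframe(df: pd.DataFrame):
    # Re-emitting a copy of the edited frame commits manual cell edits
    # to the component's value before any downstream event reads it.
    return df.copy()

def show_terms(df: pd.DataFrame):
    # Downstream consumer: sees whatever the dataframe currently holds.
    return ", ".join(df.iloc[:, 0].astype(str)) if not df.empty else "(empty)"

with gr.Blocks() as demo:
    allow_list_df = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"],
                                 col_count=(1, "fixed"), row_count=(0, "dynamic"),
                                 interactive=True, type="pandas", label="Allow list")
    terms_out = gr.Textbox(label="Current allow list terms")

    # Without this passthrough, a manual edit may not yet be reflected in
    # the value a later .click() handler receives.
    allow_list_df.input(update_dataframe, inputs=[allow_list_df], outputs=[allow_list_df])
    gr.Button("Show terms").click(show_terms, inputs=[allow_list_df], outputs=[terms_out])

demo.launch()
```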
 
tools/file_conversion.py CHANGED
@@ -827,14 +827,14 @@ def convert_annotation_json_to_review_df(all_annotations:List[dict], redaction_d
     # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
     if "xmin" in review_file_df.columns:
         if review_file_df["xmin"].max() >= 1 and review_file_df["xmax"].max() >= 1 and review_file_df["ymin"].max() >= 1 and review_file_df["ymax"].max() >= 1:
-            print("review file df has large coordinates")
+            #print("review file df has large coordinates")
             review_file_df["page"] = review_file_df["page"].astype(int)
 
             if "image_width" not in review_file_df.columns and not page_sizes_df.empty:
                 review_file_df = review_file_df.merge(page_sizes_df, on="page", how="left")
 
             if "image_width" in review_file_df.columns:
-                print("Dividing coordinates in review file")
+                #print("Dividing coordinates in review file")
                 review_file_df["xmin"] = review_file_df["xmin"] / review_file_df["image_width"]
                 review_file_df["xmax"] = review_file_df["xmax"] / review_file_df["image_width"]
                 review_file_df["ymin"] = review_file_df["ymin"] / review_file_df["image_height"]
@@ -896,7 +896,7 @@ def convert_annotation_json_to_review_df(all_annotations:List[dict], redaction_d
 
     # Handle missing matches using a proximity-based approach
     #if merged_df['text'].isnull().sum() > 0:
-    print("Attempting tolerance-based merge for text")
+    #print("Attempting tolerance-based merge for text")
     # Convert coordinates to numpy arrays for KDTree lookup
     tree = cKDTree(df2[['xmin', 'ymin', 'xmax', 'ymax']].values)
     query_coords = df1[['xmin', 'ymin', 'xmax', 'ymax']].values
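For reference, the tolerance-based merge that survives in the second hunk matches each box in `df1` to its nearest box in `df2` in four-dimensional bounding-box space and copies the matched text across. A sketch of that step, assuming both frames carry relative `xmin`/`ymin`/`xmax`/`ymax` columns and `df2` has a `text` column; the tolerance value is an assumption, not the app's setting:

```python
import pandas as pd
from scipy.spatial import cKDTree

def merge_text_by_proximity(df1: pd.DataFrame, df2: pd.DataFrame, tolerance: float = 0.01):
    """Attach df2's 'text' to df1 rows whose boxes nearly coincide (sketch)."""
    tree = cKDTree(df2[['xmin', 'ymin', 'xmax', 'ymax']].values)
    query_coords = df1[['xmin', 'ymin', 'xmax', 'ymax']].values
    # Nearest neighbour in 4-D box space; an infinite distance means
    # no candidate fell within the tolerance.
    distances, indices = tree.query(query_coords, k=1, distance_upper_bound=tolerance)
    matched = distances != float('inf')
    df1 = df1.copy()
    df1.loc[matched, 'text'] = df2['text'].iloc[indices[matched]].values
    return df1
```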
tools/file_redaction.py CHANGED
@@ -141,6 +141,8 @@ def choose_and_run_redactor(file_paths:List[str],
     The function returns a redacted document along with processing logs.
     '''
     combined_out_message = ""
+    out_review_file_path = ""
+    pdf_file_name_with_ext = ""
     tic = time.perf_counter()
     all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
 
@@ -171,22 +173,46 @@ def choose_and_run_redactor(file_paths:List[str],
     #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
     review_out_file_paths = [prepared_pdf_file_paths[0]]
 
+    # Load/create allow list
+    # If string, assume file path
+    if isinstance(in_allow_list, str):
+        in_allow_list = pd.read_csv(in_allow_list)
+    # Now, should be a pandas dataframe format
+    if not in_allow_list.empty:
+        in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
+        print("In allow list after flattening:", in_allow_list_flat)
+    else:
+        in_allow_list_flat = []
+
+    # If string, assume file path
+    if isinstance(custom_recogniser_word_list, str):
+        custom_recogniser_word_list = pd.read_csv(custom_recogniser_word_list)
     if isinstance(custom_recogniser_word_list, pd.DataFrame):
         if not custom_recogniser_word_list.empty:
-            custom_recogniser_word_list = custom_recogniser_word_list.iloc[:, 0].tolist()
+            custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
         else:
-            # Handle the case where the DataFrame is empty
-            custom_recogniser_word_list = [] # or some default value
+            custom_recogniser_word_list_flat = []
 
         # Sort the strings in order from the longest string to the shortest
-        custom_recogniser_word_list = sorted(custom_recogniser_word_list, key=len, reverse=True)
+        custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
+
+        #print("custom_recogniser_word_list_flat:", custom_recogniser_word_list_flat)
 
+    # If string, assume file path
+    if isinstance(redact_whole_page_list, str):
+        redact_whole_page_list = pd.read_csv(redact_whole_page_list)
     if isinstance(redact_whole_page_list, pd.DataFrame):
         if not redact_whole_page_list.empty:
-            redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
+            #print("redact_whole_page_list:", redact_whole_page_list)
+            try:
+                redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].astype(int).tolist()
+            except Exception as e:
+                print("Could not convert whole page redaction data to number list due to:", e)
+                redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
         else:
-            # Handle the case where the DataFrame is empty
-            redact_whole_page_list = [] # or some default value
+            redact_whole_page_list_flat = []
+
+        #print("redact_whole_page_list_flat:", redact_whole_page_list_flat)
 
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -250,24 +276,13 @@ def choose_and_run_redactor(file_paths:List[str],
 
         return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
 
-    # Create allow list
-    # If string, assume file path
-    if isinstance(in_allow_list, str):
-        in_allow_list = pd.read_csv(in_allow_list)
-
-    if not in_allow_list.empty:
-        in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
-        #print("In allow list:", in_allow_list_flat)
-    else:
-        in_allow_list_flat = []
 
     # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
     if pii_identification_method == "AWS Comprehend":
         print("Trying to connect to AWS Comprehend service")
         if aws_access_key_textbox and aws_secret_key_textbox:
             print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
-            print("aws_access_key_textbox:", aws_access_key_textbox)
-            print("aws_secret_access_key:", aws_secret_key_textbox)
             comprehend_client = boto3.client('comprehend',
                 aws_access_key_id=aws_access_key_textbox,
                 aws_secret_access_key=aws_secret_key_textbox)
@@ -372,8 +387,8 @@ def choose_and_run_redactor(file_paths:List[str],
                 comprehend_query_number,
                 comprehend_client,
                 textract_client,
-                custom_recogniser_word_list,
-                redact_whole_page_list,
+                custom_recogniser_word_list_flat,
+                redact_whole_page_list_flat,
                 max_fuzzy_spelling_mistakes_num,
                 match_fuzzy_whole_phrase_bool,
                 log_files_output_paths=log_files_output_paths,
@@ -409,8 +424,8 @@ def choose_and_run_redactor(file_paths:List[str],
                 pii_identification_method,
                 comprehend_query_number,
                 comprehend_client,
-                custom_recogniser_word_list,
-                redact_whole_page_list,
+                custom_recogniser_word_list_flat,
+                redact_whole_page_list_flat,
                 max_fuzzy_spelling_mistakes_num,
                 match_fuzzy_whole_phrase_bool)
 
@@ -444,15 +459,6 @@ def choose_and_run_redactor(file_paths:List[str],
 
             out_file_paths.append(out_redacted_pdf_file_path)
 
-        #logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
-        #all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
-        #log_files_output_paths.append(logs_output_file_name)
-
-        # Convert OCR result bounding boxes to relative values
-        #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
-        #print("page_sizes:", page_sizes)
-        #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
-
         page_sizes_df = pd.DataFrame(page_sizes)
 
         page_sizes_df["page"] = page_sizes_df["page"].astype(int)
@@ -473,33 +479,26 @@ def choose_and_run_redactor(file_paths:List[str],
 
         # Save the gradio_annotation_boxes to a review csv file
         try:
-            #print("annotations_all_pages before in choose and run redactor:", annotations_all_pages)
-            #print("all_decision_process_table before in choose and run redactor:", all_decision_process_table)
-            #print("page_sizes before in choose and run redactor:", page_sizes)
 
-            review_df = convert_annotation_json_to_review_df(annotations_all_pages, all_decision_process_table, page_sizes)
+            review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_decision_process_table, page_sizes)
 
-            #print("annotation_all_pages:", annotations_all_pages)
-            #print("all_decision_process_table after in choose and run redactor:", all_decision_process_table)
-            #print("review_df after in choose and run redactor:", review_df)
-
-            review_df["page"] = review_df["page"].astype(int)
-            if "image_height" not in review_df.columns:
-                review_df = review_df.merge(page_sizes_df, on="page", how="left")
+            review_file_state["page"] = review_file_state["page"].astype(int)
+            if "image_height" not in review_file_state.columns:
+                review_file_state = review_file_state.merge(page_sizes_df, on="page", how="left")
 
             # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
-            if review_df["xmin"].max() >= 1 and review_df["xmax"].max() >= 1 and review_df["ymin"].max() >= 1 and review_df["ymax"].max() >= 1:
-                review_df["xmin"] = review_df["xmin"] / review_df["image_width"]
-                review_df["xmax"] = review_df["xmax"] / review_df["image_width"]
-                review_df["ymin"] = review_df["ymin"] / review_df["image_height"]
-                review_df["ymax"] = review_df["ymax"] / review_df["image_height"]
+            if review_file_state["xmin"].max() >= 1 and review_file_state["xmax"].max() >= 1 and review_file_state["ymin"].max() >= 1 and review_file_state["ymax"].max() >= 1:
+                review_file_state["xmin"] = review_file_state["xmin"] / review_file_state["image_width"]
+                review_file_state["xmax"] = review_file_state["xmax"] / review_file_state["image_width"]
+                review_file_state["ymin"] = review_file_state["ymin"] / review_file_state["image_height"]
+                review_file_state["ymax"] = review_file_state["ymax"] / review_file_state["image_height"]
 
             # Don't need page sizes in outputs
-            review_df.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
+            review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
 
-            #print("review_df:", review_df)
+            #print("review_file_state:", review_file_state)
 
-            review_df.to_csv(out_review_file_path, index=None)
+            review_file_state.to_csv(out_review_file_path, index=None)
             out_file_paths.append(out_review_file_path)
 
             #print("Saved review file to csv")
@@ -550,10 +549,15 @@ def choose_and_run_redactor(file_paths:List[str],
 
     # Ensure no duplicated output files
     log_files_output_paths = list(set(log_files_output_paths))
-    out_file_paths = list(set(out_file_paths))
-    review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
+    out_file_paths = list(set(out_file_paths))
+
+    # Output file paths
+    if not out_review_file_path:
+        review_out_file_paths = [prepared_pdf_file_paths[0]]
+    else:
+        review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
 
-    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_df, page_sizes, document_cropboxes
+    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
 
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
     '''
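The recurring pattern in this file's changes — accept either a CSV file path or a one-column dataframe, flatten it to a plain Python list, and sort deny-list terms longest-first so longer phrases match before their substrings — reduces to roughly the following; the helper name is illustrative, as the commit inlines this logic:

```python
import pandas as pd

def flatten_word_list(word_list) -> list:
    """Normalise a CSV path or one-column DataFrame into a sorted term list (sketch)."""
    if isinstance(word_list, str):  # assume a file path
        word_list = pd.read_csv(word_list, header=None)
    if isinstance(word_list, pd.DataFrame) and not word_list.empty:
        flat = word_list.iloc[:, 0].astype(str).tolist()
    else:
        flat = []
    # Longest terms first, so "John Smith" is redacted whole before
    # a match on "John" can consume part of the phrase.
    return sorted(flat, key=len, reverse=True)

print(flatten_word_list(pd.DataFrame({"deny_list": ["John", "John Smith"]})))
# ['John Smith', 'John']
```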
tools/helper_functions.py CHANGED
@@ -40,6 +40,9 @@ def load_in_default_allow_list(allow_list_file_path):
         allow_list_file_path = [allow_list_file_path]
     return allow_list_file_path
 
+def update_dataframe(df:pd.DataFrame):
+    df_copy = df.copy()
+    return df_copy
 
 def get_file_name_without_type(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
@@ -96,12 +99,12 @@ def ensure_output_folder_exists():
     else:
         print(f"The 'output/' folder already exists.")
 
-def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
+def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
     '''
     When file is loaded, update the column dropdown choices and write to relevant data states.
     '''
 
-    custom_regex = pd.DataFrame()
+    custom_regex_df = pd.DataFrame()
 
     if in_file:
         file_list = [string.name for string in in_file]
@@ -109,20 +112,25 @@ def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
         regex_file_names = [string for string in file_list if "csv" in string.lower()]
         if regex_file_names:
             regex_file_name = regex_file_names[0]
-            custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
+            custom_regex_df = pd.read_csv(regex_file_name, low_memory=False, header=None)
             #regex_file_name_no_ext = get_file_name_without_type(regex_file_name)
+
+            # Select just first columns
+            custom_regex_df = pd.DataFrame(custom_regex_df.iloc[:,[0]])
+            custom_regex_df.rename(columns={0:file_type}, inplace=True)
 
-            custom_regex.columns = custom_regex.columns.astype(str)
+            custom_regex_df.columns = custom_regex_df.columns.astype(str)
 
         output_text = file_type + " file loaded."
 
+        print("Custom regex df:", custom_regex_df)
         print(output_text)
     else:
         output_text = "No file provided."
         print(output_text)
-        return output_text, custom_regex
+        return output_text, custom_regex_df
 
-    return output_text, custom_regex
+    return output_text, custom_regex_df
 
 def put_columns_in_df(in_file):
     new_choices = []
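With this change `custom_regex_load` always returns a single-column dataframe whose header is the `file_type` string it was given, which is why app.py now passes component-specific names such as `deny_list`. A rough usage sketch, faking the `.name`-bearing object that Gradio's file upload normally supplies:

```python
import pandas as pd
from types import SimpleNamespace

from tools.helper_functions import custom_regex_load  # assumes the repo is on the path

# Write a throwaway one-column CSV and wrap it like a Gradio upload object.
pd.DataFrame(["John", "Jane Doe"]).to_csv("deny.csv", index=False, header=False)
uploaded = [SimpleNamespace(name="deny.csv")]

status, deny_df = custom_regex_load(uploaded, file_type="deny_list")
print(status)         # "deny_list file loaded."
print(list(deny_df))  # ['deny_list'] - the header now matches the Dataframe component
```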
tools/redaction_review.py CHANGED
@@ -152,7 +152,6 @@ def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, reco
     elif recogniser_dataframe_modified.iloc[0,0] == "":
         recogniser_dataframe_modified, recogniser_dataframe_out, recogniser_entities_dropdown_value, recogniser_entities_list, text_entities_drop, page_entities_drop = get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
     else:
-        print("recogniser dataframe is not empty")
         review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_modified, page_dropdown_value, text_dropdown_value)
         recogniser_dataframe_out = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"])
 
@@ -600,14 +599,12 @@ def reset_dropdowns():
     return gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True)
 
 def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
-    print("evt.row_value[0]:", evt.row_value[0])
 
     row_value_page = evt.row_value[0] # This is the page number value
 
     if isinstance(row_value_page, list):
         row_value_page = row_value_page[0]
 
-    print("row_value_page:", row_value_page)
     return row_value_page
 
 def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):