Commit 0e1a4a7 · 1 parent: 6319afc

Fixed manual entry for allow, deny, and full page redaction lists

Files changed:
- app.py (+16, -11)
- tools/file_conversion.py (+3, -3)
- tools/file_redaction.py (+57, -53)
- tools/helper_functions.py (+14, -6)
- tools/redaction_review.py (+0, -3)
app.py CHANGED

@@ -11,7 +11,7 @@ from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
 
 from tools.config import output_folder, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, DEFAULT_ALLOW_LIST_PATH
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names

@@ -145,11 +145,11 @@ with app:
     ## Settings page variables
     default_deny_list_file_name = "default_deny_list.csv"
     default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
-    in_deny_list_text_in = gr.Textbox(value="
+    in_deny_list_text_in = gr.Textbox(value="deny_list", visible=False)
 
     fully_redacted_list_file_name = "default_fully_redacted_list.csv"
     fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
-    in_fully_redacted_text_in = gr.Textbox(value="
+    in_fully_redacted_text_in = gr.Textbox(value="fully_redacted_pages_list", visible=False)
 
     # S3 settings for default allow list load
     s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)

@@ -337,19 +337,19 @@ with app:
     with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
         with gr.Row():
             with gr.Column():
-                in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case
+                in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will not be redacted.", file_count="multiple", height=file_input_height)
                 in_allow_list_text = gr.Textbox(label="Custom allow list load status")
             with gr.Column():
-                in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case
+                in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.", file_count="multiple", height=file_input_height)
                 in_deny_list_text = gr.Textbox(label="Custom deny list load status")
             with gr.Column():
                 in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=file_input_height)
                 in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
-        with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists", open = False):
+        with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists (NOTE: you need to press Enter after modifying/adding an entry to the lists to apply them)", open = False):
             with gr.Row():
-                in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=
-                in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=
-                in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=
+                in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Allow list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True)
+                in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["deny_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Deny list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True)
+                in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number')
 
     with gr.Accordion("Select entity types to redact", open = True):
         in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")

@@ -504,10 +504,10 @@ with app:
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
         success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
 
-    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities,
+    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
 
     # If the output file count text box changes, keep going with redacting each data file until done
-    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities,
+    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
         success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
     ###

@@ -523,6 +523,11 @@ with app:
     in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
     in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
 
+    # The following allows for more reliable updates of the data in the custom list dataframes
+    in_allow_list_state.input(update_dataframe, inputs=[in_allow_list_state], outputs=[in_allow_list_state])
+    in_deny_list_state.input(update_dataframe, inputs=[in_deny_list_state], outputs=[in_deny_list_state])
+    in_fully_redacted_list_state.input(update_dataframe, inputs=[in_fully_redacted_list_state], outputs=[in_fully_redacted_list_state])
+
    # Merge multiple review csv files together
    merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
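Note on the `.input()` wiring in the last hunk: a Gradio Dataframe only propagates a manual cell edit into downstream callbacks reliably once an event handler re-emits the component's value, which appears to be the point of `update_dataframe` and of the "press Enter to apply" note in the accordion title. A minimal standalone sketch of the same pattern; the `word_list` and `demo` names are illustrative, not from the app:

import gradio as gr
import pandas as pd

def update_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    # Returning a copy forces Gradio to re-serialise the edited value,
    # so manually entered rows survive into later callbacks.
    return df.copy()

with gr.Blocks() as demo:
    # One-column editable list, mirroring the allow/deny list dataframes above
    word_list = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"],
                             col_count=(1, "fixed"), row_count=(0, "dynamic"),
                             interactive=True)
    # Re-emit the dataframe on every manual edit (press Enter to commit a cell)
    word_list.input(update_dataframe, inputs=[word_list], outputs=[word_list])

if __name__ == "__main__":
    demo.launch()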
tools/file_conversion.py CHANGED

@@ -827,14 +827,14 @@ def convert_annotation_json_to_review_df(all_annotations:List[dict], redaction_d
     # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
     if "xmin" in review_file_df.columns:
         if review_file_df["xmin"].max() >= 1 and review_file_df["xmax"].max() >= 1 and review_file_df["ymin"].max() >= 1 and review_file_df["ymax"].max() >= 1:
-            print("review file df has large coordinates")
+            #print("review file df has large coordinates")
             review_file_df["page"] = review_file_df["page"].astype(int)
 
             if "image_width" not in review_file_df.columns and not page_sizes_df.empty:
                 review_file_df = review_file_df.merge(page_sizes_df, on="page", how="left")
 
             if "image_width" in review_file_df.columns:
-                print("Dividing coordinates in review file")
+                #print("Dividing coordinates in review file")
                 review_file_df["xmin"] = review_file_df["xmin"] / review_file_df["image_width"]
                 review_file_df["xmax"] = review_file_df["xmax"] / review_file_df["image_width"]
                 review_file_df["ymin"] = review_file_df["ymin"] / review_file_df["image_height"]

@@ -896,7 +896,7 @@ def convert_annotation_json_to_review_df(all_annotations:List[dict], redaction_d
 
     # Handle missing matches using a proximity-based approach
     #if merged_df['text'].isnull().sum() > 0:
-    print("Attempting tolerance-based merge for text")
+    #print("Attempting tolerance-based merge for text")
     # Convert coordinates to numpy arrays for KDTree lookup
     tree = cKDTree(df2[['xmin', 'ymin', 'xmax', 'ymax']].values)
     query_coords = df1[['xmin', 'ymin', 'xmax', 'ymax']].values
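Both hunks above sit inside the same normalisation step: annotation boxes may arrive in absolute pixel coordinates, and dividing by the page's image dimensions brings them back to the 0-1 relative range used elsewhere. A small self-contained sketch of that conversion, assuming the review-file column names shown in the diff:

import pandas as pd

def to_relative_coords(review_df: pd.DataFrame) -> pd.DataFrame:
    """Convert absolute pixel bounding boxes to relative (0-1) coordinates."""
    df = review_df.copy()
    # Heuristic from the diff: if every coordinate column's max is >= 1, the
    # boxes are in absolute image space rather than already-relative space.
    if df[["xmin", "xmax", "ymin", "ymax"]].max().min() >= 1:
        df["xmin"] = df["xmin"] / df["image_width"]
        df["xmax"] = df["xmax"] / df["image_width"]
        df["ymin"] = df["ymin"] / df["image_height"]
        df["ymax"] = df["ymax"] / df["image_height"]
    return df

boxes = pd.DataFrame({"xmin": [100], "xmax": [300], "ymin": [50], "ymax": [80],
                      "image_width": [1000], "image_height": [800]})
print(to_relative_coords(boxes))  # xmin=0.1, xmax=0.3, ymin=0.0625, ymax=0.1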
tools/file_redaction.py CHANGED

@@ -141,6 +141,8 @@ def choose_and_run_redactor(file_paths:List[str],
     The function returns a redacted document along with processing logs.
     '''
     combined_out_message = ""
+    out_review_file_path = ""
+    pdf_file_name_with_ext = ""
     tic = time.perf_counter()
     all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
 

@@ -171,22 +173,46 @@ def choose_and_run_redactor(file_paths:List[str],
     #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
     review_out_file_paths = [prepared_pdf_file_paths[0]]
 
+    # Load/create allow list
+    # If string, assume file path
+    if isinstance(in_allow_list, str):
+        in_allow_list = pd.read_csv(in_allow_list)
+    # Now, should be a pandas dataframe format
+    if not in_allow_list.empty:
+        in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
+        print("In allow list after flattening:", in_allow_list_flat)
+    else:
+        in_allow_list_flat = []
+
+    # If string, assume file path
+    if isinstance(custom_recogniser_word_list, str):
+        custom_recogniser_word_list = pd.read_csv(custom_recogniser_word_list)
     if isinstance(custom_recogniser_word_list, pd.DataFrame):
         if not custom_recogniser_word_list.empty:
+            custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
         else:
-            custom_recogniser_word_list = [] # or some default value
+            custom_recogniser_word_list_flat = []
 
     # Sort the strings in order from the longest string to the shortest
+    custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
+
+    #print("custom_recogniser_word_list_flat:", custom_recogniser_word_list_flat)
 
+    # If string, assume file path
+    if isinstance(redact_whole_page_list, str):
+        redact_whole_page_list = pd.read_csv(redact_whole_page_list)
     if isinstance(redact_whole_page_list, pd.DataFrame):
         if not redact_whole_page_list.empty:
-            redact_whole_page_list
+            #print("redact_whole_page_list:", redact_whole_page_list)
+            try:
+                redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].astype(int).tolist()
+            except Exception as e:
+                print("Could not convert whole page redaction data to number list due to:", e)
+                redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
         else:
+            redact_whole_page_list_flat = []
+
+    #print("redact_whole_page_list_flat:", redact_whole_page_list_flat)
 
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:

@@ -250,24 +276,13 @@ def choose_and_run_redactor(file_paths:List[str],
 
         return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
 
-    # Create allow list
-    # If string, assume file path
-    if isinstance(in_allow_list, str):
-        in_allow_list = pd.read_csv(in_allow_list)
 
-    if not in_allow_list.empty:
-        in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
-        #print("In allow list:", in_allow_list_flat)
-    else:
-        in_allow_list_flat = []
 
     # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
     if pii_identification_method == "AWS Comprehend":
         print("Trying to connect to AWS Comprehend service")
         if aws_access_key_textbox and aws_secret_key_textbox:
             print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
-            print("aws_access_key_textbox:", aws_access_key_textbox)
-            print("aws_secret_access_key:", aws_secret_key_textbox)
             comprehend_client = boto3.client('comprehend',
                 aws_access_key_id=aws_access_key_textbox,
                 aws_secret_access_key=aws_secret_key_textbox)

@@ -372,8 +387,8 @@ def choose_and_run_redactor(file_paths:List[str],
                 comprehend_query_number,
                 comprehend_client,
                 textract_client,
+                custom_recogniser_word_list_flat,
+                redact_whole_page_list_flat,
                 max_fuzzy_spelling_mistakes_num,
                 match_fuzzy_whole_phrase_bool,
                 log_files_output_paths=log_files_output_paths,

@@ -409,8 +424,8 @@ def choose_and_run_redactor(file_paths:List[str],
                 pii_identification_method,
                 comprehend_query_number,
                 comprehend_client,
+                custom_recogniser_word_list_flat,
+                redact_whole_page_list_flat,
                 max_fuzzy_spelling_mistakes_num,
                 match_fuzzy_whole_phrase_bool)
 

@@ -444,15 +459,6 @@ def choose_and_run_redactor(file_paths:List[str],
 
             out_file_paths.append(out_redacted_pdf_file_path)
 
-            #logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
-            #all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
-            #log_files_output_paths.append(logs_output_file_name)
-
-            # Convert OCR result bounding boxes to relative values
-            #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
-            #print("page_sizes:", page_sizes)
-            #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
-
             page_sizes_df = pd.DataFrame(page_sizes)
 
             page_sizes_df["page"] = page_sizes_df["page"].astype(int)

@@ -473,33 +479,26 @@ def choose_and_run_redactor(file_paths:List[str],
 
         # Save the gradio_annotation_boxes to a review csv file
         try:
-            #print("annotations_all_pages before in choose and run redactor:", annotations_all_pages)
-            #print("all_decision_process_table before in choose and run redactor:", all_decision_process_table)
-            #print("page_sizes before in choose and run redactor:", page_sizes)
-            review_df["page"] = review_df["page"].astype(int)
-            if "image_height" not in review_df.columns:
-                review_df = review_df.merge(page_sizes_df, on="page", how="left")
+            review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_decision_process_table, page_sizes)
+
+            review_file_state["page"] = review_file_state["page"].astype(int)
+            if "image_height" not in review_file_state.columns:
+                review_file_state = review_file_state.merge(page_sizes_df, on="page", how="left")
 
             # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
-            if
+            if review_file_state["xmin"].max() >= 1 and review_file_state["xmax"].max() >= 1 and review_file_state["ymin"].max() >= 1 and review_file_state["ymax"].max() >= 1:
+                review_file_state["xmin"] = review_file_state["xmin"] / review_file_state["image_width"]
+                review_file_state["xmax"] = review_file_state["xmax"] / review_file_state["image_width"]
+                review_file_state["ymin"] = review_file_state["ymin"] / review_file_state["image_height"]
+                review_file_state["ymax"] = review_file_state["ymax"] / review_file_state["image_height"]
 
             # Don't need page sizes in outputs
+            review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
 
-            #print("
+            #print("review_file_state:", review_file_state)
 
+            review_file_state.to_csv(out_review_file_path, index=None)
             out_file_paths.append(out_review_file_path)
 
             #print("Saved review file to csv")

@@ -550,10 +549,15 @@ def choose_and_run_redactor(file_paths:List[str],
 
     # Ensure no duplicated output files
     log_files_output_paths = list(set(log_files_output_paths))
-    out_file_paths = list(set(out_file_paths))
+    out_file_paths = list(set(out_file_paths))
+
+    # Output file paths
+    if not out_review_file_path:
+        review_out_file_paths = [prepared_pdf_file_paths[0]]
+    else:
+        review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
 
-    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths,
+    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
 
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
     '''
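A pattern repeated through this file: each list argument may arrive either as a CSV file path (from an upload) or as a pandas DataFrame (from the manually edited dataframes), and is flattened to a plain Python list before use; the deny list is additionally sorted longest-first so longer phrases match before their substrings, and page numbers are coerced to integers. A hedged sketch of that normalisation under those assumptions; `flatten_list_input` is an illustrative name, not a function in the repo:

import pandas as pd

def flatten_list_input(value, as_int: bool = False) -> list:
    # If string, assume it is a CSV file path
    if isinstance(value, str):
        value = pd.read_csv(value)
    if isinstance(value, pd.DataFrame) and not value.empty:
        col = value.iloc[:, 0]
        if as_int:
            try:
                return col.astype(int).tolist()
            except (ValueError, TypeError) as e:
                print("Could not convert values to integers:", e)
        return col.tolist()
    return []  # default when nothing was provided

deny_terms = flatten_list_input(pd.DataFrame({"deny_list": ["John Smith", "Jo"]}))
# Longest-first sort so longer phrases are matched before their substrings
deny_terms = sorted(deny_terms, key=len, reverse=True)
pages = flatten_list_input(pd.DataFrame({"fully_redacted_pages_list": ["1", "3"]}), as_int=True)
print(deny_terms, pages)  # ['John Smith', 'Jo'] [1, 3]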
tools/helper_functions.py CHANGED

@@ -40,6 +40,9 @@ def load_in_default_allow_list(allow_list_file_path):
         allow_list_file_path = [allow_list_file_path]
     return allow_list_file_path
 
+def update_dataframe(df:pd.DataFrame):
+    df_copy = df.copy()
+    return df_copy
 
 def get_file_name_without_type(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")

@@ -96,12 +99,12 @@ def ensure_output_folder_exists():
     else:
         print(f"The 'output/' folder already exists.")
 
-def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
+def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
     '''
     When file is loaded, update the column dropdown choices and write to relevant data states.
    '''
 
+    custom_regex_df = pd.DataFrame()
 
     if in_file:
         file_list = [string.name for string in in_file]

@@ -109,20 +112,25 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
         regex_file_names = [string for string in file_list if "csv" in string.lower()]
         if regex_file_names:
             regex_file_name = regex_file_names[0]
+            custom_regex_df = pd.read_csv(regex_file_name, low_memory=False, header=None)
             #regex_file_name_no_ext = get_file_name_without_type(regex_file_name)
 
+            # Select just first columns
+            custom_regex_df = pd.DataFrame(custom_regex_df.iloc[:,[0]])
+            custom_regex_df.rename(columns={0:file_type}, inplace=True)
 
+            custom_regex_df.columns = custom_regex_df.columns.astype(str)
 
             output_text = file_type + " file loaded."
 
+            print("Custom regex df:", custom_regex_df)
             print(output_text)
         else:
             output_text = "No file provided."
             print(output_text)
-            return output_text,
+            return output_text, custom_regex_df
 
-    return output_text,
+    return output_text, custom_regex_df
 
 def put_columns_in_df(in_file):
     new_choices = []
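The reworked `custom_regex_load` reads the uploaded CSV headerless, keeps only the first column, and renames it to the `file_type` string, which is presumably why `in_deny_list_text_in` and `in_fully_redacted_text_in` in app.py now hold "deny_list" and "fully_redacted_pages_list": the loaded column header then matches the headers of the editable dataframes. A small sketch of that load path; `load_first_column` is an illustrative stand-in, not the repo's function:

import io
import pandas as pd

def load_first_column(csv_source, file_type: str = "allow_list") -> pd.DataFrame:
    # Read without headers so the first data row is not swallowed as a header
    df = pd.read_csv(csv_source, header=None, low_memory=False)
    # Keep just the first column and name it after the target list
    df = pd.DataFrame(df.iloc[:, [0]])
    df.rename(columns={0: file_type}, inplace=True)
    df.columns = df.columns.astype(str)
    return df

sample = io.StringIO("John Smith\nAcme Ltd\n")
print(load_first_column(sample, "deny_list"))
#     deny_list
# 0  John Smith
# 1    Acme Ltd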
tools/redaction_review.py CHANGED

@@ -152,7 +152,6 @@ def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, reco
     elif recogniser_dataframe_modified.iloc[0,0] == "":
         recogniser_dataframe_modified, recogniser_dataframe_out, recogniser_entities_dropdown_value, recogniser_entities_list, text_entities_drop, page_entities_drop = get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
     else:
-        print("recogniser dataframe is not empty")
         review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_modified, page_dropdown_value, text_dropdown_value)
         recogniser_dataframe_out = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"])
 

@@ -600,14 +599,12 @@ def reset_dropdowns():
     return gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True)
 
 def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
-    print("evt.row_value[0]:", evt.row_value[0])
 
     row_value_page = evt.row_value[0] # This is the page number value
 
     if isinstance(row_value_page, list):
         row_value_page = row_value_page[0]
 
-    print("row_value_page:", row_value_page)
     return row_value_page
 
 def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):