Commit 0e1a4a7
Parent: 6319afc

Fixed manual entry for allow, deny, and full page redaction lists

Files changed:
- app.py (+16 -11)
- tools/file_conversion.py (+3 -3)
- tools/file_redaction.py (+57 -53)
- tools/helper_functions.py (+14 -6)
- tools/redaction_review.py (+0 -3)
app.py CHANGED

@@ -11,7 +11,7 @@ from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
 
 from tools.config import output_folder, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, DEFAULT_ALLOW_LIST_PATH
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names

@@ -145,11 +145,11 @@ with app:
     ## Settings page variables
     default_deny_list_file_name = "default_deny_list.csv"
     default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
-    in_deny_list_text_in = gr.Textbox(value="…
+    in_deny_list_text_in = gr.Textbox(value="deny_list", visible=False)
 
     fully_redacted_list_file_name = "default_fully_redacted_list.csv"
     fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
-    in_fully_redacted_text_in = gr.Textbox(value="…
+    in_fully_redacted_text_in = gr.Textbox(value="fully_redacted_pages_list", visible=False)
 
     # S3 settings for default allow list load
     s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)

@@ -337,19 +337,19 @@ with app:
     with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
         with gr.Row():
             with gr.Column():
-                in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case …
+                in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will not be redacted.", file_count="multiple", height=file_input_height)
                 in_allow_list_text = gr.Textbox(label="Custom allow list load status")
             with gr.Column():
-                in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case …
+                in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.", file_count="multiple", height=file_input_height)
                 in_deny_list_text = gr.Textbox(label="Custom deny list load status")
             with gr.Column():
                 in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=file_input_height)
                 in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
-        with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists", open = False):
+        with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists (NOTE: you need to press Enter after modifying/adding an entry to the lists to apply them)", open = False):
            with gr.Row():
-                in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=…
+                in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Allow list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True)
-                in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=…
+                in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["deny_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Deny list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True)
-                in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=…
+                in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number')
 
     with gr.Accordion("Select entity types to redact", open = True):
         in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")

@@ -504,10 +504,10 @@ with app:
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
         success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
 
-    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, …
+    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
 
     # If the output file count text box changes, keep going with redacting each data file until done
-    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, …
+    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
         success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
     ###

@@ -523,6 +523,11 @@ with app:
     in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
     in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
 
+    # The following allows for more reliable updates of the data in the custom list dataframes
+    in_allow_list_state.input(update_dataframe, inputs=[in_allow_list_state], outputs=[in_allow_list_state])
+    in_deny_list_state.input(update_dataframe, inputs=[in_deny_list_state], outputs=[in_deny_list_state])
+    in_fully_redacted_list_state.input(update_dataframe, inputs=[in_fully_redacted_list_state], outputs=[in_fully_redacted_list_state])
+
     # Merge multiple review csv files together
     merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
 
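The manual-entry fix in app.py rests on the three `.input()` bindings in the final hunk: every user edit to one of the list dataframes is routed through `update_dataframe` (the identity-copy helper added in tools/helper_functions.py) and written straight back to the same component, so the edit is committed to the value the redaction functions later read. A minimal sketch of the pattern, assuming a recent Gradio 4.x API; the component names here are illustrative, not the app's own:

    import gradio as gr
    import pandas as pd

    def update_dataframe(df: pd.DataFrame) -> pd.DataFrame:
        # Returning a copy makes Gradio re-serialise the component value, so
        # manually typed rows are committed rather than left as pending edits.
        return df.copy()

    with gr.Blocks() as demo:
        deny_list_df = gr.Dataframe(value=pd.DataFrame(), headers=["deny_list"],
                                    col_count=(1, "fixed"), row_count=(0, "dynamic"),
                                    type="pandas", interactive=True)
        # .input fires only on user edits; .change would also fire on
        # programmatic updates such as loading a CSV into the component.
        deny_list_df.input(update_dataframe, inputs=[deny_list_df], outputs=[deny_list_df])

    if __name__ == "__main__":
        demo.launch()

The accordion label change in the same file documents the remaining caveat: a dataframe cell edit is only registered once the user presses Enter, which is why the UI now says so explicitly.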
tools/file_conversion.py CHANGED

@@ -827,14 +827,14 @@ def convert_annotation_json_to_review_df(all_annotations:List[dict], redaction_d
     # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
     if "xmin" in review_file_df.columns:
         if review_file_df["xmin"].max() >= 1 and review_file_df["xmax"].max() >= 1 and review_file_df["ymin"].max() >= 1 and review_file_df["ymax"].max() >= 1:
-            print("review file df has large coordinates")
+            #print("review file df has large coordinates")
             review_file_df["page"] = review_file_df["page"].astype(int)
 
             if "image_width" not in review_file_df.columns and not page_sizes_df.empty:
                 review_file_df = review_file_df.merge(page_sizes_df, on="page", how="left")
 
             if "image_width" in review_file_df.columns:
-                print("Dividing coordinates in review file")
+                #print("Dividing coordinates in review file")
                 review_file_df["xmin"] = review_file_df["xmin"] / review_file_df["image_width"]
                 review_file_df["xmax"] = review_file_df["xmax"] / review_file_df["image_width"]
                 review_file_df["ymin"] = review_file_df["ymin"] / review_file_df["image_height"]

@@ -896,7 +896,7 @@ def convert_annotation_json_to_review_df(all_annotations:List[dict], redaction_d
 
     # Handle missing matches using a proximity-based approach
     #if merged_df['text'].isnull().sum() > 0:
-    print("Attempting tolerance-based merge for text")
+    #print("Attempting tolerance-based merge for text")
     # Convert coordinates to numpy arrays for KDTree lookup
     tree = cKDTree(df2[['xmin', 'ymin', 'xmax', 'ymax']].values)
     query_coords = df1[['xmin', 'ymin', 'xmax', 'ymax']].values
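Both hunks in this file only demote debug prints to comments; the surrounding logic is the coordinate-normalisation step of `convert_annotation_json_to_review_df`. When every bounding-box coordinate is at least 1, the values are treated as absolute image pixels and divided by the page image dimensions to recover the 0 to 1 relative form the review UI expects. A standalone sketch of that check, with made-up numbers:

    import pandas as pd

    review_file_df = pd.DataFrame({
        "page": [1], "xmin": [50.0], "ymin": [40.0], "xmax": [200.0], "ymax": [90.0],
        "image_width": [800], "image_height": [600],
    })

    # Relative coordinates always fall in [0, 1], so maxima >= 1 in all four
    # coordinate columns signal absolute pixel coordinates.
    if (review_file_df["xmin"].max() >= 1 and review_file_df["xmax"].max() >= 1
            and review_file_df["ymin"].max() >= 1 and review_file_df["ymax"].max() >= 1):
        review_file_df["xmin"] /= review_file_df["image_width"]
        review_file_df["xmax"] /= review_file_df["image_width"]
        review_file_df["ymin"] /= review_file_df["image_height"]
        review_file_df["ymax"] /= review_file_df["image_height"]

    print(review_file_df[["xmin", "ymin", "xmax", "ymax"]])
    #      xmin      ymin  xmax  ymax
    # 0  0.0625  0.066667  0.25  0.15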
tools/file_redaction.py CHANGED

@@ -141,6 +141,8 @@ def choose_and_run_redactor(file_paths:List[str],
     The function returns a redacted document along with processing logs.
     '''
     combined_out_message = ""
+    out_review_file_path = ""
+    pdf_file_name_with_ext = ""
     tic = time.perf_counter()
     all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
 

@@ -171,22 +173,46 @@ def choose_and_run_redactor(file_paths:List[str],
     #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
     review_out_file_paths = [prepared_pdf_file_paths[0]]
 
+    # Load/create allow list
+    # If string, assume file path
+    if isinstance(in_allow_list, str):
+        in_allow_list = pd.read_csv(in_allow_list)
+    # Now, should be a pandas dataframe format
+    if not in_allow_list.empty:
+        in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
+        print("In allow list after flattening:", in_allow_list_flat)
+    else:
+        in_allow_list_flat = []
+
+    # If string, assume file path
+    if isinstance(custom_recogniser_word_list, str):
+        custom_recogniser_word_list = pd.read_csv(custom_recogniser_word_list)
     if isinstance(custom_recogniser_word_list, pd.DataFrame):
         if not custom_recogniser_word_list.empty:
-            …
+            custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
         else:
-            …
-            custom_recogniser_word_list = [] # or some default value
+            custom_recogniser_word_list_flat = []
 
     # Sort the strings in order from the longest string to the shortest
-    …
+    custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
+
+    #print("custom_recogniser_word_list_flat:", custom_recogniser_word_list_flat)
 
+    # If string, assume file path
+    if isinstance(redact_whole_page_list, str):
+        redact_whole_page_list = pd.read_csv(redact_whole_page_list)
     if isinstance(redact_whole_page_list, pd.DataFrame):
         if not redact_whole_page_list.empty:
-            redact_whole_page_list…
+            #print("redact_whole_page_list:", redact_whole_page_list)
+            try:
+                redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].astype(int).tolist()
+            except Exception as e:
+                print("Could not convert whole page redaction data to number list due to:", e)
+                redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
         else:
-            …
-            …
+            redact_whole_page_list_flat = []
+
+    #print("redact_whole_page_list_flat:", redact_whole_page_list_flat)
 
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:

@@ -250,24 +276,13 @@ def choose_and_run_redactor(file_paths:List[str],
 
         return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
 
-    # Create allow list
-    # If string, assume file path
-    if isinstance(in_allow_list, str):
-        in_allow_list = pd.read_csv(in_allow_list)
 
-    if not in_allow_list.empty:
-        in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
-        #print("In allow list:", in_allow_list_flat)
-    else:
-        in_allow_list_flat = []
 
     # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
     if pii_identification_method == "AWS Comprehend":
         print("Trying to connect to AWS Comprehend service")
         if aws_access_key_textbox and aws_secret_key_textbox:
             print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
-            print("aws_access_key_textbox:", aws_access_key_textbox)
-            print("aws_secret_access_key:", aws_secret_key_textbox)
             comprehend_client = boto3.client('comprehend',
                 aws_access_key_id=aws_access_key_textbox,
                 aws_secret_access_key=aws_secret_key_textbox)

@@ -372,8 +387,8 @@ def choose_and_run_redactor(file_paths:List[str],
                 comprehend_query_number,
                 comprehend_client,
                 textract_client,
-                …
-                …
+                custom_recogniser_word_list_flat,
+                redact_whole_page_list_flat,
                 max_fuzzy_spelling_mistakes_num,
                 match_fuzzy_whole_phrase_bool,
                 log_files_output_paths=log_files_output_paths,

@@ -409,8 +424,8 @@ def choose_and_run_redactor(file_paths:List[str],
                 pii_identification_method,
                 comprehend_query_number,
                 comprehend_client,
-                …
-                …
+                custom_recogniser_word_list_flat,
+                redact_whole_page_list_flat,
                 max_fuzzy_spelling_mistakes_num,
                 match_fuzzy_whole_phrase_bool)

@@ -444,15 +459,6 @@ def choose_and_run_redactor(file_paths:List[str],
 
             out_file_paths.append(out_redacted_pdf_file_path)
 
-            #logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
-            #all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
-            #log_files_output_paths.append(logs_output_file_name)
-
-            # Convert OCR result bounding boxes to relative values
-            #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
-            #print("page_sizes:", page_sizes)
-            #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
-
             page_sizes_df = pd.DataFrame(page_sizes)
 
             page_sizes_df["page"] = page_sizes_df["page"].astype(int)

@@ -473,33 +479,26 @@ def choose_and_run_redactor(file_paths:List[str],
 
             # Save the gradio_annotation_boxes to a review csv file
             try:
-                #print("annotations_all_pages before in choose and run redactor:", annotations_all_pages)
-                #print("all_decision_process_table before in choose and run redactor:", all_decision_process_table)
-                #print("page_sizes before in choose and run redactor:", page_sizes)
 
-                …
+                review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_decision_process_table, page_sizes)
 
-                …
-                …
-                …
-                …
-                review_df["page"] = review_df["page"].astype(int)
-                if "image_height" not in review_df.columns:
-                    review_df = review_df.merge(page_sizes_df, on="page", how="left")
+                review_file_state["page"] = review_file_state["page"].astype(int)
+                if "image_height" not in review_file_state.columns:
+                    review_file_state = review_file_state.merge(page_sizes_df, on="page", how="left")
 
                 # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
-                if …
-                …
-                …
-                …
-                …
+                if review_file_state["xmin"].max() >= 1 and review_file_state["xmax"].max() >= 1 and review_file_state["ymin"].max() >= 1 and review_file_state["ymax"].max() >= 1:
+                    review_file_state["xmin"] = review_file_state["xmin"] / review_file_state["image_width"]
+                    review_file_state["xmax"] = review_file_state["xmax"] / review_file_state["image_width"]
+                    review_file_state["ymin"] = review_file_state["ymin"] / review_file_state["image_height"]
+                    review_file_state["ymax"] = review_file_state["ymax"] / review_file_state["image_height"]
 
                 # Don't need page sizes in outputs
-                …
+                review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
 
-                #print("…
+                #print("review_file_state:", review_file_state)
 
-                …
+                review_file_state.to_csv(out_review_file_path, index=None)
                 out_file_paths.append(out_review_file_path)
 
                 #print("Saved review file to csv")

@@ -550,10 +549,15 @@ def choose_and_run_redactor(file_paths:List[str],
 
     # Ensure no duplicated output files
    log_files_output_paths = list(set(log_files_output_paths))
-    out_file_paths = list(set(out_file_paths))
-
+    out_file_paths = list(set(out_file_paths))
+
+    # Output file paths
+    if not out_review_file_path:
+        review_out_file_paths = [prepared_pdf_file_paths[0]]
+    else:
+        review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
 
-    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, …
+    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
 
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
     '''
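The substance of the change to `choose_and_run_redactor` is that the allow list, the deny list (custom recogniser words), and the whole-page redaction list now all pass through the same normalisation before redaction starts: a CSV path is read into a dataframe, the first column is flattened to a plain Python list, and anything empty falls back to `[]`. A self-contained sketch of that pattern; `flatten_first_column` is an illustrative helper, not a function in this repo:

    from typing import List, Union

    import pandas as pd

    def flatten_first_column(value: Union[str, pd.DataFrame], numeric: bool = False) -> List:
        if isinstance(value, str):      # if string, assume it is a CSV file path
            value = pd.read_csv(value)
        if isinstance(value, pd.DataFrame) and not value.empty:
            column = value.iloc[:, 0]
            if numeric:                 # whole-page lists should be page numbers (ints)
                try:
                    return column.astype(int).tolist()
                except Exception as e:
                    print("Could not convert to a number list due to:", e)
            return column.tolist()
        return []                       # default when nothing was supplied

    deny_df = pd.DataFrame({"deny_list": ["John", "John Smith"]})
    # Deny terms are sorted longest-first so multi-word phrases match before
    # their substrings.
    print(sorted(flatten_first_column(deny_df), key=len, reverse=True))
    # ['John Smith', 'John']

    pages_df = pd.DataFrame({"fully_redacted_pages_list": ["2", "5"]})
    print(flatten_first_column(pages_df, numeric=True))  # [2, 5]

Moving this normalisation earlier in the function and passing the `_flat` lists into the redaction calls is what lets manually entered dataframe rows actually reach the redactor.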
tools/helper_functions.py CHANGED

@@ -40,6 +40,9 @@ def load_in_default_allow_list(allow_list_file_path):
         allow_list_file_path = [allow_list_file_path]
     return allow_list_file_path
 
+def update_dataframe(df:pd.DataFrame):
+    df_copy = df.copy()
+    return df_copy
 
 def get_file_name_without_type(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")

@@ -96,12 +99,12 @@ def ensure_output_folder_exists():
     else:
         print(f"The 'output/' folder already exists.")
 
-def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
+def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
     '''
     When file is loaded, update the column dropdown choices and write to relevant data states.
     '''
 
-    …
+    custom_regex_df = pd.DataFrame()
 
     if in_file:
         file_list = [string.name for string in in_file]

@@ -109,20 +112,25 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
         regex_file_names = [string for string in file_list if "csv" in string.lower()]
         if regex_file_names:
             regex_file_name = regex_file_names[0]
-            …
+            custom_regex_df = pd.read_csv(regex_file_name, low_memory=False, header=None)
             #regex_file_name_no_ext = get_file_name_without_type(regex_file_name)
+
+            # Select just first columns
+            custom_regex_df = pd.DataFrame(custom_regex_df.iloc[:,[0]])
+            custom_regex_df.rename(columns={0:file_type}, inplace=True)
 
-            …
+            custom_regex_df.columns = custom_regex_df.columns.astype(str)
 
             output_text = file_type + " file loaded."
 
+            print("Custom regex df:", custom_regex_df)
             print(output_text)
     else:
         output_text = "No file provided."
         print(output_text)
-        return output_text, …
+        return output_text, custom_regex_df
 
-    return output_text, …
+    return output_text, custom_regex_df
 
 def put_columns_in_df(in_file):
     new_choices = []
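In `custom_regex_load`, reading the uploaded CSV with `header=None` is the detail that makes one-column lists behave: a file whose first row is a real term no longer loses that row to an inferred header, and renaming column `0` to the `file_type` string ("allow_list", "deny_list", or "fully_redacted_pages_list", fed in from the hidden textboxes in app.py) keeps the dataframe's header in step with the `gr.Dataframe` components. A short sketch of that load path; the file name is hypothetical:

    import pandas as pd

    file_type = "deny_list"  # supplied by the hidden textbox wired to the change event

    # header=None: treat the first CSV row as data, not as column names.
    custom_regex_df = pd.read_csv("my_deny_list.csv", low_memory=False, header=None)

    custom_regex_df = pd.DataFrame(custom_regex_df.iloc[:, [0]])   # keep only the first column
    custom_regex_df.rename(columns={0: file_type}, inplace=True)   # e.g. header "deny_list"
    custom_regex_df.columns = custom_regex_df.columns.astype(str)  # string headers to match the Gradio dataframes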
tools/redaction_review.py CHANGED

@@ -152,7 +152,6 @@ def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, reco
     elif recogniser_dataframe_modified.iloc[0,0] == "":
         recogniser_dataframe_modified, recogniser_dataframe_out, recogniser_entities_dropdown_value, recogniser_entities_list, text_entities_drop, page_entities_drop = get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
     else:
-        print("recogniser dataframe is not empty")
         review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_modified, page_dropdown_value, text_dropdown_value)
         recogniser_dataframe_out = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"])

@@ -600,14 +599,12 @@ def reset_dropdowns():
     return gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True)
 
 def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
-    print("evt.row_value[0]:", evt.row_value[0])
 
     row_value_page = evt.row_value[0] # This is the page number value
 
     if isinstance(row_value_page, list):
         row_value_page = row_value_page[0]
 
-    print("row_value_page:", row_value_page)
     return row_value_page
 
 def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):