Commit
·
6319afc
1
Parent(s):
66e145d
Added more config options. Fixed some bugs with removing elements from the review page and with the Adobe export. Some UI rearrangements.
Browse files- app.py +39 -56
- requirements.txt +1 -1
- tools/auth.py +4 -24
- tools/aws_functions.py +5 -32
- tools/aws_textract.py +5 -12
- tools/cli_redact.py +3 -2
- tools/config.py +120 -0
- tools/custom_csvlogger.py +0 -2
- tools/custom_image_analyser_engine.py +4 -4
- tools/data_anonymise.py +2 -3
- tools/file_conversion.py +87 -45
- tools/file_redaction.py +14 -21
- tools/find_duplicate_pages.py +2 -2
- tools/helper_functions.py +1 -47
- tools/presidio_analyzer_custom.py +2 -2
- tools/redaction_review.py +132 -57
app.py
CHANGED
@@ -10,10 +10,11 @@ from datetime import datetime
|
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
12 |
|
13 |
-
from tools.
|
|
|
14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
15 |
from tools.file_redaction import choose_and_run_redactor
|
16 |
-
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
17 |
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal
|
18 |
from tools.data_anonymise import anonymise_data_files
|
19 |
from tools.auth import authenticate_user
|
@@ -142,9 +143,6 @@ with app:
|
|
142 |
prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
|
143 |
|
144 |
## Settings page variables
|
145 |
-
default_allow_list_file_name = "default_allow_list.csv"
|
146 |
-
default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
|
147 |
-
|
148 |
default_deny_list_file_name = "default_deny_list.csv"
|
149 |
default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
|
150 |
in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
|
@@ -155,7 +153,11 @@ with app:
|
|
155 |
|
156 |
# S3 settings for default allow list load
|
157 |
s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
|
158 |
-
|
|
|
|
|
|
|
|
|
159 |
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
|
160 |
|
161 |
# Base dataframe for recognisers that is not modified subsequent to load
|
@@ -185,7 +187,7 @@ with app:
|
|
185 |
###
|
186 |
with gr.Tab("Redact PDFs/images"):
|
187 |
with gr.Accordion("Redact document", open = True):
|
188 |
-
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
189 |
# if RUN_AWS_FUNCTIONS == "1":
|
190 |
in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
|
191 |
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
|
@@ -217,18 +219,16 @@ with app:
|
|
217 |
###
|
218 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
219 |
|
220 |
-
with gr.Accordion(label = "Review
|
221 |
-
output_review_files = gr.File(label="
|
222 |
-
upload_previous_review_file_btn = gr.Button("Review
|
223 |
-
with gr.Row():
|
224 |
-
annotation_button_apply = gr.Button("Apply revised redactions to pdf", variant="secondary")
|
225 |
with gr.Row():
|
226 |
annotate_zoom_in = gr.Button("Zoom in", visible=False)
|
227 |
annotate_zoom_out = gr.Button("Zoom out", visible=False)
|
228 |
with gr.Row():
|
229 |
clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
|
230 |
|
231 |
-
with gr.Row():
|
232 |
with gr.Column(scale=2):
|
233 |
with gr.Row(equal_height=True):
|
234 |
annotation_last_page_button = gr.Button("Previous page", scale = 4)
|
@@ -236,7 +236,8 @@ with app:
|
|
236 |
annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
|
237 |
annotation_next_page_button = gr.Button("Next page", scale = 4)
|
238 |
with gr.Column(scale=1):
|
239 |
-
|
|
|
240 |
|
241 |
with gr.Row():
|
242 |
with gr.Column(scale=2):
|
@@ -261,12 +262,12 @@ with app:
|
|
261 |
interactive=False
|
262 |
)
|
263 |
with gr.Column(scale=1):
|
264 |
-
with gr.Row():
|
265 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
266 |
page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
|
267 |
text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
|
268 |
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"])
|
269 |
-
with gr.Row():
|
270 |
reset_dropdowns_btn = gr.Button(value="Reset filters")
|
271 |
exclude_selected_btn = gr.Button(value="Exclude items in table from redactions")
|
272 |
undo_last_removal_btn = gr.Button(value="Undo last element removal")
|
@@ -393,21 +394,22 @@ with app:
|
|
393 |
###
|
394 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
395 |
|
|
|
396 |
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
|
397 |
-
success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
|
398 |
-
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes], api_name="redact_doc").\
|
399 |
success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
|
400 |
|
401 |
# If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
|
406 |
# If a file has been completed, the function will continue onto the next document
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
|
412 |
###
|
413 |
# REVIEW PDF REDACTIONS
|
@@ -479,8 +481,9 @@ with app:
|
|
479 |
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
|
480 |
|
481 |
exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
|
482 |
-
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
|
483 |
-
|
|
|
484 |
|
485 |
undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base]).\
|
486 |
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
|
@@ -488,7 +491,7 @@ with app:
|
|
488 |
# Convert review file to xfdf Adobe format
|
489 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
490 |
success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes]).\
|
491 |
-
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes], outputs=[adobe_review_files_out])
|
492 |
|
493 |
# Convert xfdf Adobe file back to review_file.csv
|
494 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
@@ -533,14 +536,14 @@ with app:
|
|
533 |
# Get connection details on app load
|
534 |
app.load(get_connection_params, inputs=[output_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox])
|
535 |
|
536 |
-
# If
|
537 |
-
|
538 |
-
|
539 |
-
|
540 |
-
|
541 |
-
|
542 |
-
|
543 |
-
|
544 |
|
545 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
546 |
access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
@@ -566,27 +569,7 @@ with app:
|
|
566 |
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
|
567 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
568 |
|
569 |
-
# Get some environment variables and Launch the Gradio app
|
570 |
-
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
571 |
-
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
|
572 |
-
1
|
573 |
-
RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
|
574 |
-
print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')
|
575 |
-
|
576 |
-
MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
|
577 |
-
print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
|
578 |
-
|
579 |
-
MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
|
580 |
-
print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
|
581 |
-
|
582 |
-
GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
|
583 |
-
print(f'The value of GRADIO_SERVER_PORT is {GRADIO_SERVER_PORT}')
|
584 |
-
|
585 |
-
ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
|
586 |
-
print(f'The value of ROOT_PATH is {ROOT_PATH}')
|
587 |
|
588 |
-
DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '5')
|
589 |
-
print(f'The value of DEFAULT_CONCURRENCY_LIMIT is {DEFAULT_CONCURRENCY_LIMIT}')
|
590 |
|
591 |
if __name__ == "__main__":
|
592 |
|
|
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
12 |
|
13 |
+
from tools.config import output_folder, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, DEFAULT_ALLOW_LIST_PATH
|
14 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files
|
15 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
16 |
from tools.file_redaction import choose_and_run_redactor
|
17 |
+
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
18 |
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal
|
19 |
from tools.data_anonymise import anonymise_data_files
|
20 |
from tools.auth import authenticate_user
|
|
|
143 |
prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
|
144 |
|
145 |
## Settings page variables
|
|
|
|
|
|
|
146 |
default_deny_list_file_name = "default_deny_list.csv"
|
147 |
default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
|
148 |
in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
|
|
|
153 |
|
154 |
# S3 settings for default allow list load
|
155 |
s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
|
156 |
+
|
157 |
+
default_allow_list_file_name = "default_allow_list.csv"
|
158 |
+
default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
|
159 |
+
|
160 |
+
s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=DEFAULT_ALLOW_LIST_PATH, visible=False)
|
161 |
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
|
162 |
|
163 |
# Base dataframe for recognisers that is not modified subsequent to load
|
|
|
187 |
###
|
188 |
with gr.Tab("Redact PDFs/images"):
|
189 |
with gr.Accordion("Redact document", open = True):
|
190 |
+
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
|
191 |
# if RUN_AWS_FUNCTIONS == "1":
|
192 |
in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
|
193 |
pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
|
|
|
219 |
###
|
220 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
221 |
|
222 |
+
with gr.Accordion(label = "Review PDF redactions", open=True):
|
223 |
+
output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions", file_count='multiple', height=file_input_height)
|
224 |
+
upload_previous_review_file_btn = gr.Button("Review PDF and 'review file' csv provided above", variant="primary")
|
|
|
|
|
225 |
with gr.Row():
|
226 |
annotate_zoom_in = gr.Button("Zoom in", visible=False)
|
227 |
annotate_zoom_out = gr.Button("Zoom out", visible=False)
|
228 |
with gr.Row():
|
229 |
clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
|
230 |
|
231 |
+
with gr.Row(equal_height=True):
|
232 |
with gr.Column(scale=2):
|
233 |
with gr.Row(equal_height=True):
|
234 |
annotation_last_page_button = gr.Button("Previous page", scale = 4)
|
|
|
236 |
annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
|
237 |
annotation_next_page_button = gr.Button("Next page", scale = 4)
|
238 |
with gr.Column(scale=1):
|
239 |
+
annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="secondary")
|
240 |
+
#blank_markdown_top = gr.Markdown(value="", label="")
|
241 |
|
242 |
with gr.Row():
|
243 |
with gr.Column(scale=2):
|
|
|
262 |
interactive=False
|
263 |
)
|
264 |
with gr.Column(scale=1):
|
265 |
+
with gr.Row(equal_height=True):
|
266 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
267 |
page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
|
268 |
text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
|
269 |
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"])
|
270 |
+
with gr.Row(equal_height=True):
|
271 |
reset_dropdowns_btn = gr.Button(value="Reset filters")
|
272 |
exclude_selected_btn = gr.Button(value="Exclude items in table from redactions")
|
273 |
undo_last_removal_btn = gr.Button(value="Undo last element removal")
|
|
|
394 |
###
|
395 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
396 |
|
397 |
+
# Run redaction function
|
398 |
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
|
399 |
+
success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes],
|
400 |
+
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, document_cropboxes], api_name="redact_doc").\
|
401 |
success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
|
402 |
|
403 |
# If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
|
404 |
+
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes],
|
405 |
+
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, document_cropboxes]).\
|
406 |
+
success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
|
407 |
|
408 |
# If a file has been completed, the function will continue onto the next document
|
409 |
+
latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes],
|
410 |
+
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, document_cropboxes]).\
|
411 |
+
success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
|
412 |
+
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
|
413 |
|
414 |
###
|
415 |
# REVIEW PDF REDACTIONS
|
|
|
481 |
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
|
482 |
|
483 |
exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
|
484 |
+
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
|
485 |
+
success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
486 |
+
# success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
487 |
|
488 |
undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base]).\
|
489 |
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
|
|
|
491 |
# Convert review file to xfdf Adobe format
|
492 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
493 |
success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes]).\
|
494 |
+
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
|
495 |
|
496 |
# Convert xfdf Adobe file back to review_file.csv
|
497 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
|
|
536 |
# Get connection details on app load
|
537 |
app.load(get_connection_params, inputs=[output_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox])
|
538 |
|
539 |
+
# If relevant environment variable is set, load in the default allow list file from S3
|
540 |
+
if GET_DEFAULT_ALLOW_LIST == "True" and DEFAULT_ALLOW_LIST_PATH:
|
541 |
+
print("Loading allow list from default_allow_list_output_folder_location:", default_allow_list_loc)
|
542 |
+
if not os.path.exists(default_allow_list_loc):
|
543 |
+
app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
|
544 |
+
success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
|
545 |
+
else:
|
546 |
+
app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
|
547 |
|
548 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
549 |
access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
|
|
569 |
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
|
570 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
571 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
572 |
|
|
|
|
|
573 |
|
574 |
if __name__ == "__main__":
|
575 |
|
requirements.txt
CHANGED
@@ -13,7 +13,7 @@ spacy==3.8.4
|
|
13 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
14 |
#en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
15 |
gradio==5.22.0
|
16 |
-
boto3==1.
|
17 |
pyarrow==19.0.1
|
18 |
openpyxl==3.1.5
|
19 |
Faker==36.1.1
|
|
|
13 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
14 |
#en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
15 |
gradio==5.22.0
|
16 |
+
boto3==1.37.17
|
17 |
pyarrow==19.0.1
|
18 |
openpyxl==3.1.5
|
19 |
Faker==36.1.1
|
tools/auth.py
CHANGED
@@ -1,32 +1,12 @@
|
|
1 |
-
|
2 |
-
import os
|
3 |
import boto3
|
4 |
-
import gradio as gr
|
5 |
import hmac
|
6 |
import hashlib
|
7 |
import base64
|
|
|
8 |
|
9 |
-
def
|
10 |
-
# Get the environment variable if it exists
|
11 |
-
value = os.environ.get(var_name)
|
12 |
-
|
13 |
-
# If it doesn't exist, set it to the default value
|
14 |
-
if value is None:
|
15 |
-
os.environ[var_name] = default_value
|
16 |
-
value = default_value
|
17 |
-
|
18 |
-
return value
|
19 |
-
|
20 |
-
client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
|
21 |
-
#print(f'The value of AWS_CLIENT_ID is {client_id}')
|
22 |
-
|
23 |
-
client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
|
24 |
-
#print(f'The value of AWS_CLIENT_SECRET is {client_secret}')
|
25 |
-
|
26 |
-
user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
|
27 |
-
#print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
|
28 |
-
|
29 |
-
def calculate_secret_hash(client_id, client_secret, username):
|
30 |
message = username + client_id
|
31 |
dig = hmac.new(
|
32 |
str(client_secret).encode('utf-8'),
|
|
|
1 |
+
#import os
|
|
|
2 |
import boto3
|
3 |
+
#import gradio as gr
|
4 |
import hmac
|
5 |
import hashlib
|
6 |
import base64
|
7 |
+
from tools.config import client_id, client_secret, user_pool_id
|
8 |
|
9 |
+
def calculate_secret_hash(client_id:str, client_secret:str, username:str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
message = username + client_id
|
11 |
dig = hmac.new(
|
12 |
str(client_secret).encode('utf-8'),
|
tools/aws_functions.py
CHANGED
@@ -3,37 +3,13 @@ import pandas as pd
|
|
3 |
import boto3
|
4 |
import tempfile
|
5 |
import os
|
6 |
-
from tools.
|
7 |
-
|
8 |
|
9 |
PandasDataFrame = Type[pd.DataFrame]
|
10 |
|
11 |
# Get AWS credentials
|
12 |
-
bucket_name=
|
13 |
-
|
14 |
-
RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
|
15 |
-
print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
|
16 |
-
|
17 |
-
AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
|
18 |
-
print(f'The value of AWS_REGION is {AWS_REGION}')
|
19 |
-
|
20 |
-
# If you have an aws_config env file in the config folder, you can load in AWS keys this way
|
21 |
-
AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '/env/aws_config.env')
|
22 |
-
print(f'The value of AWS_CONFIG_PATH is {AWS_CONFIG_PATH}')
|
23 |
-
|
24 |
-
if os.path.exists(AWS_CONFIG_PATH):
|
25 |
-
print("Loading AWS keys from config folder")
|
26 |
-
load_dotenv(AWS_CONFIG_PATH)
|
27 |
-
|
28 |
-
AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
|
29 |
-
if AWS_ACCESS_KEY:
|
30 |
-
print(f'AWS_ACCESS_KEY found in environment variables')
|
31 |
-
|
32 |
-
AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
|
33 |
-
if AWS_SECRET_KEY:
|
34 |
-
print(f'AWS_SECRET_KEY found in environment variables')
|
35 |
-
|
36 |
-
|
37 |
|
38 |
def get_assumed_role_info():
|
39 |
sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
|
@@ -49,14 +25,11 @@ def get_assumed_role_info():
|
|
49 |
return assumed_role_arn, assumed_role_name
|
50 |
|
51 |
if RUN_AWS_FUNCTIONS == "1":
|
52 |
-
try:
|
53 |
-
bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
|
54 |
session = boto3.Session()
|
55 |
-
|
56 |
-
#print("session:", session)
|
57 |
|
58 |
except Exception as e:
|
59 |
-
print("Could not start boto3 session:", e)
|
60 |
|
61 |
try:
|
62 |
assumed_role_arn, assumed_role_name = get_assumed_role_info()
|
|
|
3 |
import boto3
|
4 |
import tempfile
|
5 |
import os
|
6 |
+
from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET
|
7 |
+
|
8 |
|
9 |
PandasDataFrame = Type[pd.DataFrame]
|
10 |
|
11 |
# Get AWS credentials
|
12 |
+
bucket_name = DOCUMENT_REDACTION_BUCKET
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
def get_assumed_role_info():
|
15 |
sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
|
|
|
25 |
return assumed_role_arn, assumed_role_name
|
26 |
|
27 |
if RUN_AWS_FUNCTIONS == "1":
|
28 |
+
try:
|
|
|
29 |
session = boto3.Session()
|
|
|
|
|
30 |
|
31 |
except Exception as e:
|
32 |
+
print("Could not start boto3 session:", e)
|
33 |
|
34 |
try:
|
35 |
assumed_role_arn, assumed_role_name = get_assumed_role_info()
|
tools/aws_textract.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import boto3
|
2 |
-
#from PIL import Image
|
3 |
from typing import List
|
4 |
import io
|
5 |
import os
|
@@ -7,12 +6,10 @@ import json
|
|
7 |
from collections import defaultdict
|
8 |
import pikepdf
|
9 |
import time
|
10 |
-
# Example: converting this single page to an image
|
11 |
-
#from pdf2image import convert_from_bytes
|
12 |
from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
|
13 |
-
from tools.
|
14 |
|
15 |
-
def extract_textract_metadata(response):
|
16 |
"""Extracts metadata from an AWS Textract response."""
|
17 |
|
18 |
#print("Document metadata:", response['DocumentMetadata'])
|
@@ -83,8 +80,7 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
|
|
83 |
# Return a list containing the wrapped response and the metadata
|
84 |
return wrapped_response, request_metadata # Return as a list to match the desired structure
|
85 |
|
86 |
-
|
87 |
-
def convert_pike_pdf_page_to_bytes(pdf, page_num):
|
88 |
# Create a new empty PDF
|
89 |
new_pdf = pikepdf.Pdf.new()
|
90 |
|
@@ -109,8 +105,7 @@ def convert_pike_pdf_page_to_bytes(pdf, page_num):
|
|
109 |
|
110 |
return pdf_bytes
|
111 |
|
112 |
-
|
113 |
-
def json_to_ocrresult(json_data, page_width, page_height, page_no):
|
114 |
'''
|
115 |
Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
|
116 |
'''
|
@@ -274,7 +269,7 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
|
|
274 |
|
275 |
return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
|
276 |
|
277 |
-
def load_and_convert_textract_json(textract_json_file_path, log_files_output_paths):
|
278 |
"""
|
279 |
Loads Textract JSON from a file, detects if conversion is needed,
|
280 |
and converts if necessary.
|
@@ -317,8 +312,6 @@ def load_and_convert_textract_json(textract_json_file_path, log_files_output_pat
|
|
317 |
print("textract data:", textract_data)
|
318 |
return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
|
319 |
|
320 |
-
|
321 |
-
|
322 |
# Load Textract JSON output (assuming it's stored in a variable called `textract_output`)
|
323 |
def restructure_textract_output(textract_output:object):
|
324 |
'''
|
|
|
1 |
import boto3
|
|
|
2 |
from typing import List
|
3 |
import io
|
4 |
import os
|
|
|
6 |
from collections import defaultdict
|
7 |
import pikepdf
|
8 |
import time
|
|
|
|
|
9 |
from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
|
10 |
+
from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY
|
11 |
|
12 |
+
def extract_textract_metadata(response:object):
|
13 |
"""Extracts metadata from an AWS Textract response."""
|
14 |
|
15 |
#print("Document metadata:", response['DocumentMetadata'])
|
|
|
80 |
# Return a list containing the wrapped response and the metadata
|
81 |
return wrapped_response, request_metadata # Return as a list to match the desired structure
|
82 |
|
83 |
+
def convert_pike_pdf_page_to_bytes(pdf:object, page_num:int):
|
|
|
84 |
# Create a new empty PDF
|
85 |
new_pdf = pikepdf.Pdf.new()
|
86 |
|
|
|
105 |
|
106 |
return pdf_bytes
|
107 |
|
108 |
+
def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
|
|
|
109 |
'''
|
110 |
Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
|
111 |
'''
|
|
|
269 |
|
270 |
return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
|
271 |
|
272 |
+
def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str):
|
273 |
"""
|
274 |
Loads Textract JSON from a file, detects if conversion is needed,
|
275 |
and converts if necessary.
|
|
|
312 |
print("textract data:", textract_data)
|
313 |
return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
|
314 |
|
|
|
|
|
315 |
# Load Textract JSON output (assuming it's stored in a variable called `textract_output`)
|
316 |
def restructure_textract_output(textract_output:object):
|
317 |
'''
|
tools/cli_redact.py
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
import argparse
|
2 |
import os
|
3 |
-
from tools.
|
|
|
4 |
from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
|
5 |
from tools.file_redaction import choose_and_run_redactor
|
6 |
import pandas as pd
|
7 |
from datetime import datetime
|
8 |
|
9 |
-
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV',
|
10 |
'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
|
11 |
'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
|
12 |
'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
|
|
|
1 |
import argparse
|
2 |
import os
|
3 |
+
from tools.config import get_or_create_env_var
|
4 |
+
from tools.helper_functions import ensure_output_folder_exists,tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
5 |
from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
|
6 |
from tools.file_redaction import choose_and_run_redactor
|
7 |
import pandas as pd
|
8 |
from datetime import datetime
|
9 |
|
10 |
+
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
|
11 |
'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
|
12 |
'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
|
13 |
'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
|
tools/config.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from dotenv import load_dotenv
|
3 |
+
|
4 |
+
# Set or retrieve configuration variables for the redaction app
|
5 |
+
|
6 |
+
def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False) -> str:
    '''
    Get an environment variable, and set it to a default value if it doesn't exist.

    Args:
        var_name (str): Name of the environment variable to read.
        default_value (str): Value stored in os.environ (and returned) when the variable is not already set. Must be a string, as os.environ only accepts strings.
        print_val (optional, bool): If true, print the resolved value. Leave this off for secrets, as the value is printed in full.

    Returns:
        str: The existing value of the variable, or default_value if it was not set.
    '''
    # setdefault leaves an existing value untouched; otherwise it stores the default and returns it
    value = os.environ.setdefault(var_name, default_value)

    if print_val:
        print(f'The value of {var_name} is {value}')

    return value
|
22 |
+
|
23 |
+
|
24 |
+
# If you have an app_config env file in the config folder, you can load in app variables this way, e.g. '/env/app_config.env'
|
25 |
+
APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', '', print_val=True)
|
26 |
+
|
27 |
+
|
28 |
+
if os.path.exists(APP_CONFIG_PATH):
|
29 |
+
print(f"Loading APP variables from config file {APP_CONFIG_PATH}")
|
30 |
+
load_dotenv(APP_CONFIG_PATH)
|
31 |
+
|
32 |
+
###
|
33 |
+
# AWS CONFIG
|
34 |
+
###
|
35 |
+
|
36 |
+
# If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. '/env/aws_config.env'
|
37 |
+
AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '', print_val=True)
|
38 |
+
|
39 |
+
if os.path.exists(AWS_CONFIG_PATH):
|
40 |
+
print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
|
41 |
+
load_dotenv(AWS_CONFIG_PATH)
|
42 |
+
|
43 |
+
RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
|
44 |
+
|
45 |
+
AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
|
46 |
+
|
47 |
+
client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
|
48 |
+
|
49 |
+
client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
|
50 |
+
|
51 |
+
user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
|
52 |
+
|
53 |
+
AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
|
54 |
+
if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
|
55 |
+
|
56 |
+
AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
|
57 |
+
if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
|
58 |
+
|
59 |
+
DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
|
60 |
+
|
61 |
+
# Custom headers e.g. if routing traffic through Cloudfront
|
62 |
+
# Retrieving or setting CUSTOM_HEADER
|
63 |
+
CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
|
64 |
+
if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')
|
65 |
+
|
66 |
+
# Retrieving or setting CUSTOM_HEADER_VALUE
|
67 |
+
CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
|
68 |
+
if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')
|
69 |
+
|
70 |
+
###
|
71 |
+
# Images config
|
72 |
+
###
|
73 |
+
IMAGES_DPI = get_or_create_env_var('IMAGES_DPI', '300.0')
|
74 |
+
LOAD_TRUNCATED_IMAGES = get_or_create_env_var('LOAD_TRUNCATED_IMAGES', 'True')
|
75 |
+
MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to None if blank in file_conversion.py
|
76 |
+
|
77 |
+
###
|
78 |
+
# File I/O config
|
79 |
+
###
|
80 |
+
|
81 |
+
output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
|
82 |
+
print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
|
83 |
+
|
84 |
+
session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
|
85 |
+
print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
|
86 |
+
|
87 |
+
input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
|
88 |
+
print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
|
89 |
+
|
90 |
+
###
|
91 |
+
# REDACTION CONFIG
|
92 |
+
###
|
93 |
+
# Number of pages to loop through before breaking the function and restarting from the last finished page.
|
94 |
+
page_break_value = get_or_create_env_var('page_break_value', '50000')
|
95 |
+
|
96 |
+
max_time_value = get_or_create_env_var('max_time_value', '999999')
|
97 |
+
|
98 |
+
CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
|
99 |
+
|
100 |
+
###
|
101 |
+
# APP RUN CONFIG
|
102 |
+
###
|
103 |
+
# Get some environment variables and Launch the Gradio app
|
104 |
+
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
105 |
+
|
106 |
+
RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
|
107 |
+
|
108 |
+
MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
|
109 |
+
|
110 |
+
MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
|
111 |
+
|
112 |
+
GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
|
113 |
+
|
114 |
+
ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
|
115 |
+
|
116 |
+
DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '5')
|
117 |
+
|
118 |
+
GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', 'False')
|
119 |
+
|
120 |
+
DEFAULT_ALLOW_LIST_PATH = get_or_create_env_var('DEFAULT_ALLOW_LIST_PATH', '')
|
tools/custom_csvlogger.py
CHANGED
@@ -8,9 +8,7 @@ from collections.abc import Sequence
|
|
8 |
from multiprocessing import Lock
|
9 |
from pathlib import Path
|
10 |
from typing import TYPE_CHECKING, Any
|
11 |
-
|
12 |
from gradio_client import utils as client_utils
|
13 |
-
|
14 |
import gradio as gr
|
15 |
from gradio import utils, wasm_utils
|
16 |
|
|
|
8 |
from multiprocessing import Lock
|
9 |
from pathlib import Path
|
10 |
from typing import TYPE_CHECKING, Any
|
|
|
11 |
from gradio_client import utils as client_utils
|
|
|
12 |
import gradio as gr
|
13 |
from gradio import utils, wasm_utils
|
14 |
|
tools/custom_image_analyser_engine.py
CHANGED
@@ -405,7 +405,7 @@ def bounding_boxes_overlap(box1:List, box2:List):
|
|
405 |
return (box1[0] < box2[2] and box2[0] < box1[2] and
|
406 |
box1[1] < box2[3] and box2[1] < box1[3])
|
407 |
|
408 |
-
def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results:List[Tuple]):
|
409 |
for entity in page_analyser_result:
|
410 |
entity_start = entity.start
|
411 |
entity_end = entity.end
|
@@ -443,7 +443,7 @@ def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_li
|
|
443 |
|
444 |
return all_text_line_results
|
445 |
|
446 |
-
def map_back_comprehend_entity_results(response, current_batch_mapping:List[Tuple], allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
|
447 |
if not response or "Entities" not in response:
|
448 |
return all_text_line_results
|
449 |
|
@@ -686,7 +686,7 @@ def run_page_text_redaction(
|
|
686 |
|
687 |
return page_analysed_bounding_boxes
|
688 |
|
689 |
-
def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
|
690 |
'''
|
691 |
Merge identified bounding boxes containing PII that are very close to one another
|
692 |
'''
|
@@ -776,7 +776,7 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin
|
|
776 |
return analysed_bounding_boxes
|
777 |
|
778 |
# Function to combine OCR results into line-level results
|
779 |
-
def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
|
780 |
# Group OCR results into lines based on y_threshold
|
781 |
lines = []
|
782 |
current_line = []
|
|
|
405 |
return (box1[0] < box2[2] and box2[0] < box1[2] and
|
406 |
box1[1] < box2[3] and box2[1] < box1[3])
|
407 |
|
408 |
+
def map_back_entity_results(page_analyser_result:dict, page_text_mapping:dict, all_text_line_results:List[Tuple]):
|
409 |
for entity in page_analyser_result:
|
410 |
entity_start = entity.start
|
411 |
entity_end = entity.end
|
|
|
443 |
|
444 |
return all_text_line_results
|
445 |
|
446 |
+
def map_back_comprehend_entity_results(response:object, current_batch_mapping:List[Tuple], allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
|
447 |
if not response or "Entities" not in response:
|
448 |
return all_text_line_results
|
449 |
|
|
|
686 |
|
687 |
return page_analysed_bounding_boxes
|
688 |
|
689 |
+
def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
|
690 |
'''
|
691 |
Merge identified bounding boxes containing PII that are very close to one another
|
692 |
'''
|
|
|
776 |
return analysed_bounding_boxes
|
777 |
|
778 |
# Function to combine OCR results into line-level results
|
779 |
+
def combine_ocr_results(ocr_results:dict, x_threshold:float=50.0, y_threshold:float=12.0):
|
780 |
# Group OCR results into lines based on y_threshold
|
781 |
lines = []
|
782 |
current_line = []
|
tools/data_anonymise.py
CHANGED
@@ -13,12 +13,11 @@ from typing import List, Dict, Any
|
|
13 |
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
|
14 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
15 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
16 |
-
from tools.aws_functions import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY
|
17 |
|
18 |
-
from tools.
|
|
|
19 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
|
20 |
from tools.custom_image_analyser_engine import do_aws_comprehend_call
|
21 |
-
|
22 |
# Use custom version of analyze_dict to be able to track progress
|
23 |
from tools.presidio_analyzer_custom import analyze_dict
|
24 |
|
|
|
13 |
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
|
14 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
15 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
|
|
16 |
|
17 |
+
from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, output_folder
|
18 |
+
from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
|
19 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
|
20 |
from tools.custom_image_analyser_engine import do_aws_comprehend_call
|
|
|
21 |
# Use custom version of analyze_dict to be able to track progress
|
22 |
from tools.presidio_analyzer_custom import analyze_dict
|
23 |
|
tools/file_conversion.py
CHANGED
@@ -1,13 +1,14 @@
|
|
1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
-
|
3 |
from PIL import Image, ImageFile
|
4 |
import os
|
5 |
import re
|
6 |
import time
|
7 |
import json
|
8 |
import pymupdf
|
|
|
9 |
import pandas as pd
|
10 |
-
import numpy as np
|
11 |
import shutil
|
12 |
from pymupdf import Rect
|
13 |
from fitz import Page
|
@@ -19,9 +20,13 @@ from pdf2image import convert_from_path
|
|
19 |
from PIL import Image
|
20 |
from scipy.spatial import cKDTree
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
|
|
|
|
|
|
|
|
|
25 |
|
26 |
def is_pdf_or_image(filename):
|
27 |
"""
|
@@ -54,8 +59,7 @@ def is_pdf(filename):
|
|
54 |
# %%
|
55 |
## Convert pdf to image if necessary
|
56 |
|
57 |
-
|
58 |
-
print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
|
59 |
|
60 |
def check_image_size_and_reduce(out_path:str, image:Image):
|
61 |
'''
|
@@ -360,6 +364,27 @@ def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colou
|
|
360 |
|
361 |
return whole_page_img_annotation_box
|
362 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
363 |
def prepare_image_or_pdf(
|
364 |
file_paths: List[str],
|
365 |
in_redact_method: str,
|
@@ -371,6 +396,7 @@ def prepare_image_or_pdf(
|
|
371 |
prepare_for_review:bool = False,
|
372 |
in_fully_redacted_list:List[int]=[],
|
373 |
output_folder:str=output_folder,
|
|
|
374 |
progress: Progress = Progress(track_tqdm=True)
|
375 |
) -> tuple[List[str], List[str]]:
|
376 |
"""
|
@@ -390,6 +416,7 @@ def prepare_image_or_pdf(
|
|
390 |
prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
|
391 |
in_fully_redacted_list(optional, List of int): A list of pages to fully redact
|
392 |
output_folder (optional, str): The output folder for file save
|
|
|
393 |
progress (optional, Progress): Progress tracker for the operation
|
394 |
|
395 |
|
@@ -400,6 +427,10 @@ def prepare_image_or_pdf(
|
|
400 |
tic = time.perf_counter()
|
401 |
json_from_csv = False
|
402 |
original_cropboxes = [] # Store original CropBox values
|
|
|
|
|
|
|
|
|
403 |
|
404 |
if isinstance(in_fully_redacted_list, pd.DataFrame):
|
405 |
if not in_fully_redacted_list.empty:
|
@@ -426,11 +457,6 @@ def prepare_image_or_pdf(
|
|
426 |
if isinstance(out_message, str):
|
427 |
out_message = [out_message]
|
428 |
|
429 |
-
converted_file_paths = []
|
430 |
-
image_file_paths = []
|
431 |
-
pymupdf_doc = []
|
432 |
-
review_file_csv = pd.DataFrame()
|
433 |
-
|
434 |
if not file_paths:
|
435 |
file_paths = []
|
436 |
|
@@ -496,23 +522,35 @@ def prepare_image_or_pdf(
|
|
496 |
# If a pdf, load as a pymupdf document
|
497 |
if is_pdf(file_path):
|
498 |
pymupdf_doc = pymupdf.open(file_path)
|
|
|
499 |
|
500 |
# Load cropbox dimensions to use later
|
501 |
|
502 |
converted_file_path = file_path
|
503 |
-
image_file_paths, image_sizes_width, image_sizes_height = process_file(file_path, prepare_for_review)
|
504 |
-
page_sizes = []
|
505 |
|
506 |
-
|
507 |
-
|
508 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
509 |
|
510 |
-
|
511 |
-
|
|
|
|
|
|
|
|
|
512 |
|
513 |
-
|
514 |
-
out_page_image_sizes = {"page":reported_page_no, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
|
515 |
-
page_sizes.append(out_page_image_sizes)
|
516 |
|
517 |
#Create base version of the annotation object that doesn't have any annotations in it
|
518 |
if (not all_annotations_object) & (prepare_for_review == True):
|
@@ -521,6 +559,7 @@ def prepare_image_or_pdf(
|
|
521 |
for image_path in image_file_paths:
|
522 |
annotation = {}
|
523 |
annotation["image"] = image_path
|
|
|
524 |
|
525 |
all_annotations_object.append(annotation)
|
526 |
|
@@ -546,7 +585,7 @@ def prepare_image_or_pdf(
|
|
546 |
|
547 |
#print("image_file_paths:", image_file_paths)
|
548 |
# Create a page_sizes_object
|
549 |
-
out_page_image_sizes = {"page":1, "image_width":image_sizes_width[
|
550 |
page_sizes.append(out_page_image_sizes)
|
551 |
|
552 |
converted_file_path = output_folder + file_name_with_ext
|
@@ -557,7 +596,7 @@ def prepare_image_or_pdf(
|
|
557 |
|
558 |
elif file_extension in ['.csv']:
|
559 |
review_file_csv = read_file(file)
|
560 |
-
all_annotations_object =
|
561 |
json_from_csv = True
|
562 |
print("Converted CSV review file to json")
|
563 |
|
@@ -708,7 +747,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
|
|
708 |
|
709 |
return out_message, out_file_paths
|
710 |
|
711 |
-
def join_values_within_threshold(df1, df2):
|
712 |
# Threshold for matching
|
713 |
threshold = 5
|
714 |
|
@@ -739,7 +778,7 @@ def join_values_within_threshold(df1, df2):
|
|
739 |
print(final_df)
|
740 |
|
741 |
|
742 |
-
def
|
743 |
'''
|
744 |
Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
|
745 |
'''
|
@@ -887,7 +926,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
887 |
# review_file_df[col] = np.floor(review_file_df[col])
|
888 |
|
889 |
# If colours are saved as list, convert to tuple
|
890 |
-
review_file_df["color"] = review_file_df["color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
|
891 |
|
892 |
# print("page_sizes:", page_sizes)
|
893 |
|
@@ -910,32 +949,35 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
910 |
|
911 |
review_file_df = review_file_df.sort_values(['page', 'ymin', 'xmin', 'label'])
|
912 |
|
913 |
-
review_file_df.to_csv(output_folder + "review_file_test.csv", index=None)
|
914 |
|
915 |
return review_file_df
|
916 |
|
917 |
-
def
|
918 |
'''
|
919 |
Convert a review csv to a json file for use by the Gradio Annotation object.
|
920 |
'''
|
921 |
|
922 |
-
|
923 |
-
|
924 |
page_sizes_df = pd.DataFrame(page_sizes)
|
925 |
|
926 |
-
#
|
|
|
|
|
927 |
|
928 |
-
|
929 |
-
|
930 |
|
931 |
-
|
932 |
-
|
933 |
-
|
934 |
-
|
935 |
-
review_file_df["xmin"]
|
936 |
-
|
937 |
-
|
938 |
-
|
|
|
939 |
|
940 |
# Keep only necessary columns
|
941 |
review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
|
@@ -949,9 +991,8 @@ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths:
|
|
949 |
# Create a list to hold the JSON data
|
950 |
json_data = []
|
951 |
|
952 |
-
for
|
953 |
-
reported_page_number = int(
|
954 |
-
|
955 |
|
956 |
if reported_page_number in review_file_df["page"].values:
|
957 |
|
@@ -969,6 +1010,7 @@ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths:
|
|
969 |
else:
|
970 |
annotation = {}
|
971 |
annotation["image"] = pdf_image_path
|
|
|
972 |
|
973 |
# Append the structured data to the json_data list
|
974 |
json_data.append(annotation)
|
|
|
1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
+
|
3 |
from PIL import Image, ImageFile
|
4 |
import os
|
5 |
import re
|
6 |
import time
|
7 |
import json
|
8 |
import pymupdf
|
9 |
+
from pymupdf import Document
|
10 |
import pandas as pd
|
11 |
+
#import numpy as np
|
12 |
import shutil
|
13 |
from pymupdf import Rect
|
14 |
from fitz import Page
|
|
|
20 |
from PIL import Image
|
21 |
from scipy.spatial import cKDTree
|
22 |
|
23 |
+
from tools.config import output_folder, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR
|
24 |
+
from tools.helper_functions import get_file_name_without_type, tesseract_ocr_option, text_ocr_option, textract_option, read_file
|
25 |
+
|
26 |
+
# Resolution (dots per inch) taken from the IMAGES_DPI config value, which is a string
image_dpi = float(IMAGES_DPI)

# MAX_IMAGE_PIXELS is read from the environment as a string. Pillow uses this value
# numerically in its image size check, so it must be converted to an int.
# A blank value means no limit (None).
Image.MAX_IMAGE_PIXELS = int(MAX_IMAGE_PIXELS) if MAX_IMAGE_PIXELS else None

# Config value is the string 'True'/'False'; compare case-insensitively to get a bool
ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
|
30 |
|
31 |
def is_pdf_or_image(filename):
|
32 |
"""
|
|
|
59 |
# %%
|
60 |
## Convert pdf to image if necessary
|
61 |
|
62 |
+
|
|
|
63 |
|
64 |
def check_image_size_and_reduce(out_path:str, image:Image):
|
65 |
'''
|
|
|
364 |
|
365 |
return whole_page_img_annotation_box
|
366 |
|
367 |
+
def create_page_size_objects(pymupdf_doc:"Document", image_sizes_width:"List[float]", image_sizes_height:"List[float]"):
    '''
    Build a page size record for every page of a document, and collect each page's cropbox.

    Args:
        pymupdf_doc (Document): The document to iterate over. Each page must expose 'mediabox' and 'cropbox' attributes that have 'width' and 'height'.
        image_sizes_width (List[float]): Image width per page, indexed by zero-based page number. May be empty if no images were created.
        image_sizes_height (List[float]): Image height per page, indexed by zero-based page number. May be empty if no images were created.

    Returns:
        tuple: (page_sizes, original_cropboxes), where page_sizes is a list of dicts with keys "page" (1-based), "image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height", and original_cropboxes is the list of each page's cropbox in page order.
    '''
    page_sizes = []
    original_cropboxes = []

    # If images have been created, image width and height come from the supplied lists.
    # Otherwise they are recorded as missing values.
    have_image_sizes = bool(image_sizes_width and image_sizes_height)

    for page_no, pymupdf_page in enumerate(pymupdf_doc):
        reported_page_no = page_no + 1

        original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox

        if have_image_sizes:
            image_width = image_sizes_width[page_no]
            image_height = image_sizes_height[page_no]
        else:
            # pd.NA is a singleton value, not a callable - calling pd.NA() raises a TypeError
            image_width = pd.NA
            image_height = pd.NA

        out_page_image_sizes = {
            "page":reported_page_no,
            "image_width":image_width,
            "image_height":image_height,
            "mediabox_width":pymupdf_page.mediabox.width,
            "mediabox_height": pymupdf_page.mediabox.height,
            "cropbox_width":pymupdf_page.cropbox.width,
            "cropbox_height":pymupdf_page.cropbox.height
        }

        page_sizes.append(out_page_image_sizes)

    return page_sizes, original_cropboxes
|
387 |
+
|
388 |
def prepare_image_or_pdf(
|
389 |
file_paths: List[str],
|
390 |
in_redact_method: str,
|
|
|
396 |
prepare_for_review:bool = False,
|
397 |
in_fully_redacted_list:List[int]=[],
|
398 |
output_folder:str=output_folder,
|
399 |
+
prepare_images:bool=True,
|
400 |
progress: Progress = Progress(track_tqdm=True)
|
401 |
) -> tuple[List[str], List[str]]:
|
402 |
"""
|
|
|
416 |
prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
|
417 |
in_fully_redacted_list(optional, List of int): A list of pages to fully redact
|
418 |
output_folder (optional, str): The output folder for file save
|
419 |
+
prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to true
|
420 |
progress (optional, Progress): Progress tracker for the operation
|
421 |
|
422 |
|
|
|
427 |
tic = time.perf_counter()
|
428 |
json_from_csv = False
|
429 |
original_cropboxes = [] # Store original CropBox values
|
430 |
+
converted_file_paths = []
|
431 |
+
image_file_paths = []
|
432 |
+
pymupdf_doc = []
|
433 |
+
review_file_csv = pd.DataFrame()
|
434 |
|
435 |
if isinstance(in_fully_redacted_list, pd.DataFrame):
|
436 |
if not in_fully_redacted_list.empty:
|
|
|
457 |
if isinstance(out_message, str):
|
458 |
out_message = [out_message]
|
459 |
|
|
|
|
|
|
|
|
|
|
|
460 |
if not file_paths:
|
461 |
file_paths = []
|
462 |
|
|
|
522 |
# If a pdf, load as a pymupdf document
|
523 |
if is_pdf(file_path):
|
524 |
pymupdf_doc = pymupdf.open(file_path)
|
525 |
+
pymupdf_pages = pymupdf_doc.page_count
|
526 |
|
527 |
# Load cropbox dimensions to use later
|
528 |
|
529 |
converted_file_path = file_path
|
|
|
|
|
530 |
|
531 |
+
if prepare_images==True:
|
532 |
+
image_file_paths, image_sizes_width, image_sizes_height = process_file(file_path, prepare_for_review)
|
533 |
+
else:
|
534 |
+
print("Skipping image preparation")
|
535 |
+
image_file_paths=[]
|
536 |
+
image_sizes_width=[]
|
537 |
+
image_sizes_height=[]
|
538 |
+
|
539 |
+
# Create page sizes object
|
540 |
+
# page_sizes = []
|
541 |
+
|
542 |
+
# for i, page in enumerate(pymupdf_doc):
|
543 |
+
# page_no = i
|
544 |
+
# reported_page_no = i + 1
|
545 |
|
546 |
+
# pymupdf_page = pymupdf_doc.load_page(page_no)
|
547 |
+
# original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
|
548 |
+
|
549 |
+
# # Create a page_sizes_object
|
550 |
+
# out_page_image_sizes = {"page":reported_page_no, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
|
551 |
+
# page_sizes.append(out_page_image_sizes)
|
552 |
|
553 |
+
page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height)
|
|
|
|
|
554 |
|
555 |
#Create base version of the annotation object that doesn't have any annotations in it
|
556 |
if (not all_annotations_object) & (prepare_for_review == True):
|
|
|
559 |
for image_path in image_file_paths:
|
560 |
annotation = {}
|
561 |
annotation["image"] = image_path
|
562 |
+
annotation["boxes"] = []
|
563 |
|
564 |
all_annotations_object.append(annotation)
|
565 |
|
|
|
585 |
|
586 |
#print("image_file_paths:", image_file_paths)
|
587 |
# Create a page_sizes_object
|
588 |
+
out_page_image_sizes = {"page":1, "image_width":image_sizes_width[0], "image_height":image_sizes_height[0], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":original_cropboxes[-1].width, "cropbox_height":original_cropboxes[-1].height}
|
589 |
page_sizes.append(out_page_image_sizes)
|
590 |
|
591 |
converted_file_path = output_folder + file_name_with_ext
|
|
|
596 |
|
597 |
elif file_extension in ['.csv']:
|
598 |
review_file_csv = read_file(file)
|
599 |
+
all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
|
600 |
json_from_csv = True
|
601 |
print("Converted CSV review file to json")
|
602 |
|
|
|
747 |
|
748 |
return out_message, out_file_paths
|
749 |
|
750 |
+
def join_values_within_threshold(df1:pd.DataFrame, df2:pd.DataFrame):
|
751 |
# Threshold for matching
|
752 |
threshold = 5
|
753 |
|
|
|
778 |
print(final_df)
|
779 |
|
780 |
|
781 |
+
def convert_annotation_json_to_review_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame(), page_sizes:List[dict]=[]) -> pd.DataFrame:
|
782 |
'''
|
783 |
Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
|
784 |
'''
|
|
|
926 |
# review_file_df[col] = np.floor(review_file_df[col])
|
927 |
|
928 |
# If colours are saved as list, convert to tuple
|
929 |
+
review_file_df.loc[:,"color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
|
930 |
|
931 |
# print("page_sizes:", page_sizes)
|
932 |
|
|
|
949 |
|
950 |
review_file_df = review_file_df.sort_values(['page', 'ymin', 'xmin', 'label'])
|
951 |
|
952 |
+
#review_file_df.to_csv(output_folder + "review_file_test.csv", index=None)
|
953 |
|
954 |
return review_file_df
|
955 |
|
956 |
+
def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame, image_paths:List[Image.Image], page_sizes:List[dict]=[]) -> List[dict]:
|
957 |
'''
|
958 |
Convert a review csv to a json file for use by the Gradio Annotation object.
|
959 |
'''
|
960 |
|
961 |
+
# Convert relative co-ordinates into image coordinates for the image annotation output object
|
962 |
+
if page_sizes:
|
963 |
page_sizes_df = pd.DataFrame(page_sizes)
|
964 |
|
965 |
+
# If there are no image coordinates, then just convert the first page to image to be able to see this at least.
|
966 |
+
if len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == len(page_sizes_df["image_width"]):
|
967 |
+
print("No image dimensions found, converting first page.")
|
968 |
|
969 |
+
# If no nulls, then can do image coordinate conversion
|
970 |
+
elif len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == 0:
|
971 |
|
972 |
+
if "image_width" not in review_file_df.columns:
|
973 |
+
review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
|
974 |
+
|
975 |
+
# If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
|
976 |
+
if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
|
977 |
+
review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
|
978 |
+
review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
|
979 |
+
review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
|
980 |
+
review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
|
981 |
|
982 |
# Keep only necessary columns
|
983 |
review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
|
|
|
991 |
# Create a list to hold the JSON data
|
992 |
json_data = []
|
993 |
|
994 |
+
for page_no, pdf_image_path in enumerate(image_paths):
|
995 |
+
reported_page_number = int(page_no + 1)
|
|
|
996 |
|
997 |
if reported_page_number in review_file_df["page"].values:
|
998 |
|
|
|
1010 |
else:
|
1011 |
annotation = {}
|
1012 |
annotation["image"] = pdf_image_path
|
1013 |
+
annotation["boxes"] = []
|
1014 |
|
1015 |
# Append the structured data to the json_data list
|
1016 |
json_data.append(annotation)
|
tools/file_redaction.py
CHANGED
@@ -8,38 +8,29 @@ import copy
|
|
8 |
|
9 |
from tqdm import tqdm
|
10 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
11 |
-
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
12 |
from typing import List, Dict, Tuple
|
13 |
import pandas as pd
|
14 |
|
15 |
-
#from presidio_image_redactor.entities import ImageRecognizerResult
|
16 |
from pdfminer.high_level import extract_pages
|
17 |
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
|
18 |
from pikepdf import Pdf, Dictionary, Name
|
19 |
-
import
|
20 |
-
from pymupdf import Rect
|
21 |
-
from fitz import Page
|
22 |
import gradio as gr
|
23 |
from gradio import Progress
|
24 |
from collections import defaultdict # For efficient grouping
|
25 |
|
26 |
-
from
|
27 |
-
from tools.aws_functions import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY
|
28 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
|
29 |
-
from tools.file_conversion import process_file,
|
30 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
|
31 |
-
from tools.helper_functions import get_file_name_without_type,
|
32 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image, prepare_image_or_pdf
|
33 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
|
34 |
-
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
35 |
-
|
36 |
-
# Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
|
37 |
-
page_break_value = get_or_create_env_var('page_break_value', '50000')
|
38 |
-
print(f'The value of page_break_value is {page_break_value}')
|
39 |
-
|
40 |
-
max_time_value = get_or_create_env_var('max_time_value', '999999')
|
41 |
-
print(f'The value of max_time_value is {max_time_value}')
|
42 |
|
|
|
|
|
|
|
|
|
43 |
|
44 |
def bounding_boxes_overlap(box1, box2):
|
45 |
"""Check if two bounding boxes overlap."""
|
@@ -103,6 +94,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
103 |
review_file_state:pd.DataFrame=[],
|
104 |
output_folder:str=output_folder,
|
105 |
document_cropboxes:List=[],
|
|
|
106 |
progress=gr.Progress(track_tqdm=True)):
|
107 |
'''
|
108 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
@@ -143,6 +135,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
143 |
- review_file_state (pd.DataFrame, optional): Output review file dataframe.
|
144 |
- output_folder (str, optional): Output folder for results.
|
145 |
- document_cropboxes (List, optional): List of document cropboxes for the PDF.
|
|
|
146 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
147 |
|
148 |
The function returns a redacted document along with processing logs.
|
@@ -239,7 +232,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
239 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
240 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
241 |
|
242 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes
|
243 |
|
244 |
# If we have reached the last page, return message and outputs
|
245 |
if current_loop_page >= number_of_pages:
|
@@ -255,7 +248,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
255 |
|
256 |
review_out_file_paths.extend(out_review_file_path)
|
257 |
|
258 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes
|
259 |
|
260 |
# Create allow list
|
261 |
# If string, assume file path
|
@@ -484,7 +477,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
484 |
#print("all_decision_process_table before in choose and run redactor:", all_decision_process_table)
|
485 |
#print("page_sizes before in choose and run redactor:", page_sizes)
|
486 |
|
487 |
-
review_df =
|
488 |
|
489 |
#print("annotation_all_pages:", annotations_all_pages)
|
490 |
#print("all_decision_process_table after in choose and run redactor:", all_decision_process_table)
|
@@ -560,7 +553,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
560 |
out_file_paths = list(set(out_file_paths))
|
561 |
review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
|
562 |
|
563 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_df, page_sizes
|
564 |
|
565 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
|
566 |
'''
|
|
|
8 |
|
9 |
from tqdm import tqdm
|
10 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
|
|
11 |
from typing import List, Dict, Tuple
|
12 |
import pandas as pd
|
13 |
|
|
|
14 |
from pdfminer.high_level import extract_pages
|
15 |
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
|
16 |
from pikepdf import Pdf, Dictionary, Name
|
17 |
+
from pymupdf import Rect, Page
|
|
|
|
|
18 |
import gradio as gr
|
19 |
from gradio import Progress
|
20 |
from collections import defaultdict # For efficient grouping
|
21 |
|
22 |
+
from tools.config import output_folder, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, page_break_value, max_time_value, LOAD_TRUNCATED_IMAGES
|
|
|
23 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
|
24 |
+
from tools.file_conversion import process_file, convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
|
25 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
|
26 |
+
from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
27 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image, prepare_image_or_pdf
|
28 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
+
ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
|
31 |
+
if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
|
32 |
+
else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
|
33 |
+
image_dpi = float(IMAGES_DPI)
|
34 |
|
35 |
def bounding_boxes_overlap(box1, box2):
|
36 |
"""Check if two bounding boxes overlap."""
|
|
|
94 |
review_file_state:pd.DataFrame=[],
|
95 |
output_folder:str=output_folder,
|
96 |
document_cropboxes:List=[],
|
97 |
+
page_sizes:List[dict]=[],
|
98 |
progress=gr.Progress(track_tqdm=True)):
|
99 |
'''
|
100 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
|
|
135 |
- review_file_state (pd.DataFrame, optional): Output review file dataframe.
|
136 |
- output_folder (str, optional): Output folder for results.
|
137 |
- document_cropboxes (List, optional): List of document cropboxes for the PDF.
|
138 |
+
- page_sizes (List[dict], optional): List of dictionaries of PDF page sizes in PDF or image format.
|
139 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
140 |
|
141 |
The function returns a redacted document along with processing logs.
|
|
|
232 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
233 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
234 |
|
235 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
|
236 |
|
237 |
# If we have reached the last page, return message and outputs
|
238 |
if current_loop_page >= number_of_pages:
|
|
|
248 |
|
249 |
review_out_file_paths.extend(out_review_file_path)
|
250 |
|
251 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
|
252 |
|
253 |
# Create allow list
|
254 |
# If string, assume file path
|
|
|
477 |
#print("all_decision_process_table before in choose and run redactor:", all_decision_process_table)
|
478 |
#print("page_sizes before in choose and run redactor:", page_sizes)
|
479 |
|
480 |
+
review_df = convert_annotation_json_to_review_df(annotations_all_pages, all_decision_process_table, page_sizes)
|
481 |
|
482 |
#print("annotation_all_pages:", annotations_all_pages)
|
483 |
#print("all_decision_process_table after in choose and run redactor:", all_decision_process_table)
|
|
|
553 |
out_file_paths = list(set(out_file_paths))
|
554 |
review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
|
555 |
|
556 |
+
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_df, page_sizes, document_cropboxes
|
557 |
|
558 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
|
559 |
'''
|
tools/find_duplicate_pages.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import pandas as pd
|
2 |
-
import argparse
|
3 |
-
import glob
|
4 |
import os
|
5 |
import re
|
6 |
from tools.helper_functions import output_folder
|
|
|
1 |
import pandas as pd
|
2 |
+
#import argparse
|
3 |
+
#import glob
|
4 |
import os
|
5 |
import re
|
6 |
from tools.helper_functions import output_folder
|
tools/helper_functions.py
CHANGED
@@ -9,19 +9,7 @@ import unicodedata
|
|
9 |
from typing import List
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from tools.auth import user_pool_id
|
12 |
-
|
13 |
-
|
14 |
-
def get_or_create_env_var(var_name, default_value):
|
15 |
-
# Get the environment variable if it exists
|
16 |
-
value = os.environ.get(var_name)
|
17 |
-
|
18 |
-
# If it doesn't exist, set it to the default value
|
19 |
-
if value is None:
|
20 |
-
os.environ[var_name] = default_value
|
21 |
-
value = default_value
|
22 |
-
|
23 |
-
return value
|
24 |
-
|
25 |
|
26 |
# Names for options labels
|
27 |
text_ocr_option = "Local model - selectable text"
|
@@ -31,24 +19,6 @@ textract_option = "AWS Textract service - all PDF types"
|
|
31 |
local_pii_detector = "Local"
|
32 |
aws_pii_detector = "AWS Comprehend"
|
33 |
|
34 |
-
output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
|
35 |
-
print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
|
36 |
-
|
37 |
-
session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
|
38 |
-
print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
|
39 |
-
|
40 |
-
input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
|
41 |
-
print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
|
42 |
-
|
43 |
-
# Retrieving or setting CUSTOM_HEADER
|
44 |
-
CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
|
45 |
-
print(f'CUSTOM_HEADER found')
|
46 |
-
|
47 |
-
# Retrieving or setting CUSTOM_HEADER_VALUE
|
48 |
-
CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
|
49 |
-
print(f'CUSTOM_HEADER_VALUE found')
|
50 |
-
|
51 |
-
|
52 |
def reset_state_vars():
|
53 |
return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
|
54 |
label="Modify redaction boxes",
|
@@ -268,24 +238,8 @@ def merge_csv_files(file_list):
|
|
268 |
|
269 |
return output_files
|
270 |
|
271 |
-
|
272 |
-
|
273 |
async def get_connection_params(request: gr.Request, output_folder_textbox:str='/output/'):
|
274 |
|
275 |
-
#print("request user:", request.username)
|
276 |
-
|
277 |
-
#request_data = await request.json() # Parse JSON body
|
278 |
-
#print("All request data:", request_data)
|
279 |
-
#context_value = request_data.get('context')
|
280 |
-
#if 'context' in request_data:
|
281 |
-
# print("Request context dictionary:", request_data['context'])
|
282 |
-
|
283 |
-
# print("Request headers dictionary:", request.headers)
|
284 |
-
# print("All host elements", request.client)
|
285 |
-
# print("IP address:", request.client.host)
|
286 |
-
# print("Query parameters:", dict(request.query_params))
|
287 |
-
# To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
|
288 |
-
#print("Request dictionary to object:", request.request.body())
|
289 |
print("Session hash:", request.session_hash)
|
290 |
|
291 |
if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
|
|
|
9 |
from typing import List
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from tools.auth import user_pool_id
|
12 |
+
from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, output_folder, session_output_folder
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# Names for options labels
|
15 |
text_ocr_option = "Local model - selectable text"
|
|
|
19 |
local_pii_detector = "Local"
|
20 |
aws_pii_detector = "AWS Comprehend"
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
def reset_state_vars():
|
23 |
return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
|
24 |
label="Modify redaction boxes",
|
|
|
238 |
|
239 |
return output_files
|
240 |
|
|
|
|
|
241 |
async def get_connection_params(request: gr.Request, output_folder_textbox:str='/output/'):
|
242 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
print("Session hash:", request.session_hash)
|
244 |
|
245 |
if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
|
tools/presidio_analyzer_custom.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
import gradio as gr
|
2 |
from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
|
3 |
-
from tqdm import tqdm
|
4 |
|
5 |
-
from presidio_analyzer import DictAnalyzerResult, RecognizerResult
|
6 |
from presidio_analyzer.nlp_engine import NlpArtifacts
|
7 |
|
8 |
def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
|
|
|
1 |
import gradio as gr
|
2 |
from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
|
3 |
+
#from tqdm import tqdm
|
4 |
|
5 |
+
from presidio_analyzer import DictAnalyzerResult, RecognizerResult
|
6 |
from presidio_analyzer.nlp_engine import NlpArtifacts
|
7 |
|
8 |
def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
|
tools/redaction_review.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
@@ -7,18 +9,18 @@ import uuid
|
|
7 |
from typing import List
|
8 |
from gradio_image_annotation import image_annotator
|
9 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
10 |
-
from
|
11 |
-
from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
|
12 |
-
from tools.file_redaction import redact_page_with_pymupdf
|
13 |
-
import json
|
14 |
-
import os
|
15 |
-
import re
|
16 |
import pymupdf
|
17 |
-
from fitz
|
18 |
from PIL import ImageDraw, Image
|
19 |
from collections import defaultdict
|
20 |
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
def decrease_page(number:int):
|
24 |
'''
|
@@ -110,9 +112,7 @@ def get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object:Annot
|
|
110 |
recogniser_dataframe_out = recogniser_dataframe_modified
|
111 |
|
112 |
try:
|
113 |
-
review_dataframe =
|
114 |
-
|
115 |
-
print("in get_filtered_recogniser_dataframe_and_dropdowns, recogniser_dropdown_value:", recogniser_dropdown_value)
|
116 |
|
117 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
|
118 |
recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
|
@@ -140,7 +140,6 @@ def get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object:Annot
|
|
140 |
|
141 |
return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
|
142 |
|
143 |
-
|
144 |
def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, recogniser_dataframe_modified:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=[], page_sizes:list[str]=[]):
|
145 |
'''
|
146 |
Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
|
@@ -168,7 +167,6 @@ def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, reco
|
|
168 |
|
169 |
return recogniser_entities_list, recogniser_dataframe_out, recogniser_dataframe_modified, recogniser_entities_drop, text_entities_drop, page_entities_drop
|
170 |
|
171 |
-
|
172 |
def undo_last_removal(backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base):
|
173 |
return backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
|
174 |
|
@@ -191,15 +189,24 @@ def exclude_selected_items_from_redaction(review_df: pd.DataFrame, selected_rows
|
|
191 |
# Keep only the rows that do not have a match in selected_rows_df
|
192 |
out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
|
193 |
|
194 |
-
out_image_annotations_state =
|
195 |
-
|
196 |
|
|
|
197 |
else:
|
198 |
out_review_df = review_df
|
199 |
-
|
200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
|
202 |
-
return out_review_df, out_image_annotations_state,
|
203 |
|
204 |
def update_annotator(image_annotator_object:AnnotatedImageData,
|
205 |
page_num:int,
|
@@ -315,8 +322,6 @@ def modify_existing_page_redactions(image_annotator_object:AnnotatedImageData,
|
|
315 |
if not current_page:
|
316 |
current_page = 1
|
317 |
|
318 |
-
print("in modify_existing_page_redactions - recogniser_entities_dropdown_value:", recogniser_entities_dropdown_value)
|
319 |
-
|
320 |
image_annotator_object['image'] = all_image_annotations[previous_page - 1]["image"]
|
321 |
|
322 |
if clear_all == False:
|
@@ -471,10 +476,10 @@ def apply_redactions(image_annotator_object:AnnotatedImageData,
|
|
471 |
#print("page_sizes before conversion in apply redactions:", page_sizes)
|
472 |
|
473 |
# Convert json to csv and also save this
|
474 |
-
review_df =
|
475 |
out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
|
476 |
|
477 |
-
print("Saving review file after
|
478 |
review_df.to_csv(out_review_file_file_path, index=None)
|
479 |
output_files.append(out_review_file_file_path)
|
480 |
|
@@ -589,6 +594,9 @@ def update_entities_df_text(choice:str, df:pd.DataFrame, label_dropdown_value:st
|
|
589 |
return filtered_df, recogniser_entities_drop, page_entities_drop
|
590 |
|
591 |
def reset_dropdowns():
|
|
|
|
|
|
|
592 |
return gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True)
|
593 |
|
594 |
def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
|
@@ -612,10 +620,13 @@ def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, i
|
|
612 |
- image_width: Width of the source image
|
613 |
- image_height: Height of the source image
|
614 |
- x1, y1, x2, y2: Coordinates in image space
|
|
|
615 |
|
616 |
Returns:
|
617 |
- Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
|
618 |
'''
|
|
|
|
|
619 |
|
620 |
# Calculate scaling factors
|
621 |
scale_width = pdf_page_width / image_width
|
@@ -636,12 +647,34 @@ def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, i
|
|
636 |
|
637 |
return pdf_x1, pdf_y1, pdf_x2, pdf_y2
|
638 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
639 |
|
640 |
-
def create_xfdf(
|
641 |
'''
|
642 |
Create an xfdf file from a review csv file and a pdf
|
643 |
'''
|
644 |
-
|
|
|
645 |
# Create root element
|
646 |
xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
|
647 |
|
@@ -651,13 +684,49 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
|
|
651 |
|
652 |
# Add annots
|
653 |
annots = SubElement(xfdf, 'annots')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
654 |
|
655 |
-
|
|
|
656 |
page_python_format = int(row["page"])-1
|
657 |
|
658 |
pymupdf_page = pymupdf_doc.load_page(page_python_format)
|
659 |
|
660 |
-
# Load cropbox sizes
|
661 |
if document_cropboxes:
|
662 |
#print("Document cropboxes:", document_cropboxes)
|
663 |
|
@@ -672,13 +741,12 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
|
|
672 |
else:
|
673 |
print("Document cropboxes not found.")
|
674 |
|
|
|
675 |
pdf_page_height = pymupdf_page.mediabox.height
|
676 |
pdf_page_width = pymupdf_page.mediabox.width
|
677 |
|
678 |
image = image_paths[page_python_format]
|
679 |
|
680 |
-
#print("image:", image)
|
681 |
-
|
682 |
if isinstance(image, str):
|
683 |
image = Image.open(image)
|
684 |
|
@@ -695,16 +763,22 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
|
|
695 |
redact_annot.set('page', str(int(row['page']) - 1))
|
696 |
|
697 |
# Convert coordinates
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
|
704 |
-
|
705 |
-
|
706 |
-
|
707 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
708 |
|
709 |
if CUSTOM_BOX_COLOUR == "grey":
|
710 |
colour_str = "0.5,0.5,0.5"
|
@@ -756,12 +830,13 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
|
|
756 |
|
757 |
return xml_str
|
758 |
|
759 |
-
def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], output_folder:str = output_folder, document_cropboxes:List=[]):
|
760 |
'''
|
761 |
Load in files to convert a review file into an Adobe comment file format
|
762 |
'''
|
763 |
output_paths = []
|
764 |
pdf_name = ""
|
|
|
765 |
|
766 |
if isinstance(input_files, str):
|
767 |
file_paths_list = [input_files]
|
@@ -778,29 +853,29 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], ou
|
|
778 |
else:
|
779 |
file_path = file.name
|
780 |
|
781 |
-
|
782 |
-
|
783 |
|
784 |
-
|
785 |
-
|
786 |
|
787 |
-
|
788 |
-
|
789 |
-
|
790 |
-
|
791 |
-
|
792 |
-
|
793 |
|
794 |
-
|
795 |
|
796 |
-
|
797 |
|
798 |
-
|
799 |
-
|
800 |
-
|
801 |
-
|
802 |
|
803 |
-
|
804 |
|
805 |
return output_paths
|
806 |
|
@@ -841,7 +916,7 @@ def convert_adobe_coords_to_image(pdf_page_width:float, pdf_page_height:float, i
|
|
841 |
|
842 |
return image_x1, image_y1, image_x2, image_y2
|
843 |
|
844 |
-
def parse_xfdf(xfdf_path):
|
845 |
'''
|
846 |
Parse the XFDF file and extract redaction annotations.
|
847 |
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
import gradio as gr
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
|
|
9 |
from typing import List
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
12 |
+
from pymupdf import Document, Rect
|
|
|
|
|
|
|
|
|
|
|
13 |
import pymupdf
|
14 |
+
#from fitz
|
15 |
from PIL import ImageDraw, Image
|
16 |
from collections import defaultdict
|
17 |
|
18 |
+
from tools.config import output_folder, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS
|
19 |
+
from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json
|
20 |
+
from tools.helper_functions import get_file_name_without_type, detect_file_type
|
21 |
+
from tools.file_redaction import redact_page_with_pymupdf
|
22 |
+
|
23 |
+
if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
|
24 |
|
25 |
def decrease_page(number:int):
|
26 |
'''
|
|
|
112 |
recogniser_dataframe_out = recogniser_dataframe_modified
|
113 |
|
114 |
try:
|
115 |
+
review_dataframe = convert_annotation_json_to_review_df(image_annotator_object, review_df, page_sizes)
|
|
|
|
|
116 |
|
117 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
|
118 |
recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
|
|
|
140 |
|
141 |
return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
|
142 |
|
|
|
143 |
def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, recogniser_dataframe_modified:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=[], page_sizes:list[str]=[]):
|
144 |
'''
|
145 |
Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
|
|
|
167 |
|
168 |
return recogniser_entities_list, recogniser_dataframe_out, recogniser_dataframe_modified, recogniser_entities_drop, text_entities_drop, page_entities_drop
|
169 |
|
|
|
170 |
def undo_last_removal(backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base):
    '''
    Hand back the three backup objects exactly as they were passed in.

    The function performs no computation: it simply returns its arguments, in the same order,
    as a tuple. Presumably it is wired to an 'undo' control so the backed-up review state,
    image annotations and recogniser entity dataframe overwrite the current ones - confirm
    against the caller.

    Returns:
    - Tuple of (backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base)
    '''
    return backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
|
172 |
|
|
|
189 |
# Keep only the rows that do not have a match in selected_rows_df
|
190 |
out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
|
191 |
|
192 |
+
out_image_annotations_state = convert_review_df_to_annotation_json(out_review_df, image_file_paths, page_sizes)
|
193 |
+
out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]
|
194 |
|
195 |
+
# Either there is nothing left in the selection dataframe, or the review dataframe
|
196 |
else:
|
197 |
out_review_df = review_df
|
198 |
+
out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
|
199 |
+
|
200 |
+
out_image_annotations_state = []
|
201 |
+
|
202 |
+
for page_no, page in enumerate(image_file_paths):
|
203 |
+
annotation = {}
|
204 |
+
annotation["image"] = image_file_paths[page_no]
|
205 |
+
annotation["boxes"] = []
|
206 |
+
|
207 |
+
out_image_annotations_state.append(annotation)
|
208 |
|
209 |
+
return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
|
210 |
|
211 |
def update_annotator(image_annotator_object:AnnotatedImageData,
|
212 |
page_num:int,
|
|
|
322 |
if not current_page:
|
323 |
current_page = 1
|
324 |
|
|
|
|
|
325 |
image_annotator_object['image'] = all_image_annotations[previous_page - 1]["image"]
|
326 |
|
327 |
if clear_all == False:
|
|
|
476 |
#print("page_sizes before conversion in apply redactions:", page_sizes)
|
477 |
|
478 |
# Convert json to csv and also save this
|
479 |
+
review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state, page_sizes=page_sizes)[["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"]]
|
480 |
out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
|
481 |
|
482 |
+
#print("Saving review file after convert_annotation_json_to_review_df function in apply redactions")
|
483 |
review_df.to_csv(out_review_file_file_path, index=None)
|
484 |
output_files.append(out_review_file_file_path)
|
485 |
|
|
|
594 |
return filtered_df, recogniser_entities_drop, page_entities_drop
|
595 |
|
596 |
def reset_dropdowns():
    '''
    Return three Gradio dropdown objects, each with value 'ALL'.

    Every dropdown is created with allow_custom_value=True so that 'ALL' is accepted
    even when it is not among the dropdown's current choices.
    '''
    return (
        gr.Dropdown(value="ALL", allow_custom_value=True),
        gr.Dropdown(value="ALL", allow_custom_value=True),
        gr.Dropdown(value="ALL", allow_custom_value=True),
    )
|
601 |
|
602 |
def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
|
|
|
620 |
- image_width: Width of the source image
|
621 |
- image_height: Height of the source image
|
622 |
- x1, y1, x2, y2: Coordinates in image space
|
623 |
+
- page_sizes: List of dicts containing sizes of page as pymupdf page or PIL image
|
624 |
|
625 |
Returns:
|
626 |
- Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
|
627 |
'''
|
628 |
+
|
629 |
+
|
630 |
|
631 |
# Calculate scaling factors
|
632 |
scale_width = pdf_page_width / image_width
|
|
|
647 |
|
648 |
return pdf_x1, pdf_y1, pdf_x2, pdf_y2
|
649 |
|
650 |
+
def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float, pdf_page_height: float = None):
    """
    Converts a box from PyMuPDF (fitz) space to Adobe PDF space.

    Parameters:
    - x1, y1, x2, y2: Coordinates in PyMuPDF space
    - pdf_page_height: Optional height of the PDF page. When given, the y values are
      flipped (pdf_page_height - y), since PyMuPDF measures y downwards from the top of the
      page while PDF/Adobe space measures y upwards from the bottom. When omitted (the
      default), the y values are only re-ordered, not flipped.

    Returns:
    - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space, with y1 <= y2
    """

    # Optional vertical flip between the top-left (PyMuPDF) and bottom-left (PDF) origins
    if pdf_page_height is not None:
        y1, y2 = pdf_page_height - y1, pdf_page_height - y2

    # x values are passed through unchanged
    pdf_x1, pdf_x2 = x1, x2

    # Ensure y1 is the bottom (lower) coordinate and y2 is the top (higher) one
    pdf_y1, pdf_y2 = min(y1, y2), max(y1, y2)

    return pdf_x1, pdf_y1, pdf_x2, pdf_y2
|
670 |
+
|
671 |
|
672 |
+
def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str], document_cropboxes:List=[], page_sizes:List[dict]=[]):
|
673 |
'''
|
674 |
Create an xfdf file from a review csv file and a pdf
|
675 |
'''
|
676 |
+
pages_are_images = True
|
677 |
+
|
678 |
# Create root element
|
679 |
xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
|
680 |
|
|
|
684 |
|
685 |
# Add annots
|
686 |
annots = SubElement(xfdf, 'annots')
|
687 |
+
|
688 |
+
# Check if page size object exists, and if current coordinates are in relative format or image coordinates format.
|
689 |
+
if page_sizes:
|
690 |
+
page_sizes_df = pd.DataFrame(page_sizes)
|
691 |
+
|
692 |
+
# If there are no image coordinates, then convert coordinates to pymupdf coordinates prior to export
|
693 |
+
if len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == len(page_sizes_df["image_width"]):
|
694 |
+
print("No image dimensions found, using pymupdf coordinates for conversion.")
|
695 |
+
|
696 |
+
if "mediabox_width" not in review_file_df.columns:
|
697 |
+
review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
|
698 |
+
|
699 |
+
# If all coordinates are less than or equal to one, this is a relative page scaling - change back to pymupdf (mediabox) coordinates
|
700 |
+
if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
|
701 |
+
review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["mediabox_width"]
|
702 |
+
review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["mediabox_width"]
|
703 |
+
review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["mediabox_height"]
|
704 |
+
review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["mediabox_height"]
|
705 |
+
|
706 |
+
pages_are_images = False
|
707 |
+
|
708 |
+
# If no nulls, then can do image coordinate conversion
|
709 |
+
elif len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == 0:
|
710 |
+
|
711 |
+
if "image_width" not in review_file_df.columns:
|
712 |
+
review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
|
713 |
+
|
714 |
+
# If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
|
715 |
+
if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
|
716 |
+
review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
|
717 |
+
review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
|
718 |
+
review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
|
719 |
+
review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
|
720 |
+
|
721 |
+
pages_are_images = True
|
722 |
|
723 |
+
# Go through each row of the review_file_df, create an entry in the output Adobe xfdf file.
|
724 |
+
for _, row in review_file_df.iterrows():
|
725 |
page_python_format = int(row["page"])-1
|
726 |
|
727 |
pymupdf_page = pymupdf_doc.load_page(page_python_format)
|
728 |
|
729 |
+
# Load cropbox sizes. Set cropbox to the original cropbox sizes from when the document was loaded into the app.
|
730 |
if document_cropboxes:
|
731 |
#print("Document cropboxes:", document_cropboxes)
|
732 |
|
|
|
741 |
else:
|
742 |
print("Document cropboxes not found.")
|
743 |
|
744 |
+
|
745 |
pdf_page_height = pymupdf_page.mediabox.height
|
746 |
pdf_page_width = pymupdf_page.mediabox.width
|
747 |
|
748 |
image = image_paths[page_python_format]
|
749 |
|
|
|
|
|
750 |
if isinstance(image, str):
|
751 |
image = Image.open(image)
|
752 |
|
|
|
763 |
redact_annot.set('page', str(int(row['page']) - 1))
|
764 |
|
765 |
# Convert coordinates
|
766 |
+
if pages_are_images == True:
|
767 |
+
x1, y1, x2, y2 = convert_image_coords_to_adobe(
|
768 |
+
pdf_page_width,
|
769 |
+
pdf_page_height,
|
770 |
+
image_page_width,
|
771 |
+
image_page_height,
|
772 |
+
row['xmin'],
|
773 |
+
row['ymin'],
|
774 |
+
row['xmax'],
|
775 |
+
row['ymax']
|
776 |
+
)
|
777 |
+
else:
|
778 |
+
x1, y1, x2, y2 = convert_pymupdf_coords_to_adobe(row['xmin'],
|
779 |
+
row['ymin'],
|
780 |
+
row['xmax'],
|
781 |
+
row['ymax'])
|
782 |
|
783 |
if CUSTOM_BOX_COLOUR == "grey":
|
784 |
colour_str = "0.5,0.5,0.5"
|
|
|
830 |
|
831 |
return xml_str
|
832 |
|
833 |
+
def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = output_folder, document_cropboxes:List=[], page_sizes:List[dict]=[]):
|
834 |
'''
|
835 |
Load in files to convert a review file into an Adobe comment file format
|
836 |
'''
|
837 |
output_paths = []
|
838 |
pdf_name = ""
|
839 |
+
file_path_name = ""
|
840 |
|
841 |
if isinstance(input_files, str):
|
842 |
file_paths_list = [input_files]
|
|
|
853 |
else:
|
854 |
file_path = file.name
|
855 |
|
856 |
+
file_path_name = get_file_name_without_type(file_path)
|
857 |
+
file_path_end = detect_file_type(file_path)
|
858 |
|
859 |
+
if file_path_end == "pdf":
|
860 |
+
pdf_name = os.path.basename(file_path)
|
861 |
|
862 |
+
if file_path_end == "csv":
|
863 |
+
# If no pdf name, just get the name of the file path
|
864 |
+
if not pdf_name:
|
865 |
+
pdf_name = file_path_name
|
866 |
+
# Read CSV file
|
867 |
+
review_file_df = pd.read_csv(file_path)
|
868 |
|
869 |
+
review_file_df.fillna('', inplace=True) # Replace NaN in review file with an empty string
|
870 |
|
871 |
+
xfdf_content = create_xfdf(review_file_df, pdf_name, pdf_doc, image_paths, document_cropboxes, page_sizes)
|
872 |
|
873 |
+
output_path = output_folder + file_path_name + "_adobe.xfdf"
|
874 |
+
|
875 |
+
with open(output_path, 'w', encoding='utf-8') as f:
|
876 |
+
f.write(xfdf_content)
|
877 |
|
878 |
+
output_paths.append(output_path)
|
879 |
|
880 |
return output_paths
|
881 |
|
|
|
916 |
|
917 |
return image_x1, image_y1, image_x2, image_y2
|
918 |
|
919 |
+
def parse_xfdf(xfdf_path:str):
|
920 |
'''
|
921 |
Parse the XFDF file and extract redaction annotations.
|
922 |
|