Commit cb349ad
Parent(s): 3518b67
Ensured the text OCR outputs have no line breaks at the end. Multi-line custom text searches are now possible. Files for review are sent from the redact button. Fixed image redaction (review not fixed yet). Can get user pool details from headers. Gradio update.
Browse files
- .dockerignore +2 -1
- .gitignore +2 -1
- app.py +29 -28
- requirements.txt +1 -1
- tools/auth.py +12 -1
- tools/custom_image_analyser_engine.py +643 -399
- tools/file_conversion.py +9 -2
- tools/file_redaction.py +291 -297
- tools/helper_functions.py +26 -4
- tools/load_spacy_model_custom_recognisers.py +1 -2
- tools/redaction_review.py +22 -14
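The first two points of the commit message (no trailing line breaks in the text OCR output, multi-line custom text searches) describe a small normalisation step. As a hedged illustration only, not code from this commit (the function and variable names below are invented), stripping trailing newlines per OCR line and joining lines before matching a deny-list phrase might look like:

# Illustrative sketch only: normalising OCR lines so a multi-line custom
# search term can still match. All names here are hypothetical.
def normalise_ocr_lines(ocr_lines: list[str]) -> list[str]:
    # Remove trailing line breaks and whitespace that would break exact matching
    return [line.rstrip("\r\n ") for line in ocr_lines]

def find_multiline_term(ocr_lines: list[str], term: str) -> bool:
    # Join the normalised lines with single spaces so a phrase that was split
    # across two OCR lines can be found with a plain substring search
    page_text = " ".join(normalise_ocr_lines(ocr_lines))
    return term.lower() in page_text.lower()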
.dockerignore CHANGED
@@ -16,4 +16,5 @@ build/*
 dist/*
 build_deps/*
 logs/*
-doc_redaction_amplify_app/*
+doc_redaction_amplify_app/*
+user_guide/*
.gitignore CHANGED
@@ -16,4 +16,5 @@ build/*
 dist/*
 build_deps/*
 logs/*
-doc_redaction_amplify_app/*
+doc_redaction_amplify_app/*
+user_guide/*
app.py CHANGED
@@ -66,26 +66,27 @@ with app:
 
 pdf_doc_state = gr.State([])
 all_image_annotations_state = gr.State([])
-
-
+
+
+all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
+all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
+review_file_state = gr.Dataframe(value=pd.DataFrame(), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
 
 session_hash_state = gr.State()
 s3_output_folder_state = gr.State()
 
 first_loop_state = gr.State(True)
 second_loop_state = gr.State(False)
+do_not_save_pdf_state = gr.State(False)
 
-prepared_pdf_state = gr.State([])
-images_pdf_state = gr.State([]) # List of pdf pages converted to PIL images
+prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
+images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
 
-output_image_files_state = gr.State([])
-output_file_list_state = gr.State([])
-text_output_file_list_state = gr.State([])
-log_files_output_list_state = gr.State([])
-
-review_file_state = gr.State(pd.DataFrame())
-
-do_not_save_pdf_state = gr.State(False)
+output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False) #gr.State([])
+output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
+text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
+log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False) #gr.State([])
 
 # Logging state
 log_file_name = 'log.csv'
@@ -95,7 +96,7 @@ with app:
 access_logs_state = gr.State(access_logs_folder + log_file_name)
 access_s3_logs_loc_state = gr.State(access_logs_folder)
 usage_logs_state = gr.State(usage_logs_folder + log_file_name)
-usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+usage_s3_logs_loc_state = gr.State(usage_logs_folder)
 
 # Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
 session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
@@ -115,8 +116,7 @@ with app:
 estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
 annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
 
-s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
-
+s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
 ## Annotator zoom value
 annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
@@ -129,16 +129,16 @@ with app:
 ## Settings page variables
 default_allow_list_file_name = "default_allow_list.csv"
 default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
-in_allow_list_state = gr.
+in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_allow_list_df", visible=False, type="pandas")
 
 default_deny_list_file_name = "default_deny_list.csv"
 default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
-in_deny_list_state = gr.
+in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_deny_list_df", visible=False, type="pandas")
 in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
 
 fully_redacted_list_file_name = "default_fully_redacted_list.csv"
 fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
-in_fully_redacted_list_state = gr.
+in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_full_redacted_list_df", visible=False, type="pandas")
 in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
 
 # S3 settings for default allow list load
@@ -209,6 +209,8 @@ with app:
 with gr.Row():
 annotate_zoom_in = gr.Button("Zoom in")
 annotate_zoom_out = gr.Button("Zoom out")
+with gr.Row():
+annotation_button_apply = gr.Button("Apply revised redactions to pdf", variant="secondary")
 with gr.Row():
 clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
 
@@ -237,18 +239,16 @@ with app:
 )
 
 with gr.Row():
-
+annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
+annotate_current_page_bottom = gr.Number(value=1, label="Page (press enter to change)", precision=0, interactive=True, scale = 2)
+annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
+annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
 
 #with gr.Column(scale=1):
 with gr.Row():
 recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
 recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
-
-with gr.Row():
-annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
-annotate_current_page_bottom = gr.Number(value=1, label="Page (press enter to change)", precision=0, interactive=True, scale = 2)
-annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
-annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
+
 
 # TEXT / TABULAR DATA TAB
 with gr.Tab(label="Open text or Excel/csv files"):
@@ -322,12 +322,12 @@ with app:
 document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
 then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
 then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
-outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
+outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # If the app has completed a batch of pages, it will run this until the end of all pages in the document
 current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
-outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
+outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # If a file has been completed, the function will continue onto the next document
@@ -394,7 +394,8 @@ with app:
 
 recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
 then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
 
 
 ###
requirements.txt CHANGED
@@ -10,7 +10,7 @@ pandas==2.2.3
 spacy==3.8.3
 #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.
+gradio==5.12.0
 boto3==1.35.83
 pyarrow==18.1.0
 openpyxl==3.1.2
tools/auth.py CHANGED
@@ -1,10 +1,21 @@
 
+import os
 import boto3
 import gradio as gr
 import hmac
 import hashlib
 import base64
-
+
+def get_or_create_env_var(var_name, default_value):
+    # Get the environment variable if it exists
+    value = os.environ.get(var_name)
+
+    # If it doesn't exist, set it to the default value
+    if value is None:
+        os.environ[var_name] = default_value
+        value = default_value
+
+    return value
 
 client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
 #print(f'The value of AWS_CLIENT_ID is {client_id}')
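tools/auth.py now defines get_or_create_env_var locally instead of importing it, and the commit message also mentions reading user pool details from headers. That header-reading code is not part of the hunk shown above; purely as a hedged sketch of the general pattern in a Gradio app (the header name below is an assumption for illustration, not taken from this repository):

import gradio as gr

def get_username_from_headers(request: gr.Request) -> str:
    # Hypothetical example: an authenticating proxy in front of a Cognito user
    # pool typically forwards the signed-in identity in a request header.
    # The header name here is an assumption, not this app's actual value.
    return request.headers.get("x-amzn-oidc-identity", "")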
tools/custom_image_analyser_engine.py CHANGED
@@ -1,20 +1,19 @@
 import pytesseract
 import numpy as np
 from presidio_analyzer import AnalyzerEngine, RecognizerResult
-#from presidio_image_redactor import ImagePreprocessor
 from typing import List, Dict, Optional, Union, Tuple
 from dataclasses import dataclass
 import time
 import cv2
+import copy
+from copy import deepcopy
+from pdfminer.layout import LTChar
 import PIL
-from PIL import
+from PIL import Image
 from typing import Optional, Tuple, Union
-from copy import deepcopy
 from tools.helper_functions import clean_unicode_text
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
 from tools.load_spacy_model_custom_recognisers import custom_entities
-#import string # Import string to get a list of common punctuation characters
-import re # Add this import at the top of the file
 
 @dataclass
 class OCRResult:
@@ -174,7 +173,6 @@ class BilateralFilter(ImagePreprocessor):
 
 return Image.fromarray(filtered_image), metadata
 
-
 class SegmentedAdaptiveThreshold(ImagePreprocessor):
 """SegmentedAdaptiveThreshold class.
 
@@ -252,9 +250,6 @@ class SegmentedAdaptiveThreshold(ImagePreprocessor):
 metadata = {"C": c, "background_color": background_color, "contrast": contrast}
 return Image.fromarray(adaptive_threshold_image), metadata
 
-
-
-
 class ImageRescaling(ImagePreprocessor):
 """ImageRescaling class. Rescales images based on their size."""
 
@@ -302,7 +297,6 @@ class ImageRescaling(ImagePreprocessor):
 metadata = {"scale_factor": scale_factor}
 return Image.fromarray(rescaled_image), metadata
 
-
 class ContrastSegmentedImageEnhancer(ImagePreprocessor):
 """Class containing all logic to perform contrastive segmentation.
 
@@ -463,261 +915,225 @@ class CustomImageAnalyzerEngine:
 self,
 line_level_ocr_results: List[OCRResult],
 ocr_results_with_children: Dict[str, Dict],
-chosen_redact_comprehend_entities:List[str],
-pii_identification_method:str="Local",
-comprehend_client="",
 **text_analyzer_kwargs
 ) -> List[CustomImageRecognizerResult]:
-# Define English as default language, if not specified
-if "language" not in text_analyzer_kwargs:
-text_analyzer_kwargs["language"] = "en"
 
-
-
 comprehend_query_number = 0
-
-allow_list = text_analyzer_kwargs.get('allow_list', [])
-
-combined_results = []
-# Initialize variables for batching
-current_batch = ""
-current_batch_mapping = [] # List of (start_pos, line_index, original_text) tuples
-analyzer_results_by_line = [[] for _ in line_level_ocr_results] # Store results for each line
 
-#
 for i, line_level_ocr_result in enumerate(line_level_ocr_results):
-if
-
-
-
-
 
-
 
-
-
 
-
-
 
-
 
-
-
-if current_batch:
-current_batch += " | " # Use a separator that's unlikely to appear in the text
-
-
-
-
-
-
-
-
-
-
-
-LanguageCode=text_analyzer_kwargs["language"]
-)
-
-except Exception as e:
-print("AWS Comprehend call failed due to:", e, "waiting three seconds to try again.")
-time.sleep(3)
-response = comprehend_client.detect_pii_entities(
-Text=current_batch,
-LanguageCode=text_analyzer_kwargs["language"]
-)
-
 comprehend_query_number += 1
-
-# Map results back to original lines
-if response and "Entities" in response:
-for entity in response["Entities"]:
-entity_start = entity["BeginOffset"]
-entity_end = entity["EndOffset"]
-
-# Find which line this entity belongs to
-for batch_start, line_idx, original_text in current_batch_mapping:
-batch_end = batch_start + len(original_text)
-
-# Check if entity belongs to this line
-if batch_start <= entity_start < batch_end:
-# Adjust offsets relative to the original line
-relative_start = entity_start - batch_start
-relative_end = min(entity_end - batch_start, len(original_text))
-
-result_text = original_text[relative_start:relative_end]
-
-if result_text not in allow_list:
-if entity.get("Type") in chosen_redact_comprehend_entities:
-# Create a new entity with adjusted positions
-adjusted_entity = entity.copy()
-adjusted_entity["BeginOffset"] = relative_start
-adjusted_entity["EndOffset"] = relative_end
-
-recogniser_entity = recognizer_result_from_dict(adjusted_entity)
-analyzer_results_by_line[line_idx].append(recogniser_entity)
 
 # Reset batch
-current_batch =
-
 
-
-for i, analyzer_result in enumerate(analyzer_results_by_line):
-if i >= len(ocr_results_with_children):
-continue
 
 child_level_key = list(ocr_results_with_children.keys())[i]
 ocr_results_with_children_line_level = ocr_results_with_children[child_level_key]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-split_relevant_text = relevant_text.split()
-
-loop_child_words = child_words.copy()
-
-for word_text in split_relevant_text: # Iterate through each word in relevant_text
-
-quote_str = '"'
-replace_str = '(?:"|"|")'
-
-word_regex = rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'
-
-for word in loop_child_words:
-# Check for regex as whole word
-
-if re.search(word_regex, word['text']):
-#if re.search(r'\b' + re.escape(word_text) + r'\b', word['text']):
-found_word = word
-
-if word_num == 0: # First word
-left = found_word['bounding_box'][0]
-top = found_word['bounding_box'][1]
-bottom = max(bottom, found_word['bounding_box'][3]) # Update bottom for all words
-all_words += found_word['text'] + " " # Concatenate words
-total_width = found_word['bounding_box'][2] - left # Add each word's width
-word_num += 1
-
-# Drop the first word of child_words
-loop_child_words = loop_child_words[1:] # Skip the first word
-
-break # Move to the next word in relevant_text
-
-width = total_width + horizontal_buffer # Set width to total width of all matched words
-height = bottom - top if word_num > 0 else 0 # Calculate height
-
-relevant_line_ocr_result = OCRResult(
-text=relevant_text,
-left=left,
-top=top - height_buffer,
-width=width,
-height=height + height_buffer
-)
-
-if not ocr_results_with_children_line_level:
-# Fallback to previous method if not found in ocr_results_with_children
-print("No child info found")
-continue
-
-# Reset the word positions indicated in the relevant ocr_result - i.e. it starts from 0 and ends at word length
-result_reset_pos = result
-result_reset_pos.start = 0
-result_reset_pos.end = len(relevant_text)
-
-#print("result_reset_pos:", result_reset_pos)
-#print("relevant_line_ocr_result:", relevant_line_ocr_result)
-#print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
-
-# Map the analyzer results to bounding boxes for this line
-line_results = self.map_analyzer_results_to_bounding_boxes(
-[result_reset_pos], [relevant_line_ocr_result], relevant_line_ocr_result.text, allow_list, ocr_results_with_children_line_level
 )
-
-#print("line_results:", line_results)
-
-combined_results.extend(line_results)
 
 return combined_results, comprehend_query_number
 
 @staticmethod
 def map_analyzer_results_to_bounding_boxes(
-
-
-
-
-
-
 redaction_bboxes = []
-text_position = 0
 
 for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
-
 
-
 
 for redaction_result in text_analyzer_results:
-
-
-
-
-
-
-
-if
-
-#
-
-
-
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 )
-
-
-text_position = word_end + 1 # +1 for the space between words
 
 return redaction_bboxes
 
 @staticmethod
 def remove_space_boxes(ocr_result: dict) -> dict:
 """Remove OCR bboxes that are for spaces.
-
 :param ocr_result: OCR results (raw or thresholded).
 :return: OCR results with empty words removed.
 """
@@ -740,10 +1156,8 @@ class CustomImageAnalyzerEngine:
 ocr_result: Dict[str, List[Union[int, str]]], scale_factor: float
 ) -> Dict[str, float]:
 """Scale down the bounding box results based on a scale percentage.
-
 :param ocr_result: OCR results (raw).
 :param scale_percent: Scale percentage for resizing the bounding box.
-
 :return: OCR results (scaled).
 """
 scaled_results = deepcopy(ocr_result)
@@ -790,173 +1204,3 @@ class CustomImageAnalyzerEngine:
 estimated_width = int(proportion * ocr_result.width)
 
 return estimated_width
-
-
-# def estimate_width(self, ocr_result: OCRResult, start: int, end: int) -> int:
-# # Extract the relevant text portion
-# relevant_text = ocr_result.text[start:end]
-
-# # Check if the relevant text is the entire text of the OCR result
-# if relevant_text == ocr_result.text:
-# return ocr_result.width
-
-# # Estimate the font size based on the height of the bounding box
-# estimated_font_size = ocr_result.height + 4
-
-# # Create a blank image with enough width to measure the text
-# dummy_image = Image.new('RGB', (1000, 50), color=(255, 255, 255))
-# draw = ImageDraw.Draw(dummy_image)
-
-# # Specify the font and size
-# try:
-# font = ImageFont.truetype("arial.ttf", estimated_font_size) # Adjust the font file as needed
-# except IOError:
-# font = ImageFont.load_default() # Fallback to default font if the specified font is not found
-
-# # Draw the relevant text on the image
-# draw.text((0, 0), relevant_text, fill=(0, 0, 0), font=font)
-
-# # Save the image for debugging purposes
-# dummy_image.save("debug_image.png")
-
-# # Use pytesseract to get the bounding box of the relevant text
-# bbox = pytesseract.image_to_boxes(dummy_image, config=self.tesseract_config)
-
-# # Print the bbox for debugging
-# print("Bounding box:", bbox)
-
-# # Calculate the width from the bounding box
-# if bbox:
-# try:
-# # Initialize min_left and max_right with extreme values
-# min_left = float('inf')
-# max_right = float('-inf')
-
-# # Split the bbox string into lines
-# bbox_lines = bbox.splitlines()
-
-# for line in bbox_lines:
-# parts = line.split()
-# if len(parts) == 6:
-# _, left, _, right, _, _ = parts
-# left = int(left)
-# right = int(right)
-# min_left = min(min_left, left)
-# max_right = max(max_right, right)
-
-# width = max_right - min_left
-# except ValueError as e:
-# print("Error parsing bounding box:", e)
-# width = 0
-# else:
-# width = 0
-
-# print("Estimated width:", width)
-
-# return width
-
-
-
-# Function to combine OCR results into line-level results
-def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
-# Group OCR results into lines based on y_threshold
-lines = []
-current_line = []
-for result in sorted(ocr_results, key=lambda x: x.top):
-if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
-current_line.append(result)
-else:
-lines.append(current_line)
-current_line = [result]
-if current_line:
-lines.append(current_line)
-
-# Sort each line by left position
-for line in lines:
-line.sort(key=lambda x: x.left)
-
-# Flatten the sorted lines back into a single list
-sorted_results = [result for line in lines for result in line]
-
-combined_results = []
-new_format_results = {}
-current_line = []
-current_bbox = None
-line_counter = 1
-
-def create_ocr_result_with_children(combined_results, i, current_bbox, current_line):
-combined_results["text_line_" + str(i)] = {
-"line": i,
-'text': current_bbox.text,
-'bounding_box': (current_bbox.left, current_bbox.top,
-current_bbox.left + current_bbox.width,
-current_bbox.top + current_bbox.height),
-'words': [{'text': word.text,
-'bounding_box': (word.left, word.top,
-word.left + word.width,
-word.top + word.height)}
-for word in current_line]
-}
-return combined_results["text_line_" + str(i)]
-
-for result in sorted_results:
-if not current_line:
-# Start a new line
-current_line.append(result)
-current_bbox = result
-else:
-# Check if the result is on the same line (y-axis) and close horizontally (x-axis)
-last_result = current_line[-1]
-
-if abs(result.top - last_result.top) <= y_threshold and \
-(result.left - (last_result.left + last_result.width)) <= x_threshold:
-# Update the bounding box to include the new word
-new_right = max(current_bbox.left + current_bbox.width, result.left + result.width)
-current_bbox = OCRResult(
-text=f"{current_bbox.text} {result.text}",
-left=current_bbox.left,
-top=current_bbox.top,
-width=new_right - current_bbox.left,
-height=max(current_bbox.height, result.height)
-)
-current_line.append(result)
-else:
-
-
-# Commit the current line and start a new one
-combined_results.append(current_bbox)
-# new_format_results[current_bbox.text] = { # f"combined_text_{line_counter}"
-# 'bounding_box': (current_bbox.left, current_bbox.top,
-# current_bbox.left + current_bbox.width,
-# current_bbox.top + current_bbox.height),
-# 'words': [{'text': word.text,
-# 'bounding_box': (word.left, word.top,
-# word.left + word.width,
-# word.top + word.height)}
-# for word in current_line]
-# }
-new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
-
-line_counter += 1
-current_line = [result]
-current_bbox = result
-
-# Append the last line
-if current_bbox:
-combined_results.append(current_bbox)
-# new_format_results[current_bbox.text] = { # f"combined_text_{line_counter}"
-# 'bounding_box': (current_bbox.left, current_bbox.top,
-# current_bbox.left + current_bbox.width,
-# current_bbox.top + current_bbox.height),
-# 'words': [{'text': word.text,
-# 'bounding_box': (word.left, word.top,
-# word.left + word.width,
-# word.top + word.height)}
-# for word in current_line]
-# }
-
-new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
-
-
-return combined_results, new_format_results
-
@@ -409,6 +403,464 @@ def bounding_boxes_overlap(box1, box2):
 """Check if two bounding boxes overlap."""
 return (box1[0] < box2[2] and box2[0] < box1[2] and
 box1[1] < box2[3] and box2[1] < box1[3])
|
406 |
+
|
407 |
+
def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
|
408 |
+
for entity in page_analyser_result:
|
409 |
+
entity_start = entity.start
|
410 |
+
entity_end = entity.end
|
411 |
+
|
412 |
+
# Track if the entity has been added to any line
|
413 |
+
added_to_line = False
|
414 |
+
|
415 |
+
for batch_start, line_idx, original_line, chars in page_text_mapping:
|
416 |
+
batch_end = batch_start + len(original_line.text)
|
417 |
+
|
418 |
+
# Check if the entity overlaps with the current line
|
419 |
+
if batch_start < entity_end and batch_end > entity_start: # Overlap condition
|
420 |
+
relative_start = max(0, entity_start - batch_start) # Adjust start relative to the line
|
421 |
+
relative_end = min(entity_end - batch_start, len(original_line.text)) # Adjust end relative to the line
|
422 |
+
|
423 |
+
# Create a new adjusted entity
|
424 |
+
adjusted_entity = copy.deepcopy(entity)
|
425 |
+
adjusted_entity.start = relative_start
|
426 |
+
adjusted_entity.end = relative_end
|
427 |
+
|
428 |
+
# Check if this line already has an entry
|
429 |
+
existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
|
430 |
+
|
431 |
+
if existing_entry is None:
|
432 |
+
all_text_line_results.append((line_idx, [adjusted_entity]))
|
433 |
+
else:
|
434 |
+
existing_entry.append(adjusted_entity) # Append to the existing list of entities
|
435 |
+
|
436 |
+
added_to_line = True
|
437 |
+
|
438 |
+
# If the entity spans multiple lines, you may want to handle that here
|
439 |
+
if not added_to_line:
|
440 |
+
# Handle cases where the entity does not fit in any line (optional)
|
441 |
+
print(f"Entity '{entity}' does not fit in any line.")
|
442 |
+
|
443 |
+
return all_text_line_results
|
444 |
+
|
445 |
+
def map_back_comprehend_entity_results(response, current_batch_mapping, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
|
446 |
+
if not response or "Entities" not in response:
|
447 |
+
return all_text_line_results
|
448 |
+
|
449 |
+
for entity in response["Entities"]:
|
450 |
+
if entity.get("Type") not in chosen_redact_comprehend_entities:
|
451 |
+
continue
|
452 |
+
|
453 |
+
entity_start = entity["BeginOffset"]
|
454 |
+
entity_end = entity["EndOffset"]
|
455 |
+
|
456 |
+
# Track if the entity has been added to any line
|
457 |
+
added_to_line = False
|
458 |
+
|
459 |
+
# Find the correct line and offset within that line
|
460 |
+
for batch_start, line_idx, original_line, chars, line_offset in current_batch_mapping:
|
461 |
+
batch_end = batch_start + len(original_line.text[line_offset:])
|
462 |
+
|
463 |
+
# Check if the entity overlaps with the current line
|
464 |
+
if batch_start < entity_end and batch_end > entity_start: # Overlap condition
|
465 |
+
# Calculate the absolute position within the line
|
466 |
+
relative_start = max(0, entity_start - batch_start + line_offset)
|
467 |
+
relative_end = min(entity_end - batch_start + line_offset, len(original_line.text))
|
468 |
+
|
469 |
+
result_text = original_line.text[relative_start:relative_end]
|
470 |
+
|
471 |
+
if result_text not in allow_list:
|
472 |
+
adjusted_entity = entity.copy()
|
473 |
+
adjusted_entity["BeginOffset"] = relative_start # Now relative to the full line
|
474 |
+
adjusted_entity["EndOffset"] = relative_end
|
475 |
+
|
476 |
+
recogniser_entity = recognizer_result_from_dict(adjusted_entity)
|
477 |
+
|
478 |
+
existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
|
479 |
+
if existing_entry is None:
|
480 |
+
all_text_line_results.append((line_idx, [recogniser_entity]))
|
481 |
+
else:
|
482 |
+
existing_entry.append(recogniser_entity) # Append to the existing list of entities
|
483 |
+
|
484 |
+
added_to_line = True
|
485 |
+
|
486 |
+
# Optional: Handle cases where the entity does not fit in any line
|
487 |
+
if not added_to_line:
|
488 |
+
print(f"Entity '{entity}' does not fit in any line.")
|
489 |
+
|
490 |
+
return all_text_line_results
|
491 |
+
|
492 |
+
def do_aws_comprehend_call(current_batch, current_batch_mapping, comprehend_client, language, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
|
493 |
+
if not current_batch:
|
494 |
+
return all_text_line_results
|
495 |
+
|
496 |
+
max_retries = 3
|
497 |
+
retry_delay = 3
|
498 |
+
|
499 |
+
for attempt in range(max_retries):
|
500 |
+
try:
|
501 |
+
response = comprehend_client.detect_pii_entities(
|
502 |
+
Text=current_batch.strip(),
|
503 |
+
LanguageCode=language
|
504 |
+
)
|
505 |
+
|
506 |
+
all_text_line_results = map_back_comprehend_entity_results(
|
507 |
+
response,
|
508 |
+
current_batch_mapping,
|
509 |
+
allow_list,
|
510 |
+
chosen_redact_comprehend_entities,
|
511 |
+
all_text_line_results
|
512 |
+
)
|
513 |
+
|
514 |
+
return all_text_line_results
|
515 |
+
|
516 |
+
except Exception as e:
|
517 |
+
if attempt == max_retries - 1:
|
518 |
+
raise
|
519 |
+
time.sleep(retry_delay)
|
520 |
+
|
521 |
+
def run_page_text_redaction(
|
522 |
+
language: str,
|
523 |
+
chosen_redact_entities: List[str],
|
524 |
+
chosen_redact_comprehend_entities: List[str],
|
525 |
+
line_level_text_results_list: List[str],
|
526 |
+
line_characters: List,
|
527 |
+
page_analyser_results: List = [],
|
528 |
+
page_analysed_bounding_boxes: List = [],
|
529 |
+
comprehend_client = None,
|
530 |
+
allow_list: List[str] = None,
|
531 |
+
pii_identification_method: str = "Local",
|
532 |
+
nlp_analyser = None,
|
533 |
+
score_threshold: float = 0.0,
|
534 |
+
custom_entities: List[str] = None,
|
535 |
+
comprehend_query_number:int = 0#,
|
536 |
+
#merge_text_bounding_boxes_fn = merge_text_bounding_boxes
|
537 |
+
):
|
538 |
+
#if not merge_text_bounding_boxes_fn:
|
539 |
+
# raise ValueError("merge_text_bounding_boxes_fn is required")
|
540 |
+
|
541 |
+
page_text = ""
|
542 |
+
page_text_mapping = []
|
543 |
+
all_text_line_results = []
|
544 |
+
comprehend_query_number = 0
|
545 |
+
|
546 |
+
# Collect all text from the page
|
547 |
+
for i, text_line in enumerate(line_level_text_results_list):
|
548 |
+
#print("line_level_text_results_list:", line_level_text_results_list)
|
549 |
+
if chosen_redact_entities:
|
550 |
+
if page_text:
|
551 |
+
#page_text += " | "
|
552 |
+
page_text += " "
|
553 |
+
|
554 |
+
start_pos = len(page_text)
|
555 |
+
page_text += text_line.text
|
556 |
+
page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
|
557 |
+
|
558 |
+
# Process based on identification method
|
559 |
+
if pii_identification_method == "Local":
|
560 |
+
if not nlp_analyser:
|
561 |
+
raise ValueError("nlp_analyser is required for Local identification method")
|
562 |
+
|
563 |
+
print("page text:", page_text)
|
564 |
+
|
565 |
+
page_analyser_result = nlp_analyser.analyze(
|
566 |
+
text=page_text,
|
567 |
+
language=language,
|
568 |
+
entities=chosen_redact_entities,
|
569 |
+
score_threshold=score_threshold,
|
570 |
+
return_decision_process=True,
|
571 |
+
allow_list=allow_list
|
572 |
+
)
|
573 |
+
|
574 |
+
#print("page_analyser_result:", page_analyser_result)
|
575 |
+
|
576 |
+
all_text_line_results = map_back_entity_results(
|
577 |
+
page_analyser_result,
|
578 |
+
page_text_mapping,
|
579 |
+
all_text_line_results
|
580 |
+
)
|
581 |
+
|
582 |
+
#print("all_text_line_results:", all_text_line_results)
|
583 |
+
|
584 |
+
elif pii_identification_method == "AWS Comprehend":
|
585 |
+
#print("page text:", page_text)
|
586 |
+
|
587 |
+
# Process custom entities if any
|
588 |
+
if custom_entities:
|
589 |
+
custom_redact_entities = [
|
590 |
+
entity for entity in chosen_redact_comprehend_entities
|
591 |
+
if entity in custom_entities
|
592 |
+
]
|
593 |
+
if custom_redact_entities:
|
594 |
+
page_analyser_result = nlp_analyser.analyze(
|
595 |
+
text=page_text,
|
596 |
+
language=language,
|
597 |
+
entities=custom_redact_entities,
|
598 |
+
score_threshold=score_threshold,
|
599 |
+
return_decision_process=True,
|
600 |
+
allow_list=allow_list
|
601 |
+
)
|
602 |
+
|
603 |
+
print("page_analyser_result:", page_analyser_result)
|
604 |
+
|
605 |
+
all_text_line_results = map_back_entity_results(
|
606 |
+
page_analyser_result,
|
607 |
+
page_text_mapping,
|
608 |
+
all_text_line_results
|
609 |
+
)
|
610 |
+
|
611 |
+
current_batch = ""
|
612 |
+
current_batch_mapping = []
|
613 |
+
batch_char_count = 0
|
614 |
+
batch_word_count = 0
|
615 |
+
|
616 |
+
for i, text_line in enumerate(line_level_text_results_list):
|
617 |
+
words = text_line.text.split()
|
618 |
+
word_start_positions = []
|
619 |
+
|
620 |
+
# Calculate word start positions within the line
|
621 |
+
current_pos = 0
|
622 |
+
for word in words:
|
623 |
+
word_start_positions.append(current_pos)
|
624 |
+
current_pos += len(word) + 1 # +1 for space
|
625 |
+
|
626 |
+
for word_idx, word in enumerate(words):
|
627 |
+
new_batch_char_count = len(current_batch) + len(word) + 1
|
628 |
+
|
629 |
+
if batch_word_count >= 50 or new_batch_char_count >= 200:
|
630 |
+
# Process current batch
|
631 |
+
all_text_line_results = do_aws_comprehend_call(
|
632 |
+
current_batch,
|
633 |
+
current_batch_mapping,
|
634 |
+
comprehend_client,
|
635 |
+
language,
|
636 |
+
allow_list,
|
637 |
+
chosen_redact_comprehend_entities,
|
638 |
+
all_text_line_results
|
639 |
+
)
|
640 |
+
comprehend_query_number += 1
|
641 |
+
|
642 |
+
# Start new batch
|
643 |
+
current_batch = word
|
644 |
+
batch_word_count = 1
|
645 |
+
batch_char_count = len(word)
|
646 |
+
current_batch_mapping = [(0, i, text_line, line_characters[i], word_start_positions[word_idx])]
|
647 |
+
else:
|
648 |
+
if current_batch:
|
649 |
+
current_batch += " "
|
650 |
+
batch_char_count += 1
|
651 |
+
current_batch += word
|
652 |
+
batch_char_count += len(word)
|
653 |
+
batch_word_count += 1
|
654 |
+
|
655 |
+
if not current_batch_mapping or current_batch_mapping[-1][1] != i:
|
656 |
+
current_batch_mapping.append((
|
657 |
+
batch_char_count - len(word),
|
658 |
+
i,
|
659 |
+
text_line,
|
660 |
+
line_characters[i],
|
661 |
+
word_start_positions[word_idx] # Add the word's start position within its line
|
662 |
+
))
|
663 |
+
|
664 |
+
# Process final batch
|
665 |
+
if current_batch:
|
666 |
+
all_text_line_results = do_aws_comprehend_call(
|
667 |
+
current_batch,
|
668 |
+
current_batch_mapping,
|
669 |
+
comprehend_client,
|
670 |
+
language,
|
671 |
+
allow_list,
|
672 |
+
chosen_redact_comprehend_entities,
|
673 |
+
all_text_line_results
|
674 |
+
)
|
675 |
+
comprehend_query_number += 1
|
676 |
+
|
677 |
+
# Process results for each line
|
678 |
+
for i, text_line in enumerate(line_level_text_results_list):
|
679 |
+
line_results = next((results for idx, results in all_text_line_results if idx == i), [])
|
680 |
+
|
681 |
+
if line_results:
|
682 |
+
text_line_bounding_boxes = merge_text_bounding_boxes(
|
683 |
+
line_results,
|
684 |
+
line_characters[i]
|
685 |
+
)
|
686 |
+
|
687 |
+
page_analyser_results.extend(line_results)
|
688 |
+
page_analysed_bounding_boxes.extend(text_line_bounding_boxes)
|
689 |
+
|
690 |
+
return page_analysed_bounding_boxes
|
691 |
+
|
692 |
+
def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
|
693 |
+
'''
|
694 |
+
Merge identified bounding boxes containing PII that are very close to one another
|
695 |
+
'''
|
696 |
+
analysed_bounding_boxes = []
|
697 |
+
original_bounding_boxes = [] # List to hold original bounding boxes
|
698 |
+
|
699 |
+
if len(analyser_results) > 0 and len(characters) > 0:
|
700 |
+
# Extract bounding box coordinates for sorting
|
701 |
+
bounding_boxes = []
|
702 |
+
for result in analyser_results:
|
703 |
+
#print("Result:", result)
|
704 |
+
char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
705 |
+
char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
706 |
+
if char_boxes:
|
707 |
+
# Calculate the bounding box that encompasses all characters
|
708 |
+
left = min(box[0] for box in char_boxes)
|
709 |
+
bottom = min(box[1] for box in char_boxes)
|
710 |
+
right = max(box[2] for box in char_boxes)
|
711 |
+
top = max(box[3] for box in char_boxes) + vertical_padding
|
712 |
+
bbox = [left, bottom, right, top]
|
713 |
+
bounding_boxes.append((bottom, left, result, bbox, char_text)) # (y, x, result, bbox, text)
|
714 |
+
|
715 |
+
# Store original bounding boxes
|
716 |
+
original_bounding_boxes.append({"text": "".join(char_text), "boundingBox": bbox, "result": copy.deepcopy(result)})
|
717 |
+
#print("Original bounding boxes:", original_bounding_boxes)
|
718 |
+
|
719 |
+
# Sort the results by y-coordinate and then by x-coordinate
|
720 |
+
bounding_boxes.sort()
|
721 |
+
|
722 |
+
merged_bounding_boxes = []
|
723 |
+
current_box = None
|
724 |
+
current_y = None
|
725 |
+
current_result = None
|
726 |
+
current_text = []
|
727 |
+
|
728 |
+
for y, x, result, next_box, text in bounding_boxes:
|
729 |
+
if current_y is None or current_box is None:
|
730 |
+
# Initialize the first bounding box
|
731 |
+
current_box = next_box
|
732 |
+
current_y = next_box[1]
|
733 |
+
current_result = result
|
734 |
+
current_text = list(text)
|
735 |
+
else:
|
736 |
+
vertical_diff_bboxes = abs(next_box[1] - current_y)
|
737 |
+
horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
|
738 |
+
|
739 |
+
if vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist:
|
740 |
+
# Merge bounding boxes
|
741 |
+
#print("Merging boxes")
|
742 |
+
merged_box = current_box.copy()
|
743 |
+
merged_result = current_result
|
744 |
+
merged_text = current_text.copy()
|
745 |
+
|
746 |
+
merged_box[2] = next_box[2] # Extend horizontally
|
747 |
+
merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
|
748 |
+
merged_result.end = max(current_result.end, result.end) # Extend text range
|
749 |
+
try:
|
750 |
+
if current_result.entity_type != result.entity_type:
|
751 |
+
merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
|
752 |
+
else:
|
753 |
+
merged_result.entity_type = current_result.entity_type
|
754 |
+
except Exception as e:
|
755 |
+
print("Unable to combine result entity types:", e)
|
756 |
+
if current_text:
|
757 |
+
merged_text.append(" ") # Add space between texts
|
758 |
+
merged_text.extend(text)
|
759 |
+
|
760 |
+
merged_bounding_boxes.append({
|
761 |
+
"text": "".join(merged_text),
|
762 |
+
"boundingBox": merged_box,
|
763 |
+
"result": merged_result
|
764 |
+
})
|
765 |
+
|
766 |
+
else:
|
767 |
+
# Start a new bounding box
|
768 |
+
current_box = next_box
|
769 |
+
current_y = next_box[1]
|
770 |
+
current_result = result
|
771 |
+
current_text = list(text)
|
772 |
+
|
773 |
+
# Combine original and merged bounding boxes
|
774 |
+
analysed_bounding_boxes.extend(original_bounding_boxes)
|
775 |
+
analysed_bounding_boxes.extend(merged_bounding_boxes)
|
776 |
+
|
777 |
+
#print("Analysed bounding boxes:", analysed_bounding_boxes)
|
778 |
+
|
779 |
+
return analysed_bounding_boxes
|
780 |
+
|
781 |
+
# Function to combine OCR results into line-level results
|
782 |
+
def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
|
783 |
+
# Group OCR results into lines based on y_threshold
|
784 |
+
lines = []
|
785 |
+
current_line = []
|
786 |
+
for result in sorted(ocr_results, key=lambda x: x.top):
|
787 |
+
if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
|
788 |
+
current_line.append(result)
|
789 |
+
else:
|
790 |
+
lines.append(current_line)
|
791 |
+
current_line = [result]
|
792 |
+
if current_line:
|
793 |
+
lines.append(current_line)
|
794 |
+
|
795 |
+
# Sort each line by left position
|
796 |
+
for line in lines:
|
797 |
+
line.sort(key=lambda x: x.left)
|
798 |
+
|
799 |
+
# Flatten the sorted lines back into a single list
|
800 |
+
sorted_results = [result for line in lines for result in line]
|
801 |
+
|
802 |
+
combined_results = []
|
803 |
+
new_format_results = {}
|
804 |
+
current_line = []
|
805 |
+
current_bbox = None
|
806 |
+
line_counter = 1
|
807 |
+
|
808 |
+
def create_ocr_result_with_children(combined_results, i, current_bbox, current_line):
|
809 |
+
combined_results["text_line_" + str(i)] = {
|
810 |
+
"line": i,
|
811 |
+
'text': current_bbox.text,
|
812 |
+
'bounding_box': (current_bbox.left, current_bbox.top,
|
813 |
+
current_bbox.left + current_bbox.width,
|
814 |
+
current_bbox.top + current_bbox.height),
|
815 |
+
'words': [{'text': word.text,
|
816 |
+
'bounding_box': (word.left, word.top,
|
817 |
+
word.left + word.width,
|
818 |
+
word.top + word.height)}
|
819 |
+
for word in current_line]
|
820 |
+
}
|
821 |
+
return combined_results["text_line_" + str(i)]
|
822 |
+
|
823 |
+
for result in sorted_results:
|
824 |
+
if not current_line:
|
825 |
+
# Start a new line
|
826 |
+
current_line.append(result)
|
827 |
+
current_bbox = result
|
828 |
+
else:
|
829 |
+
# Check if the result is on the same line (y-axis) and close horizontally (x-axis)
|
830 |
+
last_result = current_line[-1]
|
831 |
+
|
832 |
+
if abs(result.top - last_result.top) <= y_threshold and \
|
833 |
+
(result.left - (last_result.left + last_result.width)) <= x_threshold:
|
834 |
+
# Update the bounding box to include the new word
|
835 |
+
new_right = max(current_bbox.left + current_bbox.width, result.left + result.width)
|
836 |
+
current_bbox = OCRResult(
|
837 |
+
text=f"{current_bbox.text} {result.text}",
|
838 |
+
left=current_bbox.left,
|
839 |
+
top=current_bbox.top,
|
840 |
+
width=new_right - current_bbox.left,
|
841 |
+
height=max(current_bbox.height, result.height)
|
842 |
+
)
|
843 |
+
current_line.append(result)
|
844 |
+
else:
|
845 |
+
|
846 |
+
|
847 |
+
# Commit the current line and start a new one
|
848 |
+
combined_results.append(current_bbox)
|
849 |
+
|
850 |
+
new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
|
851 |
+
|
852 |
+
line_counter += 1
|
853 |
+
current_line = [result]
|
854 |
+
current_bbox = result
|
855 |
+
|
856 |
+
# Append the last line
|
857 |
+
if current_bbox:
|
858 |
+
combined_results.append(current_bbox)
|
859 |
+
|
860 |
+
new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
|
861 |
+
|
862 |
+
|
863 |
+
return combined_results, new_format_results
|
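The two helpers above are easiest to follow with a tiny example. The sketch below is illustrative only and is not part of the commit: the word boxes are invented, but the call signature of combine_ocr_results and the shape of its two return values match the code above.

```python
# Minimal sketch: group invented word-level OCR results into line-level results.
from tools.custom_image_analyser_engine import OCRResult, combine_ocr_results

word_level_ocr = [
    OCRResult(text="John", left=100, top=50, width=40, height=12),
    OCRResult(text="Smith", left=145, top=51, width=50, height=12),    # close in x and y -> same line
    OCRResult(text="Address:", left=100, top=80, width=70, height=12), # y gap > y_threshold -> new line
]

combined, line_dict = combine_ocr_results(word_level_ocr, x_threshold=50, y_threshold=12)

# combined  -> two line-level OCRResult objects ("John Smith" and "Address:")
# line_dict -> {"text_line_1": {"line": 1, "text": "John Smith",
#                               "bounding_box": (...), "words": [...]}, ...}
for key, line in line_dict.items():
    print(key, line["text"], line["bounding_box"])
```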
864 |
|
865 |
class CustomImageAnalyzerEngine:
|
866 |
def __init__(
|
|
|
915 |
self,
|
916 |
line_level_ocr_results: List[OCRResult],
|
917 |
ocr_results_with_children: Dict[str, Dict],
|
918 |
+
chosen_redact_comprehend_entities: List[str],
|
919 |
+
pii_identification_method: str = "Local",
|
920 |
+
comprehend_client = "",
|
921 |
**text_analyzer_kwargs
|
922 |
) -> List[CustomImageRecognizerResult]:
|
|
923 |
|
924 |
+
page_text = ""
|
925 |
+
page_text_mapping = []
|
926 |
+
all_text_line_results = []
|
927 |
comprehend_query_number = 0
|
928 |
|
929 |
+
# Collect all text and create mapping
|
930 |
for i, line_level_ocr_result in enumerate(line_level_ocr_results):
|
931 |
+
if page_text:
|
932 |
+
page_text += " "
|
933 |
+
start_pos = len(page_text)
|
934 |
+
page_text += line_level_ocr_result.text
|
935 |
+
# Note: We're not passing line_characters here since it's not needed for this use case
|
936 |
+
page_text_mapping.append((start_pos, i, line_level_ocr_result, None))
|
937 |
+
|
938 |
+
# Process using either Local or AWS Comprehend
|
939 |
+
if pii_identification_method == "Local":
|
940 |
+
analyzer_result = self.analyzer_engine.analyze(
|
941 |
+
text=page_text,
|
942 |
+
**text_analyzer_kwargs
|
943 |
+
)
|
944 |
+
all_text_line_results = map_back_entity_results(
|
945 |
+
analyzer_result,
|
946 |
+
page_text_mapping,
|
947 |
+
all_text_line_results
|
948 |
+
)
|
949 |
|
950 |
+
elif pii_identification_method == "AWS Comprehend":
|
951 |
+
# Handle custom entities first
|
952 |
+
if custom_entities:
|
953 |
+
custom_redact_entities = [
|
954 |
+
entity for entity in chosen_redact_comprehend_entities
|
955 |
+
if entity in custom_entities
|
956 |
+
]
|
957 |
+
if custom_redact_entities:
|
958 |
+
text_analyzer_kwargs["entities"] = custom_redact_entities
|
959 |
+
page_analyser_result = self.analyzer_engine.analyze(
|
960 |
+
text=page_text,
|
961 |
+
**text_analyzer_kwargs
|
962 |
+
)
|
963 |
+
all_text_line_results = map_back_entity_results(
|
964 |
+
page_analyser_result,
|
965 |
+
page_text_mapping,
|
966 |
+
all_text_line_results
|
967 |
+
)
|
968 |
|
969 |
+
# Process text in batches for AWS Comprehend
|
970 |
+
current_batch = ""
|
971 |
+
current_batch_mapping = []
|
972 |
+
batch_char_count = 0
|
973 |
+
batch_word_count = 0
|
974 |
|
975 |
+
for i, text_line in enumerate(line_level_ocr_results):
|
976 |
+
words = text_line.text.split()
|
977 |
+
word_start_positions = []
|
978 |
+
current_pos = 0
|
979 |
|
980 |
+
for word in words:
|
981 |
+
word_start_positions.append(current_pos)
|
982 |
+
current_pos += len(word) + 1
|
983 |
|
984 |
+
for word_idx, word in enumerate(words):
|
985 |
+
new_batch_char_count = len(current_batch) + len(word) + 1
|
986 |
|
987 |
+
if batch_word_count >= 50 or new_batch_char_count >= 200:
|
988 |
+
# Process current batch
|
989 |
+
all_text_line_results = do_aws_comprehend_call(
|
990 |
+
current_batch,
|
991 |
+
current_batch_mapping,
|
992 |
+
comprehend_client,
|
993 |
+
text_analyzer_kwargs["language"],
|
994 |
+
text_analyzer_kwargs.get('allow_list', []),
|
995 |
+
chosen_redact_comprehend_entities,
|
996 |
+
all_text_line_results
|
997 |
+
)
|
998 |
comprehend_query_number += 1
|
999 |
|
1000 |
# Reset batch
|
1001 |
+
current_batch = word
|
1002 |
+
batch_word_count = 1
|
1003 |
+
batch_char_count = len(word)
|
1004 |
+
current_batch_mapping = [(0, i, text_line, None, word_start_positions[word_idx])]
|
1005 |
+
else:
|
1006 |
+
if current_batch:
|
1007 |
+
current_batch += " "
|
1008 |
+
batch_char_count += 1
|
1009 |
+
current_batch += word
|
1010 |
+
batch_char_count += len(word)
|
1011 |
+
batch_word_count += 1
|
1012 |
+
|
1013 |
+
if not current_batch_mapping or current_batch_mapping[-1][1] != i:
|
1014 |
+
current_batch_mapping.append((
|
1015 |
+
batch_char_count - len(word),
|
1016 |
+
i,
|
1017 |
+
text_line,
|
1018 |
+
None,
|
1019 |
+
word_start_positions[word_idx]
|
1020 |
+
))
|
1021 |
+
|
1022 |
+
# Process final batch if any
|
1023 |
+
if current_batch:
|
1024 |
+
all_text_line_results = do_aws_comprehend_call(
|
1025 |
+
current_batch,
|
1026 |
+
current_batch_mapping,
|
1027 |
+
comprehend_client,
|
1028 |
+
text_analyzer_kwargs["language"],
|
1029 |
+
text_analyzer_kwargs.get('allow_list', []),
|
1030 |
+
chosen_redact_comprehend_entities,
|
1031 |
+
all_text_line_results
|
1032 |
+
)
|
1033 |
+
comprehend_query_number += 1
|
1034 |
|
1035 |
+
|
1036 |
|
1037 |
+
# Process results and create bounding boxes
|
1038 |
+
combined_results = []
|
1039 |
+
for i, text_line in enumerate(line_level_ocr_results):
|
1040 |
+
line_results = next((results for idx, results in all_text_line_results if idx == i), [])
|
1041 |
+
if line_results and i < len(ocr_results_with_children):
|
1042 |
child_level_key = list(ocr_results_with_children.keys())[i]
|
1043 |
ocr_results_with_children_line_level = ocr_results_with_children[child_level_key]
|
1044 |
+
|
1045 |
+
for result in line_results:
|
1046 |
+
bbox_results = self.map_analyzer_results_to_bounding_boxes(
|
1047 |
+
[result],
|
1048 |
+
[OCRResult(
|
1049 |
+
text=text_line.text[result.start:result.end],
|
1050 |
+
left=text_line.left,
|
1051 |
+
top=text_line.top,
|
1052 |
+
width=text_line.width,
|
1053 |
+
height=text_line.height
|
1054 |
+
)],
|
1055 |
+
text_line.text,
|
1056 |
+
text_analyzer_kwargs.get('allow_list', []),
|
1057 |
+
ocr_results_with_children_line_level
|
1058 |
)
|
1059 |
+
combined_results.extend(bbox_results)
|
1060 |
|
1061 |
return combined_results, comprehend_query_number
|
1062 |
|
1063 |
@staticmethod
|
1064 |
def map_analyzer_results_to_bounding_boxes(
|
1065 |
+
text_analyzer_results: List[RecognizerResult],
|
1066 |
+
redaction_relevant_ocr_results: List[OCRResult],
|
1067 |
+
full_text: str,
|
1068 |
+
allow_list: List[str],
|
1069 |
+
ocr_results_with_children_child_info: Dict[str, Dict]
|
1070 |
+
) -> List[CustomImageRecognizerResult]:
|
1071 |
redaction_bboxes = []
|
|
|
1072 |
|
1073 |
for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
|
1074 |
+
#print("ocr_results_with_children_child_info:", ocr_results_with_children_child_info)
|
1075 |
|
1076 |
+
line_text = ocr_results_with_children_child_info['text']
|
1077 |
+
line_length = len(line_text)
|
1078 |
+
redaction_text = redaction_relevant_ocr_result.text
|
1079 |
|
1080 |
+
# print(f"Processing line: '{line_text}'")
|
1081 |
+
|
1082 |
for redaction_result in text_analyzer_results:
|
1083 |
+
# print(f"Checking redaction result: {redaction_result}")
|
1084 |
+
# print("redaction_text:", redaction_text)
|
1085 |
+
# print("line_length:", line_length)
|
1086 |
+
# print("line_text:", line_text)
|
1087 |
+
|
1088 |
+
# Check if the redaction text is not in the allow list
|
1089 |
+
|
1090 |
+
if redaction_text not in allow_list:
|
1091 |
+
|
1092 |
+
# Adjust start and end to be within line bounds
|
1093 |
+
start_in_line = max(0, redaction_result.start)
|
1094 |
+
end_in_line = min(line_length, redaction_result.end)
|
1095 |
+
|
1096 |
+
# Get the matched text from this line
|
1097 |
+
matched_text = line_text[start_in_line:end_in_line]
|
1098 |
+
matched_words = matched_text.split()
|
1099 |
+
|
1100 |
+
# print(f"Found match: '{matched_text}' in line")
|
1101 |
+
|
1102 |
+
# Find the corresponding words in the OCR results
|
1103 |
+
matching_word_boxes = []
|
1104 |
+
for word_info in ocr_results_with_children_child_info.get('words', []):
|
1105 |
+
# Check if this word is part of our match
|
1106 |
+
if any(word.lower() in word_info['text'].lower() for word in matched_words):
|
1107 |
+
matching_word_boxes.append(word_info['bounding_box'])
|
1108 |
+
# print(f"Matched word: {word_info['text']}")
|
1109 |
+
|
1110 |
+
if matching_word_boxes:
|
1111 |
+
# Calculate the combined bounding box for all matching words
|
1112 |
+
left = min(box[0] for box in matching_word_boxes)
|
1113 |
+
top = min(box[1] for box in matching_word_boxes)
|
1114 |
+
right = max(box[2] for box in matching_word_boxes)
|
1115 |
+
bottom = max(box[3] for box in matching_word_boxes)
|
1116 |
+
|
1117 |
+
redaction_bboxes.append(
|
1118 |
+
CustomImageRecognizerResult(
|
1119 |
+
entity_type=redaction_result.entity_type,
|
1120 |
+
start=start_in_line,
|
1121 |
+
end=end_in_line,
|
1122 |
+
score=redaction_result.score,
|
1123 |
+
left=left,
|
1124 |
+
top=top,
|
1125 |
+
width=right - left,
|
1126 |
+
height=bottom - top,
|
1127 |
+
text=matched_text
|
1128 |
+
)
|
1129 |
)
|
1130 |
+
# print(f"Added bounding box for: '{matched_text}'")
|
1131 |
|
1132 |
return redaction_bboxes
|
1133 |
|
1134 |
@staticmethod
|
1135 |
def remove_space_boxes(ocr_result: dict) -> dict:
|
1136 |
"""Remove OCR bboxes that are for spaces.
|
|
|
1137 |
:param ocr_result: OCR results (raw or thresholded).
|
1138 |
:return: OCR results with empty words removed.
|
1139 |
"""
|
|
|
1156 |
ocr_result: Dict[str, List[Union[int, str]]], scale_factor: float
|
1157 |
) -> Dict[str, float]:
|
1158 |
"""Scale down the bounding box results based on a scale percentage.
|
|
|
1159 |
:param ocr_result: OCR results (raw).
|
1160 |
:param scale_percent: Scale percentage for resizing the bounding box.
|
|
|
1161 |
:return: OCR results (scaled).
|
1162 |
"""
|
1163 |
scaled_results = deepcopy(ocr_result)
|
|
|
1204 |
estimated_width = int(proportion * ocr_result.width)
|
1205 |
|
1206 |
return estimated_width
|
|
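Both the module-level text path and CustomImageAnalyzerEngine.analyze now send text to AWS Comprehend in capped batches (flushing at roughly 50 words or 200 characters) and increment comprehend_query_number once per flush. The helper below is a self-contained sketch of that batching rule only; the function name is hypothetical, and it deliberately omits the detect_pii_entities call and the offset mapping back to individual lines.

```python
# Illustrative sketch of the batch-flushing rule applied before each Comprehend call.
def batch_lines_for_comprehend(lines, max_words=50, max_chars=200):
    batches, current, word_count = [], "", 0
    for line in lines:
        for word in line.split():
            if word_count >= max_words or len(current) + len(word) + 1 >= max_chars:
                batches.append(current)          # flush the full batch
                current, word_count = word, 1    # start a new batch with this word
            else:
                current = f"{current} {word}".strip()
                word_count += 1
    if current:
        batches.append(current)                  # flush the final partial batch
    return batches

print(batch_lines_for_comprehend(["Contact John Smith on 07700 900123", "Email: jsmith@example.com"]))
```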
tools/file_conversion.py
CHANGED
@@ -201,7 +201,7 @@ def process_file(file_path:str, prepare_for_review:bool=False):
|
|
201 |
if file_extension in ['.jpg', '.jpeg', '.png']:
|
202 |
print(f"{file_path} is an image file.")
|
203 |
# Perform image processing here
|
204 |
-
img_object = [Image.open(file_path)]
|
205 |
# Load images from the file paths
|
206 |
|
207 |
# Check if the file is a PDF
|
@@ -490,6 +490,7 @@ def prepare_image_or_pdf(
|
|
490 |
else:
|
491 |
file_path = file.name
|
492 |
file_path_without_ext = get_file_path_end(file_path)
|
|
|
493 |
|
494 |
if not file_path:
|
495 |
out_message = "Please select a file."
|
@@ -532,8 +533,13 @@ def prepare_image_or_pdf(
|
|
532 |
|
533 |
image_file_paths = process_file(file_path_str, prepare_for_review)
|
534 |
|
535 |
-
print("
|
|
|
|
|
536 |
|
|
|
|
|
|
|
537 |
|
538 |
elif file_extension in ['.csv']:
|
539 |
review_file_csv = read_file(file)
|
@@ -738,6 +744,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
738 |
reported_number = int(number) + 1
|
739 |
else:
|
740 |
print("No number found before .png")
|
|
|
741 |
|
742 |
# Check if 'boxes' is in the annotation, if not, add an empty list
|
743 |
if 'boxes' not in annotation:
|
|
|
201 |
if file_extension in ['.jpg', '.jpeg', '.png']:
|
202 |
print(f"{file_path} is an image file.")
|
203 |
# Perform image processing here
|
204 |
+
img_object = [file_path] #[Image.open(file_path)]
|
205 |
# Load images from the file paths
|
206 |
|
207 |
# Check if the file is a PDF
|
|
|
490 |
else:
|
491 |
file_path = file.name
|
492 |
file_path_without_ext = get_file_path_end(file_path)
|
493 |
+
file_name_with_ext = os.path.basename(file_path)
|
494 |
|
495 |
if not file_path:
|
496 |
out_message = "Please select a file."
|
|
|
533 |
|
534 |
image_file_paths = process_file(file_path_str, prepare_for_review)
|
535 |
|
536 |
+
#print("image_file_paths:", image_file_paths)
|
537 |
+
|
538 |
+
converted_file_path = output_folder + file_name_with_ext
|
539 |
|
540 |
+
pymupdf_doc.save(converted_file_path)
|
541 |
+
|
542 |
+
print("Inserted image into PDF file")
|
543 |
|
544 |
elif file_extension in ['.csv']:
|
545 |
review_file_csv = read_file(file)
|
|
|
744 |
reported_number = int(number) + 1
|
745 |
else:
|
746 |
print("No number found before .png")
|
747 |
+
reported_number = 1
|
748 |
|
749 |
# Check if 'boxes' is in the annotation, if not, add an empty list
|
750 |
if 'boxes' not in annotation:
|
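With this change, prepare_image_or_pdf writes image uploads back out as a converted PDF (pymupdf_doc.save(converted_file_path)), so the review step always has a real PDF to work from. The snippet below is an assumed PyMuPDF pattern for that kind of conversion, not necessarily the exact call sequence used in prepare_image_or_pdf; image_to_pdf is a hypothetical helper name.

```python
# Assumed pattern: wrap an uploaded image in a one-page PDF so downstream
# steps can treat every input as a PDF document.
import fitz  # PyMuPDF

def image_to_pdf(image_path: str, output_pdf_path: str) -> fitz.Document:
    pdf_doc = fitz.open()                   # empty output document
    img_doc = fitz.open(image_path)         # open the image as a document
    pdf_bytes = img_doc.convert_to_pdf()    # render the image page to PDF bytes
    pdf_doc.insert_pdf(fitz.open("pdf", pdf_bytes))
    pdf_doc.save(output_pdf_path)
    return pdf_doc
```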
tools/file_redaction.py
CHANGED
@@ -25,13 +25,13 @@ from collections import defaultdict # For efficient grouping
|
|
25 |
|
26 |
from presidio_analyzer import RecognizerResult
|
27 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
28 |
-
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
29 |
from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
|
30 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
|
31 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
32 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
33 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
|
34 |
-
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
35 |
|
36 |
# Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
|
37 |
page_break_value = get_or_create_env_var('page_break_value', '50000')
|
@@ -136,6 +136,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
136 |
tic = time.perf_counter()
|
137 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
138 |
|
|
139 |
if isinstance(custom_recogniser_word_list, pd.DataFrame):
|
140 |
custom_recogniser_word_list = custom_recogniser_word_list.iloc[:,0].tolist()
|
141 |
|
@@ -159,7 +162,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
159 |
elif (first_loop_state == False) & (current_loop_page == 999):
|
160 |
current_loop_page = 0
|
161 |
|
162 |
-
|
163 |
if not out_file_paths:
|
164 |
out_file_paths = []
|
165 |
|
@@ -184,21 +186,33 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
184 |
combined_out_message = '\n'.join(out_message)
|
185 |
else:
|
186 |
combined_out_message = out_message
|
|
|
187 |
|
188 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
189 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
190 |
|
191 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
192 |
|
193 |
# If we have reached the last page, return message
|
194 |
if current_loop_page >= number_of_pages:
|
195 |
-
print("
|
196 |
|
197 |
# Set to a very high number so as not to mix up with subsequent file processing by the user
|
198 |
current_loop_page = 999
|
199 |
combined_out_message = out_message
|
200 |
|
201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
|
203 |
# Create allow list
|
204 |
# If string, assume file path
|
@@ -221,7 +235,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
221 |
comprehend_client = ""
|
222 |
out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."
|
223 |
print(out_message)
|
224 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
225 |
else:
|
226 |
comprehend_client = ""
|
227 |
|
@@ -233,7 +247,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
233 |
textract_client = ""
|
234 |
out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
|
235 |
print(out_message)
|
236 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
237 |
else:
|
238 |
textract_client = ""
|
239 |
|
@@ -265,8 +279,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
265 |
file_path = file.name
|
266 |
|
267 |
if file_path:
|
268 |
-
|
269 |
-
|
|
|
270 |
|
271 |
is_a_pdf = is_pdf(file_path) == True
|
272 |
if is_a_pdf == False and in_redact_method == text_ocr_option:
|
@@ -277,16 +292,16 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
277 |
out_message = "No file selected"
|
278 |
print(out_message)
|
279 |
|
280 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
281 |
|
282 |
if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
|
283 |
|
284 |
#Analyse and redact image-based pdf or image
|
285 |
if is_pdf_or_image(file_path) == False:
|
286 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
287 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
288 |
|
289 |
-
print("Redacting file " +
|
290 |
|
291 |
pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
|
292 |
prepared_pdf_image_paths,
|
@@ -328,7 +343,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
328 |
|
329 |
if is_pdf(file_path) == False:
|
330 |
out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
|
331 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
332 |
|
333 |
# Analyse text-based pdf
|
334 |
print('Redacting file as text-based PDF')
|
@@ -356,12 +371,12 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
356 |
else:
|
357 |
out_message = "No redaction method selected"
|
358 |
print(out_message)
|
359 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
|
360 |
|
361 |
# If at last page, save to file
|
362 |
if current_loop_page >= number_of_pages:
|
363 |
|
364 |
-
print("Current page loop:", current_loop_page, "is
|
365 |
latest_file_completed += 1
|
366 |
current_loop_page = 999
|
367 |
|
@@ -370,36 +385,43 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
370 |
|
371 |
# Save file
|
372 |
if is_pdf(file_path) == False:
|
373 |
-
|
374 |
-
pymupdf_doc[0].save(
|
|
|
|
|
|
|
|
|
375 |
|
376 |
else:
|
377 |
-
|
378 |
-
pymupdf_doc.save(
|
379 |
|
380 |
-
out_file_paths.append(
|
381 |
|
382 |
#if log_files_output_paths:
|
383 |
# log_files_output_paths.extend(log_files_output_paths)
|
384 |
|
385 |
-
|
|
|
|
|
|
|
386 |
all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
|
387 |
log_files_output_paths.append(logs_output_file_name)
|
388 |
|
389 |
-
all_text_output_file_name =
|
390 |
all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
|
391 |
out_file_paths.append(all_text_output_file_name)
|
392 |
|
393 |
# Save the gradio_annotation_boxes to a JSON file
|
394 |
try:
|
395 |
-
print("Saving annotations to JSON")
|
396 |
|
397 |
-
out_annotation_file_path =
|
398 |
with open(out_annotation_file_path, 'w') as f:
|
399 |
json.dump(annotations_all_pages, f)
|
400 |
log_files_output_paths.append(out_annotation_file_path)
|
401 |
|
402 |
-
print("Saving annotations to CSV")
|
403 |
|
404 |
# Convert json to csv and also save this
|
405 |
#print("annotations_all_pages:", annotations_all_pages)
|
@@ -407,14 +429,14 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
407 |
|
408 |
review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
|
409 |
|
410 |
-
|
411 |
-
review_df.to_csv(
|
412 |
-
out_file_paths.append(
|
413 |
|
414 |
print("Saved review file to csv")
|
415 |
|
416 |
except Exception as e:
|
417 |
-
print("Could not save annotations to json file:", e)
|
418 |
|
419 |
# Make a combined message for the file
|
420 |
if isinstance(out_message, list):
|
@@ -429,7 +451,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
429 |
combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
|
430 |
|
431 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
432 |
-
print("Estimated total processing time:", str(estimate_total_processing_time))
|
433 |
|
434 |
else:
|
435 |
toc = time.perf_counter()
|
@@ -441,7 +463,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
441 |
if all_request_metadata:
|
442 |
all_request_metadata_str = '\n'.join(all_request_metadata).strip()
|
443 |
|
444 |
-
all_request_metadata_file_path = output_folder +
|
445 |
|
446 |
with open(all_request_metadata_file_path, "w") as f:
|
447 |
f.write(all_request_metadata_str)
|
@@ -456,10 +478,15 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
456 |
|
457 |
# Ensure no duplicated output files
|
458 |
log_files_output_paths = list(set(log_files_output_paths))
|
459 |
-
out_file_paths = list(set(out_file_paths))
|
|
|
460 |
|
|
|
|
|
|
|
461 |
|
462 |
-
|
|
|
463 |
|
464 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
|
465 |
'''
|
@@ -930,14 +957,7 @@ def redact_image_pdf(file_path:str,
|
|
930 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
931 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
932 |
#print("new_custom_recogniser:", new_custom_recogniser)
|
933 |
-
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
934 |
-
|
935 |
-
# List all elements currently in the nlp_analyser registry
|
936 |
-
#print("Current recognizers in nlp_analyser registry:")
|
937 |
-
for recognizer_name in nlp_analyser.registry.recognizers:
|
938 |
-
print(recognizer_name)
|
939 |
-
|
940 |
-
|
941 |
|
942 |
|
943 |
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
@@ -1031,7 +1051,7 @@ def redact_image_pdf(file_path:str,
|
|
1031 |
|
1032 |
#print("Image is in range of pages to redact")
|
1033 |
if isinstance(image, str):
|
1034 |
-
|
1035 |
image = Image.open(image)
|
1036 |
|
1037 |
# Need image size to convert textract OCR outputs to the correct sizes
|
@@ -1137,7 +1157,7 @@ def redact_image_pdf(file_path:str,
|
|
1137 |
all_image_annotations_boxes = []
|
1138 |
|
1139 |
for box in merged_redaction_bboxes:
|
1140 |
-
print("box:", box)
|
1141 |
|
1142 |
x0 = box.left
|
1143 |
y0 = box.top
|
@@ -1299,6 +1319,8 @@ def get_text_container_characters(text_container:LTTextContainer):
|
|
1299 |
for line in text_container
|
1300 |
if isinstance(line, LTTextLine) or isinstance(line, LTTextLineHorizontal)
|
1301 |
for char in line]
|
|
|
|
|
1302 |
|
1303 |
return characters
|
1304 |
return []
|
@@ -1312,6 +1334,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1312 |
line_level_characters_out = []
|
1313 |
#all_line_level_characters_out = []
|
1314 |
character_objects_out = [] # New list to store character objects
|
|
|
1315 |
|
1316 |
# Initialize variables
|
1317 |
full_text = ""
|
@@ -1326,12 +1349,19 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1326 |
for char in char_objects:
|
1327 |
character_objects_out.append(char) # Collect character objects
|
1328 |
|
|
|
|
|
|
|
|
|
1329 |
if isinstance(char, LTAnno):
|
1330 |
|
|
|
|
|
|
|
1331 |
added_text = char.get_text()
|
1332 |
|
1333 |
# Handle double quotes
|
1334 |
-
added_text = added_text.replace('"', '\\"') # Escape double quotes
|
1335 |
|
1336 |
# Handle space separately by finalizing the word
|
1337 |
full_text += added_text # Adds space or newline
|
@@ -1348,7 +1378,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1348 |
if current_word:
|
1349 |
word_bboxes.append((current_word, current_word_bbox))
|
1350 |
# Create an OCRResult for the current line
|
1351 |
-
line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
|
1352 |
line_level_characters_out.append(character_objects_out)
|
1353 |
# Reset for the next line
|
1354 |
character_objects_out = []
|
@@ -1396,119 +1426,15 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1396 |
# Convert special characters to a human-readable format
|
1397 |
#full_text = full_text.encode('latin1', errors='replace').decode('utf-8')
|
1398 |
full_text = clean_unicode_text(full_text)
|
|
|
1399 |
#print("full_text:", full_text)
|
1400 |
|
1401 |
-
line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
|
1402 |
|
1403 |
#line_level_characters_out = character_objects_out
|
1404 |
|
1405 |
return line_level_results_out, line_level_characters_out # Return both results and character objects
|
1406 |
|
1407 |
-
def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
|
1408 |
-
'''
|
1409 |
-
Merge identified bounding boxes containing PII that are very close to one another
|
1410 |
-
'''
|
1411 |
-
analysed_bounding_boxes = []
|
1412 |
-
original_bounding_boxes = [] # List to hold original bounding boxes
|
1413 |
-
|
1414 |
-
if len(analyser_results) > 0 and len(characters) > 0:
|
1415 |
-
# Extract bounding box coordinates for sorting
|
1416 |
-
bounding_boxes = []
|
1417 |
-
for result in analyser_results:
|
1418 |
-
#print("Result:", result)
|
1419 |
-
char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1420 |
-
char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1421 |
-
if char_boxes:
|
1422 |
-
# Calculate the bounding box that encompasses all characters
|
1423 |
-
left = min(box[0] for box in char_boxes)
|
1424 |
-
bottom = min(box[1] for box in char_boxes)
|
1425 |
-
right = max(box[2] for box in char_boxes)
|
1426 |
-
top = max(box[3] for box in char_boxes) + vertical_padding
|
1427 |
-
bbox = [left, bottom, right, top]
|
1428 |
-
bounding_boxes.append((bottom, left, result, bbox, char_text)) # (y, x, result, bbox, text)
|
1429 |
-
|
1430 |
-
# Store original bounding boxes
|
1431 |
-
original_bounding_boxes.append({"text": "".join(char_text), "boundingBox": bbox, "result": copy.deepcopy(result)})
|
1432 |
-
#print("Original bounding boxes:", original_bounding_boxes)
|
1433 |
-
|
1434 |
-
# Sort the results by y-coordinate and then by x-coordinate
|
1435 |
-
bounding_boxes.sort()
|
1436 |
-
|
1437 |
-
merged_bounding_boxes = []
|
1438 |
-
current_box = None
|
1439 |
-
current_y = None
|
1440 |
-
current_result = None
|
1441 |
-
current_text = []
|
1442 |
-
|
1443 |
-
for y, x, result, next_box, text in bounding_boxes:
|
1444 |
-
if current_y is None or current_box is None:
|
1445 |
-
# Initialize the first bounding box
|
1446 |
-
current_box = next_box
|
1447 |
-
current_y = next_box[1]
|
1448 |
-
current_result = result
|
1449 |
-
current_text = list(text)
|
1450 |
-
else:
|
1451 |
-
vertical_diff_bboxes = abs(next_box[1] - current_y)
|
1452 |
-
horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
|
1453 |
-
|
1454 |
-
if vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist:
|
1455 |
-
# Merge bounding boxes
|
1456 |
-
#print("Merging boxes")
|
1457 |
-
merged_box = current_box.copy()
|
1458 |
-
merged_result = current_result
|
1459 |
-
merged_text = current_text.copy()
|
1460 |
-
|
1461 |
-
#print("current_box_max_x:", current_box[2])
|
1462 |
-
#print("char_max_x:", next_box[2])
|
1463 |
-
|
1464 |
-
merged_box[2] = next_box[2] # Extend horizontally
|
1465 |
-
merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
|
1466 |
-
merged_result.end = max(current_result.end, result.end) # Extend text range
|
1467 |
-
try:
|
1468 |
-
if current_result.entity_type != result.entity_type:
|
1469 |
-
merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
|
1470 |
-
else:
|
1471 |
-
merged_result.entity_type = current_result.entity_type
|
1472 |
-
except Exception as e:
|
1473 |
-
print("Unable to combine result entity types:", e)
|
1474 |
-
if current_text:
|
1475 |
-
merged_text.append(" ") # Add space between texts
|
1476 |
-
merged_text.extend(text)
|
1477 |
-
|
1478 |
-
merged_bounding_boxes.append({
|
1479 |
-
"text": "".join(merged_text),
|
1480 |
-
"boundingBox": merged_box,
|
1481 |
-
"result": merged_result
|
1482 |
-
})
|
1483 |
-
|
1484 |
-
else:
|
1485 |
-
# Save the current merged box before starting a new one
|
1486 |
-
# merged_bounding_boxes.append({
|
1487 |
-
# "text": "".join(current_text),
|
1488 |
-
# "boundingBox": current_box,
|
1489 |
-
# "result": current_result
|
1490 |
-
# })
|
1491 |
-
# Start a new bounding box
|
1492 |
-
current_box = next_box
|
1493 |
-
current_y = next_box[1]
|
1494 |
-
current_result = result
|
1495 |
-
current_text = list(text)
|
1496 |
-
|
1497 |
-
# Handle the last box
|
1498 |
-
# if current_box is not None:
|
1499 |
-
# merged_bounding_boxes.append({
|
1500 |
-
# "text": "".join(current_text),
|
1501 |
-
# "boundingBox": current_box,
|
1502 |
-
# "result": current_result
|
1503 |
-
# })
|
1504 |
-
|
1505 |
-
# Combine original and merged bounding boxes
|
1506 |
-
analysed_bounding_boxes.extend(original_bounding_boxes)
|
1507 |
-
analysed_bounding_boxes.extend(merged_bounding_boxes)
|
1508 |
-
|
1509 |
-
#print("Analysed bounding boxes:", analysed_bounding_boxes)
|
1510 |
-
|
1511 |
-
return analysed_bounding_boxes
|
1512 |
|
1513 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1514 |
decision_process_table = pd.DataFrame()
|
@@ -1559,6 +1485,182 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
|
|
1559 |
pikepdf_annotations_on_page.append(annotation)
|
1560 |
return pikepdf_annotations_on_page
|
1561 |
|
|
|
|
|
1562 |
def redact_text_pdf(
|
1563 |
filename: str, # Path to the PDF file to be redacted
|
1564 |
prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
|
@@ -1681,173 +1783,64 @@ def redact_text_pdf(
|
|
1681 |
|
1682 |
for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
|
1683 |
|
|
|
|
|
1684 |
page_analyser_results = []
|
1685 |
page_analysed_bounding_boxes = []
|
1686 |
|
1687 |
characters = []
|
1688 |
pikepdf_annotations_on_page = []
|
1689 |
decision_process_table_on_page = pd.DataFrame()
|
1690 |
-
|
1691 |
|
1692 |
if analysis_type == text_ocr_option:
|
1693 |
for n, text_container in enumerate(page_layout):
|
1694 |
-
|
1695 |
-
text_container_analyser_results = []
|
1696 |
-
text_container_analysed_bounding_boxes = []
|
1697 |
characters = []
|
1698 |
|
|
|
|
|
1699 |
if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
|
1700 |
characters = get_text_container_characters(text_container)
|
1701 |
|
1702 |
# Create dataframe for all the text on the page
|
1703 |
line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)
|
1704 |
|
1705 |
-
|
1706 |
if line_level_text_results_list:
|
1707 |
# Convert to DataFrame and add to ongoing logging table
|
1708 |
line_level_text_results_df = pd.DataFrame([{
|
1709 |
'page': page_no + 1,
|
1710 |
-
'text': result.text,
|
1711 |
'left': result.left,
|
1712 |
'top': result.top,
|
1713 |
'width': result.width,
|
1714 |
'height': result.height
|
1715 |
} for result in line_level_text_results_list])
|
1716 |
|
1717 |
-
|
1718 |
-
|
1719 |
-
|
1720 |
-
|
1721 |
-
|
1722 |
-
|
1723 |
-
|
1724 |
-
|
1725 |
-
|
1726 |
-
|
1727 |
-
|
1728 |
-
|
1729 |
-
|
1730 |
-
|
1731 |
-
|
1732 |
-
|
1733 |
-
|
1734 |
-
|
1735 |
-
|
1736 |
-
|
1737 |
-
|
1738 |
-
|
1739 |
-
|
1740 |
-
all_text_line_results.append((i, text_line_analyser_result))
|
1741 |
-
|
1742 |
-
|
1743 |
-
elif pii_identification_method == "AWS Comprehend":
|
1744 |
-
|
1745 |
-
# First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
|
1746 |
-
custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
|
1747 |
-
|
1748 |
-
|
1749 |
-
text_line_analyser_result = nlp_analyser.analyze(
|
1750 |
-
text=text_line.text,
|
1751 |
-
language=language,
|
1752 |
-
entities=custom_redact_entities,
|
1753 |
-
score_threshold=score_threshold,
|
1754 |
-
return_decision_process=True,
|
1755 |
-
allow_list=allow_list
|
1756 |
-
)
|
1757 |
-
all_text_line_results.append((i, text_line_analyser_result))
|
1758 |
-
|
1759 |
-
|
1760 |
-
if len(text_line.text) >= 3:
|
1761 |
-
# Add separator between lines
|
1762 |
-
if current_batch:
|
1763 |
-
current_batch += " | "
|
1764 |
-
|
1765 |
-
start_pos = len(current_batch)
|
1766 |
-
current_batch += text_line.text
|
1767 |
-
current_batch_mapping.append((start_pos, i, text_line))
|
1768 |
-
|
1769 |
-
# Process batch if approaching 300 characters or last line
|
1770 |
-
if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
|
1771 |
-
print("length of text for Comprehend:", len(current_batch))
|
1772 |
-
|
1773 |
-
try:
|
1774 |
-
response = comprehend_client.detect_pii_entities(
|
1775 |
-
Text=current_batch,
|
1776 |
-
LanguageCode=language
|
1777 |
-
)
|
1778 |
-
except Exception as e:
|
1779 |
-
print(e)
|
1780 |
-
time.sleep(3)
|
1781 |
-
response = comprehend_client.detect_pii_entities(
|
1782 |
-
Text=current_batch,
|
1783 |
-
LanguageCode=language
|
1784 |
-
)
|
1785 |
-
|
1786 |
-
comprehend_query_number += 1
|
1787 |
-
|
1788 |
-
# Process response and map back to original lines
|
1789 |
-
if response and "Entities" in response:
|
1790 |
-
for entity in response["Entities"]:
|
1791 |
-
entity_start = entity["BeginOffset"]
|
1792 |
-
entity_end = entity["EndOffset"]
|
1793 |
-
|
1794 |
-
# Find which line this entity belongs to
|
1795 |
-
for batch_start, line_idx, original_line in current_batch_mapping:
|
1796 |
-
batch_end = batch_start + len(original_line.text)
|
1797 |
-
|
1798 |
-
# Check if entity belongs to this line
|
1799 |
-
if batch_start <= entity_start < batch_end:
|
1800 |
-
# Adjust offsets relative to original line
|
1801 |
-
relative_start = entity_start - batch_start
|
1802 |
-
relative_end = min(entity_end - batch_start, len(original_line.text))
|
1803 |
-
|
1804 |
-
result_text = original_line.text[relative_start:relative_end]
|
1805 |
-
|
1806 |
-
if result_text not in allow_list:
|
1807 |
-
if entity.get("Type") in chosen_redact_comprehend_entities:
|
1808 |
-
# Create adjusted entity
|
1809 |
-
adjusted_entity = entity.copy()
|
1810 |
-
adjusted_entity["BeginOffset"] = relative_start
|
1811 |
-
adjusted_entity["EndOffset"] = relative_end
|
1812 |
-
|
1813 |
-
recogniser_entity = recognizer_result_from_dict(adjusted_entity)
|
1814 |
-
|
1815 |
-
# Add to results for this line
|
1816 |
-
existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
|
1817 |
-
if not existing_results:
|
1818 |
-
all_text_line_results.append((line_idx, [recogniser_entity]))
|
1819 |
-
else:
|
1820 |
-
existing_results.append(recogniser_entity)
|
1821 |
-
|
1822 |
-
# Reset batch
|
1823 |
-
current_batch = ""
|
1824 |
-
current_batch_mapping = []
|
1825 |
-
|
1826 |
-
# Second pass: process results for each line
|
1827 |
-
for i, text_line in enumerate(line_level_text_results_list):
|
1828 |
-
text_line_analyser_result = []
|
1829 |
-
text_line_bounding_boxes = []
|
1830 |
-
|
1831 |
-
# Get results for this line
|
1832 |
-
line_results = next((results for idx, results in all_text_line_results if idx == i), [])
|
1833 |
-
|
1834 |
-
if line_results:
|
1835 |
-
text_line_analyser_result = line_results
|
1836 |
-
|
1837 |
-
#print("Analysed text container, now merging bounding boxes")
|
1838 |
-
|
1839 |
-
# Merge bounding boxes if very close together
|
1840 |
-
text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
|
1841 |
-
|
1842 |
-
#print("merged bounding boxes")
|
1843 |
-
|
1844 |
-
text_container_analyser_results.extend(text_line_analyser_result)
|
1845 |
-
text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
|
1846 |
-
|
1847 |
-
#print("text_container_analyser_results:", text_container_analyser_results)
|
1848 |
-
|
1849 |
-
page_analyser_results.extend(text_container_analyser_results) # Add this line
|
1850 |
-
page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
|
1851 |
|
1852 |
|
1853 |
#print("page_analyser_results:", page_analyser_results)
|
@@ -1879,17 +1872,18 @@ def redact_text_pdf(
|
|
1879 |
reported_page_no = page_no + 1
|
1880 |
print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
|
1881 |
|
|
|
1882 |
# Write logs
|
1883 |
# Create decision process table
|
1884 |
decision_process_table_on_page = create_text_redaction_process_results(page_analyser_results, page_analysed_bounding_boxes, current_loop_page)
|
1885 |
|
1886 |
if not decision_process_table_on_page.empty:
|
1887 |
all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
|
1888 |
-
#print("all_decision_process_table:", all_decision_process_table)
|
1889 |
-
|
1890 |
-
if not page_text_outputs.empty:
|
1891 |
-
page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
|
1892 |
-
all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, page_text_outputs])
|
1893 |
|
1894 |
toc = time.perf_counter()
|
1895 |
|
|
|
25 |
|
26 |
from presidio_analyzer import RecognizerResult
|
27 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
28 |
+
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
|
29 |
from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
|
30 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
|
31 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
32 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
33 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
|
34 |
+
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
35 |
|
36 |
# Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
|
37 |
page_break_value = get_or_create_env_var('page_break_value', '50000')
|
|
|
136 |
tic = time.perf_counter()
|
137 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
138 |
|
139 |
+
print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
|
140 |
+
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
141 |
+
|
142 |
if isinstance(custom_recogniser_word_list, pd.DataFrame):
|
143 |
custom_recogniser_word_list = custom_recogniser_word_list.iloc[:,0].tolist()
|
144 |
|
|
|
162 |
elif (first_loop_state == False) & (current_loop_page == 999):
|
163 |
current_loop_page = 0
|
164 |
|
|
|
165 |
if not out_file_paths:
|
166 |
out_file_paths = []
|
167 |
|
|
|
186 |
combined_out_message = '\n'.join(out_message)
|
187 |
else:
|
188 |
combined_out_message = out_message
|
189 |
+
|
190 |
+
if len(review_out_file_paths) == 1:
|
191 |
+
|
192 |
+
out_review_file_path = [x for x in out_file_paths if "review_file" in x]
|
193 |
+
|
194 |
+
review_out_file_paths.extend(out_review_file_path)
|
195 |
|
196 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
197 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
198 |
|
199 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
200 |
|
201 |
# If we have reached the last page, return message
|
202 |
if current_loop_page >= number_of_pages:
|
203 |
+
print("Reached last page of document:", current_loop_page)
|
204 |
|
205 |
# Set to a very high number so as not to mix up with subsequent file processing by the user
|
206 |
current_loop_page = 999
|
207 |
combined_out_message = out_message
|
208 |
|
209 |
+
if len(review_out_file_paths) == 1:
|
210 |
+
|
211 |
+
out_review_file_path = [x for x in out_file_paths if "review_file" in x]
|
212 |
+
|
213 |
+
review_out_file_paths.extend(out_review_file_path)
|
214 |
+
|
215 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
216 |
|
217 |
# Create allow list
|
218 |
# If string, assume file path
|
|
|
235 |
comprehend_client = ""
|
236 |
out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."
|
237 |
print(out_message)
|
238 |
+
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
239 | else:
240 | comprehend_client = ""
241 |
247 | textract_client = ""
248 | out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
249 | print(out_message)
250 | + return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
251 | else:
252 | textract_client = ""
253 |
279 | file_path = file.name
280 |
281 | if file_path:
282 | + pdf_file_name_without_ext = get_file_path_end(file_path)
283 | + pdf_file_name_with_ext = os.path.basename(file_path)
284 | + print("Redacting file:", pdf_file_name_with_ext)
285 |
286 | is_a_pdf = is_pdf(file_path) == True
287 | if is_a_pdf == False and in_redact_method == text_ocr_option:
292 | out_message = "No file selected"
293 | print(out_message)
294 |
295 | + return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
296 |
297 | if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
298 |
299 | #Analyse and redact image-based pdf or image
300 | if is_pdf_or_image(file_path) == False:
301 | out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
302 | + return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
303 |
304 | + print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
305 |
306 | pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
307 | prepared_pdf_image_paths,
343 |
344 | if is_pdf(file_path) == False:
345 | out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
346 | + return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
347 |
348 | # Analyse text-based pdf
349 | print('Redacting file as text-based PDF')
371 | else:
372 | out_message = "No redaction method selected"
373 | print(out_message)
374 | + return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
375 |
376 | # If at last page, save to file
377 | if current_loop_page >= number_of_pages:
378 |
379 | + print("Current page loop:", current_loop_page, "is the last page.")
380 | latest_file_completed += 1
381 | current_loop_page = 999
382 |
385 |
386 | # Save file
387 | if is_pdf(file_path) == False:
388 | + out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
389 | + #pymupdf_doc[0].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)
390 | + #print("pymupdf_doc", pymupdf_doc)
391 | + #print("pymupdf_doc[0]", pymupdf_doc[0])
392 | + pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
393 | + out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
394 |
395 | else:
396 | + out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
397 | + pymupdf_doc.save(out_redacted_pdf_file_path)
398 |
399 | + out_file_paths.append(out_redacted_pdf_file_path)
400 |
401 | #if log_files_output_paths:
402 | # log_files_output_paths.extend(log_files_output_paths)
403 |
404 | +
405 | + out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
406 | +
407 | + logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
408 | all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
409 | log_files_output_paths.append(logs_output_file_name)
410 |
411 | + all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
412 | all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
413 | out_file_paths.append(all_text_output_file_name)
414 |
415 | # Save the gradio_annotation_boxes to a JSON file
416 | try:
417 | + #print("Saving annotations to JSON")
418 |
419 | + out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
420 | with open(out_annotation_file_path, 'w') as f:
421 | json.dump(annotations_all_pages, f)
422 | log_files_output_paths.append(out_annotation_file_path)
423 |
424 | + #print("Saving annotations to CSV")
425 |
426 | # Convert json to csv and also save this
427 | #print("annotations_all_pages:", annotations_all_pages)
429 |
430 | review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
431 |
432 | + out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
433 | + review_df.to_csv(out_review_file_path, index=None)
434 | + out_file_paths.append(out_review_file_path)
435 |
436 | print("Saved review file to csv")
437 |
438 | except Exception as e:
439 | + print("Could not save annotations to json or csv file:", e)
440 |
441 | # Make a combined message for the file
442 | if isinstance(out_message, list):
451 | combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
452 |
453 | estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
454 | + #print("Estimated total processing time:", str(estimate_total_processing_time))
455 |
456 | else:
457 | toc = time.perf_counter()
463 | if all_request_metadata:
464 | all_request_metadata_str = '\n'.join(all_request_metadata).strip()
465 |
466 | + all_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_request_metadata.txt"
467 |
468 | with open(all_request_metadata_file_path, "w") as f:
469 | f.write(all_request_metadata_str)
478 |
479 | # Ensure no duplicated output files
480 | log_files_output_paths = list(set(log_files_output_paths))
481 | + out_file_paths = list(set(out_file_paths))
482 | + review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
483 |
484 | + #print("log_files_output_paths:", log_files_output_paths)
485 | + #print("out_file_paths:", out_file_paths)
486 | + #print("review_out_file_paths:", review_out_file_paths)
487 |
488 | +
489 | + return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
490 |
491 | def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
492 | '''
957 | nlp_analyser.registry.remove_recognizer("CUSTOM")
958 | new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
959 | #print("new_custom_recogniser:", new_custom_recogniser)
960 | + nlp_analyser.registry.add_recognizer(new_custom_recogniser)
961 |
962 |
963 | image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
1051 |
1052 | #print("Image is in range of pages to redact")
1053 | if isinstance(image, str):
1054 | + print("image is a file path", image)
1055 | image = Image.open(image)
1056 |
1057 | # Need image size to convert textract OCR outputs to the correct sizes
1157 | all_image_annotations_boxes = []
1158 |
1159 | for box in merged_redaction_bboxes:
1160 | + #print("box:", box)
1161 |
1162 | x0 = box.left
1163 | y0 = box.top
1319 | for line in text_container
1320 | if isinstance(line, LTTextLine) or isinstance(line, LTTextLineHorizontal)
1321 | for char in line]
1322 | +
1323 | + #print("Initial characters:", characters)
1324 |
1325 | return characters
1326 | return []
1334 | line_level_characters_out = []
1335 | #all_line_level_characters_out = []
1336 | character_objects_out = [] # New list to store character objects
1337 | + # character_text_objects_out = []
1338 |
1339 | # Initialize variables
1340 | full_text = ""
1349 | for char in char_objects:
1350 | character_objects_out.append(char) # Collect character objects
1351 |
1352 | + if not isinstance(char, LTAnno):
1353 | + character_text = char.get_text()
1354 | + # character_text_objects_out.append(character_text)
1355 | +
1356 | if isinstance(char, LTAnno):
1357 |
1358 | + # print("Character line:", "".join(character_text_objects_out))
1359 | + # print("Char is an annotation object:", char)
1360 | +
1361 | added_text = char.get_text()
1362 |
1363 | # Handle double quotes
1364 | + #added_text = added_text.replace('"', '\\"') # Escape double quotes
1365 |
1366 | # Handle space separately by finalizing the word
1367 | full_text += added_text # Adds space or newline
1378 | if current_word:
1379 | word_bboxes.append((current_word, current_word_bbox))
1380 | # Create an OCRResult for the current line
1381 | + line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
1382 | line_level_characters_out.append(character_objects_out)
1383 | # Reset for the next line
1384 | character_objects_out = []
1426 | # Convert special characters to a human-readable format
1427 | #full_text = full_text.encode('latin1', errors='replace').decode('utf-8')
1428 | full_text = clean_unicode_text(full_text)
1429 | + full_text = full_text.strip()
1430 | #print("full_text:", full_text)
1431 |
1432 | + line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
1433 |
1434 | #line_level_characters_out = character_objects_out
1435 |
1436 | return line_level_results_out, line_level_characters_out # Return both results and character objects
1437 |
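The fragments above (file lines 1334-1436) fold pdfminer's per-character objects into line-level OCR-style results with one text string and bounding box per line. A minimal standalone sketch of that aggregation idea, kept separate from the diff: OCRResult is a stand-in namedtuple here, and the line-splitting rule (a newline inside an LTAnno ends the line) is a simplification of the project's own logic.

from collections import namedtuple
from pdfminer.layout import LTAnno, LTChar

OCRResult = namedtuple("OCRResult", ["text", "left", "top", "width", "height"])

def chars_to_line_results(char_objects):
    # Fold LTChar / LTAnno objects into one text string and bounding box per line.
    lines, current_text, bbox = [], "", None
    for char in char_objects:
        if isinstance(char, LTAnno):
            added_text = char.get_text()  # inserted whitespace: usually " " or "\n"
            if "\n" in added_text:
                if current_text.strip() and bbox:
                    x0, y0, x1, y1 = bbox
                    lines.append(OCRResult(current_text.strip(), round(x0, 2), round(y0, 2), round(x1 - x0, 2), round(y1 - y0, 2)))
                current_text, bbox = "", None
            else:
                current_text += added_text
        elif isinstance(char, LTChar):
            current_text += char.get_text()
            x0, y0, x1, y1 = char.bbox  # expand the running line bounding box
            bbox = (x0, y0, x1, y1) if bbox is None else (min(bbox[0], x0), min(bbox[1], y0), max(bbox[2], x1), max(bbox[3], y1))
    if current_text.strip() and bbox:  # flush the final line
        x0, y0, x1, y1 = bbox
        lines.append(OCRResult(current_text.strip(), round(x0, 2), round(y0, 2), round(x1 - x0, 2), round(y1 - y0, 2)))
    return lines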
1438 |
1439 | def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1440 | decision_process_table = pd.DataFrame()
1485 | pikepdf_annotations_on_page.append(annotation)
1486 | return pikepdf_annotations_on_page
1487 |
1488 | + # def run_page_text_redaction(language: str, # Language of the PDF content
1489 | + # chosen_redact_entities: List[str], # List of entities to be redacted
1490 | + # chosen_redact_comprehend_entities: List[str],
1491 | + # line_level_text_results_list: List[str],
1492 | + # line_characters: List,
1493 | + # page_analyser_results: List = [],
1494 | + # page_analysed_bounding_boxes: List = [],
1495 | + # comprehend_client = None, # Connection to AWS Comprehend
1496 | + # allow_list: List[str] = None, # Optional list of allowed entities
1497 | + # pii_identification_method: str = "Local"
1498 | + # ):
1499 | +
1500 | + # # Initialize batching variables
1501 | + # current_batch = ""
1502 | + # current_batch_mapping = [] # List of (start_pos, line_index, OCRResult) tuples
1503 | + # all_text_line_results = [] # Store results for all lines
1504 | + # text_container_analyser_results = []
1505 | + # text_container_analysed_bounding_boxes = []
1506 | +
1507 | + # # First pass: collect all lines into batches
1508 | + # for i, text_line in enumerate(line_level_text_results_list):
1509 | + # if chosen_redact_entities:
1510 | + # if pii_identification_method == "Local":
1511 | +
1512 | + # #print("chosen_redact_entities:", chosen_redact_entities)
1513 | +
1514 | + # # Process immediately for local analysis
1515 | + # text_line_analyser_result = nlp_analyser.analyze(
1516 | + # text=text_line.text,
1517 | + # language=language,
1518 | + # entities=chosen_redact_entities,
1519 | + # score_threshold=score_threshold,
1520 | + # return_decision_process=True,
1521 | + # allow_list=allow_list
1522 | + # )
1523 | + # all_text_line_results.append((i, text_line_analyser_result))
1524 | +
1525 | +
1526 | + # elif pii_identification_method == "AWS Comprehend":
1527 | +
1528 | + # # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
1529 | + # custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
1530 | +
1531 | +
1532 | + # text_line_analyser_result = nlp_analyser.analyze(
1533 | + # text=text_line.text,
1534 | + # language=language,
1535 | + # entities=custom_redact_entities,
1536 | + # score_threshold=score_threshold,
1537 | + # return_decision_process=True,
1538 | + # allow_list=allow_list
1539 | + # )
1540 | + # all_text_line_results.append((i, text_line_analyser_result))
1541 | +
1542 | +
1543 | + # if len(text_line.text) >= 3:
1544 | + # # Add separator between lines
1545 | + # if current_batch:
1546 | + # current_batch += " | "
1547 | +
1548 | + # start_pos = len(current_batch)
1549 | + # current_batch += text_line.text
1550 | + # current_batch_mapping.append((start_pos, i, text_line))
1551 | +
1552 | + # # Process batch if approaching 300 characters or last line
1553 | + # if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
1554 | + # print("length of text for Comprehend:", len(current_batch))
1555 | +
1556 | + # try:
1557 | + # response = comprehend_client.detect_pii_entities(
1558 | + # Text=current_batch,
1559 | + # LanguageCode=language
1560 | + # )
1561 | + # except Exception as e:
1562 | + # print(e)
1563 | + # time.sleep(3)
1564 | + # response = comprehend_client.detect_pii_entities(
1565 | + # Text=current_batch,
1566 | + # LanguageCode=language
1567 | + # )
1568 | +
1569 | + # comprehend_query_number += 1
1570 | +
1571 | + # # Process response and map back to original lines
1572 | + # if response and "Entities" in response:
1573 | + # for entity in response["Entities"]:
1574 | + # entity_start = entity["BeginOffset"]
1575 | + # entity_end = entity["EndOffset"]
1576 | +
1577 | + # # Find which line this entity belongs to
1578 | + # for batch_start, line_idx, original_line in current_batch_mapping:
1579 | + # batch_end = batch_start + len(original_line.text)
1580 | +
1581 | + # # Check if entity belongs to this line
1582 | + # if batch_start <= entity_start < batch_end:
1583 | + # # Adjust offsets relative to original line
1584 | + # relative_start = entity_start - batch_start
1585 | + # relative_end = min(entity_end - batch_start, len(original_line.text))
1586 | +
1587 | + # result_text = original_line.text[relative_start:relative_end]
1588 | +
1589 | + # if result_text not in allow_list:
1590 | + # if entity.get("Type") in chosen_redact_comprehend_entities:
1591 | + # # Create adjusted entity
1592 | + # adjusted_entity = entity.copy()
1593 | + # adjusted_entity["BeginOffset"] = relative_start
1594 | + # adjusted_entity["EndOffset"] = relative_end
1595 | +
1596 | + # recogniser_entity = recognizer_result_from_dict(adjusted_entity)
1597 | +
1598 | + # # Add to results for this line
1599 | + # existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
1600 | + # if not existing_results:
1601 | + # all_text_line_results.append((line_idx, [recogniser_entity]))
1602 | + # else:
1603 | + # existing_results.append(recogniser_entity)
1604 | +
1605 | + # # Reset batch
1606 | + # current_batch = ""
1607 | + # current_batch_mapping = []
1608 | +
1609 | + # # Second pass: process results for each line
1610 | + # for i, text_line in enumerate(line_level_text_results_list):
1611 | + # text_line_analyser_result = []
1612 | + # text_line_bounding_boxes = []
1613 | +
1614 | + # # Get results for this line
1615 | + # line_results = next((results for idx, results in all_text_line_results if idx == i), [])
1616 | +
1617 | + # if line_results:
1618 | + # text_line_analyser_result = line_results
1619 | +
1620 | + # #print("Analysed text container, now merging bounding boxes")
1621 | +
1622 | + # # Merge bounding boxes if very close together
1623 | + # text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
1624 | +
1625 | + # #print("merged bounding boxes")
1626 | +
1627 | + # text_container_analyser_results.extend(text_line_analyser_result)
1628 | + # #text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1629 | +
1630 | + # #print("text_container_analyser_results:", text_container_analyser_results)
1631 | +
1632 | + # page_analyser_results.extend(text_container_analyser_results) # Add this line
1633 | + # page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
1634 | +
1635 | + # return page_analysed_bounding_boxes
1636 | +
1637 | + # def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
1638 | + # for entity in page_analyser_result:
1639 | + # entity_start = entity.start
1640 | + # entity_end = entity.end
1641 | +
1642 | + # for batch_start, line_idx, original_line, chars in page_text_mapping:
1643 | + # batch_end = batch_start + len(original_line.text)
1644 | +
1645 | + # if batch_start <= entity_start < batch_end:
1646 | + # relative_start = entity_start - batch_start
1647 | + # relative_end = min(entity_end - batch_start, len(original_line.text))
1648 | +
1649 | + # adjusted_entity = copy.deepcopy(entity)
1650 | + # adjusted_entity.start = relative_start
1651 | + # adjusted_entity.end = relative_end
1652 | +
1653 | + # existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
1654 | +
1655 | + # if existing_entry is None:
1656 | + # all_text_line_results.append((line_idx, [adjusted_entity]))
1657 | + # else:
1658 | + # existing_entry.append(adjusted_entity)
1659 | + # break
1660 | +
1661 | + # return all_text_line_results
1662 | +
1663 | +
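The commented-out run_page_text_redaction above batches several OCR lines into one AWS Comprehend detect_pii_entities call (lines joined with " | " separators) and then maps each entity's BeginOffset/EndOffset back to the line it came from. A sketch of just that remapping step, with a hypothetical helper name and a faked response; only the offset arithmetic mirrors the logic above.

from typing import Dict, List, Tuple

def map_entities_to_lines(entities: List[Dict], batch_mapping: List[Tuple[int, int, str]]) -> Dict[int, List[Dict]]:
    # entities: Comprehend-style dicts with "BeginOffset" / "EndOffset" / "Type".
    # batch_mapping: (start position of the line within the batch, line index, line text).
    per_line: Dict[int, List[Dict]] = {}
    for entity in entities:
        start, end = entity["BeginOffset"], entity["EndOffset"]
        for batch_start, line_idx, line_text in batch_mapping:
            batch_end = batch_start + len(line_text)
            if batch_start <= start < batch_end:
                adjusted = dict(entity)
                adjusted["BeginOffset"] = start - batch_start                      # offset relative to the line
                adjusted["EndOffset"] = min(end - batch_start, len(line_text))     # clamp to the line length
                per_line.setdefault(line_idx, []).append(adjusted)
                break
    return per_line

# Two lines joined as "Contact John Smith | on 07700 900123": the second line starts at offset 21.
lines = ["Contact John Smith", "on 07700 900123"]
mapping = [(0, 0, lines[0]), (len(lines[0]) + 3, 1, lines[1])]
fake_response = [{"Type": "NAME", "BeginOffset": 8, "EndOffset": 18, "Score": 0.99}]
print(map_entities_to_lines(fake_response, mapping))  # {0: [entity covering "John Smith"]}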
1664 | def redact_text_pdf(
1665 | filename: str, # Path to the PDF file to be redacted
1666 | prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
1783 |
1784 | for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
1785 |
1786 | + all_line_characters = []
1787 | + all_line_level_text_results_list = []
1788 | page_analyser_results = []
1789 | page_analysed_bounding_boxes = []
1790 |
1791 | characters = []
1792 | pikepdf_annotations_on_page = []
1793 | decision_process_table_on_page = pd.DataFrame()
1794 | + page_text_ocr_outputs = pd.DataFrame()
1795 |
1796 | if analysis_type == text_ocr_option:
1797 | for n, text_container in enumerate(page_layout):
1798 | +
1799 | characters = []
1800 |
1801 | + #print("text container:", text_container)
1802 | +
1803 | if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
1804 | characters = get_text_container_characters(text_container)
1805 |
1806 | # Create dataframe for all the text on the page
1807 | line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)
1808 |
1809 | + ### Create page_text_ocr_outputs (OCR format outputs)
1810 | if line_level_text_results_list:
1811 | # Convert to DataFrame and add to ongoing logging table
1812 | line_level_text_results_df = pd.DataFrame([{
1813 | 'page': page_no + 1,
1814 | + 'text': (result.text).strip(),
1815 | 'left': result.left,
1816 | 'top': result.top,
1817 | 'width': result.width,
1818 | 'height': result.height
1819 | } for result in line_level_text_results_list])
1820 |
1821 | + page_text_ocr_outputs = pd.concat([page_text_ocr_outputs, line_level_text_results_df])
1822 | +
1823 | + all_line_level_text_results_list.extend(line_level_text_results_list)
1824 | + all_line_characters.extend(line_characters)
1825 | +
1826 | + ### REDACTION
1827 | +
1828 | + page_analysed_bounding_boxes = run_page_text_redaction(
1829 | + language,
1830 | + chosen_redact_entities,
1831 | + chosen_redact_comprehend_entities,
1832 | + all_line_level_text_results_list, #line_level_text_results_list,
1833 | + all_line_characters,
1834 | + page_analyser_results,
1835 | + page_analysed_bounding_boxes,
1836 | + comprehend_client,
1837 | + allow_list,
1838 | + pii_identification_method,
1839 | + nlp_analyser,
1840 | + score_threshold,
1841 | + custom_entities,
1842 | + comprehend_query_number
1843 | + )
1844 |
1845 |
1846 | #print("page_analyser_results:", page_analyser_results)
1872 | reported_page_no = page_no + 1
1873 | print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
1874 |
1875 | + # Join extracted text outputs for all lines together
1876 | + if not page_text_ocr_outputs.empty:
1877 | + page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
1878 | + all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, page_text_ocr_outputs])
1879 | +
1880 | # Write logs
1881 | # Create decision process table
1882 | decision_process_table_on_page = create_text_redaction_process_results(page_analyser_results, page_analysed_bounding_boxes, current_loop_page)
1883 |
1884 | if not decision_process_table_on_page.empty:
1885 | all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
1886 | + #print("all_decision_process_table:", all_decision_process_table)
1887 |
1888 | toc = time.perf_counter()
1889 |
tools/helper_functions.py
CHANGED
@@ -1,10 +1,13 @@
1 | import os
2 | import re
3 | import gradio as gr
4 | import pandas as pd
5 | import unicodedata
6 | from typing import List
7 | from gradio_image_annotation import image_annotator
8 |
9 | def reset_state_vars():
10 | return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
@@ -120,6 +123,8 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
120 | custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
121 | #regex_file_name_no_ext = get_file_path_end(regex_file_name)
122 |
123 | output_text = file_type + " file loaded."
124 |
125 | print(output_text)
@@ -229,10 +234,10 @@ async def get_connection_params(request: gr.Request):
229 | #if 'context' in request_data:
230 | # print("Request context dictionary:", request_data['context'])
231 |
232 | - print("Request headers dictionary:", request.headers)
233 | - print("All host elements", request.client)
234 | - print("IP address:", request.client.host)
235 | - print("Query parameters:", dict(request.query_params))
236 | # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
237 | #print("Request dictionary to object:", request.request.body())
238 | print("Session hash:", request.session_hash)
@@ -264,6 +269,23 @@ async def get_connection_params(request: gr.Request):
264 | elif 'x-amzn-oidc-identity' in request.headers:
265 | out_session_hash = request.headers['x-amzn-oidc-identity']
266 | base_folder = "user-files/"
267 | print("Cognito ID found:", out_session_hash)
268 |
269 | else:
1 | import os
2 | import re
3 | + import boto3
4 | + from botocore.exceptions import ClientError
5 | import gradio as gr
6 | import pandas as pd
7 | import unicodedata
8 | from typing import List
9 | from gradio_image_annotation import image_annotator
10 | + from tools.auth import user_pool_id
11 |
12 | def reset_state_vars():
13 | return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
123 | custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
124 | #regex_file_name_no_ext = get_file_path_end(regex_file_name)
125 |
126 | + custom_regex.columns = custom_regex.columns.astype(str)
127 | +
128 | output_text = file_type + " file loaded."
129 |
130 | print(output_text)
234 | #if 'context' in request_data:
235 | # print("Request context dictionary:", request_data['context'])
236 |
237 | + # print("Request headers dictionary:", request.headers)
238 | + # print("All host elements", request.client)
239 | + # print("IP address:", request.client.host)
240 | + # print("Query parameters:", dict(request.query_params))
241 | # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
242 | #print("Request dictionary to object:", request.request.body())
243 | print("Session hash:", request.session_hash)
269 | elif 'x-amzn-oidc-identity' in request.headers:
270 | out_session_hash = request.headers['x-amzn-oidc-identity']
271 | base_folder = "user-files/"
272 | +
273 | + # Fetch email address using Cognito client
274 | + cognito_client = boto3.client('cognito-idp')
275 | + try:
276 | + response = cognito_client.admin_get_user(
277 | + UserPoolId=user_pool_id, # Replace with your User Pool ID
278 | + Username=out_session_hash
279 | + )
280 | + email = next(attr['Value'] for attr in response['UserAttributes'] if attr['Name'] == 'email')
281 | + #print("Email address found:", email)
282 | +
283 | + out_session_hash = email
284 | + except ClientError as e:
285 | + print("Error fetching user details:", e)
286 | + email = None
287 | +
288 | +
289 | print("Cognito ID found:", out_session_hash)
290 |
291 | else:
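The Cognito lookup added to get_connection_params above can be exercised on its own. A minimal sketch under the assumption that the runtime role holds the cognito-idp:AdminGetUser permission on the pool; the helper name below is illustrative and not part of the codebase.

import boto3
from botocore.exceptions import ClientError
from typing import Optional

def email_for_cognito_user(username: str, user_pool_id: str) -> Optional[str]:
    cognito_client = boto3.client("cognito-idp")
    try:
        response = cognito_client.admin_get_user(UserPoolId=user_pool_id, Username=username)
    except ClientError as e:
        print("Error fetching user details:", e)
        return None
    # UserAttributes is a list of {"Name": ..., "Value": ...} pairs; pull out the email attribute
    attributes = {attr["Name"]: attr["Value"] for attr in response["UserAttributes"]}
    return attributes.get("email")

# e.g. email_for_cognito_user(request.headers["x-amzn-oidc-identity"], user_pool_id)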
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -7,7 +7,6 @@ spacy.prefer_gpu()
7 | from spacy.cli.download import download
8 | import re
9 |
10 | - # %%
11 | model_name = "en_core_web_sm" #"en_core_web_trf"
12 | score_threshold = 0.001
13 | custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
@@ -34,7 +33,7 @@ def custom_word_list_recogniser(custom_list:List[str]=[]):
34 | rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
35 | for term in custom_list
36 | )
37 | - print(custom_regex)
38 |
39 | custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
40 |
from spacy.cli.download import download
|
8 |
import re
|
9 |
|
|
|
10 |
model_name = "en_core_web_sm" #"en_core_web_trf"
|
11 |
score_threshold = 0.001
|
12 |
custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
|
|
|
33 |
rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
|
34 |
for term in custom_list
|
35 |
)
|
36 |
+
#print(custom_regex)
|
37 |
|
38 |
custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
|
39 |
|
tools/redaction_review.py
CHANGED
@@ -117,13 +117,10 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
|
117 | recogniser_dataframe_out = gr.Dataframe(review_dataframe)
118 | recogniser_entities_list = recogniser_dataframe_gr["label"].unique().tolist()
119 |
120 | - print("recogniser_entities_list all options:", recogniser_entities_list)
121 | -
122 | recogniser_entities_list = sorted(recogniser_entities_list)
123 | recogniser_entities_list = [entity for entity in recogniser_entities_list if entity != 'Redaction'] # Remove any existing 'Redaction'
124 | recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
125 |
126 | - print("recogniser_entities_list:", recogniser_entities_list)
127 |
128 | zoom_str = str(zoom) + '%'
129 | recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
@@ -248,6 +245,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
248 |
249 | output_files = []
250 | output_log_files = []
251 |
252 | #print("File paths in apply_redactions:", file_paths)
253 |
@@ -264,7 +262,8 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
264 |
265 | for file_path in file_paths:
266 | #print("file_path:", file_path)
267 | -
268 |
269 | file_extension = os.path.splitext(file_path)[1].lower()
270 |
@@ -287,7 +286,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
287 |
288 | draw.rectangle(coords, fill=fill)
289 |
290 | - image.save(output_folder +
291 |
292 | doc = [image]
293 |
@@ -298,6 +297,9 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
298 | # If working with pdfs
299 | elif is_pdf(file_path) == True:
300 | pdf_doc = pymupdf.open(file_path)
301 |
302 | number_of_pages = pdf_doc.page_count
303 |
@@ -316,7 +318,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
316 | #all_image_annotations[i]['image'] = image_loc.tolist()
317 | elif isinstance(image_loc, Image.Image):
318 | image = image_loc
319 | - #image_out_folder = output_folder +
320 | #image_loc.save(image_out_folder)
321 | #all_image_annotations[i]['image'] = image_out_folder
322 | elif isinstance(image_loc, str):
@@ -330,25 +332,34 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
330 |
331 | #try:
332 | if pdf_doc:
333 | - out_pdf_file_path = output_folder +
334 | pdf_doc.save(out_pdf_file_path)
335 | output_files.append(out_pdf_file_path)
336 |
337 | try:
338 | - print("Saving annotations to JSON")
339 |
340 | - out_annotation_file_path = output_folder +
341 | with open(out_annotation_file_path, 'w') as f:
342 | json.dump(all_image_annotations, f)
343 | output_log_files.append(out_annotation_file_path)
344 |
345 | - print("Saving annotations to CSV review file")
346 |
347 | #print("review_file_state:", review_file_state)
348 |
349 | # Convert json to csv and also save this
350 | review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
351 | - out_review_file_file_path = output_folder +
352 | review_df.to_csv(out_review_file_file_path, index=None)
353 | output_files.append(out_review_file_file_path)
354 |
@@ -367,9 +378,6 @@ def update_entities_df(choice:str, df:pd.DataFrame):
367 | return df.loc[df["label"]==choice,:]
368 |
369 | def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
370 | - #print("index", evt.index)
371 | - #print("value", evt.value)
372 | - #print("row_value", evt.row_value)
373 | row_value_page = evt.row_value[0] # This is the page number value
374 | return row_value_page
375 |
117 | recogniser_dataframe_out = gr.Dataframe(review_dataframe)
118 | recogniser_entities_list = recogniser_dataframe_gr["label"].unique().tolist()
119 |
120 | recogniser_entities_list = sorted(recogniser_entities_list)
121 | recogniser_entities_list = [entity for entity in recogniser_entities_list if entity != 'Redaction'] # Remove any existing 'Redaction'
122 | recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
123 |
124 |
125 | zoom_str = str(zoom) + '%'
126 | recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
245 |
246 | output_files = []
247 | output_log_files = []
248 | + pdf_doc = []
249 |
250 | #print("File paths in apply_redactions:", file_paths)
251 |
262 |
263 | for file_path in file_paths:
264 | #print("file_path:", file_path)
265 | + file_name_without_ext = get_file_path_end(file_path)
266 | + file_name_with_ext = os.path.basename(file_path)
267 |
268 | file_extension = os.path.splitext(file_path)[1].lower()
269 |
286 |
287 | draw.rectangle(coords, fill=fill)
288 |
289 | + image.save(output_folder + file_name_without_ext + "_redacted.png")
290 |
291 | doc = [image]
292 |
297 | # If working with pdfs
298 | elif is_pdf(file_path) == True:
299 | pdf_doc = pymupdf.open(file_path)
300 | + orig_pdf_file_path = file_path
301 | +
302 | + output_files.append(orig_pdf_file_path)
303 |
304 | number_of_pages = pdf_doc.page_count
305 |
318 | #all_image_annotations[i]['image'] = image_loc.tolist()
319 | elif isinstance(image_loc, Image.Image):
320 | image = image_loc
321 | + #image_out_folder = output_folder + file_name_without_ext + "_page_" + str(i) + ".png"
322 | #image_loc.save(image_out_folder)
323 | #all_image_annotations[i]['image'] = image_out_folder
324 | elif isinstance(image_loc, str):
332 |
333 | #try:
334 | if pdf_doc:
335 | + out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
336 | pdf_doc.save(out_pdf_file_path)
337 | output_files.append(out_pdf_file_path)
338 |
339 | + else:
340 | + print("PDF input not found.")
341 | +
342 | + # If save_pdf is not true, then add the original pdf to the output files
343 | + else:
344 | + if is_pdf(file_path) == True:
345 | + orig_pdf_file_path = file_path
346 | + output_files.append(orig_pdf_file_path)
347 | +
348 | try:
349 | + #print("Saving annotations to JSON")
350 |
351 | + out_annotation_file_path = output_folder + file_name_with_ext + '_review_file.json'
352 | with open(out_annotation_file_path, 'w') as f:
353 | json.dump(all_image_annotations, f)
354 | output_log_files.append(out_annotation_file_path)
355 |
356 | + #print("Saving annotations to CSV review file")
357 |
358 | #print("review_file_state:", review_file_state)
359 |
360 | # Convert json to csv and also save this
361 | review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
362 | + out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
363 | review_df.to_csv(out_review_file_file_path, index=None)
364 | output_files.append(out_review_file_file_path)
365 |
378 | return df.loc[df["label"]==choice,:]
379 |
380 | def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
381 | row_value_page = evt.row_value[0] # This is the page number value
382 | return row_value_page
383 |
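For the image branch of apply_redactions above, the review boxes are painted straight onto the page image before it is saved and wrapped as a one-page document. A standalone sketch of that step: the xmin/ymin/xmax/ymax keys follow the gradio_image_annotation box format assumed by the review screen, and the function name is illustrative only.

from typing import Dict, List
from PIL import Image, ImageDraw

def redact_image_file(image_path: str, boxes: List[Dict], output_path: str) -> str:
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)
    for box in boxes:
        coords = (box["xmin"], box["ymin"], box["xmax"], box["ymax"])
        draw.rectangle(coords, fill=(0, 0, 0))  # solid black box over the redacted region
    image.save(output_path)
    return output_path

# e.g. redact_image_file("page_1.png", [{"xmin": 40, "ymin": 100, "xmax": 220, "ymax": 130}], "output/page_1_redacted.png")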