Merge pull request #6 from seanpedrick-case/dev
Browse filesExport to Adobe, fuzzy matching, and duplicate page identification
- Dockerfile +3 -0
- app.py +85 -25
- requirements.txt +4 -0
- tools/custom_image_analyser_engine.py +42 -11
- tools/data_anonymise.py +3 -3
- tools/file_conversion.py +10 -6
- tools/file_redaction.py +99 -236
- tools/find_duplicate_pages.py +274 -0
- tools/helper_functions.py +66 -20
- tools/load_spacy_model_custom_recognisers.py +176 -25
- tools/redaction_review.py +376 -4
Dockerfile
CHANGED
@@ -60,6 +60,9 @@ RUN mkdir -p /home/user/app/output \
|
|
60 |
# Copy installed packages from builder stage
|
61 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
62 |
|
|
|
|
|
|
|
63 |
# Entrypoint helps to switch between Gradio and Lambda mode
|
64 |
COPY entrypoint.sh /entrypoint.sh
|
65 |
|
|
|
60 |
# Copy installed packages from builder stage
|
61 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
62 |
|
63 |
+
# Download NLTK data packages
|
64 |
+
RUN python -m nltk.downloader punkt stopwords punkt_tab
|
65 |
+
|
66 |
# Entrypoint helps to switch between Gradio and Lambda mode
|
67 |
COPY entrypoint.sh /entrypoint.sh
|
68 |
|
app.py
CHANGED
@@ -10,15 +10,16 @@ from datetime import datetime
|
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
12 |
|
13 |
-
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
15 |
from tools.file_redaction import choose_and_run_redactor
|
16 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
|
17 |
-
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
|
18 |
from tools.data_anonymise import anonymise_data_files
|
19 |
from tools.auth import authenticate_user
|
20 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
21 |
from tools.custom_csvlogger import CSVLogger_custom
|
|
|
22 |
|
23 |
today_rev = datetime.now().strftime("%Y%m%d")
|
24 |
|
@@ -29,15 +30,16 @@ ensure_output_folder_exists()
|
|
29 |
|
30 |
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
|
31 |
|
32 |
-
full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
|
33 |
|
34 |
# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
|
35 |
chosen_comprehend_entities.extend(custom_entities)
|
36 |
full_comprehend_entity_list.extend(custom_entities)
|
37 |
|
|
|
38 |
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
|
39 |
|
40 |
-
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
|
41 |
|
42 |
language = 'en'
|
43 |
|
@@ -67,10 +69,9 @@ with app:
|
|
67 |
pdf_doc_state = gr.State([])
|
68 |
all_image_annotations_state = gr.State([])
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
review_file_state = gr.Dataframe(value=pd.DataFrame(), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
74 |
|
75 |
session_hash_state = gr.State()
|
76 |
s3_output_folder_state = gr.State()
|
@@ -129,16 +130,16 @@ with app:
|
|
129 |
## Settings page variables
|
130 |
default_allow_list_file_name = "default_allow_list.csv"
|
131 |
default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
|
132 |
-
in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_allow_list_df", visible=False, type="pandas")
|
133 |
|
134 |
default_deny_list_file_name = "default_deny_list.csv"
|
135 |
default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
|
136 |
-
in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_deny_list_df", visible=False, type="pandas")
|
137 |
in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
|
138 |
|
139 |
fully_redacted_list_file_name = "default_fully_redacted_list.csv"
|
140 |
fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
|
141 |
-
in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_full_redacted_list_df", visible=False, type="pandas")
|
142 |
in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
|
143 |
|
144 |
# S3 settings for default allow list load
|
@@ -149,6 +150,12 @@ with app:
|
|
149 |
# Base dataframe for recognisers that is not modified subsequent to load
|
150 |
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
|
151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
###
|
153 |
# UI DESIGN
|
154 |
###
|
@@ -164,8 +171,10 @@ with app:
|
|
164 |
|
165 |
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
|
166 |
|
167 |
-
|
168 |
-
|
|
|
|
|
169 |
with gr.Accordion("Redact document", open = True):
|
170 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
171 |
if RUN_AWS_FUNCTIONS == "1":
|
@@ -194,7 +203,9 @@ with app:
|
|
194 |
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
195 |
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
196 |
|
197 |
-
|
|
|
|
|
198 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
199 |
|
200 |
with gr.Accordion(label = "Review redaction file", open=True):
|
@@ -215,7 +226,6 @@ with app:
|
|
215 |
clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
|
216 |
|
217 |
with gr.Row():
|
218 |
-
|
219 |
with gr.Column(scale=1):
|
220 |
|
221 |
zoom_str = str(annotator_zoom_number) + '%'
|
@@ -247,10 +257,16 @@ with app:
|
|
247 |
#with gr.Column(scale=1):
|
248 |
with gr.Row():
|
249 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
250 |
-
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
|
251 |
|
252 |
-
|
|
|
|
|
|
|
|
|
|
|
253 |
# TEXT / TABULAR DATA TAB
|
|
|
254 |
with gr.Tab(label="Open text or Excel/csv files"):
|
255 |
gr.Markdown(
|
256 |
"""
|
@@ -280,7 +296,20 @@ with app:
|
|
280 |
data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
281 |
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
# SETTINGS TAB
|
|
|
284 |
with gr.Tab(label="Redaction settings"):
|
285 |
with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
|
286 |
with gr.Row():
|
@@ -296,9 +325,12 @@ with app:
|
|
296 |
|
297 |
with gr.Accordion("Select entity types to redact", open = True):
|
298 |
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
|
299 |
-
|
300 |
in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
|
301 |
|
|
|
|
|
|
|
|
|
302 |
with gr.Accordion("Redact only selected pages", open = False):
|
303 |
with gr.Row():
|
304 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
@@ -312,21 +344,30 @@ with app:
|
|
312 |
with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
|
313 |
anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
|
314 |
|
315 |
-
log_files_output = gr.File(label="Log file output", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
316 |
|
317 |
###
|
318 |
# PDF/IMAGE REDACTION
|
319 |
###
|
320 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
321 |
|
322 |
-
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
|
323 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
|
324 |
-
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
325 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
|
326 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
327 |
|
328 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
329 |
-
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
330 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
|
331 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
332 |
|
@@ -339,7 +380,8 @@ with app:
|
|
339 |
###
|
340 |
|
341 |
# Upload previous files for modifying redactions
|
342 |
-
upload_previous_review_file_btn.click(fn=
|
|
|
343 |
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
344 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
345 |
|
@@ -397,7 +439,16 @@ with app:
|
|
397 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
398 |
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
399 |
|
400 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
401 |
###
|
402 |
# TABULAR DATA REDACTION
|
403 |
###
|
@@ -410,13 +461,22 @@ with app:
|
|
410 |
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
411 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
412 |
|
|
|
|
|
|
|
|
|
|
|
413 |
###
|
414 |
# SETTINGS PAGE INPUT / OUTPUT
|
415 |
###
|
416 |
-
# If a custom allow list is uploaded
|
417 |
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
418 |
in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
|
419 |
in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
|
|
|
|
|
|
|
|
|
420 |
|
421 |
|
422 |
###
|
|
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
12 |
|
13 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files
|
14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
15 |
from tools.file_redaction import choose_and_run_redactor
|
16 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
|
17 |
+
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe
|
18 |
from tools.data_anonymise import anonymise_data_files
|
19 |
from tools.auth import authenticate_user
|
20 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
21 |
from tools.custom_csvlogger import CSVLogger_custom
|
22 |
+
from tools.find_duplicate_pages import identify_similar_pages
|
23 |
|
24 |
today_rev = datetime.now().strftime("%Y%m%d")
|
25 |
|
|
|
30 |
|
31 |
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
|
32 |
|
33 |
+
full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
|
34 |
|
35 |
# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
|
36 |
chosen_comprehend_entities.extend(custom_entities)
|
37 |
full_comprehend_entity_list.extend(custom_entities)
|
38 |
|
39 |
+
# Entities for local PII redaction option
|
40 |
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
|
41 |
|
42 |
+
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']
|
43 |
|
44 |
language = 'en'
|
45 |
|
|
|
69 |
pdf_doc_state = gr.State([])
|
70 |
all_image_annotations_state = gr.State([])
|
71 |
|
72 |
+
all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
73 |
+
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
|
74 |
+
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
|
|
75 |
|
76 |
session_hash_state = gr.State()
|
77 |
s3_output_folder_state = gr.State()
|
|
|
130 |
## Settings page variables
|
131 |
default_allow_list_file_name = "default_allow_list.csv"
|
132 |
default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
|
133 |
+
in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_allow_list_df", visible=False, type="pandas")
|
134 |
|
135 |
default_deny_list_file_name = "default_deny_list.csv"
|
136 |
default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
|
137 |
+
in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
|
138 |
in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
|
139 |
|
140 |
fully_redacted_list_file_name = "default_fully_redacted_list.csv"
|
141 |
fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
|
142 |
+
in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_full_redacted_list_df", visible=False, type="pandas")
|
143 |
in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
|
144 |
|
145 |
# S3 settings for default allow list load
|
|
|
150 |
# Base dataframe for recognisers that is not modified subsequent to load
|
151 |
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
|
152 |
|
153 |
+
# Duplicate page detection
|
154 |
+
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
155 |
+
duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
|
156 |
+
|
157 |
+
|
158 |
+
|
159 |
###
|
160 |
# UI DESIGN
|
161 |
###
|
|
|
171 |
|
172 |
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
|
173 |
|
174 |
+
###
|
175 |
+
# REDACTION PDF/IMAGES TABL
|
176 |
+
###
|
177 |
+
with gr.Tab("Redact PDFs/images"):
|
178 |
with gr.Accordion("Redact document", open = True):
|
179 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
180 |
if RUN_AWS_FUNCTIONS == "1":
|
|
|
203 |
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
204 |
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
205 |
|
206 |
+
###
|
207 |
+
# REVIEW REDACTIONS TAB
|
208 |
+
###
|
209 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
210 |
|
211 |
with gr.Accordion(label = "Review redaction file", open=True):
|
|
|
226 |
clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
|
227 |
|
228 |
with gr.Row():
|
|
|
229 |
with gr.Column(scale=1):
|
230 |
|
231 |
zoom_str = str(annotator_zoom_number) + '%'
|
|
|
257 |
#with gr.Column(scale=1):
|
258 |
with gr.Row():
|
259 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
260 |
+
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
|
261 |
|
262 |
+
with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
|
263 |
+
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
|
264 |
+
adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv', '.xfdf', '.pdf'])
|
265 |
+
convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")
|
266 |
+
|
267 |
+
###
|
268 |
# TEXT / TABULAR DATA TAB
|
269 |
+
###
|
270 |
with gr.Tab(label="Open text or Excel/csv files"):
|
271 |
gr.Markdown(
|
272 |
"""
|
|
|
296 |
data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
297 |
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
298 |
|
299 |
+
###
|
300 |
+
# IDENTIFY DUPLICATE PAGES TAB
|
301 |
+
###
|
302 |
+
with gr.Tab(label="Identify duplicate pages"):
|
303 |
+
with gr.Accordion("Identify duplicate pages to redact", open = True):
|
304 |
+
in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
|
305 |
+
|
306 |
+
find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary")
|
307 |
+
|
308 |
+
duplicate_pages_out =gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
|
309 |
+
|
310 |
+
###
|
311 |
# SETTINGS TAB
|
312 |
+
###
|
313 |
with gr.Tab(label="Redaction settings"):
|
314 |
with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
|
315 |
with gr.Row():
|
|
|
325 |
|
326 |
with gr.Accordion("Select entity types to redact", open = True):
|
327 |
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
|
|
|
328 |
in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
|
329 |
|
330 |
+
with gr.Row():
|
331 |
+
max_fuzzy_spelling_mistakes_num = gr.Number(label="Maximum number of spelling mistakes allowed for fuzzy matching (CUSTOM_FUZZY entity).", value=1, minimum=0, maximum=9, precision=0)
|
332 |
+
match_fuzzy_whole_phrase_bool = gr.Checkbox(label="Should fuzzy match on entire phrases in deny list (as opposed to each word individually)?", value=True)
|
333 |
+
|
334 |
with gr.Accordion("Redact only selected pages", open = False):
|
335 |
with gr.Row():
|
336 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
|
|
344 |
with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
|
345 |
anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
|
346 |
|
347 |
+
log_files_output = gr.File(label="Log file output", interactive=False)
|
348 |
+
|
349 |
+
with gr.Accordion("Combine multiple review files", open = False):
|
350 |
+
multiple_review_files_in_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv'])
|
351 |
+
merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
|
352 |
+
|
353 |
+
|
354 |
+
|
355 |
+
|
356 |
+
### UI INTERACTION ###
|
357 |
|
358 |
###
|
359 |
# PDF/IMAGE REDACTION
|
360 |
###
|
361 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
362 |
|
363 |
+
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
364 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
|
365 |
+
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
|
366 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
|
367 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
368 |
|
369 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
370 |
+
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
|
371 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
|
372 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
373 |
|
|
|
380 |
###
|
381 |
|
382 |
# Upload previous files for modifying redactions
|
383 |
+
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
384 |
+
then(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
385 |
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
386 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
387 |
|
|
|
439 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
440 |
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
441 |
|
442 |
+
# Convert review file to xfdf Adobe format
|
443 |
+
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
444 |
+
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
445 |
+
then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state], outputs=[adobe_review_files_out])
|
446 |
+
|
447 |
+
# Convert xfdf Adobe file back to review_file.csv
|
448 |
+
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
449 |
+
then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
450 |
+
then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state], outputs=[output_review_files], scroll_to_output=True)
|
451 |
+
|
452 |
###
|
453 |
# TABULAR DATA REDACTION
|
454 |
###
|
|
|
461 |
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
462 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
463 |
|
464 |
+
###
|
465 |
+
# IDENTIFY DUPLICATE PAGES
|
466 |
+
###
|
467 |
+
find_duplicate_pages_btn.click(fn=identify_similar_pages, inputs=[in_duplicate_pages], outputs=[duplicate_pages_df, duplicate_pages_out])
|
468 |
+
|
469 |
###
|
470 |
# SETTINGS PAGE INPUT / OUTPUT
|
471 |
###
|
472 |
+
# If a custom allow/deny/duplicate page list is uploaded
|
473 |
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
474 |
in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
|
475 |
in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
|
476 |
+
|
477 |
+
|
478 |
+
# Merge multiple review csv files together
|
479 |
+
merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
|
480 |
|
481 |
|
482 |
###
|
requirements.txt
CHANGED
@@ -7,6 +7,8 @@ presidio_anonymizer==2.2.355
|
|
7 |
presidio-image-redactor==0.0.53
|
8 |
pikepdf==8.15.1
|
9 |
pandas==2.2.3
|
|
|
|
|
10 |
spacy==3.8.3
|
11 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
12 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
@@ -15,6 +17,8 @@ boto3==1.35.83
|
|
15 |
pyarrow==18.1.0
|
16 |
openpyxl==3.1.2
|
17 |
Faker==22.2.0
|
|
|
|
|
18 |
gradio_image_annotation==0.2.5
|
19 |
numpy==1.26.4
|
20 |
awslambdaric==3.0.0
|
|
|
7 |
presidio-image-redactor==0.0.53
|
8 |
pikepdf==8.15.1
|
9 |
pandas==2.2.3
|
10 |
+
nltk==3.9.1
|
11 |
+
scikit-learn==1.5.2
|
12 |
spacy==3.8.3
|
13 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
14 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
|
|
17 |
pyarrow==18.1.0
|
18 |
openpyxl==3.1.2
|
19 |
Faker==22.2.0
|
20 |
+
python-levenshtein==0.26.1
|
21 |
+
spaczz==0.6.1
|
22 |
gradio_image_annotation==0.2.5
|
23 |
numpy==1.26.4
|
24 |
awslambdaric==3.0.0
|
tools/custom_image_analyser_engine.py
CHANGED
@@ -560,7 +560,7 @@ def run_page_text_redaction(
|
|
560 |
if not nlp_analyser:
|
561 |
raise ValueError("nlp_analyser is required for Local identification method")
|
562 |
|
563 |
-
print("page text:", page_text)
|
564 |
|
565 |
page_analyser_result = nlp_analyser.analyze(
|
566 |
text=page_text,
|
@@ -1077,15 +1077,15 @@ class CustomImageAnalyzerEngine:
|
|
1077 |
line_length = len(line_text)
|
1078 |
redaction_text = redaction_relevant_ocr_result.text
|
1079 |
|
1080 |
-
#
|
1081 |
|
1082 |
for redaction_result in text_analyzer_results:
|
1083 |
-
#
|
1084 |
-
#
|
1085 |
-
#
|
1086 |
-
#
|
1087 |
|
1088 |
-
# Check if the redaction text is
|
1089 |
|
1090 |
if redaction_text not in allow_list:
|
1091 |
|
@@ -1098,14 +1098,45 @@ class CustomImageAnalyzerEngine:
|
|
1098 |
matched_words = matched_text.split()
|
1099 |
|
1100 |
# print(f"Found match: '{matched_text}' in line")
|
|
|
|
|
|
|
|
|
|
|
|
|
1101 |
|
1102 |
# Find the corresponding words in the OCR results
|
1103 |
matching_word_boxes = []
|
|
|
|
|
|
|
|
|
|
|
1104 |
for word_info in ocr_results_with_children_child_info.get('words', []):
|
1105 |
-
|
1106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1107 |
matching_word_boxes.append(word_info['bounding_box'])
|
1108 |
-
#
|
1109 |
|
1110 |
if matching_word_boxes:
|
1111 |
# Calculate the combined bounding box for all matching words
|
@@ -1127,7 +1158,7 @@ class CustomImageAnalyzerEngine:
|
|
1127 |
text=matched_text
|
1128 |
)
|
1129 |
)
|
1130 |
-
#
|
1131 |
|
1132 |
return redaction_bboxes
|
1133 |
|
|
|
560 |
if not nlp_analyser:
|
561 |
raise ValueError("nlp_analyser is required for Local identification method")
|
562 |
|
563 |
+
#print("page text:", page_text)
|
564 |
|
565 |
page_analyser_result = nlp_analyser.analyze(
|
566 |
text=page_text,
|
|
|
1077 |
line_length = len(line_text)
|
1078 |
redaction_text = redaction_relevant_ocr_result.text
|
1079 |
|
1080 |
+
#print(f"Processing line: '{line_text}'")
|
1081 |
|
1082 |
for redaction_result in text_analyzer_results:
|
1083 |
+
#print(f"Checking redaction result: {redaction_result}")
|
1084 |
+
#print("redaction_text:", redaction_text)
|
1085 |
+
#print("line_length:", line_length)
|
1086 |
+
#print("line_text:", line_text)
|
1087 |
|
1088 |
+
# Check if the redaction text is not in the allow list
|
1089 |
|
1090 |
if redaction_text not in allow_list:
|
1091 |
|
|
|
1098 |
matched_words = matched_text.split()
|
1099 |
|
1100 |
# print(f"Found match: '{matched_text}' in line")
|
1101 |
+
|
1102 |
+
# for word_info in ocr_results_with_children_child_info.get('words', []):
|
1103 |
+
# # Check if this word is part of our match
|
1104 |
+
# if any(word.lower() in word_info['text'].lower() for word in matched_words):
|
1105 |
+
# matching_word_boxes.append(word_info['bounding_box'])
|
1106 |
+
# print(f"Matched word: {word_info['text']}")
|
1107 |
|
1108 |
# Find the corresponding words in the OCR results
|
1109 |
matching_word_boxes = []
|
1110 |
+
|
1111 |
+
#print("ocr_results_with_children_child_info:", ocr_results_with_children_child_info)
|
1112 |
+
|
1113 |
+
current_position = 0
|
1114 |
+
|
1115 |
for word_info in ocr_results_with_children_child_info.get('words', []):
|
1116 |
+
word_text = word_info['text']
|
1117 |
+
word_length = len(word_text)
|
1118 |
+
|
1119 |
+
# Assign start and end character positions
|
1120 |
+
#word_info['start_position'] = current_position
|
1121 |
+
#word_info['end_position'] = current_position + word_length
|
1122 |
+
|
1123 |
+
word_start = current_position
|
1124 |
+
word_end = current_position + word_length
|
1125 |
+
|
1126 |
+
# Update current position for the next word
|
1127 |
+
current_position += word_length + 1 # +1 for the space after the word
|
1128 |
+
|
1129 |
+
#print("word_info['bounding_box']:", word_info['bounding_box'])
|
1130 |
+
#print("word_start:", word_start)
|
1131 |
+
#print("start_in_line:", start_in_line)
|
1132 |
+
|
1133 |
+
#print("word_end:", word_end)
|
1134 |
+
#print("end_in_line:", end_in_line)
|
1135 |
+
|
1136 |
+
# Check if the word's bounding box is within the start and end bounds
|
1137 |
+
if word_start >= start_in_line and word_end <= (end_in_line + 1):
|
1138 |
matching_word_boxes.append(word_info['bounding_box'])
|
1139 |
+
#print(f"Matched word: {word_info['text']}")
|
1140 |
|
1141 |
if matching_word_boxes:
|
1142 |
# Calculate the combined bounding box for all matching words
|
|
|
1158 |
text=matched_text
|
1159 |
)
|
1160 |
)
|
1161 |
+
#print(f"Added bounding box for: '{matched_text}'")
|
1162 |
|
1163 |
return redaction_bboxes
|
1164 |
|
tools/data_anonymise.py
CHANGED
@@ -12,7 +12,7 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerR
|
|
12 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
13 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
14 |
|
15 |
-
from tools.helper_functions import output_folder,
|
16 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
17 |
|
18 |
# Use custom version of analyze_dict to be able to track progress
|
@@ -434,7 +434,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
434 |
file_type = detect_file_type(anon_file)
|
435 |
print("File type is:", file_type)
|
436 |
|
437 |
-
out_file_part =
|
438 |
|
439 |
if file_type == 'xlsx':
|
440 |
print("Running through all xlsx sheets")
|
@@ -472,7 +472,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
472 |
else:
|
473 |
sheet_name = ""
|
474 |
anon_df = read_file(anon_file)
|
475 |
-
out_file_part =
|
476 |
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
477 |
|
478 |
# Increase latest file completed count unless we are at the last file
|
|
|
12 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
13 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
14 |
|
15 |
+
from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
|
16 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
17 |
|
18 |
# Use custom version of analyze_dict to be able to track progress
|
|
|
434 |
file_type = detect_file_type(anon_file)
|
435 |
print("File type is:", file_type)
|
436 |
|
437 |
+
out_file_part = get_file_name_without_type(anon_file.name)
|
438 |
|
439 |
if file_type == 'xlsx':
|
440 |
print("Running through all xlsx sheets")
|
|
|
472 |
else:
|
473 |
sheet_name = ""
|
474 |
anon_df = read_file(anon_file)
|
475 |
+
out_file_part = get_file_name_without_type(anon_file.name)
|
476 |
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
477 |
|
478 |
# Increase latest file completed count unless we are at the last file
|
tools/file_conversion.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
-
from tools.helper_functions import
|
3 |
from PIL import Image, ImageFile
|
4 |
import os
|
5 |
import re
|
@@ -7,6 +7,7 @@ import time
|
|
7 |
import json
|
8 |
import pymupdf
|
9 |
import pandas as pd
|
|
|
10 |
from pymupdf import Rect
|
11 |
from fitz import Page
|
12 |
from tqdm import tqdm
|
@@ -240,7 +241,7 @@ def get_input_file_names(file_input:List[str]):
|
|
240 |
else:
|
241 |
file_path = file.name
|
242 |
|
243 |
-
file_path_without_ext =
|
244 |
|
245 |
file_extension = os.path.splitext(file_path)[1].lower()
|
246 |
|
@@ -489,7 +490,7 @@ def prepare_image_or_pdf(
|
|
489 |
file_path = file
|
490 |
else:
|
491 |
file_path = file.name
|
492 |
-
file_path_without_ext =
|
493 |
file_name_with_ext = os.path.basename(file_path)
|
494 |
|
495 |
if not file_path:
|
@@ -668,7 +669,7 @@ def prepare_image_or_pdf(
|
|
668 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
|
669 |
|
670 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
|
671 |
-
file_path_without_ext =
|
672 |
|
673 |
out_file_paths = out_text_file_path
|
674 |
|
@@ -754,7 +755,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
754 |
if 'text' not in box:
|
755 |
data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
|
756 |
else:
|
757 |
-
data_to_add = {"image": image_path, "page": reported_number, "text":
|
758 |
#print("data_to_add:", data_to_add)
|
759 |
flattened_annotation_data.append(data_to_add)
|
760 |
|
@@ -764,7 +765,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
764 |
#print("redaction_decision_output:", redaction_decision_output)
|
765 |
#print("annotation_data_as_df:", annotation_data_as_df)
|
766 |
|
767 |
-
# Join on additional text data from decision output results if included
|
768 |
if not redaction_decision_output.empty:
|
769 |
#print("redaction_decision_output is not empty")
|
770 |
#print("redaction_decision_output:", redaction_decision_output)
|
@@ -793,6 +794,9 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
793 |
if col not in annotation_data_as_df.columns:
|
794 |
annotation_data_as_df[col] = ''
|
795 |
|
|
|
|
|
|
|
796 |
annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
|
797 |
|
798 |
return annotation_data_as_df
|
|
|
1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
+
from tools.helper_functions import get_file_name_without_type, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
|
3 |
from PIL import Image, ImageFile
|
4 |
import os
|
5 |
import re
|
|
|
7 |
import json
|
8 |
import pymupdf
|
9 |
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
from pymupdf import Rect
|
12 |
from fitz import Page
|
13 |
from tqdm import tqdm
|
|
|
241 |
else:
|
242 |
file_path = file.name
|
243 |
|
244 |
+
file_path_without_ext = get_file_name_without_type(file_path)
|
245 |
|
246 |
file_extension = os.path.splitext(file_path)[1].lower()
|
247 |
|
|
|
490 |
file_path = file
|
491 |
else:
|
492 |
file_path = file.name
|
493 |
+
file_path_without_ext = get_file_name_without_type(file_path)
|
494 |
file_name_with_ext = os.path.basename(file_path)
|
495 |
|
496 |
if not file_path:
|
|
|
669 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
|
670 |
|
671 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
|
672 |
+
file_path_without_ext = get_file_name_without_type(in_file_path)
|
673 |
|
674 |
out_file_paths = out_text_file_path
|
675 |
|
|
|
755 |
if 'text' not in box:
|
756 |
data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
|
757 |
else:
|
758 |
+
data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
|
759 |
#print("data_to_add:", data_to_add)
|
760 |
flattened_annotation_data.append(data_to_add)
|
761 |
|
|
|
765 |
#print("redaction_decision_output:", redaction_decision_output)
|
766 |
#print("annotation_data_as_df:", annotation_data_as_df)
|
767 |
|
768 |
+
# Join on additional text data from decision output results if included, if text not already there
|
769 |
if not redaction_decision_output.empty:
|
770 |
#print("redaction_decision_output is not empty")
|
771 |
#print("redaction_decision_output:", redaction_decision_output)
|
|
|
794 |
if col not in annotation_data_as_df.columns:
|
795 |
annotation_data_as_df[col] = ''
|
796 |
|
797 |
+
for col in ['xmin', 'xmax', 'ymin', 'ymax']:
|
798 |
+
annotation_data_as_df[col] = np.floor(annotation_data_as_df[col])
|
799 |
+
|
800 |
annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
|
801 |
|
802 |
return annotation_data_as_df
|
tools/file_redaction.py
CHANGED
@@ -27,8 +27,8 @@ from presidio_analyzer import RecognizerResult
|
|
27 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
28 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
|
29 |
from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
|
30 |
-
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
|
31 |
-
from tools.helper_functions import
|
32 |
 from tools.aws_functions import RUN_AWS_FUNCTIONS
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
 from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
+from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
@@ -94,6 +94,8 @@ def choose_and_run_redactor(file_paths:List[str],
     page_break_return:bool=False,
     pii_identification_method:str="Local",
     comprehend_query_number:int=0,
+    max_fuzzy_spelling_mistakes_num:int=1,
+    match_fuzzy_whole_phrase_bool:bool=True,
     output_folder:str=output_folder,
     progress=gr.Progress(track_tqdm=True)):
     '''
@@ -127,6 +129,8 @@ def choose_and_run_redactor(file_paths:List[str],
     - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
     - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
     - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
     - output_folder (str, optional): Output folder for results.
     - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
@@ -136,7 +140,7 @@ def choose_and_run_redactor(file_paths:List[str],
     tic = time.perf_counter()
     all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []

-    print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
+    #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
     review_out_file_paths = [prepared_pdf_file_paths[0]]

     if isinstance(custom_recogniser_word_list, pd.DataFrame):
@@ -279,9 +283,9 @@ def choose_and_run_redactor(file_paths:List[str],
         file_path = file.name

         if file_path:
-            pdf_file_name_without_ext =
+            pdf_file_name_without_ext = get_file_name_without_type(file_path)
             pdf_file_name_with_ext = os.path.basename(file_path)
-            print("Redacting file:", pdf_file_name_with_ext)
+            # print("Redacting file:", pdf_file_name_with_ext)

             is_a_pdf = is_pdf(file_path) == True
             if is_a_pdf == False and in_redact_method == text_ocr_option:
@@ -327,7 +331,9 @@ def choose_and_run_redactor(file_paths:List[str],
                 comprehend_client,
                 textract_client,
                 custom_recogniser_word_list,
-                redact_whole_page_list
+                redact_whole_page_list,
+                max_fuzzy_spelling_mistakes_num,
+                match_fuzzy_whole_phrase_bool)


             #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
@@ -366,7 +372,9 @@ def choose_and_run_redactor(file_paths:List[str],
                 comprehend_query_number,
                 comprehend_client,
                 custom_recogniser_word_list,
-                redact_whole_page_list
+                redact_whole_page_list,
+                max_fuzzy_spelling_mistakes_num,
+                match_fuzzy_whole_phrase_bool)

             else:
                 out_message = "No redaction method selected"
@@ -414,13 +422,7 @@ def choose_and_run_redactor(file_paths:List[str],

             # Save the gradio_annotation_boxes to a JSON file
             try:
-
-
-                out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
-                with open(out_annotation_file_path, 'w') as f:
-                    json.dump(annotations_all_pages, f)
-                log_files_output_paths.append(out_annotation_file_path)
-
+
                 #print("Saving annotations to CSV")

                 # Convert json to csv and also save this
@@ -435,6 +437,13 @@ def choose_and_run_redactor(file_paths:List[str],

                 print("Saved review file to csv")

+                out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
+                with open(out_annotation_file_path, 'w') as f:
+                    json.dump(annotations_all_pages, f)
+                log_files_output_paths.append(out_annotation_file_path)
+
+                print("Saving annotations to JSON")
+
             except Exception as e:
                 print("Could not save annotations to json or csv file:", e)

@@ -694,10 +703,10 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
                 x1 = pymupdf_x1
                 x2 = pymupdf_x2

-
-
-
-
+                if hasattr(annot, 'text') and annot.text:
+                    img_annotation_box["text"] = annot.text
+                else:
+                    img_annotation_box["text"] = ""

             # Else should be CustomImageRecognizerResult
             else:
@@ -715,10 +724,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
                     img_annotation_box["label"] = annot.entity_type
                 except:
                     img_annotation_box["label"] = "Redaction"
-
-
-
-
+
+                if hasattr(annot, 'text') and annot.text:
+                    img_annotation_box["text"] = annot.text
+                else:
+                    img_annotation_box["text"] = ""

             rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)  # Create the PyMuPDF Rect
@@ -749,12 +759,14 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo

             if isinstance(annot, Dictionary):
                 img_annotation_box["label"] = str(annot["/T"])
+
+                if hasattr(annot, 'Contents'):
+                    img_annotation_box["text"] = annot.Contents
+                else:
+                    img_annotation_box["text"] = ""
             else:
                 img_annotation_box["label"] = "REDACTION"
-
-            # img_annotation_box["text"] = annot.text
-            # else:
-            # img_annotation_box["text"] = ""
+                img_annotation_box["text"] = ""

             # Convert to a PyMuPDF Rect object
             #rect = Rect(rect_coordinates)
@@ -779,6 +791,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo

     return page, out_annotation_boxes

+###
+# IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
+###
+
+
 def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):

     all_bboxes = []
@@ -908,6 +925,8 @@ def redact_image_pdf(file_path:str,
     textract_client:str="",
     custom_recogniser_word_list:List[str]=[],
     redact_whole_page_list:List[str]=[],
+    max_fuzzy_spelling_mistakes_num:int=1,
+    match_fuzzy_whole_phrase_bool:bool=True,
     page_break_val:int=int(page_break_value),
     log_files_output_paths:List=[],
     max_time:int=int(max_time_value),
@@ -940,14 +959,16 @@ def redact_image_pdf(file_path:str,
     - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
     - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
     - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
     - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
     - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.

-    The function returns a
+    The function returns a redacted PDF document along with processing output objects.
     '''
-    file_name =
+    file_name = get_file_name_without_type(file_path)
     fill = (0, 0, 0)   # Fill colour for redactions
     comprehend_query_number_new = 0
@@ -957,11 +978,14 @@ def redact_image_pdf(file_path:str,
         nlp_analyser.registry.remove_recognizer("CUSTOM")
         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
         #print("new_custom_recogniser:", new_custom_recogniser)
-        nlp_analyser.registry.add_recognizer(new_custom_recogniser)
+        nlp_analyser.registry.add_recognizer(new_custom_recogniser)

+        nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
+        new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
+        #print("new_custom_recogniser:", new_custom_recogniser)
+        nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)

-    image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
-
+    image_analyser = CustomImageAnalyzerEngine(nlp_analyser)

     if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
         print("Connection to AWS Comprehend service unsuccessful.")
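
Reviewer note: both redaction paths now rebuild the "CUSTOM" and "CUSTOM_FUZZY" entries in the shared Presidio registry on every call. A minimal sketch of that registry pattern with stock Presidio classes, for anyone unfamiliar with it - here a deny-list `PatternRecognizer` stands in for the repo's `custom_word_list_recogniser`, and the example assumes `presidio-analyzer` plus a spaCy English model are installed:

```python
from presidio_analyzer import AnalyzerEngine, PatternRecognizer

analyzer = AnalyzerEngine()  # loads a spaCy English model under the hood

# Exact-match recogniser over a user-supplied word list, registered under the
# same "CUSTOM" entity name that the code above swaps in and out of the registry.
word_list_recogniser = PatternRecognizer(
    supported_entity="CUSTOM",
    deny_list=["Jane Doe", "Acme Ltd"],
)
analyzer.registry.add_recognizer(word_list_recogniser)

results = analyzer.analyze(
    text="Invoice raised by Jane Doe at Acme Ltd.",
    entities=["CUSTOM"],
    language="en",
)
for res in results:
    print(res.entity_type, res.start, res.end, res.score)
```

Removing and re-adding the recogniser per call (as the diff does) keeps a stale word list from a previous run out of the shared registry.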
@@ -1051,7 +1075,7 @@ def redact_image_pdf(file_path:str,
             #print("Image is in range of pages to redact")
             if isinstance(image, str):
-                print("image is a file path", image)
+                #print("image is a file path", image)
                 image = Image.open(image)

             # Need image size to convert textract OCR outputs to the correct sizes
@@ -1119,7 +1143,7 @@ def redact_image_pdf(file_path:str,
             line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)

             # Step 2: Analyze text and identify PII
-            if chosen_redact_entities:
+            if chosen_redact_entities or chosen_redact_comprehend_entities:

                 redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
                     line_level_ocr_results,
@@ -1185,6 +1209,7 @@ def redact_image_pdf(file_path:str,

             ## Apply annotations with pymupdf
             else:
+                print("merged_redaction_boxes:", merged_redaction_bboxes)
                 #print("redact_whole_page_list:", redact_whole_page_list)
                 if redact_whole_page_list:
                     int_reported_page_number = int(reported_page_number)
@@ -1309,7 +1334,7 @@ def redact_image_pdf(file_path:str,


 ###
-# PIKEPDF TEXT
+# PIKEPDF TEXT DETECTION/REDACTION
 ###

 def get_text_container_characters(text_container:LTTextContainer):
@@ -1466,6 +1491,8 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
 def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
     pikepdf_annotations_on_page = []
     for analysed_bounding_box in analysed_bounding_boxes:
+        #print("analysed_bounding_box:", analysed_bounding_boxes)
+
         bounding_box = analysed_bounding_box["boundingBox"]
         annotation = Dictionary(
             Type=Name.Annot,
@@ -1477,6 +1504,7 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
             IC=[0, 0, 0],
             CA=1,  # Transparency
             T=analysed_bounding_box["result"].entity_type,
+            Contents=analysed_bounding_box["text"],
             BS=Dictionary(
                 W=0,      # Border width: 1 point
                 S=Name.S  # Border style: solid
@@ -1485,182 +1513,6 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
         pikepdf_annotations_on_page.append(annotation)
     return pikepdf_annotations_on_page

-# def run_page_text_redaction(language: str, # Language of the PDF content
-#     chosen_redact_entities: List[str], # List of entities to be redacted
-#     chosen_redact_comprehend_entities: List[str],
-#     line_level_text_results_list: List[str],
-#     line_characters: List,
-#     page_analyser_results: List = [],
-#     page_analysed_bounding_boxes: List = [],
-#     comprehend_client = None, # Connection to AWS Comprehend
-#     allow_list: List[str] = None, # Optional list of allowed entities
-#     pii_identification_method: str = "Local"
-#     ):
-
-#     # Initialize batching variables
-#     current_batch = ""
-#     current_batch_mapping = []  # List of (start_pos, line_index, OCRResult) tuples
-#     all_text_line_results = []  # Store results for all lines
-#     text_container_analyser_results = []
-#     text_container_analysed_bounding_boxes = []
-
-#     # First pass: collect all lines into batches
-#     for i, text_line in enumerate(line_level_text_results_list):
-#         if chosen_redact_entities:
-#             if pii_identification_method == "Local":
-
-#                 #print("chosen_redact_entities:", chosen_redact_entities)
-
-#                 # Process immediately for local analysis
-#                 text_line_analyser_result = nlp_analyser.analyze(
-#                     text=text_line.text,
-#                     language=language,
-#                     entities=chosen_redact_entities,
-#                     score_threshold=score_threshold,
-#                     return_decision_process=True,
-#                     allow_list=allow_list
-#                 )
-#                 all_text_line_results.append((i, text_line_analyser_result))
-
-#             elif pii_identification_method == "AWS Comprehend":
-
-#                 # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
-#                 custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
-
-#                 text_line_analyser_result = nlp_analyser.analyze(
-#                     text=text_line.text,
-#                     language=language,
-#                     entities=custom_redact_entities,
-#                     score_threshold=score_threshold,
-#                     return_decision_process=True,
-#                     allow_list=allow_list
-#                 )
-#                 all_text_line_results.append((i, text_line_analyser_result))
-
-#                 if len(text_line.text) >= 3:
-#                     # Add separator between lines
-#                     if current_batch:
-#                         current_batch += " | "
-
-#                     start_pos = len(current_batch)
-#                     current_batch += text_line.text
-#                     current_batch_mapping.append((start_pos, i, text_line))
-
-#                 # Process batch if approaching 300 characters or last line
-#                 if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
-#                     print("length of text for Comprehend:", len(current_batch))
-
-#                     try:
-#                         response = comprehend_client.detect_pii_entities(
-#                             Text=current_batch,
-#                             LanguageCode=language
-#                         )
-#                     except Exception as e:
-#                         print(e)
-#                         time.sleep(3)
-#                         response = comprehend_client.detect_pii_entities(
-#                             Text=current_batch,
-#                             LanguageCode=language
-#                         )
-
-#                     comprehend_query_number += 1
-
-#                     # Process response and map back to original lines
-#                     if response and "Entities" in response:
-#                         for entity in response["Entities"]:
-#                             entity_start = entity["BeginOffset"]
-#                             entity_end = entity["EndOffset"]
-
-#                             # Find which line this entity belongs to
-#                             for batch_start, line_idx, original_line in current_batch_mapping:
-#                                 batch_end = batch_start + len(original_line.text)
-
-#                                 # Check if entity belongs to this line
-#                                 if batch_start <= entity_start < batch_end:
-#                                     # Adjust offsets relative to original line
-#                                     relative_start = entity_start - batch_start
-#                                     relative_end = min(entity_end - batch_start, len(original_line.text))
-
-#                                     result_text = original_line.text[relative_start:relative_end]
-
-#                                     if result_text not in allow_list:
-#                                         if entity.get("Type") in chosen_redact_comprehend_entities:
-#                                             # Create adjusted entity
-#                                             adjusted_entity = entity.copy()
-#                                             adjusted_entity["BeginOffset"] = relative_start
-#                                             adjusted_entity["EndOffset"] = relative_end
-
-#                                             recogniser_entity = recognizer_result_from_dict(adjusted_entity)
-
-#                                             # Add to results for this line
-#                                             existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
-#                                             if not existing_results:
-#                                                 all_text_line_results.append((line_idx, [recogniser_entity]))
-#                                             else:
-#                                                 existing_results.append(recogniser_entity)
-
-#                     # Reset batch
-#                     current_batch = ""
-#                     current_batch_mapping = []
-
-#     # Second pass: process results for each line
-#     for i, text_line in enumerate(line_level_text_results_list):
-#         text_line_analyser_result = []
-#         text_line_bounding_boxes = []
-
-#         # Get results for this line
-#         line_results = next((results for idx, results in all_text_line_results if idx == i), [])
-
-#         if line_results:
-#             text_line_analyser_result = line_results
-
-#             #print("Analysed text container, now merging bounding boxes")
-
-#             # Merge bounding boxes if very close together
-#             text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
-
-#             #print("merged bounding boxes")
-
-#             text_container_analyser_results.extend(text_line_analyser_result)
-#             #text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
-
-#             #print("text_container_analyser_results:", text_container_analyser_results)
-
-#     page_analyser_results.extend(text_container_analyser_results) # Add this line
-#     page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
-
-#     return page_analysed_bounding_boxes
-
-# def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
-#     for entity in page_analyser_result:
-#         entity_start = entity.start
-#         entity_end = entity.end
-
-#         for batch_start, line_idx, original_line, chars in page_text_mapping:
-#             batch_end = batch_start + len(original_line.text)
-
-#             if batch_start <= entity_start < batch_end:
-#                 relative_start = entity_start - batch_start
-#                 relative_end = min(entity_end - batch_start, len(original_line.text))
-
-#                 adjusted_entity = copy.deepcopy(entity)
-#                 adjusted_entity.start = relative_start
-#                 adjusted_entity.end = relative_end
-
-#                 existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
-
-#                 if existing_entry is None:
-#                     all_text_line_results.append((line_idx, [adjusted_entity]))
-#                 else:
-#                     existing_entry.append(adjusted_entity)
-#                 break
-
-#     return all_text_line_results

 def redact_text_pdf(
     filename: str,                             # Path to the PDF file to be redacted
     prepared_pdf_image_path: str,              # Path to the prepared PDF image for redaction
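
Reviewer note: the commented-out batching implementation deleted above now lives as `run_page_text_redaction` in `tools/custom_image_analyser_engine.py` (see the new import at the top of this file). Its core trick - join OCR lines into one batch string, run detection once, then map entity offsets back to the owning line - is worth a standalone sketch. `detect` below is a hypothetical stand-in for the real PII call (`nlp_analyser.analyze` or Comprehend's `detect_pii_entities`):

```python
# Toy sketch of the batch-and-map-back idea: lines are joined with " | "
# separators, entities are detected against the batch string, and each span
# is translated back into per-line offsets.
def map_batch_entities_to_lines(lines, detect):
    batch = ""
    mapping = []  # (start_pos_in_batch, line_index)
    for i, line in enumerate(lines):
        if batch:
            batch += " | "
        mapping.append((len(batch), i))
        batch += line

    per_line = {i: [] for i in range(len(lines))}
    for start, end in detect(batch):  # entity spans over the batch string
        for batch_start, i in mapping:
            batch_end = batch_start + len(lines[i])
            if batch_start <= start < batch_end:
                per_line[i].append((start - batch_start,
                                    min(end - batch_start, len(lines[i]))))
                break
    return per_line

# Example with a fake detector that "finds" the word Jane wherever it occurs
lines = ["Report author Jane", "Reviewed by Jane too"]
fake_detect = lambda text: [(m, m + 4) for m in range(len(text)) if text[m:m + 4] == "Jane"]
print(map_batch_entities_to_lines(lines, fake_detect))  # {0: [(14, 18)], 1: [(12, 16)]}
```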
@@ -1682,6 +1534,8 @@ def redact_text_pdf(
     comprehend_client="",
     custom_recogniser_word_list:List[str]=[],
     redact_whole_page_list:List[str]=[],
+    max_fuzzy_spelling_mistakes_num:int=1,
+    match_fuzzy_whole_phrase_bool:bool=True,
     page_break_val: int = int(page_break_value),   # Value for page break
     max_time: int = int(max_time_value),
     progress: Progress = Progress(track_tqdm=True) # Progress tracking object
@@ -1711,6 +1565,8 @@ def redact_text_pdf(
     - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
     - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
     - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
     - page_break_val: Value for page break
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - progress: Progress tracking object
@@ -1726,9 +1582,12 @@ def redact_text_pdf(
     if custom_recogniser_word_list:
         nlp_analyser.registry.remove_recognizer("CUSTOM")
         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
-        #print("new_custom_recogniser:", new_custom_recogniser)
         nlp_analyser.registry.add_recognizer(new_custom_recogniser)

+        nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
+        new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
+        nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
+
     # List all elements currently in the nlp_analyser registry
     #print("Current recognizers in nlp_analyser registry:")
     #for recognizer_name in nlp_analyser.registry.recognizers:
@@ -1761,15 +1620,14 @@ def redact_text_pdf(
     for page_no in progress_bar:

         reported_page_number = str(page_no + 1)
-        print("Redacting page:", reported_page_number)
+        #print("Redacting page:", reported_page_number)

         # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
         try:
             image = prepared_pdf_image_path[page_no]#.copy()
             #print("image:", image)
         except Exception as e:
-            print("Could not redact page:", reported_page_number, "due to:")
-            print(e)
+            print("Could not redact page:", reported_page_number, "due to:", e)
             continue

         image_annotations = {"image": image, "boxes": []}
@@ -1825,27 +1683,32 @@ def redact_text_pdf(

             ### REDACTION

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if chosen_redact_entities or chosen_redact_comprehend_entities:
+                #print("Identifying redactions on page.")
+
+                page_analysed_bounding_boxes = run_page_text_redaction(
+                    language,
+                    chosen_redact_entities,
+                    chosen_redact_comprehend_entities,
+                    all_line_level_text_results_list,
+                    all_line_characters,
+                    page_analyser_results,
+                    page_analysed_bounding_boxes,
+                    comprehend_client,
+                    allow_list,
+                    pii_identification_method,
+                    nlp_analyser,
+                    score_threshold,
+                    custom_entities,
+                    comprehend_query_number
+                )
+
+                #print("page_analyser_results:", page_analyser_results)
+                #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
+                #print("image:", image)
+            else:
+                page_analysed_bounding_boxes = []
+

             page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)

@@ -1854,7 +1717,7 @@ def redact_text_pdf(
             # Annotate redactions on page
             pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)

-            #print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
+            # print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)

             # Make pymupdf page redactions
             #print("redact_whole_page_list:", redact_whole_page_list)
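
Reviewer note: with the `Contents` key added in `create_pikepdf_annotations_for_bounding_boxes`, each pikepdf annotation now carries the matched text that `redact_page_with_pymupdf` reads back via `annot.Contents`. A hedged sketch of building and attaching one such annotation with pikepdf (file names and the `Square` subtype are placeholders, not taken from the diff):

```python
import pikepdf
from pikepdf import Array, Dictionary, Name

with pikepdf.open("input.pdf") as pdf:  # placeholder file name
    page = pdf.pages[0]
    annotation = Dictionary(
        Type=Name.Annot,
        Subtype=Name.Square,        # box annotation (assumed subtype)
        Rect=[100, 700, 200, 720],  # PDF user-space coordinates
        T="PERSON",                 # entity label - read back as the box label
        Contents="Jane Doe",        # matched text - the key this PR adds
        CA=1,
        IC=[0, 0, 0],
        BS=Dictionary(W=0, S=Name.S),
    )
    # Append to the page's /Annots array, creating it if absent
    page_dict = page.obj
    if "/Annots" not in page_dict:
        page_dict["/Annots"] = pdf.make_indirect(Array([]))
    page_dict["/Annots"].append(annotation)
    pdf.save("annotated.pdf")
```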
tools/find_duplicate_pages.py
ADDED
@@ -0,0 +1,274 @@
import pandas as pd
import argparse
import glob
import os
import re
from tools.helper_functions import output_folder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
import random
import string
from typing import List

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

similarity_threshold = 0.9

stop_words = set(stopwords.words('english'))
# List of words to remove from the stopword set
#words_to_remove = ['no', 'nor', 'not', 'don', "don't", 'wasn', "wasn't", 'weren', "weren't"]

# Remove the specified words from the stopwords set
#for word in words_to_remove:
#    stop_words.discard(word.lower())

stemmer = PorterStemmer()
vectorizer = TfidfVectorizer()
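
Reviewer note: the three `nltk.download` calls above run at import time and pair with the new Dockerfile step (`python -m nltk.downloader punkt stopwords punkt_tab`). A guard like the following - a sketch, not part of the PR - would only download what is actually missing:

```python
# Optional guard: check the NLTK data this module needs is on the search path
# before first use, downloading only what is missing.
import nltk

for resource, path in [("punkt", "tokenizers/punkt"),
                       ("stopwords", "corpora/stopwords"),
                       ("punkt_tab", "tokenizers/punkt_tab")]:
    try:
        nltk.data.find(path)       # raises LookupError if the data is absent
    except LookupError:
        nltk.download(resource)
```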
def combine_ocr_output_text(input_files):
    """
    Combines text from multiple CSV files containing page and text columns.
    Groups text by file and page number, concatenating text within these groups.

    Args:
        input_files (list): List of paths to CSV files

    Returns:
        pd.DataFrame: Combined dataframe with columns [file, page, text]
    """
    all_data = []
    output_files = []

    if isinstance(input_files, str):
        file_paths_list = [input_files]
    else:
        file_paths_list = input_files

    for file in file_paths_list:

        if isinstance(file, str):
            file_path = file
        else:
            file_path = file.name

        # Read CSV file
        df = pd.read_csv(file_path)

        # Ensure required columns exist
        if 'page' not in df.columns or 'text' not in df.columns:
            print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
            continue

        # Group by page and concatenate text
        grouped = df.groupby('page')['text'].apply(' '.join).reset_index()

        # Add filename column
        grouped['file'] = os.path.basename(file_path)

        all_data.append(grouped)

    if not all_data:
        raise ValueError("No valid CSV files were processed")

    # Combine all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)

    # Reorder columns
    combined_df = combined_df[['file', 'page', 'text']]

    output_combined_file_path = output_folder + "combined_ocr_output_files.csv"
    combined_df.to_csv(output_combined_file_path, index=None)

    output_files.append(output_combined_file_path)

    return combined_df, output_files
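
A hypothetical usage sketch - the OCR output CSVs produced by a redaction run already have `page` and `text` columns, so they can be fed in directly (file names here are placeholders):

```python
# Combine per-line OCR output from two redaction runs into one page-level frame.
combined_df, written_files = combine_ocr_output_text(
    ["output/doc1_ocr_output.csv", "output/doc2_ocr_output.csv"]
)
print(combined_df.head())   # columns: file, page, text
print(written_files)        # [output_folder + "combined_ocr_output_files.csv"]
```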
def process_data(df, column:str):
    '''
    Clean and stem text columns in a data frame
    '''

    def _clean_text(raw_text):
        # Remove HTML tags
        clean = re.sub(r'<.*?>', '', raw_text)
        clean = re.sub(r'&nbsp;', ' ', clean)
        clean = re.sub(r'\r\n', ' ', clean)
        clean = re.sub(r'&lt;', ' ', clean)
        clean = re.sub(r'&gt;', ' ', clean)
        clean = re.sub(r'<strong>', ' ', clean)
        clean = re.sub(r'</strong>', ' ', clean)

        # Replace non-breaking space \xa0 with a space
        clean = clean.replace(u'\xa0', u' ')
        # Remove extra whitespace
        clean = ' '.join(clean.split())

        # Tokenize the text
        words = word_tokenize(clean.lower())

        # Remove punctuation and numbers
        words = [word for word in words if word.isalpha()]

        # Remove stopwords
        words = [word for word in words if word not in stop_words]

        # Join the cleaned words back into a string
        return ' '.join(words)

    # Function to apply stemming
    def _apply_stemming(text):
        # Tokenize the text
        words = word_tokenize(text.lower())

        # Apply stemming to each word
        stemmed_words = [stemmer.stem(word) for word in words]

        # Join the stemmed words back into a single string
        return ' '.join(stemmed_words)

    df['text_clean'] = df[column].apply(_clean_text)
    df['text_clean'] = df['text_clean'].apply(_apply_stemming)

    return df
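
A quick check of the cleaning + stemming pipeline on a single row:

```python
# Tags are stripped, stop words dropped, and remaining words Porter-stemmed,
# so inflected variants of the same page text converge before vectorisation.
import pandas as pd

sample = pd.DataFrame({"text": ["The <strong>payments</strong> were processed quickly!"]})
sample = process_data(sample, "text")
print(sample["text_clean"].iloc[0])  # e.g. "payment process quickli"
```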
def identify_similar_pages(input_files:List[str]):

    output_paths = []

    df, output_files = combine_ocr_output_text(input_files)

    output_paths.extend(output_files)

    # Clean text
    df = process_data(df, 'text')

    # Vectorise text
    tfidf_matrix = vectorizer.fit_transform(df['text_clean'])

    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Find the indices of the most similar pages
    np.fill_diagonal(similarity_matrix, 0)  # Ignore self-comparisons
    similar_pages = np.argwhere(similarity_matrix > similarity_threshold)  # Threshold of similarity

    #print(similar_pages)

    # Create a DataFrame for similar pairs and their scores
    similarity_df = pd.DataFrame({
        'Page1_Index': similar_pages[:, 0],
        'Page2_Index': similar_pages[:, 1],
        'Page1_File': similar_pages[:, 0],
        'Page2_File': similar_pages[:, 1],
        'Similarity_Score': similarity_matrix[similar_pages[:, 0], similar_pages[:, 1]]
    })

    # Filter out duplicate pairs (keep only one direction)
    similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]

    # Map the indices to their corresponding text and metadata
    similarity_df['Page1_File'] = similarity_df['Page1_File'].map(df['file'])
    similarity_df['Page2_File'] = similarity_df['Page2_File'].map(df['file'])

    similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(df['page'])
    similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(df['page'])

    similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(df['text'])
    similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(df['text'])

    similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
    similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])

    # Save detailed results to a CSV file
    similarity_file_output_path = output_folder + 'page_similarity_results.csv'
    similarity_df_out.to_csv(similarity_file_output_path, index=False)

    output_paths.append(similarity_file_output_path)

    if not similarity_df_out.empty:
        unique_files = similarity_df_out['Page2_File'].unique()
        for redact_file in unique_files:
            output_file_name = output_folder + redact_file + "_whole_page.csv"
            whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File']==redact_file,:][['Page2_Page']]
            whole_pages_to_redact_df.to_csv(output_file_name, header=None, index=None)

            output_paths.append(output_file_name)

    return similarity_df_out, output_paths
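
The core of `identify_similar_pages` in isolation - vectorise page text with TF-IDF, take pairwise cosine similarity, zero the diagonal, and keep pairs above the threshold:

```python
# Minimal, self-contained demo of the duplicate-page detection step.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

pages = ["annual report summary figures",
         "summary figures annual report",      # same tokens, reordered
         "completely different content here"]
matrix = TfidfVectorizer().fit_transform(pages)
sim = cosine_similarity(matrix)
np.fill_diagonal(sim, 0)                       # ignore self-comparisons
print(np.argwhere(sim > 0.9))                  # [[0 1] [1 0]] - pages 0 and 1 match
```

TF-IDF is order-insensitive, which is why the reordered page scores 1.0 here; in the real pipeline the stemming step additionally lets inflected variants of the same wording match.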
# Perturb text
# Apply the perturbation function with a 10% error probability
def perturb_text_with_errors(series):

    def _perturb_text(text, error_probability=0.1):
        words = text.split()  # Split text into words
        perturbed_words = []

        for word in words:
            if random.random() < error_probability:  # Add a random error
                perturbation_type = random.choice(['char_error', 'extra_space', 'extra_punctuation'])

                if perturbation_type == 'char_error':  # Introduce a character error
                    idx = random.randint(0, len(word) - 1)
                    char = random.choice(string.ascii_lowercase)  # Add a random letter
                    word = word[:idx] + char + word[idx:]

                elif perturbation_type == 'extra_space':  # Add extra space around a word
                    word = ' ' + word + ' '

                elif perturbation_type == 'extra_punctuation':  # Add punctuation to the word
                    punctuation = random.choice(string.punctuation)
                    idx = random.randint(0, len(word))  # Insert punctuation randomly
                    word = word[:idx] + punctuation + word[idx:]

            perturbed_words.append(word)

        return ' '.join(perturbed_words)

    series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))

    return series

# Run through command line
# def main():
#     parser = argparse.ArgumentParser(description='Combine text from multiple CSV files by page')
#     parser.add_argument('input_pattern', help='Input file pattern (e.g., "input/*.csv")')
#     parser.add_argument('--output', '-o', default='combined_text.csv',
#                         help='Output CSV file path (default: combined_text.csv)')

#     args = parser.parse_args()

#     # Get list of input files
#     input_files = glob.glob(args.input_pattern)

#     if not input_files:
#         print(f"No files found matching pattern: {args.input_pattern}")
#         return

#     print(f"Processing {len(input_files)} files...")

#     try:
#         # Combine the text from all files
#         combined_df = combine_ocr_output_text(input_files)

#         # Save to CSV
#         combined_df.to_csv(args.output, index=False)
#         print(f"Successfully created combined output: {args.output}")
#         print(f"Total pages processed: {len(combined_df)}")

#     except Exception as e:
#         print(f"Error processing files: {str(e)}")

# if __name__ == "__main__":
#     main()
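
The perturbation helper presumably exists to test how robust the duplicate detector is to OCR-style noise. A hypothetical check:

```python
# Noise up page text and eyeball that the similarity search would still pair
# it with the clean original.
import pandas as pd

clean = pd.Series(["The payments were processed on the fourth of July"])
noisy = perturb_text_with_errors(clean.copy())
print(noisy.iloc[0])  # same sentence with ~10% of words randomly corrupted
```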
tools/helper_functions.py
CHANGED
@@ -4,23 +4,12 @@ import boto3
 from botocore.exceptions import ClientError
 import gradio as gr
 import pandas as pd
+import numpy as np
 import unicodedata
 from typing import List
 from gradio_image_annotation import image_annotator
 from tools.auth import user_pool_id

-def reset_state_vars():
-    return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
-        label="Modify redaction boxes",
-        label_list=["Redaction"],
-        label_colors=[(0, 0, 0)],
-        show_label=False,
-        sources=None,#["upload"],
-        show_clear_button=False,
-        show_share_button=False,
-        show_remove_button=False,
-        interactive=False
-    ), [], []

 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists
@@ -48,13 +37,40 @@ print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
 input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
 print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')

+# Retrieving or setting CUSTOM_HEADER
+CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
+print(f'CUSTOM_HEADER found')
+
+# Retrieving or setting CUSTOM_HEADER_VALUE
+CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
+print(f'CUSTOM_HEADER_VALUE found')
+
+
+def reset_state_vars():
+    return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
+        label="Modify redaction boxes",
+        label_list=["Redaction"],
+        label_colors=[(0, 0, 0)],
+        show_label=False,
+        sources=None,#["upload"],
+        show_clear_button=False,
+        show_share_button=False,
+        show_remove_button=False,
+        interactive=False
+    ), [], [], [], pd.DataFrame(), pd.DataFrame()
+
+def reset_review_vars():
+    return [], pd.DataFrame(), pd.DataFrame()
+
+
 def load_in_default_allow_list(allow_list_file_path):
     if isinstance(allow_list_file_path, str):
         allow_list_file_path = [allow_list_file_path]
     return allow_list_file_path


-def
+def get_file_name_without_type(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)

@@ -81,6 +97,8 @@ def detect_file_type(filename):
         return 'jpeg'
     elif filename.endswith('.png'):
         return 'png'
+    elif filename.endswith('.xfdf'):
+        return 'xfdf'
     else:
         raise ValueError("Unsupported file type.")

@@ -121,7 +139,7 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
     if regex_file_names:
         regex_file_name = regex_file_names[0]
         custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
-        #regex_file_name_no_ext =
+        #regex_file_name_no_ext = get_file_name_without_type(regex_file_name)

         custom_regex.columns = custom_regex.columns.astype(str)

@@ -215,13 +233,41 @@ def wipe_logs(feedback_logs_loc, usage_logs_loc):
     except Exception as e:
         print("Could not remove usage logs file", e)

-
-
-
+def merge_csv_files(file_list):
+
+    # Initialise an empty list to hold DataFrames
+    dataframes = []
+    output_files = []
+
+    # Loop through each file in the file list
+    for file in file_list:
+        # Read the CSV file into a DataFrame
+        df = pd.read_csv(file.name)
+        dataframes.append(df)
+
+    # Concatenate all DataFrames into a single DataFrame
+    merged_df = pd.concat(dataframes, ignore_index=True)
+
+    for col in ['xmin', 'xmax', 'ymin', 'ymax']:
+        merged_df[col] = np.floor(merged_df[col])
+
+    merged_df = merged_df.drop_duplicates(subset=['page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax'])
+
+    merged_df = merged_df.sort_values(['page', 'ymin', 'xmin', 'label'])
+
+    file_out_name = os.path.basename(file_list[0])
+
+    merged_csv_path = output_folder + file_out_name + "_merged.csv"
+
+    # Save the merged DataFrame to a CSV file
+    #merged_csv = StringIO()
+    merged_df.to_csv(merged_csv_path, index=False)
+    output_files.append(merged_csv_path)
+    #merged_csv.seek(0) # Move to the beginning of the StringIO object
+
+    return output_files

-# Retrieving or setting CUSTOM_HEADER_VALUE
-CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
-print(f'CUSTOM_HEADER_VALUE found')

 async def get_connection_params(request: gr.Request):
     base_folder = ""
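
Reviewer note: the deduplication rule the new `merge_csv_files` applies is worth seeing in isolation - coordinates are floored so review boxes that differ only by sub-pixel amounts collapse to a single row:

```python
# Demo of the box-dedup step from merge_csv_files, on a small frame of
# near-identical review boxes.
import numpy as np
import pandas as pd

boxes = pd.DataFrame({
    "page":  [1, 1],
    "label": ["PERSON", "PERSON"],
    "color": ["(0, 0, 0)", "(0, 0, 0)"],
    "xmin": [100.2, 100.7], "ymin": [200.1, 200.3],
    "xmax": [180.9, 180.5], "ymax": [220.4, 220.2],
})
for col in ["xmin", "xmax", "ymin", "ymax"]:
    boxes[col] = np.floor(boxes[col])
boxes = boxes.drop_duplicates(subset=["page", "label", "color", "xmin", "ymin", "xmax", "ymax"])
print(len(boxes))  # 1 - the two near-identical boxes merge
```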
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -3,9 +3,13 @@ from typing import List
 from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
 from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
 import spacy
+from spacy.matcher import Matcher, PhraseMatcher
+from spaczz.matcher import FuzzyMatcher
 spacy.prefer_gpu()
 from spacy.cli.download import download
+import Levenshtein
 import re
+import gradio as gr

 model_name = "en_core_web_sm" #"en_core_web_trf"
 score_threshold = 0.001
@@ -65,16 +69,8 @@ ukpostcode_pattern = Pattern(
 # Define the recognizer with one or more patterns
 ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])

-
-# Examples for testing
-
-#text = "I live in 510 Broad st SE5 9NG ."
-
-#numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
-#print("Result:")
-#print(numbers_result)
+### Street name

-# %%
 def extract_street_name(text:str) -> str:
     """
     Extracts the street name and preceding word (that should contain at least one number) from the given text.
@@ -101,7 +97,7 @@ def extract_street_name(text:str) -> str:
     pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'

     # Find all matches in text
-    matches = re.finditer(pattern, text, re.IGNORECASE)
+    matches = re.finditer(pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)

     start_positions = []
     end_positions = []
@@ -120,19 +116,6 @@ def extract_street_name(text:str) -> str:

     return start_positions, end_positions

-
-# %%
-# Some examples for testing
-
-#text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
-#text = "Roberto lives in Five 10 Broad st in Oregon"
-#text = "Roberto lives in 55 Oregon Square"
-#text = "There is 51a no way I will do that"
-#text = "I am writing to apply for"
-
-#extract_street_name(text)
-
-# %%
 class StreetNameRecognizer(EntityRecognizer):

     def load(self) -> None:
@@ -163,14 +146,181 @@ class StreetNameRecognizer(EntityRecognizer):

 street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])

+## Custom fuzzy match recogniser for list of strings
+def custom_fuzzy_word_list_regex(text:str, custom_list:List[str]=[]):
+    # Create regex pattern, handling quotes carefully
+
+    quote_str = '"'
+    replace_str = '(?:"|“|”)'
+
+    custom_regex_pattern = '|'.join(
+        rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
+        for term in custom_list
+    )
+
+    # Find all matches in text
+    matches = re.finditer(custom_regex_pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)
+
+    start_positions = []
+    end_positions = []
+
+    for match in matches:
+        start_pos = match.start()
+        end_pos = match.end()
+
+        start_positions.append(start_pos)
+        end_positions.append(end_pos)
+
+    return start_positions, end_positions
+
+def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
+    ''' Conduct fuzzy match on a list of text data.'''
+
+    all_matches = []
+    all_start_positions = []
+    all_end_positions = []
+    all_ratios = []
+
+    #print("custom_query_list:", custom_query_list)
+
+    if not text:
+        out_message = "Prepared data not found. Have you clicked 'Load data' above to prepare a search index?"
+        print(out_message)
+        return out_message, None
+
+    for string_query in custom_query_list:
+
+        #print("text:", text)
+        #print("string_query:", string_query)
+
+        query = nlp(string_query)
+
+        if search_whole_phrase == False:
+            # Keep only words that are not stop words
+            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]
+
+            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
+
+            #print("token_query:", token_query)
+
+            if len(token_query) > 1:
+                #pattern_lemma = [{"LEMMA": {"IN": query}}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
+            else:
+                #pattern_lemma = [{"LEMMA": query[0]}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]

+            matcher = Matcher(nlp.vocab)
+            matcher.add(string_query, [pattern_fuzz])
+            #matcher.add(string_query, [pattern_lemma])
+
+        else:
+            # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
+            #tokenised_query = [string_query.lower()]
+            # If you want to match the whole phrase, use phrase matcher
+            matcher = FuzzyMatcher(nlp.vocab)
 # Create a class inheriting from SpacyNlpEngine
 class LoadedSpacyNlpEngine(SpacyNlpEngine):
     def __init__(self, loaded_spacy_model):
         super().__init__()
         self.nlp = {"en": loaded_spacy_model}

-
-
 # Pass the loaded model to the new LoadedSpacyNlpEngine
 loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)

@@ -186,4 +336,5 @@ nlp_analyser.registry.add_recognizer(street_recogniser)
 nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
 nlp_analyser.registry.add_recognizer(titles_recogniser)
 nlp_analyser.registry.add_recognizer(custom_recogniser)
|
222 |
+
patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
|
223 |
+
matcher.add("PHRASE", patterns, [{"ignore_case": True}])
|
224 |
+
|
225 |
+
batch_size = 256
|
226 |
+
docs = nlp.pipe([text], batch_size=batch_size)
|
227 |
+
|
228 |
+
# Get number of matches per doc
|
229 |
+
for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
|
230 |
+
matches = matcher(doc)
|
231 |
+
match_count = len(matches)
|
232 |
+
|
233 |
+
# If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
|
234 |
+
if search_whole_phrase==False:
|
235 |
+
all_matches.append(match_count)
|
236 |
+
|
237 |
+
for match_id, start, end in matches:
|
238 |
+
span = str(doc[start:end]).strip()
|
239 |
+
query_search = str(query).strip()
|
240 |
+
#print("doc:", doc)
|
241 |
+
#print("span:", span)
|
242 |
+
#print("query_search:", query_search)
|
243 |
+
|
244 |
+
# Convert word positions to character positions
|
245 |
+
start_char = doc[start].idx # Start character position
|
246 |
+
end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
|
247 |
+
|
248 |
+
# The positions here are word position, not character position
|
249 |
+
all_matches.append(match_count)
|
250 |
+
all_start_positions.append(start_char)
|
251 |
+
all_end_positions.append(end_char)
|
252 |
+
|
253 |
+
else:
|
254 |
+
for match_id, start, end, ratio, pattern in matches:
|
255 |
+
span = str(doc[start:end]).strip()
|
256 |
+
query_search = str(query).strip()
|
257 |
+
print("doc:", doc)
|
258 |
+
print("span:", span)
|
259 |
+
print("query_search:", query_search)
|
260 |
+
|
261 |
+
# Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
|
262 |
+
distance = Levenshtein.distance(query_search.lower(), span.lower())
|
263 |
+
|
264 |
+
print("Levenshtein distance:", distance)
|
265 |
+
|
266 |
+
if distance > spelling_mistakes_max:
|
267 |
+
match_count = match_count - 1
|
268 |
+
else:
|
269 |
+
# Convert word positions to character positions
|
270 |
+
start_char = doc[start].idx # Start character position
|
271 |
+
end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
|
272 |
+
|
273 |
+
print("start_char:", start_char)
|
274 |
+
print("end_char:", end_char)
|
275 |
+
|
276 |
+
all_matches.append(match_count)
|
277 |
+
all_start_positions.append(start_char)
|
278 |
+
all_end_positions.append(end_char)
|
279 |
+
all_ratios.append(ratio)
|
280 |
+
|
281 |
+
|
282 |
+
return all_start_positions, all_end_positions
|
283 |
+
|
284 |
+
|
285 |
+
class CustomWordFuzzyRecognizer(EntityRecognizer):
|
286 |
+
def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
|
287 |
+
super().__init__(supported_entities=supported_entities)
|
288 |
+
self.custom_list = custom_list # Store the custom_list as an instance attribute
|
289 |
+
self.spelling_mistakes_max = spelling_mistakes_max # Store the max spelling mistakes
|
290 |
+
self.search_whole_phrase = search_whole_phrase # Store the search whole phrase flag
|
291 |
+
|
292 |
+
def load(self) -> None:
|
293 |
+
"""No loading is required."""
|
294 |
+
pass
|
295 |
+
|
296 |
+
def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
|
297 |
+
"""
|
298 |
+
Logic for detecting a specific PII
|
299 |
+
"""
|
300 |
+
start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase) # Pass new parameters
|
301 |
+
|
302 |
+
results = []
|
303 |
+
|
304 |
+
for i in range(0, len(start_pos)):
|
305 |
+
result = RecognizerResult(
|
306 |
+
entity_type="CUSTOM_FUZZY",
|
307 |
+
start=start_pos[i],
|
308 |
+
end=end_pos[i],
|
309 |
+
score=1
|
310 |
+
)
|
311 |
+
results.append(result)
|
312 |
+
|
313 |
+
return results
|
314 |
+
|
315 |
+
custom_list_default = []
|
316 |
+
custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
|
317 |
+
|
318 |
# Create a class inheriting from SpacyNlpEngine
|
319 |
class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
320 |
def __init__(self, loaded_spacy_model):
|
321 |
super().__init__()
|
322 |
self.nlp = {"en": loaded_spacy_model}
|
323 |
|
|
|
|
|
324 |
# Pass the loaded model to the new LoadedSpacyNlpEngine
|
325 |
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
|
326 |
|
|
|
336 |
nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
|
337 |
nlp_analyser.registry.add_recognizer(titles_recogniser)
|
338 |
nlp_analyser.registry.add_recognizer(custom_recogniser)
|
339 |
+
nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
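And how the new recogniser is reached once registered. A hypothetical call: assigning custom_list directly and the example strings are assumptions for the demo, since the app wires the list in elsewhere.

custom_word_fuzzy_recognizer.custom_list = ["Sean Pedrick-Case"]

results = nlp_analyser.analyze(
    text="Reviewed by Sean Pedrik-Case",  # one spelling mistake away
    entities=["CUSTOM_FUZZY"],
    language="en",
)
for result in results:
    print(result.entity_type, result.start, result.end, result.score)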
tools/redaction_review.py
CHANGED
@@ -1,12 +1,14 @@
 import gradio as gr
 import pandas as pd
 import numpy as np
+from xml.etree.ElementTree import Element, SubElement, tostring, parse
+from xml.dom import minidom
+import uuid
 from typing import List
 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
-
-from tools.
-from tools.helper_functions import get_file_path_end, output_folder
+from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
+from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
 from tools.file_redaction import redact_page_with_pymupdf
 import json
 import os
@@ -66,6 +68,12 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
     for image, items in image_groups.items():
         # Filter items with non-empty boxes
         non_empty_boxes = [item for item in items if item.get('boxes')]
+
+        # Remove 'text' elements from boxes
+        for item in non_empty_boxes:
+            if 'boxes' in item:
+                item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']]
+
         if non_empty_boxes:
            # Keep the first entry with non-empty boxes
            result.append(non_empty_boxes[0])
@@ -173,6 +181,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re

     image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)

+
+
     out_image_annotator = image_annotator(
         value = image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,
@@ -262,7 +272,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d

     for file_path in file_paths:
         #print("file_path:", file_path)
-        file_name_without_ext =
+        file_name_without_ext = get_file_name_without_type(file_path)
         file_name_with_ext = os.path.basename(file_path)

         file_extension = os.path.splitext(file_path)[1].lower()
@@ -381,3 +391,365 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
     row_value_page = evt.row_value[0] # This is the page number value
     return row_value_page

+def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
+    '''
+    Converts coordinates from image space to Adobe PDF space.
+
+    Parameters:
+    - pdf_page_width: Width of the PDF page
+    - pdf_page_height: Height of the PDF page
+    - image_width: Width of the source image
+    - image_height: Height of the source image
+    - x1, y1, x2, y2: Coordinates in image space
+
+    Returns:
+    - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
+    '''
+
+    # Calculate scaling factors
+    scale_width = pdf_page_width / image_width
+    scale_height = pdf_page_height / image_height
+
+    # Convert coordinates
+    pdf_x1 = x1 * scale_width
+    pdf_x2 = x2 * scale_width
+
+    # Convert Y coordinates (flip vertical axis)
+    # Adobe coordinates start from bottom-left
+    pdf_y1 = pdf_page_height - (y1 * scale_height)
+    pdf_y2 = pdf_page_height - (y2 * scale_height)
+
+    # Make sure y1 is always less than y2 for Adobe's coordinate system
+    if pdf_y1 > pdf_y2:
+        pdf_y1, pdf_y2 = pdf_y2, pdf_y1
+
+    return pdf_x1, pdf_y1, pdf_x2, pdf_y2
+
+
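A quick sanity check of the conversion above, with invented numbers: a 595x842 point (A4-ish) page rendered at 2x into a 1190x1684 pixel image, and a box at image pixels (100, 200)-(300, 400).

x1, y1, x2, y2 = convert_image_coords_to_adobe(595, 842, 1190, 1684, 100, 200, 300, 400)
print(x1, y1, x2, y2)  # 50.0 642.0 150.0 742.0, with y flipped to a bottom-left origin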
+def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
+    '''
+    Create an xfdf file from a review csv file and a pdf
+    '''
+
+    # Create root element
+    xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
+
+    # Add header
+    header = SubElement(xfdf, 'header')
+    header.set('pdf-filepath', pdf_path)
+
+    # Add annots
+    annots = SubElement(xfdf, 'annots')
+
+    for _, row in df.iterrows():
+        page_python_format = int(row["page"])-1
+
+        pymupdf_page = pymupdf_doc.load_page(page_python_format)
+
+        pdf_page_height = pymupdf_page.rect.height
+        pdf_page_width = pymupdf_page.rect.width
+
+        image = image_paths[page_python_format]
+
+        #print("image:", image)
+
+        if isinstance(image, str):
+            image = Image.open(image)
+
+        image_page_width, image_page_height = image.size
+
+        # Create redaction annotation
+        redact_annot = SubElement(annots, 'redact')
+
+        # Generate unique ID
+        annot_id = str(uuid.uuid4())
+        redact_annot.set('name', annot_id)
+
+        # Set page number (subtract 1 as PDF pages are 0-based)
+        redact_annot.set('page', str(int(row['page']) - 1))
+
+        # Convert coordinates
+        x1, y1, x2, y2 = convert_image_coords_to_adobe(
+            pdf_page_width,
+            pdf_page_height,
+            image_page_width,
+            image_page_height,
+            row['xmin'],
+            row['ymin'],
+            row['xmax'],
+            row['ymax']
+        )
+
+        if CUSTOM_BOX_COLOUR == "grey":
+            colour_str = "0.5,0.5,0.5"
+        else:
+            colour_str = row['color'].strip('()').replace(' ', '')
+
+        # Set coordinates
+        redact_annot.set('rect', f"{x1:.2f},{y1:.2f},{x2:.2f},{y2:.2f}")
+
+        # Set redaction properties
+        redact_annot.set('title', row['label']) # The type of redaction (e.g., "PERSON")
+        redact_annot.set('contents', row['text']) # The redacted text
+        redact_annot.set('subject', row['label']) # The redaction label again
+        redact_annot.set('mimetype', "Form")
+
+        # Set appearance properties
+        redact_annot.set('border-color', colour_str) # Border colour
+        redact_annot.set('repeat', 'false')
+        redact_annot.set('interior-color', colour_str)
+        #redact_annot.set('fill-color', colour_str)
+        #redact_annot.set('outline-color', colour_str)
+        redact_annot.set('overlay-color', colour_str)
+        redact_annot.set('overlay-text', row['label'])
+        redact_annot.set('opacity', "0.5")
+
+    # Add appearance dictionary
+    # appearanceDict = SubElement(redact_annot, 'appearancedict')
+
+    # # Normal appearance
+    # normal = SubElement(appearanceDict, 'normal')
+    # #normal.set('appearance', 'redact')
+
+    # # Color settings for the mark (before applying redaction)
+    # markAppearance = SubElement(redact_annot, 'markappearance')
+    # markAppearance.set('stroke-color', colour_str) # Red outline
+    # markAppearance.set('fill-color', colour_str) # Light red fill
+    # markAppearance.set('opacity', '0.5') # 50% opacity
+
+    # # Final redaction appearance (after applying)
+    # redactAppearance = SubElement(redact_annot, 'redactAppearance')
+    # redactAppearance.set('fillColor', colour_str) # Black fill
+    # redactAppearance.set('fontName', 'Helvetica')
+    # redactAppearance.set('fontSize', '12')
+    # redactAppearance.set('textAlignment', 'left')
+    # redactAppearance.set('textColor', colour_str) # White text
+
+    # Convert to pretty XML string
+    xml_str = minidom.parseString(tostring(xfdf)).toprettyxml(indent="  ")
+
+    return xml_str
+
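For reference, a minimal sketch of the review-file row shape create_xfdf consumes. The values are invented, and pymupdf_doc and image_paths are assumed to already be in scope as in the functions above.

import pandas as pd

review_df = pd.DataFrame([{
    "page": 1, "xmin": 100, "ymin": 200, "xmax": 300, "ymax": 400,
    "label": "PERSON", "text": "John Smith", "color": "(0, 0, 0)",
}])
xml_str = create_xfdf(review_df, "example.pdf", pymupdf_doc, image_paths)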
+def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
+    '''
+    Load in files to convert a review file into an Adobe comment file format
+    '''
+    output_paths = []
+    pdf_name = ""
+
+    if isinstance(input_files, str):
+        file_paths_list = [input_files]
+    else:
+        file_paths_list = input_files
+
+    # Sort the file paths so that the pdfs come first
+    file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
+
+    for file in file_paths_list:
+
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
+
+        file_path_name = get_file_name_without_type(file_path)
+        file_path_end = detect_file_type(file_path)
+
+        if file_path_end == "pdf":
+            pdf_name = os.path.basename(file_path)
+
+        if file_path_end == "csv":
+            # If no pdf name, just get the name of the file path
+            if not pdf_name:
+                pdf_name = file_path_name
+            # Read CSV file
+            df = pd.read_csv(file_path)
+
+            df.fillna('', inplace=True)  # Replace NaN with an empty string
+
+            xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths)
+
+            output_path = output_folder + file_path_name + "_adobe.xfdf"
+
+            with open(output_path, 'w', encoding='utf-8') as f:
+                f.write(xfdf_content)
+
+            output_paths.append(output_path)
+
+    return output_paths
+
+
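A hypothetical call, passing the original PDF together with its review CSV; the file names are invented and the keyword names simply mirror the signature above.

output_paths = convert_df_to_xfdf(
    ["output/example.pdf", "output/example_review_file.csv"],
    pdf_doc=pymupdf_doc,      # PyMuPDF document already opened by the app
    image_paths=image_paths,  # one rendered page image per PDF page
)
# e.g. ['output/example_review_file_adobe.xfdf'], depending on output_folder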
+### Convert xfdf coordinates back to image for app
+
+def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
+    '''
+    Converts coordinates from Adobe PDF space to image space.
+
+    Parameters:
+    - pdf_page_width: Width of the PDF page
+    - pdf_page_height: Height of the PDF page
+    - image_width: Width of the source image
+    - image_height: Height of the source image
+    - x1, y1, x2, y2: Coordinates in Adobe PDF space
+
+    Returns:
+    - Tuple of converted coordinates (x1, y1, x2, y2) in image space
+    '''
+
+    # Calculate scaling factors
+    scale_width = image_width / pdf_page_width
+    scale_height = image_height / pdf_page_height
+
+    # Convert coordinates
+    image_x1 = x1 * scale_width
+    image_x2 = x2 * scale_width
+
+    # Convert Y coordinates (flip vertical axis)
+    # Adobe coordinates start from bottom-left
+    image_y1 = (pdf_page_height - y1) * scale_height
+    image_y2 = (pdf_page_height - y2) * scale_height
+
+    # Make sure y1 is always less than y2 for image's coordinate system
+    if image_y1 > image_y2:
+        image_y1, image_y2 = image_y2, image_y1
+
+    return image_x1, image_y1, image_x2, image_y2
+
+def parse_xfdf(xfdf_path):
+    '''
+    Parse the XFDF file and extract redaction annotations.
+
+    Parameters:
+    - xfdf_path: Path to the XFDF file
+
+    Returns:
+    - List of dictionaries containing redaction information
+    '''
+    tree = parse(xfdf_path)
+    root = tree.getroot()
+
+    # Define the namespace
+    namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}
+
+    redactions = []
+
+    # Find all redact elements using the namespace
+    for redact in root.findall('.//xfdf:redact', namespaces=namespace):
+
+        #print("redact:", redact)
+
+        redaction_info = {
+            'image': '', # Image will be filled in later
+            'page': int(redact.get('page')) + 1, # Convert to 1-based index
+            'xmin': float(redact.get('rect').split(',')[0]),
+            'ymin': float(redact.get('rect').split(',')[1]),
+            'xmax': float(redact.get('rect').split(',')[2]),
+            'ymax': float(redact.get('rect').split(',')[3]),
+            'label': redact.get('title'),
+            'text': redact.get('contents'),
+            'color': redact.get('border-color', '(0, 0, 0)') # Default to black if not specified
+        }
+        redactions.append(redaction_info)
+
+    print("redactions:", redactions)
+
+    return redactions
+
+def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
+    '''
+    Convert redaction annotations from XFDF and associated images into a DataFrame.
+
+    Parameters:
+    - xfdf_path: Path to the XFDF file
+    - pdf_doc: PyMuPDF document object
+    - image_paths: List of PIL Image objects corresponding to PDF pages
+
+    Returns:
+    - DataFrame containing redaction information
+    '''
+    output_paths = []
+    xfdf_paths = []
+    df = pd.DataFrame()
+
+    #print("Image paths:", image_paths)
+
+    # Sort the file paths so that the pdfs come first
+    file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
+
+    for file in file_paths_list:
+
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
+
+        file_path_name = get_file_name_without_type(file_path)
+        file_path_end = detect_file_type(file_path)
+
+        if file_path_end == "pdf":
+            pdf_name = os.path.basename(file_path)
+            #print("pymupdf_doc:", pymupdf_doc)
+
+            # Add pdf to outputs
+            output_paths.append(file_path)
+
+        if file_path_end == "xfdf":
+
+            if not pdf_name:
+                message = "Original PDF needed to convert from .xfdf format"
+                print(message)
+                raise ValueError(message)
+
+            xfdf_path = file
+
+            # if isinstance(xfdf_paths, str):
+            #     xfdf_path = xfdf_paths.name
+            # else:
+            #     xfdf_path = xfdf_paths[0].name
+
+            file_path_name = get_file_name_without_type(xfdf_path)
+
+            #print("file_path_name:", file_path_name)
+
+            # Parse the XFDF file
+            redactions = parse_xfdf(xfdf_path)
+
+            # Create a DataFrame from the redaction information
+            df = pd.DataFrame(redactions)
+
+            df.fillna('', inplace=True) # Replace NaN with an empty string
+
+            for _, row in df.iterrows():
+                page_python_format = int(row["page"])-1
+
+                pymupdf_page = pymupdf_doc.load_page(page_python_format)
+
+                pdf_page_height = pymupdf_page.rect.height
+                pdf_page_width = pymupdf_page.rect.width
+
+                image_path = image_paths[page_python_format]
+
+                #print("image_path:", image_path)
+
+                if isinstance(image_path, str):
+                    image = Image.open(image_path)
+
+                image_page_width, image_page_height = image.size
+
+                # Convert to image coordinates
+                image_x1, image_y1, image_x2, image_y2 = convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_page_width, image_page_height, row['xmin'], row['ymin'], row['xmax'], row['ymax'])
+
+                df.loc[_, ['xmin', 'ymin', 'xmax', 'ymax']] = [image_x1, image_y1, image_x2, image_y2]
+
+                # Optionally, you can add the image path or other relevant information
+                #print("Image path:", image_path)
+                df.loc[_, 'image'] = image_path
+
+            #print('row:', row)
+
+            out_file_path = output_folder + file_path_name + "_review_file.csv"
+            df.to_csv(out_file_path, index=None)
+
+            output_paths.append(out_file_path)
+
+    return output_paths
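Closing the loop, a sketch of the reverse direction (file names invented; PIL's Image is assumed to be imported at module level, as the function relies on it): after editing redactions in Acrobat, the exported .xfdf plus the original PDF regenerate a review CSV.

output_paths = convert_xfdf_to_dataframe(
    ["output/example.pdf", "output/example_adobe.xfdf"],
    pymupdf_doc,
    image_paths,
)
# e.g. ['output/example.pdf', 'output/example_adobe_review_file.csv']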