Commit d60759d
Parent(s): 64ab318

Added example data files. Greatly revised the CLI for redaction, deduplication, and AWS Textract batch calls. Various minor fixes and package updates.
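The largest change in app.py below wraps the two deduplication entry points in timing functions that store the elapsed time in module-level globals so it can be picked up when usage logs are written. A minimal sketch of that pattern, simplified from the actual wrappers in the app.py diff (the real versions pass the full parameter lists through explicitly):

import time
from tools.find_duplicate_pages import run_duplicate_analysis  # project function, as imported in app.py

# Module-level store, read later when the usage log entry is assembled
duplicate_analysis_time_taken = 0.0

def run_duplicate_analysis_with_timing(*args, **kwargs):
    # Call the real analysis function and record how long it took
    global duplicate_analysis_time_taken
    start = time.time()
    results = run_duplicate_analysis(*args, **kwargs)
    duplicate_analysis_time_taken = time.time() - start
    return results

Keeping the timing in a global leaves the Gradio event handler signatures unchanged, at the cost of not being safe if two analyses run concurrently.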
- .dockerignore +0 -4
- .gitattributes +7 -0
- .gitignore +2 -5
- app.py +153 -62
- cli_redact.py +655 -118
- example_data/Bold minimalist professional cover letter.docx +3 -0
- example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv +0 -0
- example_data/Partnership-Agreement-Toolkit_0_0.pdf +3 -0
- example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv +2 -0
- example_data/combined_case_notes.csv +19 -0
- example_data/doubled_output_joined.pdf +3 -0
- example_data/example_complaint_letter.jpg +3 -0
- example_data/example_of_emails_sent_to_a_professor_before_applying.pdf +3 -0
- example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv +277 -0
- example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv +77 -0
- example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_results_with_words_textract.csv +0 -0
- example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv +0 -0
- example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv +40 -0
- example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_results_with_words_textract.csv +432 -0
- example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv +15 -0
- example_data/graduate-job-example-cover-letter.pdf +3 -0
- example_data/partnership_toolkit_redact_custom_deny_list.csv +4 -0
- example_data/partnership_toolkit_redact_some_pages.csv +2 -0
- example_data/test_allow_list_graduate.csv +1 -0
- example_data/test_allow_list_partnership.csv +1 -0
- lambda_entrypoint.py +61 -19
- pyproject.toml +2 -2
- requirements.txt +2 -2
- tools/aws_functions.py +2 -2
- tools/aws_textract.py +2 -0
- tools/cli_usage_logger.py +302 -0
- tools/config.py +18 -8
- tools/custom_csvlogger.py +2 -4
- tools/custom_image_analyser_engine.py +10 -4
- tools/data_anonymise.py +67 -47
- tools/example_cli_calls.txt +0 -30
- tools/file_redaction.py +118 -43
- tools/find_duplicate_pages.py +13 -5
- tools/find_duplicate_tabular.py +268 -75
- tools/helper_functions.py +9 -3
- tools/load_spacy_model_custom_recognisers.py +5 -2
- tools/textract_batch_call.py +52 -17
.dockerignore
CHANGED
@@ -1,8 +1,4 @@
-*.csv
-*.pdf
*.url
-*.jpg
-*.png
*.ipynb
*.pyc
examples/*
.gitattributes
ADDED
@@ -0,0 +1,7 @@
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.xls filter=lfs diff=lfs merge=lfs -text
+*.xlsx filter=lfs diff=lfs merge=lfs -text
+*.docx filter=lfs diff=lfs merge=lfs -text
+*.doc filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
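These are the attribute lines that `git lfs track` generates: matching PDF, image, and Office files (including the new example_data documents) are stored as Git LFS pointers rather than committed directly, which is why those binaries show up as small three-line additions in the file list above.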
.gitignore
CHANGED
@@ -1,8 +1,4 @@
-*.csv
-*.pdf
*.url
-*.jpg
-*.png
*.ipynb
*.pyc
examples/*
@@ -17,8 +13,9 @@ build/*
dist/*
build_deps/*
logs/*
+usage/*
+feedback/*
config/*
-doc_redaction_amplify_app/*
user_guide/*
cdk/config/*
cdk/cdk.out/*
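The new usage/* and feedback/* entries are presumably the local folders that the revised usage and feedback logging writes to, keeping generated logs out of version control, while doc_redaction_amplify_app/* is no longer ignored.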
app.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import pandas as pd
|
3 |
import gradio as gr
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
-
from tools.config import DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH,
|
6 |
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
@@ -14,6 +14,7 @@ from tools.load_spacy_model_custom_recognisers import custom_entities
|
|
14 |
from tools.custom_csvlogger import CSVLogger_custom
|
15 |
from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates, run_full_search_and_analysis
|
16 |
from tools.find_duplicate_tabular import update_tabular_column_choices, run_tabular_duplicate_detection, handle_tabular_row_selection, clean_tabular_duplicates
|
|
|
17 |
from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
|
18 |
|
19 |
# Suppress downcasting warnings
|
@@ -48,6 +49,8 @@ if USE_GREEDY_DUPLICATE_DETECTION == "True": USE_GREEDY_DUPLICATE_DETECTION = Tr
|
|
48 |
else: USE_GREEDY_DUPLICATE_DETECTION = False
|
49 |
if DEFAULT_COMBINE_PAGES == "True": DEFAULT_COMBINE_PAGES = True
|
50 |
else: DEFAULT_COMBINE_PAGES = False
|
|
|
|
|
51 |
|
52 |
if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
|
53 |
if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)
|
@@ -73,6 +76,58 @@ FULL_COMPREHEND_ENTITY_LIST.extend(custom_entities)
|
|
73 |
|
74 |
FILE_INPUT_HEIGHT = int(FILE_INPUT_HEIGHT)
|
75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
# Create the gradio interface
|
77 |
app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True) #gr.themes.Base()
|
78 |
|
@@ -185,7 +240,7 @@ with app:
|
|
185 |
# S3 settings for default allow list load
|
186 |
s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=DOCUMENT_REDACTION_BUCKET, visible=False)
|
187 |
s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
|
188 |
-
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=
|
189 |
|
190 |
s3_whole_document_textract_default_bucket = gr.Textbox(label = "Default Textract whole_document S3 bucket", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, visible=False)
|
191 |
s3_whole_document_textract_input_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
|
@@ -255,6 +310,7 @@ with app:
|
|
255 |
textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
|
256 |
selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
|
257 |
is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
|
|
|
258 |
job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
|
259 |
job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
|
260 |
|
@@ -336,13 +392,13 @@ with app:
|
|
336 |
textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
|
337 |
with gr.Column():
|
338 |
job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
|
339 |
-
convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results", variant="secondary", visible=True)
|
340 |
|
341 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
|
342 |
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
|
343 |
|
344 |
-
with gr.Row(
|
345 |
-
redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
|
346 |
output_file = gr.File(label="Output files", scale = 2)#, height=FILE_INPUT_HEIGHT)
|
347 |
latest_file_completed_num = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
|
348 |
|
@@ -558,13 +614,13 @@ with app:
|
|
558 |
|
559 |
with gr.Accordion("Anonymisation output format - by default will replace PII with a blank space", open = False):
|
560 |
with gr.Row():
|
561 |
-
|
562 |
do_initial_clean = gr.Checkbox(label="Do initial clean of text (remove URLs, HTML tags, and non-ASCII characters)", value=DO_INITIAL_TABULAR_DATA_CLEAN)
|
563 |
|
564 |
tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")
|
565 |
|
566 |
-
with gr.Row(
|
567 |
-
text_output_summary = gr.Textbox(label="Output result")
|
568 |
text_output_file = gr.File(label="Output files")
|
569 |
text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False, visible=False)
|
570 |
|
@@ -574,21 +630,21 @@ with app:
|
|
574 |
# TABULAR DUPLICATE DETECTION TAB
|
575 |
###
|
576 |
with gr.Accordion(label="Find duplicate cells in tabular data", open=False):
|
577 |
-
gr.Markdown("""Find duplicate cells or rows in CSV
|
578 |
|
579 |
with gr.Accordion("Step 1: Upload files and configure analysis", open=True):
|
580 |
in_tabular_duplicate_files = gr.File(
|
581 |
-
label="Upload CSV or
|
582 |
file_count="multiple",
|
583 |
file_types=['.csv', '.xlsx', '.xls', '.parquet'],
|
584 |
height=FILE_INPUT_HEIGHT
|
585 |
)
|
586 |
|
587 |
-
with gr.Row():
|
588 |
tabular_duplicate_threshold = gr.Number(
|
589 |
value=DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
|
590 |
label="Similarity threshold",
|
591 |
-
info="Score (0-1) to consider cells a match.
|
592 |
)
|
593 |
tabular_min_word_count = gr.Number(
|
594 |
value=DEFAULT_MIN_WORD_COUNT,
|
@@ -596,13 +652,17 @@ with app:
|
|
596 |
info="Cells with fewer words than this are ignored."
|
597 |
)
|
598 |
do_initial_clean_dup = gr.Checkbox(label="Do initial clean of text (remove URLs, HTML tags, and non-ASCII characters)", value=DO_INITIAL_TABULAR_DATA_CLEAN)
|
|
|
599 |
|
600 |
-
|
601 |
-
choices=
|
602 |
-
|
603 |
-
|
604 |
-
|
605 |
-
|
|
|
|
|
|
|
606 |
|
607 |
find_tabular_duplicates_btn = gr.Button(
|
608 |
value="Find duplicate cells/rows",
|
@@ -621,7 +681,7 @@ with app:
|
|
621 |
show_copy_button=True
|
622 |
)
|
623 |
|
624 |
-
with gr.Row():
|
625 |
tabular_selected_row_index = gr.Number(value=None, visible=False)
|
626 |
tabular_text1_preview = gr.Textbox(
|
627 |
label="Text from File 1",
|
@@ -775,11 +835,11 @@ with app:
|
|
775 |
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
|
776 |
success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
|
777 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
|
778 |
-
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], api_name="redact_doc", show_progress_on=[redaction_output_summary_textbox])
|
779 |
|
780 |
# If a file has been completed, the function will continue onto the next document
|
781 |
latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
|
782 |
-
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], show_progress_on=[redaction_output_summary_textbox]).\
|
783 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state], show_progress_on=[annotator]).\
|
784 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
785 |
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
@@ -791,7 +851,7 @@ with app:
|
|
791 |
all_page_line_level_ocr_results_with_words_df_base.change(reset_ocr_with_words_base_dataframe, inputs=[all_page_line_level_ocr_results_with_words_df_base, page_entity_dropdown_redaction], outputs=[all_page_line_level_ocr_results_with_words_df, backup_all_page_line_level_ocr_results_with_words_df_base])
|
792 |
|
793 |
# Send whole document to Textract for text extraction
|
794 |
-
send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number, total_pdf_page_count], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call, textract_query_number]).\
|
795 |
success(check_for_provided_job_id, inputs=[job_id_textbox]).\
|
796 |
success(poll_whole_document_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_whole_document_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df, doc_file_name_no_extension_textbox]).\
|
797 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
@@ -809,7 +869,7 @@ with app:
|
|
809 |
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
810 |
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
811 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
|
812 |
-
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], show_progress_on=[redaction_output_summary_textbox]).\
|
813 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state], show_progress_on=[annotator])
|
814 |
|
815 |
###
|
@@ -1021,10 +1081,10 @@ with app:
|
|
1021 |
success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list, total_pdf_page_count])
|
1022 |
|
1023 |
tabular_data_redact_btn.click(reset_data_vars, outputs=[actual_time_taken_number, log_files_output_list_state, comprehend_query_number]).\
|
1024 |
-
success(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text,
|
1025 |
|
1026 |
# If the output file count text box changes, keep going with redacting each data file until done
|
1027 |
-
text_tabular_files_done.change(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text,
|
1028 |
success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
1029 |
|
1030 |
###
|
@@ -1039,12 +1099,15 @@ with app:
|
|
1039 |
min_word_count_input,
|
1040 |
min_consecutive_pages_input,
|
1041 |
greedy_match_input,
|
1042 |
-
combine_page_text_for_duplicates_bool
|
|
|
1043 |
],
|
1044 |
outputs=[
|
1045 |
results_df_preview,
|
1046 |
duplicate_files_out,
|
1047 |
-
full_duplicate_data_by_file
|
|
|
|
|
1048 |
]
|
1049 |
)
|
1050 |
|
@@ -1076,15 +1139,13 @@ with app:
|
|
1076 |
|
1077 |
# Event handlers
|
1078 |
in_tabular_duplicate_files.upload(
|
1079 |
-
fn=
|
1080 |
-
|
1081 |
-
outputs=[tabular_text_columns]
|
1082 |
-
)
|
1083 |
|
1084 |
find_tabular_duplicates_btn.click(
|
1085 |
fn=run_tabular_duplicate_detection,
|
1086 |
-
inputs=[in_tabular_duplicate_files, tabular_duplicate_threshold, tabular_min_word_count, tabular_text_columns, output_folder_textbox, do_initial_clean_dup],
|
1087 |
-
outputs=[tabular_results_df, tabular_cleaned_file, tabular_file_to_clean], api_name="tabular_clean_duplicates", show_progress_on=[tabular_results_df]
|
1088 |
)
|
1089 |
|
1090 |
tabular_results_df.select(
|
@@ -1095,7 +1156,7 @@ with app:
|
|
1095 |
|
1096 |
clean_duplicates_btn.click(
|
1097 |
fn=clean_tabular_duplicates,
|
1098 |
-
inputs=[tabular_file_to_clean, tabular_results_df, output_folder_textbox],
|
1099 |
outputs=[tabular_cleaned_file]
|
1100 |
)
|
1101 |
|
@@ -1182,15 +1243,15 @@ with app:
|
|
1182 |
pdf_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
|
1183 |
data_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
|
1184 |
|
1185 |
-
if DISPLAY_FILE_NAMES_IN_LOGS ==
|
1186 |
# User submitted feedback for pdf redactions
|
1187 |
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
|
1188 |
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
|
1189 |
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
1190 |
|
1191 |
# User submitted feedback for data redactions
|
1192 |
-
data_callback.setup([data_feedback_radio, data_further_details_text,
|
1193 |
-
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text,
|
1194 |
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
1195 |
else:
|
1196 |
# User submitted feedback for pdf redactions
|
@@ -1199,7 +1260,7 @@ with app:
|
|
1199 |
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
1200 |
|
1201 |
# User submitted feedback for data redactions
|
1202 |
-
data_callback.setup([data_feedback_radio, data_further_details_text,
|
1203 |
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
|
1204 |
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
1205 |
|
@@ -1207,27 +1268,41 @@ with app:
|
|
1207 |
# Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
|
1208 |
usage_callback = CSVLogger_custom(dataset_file_name=USAGE_LOG_FILE_NAME)
|
1209 |
|
1210 |
-
if DISPLAY_FILE_NAMES_IN_LOGS ==
|
1211 |
-
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox,
|
|
|
|
|
|
|
1212 |
|
1213 |
-
|
1214 |
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
1215 |
|
1216 |
-
|
1217 |
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
1218 |
|
1219 |
-
|
|
|
|
|
|
|
|
|
1220 |
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
1221 |
else:
|
1222 |
-
usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
|
|
|
|
|
|
|
1223 |
|
1224 |
-
|
1225 |
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
1226 |
|
1227 |
-
|
1228 |
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
1229 |
|
1230 |
-
|
|
|
|
|
|
|
|
|
1231 |
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
1232 |
|
1233 |
if __name__ == "__main__":
|
@@ -1252,46 +1327,62 @@ if __name__ == "__main__":
|
|
1252 |
'task': DIRECT_MODE_TASK,
|
1253 |
'input_file': DIRECT_MODE_INPUT_FILE,
|
1254 |
'output_dir': DIRECT_MODE_OUTPUT_DIR,
|
|
|
1255 |
'language': DEFAULT_LANGUAGE,
|
1256 |
'allow_list': ALLOW_LIST_PATH,
|
1257 |
'pii_detector': LOCAL_PII_OPTION,
|
|
|
|
|
1258 |
'aws_access_key': AWS_ACCESS_KEY,
|
1259 |
'aws_secret_key': AWS_SECRET_KEY,
|
1260 |
'aws_region': AWS_REGION,
|
1261 |
's3_bucket': DOCUMENT_REDACTION_BUCKET,
|
1262 |
'do_initial_clean': DO_INITIAL_TABULAR_DATA_CLEAN,
|
1263 |
'save_logs_to_csv': SAVE_LOGS_TO_CSV,
|
|
|
1264 |
'display_file_names_in_logs': DISPLAY_FILE_NAMES_IN_LOGS,
|
|
|
|
|
1265 |
'ocr_method': TESSERACT_TEXT_EXTRACT_OPTION,
|
1266 |
'page_min': DEFAULT_PAGE_MIN,
|
1267 |
'page_max': DEFAULT_PAGE_MAX,
|
1268 |
-
'prepare_for_review': False,
|
1269 |
-
'prepare_images': True,
|
1270 |
-
'no_images': False,
|
1271 |
'images_dpi': IMAGES_DPI,
|
1272 |
-
'max_image_pixels': None,
|
1273 |
-
'load_truncated_images': True,
|
1274 |
'chosen_local_ocr_model': CHOSEN_LOCAL_OCR_MODEL,
|
1275 |
'preprocess_local_ocr_images': PREPROCESS_LOCAL_OCR_IMAGES,
|
1276 |
'compress_redacted_pdf': COMPRESS_REDACTED_PDF,
|
1277 |
'return_pdf_end_of_redaction': RETURN_PDF_END_OF_REDACTION,
|
1278 |
-
'
|
1279 |
-
'
|
1280 |
-
'
|
1281 |
-
'
|
1282 |
-
'
|
1283 |
-
'columns': DEFAULT_TEXT_COLUMNS,
|
1284 |
'excel_sheets': DEFAULT_EXCEL_SHEETS,
|
1285 |
-
'deny_list': OUTPUT_DENY_LIST_PATH,
|
1286 |
'fuzzy_mistakes': DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
|
|
|
1287 |
'duplicate_type': DIRECT_MODE_DUPLICATE_TYPE,
|
1288 |
'similarity_threshold': DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
|
1289 |
'min_word_count': DEFAULT_MIN_WORD_COUNT,
|
1290 |
'min_consecutive_pages': DEFAULT_MIN_CONSECUTIVE_PAGES,
|
1291 |
'greedy_match': USE_GREEDY_DUPLICATE_DETECTION,
|
1292 |
'combine_pages': DEFAULT_COMBINE_PAGES,
|
1293 |
-
'search_query': DEFAULT_SEARCH_QUERY
|
1294 |
-
'text_columns': DEFAULT_TEXT_COLUMNS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1295 |
}
|
1296 |
|
1297 |
print(f"Running in direct mode with task: {DIRECT_MODE_TASK}")
|
@@ -1306,6 +1397,6 @@ if __name__ == "__main__":
|
|
1306 |
print(f"Search query: {DEFAULT_SEARCH_QUERY}")
|
1307 |
if DEFAULT_TEXT_COLUMNS:
|
1308 |
print(f"Text columns: {DEFAULT_TEXT_COLUMNS}")
|
1309 |
-
|
1310 |
# Run the CLI main function with direct mode arguments
|
1311 |
main(direct_mode_args=direct_mode_args)
|
|
|
2 |
import pandas as pd
|
3 |
import gradio as gr
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
+
from tools.config import DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_LANGUAGE_FULL_NAME, SHOW_LANGUAGE_SELECTION, DO_INITIAL_TABULAR_DATA_CLEAN, DEFAULT_DUPLICATE_DETECTION_THRESHOLD, DIRECT_MODE_TASK, DIRECT_MODE_INPUT_FILE, DIRECT_MODE_OUTPUT_DIR, DIRECT_MODE_DUPLICATE_TYPE, DIRECT_MODE_DEFAULT_USER, LOCAL_PII_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, DOCUMENT_REDACTION_BUCKET, IMAGES_DPI, PREPROCESS_LOCAL_OCR_IMAGES, COMPRESS_REDACTED_PDF, RETURN_PDF_END_OF_REDACTION, DENY_LIST_PATH, WHOLE_PAGE_REDACTION_LIST_PATH, DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, DEFAULT_PAGE_MIN, DEFAULT_PAGE_MAX, DEFAULT_EXCEL_SHEETS, USE_GREEDY_DUPLICATE_DETECTION, DEFAULT_MIN_CONSECUTIVE_PAGES, DEFAULT_COMBINE_PAGES, DEFAULT_MIN_WORD_COUNT, DEFAULT_TEXT_COLUMNS, DEFAULT_SEARCH_QUERY, REMOVE_DUPLICATE_ROWS
|
6 |
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
|
|
14 |
from tools.custom_csvlogger import CSVLogger_custom
|
15 |
from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates, run_full_search_and_analysis
|
16 |
from tools.find_duplicate_tabular import update_tabular_column_choices, run_tabular_duplicate_detection, handle_tabular_row_selection, clean_tabular_duplicates
|
17 |
+
import time
|
18 |
from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
|
19 |
|
20 |
# Suppress downcasting warnings
|
|
|
49 |
else: USE_GREEDY_DUPLICATE_DETECTION = False
|
50 |
if DEFAULT_COMBINE_PAGES == "True": DEFAULT_COMBINE_PAGES = True
|
51 |
else: DEFAULT_COMBINE_PAGES = False
|
52 |
+
if REMOVE_DUPLICATE_ROWS == "True": REMOVE_DUPLICATE_ROWS = True
|
53 |
+
else: REMOVE_DUPLICATE_ROWS = False
|
54 |
|
55 |
if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
|
56 |
if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)
|
|
|
76 |
|
77 |
FILE_INPUT_HEIGHT = int(FILE_INPUT_HEIGHT)
|
78 |
|
79 |
+
# Wrapper functions to add timing to deduplication functions
|
80 |
+
def run_duplicate_analysis_with_timing(files, threshold, min_words, min_consecutive, greedy_match, combine_pages, output_folder):
|
81 |
+
"""
|
82 |
+
Wrapper for run_duplicate_analysis that adds timing and returns time taken.
|
83 |
+
"""
|
84 |
+
start_time = time.time()
|
85 |
+
results_df, output_paths, full_data_by_file = run_duplicate_analysis(
|
86 |
+
files=files,
|
87 |
+
threshold=threshold,
|
88 |
+
min_words=min_words,
|
89 |
+
min_consecutive=min_consecutive,
|
90 |
+
greedy_match=greedy_match,
|
91 |
+
combine_pages=combine_pages,
|
92 |
+
output_folder=output_folder
|
93 |
+
)
|
94 |
+
end_time = time.time()
|
95 |
+
processing_time = end_time - start_time
|
96 |
+
|
97 |
+
# Store the time taken in a global variable for logging
|
98 |
+
global duplicate_analysis_time_taken
|
99 |
+
duplicate_analysis_time_taken = processing_time
|
100 |
+
|
101 |
+
return results_df, output_paths, full_data_by_file
|
102 |
+
|
103 |
+
def run_tabular_duplicate_detection_with_timing(files, threshold, min_words, text_columns, output_folder, do_initial_clean_dup, in_excel_tabular_sheets, remove_duplicate_rows):
|
104 |
+
"""
|
105 |
+
Wrapper for run_tabular_duplicate_detection that adds timing and returns time taken.
|
106 |
+
"""
|
107 |
+
start_time = time.time()
|
108 |
+
results_df, output_paths, file_choices = run_tabular_duplicate_detection(
|
109 |
+
files=files,
|
110 |
+
threshold=threshold,
|
111 |
+
min_words=min_words,
|
112 |
+
text_columns=text_columns,
|
113 |
+
output_folder=output_folder,
|
114 |
+
do_initial_clean_dup=do_initial_clean_dup,
|
115 |
+
in_excel_tabular_sheets=in_excel_tabular_sheets,
|
116 |
+
remove_duplicate_rows=remove_duplicate_rows
|
117 |
+
)
|
118 |
+
end_time = time.time()
|
119 |
+
processing_time = end_time - start_time
|
120 |
+
|
121 |
+
# Store the time taken in a global variable for logging
|
122 |
+
global tabular_duplicate_analysis_time_taken
|
123 |
+
tabular_duplicate_analysis_time_taken = processing_time
|
124 |
+
|
125 |
+
return results_df, output_paths, file_choices
|
126 |
+
|
127 |
+
# Initialize global variables for timing
|
128 |
+
duplicate_analysis_time_taken = 0.0
|
129 |
+
tabular_duplicate_analysis_time_taken = 0.0
|
130 |
+
|
131 |
# Create the gradio interface
|
132 |
app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True) #gr.themes.Base()
|
133 |
|
|
|
240 |
# S3 settings for default allow list load
|
241 |
s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=DOCUMENT_REDACTION_BUCKET, visible=False)
|
242 |
s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
|
243 |
+
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=ALLOW_LIST_PATH, visible=False)
|
244 |
|
245 |
s3_whole_document_textract_default_bucket = gr.Textbox(label = "Default Textract whole_document S3 bucket", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, visible=False)
|
246 |
s3_whole_document_textract_input_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
|
|
|
310 |
textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
|
311 |
selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
|
312 |
is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
|
313 |
+
task_textbox = gr.Textbox(value="redact", label="task", visible=False) # Track the task being performed
|
314 |
job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
|
315 |
job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
|
316 |
|
|
|
392 |
textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
|
393 |
with gr.Column():
|
394 |
job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
|
395 |
+
convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results", variant="secondary", visible=True)
|
396 |
|
397 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
|
398 |
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
|
399 |
|
400 |
+
with gr.Row():
|
401 |
+
redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1, lines=4)
|
402 |
output_file = gr.File(label="Output files", scale = 2)#, height=FILE_INPUT_HEIGHT)
|
403 |
latest_file_completed_num = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
|
404 |
|
|
|
614 |
|
615 |
with gr.Accordion("Anonymisation output format - by default will replace PII with a blank space", open = False):
|
616 |
with gr.Row():
|
617 |
+
anon_strategy = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask"], label="Select an anonymisation method.", value = DEFAULT_TABULAR_ANONYMISATION_STRATEGY) # , "encrypt", "fake_first_name" are also available, but are not currently included as not that useful in current form
|
618 |
do_initial_clean = gr.Checkbox(label="Do initial clean of text (remove URLs, HTML tags, and non-ASCII characters)", value=DO_INITIAL_TABULAR_DATA_CLEAN)
|
619 |
|
620 |
tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")
|
621 |
|
622 |
+
with gr.Row():
|
623 |
+
text_output_summary = gr.Textbox(label="Output result", lines=4)
|
624 |
text_output_file = gr.File(label="Output files")
|
625 |
text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False, visible=False)
|
626 |
|
|
|
630 |
# TABULAR DUPLICATE DETECTION TAB
|
631 |
###
|
632 |
with gr.Accordion(label="Find duplicate cells in tabular data", open=False):
|
633 |
+
gr.Markdown("""Find duplicate cells or rows in CSV, Excel, or Parquet files. This tool analyzes text content across all columns to identify similar or identical entries that may be duplicates. You can review the results and choose to remove duplicate rows from your files.""")
|
634 |
|
635 |
with gr.Accordion("Step 1: Upload files and configure analysis", open=True):
|
636 |
in_tabular_duplicate_files = gr.File(
|
637 |
+
label="Upload CSV, Excel, or Parquet files to find duplicate cells/rows. Note that the app will remove duplicates from later cells/files that are found in earlier cells/files and not vice versa.",
|
638 |
file_count="multiple",
|
639 |
file_types=['.csv', '.xlsx', '.xls', '.parquet'],
|
640 |
height=FILE_INPUT_HEIGHT
|
641 |
)
|
642 |
|
643 |
+
with gr.Row(equal_height=True):
|
644 |
tabular_duplicate_threshold = gr.Number(
|
645 |
value=DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
|
646 |
label="Similarity threshold",
|
647 |
+
info="Score (0-1) to consider cells a match. 1 = perfect match."
|
648 |
)
|
649 |
tabular_min_word_count = gr.Number(
|
650 |
value=DEFAULT_MIN_WORD_COUNT,
|
|
|
652 |
info="Cells with fewer words than this are ignored."
|
653 |
)
|
654 |
do_initial_clean_dup = gr.Checkbox(label="Do initial clean of text (remove URLs, HTML tags, and non-ASCII characters)", value=DO_INITIAL_TABULAR_DATA_CLEAN)
|
655 |
+
remove_duplicate_rows = gr.Checkbox(label="Remove duplicate rows from deduplicated files", value=REMOVE_DUPLICATE_ROWS)
|
656 |
|
657 |
+
with gr.Row():
|
658 |
+
in_excel_tabular_sheets = gr.Dropdown(choices=list(), multiselect = True, label="Select Excel sheet names that you want to deduplicate (showing sheets present across all Excel files).", visible=True, allow_custom_value=True)
|
659 |
+
|
660 |
+
tabular_text_columns = gr.Dropdown(
|
661 |
+
choices=DEFAULT_TEXT_COLUMNS,
|
662 |
+
multiselect=True,
|
663 |
+
label="Select specific columns to analyse (leave empty to analyse all text columns simultaneously - i.e. all text is joined together)",
|
664 |
+
info="If no columns selected, all text columns will combined together and analysed"
|
665 |
+
)
|
666 |
|
667 |
find_tabular_duplicates_btn = gr.Button(
|
668 |
value="Find duplicate cells/rows",
|
|
|
681 |
show_copy_button=True
|
682 |
)
|
683 |
|
684 |
+
with gr.Row(equal_height=True):
|
685 |
tabular_selected_row_index = gr.Number(value=None, visible=False)
|
686 |
tabular_text1_preview = gr.Textbox(
|
687 |
label="Text from File 1",
|
|
|
835 |
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
|
836 |
success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
|
837 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
|
838 |
+
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state, task_textbox], api_name="redact_doc", show_progress_on=[redaction_output_summary_textbox])
|
839 |
|
840 |
# If a file has been completed, the function will continue onto the next document
|
841 |
latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
|
842 |
+
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state, task_textbox], show_progress_on=[redaction_output_summary_textbox]).\
|
843 |
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state], show_progress_on=[annotator]).\
|
844 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
845 |
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
|
|
|
851 |
all_page_line_level_ocr_results_with_words_df_base.change(reset_ocr_with_words_base_dataframe, inputs=[all_page_line_level_ocr_results_with_words_df_base, page_entity_dropdown_redaction], outputs=[all_page_line_level_ocr_results_with_words_df, backup_all_page_line_level_ocr_results_with_words_df_base])
|
852 |
|
853 |
# Send whole document to Textract for text extraction
|
854 |
+
send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number, total_pdf_page_count], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call, textract_query_number, task_textbox]).\
|
855 |
success(check_for_provided_job_id, inputs=[job_id_textbox]).\
|
856 |
success(poll_whole_document_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_whole_document_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df, doc_file_name_no_extension_textbox]).\
|
857 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
|
|
869 |
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
870 |
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
871 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
|
872 |
+
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state, task_textbox], show_progress_on=[redaction_output_summary_textbox]).\
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state], show_progress_on=[annotator])
###
success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list, total_pdf_page_count])
tabular_data_redact_btn.click(reset_data_vars, outputs=[actual_time_taken_number, log_files_output_list_state, comprehend_query_number]).\
success(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strategy, in_colnames, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number, do_initial_clean, chosen_language_drop], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number, comprehend_query_number], api_name="redact_data")
# If the output file count text box changes, keep going with redacting each data file until done
text_tabular_files_done.change(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strategy, in_colnames, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number, do_initial_clean, chosen_language_drop], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number, comprehend_query_number]).\
success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
###
min_word_count_input,
min_consecutive_pages_input,
greedy_match_input,
combine_page_text_for_duplicates_bool,
output_folder_textbox
],
outputs=[
results_df_preview,
duplicate_files_out,
full_duplicate_data_by_file,
actual_time_taken_number,
task_textbox
]
)

# Event handlers
in_tabular_duplicate_files.upload(
fn=put_columns_in_df, inputs=[in_tabular_duplicate_files], outputs=[tabular_text_columns, in_excel_tabular_sheets])
find_tabular_duplicates_btn.click(
fn=run_tabular_duplicate_detection,
inputs=[in_tabular_duplicate_files, tabular_duplicate_threshold, tabular_min_word_count, tabular_text_columns, output_folder_textbox, do_initial_clean_dup, in_excel_tabular_sheets, remove_duplicate_rows],
outputs=[tabular_results_df, tabular_cleaned_file, tabular_file_to_clean, actual_time_taken_number, task_textbox], api_name="tabular_clean_duplicates", show_progress_on=[tabular_results_df]
)
tabular_results_df.select(
clean_duplicates_btn.click(
fn=clean_tabular_duplicates,
inputs=[tabular_file_to_clean, tabular_results_df, output_folder_textbox, in_excel_tabular_sheets],
outputs=[tabular_cleaned_file]
)
pdf_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
data_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
if DISPLAY_FILE_NAMES_IN_LOGS == True:
# User submitted feedback for pdf redactions
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
# User submitted feedback for data redactions
data_callback.setup([data_feedback_radio, data_further_details_text, data_file_name_with_extension_textbox], FEEDBACK_LOGS_FOLDER)
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_file_name_with_extension_textbox], None, preprocess=False).\
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
else:
# User submitted feedback for pdf redactions
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
# User submitted feedback for data redactions
data_callback.setup([data_feedback_radio, data_further_details_text, data_file_name_with_extension_textbox], FEEDBACK_LOGS_FOLDER)
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
# Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
usage_callback = CSVLogger_custom(dataset_file_name=USAGE_LOG_FILE_NAME)
if DISPLAY_FILE_NAMES_IN_LOGS == True:
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_with_extension_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], USAGE_LOGS_FOLDER)
latest_file_completed_num.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_with_extension_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False, api_name="usage_logs").\
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
text_tabular_files_done.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_with_extension_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_with_extension_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
# Deduplication usage logging
duplicate_files_out.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
tabular_results_df.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
else:
usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], USAGE_LOGS_FOLDER)
latest_file_completed_num.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
text_tabular_files_done.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, placeholder_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
# Deduplication usage logging (when file names not displayed)
duplicate_files_out.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
tabular_results_df.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
if __name__ == "__main__":
'task': DIRECT_MODE_TASK,
'input_file': DIRECT_MODE_INPUT_FILE,
'output_dir': DIRECT_MODE_OUTPUT_DIR,
'input_dir': INPUT_FOLDER,
'language': DEFAULT_LANGUAGE,
'allow_list': ALLOW_LIST_PATH,
'pii_detector': LOCAL_PII_OPTION,
'username': DIRECT_MODE_DEFAULT_USER,
'save_to_user_folders': SESSION_OUTPUT_FOLDER,
'aws_access_key': AWS_ACCESS_KEY,
'aws_secret_key': AWS_SECRET_KEY,
'aws_region': AWS_REGION,
's3_bucket': DOCUMENT_REDACTION_BUCKET,
'do_initial_clean': DO_INITIAL_TABULAR_DATA_CLEAN,
'save_logs_to_csv': SAVE_LOGS_TO_CSV,
'save_logs_to_dynamodb': SAVE_LOGS_TO_DYNAMODB,
'display_file_names_in_logs': DISPLAY_FILE_NAMES_IN_LOGS,
'upload_logs_to_s3': RUN_AWS_FUNCTIONS == "1",
's3_logs_prefix': S3_USAGE_LOGS_FOLDER,
'ocr_method': TESSERACT_TEXT_EXTRACT_OPTION,
'page_min': DEFAULT_PAGE_MIN,
'page_max': DEFAULT_PAGE_MAX,
'images_dpi': IMAGES_DPI,
'chosen_local_ocr_model': CHOSEN_LOCAL_OCR_MODEL,
'preprocess_local_ocr_images': PREPROCESS_LOCAL_OCR_IMAGES,
'compress_redacted_pdf': COMPRESS_REDACTED_PDF,
'return_pdf_end_of_redaction': RETURN_PDF_END_OF_REDACTION,
'allow_list_file': ALLOW_LIST_PATH,
'deny_list_file': DENY_LIST_PATH,
'redact_whole_page_file': WHOLE_PAGE_REDACTION_LIST_PATH,
'handwrite_signature_extraction': DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
'anon_strategy': DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
'excel_sheets': DEFAULT_EXCEL_SHEETS,
'fuzzy_mistakes': DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
'match_fuzzy_whole_phrase_bool': 'True', # Default value
'duplicate_type': DIRECT_MODE_DUPLICATE_TYPE,
'similarity_threshold': DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
'min_word_count': DEFAULT_MIN_WORD_COUNT,
'min_consecutive_pages': DEFAULT_MIN_CONSECUTIVE_PAGES,
'greedy_match': USE_GREEDY_DUPLICATE_DETECTION,
'combine_pages': DEFAULT_COMBINE_PAGES,
'search_query': DEFAULT_SEARCH_QUERY,
'text_columns': DEFAULT_TEXT_COLUMNS,
'remove_duplicate_rows': REMOVE_DUPLICATE_ROWS,
# Textract specific arguments (with defaults)
'textract_action': '',
'job_id': '',
'extract_signatures': False,
'textract_bucket': TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
'textract_input_prefix': TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
'textract_output_prefix': TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
's3_textract_document_logs_subfolder': TEXTRACT_JOBS_S3_LOC,
'local_textract_document_logs_subfolder': TEXTRACT_JOBS_LOCAL_LOC,
'poll_interval': 30,
'max_poll_attempts': 120,
# General arguments that might be missing
'local_redact_entities': CHOSEN_REDACT_ENTITIES,
'aws_redact_entities': CHOSEN_COMPREHEND_ENTITIES,
'cost_code': DEFAULT_COST_CODE
}

print(f"Running in direct mode with task: {DIRECT_MODE_TASK}")

print(f"Search query: {DEFAULT_SEARCH_QUERY}")
if DEFAULT_TEXT_COLUMNS:
print(f"Text columns: {DEFAULT_TEXT_COLUMNS}")
print(f"Remove duplicate rows: {REMOVE_DUPLICATE_ROWS}")
# Run the CLI main function with direct mode arguments
main(direct_mode_args=direct_mode_args)
|
cli_redact.py
CHANGED
@@ -1,15 +1,63 @@
import argparse
import os
import pandas as pd
from tools.helper_functions import ensure_output_folder_exists
- (removed lines not legible in this extract)

# --- Constants and Configuration ---
15 |
|
@@ -17,18 +65,21 @@ if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN
if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = _get_env_list(FULL_COMPREHEND_ENTITY_LIST)
if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = _get_env_list(CHOSEN_REDACT_ENTITIES)
if FULL_ENTITY_LIST: FULL_ENTITY_LIST = _get_env_list(FULL_ENTITY_LIST)

# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
- CHOSEN_COMPREHEND_ENTITIES.extend(
- FULL_COMPREHEND_ENTITY_LIST.extend(

chosen_redact_entities = CHOSEN_REDACT_ENTITIES
full_entity_list = FULL_ENTITY_LIST
chosen_comprehend_entities = CHOSEN_COMPREHEND_ENTITIES
full_comprehend_entity_list = FULL_COMPREHEND_ENTITY_LIST

# --- Main CLI Function ---
- def main(direct_mode_args=
"""
A unified command-line interface to prepare, redact, and anonymise various document types.
|
@@ -40,99 +91,159 @@ def main(direct_mode_args=None):
|
|
40 |
description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
|
41 |
formatter_class=argparse.RawTextHelpFormatter,
|
42 |
epilog='''
|
43 |
-
Examples:
|
44 |
-
|
45 |
-
|
|
|
|
46 |
|
47 |
-
|
48 |
-
|
49 |
|
50 |
-
|
51 |
-
|
52 |
|
53 |
-
|
54 |
-
|
55 |
|
56 |
-
|
57 |
-
|
58 |
|
59 |
-
|
60 |
-
|
61 |
|
62 |
-
|
63 |
-
|
64 |
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
|
68 |
)
|
69 |
|
70 |
# --- Task Selection ---
|
71 |
task_group = parser.add_argument_group('Task Selection')
|
72 |
task_group.add_argument('--task',
|
73 |
-
choices=['redact', 'deduplicate'],
|
74 |
default='redact',
|
75 |
-
help='Task to perform: redact (PII redaction/
|
76 |
|
77 |
# --- General Arguments (apply to all file types) ---
|
78 |
general_group = parser.add_argument_group('General Options')
|
79 |
-
general_group.add_argument('--input_file',
|
80 |
general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
|
|
|
81 |
general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
|
82 |
-
general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
|
83 |
-
general_group.add_argument('--pii_detector',
|
84 |
-
|
85 |
-
default=
|
86 |
-
help='
|
87 |
-
|
88 |
-
general_group.add_argument('--
|
89 |
-
|
90 |
-
|
91 |
-
general_group.add_argument('--
|
92 |
-
|
93 |
-
|
|
|
|
94 |
|
95 |
# --- PDF/Image Redaction Arguments ---
|
96 |
pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
|
97 |
-
pdf_group.add_argument('--ocr_method',
|
98 |
-
choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
|
99 |
-
default=TESSERACT_TEXT_EXTRACT_OPTION,
|
100 |
-
help='OCR method for text extraction from images.')
|
101 |
pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
|
102 |
-
pdf_group.add_argument('--page_max', type=int, default=
|
103 |
-
pdf_group.add_argument('--
|
104 |
-
pdf_group.add_argument('--
|
105 |
-
pdf_group.add_argument('--
|
106 |
-
pdf_group.add_argument('--
|
107 |
-
pdf_group.add_argument('--
|
108 |
-
pdf_group.add_argument('--
|
109 |
-
pdf_group.add_argument('--
|
110 |
-
pdf_group.add_argument('--
|
111 |
-
pdf_group.add_argument('--
|
112 |
-
pdf_group.add_argument('--return_pdf_end_of_redaction', action='store_true', default=True, help='Return PDF at end of redaction process.')
|
113 |
-
pdf_group.add_argument('--in_deny_list', nargs='+', default=list(), help='Custom words to recognize for redaction.')
|
114 |
-
pdf_group.add_argument('--redact_whole_page_list', nargs='+', default=list(), help='Pages to redact completely.')
|
115 |
-
pdf_group.add_argument('--handwrite_signature_checkbox', nargs='+', default=['Extract handwriting', 'Extract signatures'], help='Handwriting and signature extraction options.')
|
116 |
|
117 |
# --- Word/Tabular Anonymisation Arguments ---
|
118 |
tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
|
119 |
-
tabular_group.add_argument('--
|
120 |
-
tabular_group.add_argument('--
|
121 |
tabular_group.add_argument('--excel_sheets', nargs='+', default=list(), help='Specific Excel sheet names to process.')
|
122 |
-
tabular_group.add_argument('--
|
123 |
-
tabular_group.add_argument('--
|
124 |
-
|
125 |
# --- Duplicate Detection Arguments ---
|
126 |
duplicate_group = parser.add_argument_group('Duplicate Detection Options')
|
127 |
duplicate_group.add_argument('--duplicate_type', choices=['pages', 'tabular'], default='pages', help='Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).')
|
128 |
-
duplicate_group.add_argument('--similarity_threshold', type=float, default=
|
129 |
-
duplicate_group.add_argument('--min_word_count', type=int, default=
|
130 |
-
duplicate_group.add_argument('--min_consecutive_pages', type=int, default=
|
131 |
-
duplicate_group.add_argument('--greedy_match',
|
132 |
-
duplicate_group.add_argument('--combine_pages',
|
133 |
-
duplicate_group.add_argument('--
|
134 |
-
duplicate_group.add_argument('--text_columns', nargs='+', default=list(), help='Specific text columns to analyze for duplicates (for tabular data).')
|
135 |
|
|
|
|
|
|
136 |
# Parse arguments - either from command line or direct mode
|
137 |
if direct_mode_args:
|
138 |
# Use direct mode arguments
|
@@ -142,54 +253,161 @@ Examples:
|
|
142 |
args = parser.parse_args()
|
143 |
|
144 |
# --- Initial Setup ---
|
145 |
-
|
146 |
-
|
147 |
-
|
|
|
|
|
|
|
148 |
|
149 |
-
#
|
150 |
-
|
151 |
-
|
|
|
|
|
|
|
|
152 |
|
153 |
# --- Route to the Correct Workflow Based on Task and File Type ---
|
154 |
|
155 |
-
#
|
|
|
|
|
|
|
|
156 |
if args.task == 'redact':
|
|
|
157 |
# Workflow 1: PDF/Image Redaction
|
158 |
if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
|
159 |
print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
|
|
|
160 |
try:
|
|
|
|
|
161 |
# Step 1: Prepare the document
|
162 |
print("\nStep 1: Preparing document...")
|
163 |
(
|
164 |
prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
|
165 |
-
image_annotations, _, original_cropboxes, page_sizes,
|
166 |
) = prepare_image_or_pdf(
|
167 |
-
file_paths=
|
168 |
-
|
169 |
-
|
170 |
-
output_folder=args.output_dir, prepare_images=args.prepare_images
|
171 |
)
|
172 |
print(f"Preparation complete. {prep_summary}")
|
173 |
|
174 |
# Step 2: Redact the prepared document
|
175 |
print("\nStep 2: Running redaction...")
|
176 |
(
|
177 |
-
output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _,
|
178 |
) = choose_and_run_redactor(
|
179 |
-
file_paths=
|
180 |
-
pdf_image_file_paths=image_file_paths, chosen_redact_entities=
|
181 |
-
chosen_redact_comprehend_entities=
|
182 |
-
in_allow_list=
|
183 |
-
redact_whole_page_list=args.
|
184 |
-
page_min=args.page_min, page_max=args.page_max, handwrite_signature_checkbox=args.
|
185 |
pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
|
186 |
document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
|
187 |
aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
|
188 |
-
language=args.language, output_folder=args.output_dir
|
189 |
)
|
|
|
|
|
|
|
|
190 |
|
191 |
print("\n--- Redaction Process Complete ---")
|
192 |
print(f"Summary: {output_summary}")
|
|
|
193 |
print(f"\nOutput files saved to: {args.output_dir}")
|
194 |
print("Generated Files:", sorted(output_files))
|
195 |
if log_files: print("Log Files:", sorted(log_files))
|
@@ -200,30 +418,83 @@ Examples:
|
|
200 |
# Workflow 2: Word/Tabular Data Anonymisation
|
201 |
elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
|
202 |
print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
|
|
|
203 |
try:
|
|
|
|
|
204 |
# Run the anonymisation function directly
|
205 |
-
|
206 |
-
|
|
|
207 |
in_text="", # Not used for file-based operations
|
208 |
-
|
209 |
-
chosen_cols=args.
|
210 |
-
chosen_redact_entities=
|
211 |
-
in_allow_list=
|
212 |
in_excel_sheets=args.excel_sheets,
|
213 |
first_loop_state=True,
|
214 |
output_folder=args.output_dir,
|
215 |
-
in_deny_list=
|
216 |
max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
|
217 |
pii_identification_method=args.pii_detector,
|
218 |
-
chosen_redact_comprehend_entities=
|
219 |
aws_access_key_textbox=args.aws_access_key,
|
220 |
aws_secret_key_textbox=args.aws_secret_key,
|
221 |
language=args.language,
|
222 |
do_initial_clean=args.do_initial_clean
|
223 |
)
|
224 |
|
|
|
|
|
|
|
|
|
|
225 |
print("\n--- Anonymisation Process Complete ---")
|
226 |
print(f"Summary: {output_summary}")
|
|
|
227 |
print(f"\nOutput files saved to: {args.output_dir}")
|
228 |
print("Generated Files:", sorted(output_files))
|
229 |
if log_files: print("Log Files:", sorted(log_files))
|
@@ -240,29 +511,33 @@ Examples:
|
|
240 |
elif args.task == 'deduplicate':
|
241 |
print("--- Starting Duplicate Detection Workflow... ---")
|
242 |
try:
|
|
|
243 |
if args.duplicate_type == 'pages':
|
244 |
# Page duplicate detection
|
245 |
if file_extension == '.csv':
|
246 |
print("--- Detected OCR CSV file. Starting Page Duplicate Detection... ---")
|
247 |
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
print("
|
254 |
-
|
255 |
-
|
256 |
# Load the CSV file as a list for the duplicate analysis function
|
257 |
-
results_df, output_paths, full_data_by_file = run_duplicate_analysis(
|
258 |
-
files=
|
259 |
threshold=args.similarity_threshold,
|
260 |
min_words=args.min_word_count,
|
261 |
min_consecutive=args.min_consecutive_pages,
|
262 |
greedy_match=args.greedy_match,
|
263 |
-
combine_pages=args.combine_pages
|
|
|
264 |
)
|
265 |
|
|
|
|
|
|
|
266 |
print("\n--- Page Duplicate Detection Complete ---")
|
267 |
print(f"Found {len(results_df)} duplicate matches")
|
268 |
print(f"\nOutput files saved to: {args.output_dir}")
|
@@ -271,19 +546,116 @@ Examples:
|
|
271 |
else:
|
272 |
print(f"Error: Page duplicate detection requires CSV files with OCR data.")
|
273 |
print("Please provide a CSV file containing OCR output data.")
|
|
|
|
|
|
|
|
|
274 |
|
275 |
elif args.duplicate_type == 'tabular':
|
276 |
# Tabular duplicate detection
|
|
|
277 |
if file_extension in ['.csv', '.xlsx', '.xls', '.parquet']:
|
278 |
print("--- Detected tabular file. Starting Tabular Duplicate Detection... ---")
|
|
|
|
|
279 |
|
280 |
-
results_df, output_paths, full_data_by_file =
|
281 |
-
files=
|
282 |
threshold=args.similarity_threshold,
|
283 |
min_words=args.min_word_count,
|
284 |
-
text_columns=args.text_columns
|
285 |
-
output_folder=args.output_dir
|
|
|
|
|
|
|
286 |
)
|
|
|
|
|
|
|
|
|
|
287 |
|
288 |
print("\n--- Tabular Duplicate Detection Complete ---")
|
289 |
print(f"Found {len(results_df)} duplicate matches")
|
@@ -299,10 +671,175 @@ Examples:
|
|
299 |
|
300 |
except Exception as e:
|
301 |
print(f"\nAn error occurred during the duplicate detection workflow: {e}")
|
|
|
|
|
|
|
|
|
|
|
302 |
|
303 |
else:
|
304 |
print(f"Error: Invalid task '{args.task}'.")
|
305 |
-
print("Valid options: 'redact' or '
|
306 |
|
307 |
if __name__ == "__main__":
|
308 |
main()
import argparse
|
2 |
import os
|
3 |
import pandas as pd
|
4 |
+
import time
|
5 |
+
import uuid
|
6 |
+
from tools.config import LOCAL_PII_OPTION, AWS_PII_OPTION, OUTPUT_FOLDER, DEFAULT_LANGUAGE, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, CUSTOM_ENTITIES, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, DOCUMENT_REDACTION_BUCKET, DEFAULT_COST_CODE, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, DISPLAY_FILE_NAMES_IN_LOGS, DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DO_INITIAL_TABULAR_DATA_CLEAN, ALLOW_LIST_PATH, DENY_LIST_PATH, WHOLE_PAGE_REDACTION_LIST_PATH, PREPROCESS_LOCAL_OCR_IMAGES, IMAGES_DPI, RETURN_PDF_END_OF_REDACTION, COMPRESS_REDACTED_PDF, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, DEFAULT_DUPLICATE_DETECTION_THRESHOLD, DEFAULT_MIN_WORD_COUNT, DEFAULT_MIN_CONSECUTIVE_PAGES, USE_GREEDY_DUPLICATE_DETECTION, DEFAULT_COMBINE_PAGES, REMOVE_DUPLICATE_ROWS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, INPUT_FOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SESSION_OUTPUT_FOLDER, DIRECT_MODE_DEFAULT_USER, RUN_AWS_FUNCTIONS, S3_USAGE_LOGS_FOLDER
|
7 |
+
|
8 |
from tools.helper_functions import ensure_output_folder_exists
|
9 |
+
|
10 |
+
|
11 |
+
def _generate_session_hash() -> str:
|
12 |
+
"""Generate a unique session hash for logging purposes."""
|
13 |
+
return str(uuid.uuid4())[:8]
|
14 |
+
|
15 |
+
def get_username_and_folders(username:str = "",
|
16 |
+
output_folder_textbox:str=OUTPUT_FOLDER,
|
17 |
+
input_folder_textbox:str=INPUT_FOLDER,
|
18 |
+
session_output_folder:str=SESSION_OUTPUT_FOLDER,
|
19 |
+
textract_document_upload_input_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
|
20 |
+
textract_document_upload_output_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
|
21 |
+
s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
|
22 |
+
local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):
|
23 |
+
|
24 |
+
|
25 |
+
# Generate session hash for logging. Either from input user name or generated
|
26 |
+
if username:
|
27 |
+
out_session_hash = username
|
28 |
+
else:
|
29 |
+
out_session_hash = _generate_session_hash()
|
30 |
+
|
31 |
+
|
32 |
+
if session_output_folder == 'True' or session_output_folder == True:
|
33 |
+
output_folder = output_folder_textbox + out_session_hash + "/"
|
34 |
+
input_folder = input_folder_textbox + out_session_hash + "/"
|
35 |
+
|
36 |
+
textract_document_upload_input_folder = textract_document_upload_input_folder + "/" + out_session_hash
|
37 |
+
textract_document_upload_output_folder = textract_document_upload_output_folder + "/" + out_session_hash
|
38 |
+
|
39 |
+
s3_textract_document_logs_subfolder = s3_textract_document_logs_subfolder + "/" + out_session_hash
|
40 |
+
local_textract_document_logs_subfolder = local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
|
41 |
+
|
42 |
+
else:
|
43 |
+
output_folder = output_folder_textbox
|
44 |
+
input_folder = input_folder_textbox
|
45 |
+
|
46 |
+
if not os.path.exists(output_folder): os.mkdir(output_folder)
|
47 |
+
if not os.path.exists(input_folder): os.mkdir(input_folder)
|
48 |
+
|
49 |
+
return out_session_hash, output_folder, out_session_hash, input_folder, textract_document_upload_input_folder, textract_document_upload_output_folder, s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder
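As an illustration of the folder layout this helper produces (hypothetical username and paths; session folders enabled; not part of the commit):

# Illustration only, not part of the commit.
session_hash, output_folder, _, input_folder, s3_in, s3_out, s3_logs, local_logs = get_username_and_folders(
    username="jsmith",
    output_folder_textbox="output/",
    input_folder_textbox="input/",
    session_output_folder="True",
)
# output_folder is now "output/jsmith/" and input_folder is "input/jsmith/";
# the Textract upload prefixes and log folders gain a "/jsmith" suffix in the same way.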
|
50 |
+
|
51 |
+
|
52 |
+
def _get_env_list(env_var_name: str) -> list[str]:
|
53 |
+
"""Parses a comma-separated environment variable into a list of strings."""
|
54 |
+
value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
|
55 |
+
if not value:
|
56 |
+
return []
|
57 |
+
# Split by comma and filter out any empty strings that might result from extra commas
|
58 |
+
return [s.strip() for s in value.split(',') if s.strip()]
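For reference, this is how the parser behaves on a bracketed, comma-separated value such as those read from the environment (illustrative values only):

# Illustration only, not part of the commit.
_get_env_list("['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER']")   # -> ['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER']
_get_env_list("[]")                                            # -> []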
|
59 |
+
|
60 |
+
|
61 |
|
62 |
# --- Constants and Configuration ---
|
63 |
|
|
|
65 |
if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = _get_env_list(FULL_COMPREHEND_ENTITY_LIST)
|
66 |
if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = _get_env_list(CHOSEN_REDACT_ENTITIES)
|
67 |
if FULL_ENTITY_LIST: FULL_ENTITY_LIST = _get_env_list(FULL_ENTITY_LIST)
|
68 |
+
if CUSTOM_ENTITIES: CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
|
69 |
+
if DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX: DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = _get_env_list(DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX)
|
70 |
|
71 |
# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
|
72 |
+
CHOSEN_COMPREHEND_ENTITIES.extend(CUSTOM_ENTITIES)
|
73 |
+
FULL_COMPREHEND_ENTITY_LIST.extend(CUSTOM_ENTITIES)
|
74 |
|
75 |
chosen_redact_entities = CHOSEN_REDACT_ENTITIES
|
76 |
full_entity_list = FULL_ENTITY_LIST
|
77 |
chosen_comprehend_entities = CHOSEN_COMPREHEND_ENTITIES
|
78 |
full_comprehend_entity_list = FULL_COMPREHEND_ENTITY_LIST
|
79 |
+
default_handwrite_signature_checkbox = DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX
|
80 |
|
81 |
# --- Main CLI Function ---
|
82 |
+
def main(direct_mode_args={}):
|
83 |
"""
|
84 |
A unified command-line interface to prepare, redact, and anonymise various document types.
|
85 |
|
|
|
91 |
description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
|
92 |
formatter_class=argparse.RawTextHelpFormatter,
|
93 |
epilog='''
|
94 |
+
Examples (look in the output/ folder to see output files):
|
95 |
+
|
96 |
+
# Redaction
|
97 |
+
|
98 |
+
## Redact a PDF with default settings (local OCR):
|
99 |
+
python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
|
100 |
+
|
101 |
+
## Extract text from a PDF only (i.e. no redaction), using local OCR:
|
102 |
+
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector None
|
103 |
+
|
104 |
+
## Extract text from a PDF only (i.e. no redaction), using local OCR, with a whole page redaction list:
|
105 |
+
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector Local --local_redact_entities CUSTOM
|
106 |
+
|
107 |
+
## Redact a PDF with allow list (local OCR) and custom list of redaction entities:
|
108 |
+
python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --allow_list_file example_data/test_allow_list_graduate.csv --local_redact_entities TITLES PERSON DATE_TIME
|
109 |
|
110 |
+
## Redact a PDF with limited pages and text extraction method (local text) with custom fuzzy matching:
|
111 |
+
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv --local_redact_entities CUSTOM_FUZZY --page_min 1 --page_max 3 --ocr_method "Local text" --fuzzy_mistakes 3
|
112 |
|
113 |
+
## Redaction with custom deny list, allow list, and whole page redaction list:
|
114 |
+
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/partnership_toolkit_redact_custom_deny_list.csv --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --allow_list_file example_data/test_allow_list_partnership.csv
|
115 |
|
116 |
+
## Redact an image:
|
117 |
+
python cli_redact.py --input_file example_data/example_complaint_letter.jpg
|
118 |
|
119 |
+
## Anonymise csv file with specific columns:
|
120 |
+
python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy replace_redacted
|
121 |
|
122 |
+
## Anonymise csv file with a different strategy (remove text completely):
|
123 |
+
python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy redact
|
124 |
|
125 |
+
## Anonymise a word document:
|
126 |
+
python cli_redact.py --input_file "example_data/Bold minimalist professional cover letter.docx" --anon_strategy replace_redacted
|
127 |
|
128 |
+
# Redaction with AWS services:
|
129 |
+
|
130 |
+
## Use Textract and Comprehend::
|
131 |
+
python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --ocr_method "AWS Textract" --pii_detector "AWS Comprehend"
|
132 |
+
|
133 |
+
## Redact specific pages with AWS OCR and signature extraction:
|
134 |
+
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
|
135 |
+
|
136 |
+
# Duplicate page detection
|
137 |
+
|
138 |
+
## Find duplicate pages in OCR files:
|
139 |
+
python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95
|
140 |
+
|
141 |
+
## Find duplicate in OCR files at the line level:
|
142 |
+
python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --combine_pages False --min_word_count 3
|
143 |
+
|
144 |
+
## Find duplicate rows in tabular data:
|
145 |
+
python cli_redact.py --task deduplicate --input_file example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv --duplicate_type tabular --text_columns "text" --similarity_threshold 0.95
|
146 |
+
|
147 |
+
# AWS Textract whole document analysis
|
148 |
+
|
149 |
+
## Submit document to Textract for basic text analysis:
|
150 |
+
python cli_redact.py --task textract --textract_action submit --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
|
151 |
+
|
152 |
+
## Submit document to Textract for analysis with signature extraction (Job ID will be printed to the console, you need this to retrieve the results):
|
153 |
+
python cli_redact.py --task textract --textract_action submit --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --extract_signatures
|
154 |
+
|
155 |
+
## Retrieve Textract results by job ID (returns a .json file output):
|
156 |
+
python cli_redact.py --task textract --textract_action retrieve --job_id 12345678-1234-1234-1234-123456789012
|
157 |
+
|
158 |
+
## List recent Textract jobs:
|
159 |
+
python cli_redact.py --task textract --textract_action list
|
160 |
+
|
161 |
+
'''
|
162 |
)
|
163 |
|
164 |
# --- Task Selection ---
|
165 |
task_group = parser.add_argument_group('Task Selection')
|
166 |
task_group.add_argument('--task',
|
167 |
+
choices=['redact', 'deduplicate', 'textract'],
|
168 |
default='redact',
|
169 |
+
help='Task to perform: redact (PII redaction/anonymisation), deduplicate (find duplicate content), or textract (AWS Textract batch operations).')
|
170 |
|
171 |
# --- General Arguments (apply to all file types) ---
|
172 |
general_group = parser.add_argument_group('General Options')
|
173 |
+
general_group.add_argument('--input_file', nargs='+', help='Path to the input file(s) to process. Separate multiple files with a space, and use quotes if there are spaces in the file name.')
|
174 |
general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
|
175 |
+
general_group.add_argument('--input_dir', default=INPUT_FOLDER, help='Directory for all input files.')
|
176 |
general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
|
177 |
+
general_group.add_argument('--allow_list', default=ALLOW_LIST_PATH, help='Path to a CSV file with words to exclude from redaction.')
|
178 |
+
general_group.add_argument('--pii_detector', choices=[LOCAL_PII_OPTION, AWS_PII_OPTION, "None"], default=LOCAL_PII_OPTION,
|
179 |
+
help='Core PII detection method (Local or AWS Comprehend, or None).')
|
180 |
+
general_group.add_argument('--username', default=DIRECT_MODE_DEFAULT_USER, help='Username for the session.')
|
181 |
+
general_group.add_argument('--save_to_user_folders', default=SESSION_OUTPUT_FOLDER, help='Whether to save to user folders or not.')
|
182 |
+
|
183 |
+
general_group.add_argument('--local_redact_entities', nargs='+', choices=full_entity_list, default=chosen_redact_entities,
|
184 |
+
help=f'Local redaction entities to use. Default: {chosen_redact_entities}. Full list: {full_entity_list}.')
|
185 |
+
|
186 |
+
general_group.add_argument('--aws_redact_entities', nargs='+', choices=full_comprehend_entity_list, default=chosen_comprehend_entities,
|
187 |
+
help=f'AWS redaction entities to use. Default: {chosen_comprehend_entities}. Full list: {full_comprehend_entity_list}.')
|
188 |
+
|
189 |
+
general_group.add_argument('--aws_access_key', default=AWS_ACCESS_KEY, help='Your AWS Access Key ID.')
|
190 |
+
general_group.add_argument('--aws_secret_key', default=AWS_SECRET_KEY, help='Your AWS Secret Access Key.')
|
191 |
+
general_group.add_argument('--cost_code', default=DEFAULT_COST_CODE, help='Cost code for tracking usage.')
|
192 |
+
general_group.add_argument('--aws_region', default=AWS_REGION, help='AWS region for cloud services.')
|
193 |
+
general_group.add_argument('--s3_bucket', default=DOCUMENT_REDACTION_BUCKET, help='S3 bucket name for cloud operations.')
|
194 |
+
general_group.add_argument('--do_initial_clean', default=DO_INITIAL_TABULAR_DATA_CLEAN, help='Perform initial text cleaning for tabular data.')
|
195 |
+
general_group.add_argument('--save_logs_to_csv', default=SAVE_LOGS_TO_CSV, help='Save processing logs to CSV files.')
|
196 |
+
general_group.add_argument('--save_logs_to_dynamodb', default=SAVE_LOGS_TO_DYNAMODB, help='Save processing logs to DynamoDB.')
|
197 |
+
general_group.add_argument('--display_file_names_in_logs', default=DISPLAY_FILE_NAMES_IN_LOGS, help='Include file names in log outputs.')
|
198 |
+
general_group.add_argument('--upload_logs_to_s3', default=RUN_AWS_FUNCTIONS == "1", help='Upload log files to S3 after processing.')
|
199 |
+
general_group.add_argument('--s3_logs_prefix', default=S3_USAGE_LOGS_FOLDER, help='S3 prefix for usage log files.')
|
200 |
|
201 |
# --- PDF/Image Redaction Arguments ---
|
202 |
pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
|
203 |
+
pdf_group.add_argument('--ocr_method', choices=["AWS Textract", "Local OCR", "Local text"], default="Local OCR", help='OCR method for text extraction from images.')
|
|
204 |
pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
|
205 |
+
pdf_group.add_argument('--page_max', type=int, default=0, help='Last page to redact.')
|
206 |
+
pdf_group.add_argument('--images_dpi', type=float, default=float(IMAGES_DPI), help='DPI for image processing.')
|
207 |
+
pdf_group.add_argument('--chosen_local_ocr_model', choices=['tesseract', 'hybrid', 'paddle'], default=CHOSEN_LOCAL_OCR_MODEL, help='Local OCR model to use.')
|
208 |
+
pdf_group.add_argument('--preprocess_local_ocr_images', default=PREPROCESS_LOCAL_OCR_IMAGES, help='Preprocess images before OCR.')
|
209 |
+
pdf_group.add_argument('--compress_redacted_pdf', default=COMPRESS_REDACTED_PDF, help='Compress the final redacted PDF.')
|
210 |
+
pdf_group.add_argument('--return_pdf_end_of_redaction', default=RETURN_PDF_END_OF_REDACTION, help='Return PDF at end of redaction process.')
|
211 |
+
pdf_group.add_argument('--deny_list_file', default=DENY_LIST_PATH, help='Custom words file to recognize for redaction.')
|
212 |
+
pdf_group.add_argument('--allow_list_file', default=ALLOW_LIST_PATH, help='Custom words file to recognize for redaction.')
|
213 |
+
pdf_group.add_argument('--redact_whole_page_file', default=WHOLE_PAGE_REDACTION_LIST_PATH, help='File for pages to redact completely.')
|
214 |
+
pdf_group.add_argument('--handwrite_signature_extraction', nargs='+', default=default_handwrite_signature_checkbox, help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".')
|
|
|
215 |
|
216 |
# --- Word/Tabular Anonymisation Arguments ---
|
217 |
tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
|
218 |
+
tabular_group.add_argument('--anon_strategy', choices=['redact', 'redact completely', 'replace_redacted', 'entity_type', 'encrypt', 'hash', 'replace with \'REDACTED\'', 'replace with <ENTITY_NAME>', 'mask', 'fake_first_name'], default=DEFAULT_TABULAR_ANONYMISATION_STRATEGY, help='The anonymisation strategy to apply.')
|
219 |
+
tabular_group.add_argument('--text_columns', nargs='+', default=list(), help='A list of column names to anonymise or deduplicate in tabular data.')
|
220 |
tabular_group.add_argument('--excel_sheets', nargs='+', default=list(), help='Specific Excel sheet names to process.')
|
221 |
+
tabular_group.add_argument('--fuzzy_mistakes', type=int, default=DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, help='Number of allowed spelling mistakes for fuzzy matching.')
|
222 |
+
tabular_group.add_argument('--match_fuzzy_whole_phrase_bool', default=True, help='Match fuzzy whole phrase boolean.')
|
|
|
223 |
# --- Duplicate Detection Arguments ---
|
224 |
duplicate_group = parser.add_argument_group('Duplicate Detection Options')
|
225 |
duplicate_group.add_argument('--duplicate_type', choices=['pages', 'tabular'], default='pages', help='Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).')
|
226 |
+
duplicate_group.add_argument('--similarity_threshold', type=float, default=DEFAULT_DUPLICATE_DETECTION_THRESHOLD, help='Similarity threshold (0-1) to consider content as duplicates.')
|
227 |
+
duplicate_group.add_argument('--min_word_count', type=int, default=DEFAULT_MIN_WORD_COUNT, help='Minimum word count for text to be considered in duplicate analysis.')
|
228 |
+
duplicate_group.add_argument('--min_consecutive_pages', type=int, default=DEFAULT_MIN_CONSECUTIVE_PAGES, help='Minimum number of consecutive pages to consider as a match.')
|
229 |
+
duplicate_group.add_argument('--greedy_match', default=USE_GREEDY_DUPLICATE_DETECTION, help='Use greedy matching strategy for consecutive pages.')
|
230 |
+
duplicate_group.add_argument('--combine_pages', default=DEFAULT_COMBINE_PAGES, help='Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.')
|
231 |
+
duplicate_group.add_argument('--remove_duplicate_rows', default=REMOVE_DUPLICATE_ROWS, help='Remove duplicate rows from the output.')
|
|
|
232 |
|
233 |
+
# --- Textract Batch Operations Arguments ---
|
234 |
+
textract_group = parser.add_argument_group('Textract Batch Operations Options')
|
235 |
+
textract_group.add_argument('--textract_action',
|
236 |
+
choices=['submit', 'retrieve', 'list'],
|
237 |
+
help='Textract action to perform: submit (submit document for analysis), retrieve (get results by job ID), or list (show recent jobs).')
|
238 |
+
textract_group.add_argument('--job_id', help='Textract job ID for retrieve action.')
|
239 |
+
textract_group.add_argument('--extract_signatures', action='store_true', help='Extract signatures during Textract analysis (for submit action).')
|
240 |
+
textract_group.add_argument('--textract_bucket', default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, help='S3 bucket name for Textract operations (overrides default).')
|
241 |
+
textract_group.add_argument('--textract_input_prefix', default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, help='S3 prefix for input files in Textract operations.')
|
242 |
+
textract_group.add_argument('--textract_output_prefix', default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, help='S3 prefix for output files in Textract operations.')
|
243 |
+
textract_group.add_argument('--s3_textract_document_logs_subfolder', default=TEXTRACT_JOBS_S3_LOC, help='S3 prefix for logs in Textract operations.')
|
244 |
+
textract_group.add_argument('--local_textract_document_logs_subfolder', default=TEXTRACT_JOBS_LOCAL_LOC, help='Local prefix for logs in Textract operations.')
|
245 |
+
textract_group.add_argument('--poll_interval', type=int, default=30, help='Polling interval in seconds for Textract job status.')
|
246 |
+
textract_group.add_argument('--max_poll_attempts', type=int, default=120, help='Maximum number of polling attempts for Textract job completion.')
|
247 |
# Parse arguments - either from command line or direct mode
|
248 |
if direct_mode_args:
|
249 |
# Use direct mode arguments
|
|
|
253 |
args = parser.parse_args()
|
254 |
|
255 |
# --- Initial Setup ---
# Convert string boolean variables to boolean
if args.preprocess_local_ocr_images == "True": args.preprocess_local_ocr_images = True
else: args.preprocess_local_ocr_images = False
if args.greedy_match == "True": args.greedy_match = True
else: args.greedy_match = False
if args.combine_pages == "True": args.combine_pages = True
else: args.combine_pages = False
if args.remove_duplicate_rows == "True": args.remove_duplicate_rows = True
else: args.remove_duplicate_rows = False
if args.return_pdf_end_of_redaction == "True": args.return_pdf_end_of_redaction = True
else: args.return_pdf_end_of_redaction = False
if args.compress_redacted_pdf == "True": args.compress_redacted_pdf = True
else: args.compress_redacted_pdf = False
if args.do_initial_clean == "True": args.do_initial_clean = True
else: args.do_initial_clean = False
if args.save_logs_to_csv == "True": args.save_logs_to_csv = True
else: args.save_logs_to_csv = False
if args.save_logs_to_dynamodb == "True": args.save_logs_to_dynamodb = True
else: args.save_logs_to_dynamodb = False
if args.display_file_names_in_logs == "True": args.display_file_names_in_logs = True
else: args.display_file_names_in_logs = False
if args.match_fuzzy_whole_phrase_bool == "True": args.match_fuzzy_whole_phrase_bool = True
else: args.match_fuzzy_whole_phrase_bool = False
if args.save_to_user_folders == "True": args.save_to_user_folders = True
else: args.save_to_user_folders = False
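# Note: the string-to-boolean conversions above could equivalently be written as a
# loop (an illustrative sketch with identical behaviour, not part of this change):
#     _bool_flags = [
#         'preprocess_local_ocr_images', 'greedy_match', 'combine_pages',
#         'remove_duplicate_rows', 'return_pdf_end_of_redaction', 'compress_redacted_pdf',
#         'do_initial_clean', 'save_logs_to_csv', 'save_logs_to_dynamodb',
#         'display_file_names_in_logs', 'match_fuzzy_whole_phrase_bool', 'save_to_user_folders',
#     ]
#     for _name in _bool_flags:
#         setattr(args, _name, getattr(args, _name) == "True")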
|
281 |
+
|
282 |
+
if args.task in ['redact', 'deduplicate']:
|
283 |
+
if args.input_file:
|
284 |
+
if isinstance(args.input_file, str):
|
285 |
+
args.input_file = [args.input_file]
|
286 |
+
|
287 |
+
_, file_extension = os.path.splitext(args.input_file[0])
|
288 |
+
file_extension = file_extension.lower()
|
289 |
+
else:
|
290 |
+
raise ValueError("Error: --input_file is required for 'redact' task.")
|
291 |
|
292 |
+
# Initialise usage logger if logging is enabled
|
293 |
+
usage_logger = None
|
294 |
+
if args.save_logs_to_csv or args.save_logs_to_dynamodb:
|
295 |
+
from tools.cli_usage_logger import create_cli_usage_logger
|
296 |
+
try:
|
297 |
+
usage_logger = create_cli_usage_logger()
|
298 |
+
except Exception as e:
|
299 |
+
print(f"Warning: Could not initialise usage logger: {e}")
|
300 |
+
|
301 |
+
print(f"Argument args.save_to_user_folders: {args.save_to_user_folders} will be used to determine if outputs will be saved to user folders.")
|
302 |
+
|
303 |
+
# Get username and folders
|
304 |
+
session_hash, args.output_dir, _, args.input_dir, args.textract_input_prefix, args.textract_output_prefix, args.s3_textract_document_logs_subfolder, args.local_textract_document_logs_subfolder = get_username_and_folders(username=args.username, output_folder_textbox=args.output_dir, input_folder_textbox=args.input_dir, session_output_folder=args.save_to_user_folders, textract_document_upload_input_folder=args.textract_input_prefix, textract_document_upload_output_folder=args.textract_output_prefix, s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder)
|
305 |
+
|
306 |
+
print(f"Conducting analyses with user {args.username}. Outputs will be saved to {args.output_dir}.")
|
307 |
|
308 |
# --- Route to the Correct Workflow Based on Task and File Type ---
|
309 |
|
310 |
+
# Validate input_file requirement for tasks that need it
|
311 |
+
if args.task in ['redact', 'deduplicate'] and not args.input_file:
|
312 |
+
print(f"Error: --input_file is required for '{args.task}' task.")
|
313 |
+
return
|
314 |
+
|
315 |
+
if args.ocr_method in ["Local OCR", "AWS Textract"]:
|
316 |
+
args.prepare_images = True
|
317 |
+
else:
|
318 |
+
args.prepare_images = False
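# Note: page images are assumed to be needed only when OCR runs over rendered pages
# (Local OCR / AWS Textract); extraction from the existing text layer can skip them.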
|
319 |
+
|
320 |
+
from tools.cli_usage_logger import create_cli_usage_logger, log_redaction_usage
|
321 |
+
# Task 1: Redaction/Anonymisation
|
322 |
if args.task == 'redact':
|
323 |
+
|
324 |
# Workflow 1: PDF/Image Redaction
|
325 |
if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
|
326 |
print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
|
327 |
+
start_time = time.time()
|
328 |
try:
|
329 |
+
from tools.file_conversion import prepare_image_or_pdf
|
330 |
+
from tools.file_redaction import choose_and_run_redactor
|
331 |
# Step 1: Prepare the document
|
332 |
print("\nStep 1: Preparing document...")
|
333 |
(
|
334 |
prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
|
335 |
+
image_annotations, _, original_cropboxes, page_sizes, _, _, _, _, _
|
336 |
) = prepare_image_or_pdf(
|
337 |
+
file_paths=args.input_file, text_extract_method=args.ocr_method, all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
|
338 |
+
first_loop_state=True, prepare_for_review=False,
|
339 |
+
output_folder=args.output_dir, input_folder=args.input_dir, prepare_images=args.prepare_images
|
|
|
340 |
)
|
341 |
print(f"Preparation complete. {prep_summary}")
|
342 |
|
343 |
# Step 2: Redact the prepared document
|
344 |
print("\nStep 2: Running redaction...")
|
345 |
(
|
346 |
+
output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, comprehend_query_number, _, _, _, _, _, _, page_sizes, _, _, _, total_textract_query_number, _, _, _, _, _, _
|
347 |
) = choose_and_run_redactor(
|
348 |
+
file_paths=args.input_file, prepared_pdf_file_paths=prepared_pdf_paths,
|
349 |
+
pdf_image_file_paths=image_file_paths, chosen_redact_entities=args.local_redact_entities,
|
350 |
+
chosen_redact_comprehend_entities=args.aws_redact_entities, text_extraction_method=args.ocr_method,
|
351 |
+
in_allow_list=args.allow_list_file, in_deny_list=args.deny_list_file,
|
352 |
+
redact_whole_page_list=args.redact_whole_page_file, first_loop_state=True,
|
353 |
+
page_min=args.page_min, page_max=args.page_max, handwrite_signature_checkbox=args.handwrite_signature_extraction, max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes, match_fuzzy_whole_phrase_bool=args.match_fuzzy_whole_phrase_bool,
|
354 |
pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
|
355 |
document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
|
356 |
aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
|
357 |
+
language=args.language, output_folder=args.output_dir, input_folder=args.input_dir
|
358 |
)
|
359 |
+
|
360 |
+
# Calculate processing time
|
361 |
+
end_time = time.time()
|
362 |
+
processing_time = end_time - start_time
|
363 |
+
|
364 |
+
# Log usage data if logger is available
|
365 |
+
if usage_logger:
|
366 |
+
try:
|
367 |
+
# Extract file name for logging
|
368 |
+
print("Saving logs to CSV")
|
369 |
+
doc_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "document"
|
370 |
+
data_file_name = "" # Not applicable for PDF/image redaction
|
371 |
+
|
372 |
+
# Determine if this was a Textract API call
|
373 |
+
is_textract_call = args.ocr_method == "AWS Textract"
|
374 |
+
|
375 |
+
# Count pages (approximate from page_sizes if available)
|
376 |
+
total_pages = len(page_sizes) if page_sizes else 1
|
377 |
+
|
378 |
+
# Count API calls (approximate - would need to be tracked in the redaction function)
|
379 |
+
textract_queries = int(total_textract_query_number) if is_textract_call else 0
|
380 |
+
comprehend_queries = int(comprehend_query_number) if args.pii_detector == "AWS Comprehend" else 0
|
381 |
+
|
382 |
+
# Format handwriting/signature options
|
383 |
+
handwriting_signature = ", ".join(args.handwrite_signature_extraction) if args.handwrite_signature_extraction else ""
|
384 |
+
|
385 |
+
log_redaction_usage(
|
386 |
+
logger=usage_logger,
|
387 |
+
session_hash=session_hash,
|
388 |
+
doc_file_name=doc_file_name,
|
389 |
+
data_file_name=data_file_name,
|
390 |
+
time_taken=processing_time,
|
391 |
+
total_pages=total_pages,
|
392 |
+
textract_queries=textract_queries,
|
393 |
+
pii_method=args.pii_detector,
|
394 |
+
comprehend_queries=comprehend_queries,
|
395 |
+
cost_code=args.cost_code,
|
396 |
+
handwriting_signature=handwriting_signature,
|
397 |
+
text_extraction_method=args.ocr_method,
|
398 |
+
is_textract_call=is_textract_call,
|
399 |
+
task=args.task,
|
400 |
+
save_to_dynamodb=args.save_logs_to_dynamodb,
|
401 |
+
save_to_s3=args.upload_logs_to_s3,
|
402 |
+
s3_bucket=args.s3_bucket,
|
403 |
+
s3_key_prefix=args.s3_logs_prefix
|
404 |
+
)
|
405 |
+
except Exception as e:
|
406 |
+
print(f"Warning: Could not log usage data: {e}")
|
407 |
|
408 |
print("\n--- Redaction Process Complete ---")
|
409 |
print(f"Summary: {output_summary}")
|
410 |
+
print(f"Processing time: {processing_time:.2f} seconds")
|
411 |
print(f"\nOutput files saved to: {args.output_dir}")
|
412 |
print("Generated Files:", sorted(output_files))
|
413 |
if log_files: print("Log Files:", sorted(log_files))
|
|
|
418 |
# Workflow 2: Word/Tabular Data Anonymisation
|
419 |
elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
|
420 |
print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
|
421 |
+
start_time = time.time()
|
422 |
try:
|
423 |
+
from tools.data_anonymise import anonymise_files_with_open_text
|
424 |
+
|
425 |
# Run the anonymisation function directly
|
426 |
+
|
427 |
+
output_summary, output_files, _, _, log_files, _, processing_time, comprehend_query_number = anonymise_files_with_open_text(
|
428 |
+
file_paths=args.input_file,
|
429 |
in_text="", # Not used for file-based operations
|
430 |
+
anon_strategy=args.anon_strategy,
|
431 |
+
chosen_cols=args.text_columns,
|
432 |
+
chosen_redact_entities=args.local_redact_entities,
|
433 |
+
in_allow_list=args.allow_list_file,
|
434 |
in_excel_sheets=args.excel_sheets,
|
435 |
first_loop_state=True,
|
436 |
output_folder=args.output_dir,
|
437 |
+
in_deny_list=args.deny_list_file,
|
438 |
max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
|
439 |
pii_identification_method=args.pii_detector,
|
440 |
+
chosen_redact_comprehend_entities=args.aws_redact_entities,
|
441 |
aws_access_key_textbox=args.aws_access_key,
|
442 |
aws_secret_key_textbox=args.aws_secret_key,
|
443 |
language=args.language,
|
444 |
do_initial_clean=args.do_initial_clean
|
445 |
)
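# Illustrative CLI call for this tabular path (a sketch: the entry-point name and the
# --task/--text_columns flag spellings are assumptions; --input_file is defined above):
#   python cli_redact.py --task redact --input_file example_data/combined_case_notes.csv \
#       --text_columns "Case Note"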
|
446 |
|
447 |
+
# Calculate processing time
|
448 |
+
end_time = time.time()
|
449 |
+
processing_time = end_time - start_time
|
450 |
+
|
451 |
+
# Log usage data if logger is available
|
452 |
+
if usage_logger:
|
453 |
+
try:
|
454 |
+
print("Saving logs to CSV")
|
455 |
+
# Extract file name for logging
|
456 |
+
doc_file_name = "" # Not applicable for tabular data
|
457 |
+
data_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "data_file"
|
458 |
+
|
459 |
+
# Determine if this was a Textract API call (not applicable for tabular)
|
460 |
+
is_textract_call = False
|
461 |
+
|
462 |
+
# Count pages (not applicable for tabular data)
|
463 |
+
total_pages = 0
|
464 |
+
|
465 |
+
# Count API calls (approximate - would need to be tracked in the anonymisation function)
|
466 |
+
textract_queries = 0 # Not applicable for tabular data
|
467 |
+
comprehend_queries = comprehend_query_number if args.pii_detector == "AWS Comprehend" else 0
|
468 |
+
|
469 |
+
# Format handwriting/signature options (not applicable for tabular)
|
470 |
+
handwriting_signature = ""
|
471 |
+
|
472 |
+
log_redaction_usage(
|
473 |
+
logger=usage_logger,
|
474 |
+
session_hash=session_hash,
|
475 |
+
doc_file_name=doc_file_name,
|
476 |
+
data_file_name=data_file_name,
|
477 |
+
time_taken=processing_time,
|
478 |
+
total_pages=total_pages,
|
479 |
+
textract_queries=textract_queries,
|
480 |
+
pii_method=args.pii_detector,
|
481 |
+
comprehend_queries=comprehend_queries,
|
482 |
+
cost_code=args.cost_code,
|
483 |
+
handwriting_signature=handwriting_signature,
|
484 |
+
text_extraction_method="tabular", # Indicate this is tabular processing
|
485 |
+
is_textract_call=is_textract_call,
|
486 |
+
task=args.task,
|
487 |
+
save_to_dynamodb=args.save_logs_to_dynamodb,
|
488 |
+
save_to_s3=args.upload_logs_to_s3,
|
489 |
+
s3_bucket=args.s3_bucket,
|
490 |
+
s3_key_prefix=args.s3_logs_prefix
|
491 |
+
)
|
492 |
+
except Exception as e:
|
493 |
+
print(f"Warning: Could not log usage data: {e}")
|
494 |
+
|
495 |
print("\n--- Anonymisation Process Complete ---")
|
496 |
print(f"Summary: {output_summary}")
|
497 |
+
print(f"Processing time: {processing_time:.2f} seconds")
|
498 |
print(f"\nOutput files saved to: {args.output_dir}")
|
499 |
print("Generated Files:", sorted(output_files))
|
500 |
if log_files: print("Log Files:", sorted(log_files))
|
|
|
511 |
elif args.task == 'deduplicate':
|
512 |
print("--- Starting Duplicate Detection Workflow... ---")
|
513 |
try:
|
514 |
+
from tools.find_duplicate_pages import run_duplicate_analysis
|
515 |
if args.duplicate_type == 'pages':
|
516 |
# Page duplicate detection
|
517 |
if file_extension == '.csv':
|
518 |
print("--- Detected OCR CSV file. Starting Page Duplicate Detection... ---")
|
519 |
|
520 |
+
start_time = time.time()
|
521 |
+
|
522 |
+
if args.combine_pages == True:
|
523 |
+
print("Combining pages...")
|
524 |
+
else:
|
525 |
+
print("Using line-level duplicate detection...")
|
526 |
+
|
|
|
527 |
# Load the CSV file as a list for the duplicate analysis function
|
528 |
+
results_df, output_paths, full_data_by_file, processing_time, task_textbox = run_duplicate_analysis(
|
529 |
+
files=args.input_file,
|
530 |
threshold=args.similarity_threshold,
|
531 |
min_words=args.min_word_count,
|
532 |
min_consecutive=args.min_consecutive_pages,
|
533 |
greedy_match=args.greedy_match,
|
534 |
+
combine_pages=args.combine_pages,
|
535 |
+
output_folder=args.output_dir
|
536 |
)
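# Illustrative CLI call for page-level duplicate detection (a sketch: the entry-point
# name and flag spellings other than --input_file are assumptions; the threshold value
# is a placeholder):
#   python cli_redact.py --task deduplicate --duplicate_type pages \
#       --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv \
#       --similarity_threshold 0.9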
|
537 |
|
538 |
+
end_time = time.time()
|
539 |
+
processing_time = end_time - start_time
|
540 |
+
|
541 |
print("\n--- Page Duplicate Detection Complete ---")
|
542 |
print(f"Found {len(results_df)} duplicate matches")
|
543 |
print(f"\nOutput files saved to: {args.output_dir}")
|
|
|
546 |
else:
|
547 |
print(f"Error: Page duplicate detection requires CSV files with OCR data.")
|
548 |
print("Please provide a CSV file containing OCR output data.")
|
549 |
+
|
550 |
+
# Log usage data if logger is available
|
551 |
+
if usage_logger:
|
552 |
+
try:
|
553 |
+
# Extract file name for logging
|
554 |
+
print("Saving logs to CSV")
|
555 |
+
doc_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "document"
|
556 |
+
data_file_name = "" # Not applicable for PDF/image redaction
|
557 |
+
|
558 |
+
# Determine if this was a Textract API call
|
559 |
+
is_textract_call = False
|
560 |
+
|
561 |
+
# Page count is not tracked by the duplicate detection workflow
total_pages = 0
|
563 |
+
|
564 |
+
# Count API calls (approximate - would need to be tracked in the redaction function)
|
565 |
+
textract_queries = 0
|
566 |
+
comprehend_queries = 0
|
567 |
+
|
568 |
+
# Format handwriting/signature options
|
569 |
+
handwriting_signature = ""
|
570 |
+
|
571 |
+
log_redaction_usage(
|
572 |
+
logger=usage_logger,
|
573 |
+
session_hash=session_hash,
|
574 |
+
doc_file_name=doc_file_name,
|
575 |
+
data_file_name=data_file_name,
|
576 |
+
time_taken=processing_time,
|
577 |
+
total_pages=total_pages,
|
578 |
+
textract_queries=textract_queries,
|
579 |
+
pii_method=args.pii_detector,
|
580 |
+
comprehend_queries=comprehend_queries,
|
581 |
+
cost_code=args.cost_code,
|
582 |
+
handwriting_signature=handwriting_signature,
|
583 |
+
text_extraction_method=args.ocr_method,
|
584 |
+
is_textract_call=is_textract_call,
|
585 |
+
task=args.task,
|
586 |
+
save_to_dynamodb=args.save_logs_to_dynamodb,
|
587 |
+
save_to_s3=args.upload_logs_to_s3,
|
588 |
+
s3_bucket=args.s3_bucket,
|
589 |
+
s3_key_prefix=args.s3_logs_prefix
|
590 |
+
)
|
591 |
+
except Exception as e:
|
592 |
+
print(f"Warning: Could not log usage data: {e}")
|
593 |
|
594 |
elif args.duplicate_type == 'tabular':
|
595 |
# Tabular duplicate detection
|
596 |
+
from tools.find_duplicate_tabular import run_tabular_duplicate_detection
|
597 |
if file_extension in ['.csv', '.xlsx', '.xls', '.parquet']:
|
598 |
print("--- Detected tabular file. Starting Tabular Duplicate Detection... ---")
|
599 |
+
|
600 |
+
start_time = time.time()
|
601 |
|
602 |
+
results_df, output_paths, full_data_by_file, processing_time, task_textbox = run_tabular_duplicate_detection(
|
603 |
+
files=args.input_file,
|
604 |
threshold=args.similarity_threshold,
|
605 |
min_words=args.min_word_count,
|
606 |
+
text_columns=args.text_columns,
|
607 |
+
output_folder=args.output_dir,
|
608 |
+
do_initial_clean_dup=args.do_initial_clean,
|
609 |
+
in_excel_tabular_sheets=args.excel_sheets,
|
610 |
+
remove_duplicate_rows=args.remove_duplicate_rows
|
611 |
)
|
612 |
+
|
613 |
+
end_time = time.time()
|
614 |
+
processing_time = end_time - start_time
|
615 |
+
|
616 |
+
# Log usage data if logger is available
|
617 |
+
if usage_logger:
|
618 |
+
try:
|
619 |
+
# Extract file name for logging
|
620 |
+
print("Saving logs to CSV")
|
621 |
+
doc_file_name = ""
|
622 |
+
data_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "data_file"
|
623 |
+
|
624 |
+
# Determine if this was a Textract API call
|
625 |
+
is_textract_call = False
|
626 |
+
|
627 |
+
# Page count is not applicable for tabular data
total_pages = 0
|
629 |
+
|
630 |
+
# Count API calls (approximate - would need to be tracked in the redaction function)
|
631 |
+
textract_queries = 0
|
632 |
+
comprehend_queries = 0
|
633 |
+
|
634 |
+
# Format handwriting/signature options
|
635 |
+
handwriting_signature = ""
|
636 |
+
|
637 |
+
log_redaction_usage(
|
638 |
+
logger=usage_logger,
|
639 |
+
session_hash=session_hash,
|
640 |
+
doc_file_name=doc_file_name,
|
641 |
+
data_file_name=data_file_name,
|
642 |
+
time_taken=processing_time,
|
643 |
+
total_pages=total_pages,
|
644 |
+
textract_queries=textract_queries,
|
645 |
+
pii_method=args.pii_detector,
|
646 |
+
comprehend_queries=comprehend_queries,
|
647 |
+
cost_code=args.cost_code,
|
648 |
+
handwriting_signature=handwriting_signature,
|
649 |
+
text_extraction_method=args.ocr_method,
|
650 |
+
is_textract_call=is_textract_call,
|
651 |
+
task=args.task,
|
652 |
+
save_to_dynamodb=args.save_logs_to_dynamodb,
|
653 |
+
save_to_s3=args.upload_logs_to_s3,
|
654 |
+
s3_bucket=args.s3_bucket,
|
655 |
+
s3_key_prefix=args.s3_logs_prefix
|
656 |
+
)
|
657 |
+
except Exception as e:
|
658 |
+
print(f"Warning: Could not log usage data: {e}")
|
659 |
|
660 |
print("\n--- Tabular Duplicate Detection Complete ---")
|
661 |
print(f"Found {len(results_df)} duplicate matches")
|
|
|
671 |
|
672 |
except Exception as e:
|
673 |
print(f"\nAn error occurred during the duplicate detection workflow: {e}")
|
674 |
+
|
675 |
+
# Task 3: Textract Batch Operations
|
676 |
+
elif args.task == 'textract':
|
677 |
+
print("--- Starting Textract Batch Operations Workflow... ---")
|
678 |
+
|
679 |
+
if not args.textract_action:
|
680 |
+
print("Error: --textract_action is required for textract task.")
|
681 |
+
print("Valid options: 'submit', 'retrieve', or 'list'")
|
682 |
+
return
|
683 |
+
|
684 |
+
try:
|
685 |
+
if args.textract_action == 'submit':
|
686 |
+
from tools.textract_batch_call import analyse_document_with_textract_api, load_in_textract_job_details
|
687 |
+
# Submit document to Textract for analysis
|
688 |
+
if not args.input_file:
|
689 |
+
print("Error: --input_file is required for submit action.")
|
690 |
+
return
|
691 |
+
|
692 |
+
print(f"--- Submitting document to Textract: {args.input_file} ---")
|
693 |
+
|
694 |
+
start_time = time.time()
|
695 |
+
|
696 |
+
# Load existing job details
|
697 |
+
job_df = load_in_textract_job_details(load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, load_local_jobs_loc=args.local_textract_document_logs_subfolder)
|
698 |
+
|
699 |
+
# Determine signature extraction options
|
700 |
+
signature_options = ['Extract handwriting', 'Extract signatures'] if args.extract_signatures else ['Extract handwriting']
|
701 |
+
|
702 |
+
# Use configured bucket or override
|
703 |
+
textract_bucket = args.textract_bucket if args.textract_bucket else ""
|
704 |
+
|
705 |
+
# Submit the job
|
706 |
+
result_message, job_id, job_type, successful_job_number, is_textract_call, total_pages, task_textbox = analyse_document_with_textract_api(
|
707 |
+
local_pdf_path=args.input_file,
|
708 |
+
s3_input_prefix=args.textract_input_prefix,
|
709 |
+
s3_output_prefix=args.textract_output_prefix,
|
710 |
+
job_df=job_df,
|
711 |
+
s3_bucket_name=textract_bucket,
|
712 |
+
general_s3_bucket_name=args.s3_bucket,
|
713 |
+
local_output_dir=args.output_dir,
|
714 |
+
analyse_signatures=signature_options,
|
715 |
+
aws_region=args.aws_region
|
716 |
+
)
|
717 |
+
|
718 |
+
end_time = time.time()
|
719 |
+
processing_time = end_time - start_time
|
720 |
+
|
721 |
+
print(f"\n--- Textract Job Submitted Successfully ---")
|
722 |
+
print(f"Job ID: {job_id}")
|
723 |
+
print(f"Job Type: {job_type}")
|
724 |
+
print(f"Message: {result_message}")
|
725 |
+
print(f"Results will be available in: {args.output_dir}")
|
726 |
+
|
727 |
+
# Log usage data if logger is available
|
728 |
+
if usage_logger:
|
729 |
+
try:
|
730 |
+
# Extract file name for logging
|
731 |
+
print("Saving logs to CSV")
|
732 |
+
doc_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "document"
|
733 |
+
data_file_name = ""
|
734 |
+
|
735 |
+
# Determine if this was a Textract API call
|
736 |
+
is_textract_call = True
|
737 |
+
args.ocr_method == "AWS Textract"
|
738 |
+
|
739 |
+
# Count API calls (approximate - would need to be tracked in the redaction function)
|
740 |
+
textract_queries = total_pages
|
741 |
+
comprehend_queries = 0
|
742 |
+
|
743 |
+
# Format handwriting/signature options
|
744 |
+
handwriting_signature = ""
|
745 |
+
|
746 |
+
log_redaction_usage(
|
747 |
+
logger=usage_logger,
|
748 |
+
session_hash=session_hash,
|
749 |
+
doc_file_name=doc_file_name,
|
750 |
+
data_file_name=data_file_name,
|
751 |
+
time_taken=processing_time,
|
752 |
+
total_pages=total_pages,
|
753 |
+
textract_queries=textract_queries,
|
754 |
+
pii_method=args.pii_detector,
|
755 |
+
comprehend_queries=comprehend_queries,
|
756 |
+
cost_code=args.cost_code,
|
757 |
+
handwriting_signature=handwriting_signature,
|
758 |
+
text_extraction_method=args.ocr_method,
|
759 |
+
is_textract_call=is_textract_call,
|
760 |
+
task=args.task,
|
761 |
+
save_to_dynamodb=args.save_logs_to_dynamodb,
|
762 |
+
save_to_s3=args.upload_logs_to_s3,
|
763 |
+
s3_bucket=args.s3_bucket,
|
764 |
+
s3_key_prefix=args.s3_logs_prefix
|
765 |
+
)
|
766 |
+
except Exception as e:
|
767 |
+
print(f"Warning: Could not log usage data: {e}")
|
768 |
+
|
769 |
+
elif args.textract_action == 'retrieve':
|
770 |
+
print(f"--- Retrieving Textract results for Job ID: {args.job_id} ---")
|
771 |
+
|
772 |
+
from tools.textract_batch_call import poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details
|
773 |
+
# Retrieve results by job ID
|
774 |
+
if not args.job_id:
|
775 |
+
print("Error: --job_id is required for retrieve action.")
|
776 |
+
return
|
777 |
+
|
778 |
+
# Load existing job details to get job type
|
779 |
+
print("Loading existing job details...")
|
780 |
+
job_df = load_in_textract_job_details(load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, load_local_jobs_loc=args.local_textract_document_logs_subfolder)
|
781 |
+
|
782 |
+
# Find job type from the dataframe
|
783 |
+
job_type = "document_text_detection" # default
|
784 |
+
if not job_df.empty and "job_id" in job_df.columns:
|
785 |
+
matching_jobs = job_df.loc[job_df["job_id"] == args.job_id]
|
786 |
+
if not matching_jobs.empty and "job_type" in matching_jobs.columns:
|
787 |
+
job_type = matching_jobs.iloc[0]["job_type"]
|
788 |
+
|
789 |
+
# Use configured bucket or override
|
790 |
+
textract_bucket = args.textract_bucket if args.textract_bucket else ""
|
791 |
+
|
792 |
+
# Poll for completion and download results
|
793 |
+
print("Polling for completion and downloading results...")
|
794 |
+
downloaded_file_path, job_status, updated_job_df, output_filename = poll_whole_document_textract_analysis_progress_and_download(
|
795 |
+
job_id=args.job_id,
|
796 |
+
job_type_dropdown=job_type,
|
797 |
+
s3_output_prefix=args.textract_output_prefix,
|
798 |
+
pdf_filename="", # Will be determined from job details
|
799 |
+
job_df=job_df,
|
800 |
+
s3_bucket_name=textract_bucket,
|
801 |
+
load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
|
802 |
+
load_local_jobs_loc=args.local_textract_document_logs_subfolder,
|
803 |
+
local_output_dir=args.output_dir,
|
804 |
+
poll_interval_seconds=args.poll_interval,
|
805 |
+
max_polling_attempts=args.max_poll_attempts
|
806 |
+
)
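# With the defaults defined above (poll_interval=30, max_poll_attempts=120), polling
# stops after roughly 30 * 120 = 3600 seconds, i.e. about one hour per job.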
|
807 |
+
|
808 |
+
print(f"\n--- Textract Results Retrieved Successfully ---")
|
809 |
+
print(f"Job Status: {job_status}")
|
810 |
+
print(f"Downloaded File: {downloaded_file_path}")
|
811 |
+
#print(f"Output Filename: {output_filename}")
|
812 |
+
|
813 |
+
elif args.textract_action == 'list':
|
814 |
+
from tools.textract_batch_call import load_in_textract_job_details
|
815 |
+
# List recent Textract jobs
|
816 |
+
print("--- Listing Recent Textract Jobs ---")
|
817 |
+
|
818 |
+
job_df = load_in_textract_job_details(load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, load_local_jobs_loc=args.local_textract_document_logs_subfolder)
|
819 |
+
|
820 |
+
if job_df.empty:
|
821 |
+
print("No recent Textract jobs found.")
|
822 |
+
else:
|
823 |
+
print(f"\nFound {len(job_df)} recent Textract jobs:")
|
824 |
+
print("-" * 80)
|
825 |
+
for _, job in job_df.iterrows():
|
826 |
+
print(f"Job ID: {job.get('job_id', 'N/A')}")
|
827 |
+
print(f"File: {job.get('file_name', 'N/A')}")
|
828 |
+
print(f"Type: {job.get('job_type', 'N/A')}")
|
829 |
+
print(f"Signatures: {job.get('signature_extraction', 'N/A')}")
|
830 |
+
print(f"Date: {job.get('job_date_time', 'N/A')}")
|
831 |
+
print("-" * 80)
|
832 |
+
|
833 |
+
else:
|
834 |
+
print(f"Error: Invalid textract_action '{args.textract_action}'.")
|
835 |
+
print("Valid options: 'submit', 'retrieve', or 'list'")
|
836 |
+
|
837 |
+
except Exception as e:
|
838 |
+
print(f"\nAn error occurred during the Textract workflow: {e}")
|
839 |
|
840 |
else:
|
841 |
print(f"Error: Invalid task '{args.task}'.")
|
842 |
+
print("Valid options: 'redact', 'deduplicate', or 'textract'")
|
843 |
|
844 |
if __name__ == "__main__":
|
845 |
main()
|
example_data/Bold minimalist professional cover letter.docx
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0c8551ac157f350b2093e5d8c89f68474f613350074201cff6d52d5ed5ec28ff
|
3 |
+
size 23992
|
example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
example_data/Partnership-Agreement-Toolkit_0_0.pdf
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0db46a784d7aaafb8d02acf8686523dd376400117d07926a5dcb51ceb69e3236
|
3 |
+
size 426602
|
example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv
ADDED
@@ -0,0 +1,2 @@
1 |
+
another country or territory sign a formel agreement on behalf? of their communities endorsing a
|
2 |
+
soster citues international
|
example_data/combined_case_notes.csv
ADDED
@@ -0,0 +1,19 @@
1 |
+
Date,Social Worker,Client,Case Note
|
2 |
+
"January 3, 2023",Jane Smith,Alex D.,"Met with Alex at school following reports of increased absences and declining grades. Alex appeared sullen and avoided eye contact. When prompted about school, Alex expressed feelings of isolation and stated, ""No one gets me."" Scheduled a follow-up meeting to further explore these feelings."
|
3 |
+
"January 17, 2023",Jane Smith,Alex D.,"Met with Alex at the community center. Alex displayed sudden outbursts of anger when discussing home life, particularly in relation to a new stepfather. Alex mentioned occasional substance use, but did not specify which substances. Recommended a comprehensive assessment."
|
4 |
+
"February 5, 2023",Jane Smith,Alex D.,Home visit conducted. Alex's mother reported frequent arguments at home. She expressed concerns about Alex's new group of friends and late-night outings. Noted potential signs of substance abuse. Suggested family counseling.
|
5 |
+
"February 21, 2023",Jane Smith,Alex D.,"Met with Alex alone at my office. Alex appeared more agitated than in previous meetings. There were visible signs of self-harm on Alex's arms. When questioned, Alex became defensive. Immediate referral made to a mental health professional."
|
6 |
+
"March 10, 2023",Jane Smith,Alex D.,Attended joint session with Alex and a therapist. Alex shared feelings of hopelessness and admitted to occasional thoughts of self-harm. Therapist recommended a comprehensive mental health evaluation and ongoing therapy.
|
7 |
+
"March 25, 2023",Jane Smith,Alex D.,"Received a call from Alex's school about a physical altercation with another student. Met with Alex, who displayed high levels of frustration and admitted to the use of alcohol. Discussed the importance of seeking help and finding positive coping mechanisms. Recommended enrollment in an anger management program."
|
8 |
+
"April 15, 2023",Jane Smith,Alex D.,Met with Alex and mother to discuss progress. Alex's mother expressed concerns about Alex's increasing aggression at home. Alex acknowledged the issues but blamed others for provoking the behavior. It was decided that a more intensive intervention may be needed.
|
9 |
+
"April 30, 2023",Jane Smith,Alex D.,"Met with Alex and a psychiatrist. Psychiatrist diagnosed Alex with Oppositional Defiant Disorder (ODD) and co-morbid substance use disorder. A treatment plan was discussed, including medication, therapy, and family counseling."
|
10 |
+
"May 20, 2023",Jane Smith,Alex D.,"Met with Alex to discuss progress. Alex has started attending group therapy and has shown slight improvements in behavior. Still, concerns remain about substance use. Discussed potential for a short-term residential treatment program."
|
11 |
+
"January 3, 2023",Jane Smith,Jamie L.,"Met with Jamie at school after receiving reports of consistent tardiness and decreased participation in class. Jamie appeared withdrawn and exhibited signs of sadness. When asked about feelings, Jamie expressed feeling ""empty"" and ""hopeless"" at times. Scheduled a follow-up meeting to further explore these feelings."
|
12 |
+
"January 17, 2023",Jane Smith,Jamie L.,"Met with Jamie at the community center. Jamie shared feelings of low self-worth, mentioning that it's hard to find motivation for daily tasks. Discussed potential triggers and learned about recent family financial struggles. Recommended counseling and possible group therapy for peer support."
|
13 |
+
"February 5, 2023",Jane Smith,Jamie L.,Home visit conducted. Jamie's parents shared concerns about Jamie's increasing withdrawal from family activities and lack of interest in hobbies. Parents mentioned that Jamie spends a lot of time alone in the room. Suggested family therapy to open communication channels.
|
14 |
+
"February 21, 2023",Jane Smith,Jamie L.,Met with Jamie in my office. Jamie opened up about feelings of isolation and mentioned difficulty sleeping. No signs of self-harm or suicidal ideation were noted. Recommended a comprehensive mental health assessment to better understand the depth of the depression.
|
15 |
+
"March 10, 2023",Jane Smith,Jamie L.,"Attended a joint session with Jamie and a therapist. The therapist noted signs of moderate depression. Together, we discussed coping strategies and potential interventions. Jamie showed interest in art therapy."
|
16 |
+
"March 25, 2023",Jane Smith,Jamie L.,"Received feedback from Jamie's school that academic performance has slightly improved. However, social interactions remain limited. Encouraged Jamie to join school clubs or groups to foster connection."
|
17 |
+
"April 15, 2023",Jane Smith,Jamie L.,"Met with Jamie and parents to discuss progress. Parents have observed slight improvements in mood on some days, but overall, Jamie still appears to struggle. It was decided to explore medication as a potential aid alongside therapy."
|
18 |
+
"April 30, 2023",Jane Smith,Jamie L.,Met with Jamie and a psychiatrist. The psychiatrist diagnosed Jamie with Major Depressive Disorder (MDD) and suggested considering antidepressant medication. Discussed the potential benefits and side effects. Jamie and parents will think it over.
|
19 |
+
"May 20, 2023",Jane Smith,Jamie L.,"Jamie has started on a low dose of an antidepressant. Initial feedback is positive, with some improvement in mood and energy levels. Will continue monitoring and adjusting as necessary."
|
example_data/doubled_output_joined.pdf
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6eeac353164447c2aa429196e1a6ffae4c095d7171e63c2d1cd1966fdf32d1ed
|
3 |
+
size 1274719
|
example_data/example_complaint_letter.jpg
ADDED
Git LFS Details
example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ed0cd82b5b5826b851ca0e7c102d2d4d27580f7a90de4211a33178a6664d008d
|
3 |
+
size 8848
|
example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv
ADDED
@@ -0,0 +1,277 @@
1 |
+
page,text,left,top,width,height
|
2 |
+
1,Partnership Agreement,0.516078,0.027879,0.440784,0.032424
|
3 |
+
1,SisterCities,0.169804,0.033333,0.238431,0.028182
|
4 |
+
1,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788
|
5 |
+
1,Toolkit,0.830588,0.07303,0.126667,0.025152
|
6 |
+
1,Connect globally. Thrive locally.,0.169804,0.08697,0.238824,0.01303
|
7 |
+
1,Types of Affiliations,0.117255,0.157576,0.241961,0.02
|
8 |
+
1,Sister City Relationship,0.117647,0.187273,0.196863,0.013939
|
9 |
+
1,"A Sister City relationship is formed when the mayor or highest elected official (or, if elections",0.117255,0.211212,0.738824,0.013636
|
10 |
+
1,"do not take place, highest appointed official) from a U.S. community and a community in",0.117647,0.227273,0.70902,0.013939
|
11 |
+
1,another country or territory sign a formal agreement on behalf of their communities endorsing a,0.117647,0.243636,0.761961,0.013636
|
12 |
+
1,"""sister city/sister cities"" relationship. Sister city agreements shall be considered active/valid",0.118039,0.259697,0.731373,0.013939
|
13 |
+
1,unless otherwise indicated by one or both of the respective communities.,0.118039,0.276061,0.58549,0.013636
|
14 |
+
1,Sister Cities International shall formally recognize only those relationships by cities/members in,0.118039,0.299697,0.758824,0.013636
|
15 |
+
1,good standing (i.e. who are current on membership dues) in its Membership Directory or on its,0.117647,0.316061,0.754902,0.013636
|
16 |
+
1,"website. However, Sister Cities International shall not assert as invalid or otherwise impugn the",0.116863,0.332121,0.760784,0.013636
|
17 |
+
1,legitimacy of those relationships formed by non-members.,0.118039,0.348485,0.466275,0.013636
|
18 |
+
1,Friendship City,0.118039,0.372121,0.127059,0.013939
|
19 |
+
1,"A Friendship City or Friendship Cities relationship is often formed by cities as a ""stepping",0.117255,0.395758,0.714118,0.013636
|
20 |
+
1,"stone"" to a more formal ""Sister City"" agreement. Typically Friendship City agreements are",0.117647,0.411515,0.720392,0.014242
|
21 |
+
1,referred to as such in the formal documents that are signed. Sister Cities International shall,0.118039,0.428182,0.72549,0.013636
|
22 |
+
1,recognize Friendship City relationships by members in its Membership Directory and website.,0.118039,0.444242,0.747843,0.013636
|
23 |
+
1,As per Sister Cities International Board of Directors:,0.117255,0.467879,0.413333,0.013636
|
24 |
+
1,Sister Cities International will recognize a new sister cities affiliation between a,0.169412,0.492121,0.626667,0.013333
|
25 |
+
1,"U.S. and an international community, even though another affiliation may exist",0.169412,0.507879,0.625098,0.013636
|
26 |
+
1,"between that international community and a different U.S. community, only if a",0.169412,0.524545,0.62902,0.013636
|
27 |
+
1,cooperative agreement among all involved communities is filed with Sister Cities,0.16902,0.540606,0.643137,0.013636
|
28 |
+
1,"International. If a cooperative agreement is denied, or no response to the request",0.170196,0.556667,0.647843,0.013333
|
29 |
+
1,"is received within a reasonable amount of time, Sister Cities International will",0.169412,0.57303,0.612157,0.012727
|
30 |
+
1,recognize the partnership as a friendship city and it will be delineated as such,0.169412,0.589091,0.621176,0.013636
|
31 |
+
1,with a symbol in the membership directories.,0.168627,0.605455,0.358824,0.013333
|
32 |
+
1,The cooperative agreement must be sent by the Mayor/County,0.168627,0.628788,0.509412,0.013939
|
33 |
+
1,"Executive/Governor of the requesting community, and must be sent to the",0.169804,0.645152,0.595294,0.014242
|
34 |
+
1,Mayor/County Executive/Governor of each of the existing partnership,0.169804,0.661212,0.555294,0.013636
|
35 |
+
1,communities. Although the Mayor/County Executive/Governor may request input,0.16902,0.677879,0.647451,0.013636
|
36 |
+
1,"from, or may be given input by, the sister cities program, it is up to the discretion",0.168627,0.693939,0.647059,0.013939
|
37 |
+
1,of the Mayor/County Executive/Governor to sign the cooperative agreement.,0.16902,0.709697,0.612941,0.013939
|
38 |
+
1,Although Sister Cities International will help with the cooperative agreement,0.168627,0.726364,0.605882,0.013636
|
39 |
+
1,"process, it is up to the requesting community to get the agreement signed. Sister",0.169412,0.742121,0.650196,0.013939
|
40 |
+
1,"Cities International will not, in any way, force a community to ""share"" and sign",0.16902,0.758182,0.623922,0.014242
|
41 |
+
1,the cooperative agreement.,0.168627,0.774848,0.219216,0.013333
|
42 |
+
1,"To place a relationship into Emeritus status, the mayor or highest elected official of the U.S.",0.117255,0.798485,0.736471,0.013939
|
43 |
+
1,community must write a letter to the mayor of the foreign city indicating that they wish to,0.118039,0.814545,0.70902,0.013636
|
44 |
+
1,"remain sister cities, but understand that the relationship will remain inactive until such time as",0.118039,0.831212,0.747451,0.013333
|
45 |
+
1,both cities are able to sustain an active relationship. Sister Cities International should be,0.118039,0.847273,0.705098,0.013636
|
46 |
+
1,informed in writing by the mayor of the U.S. city of the situation. Sister Cities International will,0.118039,0.863333,0.746275,0.013636
|
47 |
+
2,Partnership Agreement,0.516078,0.027879,0.440784,0.032424
|
48 |
+
2,SisterCities,0.169804,0.033333,0.238824,0.028182
|
49 |
+
2,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788
|
50 |
+
2,Toolkit,0.83098,0.072727,0.127059,0.025455
|
51 |
+
2,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303
|
52 |
+
2,then place the partnership into Emeritus Status and will reflect this status in directories and all,0.117255,0.132424,0.751373,0.013333
|
53 |
+
2,lists of sister city programs.,0.118039,0.148788,0.218431,0.013333
|
54 |
+
2,"If a community wishes to terminate a sister city relationship, then a letter from the mayor or",0.118431,0.172424,0.732549,0.013333
|
55 |
+
2,highest elected official of the U.S. city should be sent to the mayor of the sister city. Sister,0.118039,0.188485,0.721569,0.013636
|
56 |
+
2,Cities International should be informed of this action in writing by the mayor of the U.S. city,0.118039,0.204848,0.72902,0.013333
|
57 |
+
2,and Sister Cities International will then remove the partnership from its directories and all lists,0.117647,0.221212,0.746275,0.013333
|
58 |
+
2,of sister city programs. We do not recommend terminating a relationship simply because it is,0.117647,0.237273,0.743529,0.013333
|
59 |
+
2,"dormant. Many partnerships wax and wane over the years, and in many cases a dormant",0.117647,0.253939,0.713333,0.013333
|
60 |
+
2,partnership may be reinvigorated by local members years after it has been inactive.,0.118039,0.269697,0.664314,0.013636
|
61 |
+
2,General Guidelines,0.118039,0.295152,0.231765,0.016061
|
62 |
+
2,In order for a sister city/county/state partnership to be recognized by Sister Cities International,0.118431,0.324242,0.754902,0.013636
|
63 |
+
2,"(SCI), the two communities must sign formal documents which clearly endorse the link. This",0.118039,0.340606,0.74,0.013636
|
64 |
+
2,presumes several key items: that the U.S. community is already a member of SCI and has,0.118039,0.35697,0.718039,0.013636
|
65 |
+
2,followed proper procedures (e.g. passed a city council resolution declaring the intent to twin,0.117255,0.373333,0.737647,0.013636
|
66 |
+
2,with the specific city); that both communities share a mutual commitment to the relationship;,0.117255,0.389394,0.740784,0.013636
|
67 |
+
2,and that both have secured the necessary support structure to build a lasting relationship. You,0.117647,0.405455,0.758039,0.013333
|
68 |
+
2,should check with your local sister city program to see if they have any additional requirements,0.117647,0.421818,0.760784,0.013636
|
69 |
+
2,before pursuing a sister city relationship.,0.118039,0.437879,0.323137,0.013636
|
70 |
+
2,"SCI often refers to these agreements as a ""Sister City Agreement"" or ""Memorandum of",0.118039,0.461515,0.696863,0.013939
|
71 |
+
2,"Understanding."" However, as the following examples show, the actual name and format of",0.118039,0.477576,0.729804,0.013636
|
72 |
+
2,your documents is left up to you.,0.117255,0.494242,0.262745,0.013636
|
73 |
+
2,A few things to keep in mind as you draft your agreement:,0.117255,0.517879,0.463137,0.013636
|
74 |
+
2,"Your agreement can range from the ceremonial, with language focusing on each city's",0.176471,0.542121,0.69098,0.013939
|
75 |
+
2,"commitment to fostering understanding, cooperation, and mutual benefit to the precise,",0.176471,0.558485,0.701961,0.013333
|
76 |
+
2,"with particular areas of interest, specific programs/activities, or more concrete goals",0.176078,0.574848,0.673725,0.013636
|
77 |
+
2,related to anything from numbers of exchanges to economic development.,0.176863,0.591212,0.596863,0.013636
|
78 |
+
2,"Don't try to include everything you plan to do. Some specifics, like particular areas of",0.177255,0.620303,0.681176,0.013939
|
79 |
+
2,"interest or participating institutions are good to include. However, there's no need to",0.176471,0.636667,0.675686,0.013636
|
80 |
+
2,include all the programs you plan to do if it makes the document too lengthy or limits,0.176863,0.652727,0.678824,0.013939
|
81 |
+
2,the scope of projects. This is a formal document to establish the relationship; specific,0.176078,0.668788,0.684706,0.013636
|
82 |
+
2,"tasks, responsibilities, or other nuts-and-bolts text related to implementation or",0.176078,0.685455,0.635686,0.013333
|
83 |
+
2,administration of the partnership can be expressed more fully in a separate,0.176471,0.701212,0.600392,0.013636
|
84 |
+
2,memorandum between the respective sister city committees. Your partnership,0.177255,0.717576,0.626667,0.013636
|
85 |
+
2,agreement is a historical document and should not be dated or limited by being aligned,0.176471,0.733636,0.699216,0.013636
|
86 |
+
2,with very specific tasks.,0.176078,0.750606,0.190196,0.013333
|
87 |
+
2,Work with your counterparts. Remember that this is signed by both cities. You should,0.176078,0.779697,0.68549,0.013636
|
88 |
+
2,share drafts of your agreement with your international partners and solicit feedback on,0.176471,0.795758,0.691765,0.013333
|
89 |
+
2,what they'd like to see in the agreement. Be flexible to cultural or municipal priorities.,0.176471,0.811818,0.679216,0.013939
|
90 |
+
2,Ask your counterparts to translate the agreement if it is drafted in English. It is,0.176078,0.841515,0.623137,0.013636
|
91 |
+
2,important for the citizens of your partner community to be able to read and understand,0.176863,0.857576,0.693725,0.013939
|
92 |
+
2,the commitment their city has made. Have someone in your own community who,0.176078,0.873939,0.649804,0.013636
|
93 |
+
3,Partnership Agreement,0.516078,0.027879,0.441176,0.032121
|
94 |
+
3,SisterCities,0.169804,0.033333,0.239216,0.028182
|
95 |
+
3,INTERNATIONAL,0.170196,0.06697,0.237255,0.008788
|
96 |
+
3,Toolkit,0.83098,0.07303,0.126667,0.025152
|
97 |
+
3,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303
|
98 |
+
3,speaks that language check the foreign-language version to make sure it mirrors what,0.176471,0.132424,0.688235,0.013333
|
99 |
+
3,you have in your own agreement.,0.176471,0.148788,0.264706,0.013333
|
100 |
+
3,Keep it to one page. Ceremonial documents such as these partnership agreements,0.176863,0.178485,0.66549,0.013636
|
101 |
+
3,work best if they can be posted in their entirety.,0.176078,0.194545,0.380392,0.013636
|
102 |
+
3,Most sister city agreements include some acknowledgement of the founding principles,0.177255,0.224242,0.694902,0.013636
|
103 |
+
3,"of the sister city movement- to promote peace through mutual respect, understanding,",0.176471,0.240303,0.698431,0.013333
|
104 |
+
3,and cooperation.,0.176471,0.25697,0.13451,0.013333
|
105 |
+
3,Consider using official letterhead and/or other embellishments such as city seals or,0.176863,0.286061,0.665882,0.013333
|
106 |
+
3,logos to reflect your enhance the document. Sister city agreements are often posted at,0.176863,0.302121,0.695686,0.013636
|
107 |
+
3,city hall or other municipal offices and should reflect their historical importance,0.176471,0.318485,0.630588,0.013333
|
108 |
+
3,Look at other agreements your city has signed. These agreements may give you an idea,0.177255,0.347879,0.705098,0.013636
|
109 |
+
3,"of what is acceptable or possible, and they may be in an easily replicable format. If you",0.176471,0.364242,0.695686,0.013636
|
110 |
+
3,"cannot access older agreements please contact Sister Cities International, we may",0.176863,0.380303,0.663137,0.013636
|
111 |
+
3,"have them on file, although we do not have copies of all partnership agreements.",0.176863,0.396667,0.64549,0.013636
|
112 |
+
3,Documents must be signed by the top elected official of both communities.,0.177255,0.426364,0.601569,0.013333
|
113 |
+
3,"Check with your mayor, city council, town clerk, et al. to make sure that the agreement",0.176863,0.455758,0.694118,0.013636
|
114 |
+
3,"is OK with them. The mayor is the one putting his or her name on the paper, and you",0.176863,0.471818,0.677255,0.013333
|
115 |
+
3,don't want to spend time developing an agreement which will never be signed.,0.176863,0.488182,0.629412,0.013636
|
116 |
+
3,Official documents are usually signed during a formal ceremony recognizing the,0.176863,0.517576,0.638431,0.013636
|
117 |
+
3,partnership. Be sure both communities receive a signed set of the official documents,0.177255,0.533939,0.683922,0.013636
|
118 |
+
3,for their records.,0.176078,0.550606,0.131373,0.010606
|
119 |
+
3,Remember to send your signed agreement to Sister Cities International. After we,0.177255,0.579697,0.645098,0.013636
|
120 |
+
3,receive your agreement we will post the relationship in the City Directory and make sure,0.176863,0.595758,0.703137,0.013636
|
121 |
+
3,it is included in our Annual Membership Directory.,0.176863,0.612121,0.398039,0.013333
|
122 |
+
3,Remember that each city's sister city program is independent and can impose requirements,0.118431,0.640606,0.736471,0.013939
|
123 |
+
3,"like the establishment of a committee, a review period, sustainability/funding plan, among",0.118039,0.65697,0.715686,0.013636
|
124 |
+
3,"others, before sanctioning a sister city agreement. Check with your local program or mayor's",0.117647,0.672727,0.743529,0.014242
|
125 |
+
3,office to see if this is the case.,0.117647,0.689091,0.241176,0.011515
|
126 |
+
3,On the following pages you'll find a series of partnership agreements to give you an idea of,0.118039,0.717879,0.728627,0.013939
|
127 |
+
3,"what is possible. While you should feel free to use some of the formatting and language, we",0.117255,0.734242,0.73451,0.013636
|
128 |
+
3,encourage you to make your agreement your own and be creative with what you produce. If,0.117647,0.750606,0.737647,0.013636
|
129 |
+
3,you are unsure about your agreement or want advice you can always solicit feedback by,0.117647,0.766667,0.708627,0.013636
|
130 |
+
3,sending it to our Membership Director at [email protected] or contacting us at (202),0.117647,0.782727,0.732157,0.013636
|
131 |
+
3,347-8630.,0.117647,0.799394,0.080392,0.010303
|
132 |
+
4,Partnership Agreement,0.516471,0.027879,0.440784,0.032727
|
133 |
+
4,SisterCities,0.169412,0.033333,0.239608,0.028485
|
134 |
+
4,INTERNATIONAL,0.170196,0.066667,0.238431,0.009091
|
135 |
+
4,Toolkit,0.830588,0.072727,0.127843,0.025758
|
136 |
+
4,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333
|
137 |
+
4,"jull bubzig 2000 3,312",0.378039,0.291212,0.32549,0.019394
|
138 |
+
4,ABU DHABI MUNICIPALITY & TOWN PLANNING,0.376471,0.316667,0.327451,0.016667
|
139 |
+
4,AN AGREEMENT FOR THE ESTABLISHMENT OF,0.260784,0.373636,0.52549,0.012727
|
140 |
+
4,SISTER CITIES RELATIONSHIP,0.337647,0.393636,0.342745,0.012121
|
141 |
+
4,BETWEEN,0.454902,0.413636,0.110588,0.011212
|
142 |
+
4,THE CITY OF ABU DHABI ( U. A.E),0.337255,0.432727,0.375686,0.013939
|
143 |
+
4,AND,0.487843,0.452727,0.048235,0.011212
|
144 |
+
4,"HOUSTON, TEXAS ( U.S.A)",0.385882,0.471515,0.298039,0.014848
|
145 |
+
4,"The Sister City Program, administered by Sister Cities International, was initiated",0.221961,0.525455,0.597255,0.01303
|
146 |
+
4,By the President of the United States of America in 1956 to encourage greater,0.222745,0.539394,0.561961,0.012727
|
147 |
+
4,Friendship and understanding between the United States and other nations through,0.222745,0.553333,0.608235,0.012727
|
148 |
+
4,Direct personal contact: and,0.222745,0.567576,0.20549,0.012424
|
149 |
+
4,"In order to foster those goals, the people of Abu Dhabi and Houston, in a gesture of",0.222353,0.594242,0.603529,0.012424
|
150 |
+
4,"Friendship and goodwill, agree to collaborate for the mutual benefit of their",0.222745,0.608182,0.547843,0.01303
|
151 |
+
4,"Communities by exploring education, economic and cultural opportunities.",0.222353,0.622121,0.541961,0.012121
|
152 |
+
4,"Abu Dhabi and Houston, sharing a common interest in energy, technology and",0.221569,0.648788,0.574118,0.012424
|
153 |
+
4,"medicine, and the desire to promote mutual understanding among our citizens do",0.222353,0.66303,0.588235,0.012121
|
154 |
+
4,"hereby proclaim themselves Sister Cities beginning on the 13th day of March 2001,",0.221961,0.673636,0.594118,0.015758
|
155 |
+
4,the date of Houston City Council resolution estatblishing the Sister City,0.221961,0.690303,0.519608,0.01303
|
156 |
+
4,relationship became effective.,0.221569,0.705152,0.217647,0.012424
|
157 |
+
4,"Signed on this 26 of October 2002, in duplicate in the Arabic and English",0.221569,0.732121,0.533333,0.01303
|
158 |
+
4,"Languages, both text being equally authentic.",0.221961,0.746667,0.328627,0.012727
|
159 |
+
4,A,0.344314,0.768485,0.084706,0.030303
|
160 |
+
4,Sheikh Mohammed bin Butti AI Hamed,0.245882,0.806364,0.366275,0.010909
|
161 |
+
4,Lee P.Brown,0.729412,0.806364,0.118824,0.010303
|
162 |
+
4,Mayor of Houston,0.704706,0.823333,0.166667,0.012424
|
163 |
+
4,Chairman of Abu Dhabi Municipality,0.24549,0.823636,0.342353,0.012727
|
164 |
+
4,&Town Planning,0.324314,0.841212,0.155686,0.012424
|
165 |
+
5,Partnership Agreement,0.516078,0.027879,0.441176,0.032424
|
166 |
+
5,SisterCities,0.169412,0.033333,0.239608,0.028485
|
167 |
+
5,INTERNATIONAL,0.17098,0.066667,0.237255,0.009091
|
168 |
+
5,Toolkit,0.83098,0.072727,0.127059,0.025758
|
169 |
+
5,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333
|
170 |
+
5,THE CITY OF NEW YORK,0.438824,0.262121,0.240784,0.009697
|
171 |
+
5,OFFICE OF THE MAYOR,0.450196,0.27697,0.220392,0.009697
|
172 |
+
5,"NEW YORK, N.Y. 10007",0.461176,0.29303,0.196863,0.010303
|
173 |
+
5,THE NEW YORK CITY-LONDON SISTER CITY PARTNERSHIP,0.267451,0.355758,0.582745,0.011818
|
174 |
+
5,Memorandum of Understanding,0.420392,0.371212,0.274902,0.013333
|
175 |
+
5,The Sister City partnership between New York City and London will foster mutually,0.201176,0.402121,0.674118,0.014242
|
176 |
+
5,beneficial solutions to common challenges for these two great cosmopolitan entities.,0.201176,0.417273,0.66902,0.013636
|
177 |
+
5,"Consequently, the Sister City relationship between the two will be one of the most",0.201176,0.432727,0.652549,0.015152
|
178 |
+
5,"important in their network of global partnerships, as it strives to:",0.201176,0.448182,0.50902,0.015455
|
179 |
+
5,Encourage and publicize existing exchanges between London and New York City so,0.230588,0.480303,0.671373,0.015152
|
180 |
+
5,that they can flourish to benefit a wider cross-section of the citizens of both;,0.230588,0.496061,0.602353,0.015152
|
181 |
+
5,"Support and promote the development of new social, economic, academic and",0.230196,0.512424,0.618431,0.015455
|
182 |
+
5,community programs to encourage both cities' citizens to share their experiences as a,0.229804,0.527879,0.678039,0.014848
|
183 |
+
5,medium for learning from one another;,0.229804,0.543636,0.309412,0.013939
|
184 |
+
5,Generate an improvement of the operation of the cities' various government agencies,0.229804,0.56,0.676078,0.014545
|
185 |
+
5,by serving as a conduit of information;,0.22902,0.575758,0.307843,0.014848
|
186 |
+
5,"Identify themes, common to both, that can generate new initiatives to further and",0.229412,0.591818,0.640784,0.015152
|
187 |
+
5,"nurture the increasingly powerful financial, social and cultural relationships between",0.22902,0.607576,0.671373,0.014242
|
188 |
+
5,the cities;,0.22902,0.624545,0.076471,0.012424
|
189 |
+
5,Promote key mayoral priorities relevant to both London and New York City;,0.228627,0.639394,0.608627,0.015152
|
190 |
+
5,Provide financial or in kind support to community-led programs that advance the,0.228627,0.656061,0.641569,0.013636
|
191 |
+
5,aims of the Sister City partnership;,0.22902,0.672121,0.275294,0.013636
|
192 |
+
5,"With the above purposes in mind, the Mayor of the City of New York and the Mayor of",0.198824,0.702424,0.697647,0.014848
|
193 |
+
5,London solemnly confirm that these two cities are united by an official partnership by the,0.198824,0.718182,0.710196,0.014545
|
194 |
+
5,protocol of this Memorandum of Understanding.,0.198431,0.733939,0.384314,0.015152
|
195 |
+
5,This agreement will go into effect from the date of signatures.,0.310196,0.780606,0.488235,0.014545
|
196 |
+
5,Thedder Rudolph W. Giuliani,0.178824,0.795455,0.244314,0.100909
|
197 |
+
5,Signed in March of 2001,0.455686,0.796364,0.19451,0.013636
|
198 |
+
5,Ken Mayor Livingstone,0.672157,0.877576,0.132941,0.029091
|
199 |
+
5,Mayor,0.311373,0.894848,0.053333,0.012727
|
200 |
+
5,New York City,0.287843,0.909091,0.121176,0.013333
|
201 |
+
5,London,0.701961,0.909091,0.061569,0.010606
|
202 |
+
6,Partnership Agreement,0.515686,0.027576,0.441961,0.03303
|
203 |
+
6,SisterCities,0.169412,0.03303,0.24,0.028182
|
204 |
+
6,INTERNATIONAL,0.169804,0.066667,0.238431,0.009091
|
205 |
+
6,Toolkit,0.83098,0.072727,0.127451,0.025758
|
206 |
+
6,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333
|
207 |
+
6,CHIC OF STATE,0.247451,0.190606,0.141961,0.036364
|
208 |
+
6,City of Long Beach,0.388627,0.196667,0.476471,0.066364
|
209 |
+
6,California,0.551373,0.257273,0.136471,0.033333
|
210 |
+
6,Sister City Agreement,0.321961,0.305455,0.378431,0.035152
|
211 |
+
6,between the,0.464706,0.352727,0.084314,0.009697
|
212 |
+
6,City of Long Beach,0.38,0.378485,0.252549,0.01697
|
213 |
+
6,"California, USA",0.4,0.397576,0.21098,0.016061
|
214 |
+
6,and the,0.48,0.415152,0.053333,0.009091
|
215 |
+
6,City of San Pablo de Manta,0.321569,0.428788,0.369804,0.01697
|
216 |
+
6,"Ecuador, South America",0.347451,0.447879,0.317255,0.015152
|
217 |
+
6,"In accordance with the authorization and approval expressed by the City of Long Beach,",0.261569,0.482121,0.536863,0.012121
|
218 |
+
6,"California, USA, and the City of San Pablo de Manta, Ecundor, South America, it is declared",0.217647,0.492727,0.581176,0.01303
|
219 |
+
6,"that a ""Sister City Agreement between the two cities is hereby established for the following",0.217647,0.502727,0.581569,0.012121
|
220 |
+
6,purposes:,0.216863,0.516061,0.058039,0.009394
|
221 |
+
6,(1) to promote and expand the effective and mutually beneficial cooperation between,0.278824,0.532727,0.520392,0.012424
|
222 |
+
6,the people of Long Beach and the people of San Pablo de Manta; and,0.218039,0.543636,0.40549,0.012424
|
223 |
+
6,"(2) to promote international goodwill, understanding, and expanded business",0.279216,0.56303,0.520784,0.012424
|
224 |
+
6,"relations between the two cities and their respective nations by the exchange of people, ideas, and",0.218039,0.573636,0.581569,0.012121
|
225 |
+
6,"information in a unide variety of economic, social, cultural, municipal, environmental,",0.218039,0.584242,0.581176,0.012121
|
226 |
+
6,"professional, technical, youth, and other endeavors; and",0.217647,0.594848,0.333333,0.012121
|
227 |
+
6,"(3) to foster and encourage charitable, scientific, trade and commerce, literary and",0.279608,0.613939,0.520784,0.012727
|
228 |
+
6,educational activities between the two cities;,0.218039,0.625455,0.265882,0.009697
|
229 |
+
6,This Sister City Agreement shall be officially established and shall become effective when,0.263137,0.644545,0.536863,0.012727
|
230 |
+
6,"this document has been duly executed by the Mayor of Long Beach, California, USA, and the",0.218824,0.654848,0.581961,0.012424
|
231 |
+
6,"Mayor of San Pablo de Manta, Ecundor, South America.",0.218431,0.665758,0.338824,0.012121
|
232 |
+
6,STATE OFFICE,0.276471,0.713636,0.050588,0.048788
|
233 |
+
6,Beverly 0 Neill,0.587451,0.736667,0.121961,0.013636
|
234 |
+
6,"Mayor, City of Long Beach",0.542353,0.751212,0.21098,0.013636
|
235 |
+
6,"California, USA",0.582745,0.765758,0.125098,0.01303
|
236 |
+
6,10.2aulus,0.490588,0.771818,0.220392,0.062424
|
237 |
+
6,Ing. Jorge O. Zambrano Cedeño,0.527059,0.825152,0.242745,0.013333
|
238 |
+
6,"Mayor, City of San Pablo de Manta",0.505098,0.839394,0.277647,0.013636
|
239 |
+
6,"Ecuador, South America",0.551765,0.854242,0.188235,0.011818
|
240 |
+
6,"Dated: September 19, 2000",0.544706,0.883333,0.202745,0.01303
|
241 |
+
7,Partnership Agreement,0.516078,0.027879,0.441176,0.032424
|
242 |
+
7,SisterCities,0.169412,0.03303,0.24,0.028485
|
243 |
+
7,INTERNATIONAL,0.170196,0.066667,0.237647,0.009091
|
244 |
+
7,Toolkit,0.83098,0.072727,0.127451,0.025758
|
245 |
+
7,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333
|
246 |
+
7,REAFFIRMATION OF SISTER CITIES DECLARATION,0.324706,0.165152,0.483529,0.013939
|
247 |
+
7,adopted by,0.2,0.213333,0.080392,0.013636
|
248 |
+
7,THE HONORABLE RICHARD M. DALEY,0.396078,0.214242,0.335686,0.012424
|
249 |
+
7,MAYOR OF CHICAGO,0.472549,0.231212,0.18549,0.011515
|
250 |
+
7,and,0.199608,0.260909,0.026275,0.010606
|
251 |
+
7,THE HONORABLE ZHANG RONGMAO,0.401961,0.261212,0.323137,0.011212
|
252 |
+
7,MAYOR OF SHENYANG,0.463529,0.273636,0.202353,0.011212
|
253 |
+
7,ON,0.551765,0.298182,0.026667,0.011515
|
254 |
+
7,"JUNE 5, 1995",0.500392,0.323636,0.128235,0.014848
|
255 |
+
7,"On this the tenth anniversary of the signing of a sister city agreement, in order to further",0.255686,0.36303,0.67098,0.015152
|
256 |
+
7,the traditional links of friendship between Chicago and Shenyang and to reaffirm their mutual,0.198824,0.378788,0.727843,0.015455
|
257 |
+
7,"aspiration to work in unison for the benefit of their cities and nations, the Honorable Mayor",0.199608,0.394848,0.727843,0.014848
|
258 |
+
7,"Richard M. Daley, Mayor of the City of Chicago, and the Honorable Zhang Rongmao, Mayor",0.199216,0.411212,0.727451,0.014242
|
259 |
+
7,"of the City of Shenyang, on this fifth day of June 1995, do hereby acknowledge and reaffirm the",0.199216,0.42697,0.72549,0.014848
|
260 |
+
7,sister cities agreement between the City of Chicago and the City of Shenyang.,0.199608,0.443636,0.57451,0.014242
|
261 |
+
7,"The City of Chicago and the City of Shenyang on the basis of friendly cooperation,",0.256078,0.473939,0.665098,0.015152
|
262 |
+
7,equality and mutual benefit will continue to develop a sister cities relationship to promote and,0.2,0.490303,0.724706,0.014242
|
263 |
+
7,broaden economic cooperation and cultural exchanges between the two cities.,0.199216,0.506061,0.57451,0.014242
|
264 |
+
7,The two cities do hereby declare their interest in exploring the establishment of business,0.255294,0.537273,0.668235,0.015455
|
265 |
+
7,and trade relations between Chicago and Shenyang.,0.198824,0.554545,0.387843,0.013636
|
266 |
+
7,"In addition, exchanges will be promoted in the area of the arts such as exhibits, music,",0.254118,0.583939,0.666667,0.015455
|
267 |
+
7,dance and other cultural activities.,0.198431,0.601212,0.256471,0.010606
|
268 |
+
7,"In addition, exchanges will be promoted in education and the establishment of contacts",0.254118,0.630303,0.668627,0.015758
|
269 |
+
7,within educational institutions encouraged.,0.198824,0.647273,0.32,0.014242
|
270 |
+
7,"In addition, we declare our intention to promote exchanges in such fields as science and",0.253725,0.678182,0.668627,0.014848
|
271 |
+
7,"technology, sports, health, youth and any areas that will contribute to the prosperity and the",0.198039,0.693636,0.722745,0.015152
|
272 |
+
7,further development of friendship between the people of our two cities.,0.194902,0.711515,0.525098,0.013636
|
273 |
+
7,3h.5.,0.593725,0.750606,0.218039,0.06303
|
274 |
+
7,THE HONORABLE ZHANG RONGMAO,0.588627,0.819394,0.287843,0.011818
|
275 |
+
7,THE HONORABLE RICHARD M. DALEY,0.197255,0.821515,0.303529,0.010606
|
276 |
+
7,MAYOR OF SHENYANG,0.587451,0.835455,0.177647,0.010303
|
277 |
+
7,MAYOR OF CHICAGO,0.195686,0.835758,0.164706,0.010606
|
example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv
ADDED
@@ -0,0 +1,77 @@
1 |
+
image,page,label,color,xmin,ymin,xmax,ymax,id,text
|
2 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.598431,0.524545,0.63098,0.535455,EG3nykuwvxbk,U.S.
|
3 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.820392,0.798485,0.854118,0.809394,jy1R42e6phNz,U.S.
|
4 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.433333,0.863333,0.46549,0.873939,9sbrsroLfZy0,U.S.
|
5 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_1.png,2,ADDRESS,"(0, 0, 0)",0.354118,0.188788,0.386275,0.199697,k7bWBsQQchJZ,U.S.
|
6 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_1.png,2,ADDRESS,"(0, 0, 0)",0.780392,0.204848,0.812941,0.215758,peo6UqIxrjmR,U.S.
|
7 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,EMAIL,"(0, 0, 0)",0.447843,0.78303,0.648627,0.796667,DIfz0LenOtQv,[email protected]
|
8 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,PHONE,"(0, 0, 0)",0.809804,0.78303,0.850196,0.796667,odJdySe9XrAn,(202)
|
9 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,PHONE,"(0, 0, 0)",0.117647,0.799394,0.198431,0.809697,iURSkUM7BbUG,347-8630
|
10 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.637647,0.432727,0.712941,0.44697,fRxAD9qm856s,U. A.E
|
11 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.489412,0.43303,0.614902,0.444545,qzRFPlNbslpH,ABU DHABI
|
12 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.385882,0.472121,0.593725,0.486364,v1uLbGsofN1f,"HOUSTON, TEXAS"
|
13 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.392549,0.539697,0.573725,0.549394,MvbPQiHvSdL7,United States of America
|
14 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.539216,0.553333,0.635686,0.563333,05U3cgj5w9PY,United States
|
15 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.534902,0.594242,0.615294,0.603939,uHMikyBlMq5f,Abu Dhabi
|
16 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.651373,0.594242,0.717255,0.605455,XNUE0GopIBaf,Houston
|
17 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.221569,0.65,0.301176,0.659697,6FjbNu2CGA9n,Abu Dhabi
|
18 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.337647,0.65,0.404314,0.660606,Yvmm2225ityu,Houston
|
19 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,HANDWRITING,"(0, 0, 0)",0.344314,0.768485,0.42902,0.798788,EwTcqq7PENU8,A
|
20 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.245882,0.806364,0.612549,0.817576,Mj4gqwbgsZWp,Sheikh Mohammed bin Butti AI Hamed
|
21 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.52,0.806364,0.612549,0.81697,RXYOVgLwq8Ke,AI Hamed
|
22 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.729412,0.806364,0.848235,0.816667,REPZhwFWGoTc,Lee P.Brown
|
23 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.245882,0.806667,0.51451,0.817576,rFdxMRFRWLRJ,Sheikh Mohammed bin Butti
|
24 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.366667,0.823939,0.465098,0.834242,5iYCxRGdPG1i,Abu Dhabi
|
25 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.577647,0.262121,0.68,0.271515,3ZR43H3yYNdy,NEW YORK
|
26 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.461176,0.29303,0.555294,0.303333,WNoitmR9A6lu,NEW YORK
|
27 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.461176,0.29303,0.658039,0.303333,HjrhxMQhovlF,NEW YORK N.Y. 10007
|
28 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.563137,0.29303,0.658039,0.302121,nPN7g7UcnX4u,N.Y. 10007
|
29 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.314118,0.356667,0.42549,0.367576,ZoJf29CB3Wrq,NEW YORK
|
30 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.655294,0.480909,0.718431,0.491515,iezAqmD2ilnb,London
|
31 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.708627,0.639394,0.837255,0.652727,tWAuJEQVpfhi,New York City
|
32 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.60902,0.64,0.67098,0.650606,NaW3mmmlhMW9,London
|
33 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.667059,0.702727,0.751373,0.713636,pgMiwuMiBp8B,New York
|
34 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.198824,0.720303,0.261569,0.731212,fPvElSFZFRoL,London
|
35 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,HANDWRITING,"(0, 0, 0)",0.178824,0.795455,0.281961,0.896364,DfniF7P2bXAw,Thedder
|
36 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME,"(0, 0, 0)",0.178824,0.795455,0.423529,0.896364,QwnWsAeslO5f,Thedder Rudolph W. Giuliani
|
37 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME - ADDRESS,"(0, 0, 0)",0.672157,0.877576,0.80549,0.891212,Vdp95SShYOEO,Ken Livingstone
|
38 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.710196,0.877576,0.80549,0.891212,H5DGqsucPAjc,Livingstone
|
39 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME,"(0, 0, 0)",0.672157,0.877879,0.705098,0.888182,qotGtnMbhAJr,Ken
|
40 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.287843,0.909091,0.40902,0.922727,sFX0tNJJzpE5,New York City
|
41 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.701961,0.909091,0.763922,0.919697,2xFbVTbxiOhC,London
|
42 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.55451,0.203636,0.86549,0.258485,Nfe3WTBembGQ,Long Beach
|
43 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.551373,0.257273,0.687843,0.290606,kndQY5X4itc8,California
|
44 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.558824,0.397879,0.611373,0.410303,B5vq8yhWLeOg,USA
|
45 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425882,0.429091,0.691373,0.441818,OtNgqUkoEaZb,San Pablo de Manta
|
46 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.347451,0.447879,0.665098,0.46303,Q52VzBx2SWNF,"Ecuador, South America"
|
47 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.724314,0.482121,0.798431,0.493939,O7gd9ywvKsKh,"Long Beach,"
|
48 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425098,0.49303,0.506275,0.502727,DzYr3xrM8Tvv,San Pablo de
|
49 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425098,0.49303,0.715294,0.50303,iZ0knpQD54UU,"San Pablo de Manta, Ecundor, South America"
|
50 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.509804,0.49303,0.715294,0.50303,pZnYGzr7Pwsl,"Manta, Ecundor, South America"
|
51 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.217647,0.493333,0.321961,0.504242,r7Aar8FNQF6D,"California, USA"
|
52 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.471765,0.543636,0.596863,0.553939,zg9uBDlSuuA1,San Pablo de Manta
|
53 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.295294,0.544242,0.36549,0.556061,A0OY6RjMEocW,Long Beach
|
54 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.563137,0.655152,0.748627,0.667576,HQlTdEUhOCgI,"Long Beach, California, USA"
|
55 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.463529,0.665758,0.557255,0.674848,bCN9b7kJw0Ik,South America
|
56 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.277647,0.666061,0.403529,0.676061,qffN3bDgWRMk,San Pablo de Manta
|
57 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.587451,0.736667,0.709804,0.750303,eqMENFw5mbnL,Beverly 0 Neill
|
58 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.663137,0.751212,0.753333,0.764545,POqPQVBCES8h,Long Beach
|
59 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.582745,0.765758,0.708235,0.779091,mjrjsSMOxwaY,"California, USA"
|
60 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,HANDWRITING,"(0, 0, 0)",0.490588,0.771818,0.71098,0.834242,xL8dSawihWuY,10.2aulus
|
61 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,NAME,"(0, 0, 0)",0.559608,0.825152,0.769804,0.838485,fHyvwmbOgLMJ,Jorge O. Zambrano Cedeño
|
62 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.624314,0.839394,0.782745,0.850303,zGhskyehufSv,San Pablo de Manta
|
63 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.551765,0.854242,0.74,0.866061,dSPXmtb8M4nt,"Ecuador, South America"
|
64 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.556471,0.215152,0.731765,0.226667,BEhuvaI5BVaR,RICHARD M. DALEY
|
65 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.563137,0.261212,0.725098,0.272424,coo8KK7q6A72,ZHANG RONGMAO
|
66 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.566275,0.273636,0.666275,0.285152,0P9rVSbeNdB4,SHENYANG
|
67 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.526667,0.380303,0.588235,0.394242,1GDArufutI5y,Chicago
|
68 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.628235,0.380606,0.702353,0.394242,QyD751r4fCU1,Shenyang
|
69 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.736863,0.411515,0.868235,0.424545,rntIekANI8BO,Zhang Rongmao
|
70 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.199216,0.411818,0.34,0.424848,96TaHazXGIM7,Richard M. Daley
|
71 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.514902,0.412424,0.580784,0.425758,kbyVj6qhZSPi,Chicago
|
72 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.696471,0.443939,0.774118,0.45697,rJpaMvepsNln,Shenyang
|
73 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.353725,0.474545,0.415686,0.489091,PokCVpLQmDki,Chicago
|
74 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.407451,0.554545,0.469804,0.568182,HqVr414KRg59,Chicago
|
75 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,HANDWRITING,"(0, 0, 0)",0.593725,0.750606,0.811765,0.813636,xdawEv0DUH6P,3h.5.
|
76 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.730196,0.819394,0.876471,0.830606,Gghr7ccN6lS2,ZHANG RONGMAO
|
77 |
+
C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.34,0.821515,0.501176,0.831515,vOMIv1RS5Sag,RICHARD M. DALEY
|
example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_results_with_words_textract.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv
ADDED
@@ -0,0 +1,40 @@
1 |
+
page,text,left,top,width,height,line
|
2 |
+
1,Example of emails sent to a professor before applying:,0.147059,0.093434,0.426471,0.013889,1
|
3 |
+
1,Fwd: Prospective Graduate Student,0.145425,0.128788,0.277778,0.013889,2
|
4 |
+
1,"Dr. Kornbluth,",0.147059,0.162879,0.114379,0.012626,3
|
5 |
+
1,I am a senior biology major at the University of Notre Dame. I am applying to the CMB,0.147059,0.198232,0.689542,0.013889,4
|
6 |
+
1,program and am very interested in your work. After glancing at a few of your recent,0.145425,0.214646,0.660131,0.013889,5
|
7 |
+
1,papers and your research summary I find your work with apoptosis very interesting. Will,0.145425,0.232323,0.697712,0.013889,6
|
8 |
+
1,"you be taking on new students next year? If I am invited to interview, is there any way",0.145425,0.25,0.683007,0.013889,7
|
9 |
+
1,you will be able to meet with me?,0.145425,0.267677,0.264706,0.013889,8
|
10 |
+
1,I have worked on several different research projects as an undergraduate in Dr. David R.,0.147059,0.30303,0.69281,0.013889,9
|
11 |
+
1,Hyde's lab at the University of Notre Dame. The Hyde lab is interested in the signals that,0.147059,0.320707,0.697712,0.013889,10
|
12 |
+
1,initiate Muller glia division post-light damage. My first research project was,0.147059,0.338384,0.598039,0.013889,11
|
13 |
+
1,characterizing the role of leukemia inhibitory factor (LIF) in the activation of cell,0.147059,0.354798,0.637255,0.013889,12
|
14 |
+
1,proliferation in the undamaged zebrafish retina. I am also working on several,0.145425,0.372475,0.604575,0.013889,13
|
15 |
+
1,experiments that are related to a genetic screen that the Hyde lab plans on performing to,0.145425,0.390152,0.689542,0.013889,14
|
16 |
+
1,identify mutants in the regeneration pathway--I am developing a neuroD4:EGFP,0.147059,0.407828,0.635621,0.013889,15
|
17 |
+
1,transgenic line for use in this screen and I am characterizing the extent of damage and,0.145425,0.425505,0.673203,0.013889,16
|
18 |
+
1,"regeneration in sheer zebrafish retinas. Finally, I am characterizing the chx10:EGFP",0.145425,0.443182,0.661765,0.013889,17
|
19 |
+
1,transgenic line during retinal development and regeneration.,0.145425,0.459596,0.472222,0.013889,18
|
20 |
+
1,Please find my CV attached.,0.145425,0.496212,0.222222,0.013889,19
|
21 |
+
1,"Thank you for your time,",0.145425,0.531566,0.196078,0.013889,20
|
22 |
+
1,--Lauren Lilley,0.147059,0.566919,0.119281,0.013889,21
|
23 |
+
1,"Dr. Poss,",0.145425,0.637626,0.070261,0.012626,22
|
24 |
+
1,I am a senior biology major at the University of Notre Dame. I am applying to your,0.145425,0.671717,0.655229,0.013889,23
|
25 |
+
1,graduate program and am very interested in your work. After glancing at a few of your,0.145425,0.689394,0.679739,0.013889,24
|
26 |
+
1,recent papers and your research summary I find your research greatly coincides with my,0.145425,0.707071,0.69281,0.013889,25
|
27 |
+
1,research experiences and interests. Will you be taking on new students next year?,0.145425,0.723485,0.643791,0.015152,26
|
28 |
+
1,I have worked on several different research projects as an undergraduate in Dr. David R.,0.145425,0.760101,0.69281,0.013889,27
|
29 |
+
1,Hyde's lab at the University of Notre Dame. The Hyde lab is interested in the signals that,0.145425,0.777778,0.699346,0.013889,28
|
30 |
+
1,initiate Muller glia division post-light damage. My first research project was,0.145425,0.795455,0.598039,0.013889,29
|
31 |
+
1,characterizing the role of leukemia inhibitory factor (LIF) in the activation of cell,0.145425,0.811869,0.638889,0.013889,30
|
32 |
+
1,proliferation in the undamaged zebrafish retina. I am also working on several,0.145425,0.829545,0.604575,0.013889,31
|
33 |
+
1,experiments that are related to a genetic screen that the Hyde lab plans on performing to,0.145425,0.847222,0.691176,0.013889,32
|
34 |
+
1,identify mutants in the regeneration pathway--I am developing a neuroD4:EGFP,0.145425,0.864899,0.635621,0.013889,33
|
35 |
+
1,transgenic line for use in this screen and I am characterizing the extent of damage and,0.145425,0.881313,0.673203,0.013889,34
|
36 |
+
2,"regeneration in sheer zebrafish retinas. Finally, I am characterizing the chx10:EGFP",0.145425,0.093434,0.661765,0.013889,1
|
37 |
+
2,transgenic line during retinal development and regeneration.,0.145425,0.111111,0.472222,0.013889,2
|
38 |
+
2,Please find my CV attached.,0.145425,0.146465,0.222222,0.013889,3
|
39 |
+
2,"Thank you for your time,",0.145425,0.181818,0.196078,0.013889,4
|
40 |
+
2,--Lauren Lilley,0.147059,0.218434,0.119281,0.013889,5
|
example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_results_with_words_textract.csv
ADDED
@@ -0,0 +1,432 @@
1 |
+
page,line,word_text,word_x0,word_y0,word_x1,word_y1,line_text,line_x0,line_y0,line_x1,line_y1
|
2 |
+
1,1,Example,0.147059,0.093434,0.215686,0.107323,,,,,
|
3 |
+
1,1,of,0.220588,0.093434,0.240196,0.104798,,,,,
|
4 |
+
1,1,emails,0.24183,0.093434,0.292484,0.104798,,,,,
|
5 |
+
1,1,sent,0.297386,0.094697,0.330065,0.104798,,,,,
|
6 |
+
1,1,to,0.334967,0.094697,0.349673,0.104798,,,,,
|
7 |
+
1,1,a,0.354575,0.097222,0.362745,0.104798,,,,,
|
8 |
+
1,1,professor,0.367647,0.093434,0.441176,0.108586,,,,,
|
9 |
+
1,1,before,0.446078,0.093434,0.496732,0.104798,,,,,
|
10 |
+
1,1,applying:,0.501634,0.093434,0.573529,0.107323,,,,,
|
11 |
+
1,2,Fwd:,0.145425,0.128788,0.184641,0.140152,,,,,
|
12 |
+
1,2,Prospective,0.191176,0.128788,0.28268,0.142677,,,,,
|
13 |
+
1,2,Graduate,0.287582,0.128788,0.359477,0.140152,,,,,
|
14 |
+
1,2,Student,0.364379,0.128788,0.424837,0.140152,,,,,
|
15 |
+
1,3,Dr.,0.147059,0.162879,0.171569,0.174242,,,,,
|
16 |
+
1,3,"Kornbluth,",0.176471,0.162879,0.261438,0.176768,,,,,
|
17 |
+
1,4,I,0.147059,0.198232,0.153595,0.209596,,,,,
|
18 |
+
1,4,am,0.158497,0.200758,0.181373,0.209596,,,,,
|
19 |
+
1,4,a,0.186275,0.20202,0.194444,0.209596,,,,,
|
20 |
+
1,4,senior,0.199346,0.198232,0.248366,0.209596,,,,,
|
21 |
+
1,4,biology,0.253268,0.198232,0.312092,0.212121,,,,,
|
22 |
+
1,4,major,0.316993,0.198232,0.364379,0.212121,,,,,
|
23 |
+
1,4,at,0.367647,0.199495,0.382353,0.209596,,,,,
|
24 |
+
1,4,the,0.387255,0.198232,0.411765,0.209596,,,,,
|
25 |
+
1,4,University,0.416667,0.198232,0.5,0.212121,,,,,
|
26 |
+
1,4,of,0.504902,0.198232,0.522876,0.209596,,,,,
|
27 |
+
1,4,Notre,0.52451,0.198232,0.570261,0.209596,,,,,
|
28 |
+
1,4,Dame.,0.575163,0.198232,0.625817,0.209596,,,,,
|
29 |
+
1,4,I,0.632353,0.198232,0.637255,0.209596,,,,,
|
30 |
+
1,4,am,0.643791,0.200758,0.666667,0.209596,,,,,
|
31 |
+
1,4,applying,0.671569,0.198232,0.740196,0.212121,,,,,
|
32 |
+
1,4,to,0.745098,0.199495,0.759804,0.209596,,,,,
|
33 |
+
1,4,the,0.764706,0.198232,0.789216,0.209596,,,,,
|
34 |
+
1,4,CMB,0.794118,0.198232,0.836601,0.209596,,,,,
|
35 |
+
1,5,program,0.145425,0.218434,0.212418,0.229798,,,,,
|
36 |
+
1,5,and,0.21732,0.215909,0.245098,0.227273,,,,,
|
37 |
+
1,5,am,0.25,0.218434,0.27451,0.227273,,,,,
|
38 |
+
1,5,very,0.279412,0.218434,0.313725,0.229798,,,,,
|
39 |
+
1,5,interested,0.320261,0.214646,0.395425,0.22601,,,,,
|
40 |
+
1,5,in,0.400327,0.214646,0.416667,0.22601,,,,,
|
41 |
+
1,5,your,0.419935,0.218434,0.457516,0.229798,,,,,
|
42 |
+
1,5,work.,0.460784,0.214646,0.506536,0.227273,,,,,
|
43 |
+
1,5,After,0.511438,0.214646,0.553922,0.227273,,,,,
|
44 |
+
1,5,glancing,0.55719,0.215909,0.625817,0.229798,,,,,
|
45 |
+
1,5,at,0.630719,0.217172,0.645425,0.227273,,,,,
|
46 |
+
1,5,a,0.650327,0.218434,0.658497,0.227273,,,,,
|
47 |
+
1,5,few,0.663399,0.214646,0.69281,0.22601,,,,,
|
48 |
+
1,5,of,0.697712,0.214646,0.715686,0.227273,,,,,
|
49 |
+
1,5,your,0.718954,0.218434,0.754902,0.229798,,,,,
|
50 |
+
1,5,recent,0.759804,0.217172,0.80719,0.22601,,,,,
|
51 |
+
1,6,papers,0.145425,0.236111,0.197712,0.247475,,,,,
|
52 |
+
1,6,and,0.202614,0.232323,0.230392,0.243687,,,,,
|
53 |
+
1,6,your,0.235294,0.236111,0.271242,0.247475,,,,,
|
54 |
+
1,6,research,0.276144,0.232323,0.341503,0.243687,,,,,
|
55 |
+
1,6,summary,0.346405,0.236111,0.419935,0.247475,,,,,
|
56 |
+
1,6,I,0.424837,0.232323,0.431373,0.243687,,,,,
|
57 |
+
1,6,find,0.436275,0.232323,0.46732,0.243687,,,,,
|
58 |
+
1,6,your,0.472222,0.236111,0.50817,0.247475,,,,,
|
59 |
+
1,6,work,0.513072,0.232323,0.553922,0.243687,,,,,
|
60 |
+
1,6,with,0.558824,0.232323,0.593137,0.243687,,,,,
|
61 |
+
1,6,apoptosis,0.598039,0.233586,0.671569,0.247475,,,,,
|
62 |
+
1,6,very,0.678105,0.236111,0.712418,0.247475,,,,,
|
63 |
+
1,6,interesting.,0.71732,0.232323,0.803922,0.247475,,,,,
|
64 |
+
1,6,Will,0.810458,0.232323,0.844771,0.243687,,,,,
|
65 |
+
1,7,you,0.145425,0.253788,0.174837,0.263889,,,,,
|
66 |
+
1,7,be,0.179739,0.25,0.199346,0.261364,,,,,
|
67 |
+
1,7,taking,0.204248,0.25,0.253268,0.265152,,,,,
|
68 |
+
1,7,on,0.25817,0.253788,0.277778,0.261364,,,,,
|
69 |
+
1,7,new,0.28268,0.253788,0.315359,0.261364,,,,,
|
70 |
+
1,7,students,0.320261,0.25,0.383987,0.261364,,,,,
|
71 |
+
1,7,next,0.388889,0.251263,0.423203,0.261364,,,,,
|
72 |
+
1,7,year?,0.428105,0.25,0.470588,0.263889,,,,,
|
73 |
+
1,7,If,0.480392,0.25,0.495098,0.261364,,,,,
|
74 |
+
1,7,I,0.498366,0.25,0.504902,0.261364,,,,,
|
75 |
+
1,7,am,0.509804,0.253788,0.534314,0.261364,,,,,
|
76 |
+
1,7,invited,0.539216,0.25,0.593137,0.261364,,,,,
|
77 |
+
1,7,to,0.598039,0.251263,0.612745,0.261364,,,,,
|
78 |
+
1,7,"interview,",0.617647,0.25,0.696078,0.263889,,,,,
|
79 |
+
1,7,is,0.702614,0.25,0.714052,0.261364,,,,,
|
80 |
+
1,7,there,0.718954,0.25,0.759804,0.261364,,,,,
|
81 |
+
1,7,any,0.763072,0.253788,0.792484,0.263889,,,,,
|
82 |
+
1,7,way,0.797386,0.253788,0.830065,0.263889,,,,,
|
83 |
+
1,8,you,0.145425,0.271465,0.176471,0.281566,,,,,
|
84 |
+
1,8,will,0.179739,0.267677,0.210784,0.27904,,,,,
|
85 |
+
1,8,be,0.215686,0.267677,0.235294,0.27904,,,,,
|
86 |
+
1,8,able,0.238562,0.267677,0.272876,0.27904,,,,,
|
87 |
+
1,8,to,0.276144,0.268939,0.292484,0.27904,,,,,
|
88 |
+
1,8,meet,0.297386,0.268939,0.334967,0.27904,,,,,
|
89 |
+
1,8,with,0.339869,0.267677,0.375817,0.27904,,,,,
|
90 |
+
1,8,me?,0.380719,0.267677,0.411765,0.27904,,,,,
|
91 |
+
1,9,I,0.147059,0.30303,0.151961,0.314394,,,,,
|
92 |
+
1,9,have,0.156863,0.30303,0.194444,0.314394,,,,,
|
93 |
+
1,9,worked,0.199346,0.30303,0.25817,0.314394,,,,,
|
94 |
+
1,9,on,0.263072,0.306818,0.28268,0.314394,,,,,
|
95 |
+
1,9,several,0.287582,0.30303,0.343137,0.314394,,,,,
|
96 |
+
1,9,different,0.348039,0.30303,0.416667,0.314394,,,,,
|
97 |
+
1,9,research,0.419935,0.30303,0.485294,0.314394,,,,,
|
98 |
+
1,9,projects,0.490196,0.30303,0.552288,0.318182,,,,,
|
99 |
+
1,9,as,0.558824,0.306818,0.573529,0.314394,,,,,
|
100 |
+
1,9,an,0.580065,0.306818,0.598039,0.314394,,,,,
|
101 |
+
1,9,undergraduate,0.602941,0.30303,0.714052,0.318182,,,,,
|
102 |
+
1,9,in,0.718954,0.30303,0.735294,0.314394,,,,,
|
103 |
+
1,9,Dr.,0.740196,0.30303,0.764706,0.314394,,,,,
|
104 |
+
1,9,David,0.769608,0.30303,0.816993,0.314394,,,,,
|
105 |
+
1,9,R.,0.823529,0.30303,0.839869,0.314394,,,,,
|
106 |
+
1,10,Hyde's,0.147059,0.320707,0.199346,0.334596,,,,,
|
107 |
+
1,10,lab,0.204248,0.320707,0.228758,0.332071,,,,,
|
108 |
+
1,10,at,0.23366,0.32197,0.248366,0.332071,,,,,
|
109 |
+
1,10,the,0.251634,0.320707,0.276144,0.332071,,,,,
|
110 |
+
1,10,University,0.281046,0.320707,0.364379,0.334596,,,,,
|
111 |
+
1,10,of,0.369281,0.320707,0.387255,0.332071,,,,,
|
112 |
+
1,10,Notre,0.390523,0.320707,0.434641,0.332071,,,,,
|
113 |
+
1,10,Dame.,0.439542,0.320707,0.490196,0.332071,,,,,
|
114 |
+
1,10,The,0.496732,0.320707,0.527778,0.332071,,,,,
|
115 |
+
1,10,Hyde,0.53268,0.320707,0.573529,0.334596,,,,,
|
116 |
+
1,10,lab,0.580065,0.320707,0.602941,0.332071,,,,,
|
117 |
+
1,10,is,0.607843,0.320707,0.620915,0.332071,,,,,
|
118 |
+
1,10,interested,0.625817,0.320707,0.702614,0.332071,,,,,
|
119 |
+
1,10,in,0.707516,0.320707,0.722222,0.332071,,,,,
|
120 |
+
1,10,the,0.727124,0.320707,0.751634,0.332071,,,,,
|
121 |
+
1,10,signals,0.756536,0.320707,0.810458,0.334596,,,,,
|
122 |
+
1,10,that,0.815359,0.320707,0.844771,0.332071,,,,,
|
123 |
+
1,11,initiate,0.147059,0.338384,0.20098,0.349747,,,,,
|
124 |
+
1,11,Muller,0.205882,0.338384,0.259804,0.349747,,,,,
|
125 |
+
1,11,glia,0.264706,0.338384,0.292484,0.352273,,,,,
|
126 |
+
1,11,division,0.297386,0.338384,0.361111,0.349747,,,,,
|
127 |
+
1,11,post-light,0.366013,0.338384,0.44281,0.352273,,,,,
|
128 |
+
1,11,damage.,0.446078,0.338384,0.511438,0.352273,,,,,
|
129 |
+
1,11,My,0.51634,0.338384,0.544118,0.352273,,,,,
|
130 |
+
1,11,first,0.54902,0.338384,0.581699,0.349747,,,,,
|
131 |
+
1,11,research,0.584967,0.338384,0.650327,0.349747,,,,,
|
132 |
+
1,11,project,0.655229,0.338384,0.710784,0.353535,,,,,
|
133 |
+
1,11,was,0.715686,0.340909,0.745098,0.349747,,,,,
|
134 |
+
1,12,characterizing,0.147059,0.354798,0.256536,0.369949,,,,,
|
135 |
+
1,12,the,0.261438,0.356061,0.285948,0.367424,,,,,
|
136 |
+
1,12,role,0.29085,0.356061,0.321895,0.367424,,,,,
|
137 |
+
1,12,of,0.326797,0.356061,0.344771,0.367424,,,,,
|
138 |
+
1,12,leukemia,0.348039,0.356061,0.419935,0.367424,,,,,
|
139 |
+
1,12,inhibitory,0.424837,0.354798,0.501634,0.369949,,,,,
|
140 |
+
1,12,factor,0.506536,0.356061,0.553922,0.367424,,,,,
|
141 |
+
1,12,(LIF),0.55719,0.354798,0.599673,0.369949,,,,,
|
142 |
+
1,12,in,0.604575,0.356061,0.620915,0.367424,,,,,
|
143 |
+
1,12,the,0.624183,0.356061,0.648693,0.366162,,,,,
|
144 |
+
1,12,activation,0.653595,0.356061,0.732026,0.367424,,,,,
|
145 |
+
1,12,of,0.735294,0.354798,0.754902,0.367424,,,,,
|
146 |
+
1,12,cell,0.756536,0.356061,0.785948,0.367424,,,,,
|
147 |
+
1,13,proliferation,0.145425,0.372475,0.243464,0.387626,,,,,
|
148 |
+
1,13,in,0.25,0.373737,0.264706,0.383838,,,,,
|
149 |
+
1,13,the,0.269608,0.373737,0.292484,0.383838,,,,,
|
150 |
+
1,13,undamaged,0.297386,0.372475,0.388889,0.387626,,,,,
|
151 |
+
1,13,zebrafish,0.393791,0.372475,0.465686,0.383838,,,,,
|
152 |
+
1,13,retina.,0.470588,0.373737,0.519608,0.383838,,,,,
|
153 |
+
1,13,I,0.52451,0.373737,0.531046,0.383838,,,,,
|
154 |
+
1,13,am,0.535948,0.376263,0.560458,0.383838,,,,,
|
155 |
+
1,13,also,0.565359,0.372475,0.596405,0.383838,,,,,
|
156 |
+
1,13,working,0.601307,0.372475,0.666667,0.387626,,,,,
|
157 |
+
1,13,on,0.671569,0.376263,0.691176,0.385101,,,,,
|
158 |
+
1,13,several,0.696078,0.373737,0.751634,0.383838,,,,,
|
159 |
+
1,14,experiments,0.145425,0.390152,0.24183,0.405303,,,,,
|
160 |
+
1,14,that,0.246732,0.390152,0.276144,0.401515,,,,,
|
161 |
+
1,14,are,0.281046,0.393939,0.305556,0.401515,,,,,
|
162 |
+
1,14,related,0.308824,0.390152,0.362745,0.401515,,,,,
|
163 |
+
1,14,to,0.367647,0.392677,0.383987,0.401515,,,,,
|
164 |
+
1,14,a,0.388889,0.393939,0.397059,0.401515,,,,,
|
165 |
+
1,14,genetic,0.401961,0.390152,0.45915,0.405303,,,,,
|
166 |
+
1,14,screen,0.464052,0.393939,0.514706,0.401515,,,,,
|
167 |
+
1,14,that,0.517974,0.390152,0.547386,0.401515,,,,,
|
168 |
+
1,14,the,0.552288,0.390152,0.576797,0.401515,,,,,
|
169 |
+
1,14,Hyde,0.581699,0.390152,0.624183,0.405303,,,,,
|
170 |
+
1,14,lab,0.629085,0.390152,0.653595,0.401515,,,,,
|
171 |
+
1,14,plans,0.658497,0.390152,0.699346,0.405303,,,,,
|
172 |
+
1,14,on,0.704248,0.393939,0.723856,0.401515,,,,,
|
173 |
+
1,14,performing,0.728758,0.390152,0.816993,0.405303,,,,,
|
174 |
+
1,14,to,0.821895,0.391414,0.836601,0.401515,,,,,
|
175 |
+
1,15,identify,0.147059,0.407828,0.207516,0.421717,,,,,
|
176 |
+
1,15,mutants,0.212418,0.409091,0.272876,0.419192,,,,,
|
177 |
+
1,15,in,0.279412,0.407828,0.294118,0.419192,,,,,
|
178 |
+
1,15,the,0.29902,0.407828,0.323529,0.419192,,,,,
|
179 |
+
1,15,regeneration,0.328431,0.407828,0.426471,0.42298,,,,,
|
180 |
+
1,15,pathway--I,0.429739,0.407828,0.51634,0.42298,,,,,
|
181 |
+
1,15,am,0.522876,0.411616,0.545752,0.419192,,,,,
|
182 |
+
1,15,developing,0.550654,0.407828,0.638889,0.42298,,,,,
|
183 |
+
1,15,a,0.643791,0.411616,0.651961,0.419192,,,,,
|
184 |
+
1,15,neuroD4:EGFP,0.656863,0.407828,0.78268,0.419192,,,,,
|
185 |
+
1,16,transgenic,0.145425,0.425505,0.227124,0.439394,,,,,
|
186 |
+
1,16,line,0.232026,0.425505,0.261438,0.436869,,,,,
|
187 |
+
1,16,for,0.26634,0.425505,0.289216,0.436869,,,,,
|
188 |
+
1,16,use,0.294118,0.42803,0.320261,0.436869,,,,,
|
189 |
+
1,16,in,0.325163,0.425505,0.339869,0.436869,,,,,
|
190 |
+
1,16,this,0.344771,0.425505,0.372549,0.436869,,,,,
|
191 |
+
1,16,screen,0.377451,0.42803,0.428105,0.436869,,,,,
|
192 |
+
1,16,and,0.433007,0.425505,0.460784,0.436869,,,,,
|
193 |
+
1,16,I,0.46732,0.425505,0.472222,0.436869,,,,,
|
194 |
+
1,16,am,0.477124,0.42803,0.501634,0.436869,,,,,
|
195 |
+
1,16,characterizing,0.506536,0.425505,0.617647,0.439394,,,,,
|
196 |
+
1,16,the,0.622549,0.425505,0.647059,0.436869,,,,,
|
197 |
+
1,16,extent,0.651961,0.426768,0.70098,0.436869,,,,,
|
198 |
+
1,16,of,0.704248,0.425505,0.722222,0.436869,,,,,
|
199 |
+
1,16,damage,0.72549,0.425505,0.787582,0.439394,,,,,
|
200 |
+
1,16,and,0.79085,0.425505,0.820261,0.436869,,,,,
|
201 |
+
1,17,regeneration,0.145425,0.443182,0.243464,0.457071,,,,,
|
202 |
+
1,17,in,0.25,0.443182,0.264706,0.454545,,,,,
|
203 |
+
1,17,sheer,0.267974,0.443182,0.312092,0.454545,,,,,
|
204 |
+
1,17,zebrafish,0.316993,0.443182,0.388889,0.454545,,,,,
|
205 |
+
1,17,retinas.,0.393791,0.443182,0.449346,0.454545,,,,,
|
206 |
+
1,17,"Finally,",0.455882,0.443182,0.51634,0.457071,,,,,
|
207 |
+
1,17,I,0.521242,0.443182,0.527778,0.454545,,,,,
|
208 |
+
1,17,am,0.53268,0.445707,0.55719,0.454545,,,,,
|
209 |
+
1,17,characterizing,0.560458,0.443182,0.671569,0.457071,,,,,
|
210 |
+
1,17,the,0.676471,0.443182,0.70098,0.454545,,,,,
|
211 |
+
1,17,chx10:EGFP,0.705882,0.443182,0.808824,0.454545,,,,,
|
212 |
+
1,18,transgenic,0.145425,0.459596,0.227124,0.474747,,,,,
|
213 |
+
1,18,line,0.232026,0.459596,0.261438,0.47096,,,,,
|
214 |
+
1,18,during,0.26634,0.459596,0.316993,0.474747,,,,,
|
215 |
+
1,18,retinal,0.321895,0.459596,0.372549,0.47096,,,,,
|
216 |
+
1,18,development,0.377451,0.459596,0.478758,0.474747,,,,,
|
217 |
+
1,18,and,0.48366,0.460859,0.511438,0.47096,,,,,
|
218 |
+
1,18,regeneration.,0.51634,0.459596,0.619281,0.474747,,,,,
|
219 |
+
1,19,Please,0.145425,0.496212,0.196078,0.507576,,,,,
|
220 |
+
1,19,find,0.20098,0.496212,0.232026,0.507576,,,,,
|
221 |
+
1,19,my,0.236928,0.5,0.263072,0.510101,,,,,
|
222 |
+
1,19,CV,0.267974,0.496212,0.295752,0.507576,,,,,
|
223 |
+
1,19,attached.,0.29902,0.496212,0.369281,0.507576,,,,,
|
224 |
+
1,20,Thank,0.145425,0.531566,0.196078,0.542929,,,,,
|
225 |
+
1,20,you,0.20098,0.535354,0.230392,0.546717,,,,,
|
226 |
+
1,20,for,0.235294,0.531566,0.25817,0.542929,,,,,
|
227 |
+
1,20,your,0.263072,0.535354,0.29902,0.546717,,,,,
|
228 |
+
1,20,"time,",0.303922,0.531566,0.343137,0.545455,,,,,
|
229 |
+
1,21,--Lauren,0.147059,0.568182,0.215686,0.579545,,,,,
|
230 |
+
1,21,Lilley,0.218954,0.566919,0.26634,0.582071,,,,,
|
231 |
+
1,22,Dr.,0.145425,0.637626,0.171569,0.64899,,,,,
|
232 |
+
1,22,"Poss,",0.176471,0.637626,0.21732,0.651515,,,,,
|
233 |
+
1,23,I,0.145425,0.671717,0.151961,0.683081,,,,,
|
234 |
+
1,23,am,0.158497,0.675505,0.181373,0.684343,,,,,
|
235 |
+
1,23,a,0.186275,0.675505,0.194444,0.684343,,,,,
|
236 |
+
1,23,senior,0.199346,0.671717,0.248366,0.683081,,,,,
|
237 |
+
1,23,biology,0.253268,0.671717,0.312092,0.686869,,,,,
|
238 |
+
1,23,major,0.316993,0.671717,0.364379,0.686869,,,,,
|
239 |
+
1,23,at,0.369281,0.674242,0.382353,0.683081,,,,,
|
240 |
+
1,23,the,0.387255,0.671717,0.411765,0.684343,,,,,
|
241 |
+
1,23,University,0.416667,0.671717,0.498366,0.686869,,,,,
|
242 |
+
1,23,of,0.504902,0.671717,0.522876,0.683081,,,,,
|
243 |
+
1,23,Notre,0.52451,0.671717,0.570261,0.684343,,,,,
|
244 |
+
1,23,Dame.,0.575163,0.671717,0.625817,0.684343,,,,,
|
245 |
+
1,23,I,0.630719,0.671717,0.637255,0.683081,,,,,
|
246 |
+
1,23,am,0.643791,0.675505,0.666667,0.684343,,,,,
|
247 |
+
1,23,applying,0.671569,0.67298,0.740196,0.686869,,,,,
|
248 |
+
1,23,to,0.745098,0.67298,0.759804,0.683081,,,,,
|
249 |
+
1,23,your,0.764706,0.675505,0.802288,0.686869,,,,,
|
250 |
+
1,24,graduate,0.145425,0.689394,0.214052,0.704545,,,,,
|
251 |
+
1,24,program,0.218954,0.693182,0.284314,0.703283,,,,,
|
252 |
+
1,24,and,0.289216,0.689394,0.318627,0.700758,,,,,
|
253 |
+
1,24,am,0.323529,0.693182,0.348039,0.700758,,,,,
|
254 |
+
1,24,very,0.351307,0.693182,0.387255,0.703283,,,,,
|
255 |
+
1,24,interested,0.392157,0.689394,0.46732,0.700758,,,,,
|
256 |
+
1,24,in,0.473856,0.689394,0.488562,0.700758,,,,,
|
257 |
+
1,24,your,0.493464,0.693182,0.529412,0.703283,,,,,
|
258 |
+
1,24,work.,0.534314,0.689394,0.578431,0.700758,,,,,
|
259 |
+
1,24,After,0.583333,0.689394,0.625817,0.700758,,,,,
|
260 |
+
1,24,glancing,0.630719,0.689394,0.697712,0.703283,,,,,
|
261 |
+
1,24,at,0.702614,0.690657,0.71732,0.700758,,,,,
|
262 |
+
1,24,a,0.722222,0.693182,0.730392,0.700758,,,,,
|
263 |
+
1,24,few,0.735294,0.689394,0.764706,0.700758,,,,,
|
264 |
+
1,24,of,0.769608,0.689394,0.787582,0.700758,,,,,
|
265 |
+
1,24,your,0.79085,0.693182,0.826797,0.703283,,,,,
|
266 |
+
1,25,recent,0.145425,0.708333,0.194444,0.718434,,,,,
|
267 |
+
1,25,papers,0.199346,0.710859,0.25,0.72096,,,,,
|
268 |
+
1,25,and,0.254902,0.707071,0.28268,0.718434,,,,,
|
269 |
+
1,25,your,0.287582,0.710859,0.325163,0.72096,,,,,
|
270 |
+
1,25,research,0.328431,0.707071,0.393791,0.718434,,,,,
|
271 |
+
1,25,summary,0.398693,0.709596,0.472222,0.72096,,,,,
|
272 |
+
1,25,I,0.477124,0.707071,0.48366,0.718434,,,,,
|
273 |
+
1,25,find,0.488562,0.707071,0.519608,0.718434,,,,,
|
274 |
+
1,25,your,0.52451,0.710859,0.562092,0.72096,,,,,
|
275 |
+
1,25,research,0.565359,0.707071,0.632353,0.718434,,,,,
|
276 |
+
1,25,greatly,0.637255,0.707071,0.691176,0.72096,,,,,
|
277 |
+
1,25,coincides,0.696078,0.707071,0.769608,0.718434,,,,,
|
278 |
+
1,25,with,0.77451,0.707071,0.810458,0.718434,,,,,
|
279 |
+
1,25,my,0.813725,0.710859,0.839869,0.72096,,,,,
|
280 |
+
1,26,research,0.145425,0.724747,0.210784,0.736111,,,,,
|
281 |
+
1,26,experiences,0.21732,0.724747,0.308824,0.738636,,,,,
|
282 |
+
1,26,and,0.313725,0.723485,0.341503,0.736111,,,,,
|
283 |
+
1,26,interests.,0.346405,0.723485,0.416667,0.736111,,,,,
|
284 |
+
1,26,Will,0.426471,0.723485,0.462418,0.736111,,,,,
|
285 |
+
1,26,you,0.465686,0.727273,0.496732,0.738636,,,,,
|
286 |
+
1,26,be,0.5,0.723485,0.519608,0.736111,,,,,
|
287 |
+
1,26,taking,0.52451,0.724747,0.573529,0.738636,,,,,
|
288 |
+
1,26,on,0.578431,0.727273,0.598039,0.736111,,,,,
|
289 |
+
1,26,new,0.602941,0.727273,0.635621,0.736111,,,,,
|
290 |
+
1,26,students,0.640523,0.724747,0.704248,0.736111,,,,,
|
291 |
+
1,26,next,0.70915,0.72601,0.745098,0.734848,,,,,
|
292 |
+
1,26,year?,0.748366,0.724747,0.79085,0.738636,,,,,
|
293 |
+
1,27,I,0.145425,0.760101,0.151961,0.771465,,,,,
|
294 |
+
1,27,have,0.156863,0.760101,0.194444,0.771465,,,,,
|
295 |
+
1,27,worked,0.199346,0.760101,0.25817,0.771465,,,,,
|
296 |
+
1,27,on,0.263072,0.763889,0.28268,0.771465,,,,,
|
297 |
+
1,27,several,0.287582,0.760101,0.343137,0.771465,,,,,
|
298 |
+
1,27,different,0.348039,0.760101,0.416667,0.771465,,,,,
|
299 |
+
1,27,research,0.419935,0.760101,0.485294,0.771465,,,,,
|
300 |
+
1,27,projects,0.490196,0.760101,0.552288,0.775253,,,,,
|
301 |
+
1,27,as,0.55719,0.763889,0.573529,0.771465,,,,,
|
302 |
+
1,27,an,0.578431,0.763889,0.598039,0.771465,,,,,
|
303 |
+
1,27,undergraduate,0.602941,0.760101,0.714052,0.775253,,,,,
|
304 |
+
1,27,in,0.718954,0.760101,0.735294,0.771465,,,,,
|
305 |
+
1,27,Dr.,0.740196,0.760101,0.764706,0.771465,,,,,
|
306 |
+
1,27,David,0.769608,0.760101,0.818627,0.771465,,,,,
|
307 |
+
1,27,R.,0.823529,0.760101,0.839869,0.771465,,,,,
|
308 |
+
1,28,Hyde's,0.145425,0.777778,0.199346,0.791667,,,,,
|
309 |
+
1,28,lab,0.204248,0.777778,0.228758,0.789141,,,,,
|
310 |
+
1,28,at,0.23366,0.77904,0.248366,0.789141,,,,,
|
311 |
+
1,28,the,0.251634,0.777778,0.276144,0.789141,,,,,
|
312 |
+
1,28,University,0.281046,0.777778,0.364379,0.791667,,,,,
|
313 |
+
1,28,of,0.369281,0.777778,0.387255,0.789141,,,,,
|
314 |
+
1,28,Notre,0.390523,0.777778,0.434641,0.789141,,,,,
|
315 |
+
1,28,Dame.,0.439542,0.777778,0.490196,0.789141,,,,,
|
316 |
+
1,28,The,0.496732,0.777778,0.527778,0.789141,,,,,
|
317 |
+
1,28,Hyde,0.53268,0.777778,0.573529,0.791667,,,,,
|
318 |
+
1,28,lab,0.580065,0.777778,0.602941,0.789141,,,,,
|
319 |
+
1,28,is,0.607843,0.777778,0.620915,0.789141,,,,,
|
320 |
+
1,28,interested,0.625817,0.777778,0.702614,0.789141,,,,,
|
321 |
+
1,28,in,0.707516,0.777778,0.722222,0.789141,,,,,
|
322 |
+
1,28,the,0.727124,0.777778,0.751634,0.789141,,,,,
|
323 |
+
1,28,signals,0.756536,0.777778,0.810458,0.791667,,,,,
|
324 |
+
1,28,that,0.815359,0.777778,0.846405,0.789141,,,,,
|
325 |
+
1,29,initiate,0.145425,0.795455,0.20098,0.806818,,,,,
|
326 |
+
1,29,Muller,0.205882,0.795455,0.259804,0.806818,,,,,
|
327 |
+
1,29,glia,0.264706,0.795455,0.292484,0.809343,,,,,
|
328 |
+
1,29,division,0.297386,0.795455,0.361111,0.806818,,,,,
|
329 |
+
1,29,post-light,0.366013,0.795455,0.44281,0.809343,,,,,
|
330 |
+
1,29,damage.,0.446078,0.795455,0.511438,0.809343,,,,,
|
331 |
+
1,29,My,0.51634,0.795455,0.544118,0.809343,,,,,
|
332 |
+
1,29,first,0.54902,0.795455,0.581699,0.806818,,,,,
|
333 |
+
1,29,research,0.584967,0.795455,0.651961,0.806818,,,,,
|
334 |
+
1,29,project,0.655229,0.795455,0.710784,0.809343,,,,,
|
335 |
+
1,29,was,0.715686,0.799242,0.745098,0.806818,,,,,
|
336 |
+
1,30,characterizing,0.145425,0.811869,0.25817,0.82702,,,,,
|
337 |
+
1,30,the,0.261438,0.811869,0.285948,0.823232,,,,,
|
338 |
+
1,30,role,0.29085,0.813131,0.321895,0.823232,,,,,
|
339 |
+
1,30,of,0.326797,0.811869,0.344771,0.824495,,,,,
|
340 |
+
1,30,leukemia,0.348039,0.811869,0.419935,0.823232,,,,,
|
341 |
+
1,30,inhibitory,0.424837,0.811869,0.501634,0.82702,,,,,
|
342 |
+
1,30,factor,0.506536,0.811869,0.553922,0.823232,,,,,
|
343 |
+
1,30,(LIF),0.55719,0.813131,0.599673,0.82702,,,,,
|
344 |
+
1,30,in,0.604575,0.811869,0.620915,0.824495,,,,,
|
345 |
+
1,30,the,0.624183,0.811869,0.648693,0.824495,,,,,
|
346 |
+
1,30,activation,0.653595,0.813131,0.732026,0.824495,,,,,
|
347 |
+
1,30,of,0.735294,0.811869,0.754902,0.824495,,,,,
|
348 |
+
1,30,cell,0.756536,0.811869,0.785948,0.824495,,,,,
|
349 |
+
1,31,proliferation,0.145425,0.829545,0.245098,0.844697,,,,,
|
350 |
+
1,31,in,0.25,0.829545,0.264706,0.840909,,,,,
|
351 |
+
1,31,the,0.267974,0.829545,0.292484,0.840909,,,,,
|
352 |
+
1,31,undamaged,0.297386,0.830808,0.388889,0.844697,,,,,
|
353 |
+
1,31,zebrafish,0.393791,0.829545,0.465686,0.842172,,,,,
|
354 |
+
1,31,retina.,0.470588,0.830808,0.519608,0.842172,,,,,
|
355 |
+
1,31,I,0.52451,0.830808,0.531046,0.840909,,,,,
|
356 |
+
1,31,am,0.535948,0.833333,0.560458,0.842172,,,,,
|
357 |
+
1,31,also,0.565359,0.829545,0.596405,0.840909,,,,,
|
358 |
+
1,31,working,0.601307,0.830808,0.666667,0.844697,,,,,
|
359 |
+
1,31,on,0.671569,0.833333,0.691176,0.840909,,,,,
|
360 |
+
1,31,several,0.696078,0.829545,0.751634,0.840909,,,,,
|
361 |
+
1,32,experiments,0.145425,0.847222,0.24183,0.862374,,,,,
|
362 |
+
1,32,that,0.246732,0.847222,0.276144,0.858586,,,,,
|
363 |
+
1,32,are,0.281046,0.85101,0.305556,0.858586,,,,,
|
364 |
+
1,32,related,0.308824,0.847222,0.362745,0.858586,,,,,
|
365 |
+
1,32,to,0.367647,0.848485,0.383987,0.858586,,,,,
|
366 |
+
1,32,a,0.388889,0.85101,0.397059,0.858586,,,,,
|
367 |
+
1,32,genetic,0.401961,0.847222,0.45915,0.861111,,,,,
|
368 |
+
1,32,screen,0.464052,0.85101,0.514706,0.858586,,,,,
|
369 |
+
1,32,that,0.517974,0.847222,0.54902,0.858586,,,,,
|
370 |
+
1,32,the,0.552288,0.847222,0.576797,0.858586,,,,,
|
371 |
+
1,32,Hyde,0.581699,0.847222,0.624183,0.861111,,,,,
|
372 |
+
1,32,lab,0.629085,0.847222,0.653595,0.858586,,,,,
|
373 |
+
1,32,plans,0.656863,0.847222,0.699346,0.861111,,,,,
|
374 |
+
1,32,on,0.704248,0.85101,0.723856,0.858586,,,,,
|
375 |
+
1,32,performing,0.728758,0.847222,0.816993,0.862374,,,,,
|
376 |
+
1,32,to,0.821895,0.848485,0.836601,0.858586,,,,,
|
377 |
+
1,33,identify,0.145425,0.864899,0.207516,0.878788,,,,,
|
378 |
+
1,33,mutants,0.212418,0.866162,0.272876,0.876263,,,,,
|
379 |
+
1,33,in,0.279412,0.864899,0.294118,0.876263,,,,,
|
380 |
+
1,33,the,0.29902,0.864899,0.323529,0.876263,,,,,
|
381 |
+
1,33,regeneration,0.328431,0.864899,0.426471,0.878788,,,,,
|
382 |
+
1,33,pathway--I,0.431373,0.864899,0.51634,0.878788,,,,,
|
383 |
+
1,33,am,0.522876,0.868687,0.545752,0.876263,,,,,
|
384 |
+
1,33,developing,0.550654,0.864899,0.638889,0.878788,,,,,
|
385 |
+
1,33,a,0.643791,0.868687,0.651961,0.876263,,,,,
|
386 |
+
1,33,neuroD4:EGFP,0.655229,0.864899,0.78268,0.876263,,,,,
|
387 |
+
1,34,transgenic,0.145425,0.882576,0.227124,0.896465,,,,,
|
388 |
+
1,34,line,0.232026,0.882576,0.261438,0.893939,,,,,
|
389 |
+
1,34,for,0.26634,0.881313,0.289216,0.893939,,,,,
|
390 |
+
1,34,use,0.294118,0.885101,0.320261,0.893939,,,,,
|
391 |
+
1,34,in,0.325163,0.882576,0.339869,0.893939,,,,,
|
392 |
+
1,34,this,0.344771,0.882576,0.372549,0.893939,,,,,
|
393 |
+
1,34,screen,0.379085,0.885101,0.428105,0.893939,,,,,
|
394 |
+
1,34,and,0.433007,0.882576,0.460784,0.893939,,,,,
|
395 |
+
1,34,I,0.46732,0.882576,0.472222,0.893939,,,,,
|
396 |
+
1,34,am,0.478758,0.885101,0.501634,0.893939,,,,,
|
397 |
+
1,34,characterizing,0.506536,0.882576,0.617647,0.896465,,,,,
|
398 |
+
1,34,the,0.622549,0.882576,0.647059,0.893939,,,,,
|
399 |
+
1,34,extent,0.651961,0.883838,0.699346,0.892677,,,,,
|
400 |
+
1,34,of,0.704248,0.882576,0.722222,0.893939,,,,,
|
401 |
+
1,34,damage,0.72549,0.882576,0.785948,0.896465,,,,,
|
402 |
+
1,34,and,0.79085,0.882576,0.820261,0.893939,,,,,
|
403 |
+
2,1,regeneration,0.145425,0.093434,0.243464,0.107323,,,,,
|
404 |
+
2,1,in,0.248366,0.093434,0.264706,0.104798,,,,,
|
405 |
+
2,1,sheer,0.267974,0.093434,0.312092,0.104798,,,,,
|
406 |
+
2,1,zebrafish,0.316993,0.093434,0.387255,0.104798,,,,,
|
407 |
+
2,1,retinas.,0.392157,0.093434,0.449346,0.104798,,,,,
|
408 |
+
2,1,"Finally,",0.455882,0.093434,0.514706,0.107323,,,,,
|
409 |
+
2,1,I,0.521242,0.093434,0.527778,0.104798,,,,,
|
410 |
+
2,1,am,0.53268,0.097222,0.555556,0.104798,,,,,
|
411 |
+
2,1,characterizing,0.560458,0.093434,0.671569,0.107323,,,,,
|
412 |
+
2,1,the,0.676471,0.093434,0.70098,0.104798,,,,,
|
413 |
+
2,1,chx10:EGFP,0.705882,0.093434,0.808824,0.104798,,,,,
|
414 |
+
2,2,transgenic,0.145425,0.111111,0.227124,0.125,,,,,
|
415 |
+
2,2,line,0.232026,0.111111,0.261438,0.122475,,,,,
|
416 |
+
2,2,during,0.26634,0.111111,0.316993,0.125,,,,,
|
417 |
+
2,2,retinal,0.321895,0.111111,0.372549,0.122475,,,,,
|
418 |
+
2,2,development,0.377451,0.111111,0.478758,0.125,,,,,
|
419 |
+
2,2,and,0.48366,0.111111,0.511438,0.122475,,,,,
|
420 |
+
2,2,regeneration.,0.51634,0.111111,0.617647,0.125,,,,,
|
421 |
+
2,3,Please,0.145425,0.146465,0.196078,0.157828,,,,,
|
422 |
+
2,3,find,0.20098,0.146465,0.232026,0.157828,,,,,
|
423 |
+
2,3,my,0.236928,0.150253,0.263072,0.160354,,,,,
|
424 |
+
2,3,CV,0.267974,0.146465,0.295752,0.157828,,,,,
|
425 |
+
2,3,attached.,0.29902,0.146465,0.369281,0.157828,,,,,
|
426 |
+
2,4,Thank,0.145425,0.183081,0.196078,0.193182,,,,,
|
427 |
+
2,4,you,0.20098,0.185606,0.230392,0.19697,,,,,
|
428 |
+
2,4,for,0.235294,0.181818,0.25817,0.193182,,,,,
|
429 |
+
2,4,your,0.263072,0.185606,0.29902,0.19697,,,,,
|
430 |
+
2,4,"time,",0.303922,0.181818,0.343137,0.195707,,,,,
|
431 |
+
2,5,--Lauren,0.147059,0.218434,0.215686,0.229798,,,,,
|
432 |
+
2,5,Lilley,0.218954,0.218434,0.26634,0.232323,,,,,
|
example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv
ADDED
@@ -0,0 +1,15 @@
1 |
+
image,page,label,color,xmin,ymin,xmax,ymax,id,text
|
2 |
+
placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.147059,0.162879,0.171569,0.174242,oJIosRHGyCRn,Dr
|
3 |
+
placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.147059,0.162879,0.261438,0.176768,5C5tA6mfeL7T,Dr Kornbluth
|
4 |
+
placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.176471,0.162879,0.261438,0.176768,UoYN48bc2ry5,Kornbluth
|
5 |
+
placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.740196,0.30303,0.764706,0.314394,cAsjVETPEisV,Dr
|
6 |
+
placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.740196,0.30303,0.839869,0.314394,yQ5HKn4tfT7L,Dr David R.
|
7 |
+
placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.769608,0.30303,0.839869,0.314394,LR8phiOYnLWi,David R.
|
8 |
+
placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.218954,0.566919,0.26634,0.582071,X8iObIauqZ9k,Lauren Lilley
|
9 |
+
placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.145425,0.637626,0.171569,0.64899,SvWjK2F7R3un,Dr
|
10 |
+
placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.145425,0.637626,0.21732,0.651515,zKJFVAOszwdM,Dr Poss
|
11 |
+
placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.176471,0.637626,0.21732,0.651515,Iqda7ixkzcmg,Poss
|
12 |
+
placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.740196,0.760101,0.764706,0.771465,TWQD93bGI3B3,Dr
|
13 |
+
placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.740196,0.760101,0.839869,0.771465,vQuQQwqWjSES,Dr David R.
|
14 |
+
placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.769608,0.760101,0.839869,0.771465,f8xf6ORJUSnG,David R.
|
15 |
+
placeholder_image_1.png,2,NAME,"(0, 0, 0)",0.218954,0.218434,0.26634,0.232323,N0nje9UiCzZK,Lauren Lilley
|
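Each row of this review file is one suggested redaction box: the page image it belongs to, the page number, the entity label, a colour, normalised box coordinates, a unique id and the matched text. A minimal sketch of loading it for inspection; pandas and the file name come from the repository, the grouping itself is purely illustrative:

    import pandas as pd

    review = pd.read_csv("example_of_emails_sent_to_a_professor_before_applying_review_file.csv")

    # List the matched text per entity label, and show only the boxes suggested for page 2.
    print(review.groupby("label")["text"].apply(list))
    print(review[review["page"] == 2])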
example_data/graduate-job-example-cover-letter.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71cc851d41f80dd8b045af32657b76bf85dd8f72d39ae08fa43dc7a78256fe35
+size 77045
example_data/partnership_toolkit_redact_custom_deny_list.csv
ADDED
@@ -0,0 +1,4 @@
+Sister
+Sister City
+Sister Cities
+Friendship City
example_data/partnership_toolkit_redact_some_pages.csv
ADDED
@@ -0,0 +1,2 @@
+2
+5
example_data/test_allow_list_graduate.csv
ADDED
@@ -0,0 +1 @@
+Wilson
example_data/test_allow_list_partnership.csv
ADDED
@@ -0,0 +1 @@
lambda_entrypoint.py
CHANGED
@@ -9,7 +9,7 @@ print("Lambda entrypoint loading...")
 
 # Initialize S3 client outside the handler for connection reuse
 s3_client = boto3.client("s3", region_name=os.getenv("AWS_REGION", "eu-west-2"))
-print("S3 client
+print("S3 client initialised")
 
 # Lambda's only writable directory
 TMP_DIR = "/tmp"
@@ -84,48 +84,90 @@ def lambda_handler(event, context):
 'task': arguments.get('task', 'redact'),
 'input_file': input_file_path,
 'output_dir': OUTPUT_DIR,
-'
-'
+'input_dir': INPUT_DIR,
+'language': arguments.get('language', 'en_core_web_lg'),
+'pii_detector': arguments.get('pii_detector', 'Local'), # Default to local
+'username': arguments.get('username', 'lambda_user'),
+'save_to_user_folders': arguments.get('save_to_user_folders', 'False'),
 'ocr_method': arguments.get('ocr_method', 'Tesseract OCR - all PDF types'),
 'page_min': int(arguments.get('page_min', 0)),
-'page_max': int(arguments.get('page_max',
+'page_max': int(arguments.get('page_max', 0)),
+'handwrite_signature_extraction': arguments.get('handwrite_signature_checkbox', ['Extract handwriting', 'Extract signatures']),
+
+# General arguments
+'local_redact_entities': arguments.get('local_redact_entities', []),
+'aws_redact_entities': arguments.get('aws_redact_entities', []),
+'cost_code': arguments.get('cost_code', ''),
+'save_logs_to_csv': arguments.get('save_logs_to_csv', 'False'),
+'save_logs_to_dynamodb': arguments.get('save_logs_to_dynamodb', 'False'),
+'display_file_names_in_logs': arguments.get('display_file_names_in_logs', 'True'),
+'upload_logs_to_s3': arguments.get('upload_logs_to_s3', 'False'),
+'s3_logs_prefix': arguments.get('s3_logs_prefix', ''),
+'do_initial_clean': arguments.get('do_initial_clean', 'False'),
+
+# PDF/Image specific arguments
+'images_dpi': float(arguments.get('images_dpi', 300.0)),
+'chosen_local_ocr_model': arguments.get('chosen_local_ocr_model', 'tesseract'),
+'preprocess_local_ocr_images': arguments.get('preprocess_local_ocr_images', 'False'),
 
 # Handle optional files like allow/deny lists
-'
-'
+'allow_list_file': arguments.get('allow_list_file', ""),
+'deny_list_file': arguments.get('deny_list_file', ""),
+'redact_whole_page_file': arguments.get('redact_whole_page_file', ""),
+
+# Tabular/Anonymisation arguments
+'excel_sheets': arguments.get('excel_sheets', []),
+'fuzzy_mistakes': int(arguments.get('fuzzy_mistakes', 0)),
+'match_fuzzy_whole_phrase_bool': arguments.get('match_fuzzy_whole_phrase_bool', 'True'),
 
 # Deduplication specific arguments
 'duplicate_type': arguments.get('duplicate_type', 'pages'),
 'similarity_threshold': float(arguments.get('similarity_threshold', 0.95)),
 'min_word_count': int(arguments.get('min_word_count', 3)),
-'
+'min_consecutive_pages': int(arguments.get('min_consecutive_pages', 1)),
+'greedy_match': arguments.get('greedy_match', 'False'),
+'combine_pages': arguments.get('combine_pages', 'True'),
+'search_query': arguments.get('search_query', ""),
 'text_columns': arguments.get('text_columns', []),
+'remove_duplicate_rows': arguments.get('remove_duplicate_rows', 'True'),
+'anon_strategy': arguments.get('anon_strategy', 'redact'),
 
-#
-'
-'
-'
+# Textract specific arguments
+'textract_action': arguments.get('textract_action', ''),
+'job_id': arguments.get('job_id', ''),
+'extract_signatures': arguments.get('extract_signatures', False),
+'textract_bucket': arguments.get('textract_bucket', ''),
+'textract_input_prefix': arguments.get('textract_input_prefix', ''),
+'textract_output_prefix': arguments.get('textract_output_prefix', ''),
+'s3_textract_document_logs_subfolder': arguments.get('s3_textract_document_logs_subfolder', ''),
+'local_textract_document_logs_subfolder': arguments.get('local_textract_document_logs_subfolder', ''),
+'poll_interval': int(arguments.get('poll_interval', 30)),
+'max_poll_attempts': int(arguments.get('max_poll_attempts', 120)),
+
+# AWS credentials (use IAM Role instead of keys)
+'aws_access_key': None,
 'aws_secret_key': None,
-'aws_region': os.getenv("AWS_REGION", "
+'aws_region': os.getenv("AWS_REGION", ""),
 's3_bucket': bucket_name,
+
 # Set defaults for boolean flags
-'prepare_images': True,
-'compress_redacted_pdf': False,
-'return_pdf_end_of_redaction': True
+'prepare_images': arguments.get('prepare_images', True),
+'compress_redacted_pdf': arguments.get('compress_redacted_pdf', False),
+'return_pdf_end_of_redaction': arguments.get('return_pdf_end_of_redaction', True)
 }
 
 # Download optional files if they are specified
-allow_list_key = arguments.get('
+allow_list_key = arguments.get('allow_list_file')
 if allow_list_key:
 allow_list_path = os.path.join(INPUT_DIR, 'allow_list.csv')
 download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
-cli_args['
+cli_args['allow_list_file'] = allow_list_path
 
-deny_list_key = arguments.get('
+deny_list_key = arguments.get('deny_list_file')
 if deny_list_key:
 deny_list_path = os.path.join(INPUT_DIR, 'deny_list.csv')
 download_file_from_s3(bucket_name, deny_list_key, deny_list_path)
-cli_args['
+cli_args['deny_list_file'] = deny_list_path
 
 # 5. Execute the main application logic
 try:
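The handler now builds almost all of its cli_args from an 'arguments' dictionary in the invocation event. A minimal sketch of a direct invocation payload that exercises some of the new keys; the top-level event fields (bucket and input key names) and the Lambda function name are assumptions, only the keys inside "arguments" are taken from the diff above:

    import json
    import boto3

    # Hypothetical direct invocation of the redaction Lambda.
    payload = {
        "bucket": "example-redaction-bucket",                 # assumed field name
        "input_key": "input/example_complaint_letter.jpg",    # assumed field name
        "arguments": {
            "task": "redact",
            "ocr_method": "Tesseract OCR - all PDF types",
            "pii_detector": "Local",
            "page_min": 0,
            "page_max": 0,
            "allow_list_file": "config/test_allow_list_graduate.csv",
            "anon_strategy": "redact",
        },
    }

    lambda_client = boto3.client("lambda", region_name="eu-west-2")
    response = lambda_client.invoke(
        FunctionName="doc-redaction-lambda",                  # assumed function name
        Payload=json.dumps(payload).encode("utf-8"),
    )
    print(json.loads(response["Payload"].read()))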
pyproject.toml
CHANGED
@@ -23,8 +23,8 @@ dependencies = [
 "spacy==3.8.7",
 # Direct URL dependency for spacy model
 "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
-"gradio==5.
-"boto3==1.40.
+"gradio==5.46.1",
+"boto3==1.40.31",
 "pyarrow==21.0.0",
 "openpyxl==3.1.5",
 "Faker==37.5.3",
requirements.txt
CHANGED
@@ -10,9 +10,9 @@ pandas==2.3.2
 scikit-learn==1.7.1
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-gradio==5.
+gradio==5.46.1
 polars==1.33.1
-boto3==1.40.
+boto3==1.40.31
 pyarrow==21.0.0
 openpyxl==3.1.5
 Faker==37.5.3
tools/aws_functions.py
CHANGED
@@ -171,7 +171,7 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCU
 final_out_message_str = "Could not upload files to S3 due to: " + str(e)
 print(final_out_message_str)
 else:
-final_out_message_str = "App
+final_out_message_str = "App config will not AWS functions"
 
 return final_out_message_str
 
@@ -227,7 +227,7 @@ def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=
 final_out_message_str = "Could not upload files to S3 due to: " + str(e)
 print(final_out_message_str)
 else:
-final_out_message_str = "App
+final_out_message_str = "App config will not AWS functions"
 print(final_out_message_str)
 
 return final_out_message_str
tools/aws_textract.py
CHANGED
@@ -24,6 +24,8 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
 '''
 Analyse page with AWS Textract
 '''
+
+print("handwrite_signature_checkbox in analyse_page_with_textract:", handwrite_signature_checkbox)
 if client == "":
 try:
 if AWS_ACCESS_KEY and AWS_SECRET_KEY:
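For context, a rough sketch of exercising this function directly. Only the parameter names visible in the signature fragment and the diff above are used; the full parameter list and the return shape are not shown here, so treat the call as illustrative rather than exact:

    import boto3
    from tools.aws_textract import analyse_page_with_textract

    # Read one page as bytes; the path is illustrative.
    with open("example_data/example_complaint_letter.jpg", "rb") as f:
        page_bytes = f.read()

    textract_client = boto3.client("textract", region_name="eu-west-2")

    # Remaining arguments keep their defaults; handwrite_signature_checkbox is assumed
    # to be a parameter because the function body now prints it.
    result = analyse_page_with_textract(
        pdf_page_bytes=page_bytes,
        page_no=1,
        client=textract_client,
        handwrite_signature_checkbox=["Extract handwriting"],
    )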
tools/cli_usage_logger.py
ADDED
@@ -0,0 +1,302 @@
+"""
+CLI Usage Logger - A simplified version of the Gradio CSVLogger_custom for CLI usage logging.
+This module provides functionality to log usage data from CLI operations to CSV files and optionally DynamoDB.
+"""
+
+import csv
+import os
+import time
+import uuid
+from datetime import datetime
+from pathlib import Path
+from typing import Any, List, Optional
+import boto3
+import botocore
+from tools.aws_functions import upload_log_file_to_s3
+from tools.config import (
+    USAGE_LOGS_FOLDER,
+    SAVE_LOGS_TO_CSV,
+    SAVE_LOGS_TO_DYNAMODB,
+    USAGE_LOG_DYNAMODB_TABLE_NAME,
+    DYNAMODB_USAGE_LOG_HEADERS,
+    CSV_USAGE_LOG_HEADERS,
+    DISPLAY_FILE_NAMES_IN_LOGS,
+    HOST_NAME,
+    AWS_REGION,
+    AWS_ACCESS_KEY,
+    AWS_SECRET_KEY,
+    RUN_AWS_FUNCTIONS,
+    S3_USAGE_LOGS_FOLDER,
+    DOCUMENT_REDACTION_BUCKET
+)
+
+
+class CLIUsageLogger:
+    """
+    A simplified usage logger for CLI operations that mimics the functionality
+    of the Gradio CSVLogger_custom class.
+    """
+
+    def __init__(self, dataset_file_name: str = "usage_log.csv"):
+        """
+        Initialize the CLI usage logger.
+
+        Args:
+            dataset_file_name: Name of the CSV file to store logs
+        """
+        self.dataset_file_name = dataset_file_name
+        self.flagging_dir = Path(USAGE_LOGS_FOLDER)
+        self.dataset_filepath = None
+        self.headers = None
+
+    def setup(self, headers: List[str]):
+        """
+        Setup the logger with the specified headers.
+
+        Args:
+            headers: List of column headers for the CSV file
+        """
+        self.headers = headers
+        self._create_dataset_file()
+
+    def _create_dataset_file(self):
+        """Create the dataset CSV file with headers if it doesn't exist."""
+        os.makedirs(self.flagging_dir, exist_ok=True)
+
+        # Add ID and timestamp to headers (matching custom_csvlogger.py structure)
+        full_headers = self.headers + ["id", "timestamp"]
+
+        self.dataset_filepath = self.flagging_dir / self.dataset_file_name
+
+        if not Path(self.dataset_filepath).exists():
+            with open(self.dataset_filepath, "w", newline="", encoding="utf-8") as csvfile:
+                writer = csv.writer(csvfile)
+                writer.writerow(full_headers)
+            print(f"Created usage log file at: {self.dataset_filepath}")
+        else:
+            print(f"Using existing usage log file at: {self.dataset_filepath}")
+
+    def log_usage(
+        self,
+        data: List[Any],
+        save_to_csv: bool = None,
+        save_to_dynamodb: bool = None,
+        save_to_s3: bool = None,
+        s3_bucket: str = None,
+        s3_key_prefix: str = None,
+        dynamodb_table_name: str = None,
+        dynamodb_headers: List[str] = None,
+        replacement_headers: List[str] = None
+    ) -> int:
+        """
+        Log usage data to CSV and optionally DynamoDB and S3.
+
+        Args:
+            data: List of data values to log
+            save_to_csv: Whether to save to CSV (defaults to config setting)
+            save_to_dynamodb: Whether to save to DynamoDB (defaults to config setting)
+            save_to_s3: Whether to save to S3 (defaults to config setting)
+            s3_bucket: S3 bucket name (defaults to config setting)
+            s3_key_prefix: S3 key prefix (defaults to config setting)
+            dynamodb_table_name: DynamoDB table name (defaults to config setting)
+            dynamodb_headers: DynamoDB headers (defaults to config setting)
+            replacement_headers: Replacement headers for CSV (defaults to config setting)
+
+        Returns:
+            Number of lines written
+        """
+        # Use config defaults if not specified
+        if save_to_csv is None:
+            save_to_csv = SAVE_LOGS_TO_CSV == 'True'
+        if save_to_dynamodb is None:
+            save_to_dynamodb = SAVE_LOGS_TO_DYNAMODB == 'True'
+        if save_to_s3 is None:
+            save_to_s3 = RUN_AWS_FUNCTIONS == "1" and SAVE_LOGS_TO_CSV == 'True'
+        if s3_bucket is None:
+            s3_bucket = DOCUMENT_REDACTION_BUCKET
+        if s3_key_prefix is None:
+            s3_key_prefix = S3_USAGE_LOGS_FOLDER
+        if dynamodb_table_name is None:
+            dynamodb_table_name = USAGE_LOG_DYNAMODB_TABLE_NAME
+        if dynamodb_headers is None:
+            dynamodb_headers = DYNAMODB_USAGE_LOG_HEADERS
+        if replacement_headers is None:
+            replacement_headers = CSV_USAGE_LOG_HEADERS
+
+        # Generate unique ID and add timestamp (matching custom_csvlogger.py structure)
+        generated_id = str(uuid.uuid4())
+        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
+        csv_data = data + [generated_id, timestamp]
+
+        line_count = 0
+
+        # Save to CSV
+        if save_to_csv and self.dataset_filepath:
+            try:
+                with open(self.dataset_filepath, "a", newline="", encoding="utf-8-sig") as csvfile:
+                    writer = csv.writer(csvfile)
+                    writer.writerow(csv_data)
+                line_count = 1
+                print(f"Logged usage data to CSV: {self.dataset_filepath}")
+            except Exception as e:
+                print(f"Error writing to CSV: {e}")
+
+        # Upload to S3 if enabled
+        if save_to_s3 and self.dataset_filepath and s3_bucket and s3_key_prefix:
+            try:
+                # Upload the log file to S3
+                upload_result = upload_log_file_to_s3(
+                    local_file_paths=[str(self.dataset_filepath)],
+                    s3_key=s3_key_prefix,
+                    s3_bucket=s3_bucket,
+                    RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
+                    SAVE_LOGS_TO_CSV=SAVE_LOGS_TO_CSV
+                )
+                print(f"S3 upload result: {upload_result}")
+            except Exception as e:
+                print(f"Error uploading log file to S3: {e}")
+
+        # Save to DynamoDB
+        if save_to_dynamodb and dynamodb_table_name and dynamodb_headers:
+            try:
+                # Initialize DynamoDB client
+                if AWS_ACCESS_KEY and AWS_SECRET_KEY:
+                    dynamodb = boto3.resource(
+                        'dynamodb',
+                        region_name=AWS_REGION,
+                        aws_access_key_id=AWS_ACCESS_KEY,
+                        aws_secret_access_key=AWS_SECRET_KEY
+                    )
+                else:
+                    dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)
+
+                table = dynamodb.Table(dynamodb_table_name)
+
+                # Generate unique ID
+                generated_id = str(uuid.uuid4())
+
+                # Prepare the DynamoDB item
+                item = {
+                    'id': generated_id,
+                    'timestamp': timestamp,
+                }
+
+                # Map the headers to values
+                item.update({header: str(value) for header, value in zip(dynamodb_headers, data)})
+
+                table.put_item(Item=item)
+                print("Successfully uploaded usage log to DynamoDB")
+
+            except Exception as e:
+                print(f"Could not upload usage log to DynamoDB: {e}")
+
+        return line_count
+
+
+def create_cli_usage_logger() -> CLIUsageLogger:
+    """
+    Create and setup a CLI usage logger with the standard headers.
+
+    Returns:
+        Configured CLIUsageLogger instance
+    """
+    # Parse CSV headers from config
+    import json
+    try:
+        headers = json.loads(CSV_USAGE_LOG_HEADERS)
+    except:
+        # Fallback headers if parsing fails
+        headers = [
+            "session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox",
+            "actual_time_taken_number", "total_page_count", "textract_query_number",
+            "pii_detection_method", "comprehend_query_number", "cost_code",
+            "textract_handwriting_signature", "host_name_textbox", "text_extraction_method",
+            "is_this_a_textract_api_call", "task"
+        ]
+
+    logger = CLIUsageLogger()
+    logger.setup(headers)
+    return logger
+
+
+def log_redaction_usage(
+    logger: CLIUsageLogger,
+    session_hash: str,
+    doc_file_name: str,
+    data_file_name: str,
+    time_taken: float,
+    total_pages: int,
+    textract_queries: int,
+    pii_method: str,
+    comprehend_queries: int,
+    cost_code: str,
+    handwriting_signature: str,
+    text_extraction_method: str,
+    is_textract_call: bool,
+    task: str,
+    save_to_dynamodb: bool = None,
+    save_to_s3: bool = None,
+    s3_bucket: str = None,
+    s3_key_prefix: str = None
+):
+    """
+    Log redaction usage data using the provided logger.
+
+    Args:
+        logger: CLIUsageLogger instance
+        session_hash: Session identifier
+        doc_file_name: Document file name (or placeholder if not displaying names)
+        data_file_name: Data file name (or placeholder if not displaying names)
+        time_taken: Time taken for processing in seconds
+        total_pages: Total number of pages processed
+        textract_queries: Number of Textract API calls made
+        pii_method: PII detection method used
+        comprehend_queries: Number of Comprehend API calls made
+        cost_code: Cost code for the operation
+        handwriting_signature: Handwriting/signature extraction options
+        text_extraction_method: Text extraction method used
+        is_textract_call: Whether this was a Textract API call
+        task: The task performed (redact, deduplicate, textract)
+        save_to_dynamodb: Whether to save to DynamoDB (overrides config default)
+        save_to_s3: Whether to save to S3 (overrides config default)
+        s3_bucket: S3 bucket name (overrides config default)
+        s3_key_prefix: S3 key prefix (overrides config default)
+    """
+    # Use placeholder names if not displaying file names in logs
+    if DISPLAY_FILE_NAMES_IN_LOGS != 'True':
+        if doc_file_name:
+            doc_file_name = "document"
+            data_file_name = ""
+        if data_file_name:
+            data_file_name = "data_file"
+            doc_file_name = ""
+    else:
+        doc_file_name = doc_file_name
+        data_file_name = data_file_name
+
+    rounded_time_taken = round(time_taken, 2)
+
+    data = [
+        session_hash,
+        doc_file_name,
+        data_file_name,
+        rounded_time_taken,
+        total_pages,
+        textract_queries,
+        pii_method,
+        comprehend_queries,
+        cost_code,
+        handwriting_signature,
+        HOST_NAME,
+        text_extraction_method,
+        is_textract_call,
+        task
+    ]
+
+    logger.log_usage(
+        data,
+        save_to_dynamodb=save_to_dynamodb,
+        save_to_s3=save_to_s3,
+        s3_bucket=s3_bucket,
+        s3_key_prefix=s3_key_prefix
+    )
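A minimal sketch of how a CLI task might use this new module after a redaction run; the function names and argument list come from the file above, while the concrete values and the way the session hash is generated are illustrative:

    import time
    import uuid

    from tools.cli_usage_logger import create_cli_usage_logger, log_redaction_usage

    logger = create_cli_usage_logger()
    start = time.perf_counter()
    # ... run a redaction task here ...
    elapsed = time.perf_counter() - start

    log_redaction_usage(
        logger,
        session_hash=str(uuid.uuid4()),          # illustrative session identifier
        doc_file_name="example_complaint_letter.jpg",
        data_file_name="",
        time_taken=elapsed,
        total_pages=1,
        textract_queries=0,
        pii_method="Local",
        comprehend_queries=0,
        cost_code="",
        handwriting_signature="",
        text_extraction_method="Tesseract OCR - all PDF types",
        is_textract_call=False,
        task="redact",
    )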
tools/config.py
CHANGED
@@ -34,7 +34,7 @@ def add_folder_to_path(folder_path: str):
 '''
 
 if os.path.exists(folder_path) and os.path.isdir(folder_path):
-print(folder_path, "folder exists.")
+#print(folder_path, "folder exists.")
 
 # Resolve relative path to absolute path
 absolute_path = os.path.abspath(folder_path)
@@ -45,7 +45,8 @@ def add_folder_to_path(folder_path: str):
 os.environ['PATH'] = full_path_extension
 #print(f"Updated PATH with: ", full_path_extension)
 else:
-
+pass
+#print(f"Directory {folder_path} already exists in PATH.")
 else:
 print(f"Folder not found at {folder_path} - not added to PATH")
 
@@ -88,13 +89,16 @@ AWS_CLIENT_SECRET = get_or_create_env_var('AWS_CLIENT_SECRET', '')
 AWS_USER_POOL_ID = get_or_create_env_var('AWS_USER_POOL_ID', '')
 
 AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
-if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
+#if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
 
 AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
-if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
+#if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
 
 DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
 
+# Should the app prioritise using AWS SSO over using API keys stored in environment variables/secrets (defaults to yes)
+PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS = get_or_create_env_var('PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS', '1')
+
 # Custom headers e.g. if routing traffic through Cloudfront
 # Retrieving or setting CUSTOM_HEADER
 CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
@@ -164,7 +168,7 @@ DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS',
 # Further customisation options for CSV logs
 CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
 CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
-CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
+CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call", "task"]') # If blank, uses component labels
 
 ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
 SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'False')
@@ -275,7 +279,7 @@ DEFAULT_TABULAR_ANONYMISATION_STRATEGY = get_or_create_env_var('DEFAULT_TABULAR_
 ### Local OCR model - Tesseract vs PaddleOCR
 CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var('CHOSEN_LOCAL_OCR_MODEL', "tesseract") # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence.
 
-PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var('PREPROCESS_LOCAL_OCR_IMAGES', "
+PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var('PREPROCESS_LOCAL_OCR_IMAGES', "True") # Whether to try and preprocess images before extracting text. NOTE: I have found in testing that this often results in WORSE results for scanned pages, so it is default False
 
 # Entities for redaction
 CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
@@ -287,6 +291,10 @@ CHOSEN_REDACT_ENTITIES = get_or_create_env_var('CHOSEN_REDACT_ENTITIES', "['TITL
 
 FULL_ENTITY_LIST = get_or_create_env_var('FULL_ENTITY_LIST', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']")
 
+CUSTOM_ENTITIES = get_or_create_env_var('CUSTOM_ENTITIES', "['TITLES', 'UKPOSTCODE', 'STREETNAME', 'CUSTOM']")
+
+
+
 DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var('DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX', "['Extract handwriting']")
 
 DEFAULT_SEARCH_QUERY = get_or_create_env_var('DEFAULT_SEARCH_QUERY', '')
@@ -327,8 +335,9 @@ LANGUAGE_CHOICES = get_or_create_env_var("LANGUAGE_CHOICES", "['en', 'fr', 'de',
 DEFAULT_DUPLICATE_DETECTION_THRESHOLD = float(get_or_create_env_var("DEFAULT_DUPLICATE_DETECTION_THRESHOLD", "0.95"))
 DEFAULT_MIN_CONSECUTIVE_PAGES = int(get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1"))
 USE_GREEDY_DUPLICATE_DETECTION = get_or_create_env_var("USE_GREEDY_DUPLICATE_DETECTION", "True")
-DEFAULT_COMBINE_PAGES = get_or_create_env_var("DEFAULT_COMBINE_PAGES", "True")
+DEFAULT_COMBINE_PAGES = get_or_create_env_var("DEFAULT_COMBINE_PAGES", "True") # Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.
 DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
+REMOVE_DUPLICATE_ROWS = get_or_create_env_var("REMOVE_DUPLICATE_ROWS", "False")
 
 
 ###
@@ -352,6 +361,7 @@ COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
 
 # Direct mode configuration options
+DIRECT_MODE_DEFAULT_USER = get_or_create_env_var('DIRECT_MODE_DEFAULT_USER', '') # Default username for cli/direct mode requests
 DIRECT_MODE_TASK = get_or_create_env_var('DIRECT_MODE_TASK', 'redact') # 'redact' or 'deduplicate'
 DIRECT_MODE_INPUT_FILE = get_or_create_env_var('DIRECT_MODE_INPUT_FILE', '') # Path to input file
 DIRECT_MODE_OUTPUT_DIR = get_or_create_env_var('DIRECT_MODE_OUTPUT_DIR', OUTPUT_FOLDER) # Output directory
@@ -447,4 +457,4 @@ TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_INPUT_LOC',
 
 TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
 
-DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '7') # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
+DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '7')) # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
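Because these settings are all read through get_or_create_env_var, they can be overridden per run without editing config.py. A small sketch, assuming that helper falls back to the value already present in the process environment; the variable names come from the diff above and the chosen values are illustrative:

    import os

    # Override selected settings before tools.config is imported.
    os.environ["CHOSEN_LOCAL_OCR_MODEL"] = "hybrid"
    os.environ["PREPROCESS_LOCAL_OCR_IMAGES"] = "False"
    os.environ["CUSTOM_ENTITIES"] = "['TITLES', 'UKPOSTCODE', 'STREETNAME', 'CUSTOM']"
    os.environ["DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS"] = "14"

    from tools.config import CHOSEN_LOCAL_OCR_MODEL, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS

    print(CHOSEN_LOCAL_OCR_MODEL)                # "hybrid"
    print(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)   # 14, parsed to int in config.py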
tools/custom_csvlogger.py
CHANGED
@@ -15,7 +15,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from gradio_client import utils as client_utils
 import gradio as gr
-from gradio import utils
+from gradio import utils
 from tools.config import AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
 
 
@@ -56,9 +56,7 @@ class CSVLogger_custom(FlaggingCallback):
 self.simplify_file_data = simplify_file_data
 self.verbose = verbose
 self.dataset_file_name = dataset_file_name
-self.lock = (
-Lock() if not wasm_utils.IS_WASM else contextlib.nullcontext()
-) # The multiprocessing module doesn't work on Lite.
+self.lock = Lock()
 
 def setup(
 self,
tools/custom_image_analyser_engine.py
CHANGED
@@ -260,10 +260,12 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
 adjusted_contrast = contrast
 return adjusted_image, contrast, adjusted_contrast
 
-def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
+def preprocess_image(self, image: Image.Image, perform_binarization: bool = False) -> Tuple[Image.Image, dict]:
 """
 A corrected, logical pipeline for OCR preprocessing.
 Order: Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
+
+I have found that binarization is not always helpful with Tesseract, and can sometimes degrade results. So it is off by default.
 """
 # 1. Convert to greyscale NumPy array
 image_np = self.convert_image_to_array(image)
@@ -278,9 +280,13 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
 adjusted_image_np, _, _ = self._improve_contrast(filtered_image_np)
 
 # 5. Adaptive Thresholding (Binarization) - This is the final step
-
-
-
+if perform_binarization:
+    final_image_np, threshold_metadata = self.adaptive_threshold.preprocess_image(
+        adjusted_image_np
+    )
+else:
+    final_image_np = adjusted_image_np
+    threshold_metadata = {}
 
 # Combine metadata
 final_metadata = {**scale_metadata, **threshold_metadata}
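A minimal sketch of calling the revised preprocessor with and without the new flag; the class and method names come from the diff above, while the image path and the assumption that the enhancer can be constructed with default arguments are illustrative:

    from PIL import Image

    from tools.custom_image_analyser_engine import ContrastSegmentedImageEnhancer

    enhancer = ContrastSegmentedImageEnhancer()   # assumed default construction
    page_image = Image.open("example_data/example_complaint_letter.jpg")

    # Default behaviour after this change: greyscale, rescale, denoise and contrast only.
    clean_image, metadata = enhancer.preprocess_image(page_image)

    # Opt back in to adaptive thresholding for documents that respond well to binarization.
    binarized_image, metadata_bin = enhancer.preprocess_image(page_image, perform_binarization=True)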
tools/data_anonymise.py
CHANGED
@@ -18,15 +18,19 @@ from botocore.client import BaseClient
 from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
-from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN
-from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
-from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer,
 # Use custom version of analyze_dict to be able to track progress
 from tools.presidio_analyzer_custom import analyze_dict
 
 if DO_INITIAL_TABULAR_DATA_CLEAN == "True": DO_INITIAL_TABULAR_DATA_CLEAN = True
 else: DO_INITIAL_TABULAR_DATA_CLEAN = False
 
 fake = Faker("en_UK")
 def fake_first_name(x):
 return fake.first_name()
@@ -233,7 +237,7 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
 def handle_docx_anonymisation(
 file_path: str,
 output_folder: str,
-
 chosen_redact_entities: List[str],
 in_allow_list: List[str],
 in_deny_list: List[str],
@@ -274,15 +278,15 @@ def handle_docx_anonymisation(
 # If there's no text to process, return early
 if not original_texts:
 print(f"No text found in {file_path}. Skipping.")
-return None, None
 
 # 2. Convert to a DataFrame for the existing anonymisation script
 df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})
 
 # 3. Call the core anonymisation script
-anonymised_df, _, decision_log = anonymise_script(
 df=df_to_anonymise,
-
 language=language,
 chosen_redact_entities=chosen_redact_entities,
 in_allow_list=in_allow_list,
@@ -322,11 +326,11 @@ def handle_docx_anonymisation(
 with open(log_file_path, "w", encoding="utf-8-sig") as f:
 f.write(decision_log)
 
-return output_docx_path, log_file_path, output_xlsx_path
 
 def anonymise_files_with_open_text(file_paths: List[str],
 in_text: str,
-
 chosen_cols: List[str],
 chosen_redact_entities: List[str],
 in_allow_list: List[str] = None,
@@ -354,7 +358,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
 Parameters:
 - file_paths (List[str]): A list of file paths to anonymise: '.xlsx', '.xls', '.csv', '.parquet', or '.docx'.
 - in_text (str): The text to anonymise if file_paths is 'open_text'.
--
 - chosen_cols (List[str]): A list of column names to anonymise.
 - language (str): The language of the text to anonymise.
 - chosen_redact_entities (List[str]): A list of entities to redact.
@@ -381,6 +385,9 @@ def anonymise_files_with_open_text(file_paths: List[str],
 
 tic = time.perf_counter()
 comprehend_client = ""
 
 # Use provided language or default
 language = language or DEFAULT_LANGUAGE
@@ -427,7 +434,10 @@ def anonymise_files_with_open_text(file_paths: List[str],
 # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
 if pii_identification_method == "AWS Comprehend":
 print("Trying to connect to AWS Comprehend service")
-if
 print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
 print("aws_access_key_textbox:", aws_access_key_textbox)
 print("aws_secret_access_key:", aws_secret_key_textbox)
@@ -459,14 +469,18 @@ def anonymise_files_with_open_text(file_paths: List[str],
 if latest_file_completed >= len(file_paths):
 print("Last file reached") #, returning files:", str(latest_file_completed))
 # Set to a very high number so as not to mess with subsequent file processing by the user
-latest_file_completed = 99
 final_out_message = '\n'.join(out_message)
-return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number
 
 file_path_loop = [file_paths[int(latest_file_completed)]]
-
 for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "files"):
 
 if anon_file=='open_text':
 anon_df = pd.DataFrame(data={'text':[in_text]})
 chosen_cols=['text']
@@ -474,19 +488,19 @@ def anonymise_files_with_open_text(file_paths: List[str],
 sheet_name = ""
 file_type = ""
 
-out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(
 else:
 # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
-file_type = detect_file_type(
 print("File type is:", file_type)
 
-out_file_part = get_file_name_without_type(
 
 if file_type == 'docx':
-output_path, log_path, output_xlsx_path = handle_docx_anonymisation(
-file_path=
 output_folder=output_folder,
-
 chosen_redact_entities=chosen_redact_entities,
 in_allow_list=in_allow_list_flat,
 in_deny_list=in_deny_list,
@@ -512,7 +526,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
 continue
 
 # Create xlsx file:
-anon_xlsx = pd.ExcelFile(
 anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
 
 
@@ -523,16 +537,16 @@ def anonymise_files_with_open_text(file_paths: List[str],
 if sheet_name not in anon_xlsx.sheet_names:
 continue
 
-anon_df = pd.read_excel(
 
-out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name,
 
 else:
 sheet_name = ""
-anon_df = read_file(
-out_file_part = get_file_name_without_type(
 
-out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name,
 
 # Increase latest file completed count unless we are at the last file
 if latest_file_completed != len(file_paths):
@@ -554,14 +568,14 @@ def anonymise_files_with_open_text(file_paths: List[str],
 out_message_out = '\n'.join(out_message)
 out_message_out = out_message_out + " " + out_time
 
-if
 out_message_out.append(". Your decryption key is " + key_string)
 
 out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
 
 out_message_out = re.sub(r'^\n+|^\. ', '', out_message_out).strip()
 
-return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number
 
 def tabular_anonymise_wrapper_func(
 anon_file: str,
@@ -571,7 +585,7 @@ def tabular_anonymise_wrapper_func(
 out_file_part: str,
 out_message: str,
 excel_sheet_name: str,
-
 language: str,
 chosen_redact_entities: List[str],
 in_allow_list: List[str],
@@ -600,7 +614,7 @@ def tabular_anonymise_wrapper_func(
 - out_file_part: A part of the output file name.
 - out_message: A message to be displayed during the anonymization process.
 - excel_sheet_name: The name of the Excel sheet where the anonymized data will be exported.
--
 - language: The language of the data to be anonymized.
 - chosen_redact_entities: A list of entities to be redacted.
 - in_allow_list: A list of allowed values.
@@ -648,7 +662,7 @@ def tabular_anonymise_wrapper_func(
 out_message = "No chosen columns found in dataframe: " + out_file_part
 key_string = ""
 print(out_message)
-return out_file_paths, out_message, key_string, log_files_output_paths
 else:
 chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
 
@@ -663,7 +677,7 @@ def tabular_anonymise_wrapper_func(
 
 
 # Anonymise the selected columns
-anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part,
 
 anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
 
@@ -673,10 +687,10 @@ def tabular_anonymise_wrapper_func(
 
 # Export file
 # Rename anonymisation strategy for file path naming
-if
-elif
-elif
-else: anon_strat_txt =
 
 # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
 if file_type == 'xlsx':
@@ -716,10 +730,10 @@ def tabular_anonymise_wrapper_func(
 if anon_file=='open_text':
 out_message = ["'" + anon_df_out['text'][0] + "'"]
 
-return out_file_paths, out_message, key_string, log_files_output_paths
 
 def anonymise_script(df:pd.DataFrame,
-
 language:str,
 chosen_redact_entities:List[str],
 in_allow_list:List[str]=list(),
@@ -738,7 +752,7 @@ def anonymise_script(df:pd.DataFrame,
 
 Args:
 df (pd.DataFrame): The input DataFrame containing text to be anonymised.
-
 language (str): The language of the text for analysis (e.g., "en", "es").
 chosen_redact_entities (List[str]): A list of entity types to redact using the local (Presidio) method.
 in_allow_list (List[str], optional): A list of terms to explicitly allow and not redact. Defaults to an empty list.
@@ -948,12 +962,15 @@ def anonymise_script(df:pd.DataFrame,
 people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
 fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
 
-if
-
-
-
-
 chosen_mask_config = people_encrypt_config
 key = secrets.token_bytes(16) # 128 bits = 16 bytes
 key_string = base64.b64encode(key).decode('utf-8')
@@ -962,7 +979,10 @@ def anonymise_script(df:pd.DataFrame,
 for entity, operator in chosen_mask_config.items():
 if operator.operator_name == "encrypt":
 operator.params = {"key": key_string}
-elif
 
 # I think in general people will want to keep date / times - removed Mar 2025 as I don't want to assume for people.
 #keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
@@ -973,4 +993,4 @@ def anonymise_script(df:pd.DataFrame,
 
 scrubbed_df = pd.DataFrame(anonymizer_results)
 
-return scrubbed_df, key_string, decision_process_output_str

 from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
+from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS, AWS_REGION
+from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type, _get_env_list
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, create_nlp_analyser, load_spacy_model
 # Use custom version of analyze_dict to be able to track progress
 from tools.presidio_analyzer_custom import analyze_dict
 
 if DO_INITIAL_TABULAR_DATA_CLEAN == "True": DO_INITIAL_TABULAR_DATA_CLEAN = True
 else: DO_INITIAL_TABULAR_DATA_CLEAN = False
 
+if CUSTOM_ENTITIES: CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
+
+custom_entities = CUSTOM_ENTITIES
+
 fake = Faker("en_UK")
 def fake_first_name(x):
 return fake.first_name()

 def handle_docx_anonymisation(
 file_path: str,
 output_folder: str,
+anon_strategy: str,
 chosen_redact_entities: List[str],
 in_allow_list: List[str],
 in_deny_list: List[str],

 # If there's no text to process, return early
 if not original_texts:
 print(f"No text found in {file_path}. Skipping.")
+return None, None, 0
 
 # 2. Convert to a DataFrame for the existing anonymisation script
 df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})
 
 # 3. Call the core anonymisation script
+anonymised_df, _, decision_log, comprehend_query_number = anonymise_script(
 df=df_to_anonymise,
+anon_strategy=anon_strategy,
 language=language,
 chosen_redact_entities=chosen_redact_entities,
 in_allow_list=in_allow_list,

 with open(log_file_path, "w", encoding="utf-8-sig") as f:
 f.write(decision_log)
 
+return output_docx_path, log_file_path, output_xlsx_path, comprehend_query_number
 
 def anonymise_files_with_open_text(file_paths: List[str],
 in_text: str,
+anon_strategy: str,
 chosen_cols: List[str],
 chosen_redact_entities: List[str],
 in_allow_list: List[str] = None,

 Parameters:
 - file_paths (List[str]): A list of file paths to anonymise: '.xlsx', '.xls', '.csv', '.parquet', or '.docx'.
 - in_text (str): The text to anonymise if file_paths is 'open_text'.
+- anon_strategy (str): The anonymisation strategy to use.
 - chosen_cols (List[str]): A list of column names to anonymise.
 - language (str): The language of the text to anonymise.
 - chosen_redact_entities (List[str]): A list of entities to redact.

 
 tic = time.perf_counter()
 comprehend_client = ""
+
+# If output folder doesn't end with a forward slash, add one
+if not output_folder.endswith('/'): output_folder = output_folder + '/'
 
 # Use provided language or default
 language = language or DEFAULT_LANGUAGE

 # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
 if pii_identification_method == "AWS Comprehend":
 print("Trying to connect to AWS Comprehend service")
+if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
+print("Connecting to Comprehend via existing SSO connection")
+comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)
+elif aws_access_key_textbox and aws_secret_key_textbox:
 print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
 print("aws_access_key_textbox:", aws_access_key_textbox)
 print("aws_secret_access_key:", aws_secret_key_textbox)

 if latest_file_completed >= len(file_paths):
 print("Last file reached") #, returning files:", str(latest_file_completed))
 # Set to a very high number so as not to mess with subsequent file processing by the user
+#latest_file_completed = 99
 final_out_message = '\n'.join(out_message)
+return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number, comprehend_query_number
 
 file_path_loop = [file_paths[int(latest_file_completed)]]
+
 for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "files"):
 
+# Get a string file path
+if isinstance(anon_file, str): file_path = anon_file
+else: file_path = anon_file
+
 if anon_file=='open_text':
 anon_df = pd.DataFrame(data={'text':[in_text]})
 chosen_cols=['text']

 sheet_name = ""
 file_type = ""
 
+out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number = tabular_anonymise_wrapper_func(file_path, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strategy, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=OUTPUT_FOLDER, do_initial_clean=do_initial_clean)
 else:
 # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
+file_type = detect_file_type(file_path)
 print("File type is:", file_type)
 
+out_file_part = get_file_name_without_type(file_path)
 
 if file_type == 'docx':
+output_path, log_path, output_xlsx_path, comprehend_query_number = handle_docx_anonymisation(
+file_path=file_path,
 output_folder=output_folder,
+anon_strategy=anon_strategy,
 chosen_redact_entities=chosen_redact_entities,
 in_allow_list=in_allow_list_flat,
 in_deny_list=in_deny_list,

 continue
 
 # Create xlsx file:
+anon_xlsx = pd.ExcelFile(file_path)
 anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
 
 
 if sheet_name not in anon_xlsx.sheet_names:
 continue
 
+anon_df = pd.read_excel(file_path, sheet_name=sheet_name)
 
+out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strategy, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean)
 
 else:
 sheet_name = ""
+anon_df = read_file(file_path)
+out_file_part = get_file_name_without_type(file_path)
 
+out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strategy, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean
+
out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strategy, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean)
|
550 |
|
551 |
# Increase latest file completed count unless we are at the last file
|
552 |
if latest_file_completed != len(file_paths):
|
|
|
568 |
out_message_out = '\n'.join(out_message)
|
569 |
out_message_out = out_message_out + " " + out_time
|
570 |
|
571 |
+
if anon_strategy == "encrypt":
|
572 |
out_message_out.append(". Your decryption key is " + key_string)
|
573 |
|
574 |
out_message_out = out_message_out + "\n\nGo to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
|
575 |
|
576 |
out_message_out = re.sub(r'^\n+|^\. ', '', out_message_out).strip()
|
577 |
|
578 |
+
return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number, comprehend_query_number
|
579 |
|
580 |
def tabular_anonymise_wrapper_func(
|
581 |
anon_file: str,
|
|
|
585 |
out_file_part: str,
|
586 |
out_message: str,
|
587 |
excel_sheet_name: str,
|
588 |
+
anon_strategy: str,
|
589 |
language: str,
|
590 |
chosen_redact_entities: List[str],
|
591 |
in_allow_list: List[str],
|
|
|
614 |
- out_file_part: A part of the output file name.
|
615 |
- out_message: A message to be displayed during the anonymization process.
|
616 |
- excel_sheet_name: The name of the Excel sheet where the anonymized data will be exported.
|
617 |
+
- anon_strategy: The anonymization strategy to be applied.
|
618 |
- language: The language of the data to be anonymized.
|
619 |
- chosen_redact_entities: A list of entities to be redacted.
|
620 |
- in_allow_list: A list of allowed values.
|
|
|
662 |
out_message = "No chosen columns found in dataframe: " + out_file_part
|
663 |
key_string = ""
|
664 |
print(out_message)
|
665 |
+
return out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number
|
666 |
else:
|
667 |
chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
|
668 |
|
|
|
677 |
|
678 |
|
679 |
# Anonymise the selected columns
|
680 |
+
anon_df_part_out, key_string, decision_process_output_str, comprehend_query_number = anonymise_script(anon_df_part, anon_strategy, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser, do_initial_clean=do_initial_clean)
|
681 |
|
682 |
anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
|
683 |
|
|
|
687 |
|
688 |
# Export file
|
689 |
# Rename anonymisation strategy for file path naming
|
690 |
+
if anon_strategy == "replace with 'REDACTED'": anon_strat_txt = "redact_replace"
|
691 |
+
elif anon_strategy == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
|
692 |
+
elif anon_strategy == "redact completely": anon_strat_txt = "redact_remove"
|
693 |
+
else: anon_strat_txt = anon_strategy
|
694 |
|
695 |
# If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
|
696 |
if file_type == 'xlsx':
|
|
|
730 |
if anon_file=='open_text':
|
731 |
out_message = ["'" + anon_df_out['text'][0] + "'"]
|
732 |
|
733 |
+
return out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number
|
734 |
|
735 |
def anonymise_script(df:pd.DataFrame,
|
736 |
+
anon_strategy:str,
|
737 |
language:str,
|
738 |
chosen_redact_entities:List[str],
|
739 |
in_allow_list:List[str]=list(),
|
|
|
752 |
|
753 |
Args:
|
754 |
df (pd.DataFrame): The input DataFrame containing text to be anonymised.
|
755 |
+
anon_strategy (str): The anonymisation strategy to apply (e.g., "replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely").
|
756 |
language (str): The language of the text for analysis (e.g., "en", "es").
|
757 |
chosen_redact_entities (List[str]): A list of entity types to redact using the local (Presidio) method.
|
758 |
in_allow_list (List[str], optional): A list of terms to explicitly allow and not redact. Defaults to an empty list.
|
|
|
962 |
people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
|
963 |
fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
|
964 |
|
965 |
+
if anon_strategy == "replace with 'REDACTED'": chosen_mask_config = simple_replace_config
|
966 |
+
elif anon_strategy == "replace_redacted": chosen_mask_config = simple_replace_config
|
967 |
+
elif anon_strategy == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
|
968 |
+
elif anon_strategy == "entity_type": chosen_mask_config = replace_config
|
969 |
+
elif anon_strategy == "redact completely": chosen_mask_config = redact_config
|
970 |
+
elif anon_strategy == "redact": chosen_mask_config = redact_config
|
971 |
+
elif anon_strategy == "hash": chosen_mask_config = hash_config
|
972 |
+
elif anon_strategy == "mask": chosen_mask_config = mask_config
|
973 |
+
elif anon_strategy == "encrypt":
|
974 |
chosen_mask_config = people_encrypt_config
|
975 |
key = secrets.token_bytes(16) # 128 bits = 16 bytes
|
976 |
key_string = base64.b64encode(key).decode('utf-8')
|
|
|
979 |
for entity, operator in chosen_mask_config.items():
|
980 |
if operator.operator_name == "encrypt":
|
981 |
operator.params = {"key": key_string}
|
982 |
+
elif anon_strategy == "fake_first_name": chosen_mask_config = fake_first_name_config
|
983 |
+
else:
|
984 |
+
print("Anonymisation strategy not found. Redacting completely by default.")
|
985 |
+
chosen_mask_config = redact_config # Redact completely by default
|
986 |
|
987 |
# I think in general people will want to keep date / times - removed Mar 2025 as I don't want to assume for people.
|
988 |
#keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
|
|
|
993 |
|
994 |
scrubbed_df = pd.DataFrame(anonymizer_results)
|
995 |
|
996 |
+
return scrubbed_df, key_string, decision_process_output_str, comprehend_query_number
|
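For reference, the strategy names mapped above ("replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask", "encrypt", "fake_first_name") resolve to standard Presidio operator configurations. A minimal, self-contained sketch of that pattern, assuming the presidio-analyzer and presidio-anonymizer packages and illustrative text only; it is not the repo's exact pipeline.

import base64
import secrets

from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

text = "Contact Jane Smith on jane.smith@example.com"
results = analyzer.analyze(text=text, language="en")

# Equivalent of "replace with 'REDACTED'" for most entities, with PERSON encrypted.
# As above, the key is 16 random bytes, base64-encoded to a 24-character string,
# which the encrypt operator accepts as a 192-bit AES key.
key_string = base64.b64encode(secrets.token_bytes(16)).decode("utf-8")
operators = {
    "DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"}),
    "PERSON": OperatorConfig("encrypt", {"key": key_string}),
}

print(anonymizer.anonymize(text=text, analyzer_results=results, operators=operators).text)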
tools/example_cli_calls.txt
DELETED
@@ -1,30 +0,0 @@
|
|
1 |
-
python cli_redact.py --help
|
2 |
-
|
3 |
-
python cli_redact.py \
|
4 |
-
--input_file "documents/confidential-report.pdf" \
|
5 |
-
--output_dir "output/redacted_reports/" \
|
6 |
-
--ocr_method "Local OCR model - PDFs without selectable text" \
|
7 |
-
--pii_detector "Local" \
|
8 |
-
--page_min 2 \
|
9 |
-
--page_max 10 \
|
10 |
-
--allow_list "config/project_allowlist.csv"
|
11 |
-
|
12 |
-
python your_cli_script.py \
|
13 |
-
--input_file "data/customer_data.xlsx" \
|
14 |
-
--output_dir "output/anonymised_data/" \
|
15 |
-
--anon_strat "redact" \
|
16 |
-
--columns "Customer Name" "Email" \
|
17 |
-
--excel_sheets "Q3-Data"
|
18 |
-
|
19 |
-
python your_cli_script.py \
|
20 |
-
--input_file "legal_docs/legal_agreement.docx" \
|
21 |
-
--output_dir "output/anonymised_docs/" \
|
22 |
-
--anon_strat "encrypt" \
|
23 |
-
--deny_list "config/codenames.csv" \
|
24 |
-
--language "en"
|
25 |
-
|
26 |
-
python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --min_word_count 5
|
27 |
-
|
28 |
-
python cli_redact.py --task deduplicate --input_file data.csv --duplicate_type tabular --text_columns "Name" "Email" "Description"
|
29 |
-
|
30 |
-
python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --search_query "confidential information"
|
|
|
|
tools/file_redaction.py
CHANGED
@@ -20,11 +20,11 @@ import gradio as gr
|
|
20 |
from gradio import Progress
|
21 |
from collections import defaultdict # For efficient grouping
|
22 |
|
23 |
-
from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices
|
24 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
|
25 |
from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
|
26 |
-
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold,
|
27 |
-
from tools.helper_functions import get_file_name_without_type, clean_unicode_text
|
28 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
|
29 |
|
30 |
ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
|
@@ -34,6 +34,10 @@ image_dpi = float(IMAGES_DPI)
|
|
34 |
|
35 |
RETURN_PDF_END_OF_REDACTION = RETURN_PDF_END_OF_REDACTION.lower() == "true"
|
36 |
|
|
|
|
|
|
|
|
|
37 |
def bounding_boxes_overlap(box1, box2):
|
38 |
"""Check if two bounding boxes overlap."""
|
39 |
return (box1[0] < box2[2] and box2[0] < box1[2] and
|
@@ -91,7 +95,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
91 |
chosen_redact_entities:List[str],
|
92 |
chosen_redact_comprehend_entities:List[str],
|
93 |
text_extraction_method:str,
|
94 |
-
in_allow_list:List[
|
95 |
in_deny_list:List[str]=list(),
|
96 |
redact_whole_page_list:List[str]=list(),
|
97 |
latest_file_completed:int=0,
|
@@ -146,9 +150,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
146 |
- chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
|
147 |
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
|
148 |
- text_extraction_method (str): The method to use to extract text from documents.
|
149 |
-
- in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to
|
150 |
-
- in_deny_list (List[List[str]], optional): A list of denied terms for redaction. Defaults to
|
151 |
-
- redact_whole_page_list (List[List[str]], optional): A list of whole page numbers for redaction. Defaults to
|
152 |
- latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
|
153 |
- combined_out_message (list, optional): A list to store output messages. Defaults to an empty list.
|
154 |
- out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
|
@@ -202,8 +206,19 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
202 |
pdf_file_name_without_ext = ""
|
203 |
page_break_return = False
|
204 |
blank_request_metadata = list()
|
|
|
205 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
206 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
|
|
|
|
207 |
|
208 |
# Use provided language or default
|
209 |
language = language or DEFAULT_LANGUAGE
|
@@ -284,6 +299,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
284 |
progress(0.95, "Completed last file, performing final checks")
|
285 |
current_loop_page = 0
|
286 |
|
|
|
|
|
287 |
if isinstance(out_message, list) and out_message:
|
288 |
combined_out_message = combined_out_message + '\n'.join(out_message)
|
289 |
elif out_message:
|
@@ -312,7 +329,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
312 |
|
313 |
page_break_return = True
|
314 |
|
315 |
-
return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state
|
316 |
|
317 |
#if first_loop_state == False:
|
318 |
# Prepare documents and images as required if they don't already exist
|
@@ -346,10 +363,11 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
346 |
file_paths_loop, text_extraction_method, all_page_line_level_ocr_results_df, all_page_line_level_ocr_results_with_words_df, 0, out_message, True,
|
347 |
annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
|
348 |
output_folder=output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, pymupdf_doc=pymupdf_doc, input_folder=input_folder
|
349 |
-
)
|
350 |
-
|
351 |
-
page_sizes_df = pd.DataFrame(page_sizes)
|
352 |
|
|
|
|
|
353 |
if page_sizes_df.empty:
|
354 |
page_sizes_df=pd.DataFrame(columns=["page", "image_path", "image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height", "original_cropbox"])
|
355 |
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
@@ -378,32 +396,45 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
378 |
|
379 |
page_break_return = False
|
380 |
|
381 |
-
return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state
|
382 |
|
383 |
-
|
|
|
|
|
384 |
# If string, assume file path
|
385 |
-
if isinstance(in_allow_list, str):
|
|
|
|
|
386 |
# Now, should be a pandas dataframe format
|
387 |
-
if
|
388 |
-
|
|
|
|
|
|
|
389 |
else:
|
390 |
in_allow_list_flat = list()
|
391 |
|
|
|
392 |
# If string, assume file path
|
393 |
if isinstance(in_deny_list, str):
|
394 |
-
|
|
|
|
|
395 |
if isinstance(in_deny_list, pd.DataFrame):
|
396 |
if not in_deny_list.empty:
|
397 |
custom_recogniser_word_list_flat = in_deny_list.iloc[:, 0].tolist()
|
398 |
else:
|
399 |
custom_recogniser_word_list_flat = list()
|
400 |
-
|
401 |
# Sort the strings in order from the longest string to the shortest
|
402 |
custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
|
|
|
|
|
403 |
|
|
|
404 |
# If string, assume file path
|
405 |
if isinstance(redact_whole_page_list, str):
|
406 |
-
|
|
|
407 |
if isinstance(redact_whole_page_list, pd.DataFrame):
|
408 |
if not redact_whole_page_list.empty:
|
409 |
try:
|
@@ -412,13 +443,18 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
412 |
print("Could not convert whole page redaction data to number list due to:", e)
|
413 |
redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
|
414 |
else:
|
415 |
-
redact_whole_page_list_flat = list()
|
|
|
|
|
416 |
|
417 |
-
|
418 |
|
419 |
# Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
|
420 |
if pii_identification_method == AWS_PII_OPTION:
|
421 |
-
if
|
|
|
|
|
|
|
422 |
print("Connecting to Comprehend using AWS access key and secret keys from user input.")
|
423 |
comprehend_client = boto3.client('comprehend',
|
424 |
aws_access_key_id=aws_access_key_textbox,
|
@@ -441,7 +477,10 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
441 |
|
442 |
# Try to connect to AWS Textract Client if using that text extraction method
|
443 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
444 |
-
if
|
|
|
|
|
|
|
445 |
print("Connecting to Textract using AWS access key and secret keys from user input.")
|
446 |
textract_client = boto3.client('textract',
|
447 |
aws_access_key_id=aws_access_key_textbox,
|
@@ -647,7 +686,10 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
647 |
print("Saving redacted PDF file:", out_redacted_pdf_file_path)
|
648 |
save_pdf_with_or_without_compression(pymupdf_doc, out_redacted_pdf_file_path)
|
649 |
|
650 |
-
|
|
|
|
|
|
|
651 |
|
652 |
if not all_page_line_level_ocr_results_df.empty:
|
653 |
all_page_line_level_ocr_results_df = all_page_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height", "line"]]
|
@@ -658,8 +700,12 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
658 |
all_page_line_level_ocr_results_df.sort_values(["page", "line"], inplace=True)
|
659 |
all_page_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8-sig")
|
660 |
|
661 |
-
|
662 |
-
|
|
|
|
|
|
|
|
|
663 |
|
664 |
if all_page_line_level_ocr_results_with_words:
|
665 |
all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
|
@@ -670,14 +716,12 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
670 |
all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
|
671 |
|
672 |
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
|
673 |
-
|
|
|
674 |
|
675 |
if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
676 |
# Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
|
677 |
if not all_page_line_level_ocr_results_with_words_df.empty:
|
678 |
-
|
679 |
-
# all_page_line_level_ocr_results_with_words_df['line_y0'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'line_y0')
|
680 |
-
# all_page_line_level_ocr_results_with_words_df['line_y1'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'line_y1')
|
681 |
all_page_line_level_ocr_results_with_words_df['word_y0'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'word_y0')
|
682 |
all_page_line_level_ocr_results_with_words_df['word_y1'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'word_y1')
|
683 |
|
@@ -685,20 +729,32 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
685 |
all_page_line_level_ocr_results_with_words_df['line_x0'] = ""
|
686 |
all_page_line_level_ocr_results_with_words_df['line_x1'] = ""
|
687 |
all_page_line_level_ocr_results_with_words_df['line_y0'] = ""
|
688 |
-
all_page_line_level_ocr_results_with_words_df['line_y1'] = ""
|
689 |
|
690 |
all_page_line_level_ocr_results_with_words_df.sort_values(["page", "line", "word_x0"], inplace=True)
|
691 |
all_page_line_level_ocr_results_with_words_df_file_path = all_page_line_level_ocr_results_with_words_json_file_path.replace(".json", ".csv")
|
692 |
all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path, index = None, encoding="utf-8-sig")
|
693 |
|
694 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
|
|
|
|
|
|
|
|
695 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
696 |
|
697 |
if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
|
698 |
-
|
|
|
|
|
|
|
699 |
|
700 |
if all_page_line_level_ocr_results_with_words_df_file_path not in out_file_paths:
|
701 |
-
|
|
|
|
|
|
|
|
|
|
|
702 |
|
703 |
# Convert the gradio annotation boxes to relative coordinates
|
704 |
progress(0.93, "Creating review file output")
|
@@ -711,18 +767,27 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
711 |
# Save the gradio_annotation_boxes to a review csv file
|
712 |
review_file_state = convert_annotation_json_to_review_df(annotations_all_pages_divide, all_pages_decision_process_table, page_sizes=page_sizes)
|
713 |
|
|
|
714 |
# Don't need page sizes in outputs
|
715 |
review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
|
716 |
|
717 |
-
|
|
|
|
|
|
|
718 |
|
719 |
-
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
720 |
|
721 |
-
|
722 |
-
|
723 |
-
combined_out_message = combined_out_message + '\n'.join(out_message) # Ensure out_message is a list of strings
|
724 |
-
elif out_message:
|
725 |
-
combined_out_message = combined_out_message + '\n' + out_message
|
726 |
|
727 |
toc = time.perf_counter()
|
728 |
time_taken = toc - tic
|
@@ -747,7 +812,11 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
747 |
with open(all_textract_request_metadata_file_path, "w") as f: f.write(all_request_metadata_str)
|
748 |
|
749 |
# Add the request metadata to the log outputs if not there already
|
750 |
-
if all_textract_request_metadata_file_path not in log_files_output_paths:
|
|
|
|
|
|
|
|
|
751 |
|
752 |
new_textract_query_numbers = len(all_textract_request_metadata)
|
753 |
total_textract_query_number += new_textract_query_numbers
|
@@ -764,7 +833,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
764 |
|
765 |
page_break_return = True
|
766 |
|
767 |
-
return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state
|
768 |
|
769 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
|
770 |
'''
|
@@ -1266,10 +1335,13 @@ def merge_img_bboxes(bboxes: list, combined_results: Dict, page_signature_recogn
|
|
1266 |
|
1267 |
# Process signature and handwriting results
|
1268 |
if page_signature_recogniser_results or page_handwriting_recogniser_results:
|
|
|
1269 |
if "Extract handwriting" in handwrite_signature_checkbox:
|
|
|
1270 |
merged_bboxes.extend(copy.deepcopy(page_handwriting_recogniser_results))
|
1271 |
|
1272 |
if "Extract signatures" in handwrite_signature_checkbox:
|
|
|
1273 |
merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
|
1274 |
|
1275 |
# Reconstruct bounding boxes for substrings of interest
|
@@ -2385,7 +2457,10 @@ def redact_text_pdf(
|
|
2385 |
all_page_line_text_extraction_characters.extend(line_characters)
|
2386 |
all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
|
2387 |
|
2388 |
-
|
|
|
|
|
|
|
2389 |
|
2390 |
### REDACTION
|
2391 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
|
|
20 |
from gradio import Progress
|
21 |
from collections import defaultdict # For efficient grouping
|
22 |
|
23 |
+
from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS
|
24 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
|
25 |
from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
|
26 |
+
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
|
27 |
+
from tools.helper_functions import get_file_name_without_type, clean_unicode_text, _get_env_list
|
28 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
|
29 |
|
30 |
ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
|
|
|
34 |
|
35 |
RETURN_PDF_END_OF_REDACTION = RETURN_PDF_END_OF_REDACTION.lower() == "true"
|
36 |
|
37 |
+
if CUSTOM_ENTITIES: CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
|
38 |
+
|
39 |
+
custom_entities = CUSTOM_ENTITIES
|
40 |
+
|
41 |
def bounding_boxes_overlap(box1, box2):
|
42 |
"""Check if two bounding boxes overlap."""
|
43 |
return (box1[0] < box2[2] and box2[0] < box1[2] and
|
|
|
95 |
chosen_redact_entities:List[str],
|
96 |
chosen_redact_comprehend_entities:List[str],
|
97 |
text_extraction_method:str,
|
98 |
+
in_allow_list:List[str]=list(),
|
99 |
in_deny_list:List[str]=list(),
|
100 |
redact_whole_page_list:List[str]=list(),
|
101 |
latest_file_completed:int=0,
|
|
|
150 |
- chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
|
151 |
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
|
152 |
- text_extraction_method (str): The method to use to extract text from documents.
|
153 |
+
- in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to empty list. Can also be entered as a string path to a CSV file, or as a single column pandas dataframe.
|
154 |
+
- in_deny_list (List[List[str]], optional): A list of denied terms for redaction. Defaults to empty list. Can also be entered as a string path to a CSV file, or as a single column pandas dataframe.
|
155 |
+
- redact_whole_page_list (List[List[str]], optional): A list of whole page numbers for redaction. Defaults to empty list. Can also be entered as a string path to a CSV file, or as a single column pandas dataframe.
|
156 |
- latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
|
157 |
- combined_out_message (list, optional): A list to store output messages. Defaults to an empty list.
|
158 |
- out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
|
|
|
206 |
pdf_file_name_without_ext = ""
|
207 |
page_break_return = False
|
208 |
blank_request_metadata = list()
|
209 |
+
custom_recogniser_word_list_flat = list()
|
210 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
211 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
212 |
+
task_textbox = "redact"
|
213 |
+
|
214 |
+
# CLI mode may provide options to enter method names in a different format
|
215 |
+
if text_extraction_method == "AWS Textract": text_extraction_method = TEXTRACT_TEXT_EXTRACT_OPTION
|
216 |
+
if text_extraction_method == "Local OCR": text_extraction_method = TESSERACT_TEXT_EXTRACT_OPTION
|
217 |
+
if text_extraction_method == "Local text": text_extraction_method = SELECTABLE_TEXT_EXTRACT_OPTION
|
218 |
+
if pii_identification_method == "None": pii_identification_method = NO_REDACTION_PII_OPTION
|
219 |
+
|
220 |
+
# If output folder doesn't end with a forward slash, add one
|
221 |
+
if not output_folder.endswith('/'): output_folder = output_folder + '/'
|
222 |
|
223 |
# Use provided language or default
|
224 |
language = language or DEFAULT_LANGUAGE
|
|
|
299 |
progress(0.95, "Completed last file, performing final checks")
|
300 |
current_loop_page = 0
|
301 |
|
302 |
+
if isinstance(combined_out_message, list): combined_out_message = '\n'.join(combined_out_message)
|
303 |
+
|
304 |
if isinstance(out_message, list) and out_message:
|
305 |
combined_out_message = combined_out_message + '\n'.join(out_message)
|
306 |
elif out_message:
|
|
|
329 |
|
330 |
page_break_return = True
|
331 |
|
332 |
+
return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state, task_textbox
|
333 |
|
334 |
#if first_loop_state == False:
|
335 |
# Prepare documents and images as required if they don't already exist
|
|
|
363 |
file_paths_loop, text_extraction_method, all_page_line_level_ocr_results_df, all_page_line_level_ocr_results_with_words_df, 0, out_message, True,
|
364 |
annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
|
365 |
output_folder=output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, pymupdf_doc=pymupdf_doc, input_folder=input_folder
|
366 |
+
)
|
367 |
+
|
|
|
368 |
|
369 |
+
page_sizes_df = pd.DataFrame(page_sizes)
|
370 |
+
|
371 |
if page_sizes_df.empty:
|
372 |
page_sizes_df=pd.DataFrame(columns=["page", "image_path", "image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height", "original_cropbox"])
|
373 |
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
|
|
396 |
|
397 |
page_break_return = False
|
398 |
|
399 |
+
return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state, task_textbox
|
400 |
|
401 |
+
### Load/create allow list, deny list, and whole page redaction list
|
402 |
+
|
403 |
+
### Load/create allow list
|
404 |
# If string, assume file path
|
405 |
+
if isinstance(in_allow_list, str):
|
406 |
+
if in_allow_list:
|
407 |
+
in_allow_list = pd.read_csv(in_allow_list, header=None)
|
408 |
# Now, should be a pandas dataframe format
|
409 |
+
if isinstance(in_allow_list, pd.DataFrame):
|
410 |
+
if not in_allow_list.empty:
|
411 |
+
in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
|
412 |
+
else:
|
413 |
+
in_allow_list_flat = list()
|
414 |
else:
|
415 |
in_allow_list_flat = list()
|
416 |
|
417 |
+
### Load/create deny list
|
418 |
# If string, assume file path
|
419 |
if isinstance(in_deny_list, str):
|
420 |
+
if in_deny_list:
|
421 |
+
in_deny_list = pd.read_csv(in_deny_list, header=None)
|
422 |
+
|
423 |
if isinstance(in_deny_list, pd.DataFrame):
|
424 |
if not in_deny_list.empty:
|
425 |
custom_recogniser_word_list_flat = in_deny_list.iloc[:, 0].tolist()
|
426 |
else:
|
427 |
custom_recogniser_word_list_flat = list()
|
|
|
428 |
# Sort the strings in order from the longest string to the shortest
|
429 |
custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
|
430 |
+
else:
|
431 |
+
custom_recogniser_word_list_flat = list()
|
432 |
|
433 |
+
### Load/create whole page redaction list
|
434 |
# If string, assume file path
|
435 |
if isinstance(redact_whole_page_list, str):
|
436 |
+
if redact_whole_page_list:
|
437 |
+
redact_whole_page_list = pd.read_csv(redact_whole_page_list, header=None)
|
438 |
if isinstance(redact_whole_page_list, pd.DataFrame):
|
439 |
if not redact_whole_page_list.empty:
|
440 |
try:
|
|
|
443 |
print("Could not convert whole page redaction data to number list due to:", e)
|
444 |
redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
|
445 |
else:
|
446 |
+
redact_whole_page_list_flat = list()
|
447 |
+
else:
|
448 |
+
redact_whole_page_list_flat = list()
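The three blocks above share one pattern: accept a CSV file path, a DataFrame, or a plain list, and reduce it to a flat list of values. A minimal sketch of that pattern as a standalone helper (to_flat_list is hypothetical, not a function in this repo), assuming a headerless single-column CSV as in the pd.read_csv calls above.

import pandas as pd
from typing import List, Union

def to_flat_list(value: Union[str, pd.DataFrame, List[str], None]) -> List[str]:
    # A non-empty string is treated as a path to a headerless, single-column CSV.
    if isinstance(value, str) and value:
        value = pd.read_csv(value, header=None)
    if isinstance(value, pd.DataFrame):
        return [] if value.empty else value.iloc[:, 0].tolist()
    return list(value) if value else []

# e.g. in_allow_list_flat = to_flat_list(in_allow_list)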
|
449 |
|
450 |
+
### Load/create PII identification method
|
451 |
|
452 |
# Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
|
453 |
if pii_identification_method == AWS_PII_OPTION:
|
454 |
+
if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
|
455 |
+
print("Connecting to Comprehend via existing SSO connection")
|
456 |
+
comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)
|
457 |
+
elif aws_access_key_textbox and aws_secret_key_textbox:
|
458 |
print("Connecting to Comprehend using AWS access key and secret keys from user input.")
|
459 |
comprehend_client = boto3.client('comprehend',
|
460 |
aws_access_key_id=aws_access_key_textbox,
|
|
|
477 |
|
478 |
# Try to connect to AWS Textract Client if using that text extraction method
|
479 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
480 |
+
if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
|
481 |
+
print("Connecting to Textract via existing SSO connection")
|
482 |
+
textract_client = boto3.client('textract', region_name=AWS_REGION)
|
483 |
+
elif aws_access_key_textbox and aws_secret_key_textbox:
|
484 |
print("Connecting to Textract using AWS access key and secret keys from user input.")
|
485 |
textract_client = boto3.client('textract',
|
486 |
aws_access_key_id=aws_access_key_textbox,
|
|
|
686 |
print("Saving redacted PDF file:", out_redacted_pdf_file_path)
|
687 |
save_pdf_with_or_without_compression(pymupdf_doc, out_redacted_pdf_file_path)
|
688 |
|
689 |
+
if isinstance(out_redacted_pdf_file_path, str):
|
690 |
+
out_file_paths.append(out_redacted_pdf_file_path)
|
691 |
+
else:
|
692 |
+
out_file_paths.append(out_redacted_pdf_file_path[0])
|
693 |
|
694 |
if not all_page_line_level_ocr_results_df.empty:
|
695 |
all_page_line_level_ocr_results_df = all_page_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height", "line"]]
|
|
|
700 |
all_page_line_level_ocr_results_df.sort_values(["page", "line"], inplace=True)
|
701 |
all_page_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8-sig")
|
702 |
|
703 |
+
|
704 |
+
|
705 |
+
if isinstance(ocr_file_path, str):
|
706 |
+
out_file_paths.append(ocr_file_path)
|
707 |
+
else:
|
708 |
+
duplication_file_path_outputs.append(ocr_file_path[0])
|
709 |
|
710 |
if all_page_line_level_ocr_results_with_words:
|
711 |
all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
|
|
|
716 |
all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
|
717 |
|
718 |
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
|
719 |
+
|
720 |
+
|
721 |
|
722 |
if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
723 |
# Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
|
724 |
if not all_page_line_level_ocr_results_with_words_df.empty:
|
|
|
|
|
|
|
725 |
all_page_line_level_ocr_results_with_words_df['word_y0'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'word_y0')
|
726 |
all_page_line_level_ocr_results_with_words_df['word_y1'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'word_y1')
|
727 |
|
|
|
729 |
all_page_line_level_ocr_results_with_words_df['line_x0'] = ""
|
730 |
all_page_line_level_ocr_results_with_words_df['line_x1'] = ""
|
731 |
all_page_line_level_ocr_results_with_words_df['line_y0'] = ""
|
732 |
+
all_page_line_level_ocr_results_with_words_df['line_y1'] = ""
|
733 |
|
734 |
all_page_line_level_ocr_results_with_words_df.sort_values(["page", "line", "word_x0"], inplace=True)
|
735 |
all_page_line_level_ocr_results_with_words_df_file_path = all_page_line_level_ocr_results_with_words_json_file_path.replace(".json", ".csv")
|
736 |
all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path, index = None, encoding="utf-8-sig")
|
737 |
|
738 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
739 |
+
if isinstance(all_page_line_level_ocr_results_with_words_json_file_path, str):
|
740 |
+
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
741 |
+
else:
|
742 |
+
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path[0])
|
743 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
744 |
|
745 |
if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
|
746 |
+
if isinstance(all_page_line_level_ocr_results_with_words_df_file_path, str):
|
747 |
+
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
|
748 |
+
else:
|
749 |
+
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path[0])
|
750 |
|
751 |
if all_page_line_level_ocr_results_with_words_df_file_path not in out_file_paths:
|
752 |
+
if isinstance(all_page_line_level_ocr_results_with_words_df_file_path, str):
|
753 |
+
out_file_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
|
754 |
+
else:
|
755 |
+
out_file_paths.append(all_page_line_level_ocr_results_with_words_df_file_path[0])
|
756 |
+
|
757 |
+
|
758 |
|
759 |
# Convert the gradio annotation boxes to relative coordinates
|
760 |
progress(0.93, "Creating review file output")
|
|
|
767 |
# Save the gradio_annotation_boxes to a review csv file
|
768 |
review_file_state = convert_annotation_json_to_review_df(annotations_all_pages_divide, all_pages_decision_process_table, page_sizes=page_sizes)
|
769 |
|
770 |
+
|
771 |
# Don't need page sizes in outputs
|
772 |
review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
|
773 |
|
774 |
+
if isinstance(review_file_path, str):
|
775 |
+
review_file_state.to_csv(review_file_path, index=None, encoding="utf-8-sig")
|
776 |
+
else:
|
777 |
+
review_file_state.to_csv(review_file_path[0], index=None, encoding="utf-8-sig")
|
778 |
|
779 |
+
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
780 |
+
if isinstance(review_file_path, str):
|
781 |
+
out_file_paths.append(review_file_path)
|
782 |
+
else:
|
783 |
+
out_file_paths.append(review_file_path[0])
|
784 |
+
|
785 |
+
# Make a combined message for the file
|
786 |
+
if isinstance(combined_out_message, list): combined_out_message = '\n'.join(combined_out_message)
|
787 |
+
elif combined_out_message == None: combined_out_message = ""
|
788 |
|
789 |
+
if isinstance(out_message, list) and out_message: combined_out_message = combined_out_message + '\n'.join(out_message)
|
790 |
+
elif isinstance(out_message, str) and out_message: combined_out_message = combined_out_message + '\n' + out_message
|
|
|
|
|
|
|
791 |
|
792 |
toc = time.perf_counter()
|
793 |
time_taken = toc - tic
|
|
|
812 |
with open(all_textract_request_metadata_file_path, "w") as f: f.write(all_request_metadata_str)
|
813 |
|
814 |
# Add the request metadata to the log outputs if not there already
|
815 |
+
if all_textract_request_metadata_file_path not in log_files_output_paths:
|
816 |
+
if isinstance(all_textract_request_metadata_file_path, str):
|
817 |
+
log_files_output_paths.append(all_textract_request_metadata_file_path)
|
818 |
+
else:
|
819 |
+
log_files_output_paths.append(all_textract_request_metadata_file_path[0])
|
820 |
|
821 |
new_textract_query_numbers = len(all_textract_request_metadata)
|
822 |
total_textract_query_number += new_textract_query_numbers
|
|
|
833 |
|
834 |
page_break_return = True
|
835 |
|
836 |
+
return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state, task_textbox
|
837 |
|
838 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
|
839 |
'''
|
|
|
1335 |
|
1336 |
# Process signature and handwriting results
|
1337 |
if page_signature_recogniser_results or page_handwriting_recogniser_results:
|
1338 |
+
|
1339 |
if "Extract handwriting" in handwrite_signature_checkbox:
|
1340 |
+
print("Extracting handwriting in merge_img_bboxes function")
|
1341 |
merged_bboxes.extend(copy.deepcopy(page_handwriting_recogniser_results))
|
1342 |
|
1343 |
if "Extract signatures" in handwrite_signature_checkbox:
|
1344 |
+
print("Extracting signatures in merge_img_bboxes function")
|
1345 |
merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
|
1346 |
|
1347 |
# Reconstruct bounding boxes for substrings of interest
|
|
|
2457 |
all_page_line_text_extraction_characters.extend(line_characters)
|
2458 |
all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
|
2459 |
|
2460 |
+
if page_text_ocr_outputs_list:
|
2461 |
+
page_text_ocr_outputs = pd.concat(page_text_ocr_outputs_list)
|
2462 |
+
else:
|
2463 |
+
page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])
|
2464 |
|
2465 |
### REDACTION
|
2466 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
tools/find_duplicate_pages.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import pandas as pd
|
2 |
import os
|
3 |
import re
|
4 |
-
|
5 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
6 |
from sklearn.metrics.pairwise import cosine_similarity
|
7 |
from typing import List, Tuple, Optional, Dict, Union
|
@@ -725,7 +725,7 @@ def identify_similar_text_sequences(
|
|
725 |
do_text_clean:bool = True,
|
726 |
file1_name: str = '',
|
727 |
file2_name: str = '',
|
728 |
-
output_folder: str =
|
729 |
progress=Progress(track_tqdm=True)
|
730 |
) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
|
731 |
"""
|
@@ -870,7 +870,7 @@ def handle_selection_and_preview(evt: gr.SelectData, results_df:pd.DataFrame, fu
|
|
870 |
# 3. Return all three outputs in the correct order
|
871 |
return selected_index, page1_data, page2_data
|
872 |
|
873 |
-
def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder=
|
874 |
"""
|
875 |
Removes a selected row from the results DataFrame, regenerates output files,
|
876 |
and clears the text preview panes.
|
@@ -895,12 +895,16 @@ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder="./
|
|
895 |
# Return the updated dataframe, the new file list, and clear the preview panes
|
896 |
return updated_df, new_output_paths, None, None
|
897 |
|
898 |
-
def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
|
899 |
"""
|
900 |
Wrapper function updated to include the 'greedy_match' boolean.
|
901 |
"""
|
902 |
if not files:
|
903 |
raise Warning("Please upload files to analyse.")
|
|
|
|
|
|
|
|
|
904 |
|
905 |
progress(0, desc="Combining input files...")
|
906 |
df_combined, _, full_out_ocr_df = combine_ocr_output_text(files, combine_pages=combine_pages)
|
@@ -916,6 +920,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
|
|
916 |
min_consecutive_pages=int(min_consecutive),
|
917 |
greedy_match=greedy_match,
|
918 |
combine_pages=combine_pages,
|
|
|
919 |
progress=progress
|
920 |
)
|
921 |
|
@@ -929,8 +934,11 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
|
|
929 |
|
930 |
if results_df.empty:
|
931 |
gr.Info(f"No duplicate pages found, no results returned.")
|
|
|
|
|
|
|
932 |
|
933 |
-
return results_df, output_paths, full_data_by_file
|
934 |
|
935 |
def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
|
936 |
"""
|
|
|
1 |
import pandas as pd
|
2 |
import os
|
3 |
import re
|
4 |
+
import time
|
5 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
6 |
from sklearn.metrics.pairwise import cosine_similarity
|
7 |
from typing import List, Tuple, Optional, Dict, Union
|
|
|
725 |
do_text_clean:bool = True,
|
726 |
file1_name: str = '',
|
727 |
file2_name: str = '',
|
728 |
+
output_folder: str = OUTPUT_FOLDER,
|
729 |
progress=Progress(track_tqdm=True)
|
730 |
) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
|
731 |
"""
|
|
|
870 |
# 3. Return all three outputs in the correct order
|
871 |
return selected_index, page1_data, page2_data
|
872 |
|
873 |
+
def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder=OUTPUT_FOLDER):
|
874 |
"""
|
875 |
Removes a selected row from the results DataFrame, regenerates output files,
|
876 |
and clears the text preview panes.
|
|
|
895 |
# Return the updated dataframe, the new file list, and clear the preview panes
|
896 |
return updated_df, new_output_paths, None, None
|
897 |
|
898 |
+
def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, output_folder:str=OUTPUT_FOLDER, progress=gr.Progress(track_tqdm=True)):
|
899 |
"""
|
900 |
Wrapper function updated to include the 'greedy_match' boolean.
|
901 |
"""
|
902 |
if not files:
|
903 |
raise Warning("Please upload files to analyse.")
|
904 |
+
|
905 |
+
start_time = time.time()
|
906 |
+
|
907 |
+
task_textbox = "deduplicate"
|
908 |
|
909 |
progress(0, desc="Combining input files...")
|
910 |
df_combined, _, full_out_ocr_df = combine_ocr_output_text(files, combine_pages=combine_pages)
|
|
|
920 |
min_consecutive_pages=int(min_consecutive),
|
921 |
greedy_match=greedy_match,
|
922 |
combine_pages=combine_pages,
|
923 |
+
output_folder=output_folder,
|
924 |
progress=progress
|
925 |
)
|
926 |
|
|
|
934 |
|
935 |
if results_df.empty:
|
936 |
gr.Info(f"No duplicate pages found, no results returned.")
|
937 |
+
|
938 |
+
end_time = time.time()
|
939 |
+
processing_time = round(end_time - start_time, 2)
|
940 |
|
941 |
+
return results_df, output_paths, full_data_by_file, processing_time, task_textbox
|
942 |
|
943 |
def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
|
944 |
"""
|
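The page-level duplicate detection in this module rests on the TfidfVectorizer and cosine_similarity imports at the top of the file. A minimal sketch of that core idea, with made-up page text and a hypothetical 0.9 threshold (the defaults and CLI examples elsewhere in this commit use values such as 0.95); the real code adds cleaning, minimum word counts, and consecutive-page matching on top.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pages = pd.Series([
    "This page sets out the partnership agreement terms.",
    "This page sets out the partnership agreement terms and signatures.",
    "A completely unrelated page about travel expense claims.",
])

tfidf_matrix = TfidfVectorizer().fit_transform(pages)
similarity = cosine_similarity(tfidf_matrix)

threshold = 0.9
for i in range(len(pages)):
    for j in range(i + 1, len(pages)):
        score = similarity[i, j]
        flag = "DUPLICATE" if score >= threshold else "distinct"
        print(f"pages {i} vs {j}: {score:.3f} ({flag})")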
tools/find_duplicate_tabular.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import pandas as pd
|
2 |
import os
|
3 |
import re
|
|
|
4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
from typing import List, Tuple, Dict
|
@@ -10,9 +11,10 @@ from pathlib import Path
|
|
10 |
from tools.helper_functions import OUTPUT_FOLDER, read_file
|
11 |
from tools.data_anonymise import initial_clean
|
12 |
from tools.load_spacy_model_custom_recognisers import nlp
|
13 |
-
from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN
|
14 |
|
15 |
-
|
|
|
16 |
|
17 |
def clean_and_stem_text_series(df: pd.DataFrame, column: str, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN):
|
18 |
"""
|
@@ -51,13 +53,17 @@ def convert_tabular_data_to_analysis_format(
|
|
51 |
Returns:
|
52 |
List[Tuple[str, pd.DataFrame]]: List containing (file_name, processed_df) tuple
|
53 |
"""
|
54 |
-
if text_columns is None:
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
57 |
|
58 |
if not text_columns:
|
59 |
print(f"No text columns found in {file_name}")
|
60 |
-
return
|
61 |
|
62 |
# Create a copy to avoid modifying original
|
63 |
df_copy = df.copy()
|
@@ -69,9 +75,9 @@ def convert_tabular_data_to_analysis_format(
|
|
69 |
df_copy['row_id'] = df_copy.index
|
70 |
|
71 |
# Create the format expected by the duplicate detection system
|
72 |
-
# Using '
|
73 |
processed_df = pd.DataFrame({
|
74 |
-
'
|
75 |
'text': df_copy['combined_text'],
|
76 |
'file': file_name
|
77 |
})
|
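convert_tabular_data_to_analysis_format reshapes each spreadsheet into the (row identifier, text, file) layout that the duplicate-detection logic expects, by joining the chosen text columns into one string per row. A minimal sketch of that reshaping with hypothetical column and file names; the exact key used for the row identifier is an assumption here, since the revised code above renames it.

import pandas as pd

df = pd.DataFrame({
    "Name": ["A. Smith", "A. Smith", "B. Jones"],
    "Notes": ["Called about invoice", "Called about invoice", "New enquiry"],
})
text_columns = ["Name", "Notes"]
file_name = "case_notes.csv"  # hypothetical

df_copy = df.copy()
# Join the selected columns into one text field per row, as the function above does.
df_copy["combined_text"] = df_copy[text_columns].fillna("").astype(str).agg(" ".join, axis=1)
df_copy["row_id"] = df_copy.index

processed_df = pd.DataFrame({
    "row_id": df_copy["row_id"],   # identifier key name assumed
    "text": df_copy["combined_text"],
    "file": file_name,
})
print(processed_df)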
@@ -86,13 +92,15 @@ def find_duplicate_cells_in_tabular_data(
|
|
86 |
input_files: List[str],
|
87 |
similarity_threshold: float = 0.95,
|
88 |
min_word_count: int = 3,
|
89 |
-
text_columns: List[str] =
|
90 |
output_folder: str = OUTPUT_FOLDER,
|
91 |
do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
|
|
|
|
|
92 |
progress: Progress = Progress(track_tqdm=True)
|
93 |
) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
|
94 |
"""
|
95 |
-
Find duplicate cells/text in tabular data files (CSV, XLSX).
|
96 |
|
97 |
Args:
|
98 |
input_files (List[str]): List of file paths to analyze
|
@@ -115,26 +123,49 @@ def find_duplicate_cells_in_tabular_data(
|
|
115 |
|
116 |
progress(0.1, desc="Loading and processing files...")
|
117 |
|
118 |
-
all_data_to_process =
|
119 |
-
full_data_by_file =
|
120 |
-
file_paths =
|
121 |
|
122 |
# Process each file
|
123 |
for file_path in input_files:
|
124 |
try:
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
-
|
136 |
-
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
except Exception as e:
|
140 |
print(f"Error processing {file_path}: {e}")
|
@@ -147,6 +178,8 @@ def find_duplicate_cells_in_tabular_data(
|
|
147 |
|
148 |
# Combine all data
|
149 |
combined_df = pd.concat([data[1] for data in all_data_to_process], ignore_index=True)
|
|
|
|
|
150 |
|
151 |
progress(0.3, desc="Cleaning and preparing text...")
|
152 |
|
@@ -188,9 +221,9 @@ def find_duplicate_cells_in_tabular_data(
|
|
188 |
|
189 |
results_data.append({
|
190 |
'File1': row1_data['file'],
|
191 |
-
'Row1': int(row1_data['
|
192 |
'File2': row2_data['file'],
|
193 |
-
'Row2': int(row2_data['
|
194 |
'Similarity_Score': round(similarity, 3),
|
195 |
'Text1': row1_data['text'][:200] + '...' if len(row1_data['text']) > 200 else row1_data['text'],
|
196 |
'Text2': row2_data['text'][:200] + '...' if len(row2_data['text']) > 200 else row2_data['text'],
|
@@ -204,13 +237,13 @@ def find_duplicate_cells_in_tabular_data(
|
|
204 |
progress(0.9, desc="Saving results...")
|
205 |
|
206 |
# Save results
|
207 |
-
output_paths = save_tabular_duplicate_results(results_df, output_folder, file_paths,
|
208 |
|
209 |
gr.Info(f"Found {len(results_df)} duplicate cell matches")
|
210 |
|
211 |
return results_df, output_paths, full_data_by_file
|
212 |
|
213 |
-
def save_tabular_duplicate_results(results_df: pd.DataFrame, output_folder: str, file_paths: List[str],
|
214 |
"""
|
215 |
Save tabular duplicate detection results to files.
|
216 |
|
@@ -218,52 +251,163 @@ def save_tabular_duplicate_results(results_df: pd.DataFrame, output_folder: str,
|
|
218 |
results_df (pd.DataFrame): Results DataFrame
|
219 |
output_folder (str): Output folder path
|
220 |
file_paths (List[str]): List of file paths
|
221 |
-
|
222 |
-
|
223 |
Returns:
|
224 |
List[str]: List of output file paths
|
225 |
"""
|
226 |
-
output_paths =
|
227 |
output_folder_path = Path(output_folder)
|
228 |
output_folder_path.mkdir(exist_ok=True)
|
229 |
|
230 |
if results_df.empty:
|
231 |
print("No duplicate matches to save.")
|
232 |
-
return
|
233 |
|
234 |
# Save main results
|
235 |
results_file = output_folder_path / 'tabular_duplicate_results.csv'
|
236 |
results_df.to_csv(results_file, index=False, encoding="utf-8-sig")
|
237 |
output_paths.append(str(results_file))
|
238 |
|
|
|
239 |
# Save per-file duplicate lists
|
240 |
-
for file_name, group in results_df.groupby('
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
|
|
|
260 |
|
261 |
return output_paths
|
262 |
|
263 |
def remove_duplicate_rows_from_tabular_data(
|
264 |
file_path: str,
|
265 |
duplicate_rows: List[int],
|
266 |
-
output_folder: str = OUTPUT_FOLDER
|
|
267 |
) -> str:
|
268 |
"""
|
269 |
Remove duplicate rows from a tabular data file.
|
@@ -272,13 +416,14 @@ def remove_duplicate_rows_from_tabular_data(
|
|
272 |
file_path (str): Path to the input file
|
273 |
duplicate_rows (List[int]): List of row indices to remove
|
274 |
output_folder (str): Output folder for cleaned file
|
275 |
-
|
|
|
276 |
Returns:
|
277 |
str: Path to the cleaned file
|
278 |
"""
|
279 |
try:
|
280 |
# Load the file
|
281 |
-
df = read_file(file_path)
|
282 |
|
283 |
# Remove duplicate rows (0-indexed)
|
284 |
df_cleaned = df.drop(index=duplicate_rows).reset_index(drop=True)
|
@@ -286,12 +431,12 @@ def remove_duplicate_rows_from_tabular_data(
|
|
286 |
# Save cleaned file
|
287 |
file_name = os.path.basename(file_path)
|
288 |
file_stem = os.path.splitext(file_name)[0]
|
289 |
-
file_ext = os.path.splitext(file_name)[1]
|
290 |
|
291 |
output_path = os.path.join(output_folder, f"{file_stem}_deduplicated{file_ext}")
|
292 |
|
293 |
if file_ext in ['.xlsx', '.xls']:
|
294 |
-
df_cleaned.to_excel(output_path, index=False)
|
295 |
elif file_ext in ['.parquet']:
|
296 |
df_cleaned.to_parquet(output_path, index=False)
|
297 |
else:
|
@@ -307,9 +452,11 @@ def run_tabular_duplicate_analysis(
|
|
307 |
files: List[str],
|
308 |
threshold: float,
|
309 |
min_words: int,
|
310 |
-
text_columns: List[str] =
|
311 |
output_folder: str = OUTPUT_FOLDER,
|
312 |
do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
|
|
|
|
|
313 |
progress: Progress = Progress(track_tqdm=True)
|
314 |
) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
|
315 |
"""
|
@@ -330,26 +477,37 @@ def run_tabular_duplicate_analysis(
|
|
330 |
input_files=files,
|
331 |
similarity_threshold=threshold,
|
332 |
min_word_count=min_words,
|
333 |
-
text_columns=text_columns,
|
334 |
output_folder=output_folder,
|
335 |
do_initial_clean_dup=do_initial_clean_dup,
|
336 |
-
|
|
|
337 |
)
|
338 |
|
339 |
|
340 |
|
341 |
# Function to update column choices when files are uploaded
|
342 |
-
def update_tabular_column_choices(files):
|
343 |
if not files:
|
344 |
return gr.update(choices=[])
|
345 |
|
346 |
all_columns = set()
|
347 |
for file in files:
|
348 |
try:
|
349 |
-
|
350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
351 |
# Get text columns
|
352 |
text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
|
|
|
353 |
all_columns.update(text_cols)
|
354 |
except Exception as e:
|
355 |
print(f"Error reading {file.name}: {e}")
|
@@ -358,26 +516,59 @@ def update_tabular_column_choices(files):
|
|
358 |
return gr.Dropdown(choices=sorted(list(all_columns)))
|
359 |
|
360 |
# Function to handle tabular duplicate detection
|
361 |
-
def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, output_folder: str = OUTPUT_FOLDER, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN):
|
362 |
if not files:
|
363 |
-
|
|
|
|
|
364 |
|
365 |
-
file_paths = [
|
|
|
|
|
|
366 |
results_df, output_paths, full_data = run_tabular_duplicate_analysis(
|
367 |
files=file_paths,
|
368 |
threshold=threshold,
|
369 |
min_words=min_words,
|
370 |
-
text_columns=text_columns if text_columns else
|
371 |
output_folder=output_folder,
|
372 |
-
do_initial_clean_dup=do_initial_clean_dup
|
|
|
|
|
373 |
)
|
374 |
-
|
375 |
-
print("output_paths:", output_paths)
|
376 |
|
377 |
# Update file choices for cleaning
|
378 |
file_choices = list(set([f for f in file_paths]))
|
|
|
379 |
|
380 |
-
return results_df, output_paths, gr.Dropdown(choices=file_choices)
|
381 |
|
382 |
# Function to handle row selection for preview
|
383 |
def handle_tabular_row_selection(results_df, evt:gr.SelectData):
|
@@ -398,12 +589,12 @@ def handle_tabular_row_selection(results_df, evt:gr.SelectData):
|
|
398 |
return selected_index, row['Text1'], row['Text2']
|
399 |
|
400 |
# Function to clean duplicates from selected file
|
401 |
-
def clean_tabular_duplicates(file_name, results_df, output_folder):
|
402 |
if not file_name or results_df.empty:
|
403 |
return None
|
404 |
|
405 |
# Get duplicate rows for this file
|
406 |
-
file_duplicates = results_df[results_df['
|
407 |
|
408 |
if not file_duplicates:
|
409 |
return None
|
@@ -414,7 +605,9 @@ def clean_tabular_duplicates(file_name, results_df, output_folder):
|
|
414 |
cleaned_file = remove_duplicate_rows_from_tabular_data(
|
415 |
file_path=file_name,
|
416 |
duplicate_rows=file_duplicates,
|
417 |
-
output_folder=output_folder
|
|
|
|
|
418 |
)
|
419 |
return cleaned_file
|
420 |
except Exception as e:
|
|
|
1 |
import pandas as pd
|
2 |
import os
|
3 |
import re
|
4 |
+
import time
|
5 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
6 |
from sklearn.metrics.pairwise import cosine_similarity
|
7 |
from typing import List, Tuple, Dict
|
|
|
11 |
from tools.helper_functions import OUTPUT_FOLDER, read_file
|
12 |
from tools.data_anonymise import initial_clean
|
13 |
from tools.load_spacy_model_custom_recognisers import nlp
|
14 |
+
from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN, REMOVE_DUPLICATE_ROWS
|
15 |
|
16 |
+
if REMOVE_DUPLICATE_ROWS == "True": REMOVE_DUPLICATE_ROWS = True
|
17 |
+
else: REMOVE_DUPLICATE_ROWS = False
|
18 |
|
19 |
def clean_and_stem_text_series(df: pd.DataFrame, column: str, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN):
|
20 |
"""
|
|
|
53 |
Returns:
|
54 |
List[Tuple[str, pd.DataFrame]]: List containing (file_name, processed_df) tuple
|
55 |
"""
|
56 |
+
# if text_columns is None:
|
57 |
+
# # Auto-detect text columns (string type columns)
|
58 |
+
# print(f"No text columns given for {file_name}")
|
59 |
+
# return []
|
60 |
+
# text_columns = df.select_dtypes(include=['object', 'string']).columns.tolist()
|
61 |
+
|
62 |
+
text_columns = [col for col in text_columns if col in df.columns]
|
63 |
|
64 |
if not text_columns:
|
65 |
print(f"No text columns found in {file_name}")
|
66 |
+
return list()
|
67 |
|
68 |
# Create a copy to avoid modifying original
|
69 |
df_copy = df.copy()
|
|
|
75 |
df_copy['row_id'] = df_copy.index
|
76 |
|
77 |
# Create the format expected by the duplicate detection system
|
78 |
+
# Using 'row_number' as row number and 'text' as the combined text
|
79 |
processed_df = pd.DataFrame({
|
80 |
+
'row_number': df_copy['row_id'],
|
81 |
'text': df_copy['combined_text'],
|
82 |
'file': file_name
|
83 |
})
|
|
|
92 |
input_files: List[str],
|
93 |
similarity_threshold: float = 0.95,
|
94 |
min_word_count: int = 3,
|
95 |
+
text_columns: List[str] = [],
|
96 |
output_folder: str = OUTPUT_FOLDER,
|
97 |
do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
|
98 |
+
remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
|
99 |
+
in_excel_tabular_sheets: str = "",
|
100 |
progress: Progress = Progress(track_tqdm=True)
|
101 |
) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
|
102 |
"""
|
103 |
+
Find duplicate cells/text in tabular data files (CSV, XLSX, Parquet).
|
104 |
|
105 |
Args:
|
106 |
input_files (List[str]): List of file paths to analyze
|
|
|
123 |
|
124 |
progress(0.1, desc="Loading and processing files...")
|
125 |
|
126 |
+
all_data_to_process = list()
|
127 |
+
full_data_by_file = dict()
|
128 |
+
file_paths = list()
|
129 |
|
130 |
# Process each file
|
131 |
for file_path in input_files:
|
132 |
try:
|
133 |
+
if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
|
134 |
+
temp_df = pd.DataFrame()
|
135 |
+
|
136 |
+
# Try finding each sheet in the given list until a match is found
|
137 |
+
for sheet_name in in_excel_tabular_sheets:
|
138 |
+
temp_df = read_file(file_path, excel_sheet_name=sheet_name)
|
139 |
+
|
140 |
+
# If sheet was successfully loaded
|
141 |
+
if not temp_df.empty:
|
142 |
+
file_name = os.path.basename(file_path) + "_" + sheet_name
|
143 |
+
file_paths.append(file_path)
|
144 |
+
|
145 |
+
# Convert to analysis format
|
146 |
+
processed_data = convert_tabular_data_to_analysis_format(
|
147 |
+
temp_df, file_name, text_columns
|
148 |
+
)
|
149 |
+
|
150 |
+
if processed_data:
|
151 |
+
all_data_to_process.extend(processed_data)
|
152 |
+
full_data_by_file[file_name] = processed_data[0][1]
|
153 |
+
|
154 |
+
temp_df = pd.DataFrame()
|
155 |
+
else:
|
156 |
+
temp_df = read_file(file_path)
|
157 |
|
158 |
+
file_name = os.path.basename(file_path)
|
159 |
+
file_paths.append(file_path)
|
160 |
+
|
161 |
+
# Convert to analysis format
|
162 |
+
processed_data = convert_tabular_data_to_analysis_format(
|
163 |
+
temp_df, file_name, text_columns
|
164 |
+
)
|
165 |
+
|
166 |
+
if processed_data:
|
167 |
+
all_data_to_process.extend(processed_data)
|
168 |
+
full_data_by_file[file_name] = processed_data[0][1]
|
169 |
|
170 |
except Exception as e:
|
171 |
print(f"Error processing {file_path}: {e}")
|
|
|
178 |
|
179 |
# Combine all data
|
180 |
combined_df = pd.concat([data[1] for data in all_data_to_process], ignore_index=True)
|
181 |
+
|
182 |
+
combined_df = combined_df.drop_duplicates(subset=['row_number', 'file'])
|
183 |
|
184 |
progress(0.3, desc="Cleaning and preparing text...")
|
185 |
|
|
|
221 |
|
222 |
results_data.append({
|
223 |
'File1': row1_data['file'],
|
224 |
+
'Row1': int(row1_data['row_number']),
|
225 |
'File2': row2_data['file'],
|
226 |
+
'Row2': int(row2_data['row_number']),
|
227 |
'Similarity_Score': round(similarity, 3),
|
228 |
'Text1': row1_data['text'][:200] + '...' if len(row1_data['text']) > 200 else row1_data['text'],
|
229 |
'Text2': row2_data['text'][:200] + '...' if len(row2_data['text']) > 200 else row2_data['text'],
|
|
|
237 |
progress(0.9, desc="Saving results...")
|
238 |
|
239 |
# Save results
|
240 |
+
output_paths = save_tabular_duplicate_results(results_df, output_folder, file_paths, remove_duplicate_rows=remove_duplicate_rows, in_excel_tabular_sheets=in_excel_tabular_sheets)
|
241 |
|
242 |
gr.Info(f"Found {len(results_df)} duplicate cell matches")
|
243 |
|
244 |
return results_df, output_paths, full_data_by_file
|
245 |
|
246 |
+
def save_tabular_duplicate_results(results_df: pd.DataFrame, output_folder: str, file_paths: List[str], remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS, in_excel_tabular_sheets: List[str] = []) -> List[str]:
|
247 |
"""
|
248 |
Save tabular duplicate detection results to files.
|
249 |
|
|
|
251 |
results_df (pd.DataFrame): Results DataFrame
|
252 |
output_folder (str): Output folder path
|
253 |
file_paths (List[str]): List of file paths
|
254 |
+
remove_duplicate_rows (bool): Whether to remove duplicate rows
|
255 |
+
in_excel_tabular_sheets (List[str]): Names of the Excel sheets to check for duplicates
|
256 |
Returns:
|
257 |
List[str]: List of output file paths
|
258 |
"""
|
259 |
+
output_paths = list()
|
260 |
output_folder_path = Path(output_folder)
|
261 |
output_folder_path.mkdir(exist_ok=True)
|
262 |
|
263 |
if results_df.empty:
|
264 |
print("No duplicate matches to save.")
|
265 |
+
return list()
|
266 |
|
267 |
# Save main results
|
268 |
results_file = output_folder_path / 'tabular_duplicate_results.csv'
|
269 |
results_df.to_csv(results_file, index=False, encoding="utf-8-sig")
|
270 |
output_paths.append(str(results_file))
|
271 |
|
272 |
+
# Group results by original file to handle Excel files properly
|
273 |
+
excel_files_processed = dict() # Track which Excel files have been processed
|
274 |
+
|
275 |
# Save per-file duplicate lists
|
276 |
+
for file_name, group in results_df.groupby('File2'):
|
277 |
+
# Check for matches with original file names
|
278 |
+
for original_file in file_paths:
|
279 |
+
original_file_name = os.path.basename(original_file)
|
280 |
+
|
281 |
+
if original_file_name in file_name:
|
282 |
+
original_file_extension = os.path.splitext(original_file)[-1]
|
283 |
+
if original_file_extension in ['.xlsx', '.xls']:
|
284 |
+
|
285 |
+
# Split the string using a regex to handle both .xlsx_ and .xls_ delimiters
|
286 |
+
# The regex r'\.xlsx_|\.xls_' correctly matches either ".xlsx_" or ".xls_" as a delimiter.
|
287 |
+
parts = re.split(r'\.xlsx_|\.xls_', os.path.basename(file_name))
|
288 |
+
# The sheet name is the last part after splitting
|
289 |
+
file_sheet_name = parts[-1]
|
290 |
+
|
291 |
+
file_path = original_file
|
292 |
+
|
293 |
+
# Initialize Excel file tracking if not already done
|
294 |
+
if file_path not in excel_files_processed:
|
295 |
+
excel_files_processed[file_path] = {
|
296 |
+
'sheets_data': dict(),
|
297 |
+
'all_sheets': list(),
|
298 |
+
'processed_sheets': set()
|
299 |
+
}
|
300 |
+
|
301 |
+
# Read the original Excel file to get all sheet names
|
302 |
+
if not excel_files_processed[file_path]['all_sheets']:
|
303 |
+
try:
|
304 |
+
excel_file = pd.ExcelFile(file_path)
|
305 |
+
excel_files_processed[file_path]['all_sheets'] = excel_file.sheet_names
|
306 |
+
except Exception as e:
|
307 |
+
print(f"Error reading Excel file {file_path}: {e}")
|
308 |
+
continue
|
309 |
+
|
310 |
+
# Read the current sheet
|
311 |
+
df = read_file(file_path, excel_sheet_name=file_sheet_name)
|
312 |
+
|
313 |
+
# Create duplicate rows file for this sheet
|
314 |
+
file_stem = Path(file_name).stem
|
315 |
+
duplicate_rows_file = output_folder_path / f"{file_stem}_{file_sheet_name}_duplicate_rows.csv"
|
316 |
+
|
317 |
+
# Get unique row numbers to remove
|
318 |
+
rows_to_remove = sorted(group['Row2'].unique())
|
319 |
+
duplicate_df = pd.DataFrame({'Row_to_Remove': rows_to_remove})
|
320 |
+
duplicate_df.to_csv(duplicate_rows_file, index=False)
|
321 |
+
output_paths.append(str(duplicate_rows_file))
|
322 |
+
|
323 |
+
# Process the sheet data
|
324 |
+
df_cleaned = df.copy()
|
325 |
+
df_cleaned["duplicated"] = False
|
326 |
+
df_cleaned.loc[rows_to_remove, "duplicated"] = True
|
327 |
+
if remove_duplicate_rows:
|
328 |
+
df_cleaned = df_cleaned.drop(index=rows_to_remove)
|
329 |
+
|
330 |
+
# Store the processed sheet data
|
331 |
+
excel_files_processed[file_path]['sheets_data'][file_sheet_name] = df_cleaned
|
332 |
+
excel_files_processed[file_path]['processed_sheets'].add(file_sheet_name)
|
333 |
+
|
334 |
+
else:
|
335 |
+
file_sheet_name = ""
|
336 |
+
file_path = original_file
|
337 |
+
print("file_path after match:", file_path)
|
338 |
+
file_base_name = os.path.basename(file_path)
|
339 |
+
df = read_file(file_path)
|
340 |
+
|
341 |
+
file_stem = Path(file_name).stem
|
342 |
+
duplicate_rows_file = output_folder_path / f"{file_stem}_duplicate_rows.csv"
|
343 |
+
|
344 |
+
# Get unique row numbers to remove
|
345 |
+
rows_to_remove = sorted(group['Row2'].unique())
|
346 |
+
duplicate_df = pd.DataFrame({'Row_to_Remove': rows_to_remove})
|
347 |
+
duplicate_df.to_csv(duplicate_rows_file, index=False)
|
348 |
+
output_paths.append(str(duplicate_rows_file))
|
349 |
+
|
350 |
+
df_cleaned = df.copy()
|
351 |
+
df_cleaned["duplicated"] = False
|
352 |
+
df_cleaned.loc[rows_to_remove, "duplicated"] = True
|
353 |
+
if remove_duplicate_rows:
|
354 |
+
df_cleaned = df_cleaned.drop(index=rows_to_remove)
|
355 |
+
|
356 |
+
file_ext = os.path.splitext(file_name)[-1]
|
357 |
+
|
358 |
+
if file_ext in ['.parquet']:
|
359 |
+
output_path = os.path.join(output_folder, f"{file_base_name}_deduplicated.parquet")
|
360 |
+
df_cleaned.to_parquet(output_path, index=False)
|
361 |
+
else:
|
362 |
+
output_path = os.path.join(output_folder, f"{file_base_name}_deduplicated.csv")
|
363 |
+
df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")
|
364 |
+
|
365 |
+
output_paths.append(str(output_path))
|
366 |
+
break
|
367 |
+
|
368 |
+
# Process Excel files to create complete deduplicated files
|
369 |
+
for file_path, file_data in excel_files_processed.items():
|
370 |
+
try:
|
371 |
+
# Create output filename
|
372 |
+
file_base_name = os.path.splitext(os.path.basename(file_path))[0]
|
373 |
+
file_ext = os.path.splitext(file_path)[-1]
|
374 |
+
output_path = os.path.join(output_folder, f"{file_base_name}_deduplicated{file_ext}")
|
375 |
+
|
376 |
+
# Create Excel writer
|
377 |
+
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
|
378 |
+
# Write all sheets
|
379 |
+
for sheet_name in file_data['all_sheets']:
|
380 |
+
if sheet_name in file_data['processed_sheets']:
|
381 |
+
# Use the processed (deduplicated) version
|
382 |
+
file_data['sheets_data'][sheet_name].to_excel(
|
383 |
+
writer,
|
384 |
+
sheet_name=sheet_name,
|
385 |
+
index=False
|
386 |
+
)
|
387 |
+
else:
|
388 |
+
# Use the original sheet (no duplicates found)
|
389 |
+
original_df = read_file(file_path, excel_sheet_name=sheet_name)
|
390 |
+
original_df.to_excel(
|
391 |
+
writer,
|
392 |
+
sheet_name=sheet_name,
|
393 |
+
index=False
|
394 |
+
)
|
395 |
+
|
396 |
+
output_paths.append(str(output_path))
|
397 |
+
print(f"Created deduplicated Excel file: {output_path}")
|
398 |
+
|
399 |
+
except Exception as e:
|
400 |
+
print(f"Error creating deduplicated Excel file for {file_path}: {e}")
|
401 |
+
continue
|
402 |
|
403 |
return output_paths
|
404 |
|
405 |
def remove_duplicate_rows_from_tabular_data(
|
406 |
file_path: str,
|
407 |
duplicate_rows: List[int],
|
408 |
+
output_folder: str = OUTPUT_FOLDER,
|
409 |
+
in_excel_tabular_sheets: List[str] = [],
|
410 |
+
remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS
|
411 |
) -> str:
|
412 |
"""
|
413 |
Remove duplicate rows from a tabular data file.
|
|
|
416 |
file_path (str): Path to the input file
|
417 |
duplicate_rows (List[int]): List of row indices to remove
|
418 |
output_folder (str): Output folder for cleaned file
|
419 |
+
in_excel_tabular_sheets (List[str]): Excel sheet name(s) to read from and write back to
|
420 |
+
remove_duplicate_rows (bool): Whether to remove duplicate rows
|
421 |
Returns:
|
422 |
str: Path to the cleaned file
|
423 |
"""
|
424 |
try:
|
425 |
# Load the file
|
426 |
+
df = read_file(file_path, excel_sheet_name=in_excel_tabular_sheets if in_excel_tabular_sheets else "")
|
427 |
|
428 |
# Remove duplicate rows (0-indexed)
|
429 |
df_cleaned = df.drop(index=duplicate_rows).reset_index(drop=True)
|
|
|
431 |
# Save cleaned file
|
432 |
file_name = os.path.basename(file_path)
|
433 |
file_stem = os.path.splitext(file_name)[0]
|
434 |
+
file_ext = os.path.splitext(file_name)[-1]
|
435 |
|
436 |
output_path = os.path.join(output_folder, f"{file_stem}_deduplicated{file_ext}")
|
437 |
|
438 |
if file_ext in ['.xlsx', '.xls']:
|
439 |
+
df_cleaned.to_excel(output_path, index=False, sheet_name=in_excel_tabular_sheets if in_excel_tabular_sheets else "Sheet1")
|
440 |
elif file_ext in ['.parquet']:
|
441 |
df_cleaned.to_parquet(output_path, index=False)
|
442 |
else:
|
|
|
452 |
files: List[str],
|
453 |
threshold: float,
|
454 |
min_words: int,
|
455 |
+
text_columns: List[str] = [],
|
456 |
output_folder: str = OUTPUT_FOLDER,
|
457 |
do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
|
458 |
+
remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
|
459 |
+
in_excel_tabular_sheets: List[str] = [],
|
460 |
progress: Progress = Progress(track_tqdm=True)
|
461 |
) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
|
462 |
"""
|
|
|
477 |
input_files=files,
|
478 |
similarity_threshold=threshold,
|
479 |
min_word_count=min_words,
|
480 |
+
text_columns=text_columns if text_columns else [],
|
481 |
output_folder=output_folder,
|
482 |
do_initial_clean_dup=do_initial_clean_dup,
|
483 |
+
in_excel_tabular_sheets=in_excel_tabular_sheets if in_excel_tabular_sheets else [],
|
484 |
+
remove_duplicate_rows=remove_duplicate_rows
|
485 |
)
|
486 |
|
487 |
|
488 |
|
489 |
# Function to update column choices when files are uploaded
|
490 |
+
def update_tabular_column_choices(files, in_excel_tabular_sheets: List[str] = []):
|
491 |
if not files:
|
492 |
return gr.update(choices=[])
|
493 |
|
494 |
all_columns = set()
|
495 |
for file in files:
|
496 |
try:
|
497 |
+
file_extension = os.path.splitext(file.name)[-1]
|
498 |
+
if file_extension in ['.xlsx', '.xls']:
|
499 |
+
for sheet_name in in_excel_tabular_sheets:
|
500 |
+
df = read_file(file.name, excel_sheet_name=sheet_name)
|
501 |
+
text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
|
502 |
+
all_columns.update(text_cols)
|
503 |
+
else:
|
504 |
+
df = read_file(file.name)
|
505 |
+
text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
|
506 |
+
all_columns.update(text_cols)
|
507 |
+
|
508 |
# Get text columns
|
509 |
text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
|
510 |
+
|
511 |
all_columns.update(text_cols)
|
512 |
except Exception as e:
|
513 |
print(f"Error reading {file.name}: {e}")
|
|
|
516 |
return gr.Dropdown(choices=sorted(list(all_columns)))
|
517 |
|
518 |
# Function to handle tabular duplicate detection
|
519 |
+
def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, output_folder: str = OUTPUT_FOLDER, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN, in_excel_tabular_sheets: List[str] = [], remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS):
|
520 |
if not files:
|
521 |
+
print("No files uploaded")
|
522 |
+
return pd.DataFrame(), [], gr.Dropdown(choices=[]), 0, "deduplicate"
|
523 |
+
|
524 |
+
start_time = time.time()
|
525 |
+
|
526 |
+
task_textbox = "deduplicate"
|
527 |
+
|
528 |
+
# If output folder doesn't end with a forward slash, add one
|
529 |
+
if not output_folder.endswith('/'): output_folder = output_folder + '/'
|
530 |
|
531 |
+
file_paths = []
|
532 |
+
if isinstance(files, str):
|
533 |
+
# If 'files' is a single string, treat it as a list with one element
|
534 |
+
file_paths.append(files)
|
535 |
+
elif isinstance(files, list):
|
536 |
+
# If 'files' is a list, iterate through its elements
|
537 |
+
for f_item in files:
|
538 |
+
if isinstance(f_item, str):
|
539 |
+
# If an element is a string, it's a direct file path
|
540 |
+
file_paths.append(f_item)
|
541 |
+
elif hasattr(f_item, 'name'):
|
542 |
+
# If an element has a '.name' attribute (e.g., a Gradio File object), use its name
|
543 |
+
file_paths.append(f_item.name)
|
544 |
+
else:
|
545 |
+
# Log a warning for unexpected element types within the list
|
546 |
+
print(f"Warning: Skipping an element in 'files' list that is neither a string nor has a '.name' attribute: {type(f_item)}")
|
547 |
+
elif hasattr(files, 'name'):
|
548 |
+
# Handle the case where a single file object (e.g., gr.File) is passed directly, not in a list
|
549 |
+
file_paths.append(files.name)
|
550 |
+
else:
|
551 |
+
# Raise an error for any other unexpected type of the 'files' argument itself
|
552 |
+
raise TypeError(f"Unexpected type for 'files' argument: {type(files)}. Expected str, list of str/file objects, or a single file object.")
|
553 |
+
|
554 |
results_df, output_paths, full_data = run_tabular_duplicate_analysis(
|
555 |
files=file_paths,
|
556 |
threshold=threshold,
|
557 |
min_words=min_words,
|
558 |
+
text_columns=text_columns if text_columns else [],
|
559 |
output_folder=output_folder,
|
560 |
+
do_initial_clean_dup=do_initial_clean_dup,
|
561 |
+
in_excel_tabular_sheets=in_excel_tabular_sheets if in_excel_tabular_sheets else None,
|
562 |
+
remove_duplicate_rows=remove_duplicate_rows
|
563 |
)
|
|
564 |
|
565 |
# Update file choices for cleaning
|
566 |
file_choices = list(set([f for f in file_paths]))
|
567 |
+
|
568 |
+
end_time = time.time()
|
569 |
+
processing_time = round(end_time - start_time, 2)
|
570 |
|
571 |
+
return results_df, output_paths, gr.Dropdown(choices=file_choices), processing_time, task_textbox
|
572 |
|
573 |
# Function to handle row selection for preview
|
574 |
def handle_tabular_row_selection(results_df, evt:gr.SelectData):
|
|
|
589 |
return selected_index, row['Text1'], row['Text2']
|
590 |
|
591 |
# Function to clean duplicates from selected file
|
592 |
+
def clean_tabular_duplicates(file_name, results_df, output_folder, in_excel_tabular_sheets: str = "", remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS):
|
593 |
if not file_name or results_df.empty:
|
594 |
return None
|
595 |
|
596 |
# Get duplicate rows for this file
|
597 |
+
file_duplicates = results_df[results_df['File2'] == file_name]['Row2'].tolist()
|
598 |
|
599 |
if not file_duplicates:
|
600 |
return None
|
|
|
605 |
cleaned_file = remove_duplicate_rows_from_tabular_data(
|
606 |
file_path=file_name,
|
607 |
duplicate_rows=file_duplicates,
|
608 |
+
output_folder=output_folder,
|
609 |
+
in_excel_tabular_sheets=in_excel_tabular_sheets,
|
610 |
+
remove_duplicate_rows=remove_duplicate_rows
|
611 |
)
|
612 |
return cleaned_file
|
613 |
except Exception as e:
|
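Note on the approach used throughout tools/find_duplicate_tabular.py: each row's selected text columns are combined into a single text field, vectorised with TF-IDF and compared pairwise with cosine similarity against the 0.95 default threshold, using the two scikit-learn imports kept at the top of the file. The snippet below is an illustrative sketch of that core idea only; the function name and sample rows are invented for the example, and the module itself adds minimum word counts, optional cleaning/stemming and per-file row bookkeeping.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_row_pairs(texts, threshold=0.95):
    """Return (row_i, row_j, score) tuples whose cosine similarity meets the threshold."""
    tfidf = TfidfVectorizer().fit_transform(texts)  # one sparse TF-IDF row per input text
    scores = cosine_similarity(tfidf)               # dense pairwise similarity matrix
    pairs = []
    for i in range(len(texts)):
        for j in range(i + 1, len(texts)):          # upper triangle only, skip self-matches
            if scores[i, j] >= threshold:
                pairs.append((i, j, round(float(scores[i, j]), 3)))
    return pairs

print(find_similar_row_pairs([
    "the customer reported the same issue again",
    "the customer reported the same issue again",
    "an unrelated note about a different case",
]))  # expect a single match between rows 0 and 1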
tools/helper_functions.py
CHANGED
@@ -10,7 +10,6 @@ from typing import List
|
|
10 |
from math import ceil
|
11 |
from gradio_image_annotation import image_annotator
|
12 |
from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION, MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES, textract_language_choices, aws_comprehend_language_choices, DEFAULT_LANGUAGE
|
13 |
-
# from tools.load_spacy_model_custom_recognisers import nlp_analyser
|
14 |
|
15 |
def _get_env_list(env_var_name: str) -> List[str]:
|
16 |
"""Parses a comma-separated environment variable into a list of strings."""
|
@@ -147,14 +146,21 @@ def detect_file_type(filename:str):
|
|
147 |
elif filename.endswith('.docx'): return 'docx'
|
148 |
else: raise ValueError("Unsupported file type.")
|
149 |
|
150 |
-
def read_file(filename:str):
|
151 |
"""Read the file based on its detected type."""
|
152 |
file_type = detect_file_type(filename)
|
153 |
|
154 |
if file_type == 'csv':
|
155 |
return pd.read_csv(filename, low_memory=False)
|
156 |
elif file_type == 'xlsx':
|
157 |
-
|
|
|
158 |
elif file_type == 'parquet':
|
159 |
return pd.read_parquet(filename)
|
160 |
|
|
|
10 |
from math import ceil
|
11 |
from gradio_image_annotation import image_annotator
|
12 |
from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION, MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES, textract_language_choices, aws_comprehend_language_choices, DEFAULT_LANGUAGE
|
|
|
13 |
|
14 |
def _get_env_list(env_var_name: str) -> List[str]:
|
15 |
"""Parses a comma-separated environment variable into a list of strings."""
|
|
|
146 |
elif filename.endswith('.docx'): return 'docx'
|
147 |
else: raise ValueError("Unsupported file type.")
|
148 |
|
149 |
+
def read_file(filename:str, excel_sheet_name: str = ""):
|
150 |
"""Read the file based on its detected type."""
|
151 |
file_type = detect_file_type(filename)
|
152 |
|
153 |
if file_type == 'csv':
|
154 |
return pd.read_csv(filename, low_memory=False)
|
155 |
elif file_type == 'xlsx':
|
156 |
+
if excel_sheet_name:
|
157 |
+
try:
|
158 |
+
return pd.read_excel(filename, sheet_name=excel_sheet_name)
|
159 |
+
except Exception as e:
|
160 |
+
print(f"Error reading {filename} with sheet name {excel_sheet_name}: {e}")
|
161 |
+
return pd.DataFrame()
|
162 |
+
else:
|
163 |
+
return pd.read_excel(filename)
|
164 |
elif file_type == 'parquet':
|
165 |
return pd.read_parquet(filename)
|
166 |
|
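A minimal usage sketch of the updated read_file signature follows; the workbook path and sheet name are assumptions for illustration. The only behaviour relied on is what the diff above shows: a named sheet is read with pd.read_excel, and an empty DataFrame is returned if that sheet cannot be read.

from tools.helper_functions import read_file

df = read_file("input/example_workbook.xlsx", excel_sheet_name="Case notes")
if df.empty:
    # read_file returns an empty DataFrame when the named sheet is missing or unreadable
    df = read_file("input/example_workbook.xlsx")  # fall back to the default (first) sheet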
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -11,10 +11,13 @@ import re
|
|
11 |
import os
|
12 |
import requests
|
13 |
import gradio as gr
|
14 |
-
from tools.config import DEFAULT_LANGUAGE, TESSERACT_DATA_FOLDER
|
|
|
15 |
|
16 |
score_threshold = 0.001
|
17 |
-
|
|
|
|
|
18 |
|
19 |
# Create a class inheriting from SpacyNlpEngine
|
20 |
class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
|
|
11 |
import os
|
12 |
import requests
|
13 |
import gradio as gr
|
14 |
+
from tools.config import DEFAULT_LANGUAGE, TESSERACT_DATA_FOLDER, CUSTOM_ENTITIES
|
15 |
+
from tools.helper_functions import _get_env_list
|
16 |
|
17 |
score_threshold = 0.001
|
18 |
+
|
19 |
+
if CUSTOM_ENTITIES: CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
|
20 |
+
custom_entities = CUSTOM_ENTITIES
|
21 |
|
22 |
# Create a class inheriting from SpacyNlpEngine
|
23 |
class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
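For context, _get_env_list (its docstring appears in the helper_functions diff above) turns a comma-separated environment value into a list of strings, which is the shape the new custom_entities assignment expects. The snippet below is only a rough stand-in to show that shape; it is not the repository's implementation and the example value is invented.

def parse_env_list(raw: str) -> list:
    # Rough stand-in: split a comma-separated config string into clean entity labels
    return [item.strip().strip("'\"") for item in raw.strip("[]").split(",") if item.strip()]

print(parse_env_list("TITLES, UKPOSTCODE, STREETNAME"))  # ['TITLES', 'UKPOSTCODE', 'STREETNAME']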
tools/textract_batch_call.py
CHANGED
@@ -1,17 +1,17 @@
|
|
1 |
import boto3
|
2 |
-
import time
|
3 |
import os
|
4 |
import pandas as pd
|
5 |
import json
|
6 |
import logging
|
7 |
import datetime
|
|
|
8 |
import gradio as gr
|
9 |
from gradio import FileData
|
10 |
from typing import List
|
11 |
from io import StringIO
|
12 |
from urllib.parse import urlparse
|
13 |
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
|
14 |
-
from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC,
|
15 |
from tools.aws_functions import download_file_from_s3
|
16 |
from tools.file_conversion import get_input_file_names
|
17 |
from tools.helper_functions import get_file_name_without_type
|
@@ -59,6 +59,7 @@ def analyse_document_with_textract_api(
|
|
59 |
|
60 |
# This is a variable that is written to logs to indicate that a Textract API call was made
|
61 |
is_a_textract_api_call = True
|
|
|
62 |
|
63 |
# Keep only latest pdf path if it's a list
|
64 |
if isinstance(local_pdf_path, list):
|
@@ -67,6 +68,23 @@ def analyse_document_with_textract_api(
|
|
67 |
if not os.path.exists(local_pdf_path):
|
68 |
raise FileNotFoundError(f"Input document not found {local_pdf_path}")
|
69 |
|
|
|
|
|
70 |
if not os.path.exists(local_output_dir):
|
71 |
os.makedirs(local_output_dir)
|
72 |
log_message = f"Created local output directory: {local_output_dir}"
|
@@ -96,18 +114,32 @@ def analyse_document_with_textract_api(
|
|
96 |
#logging.error(log_message)
|
97 |
raise
|
98 |
|
99 |
-
#
|
100 |
if not job_df.empty:
|
|
101 |
if "file_name" in job_df.columns:
|
102 |
matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "file_name"]
|
|
|
|
|
|
|
103 |
|
104 |
-
if len(
|
105 |
-
|
|
|
|
106 |
|
107 |
# --- 2. Start Textract Document Analysis ---
|
108 |
message = "Starting Textract document analysis job..."
|
109 |
print(message)
|
110 |
-
#logging.info("Starting Textract document analysis job...")
|
111 |
|
112 |
try:
|
113 |
if "Extract signatures" in analyse_signatures:
|
@@ -143,19 +175,12 @@ def analyse_document_with_textract_api(
|
|
143 |
'S3Bucket': s3_bucket_name,
|
144 |
'S3Prefix': s3_output_prefix
|
145 |
}
|
146 |
-
# Optional: Add NotificationChannel for SNS topic notifications
|
147 |
-
# NotificationChannel={
|
148 |
-
# 'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
|
149 |
-
# 'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
|
150 |
-
# }
|
151 |
)
|
152 |
job_type="document_text_detection"
|
153 |
|
154 |
job_id = response['JobId']
|
155 |
print(f"Textract job started with JobId: {job_id}")
|
156 |
-
#logging.info(f"Textract job started with JobId: {job_id}")
|
157 |
|
158 |
-
# Write job_id to memory
|
159 |
# Prepare CSV in memory
|
160 |
log_csv_key_location = f"{s3_output_prefix}/textract_document_jobs.csv"
|
161 |
job_location_full = f"s3://{s3_bucket_name}/{s3_output_prefix}/{job_id}/"
|
@@ -166,12 +191,16 @@ def analyse_document_with_textract_api(
|
|
166 |
'file_name': pdf_filename,
|
167 |
'job_type': job_type,
|
168 |
'signature_extraction':analyse_signatures,
|
169 |
-
|
170 |
-
'job_date_time': datetime.datetime.now()
|
171 |
}])
|
172 |
|
173 |
# File path
|
174 |
log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
|
|
|
175 |
|
176 |
# Check if file exists
|
177 |
file_exists = os.path.exists(log_file_path)
|
@@ -198,7 +227,7 @@ def analyse_document_with_textract_api(
|
|
198 |
successful_job_number += 1
|
199 |
total_number_of_textract_page_calls = total_document_page_count
|
200 |
|
201 |
-
return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call, total_number_of_textract_page_calls
|
202 |
|
203 |
def return_job_status(job_id:str,
|
204 |
response:dict,
|
@@ -467,13 +496,19 @@ def poll_whole_document_textract_analysis_progress_and_download(
|
|
467 |
progress(0.5, "Document analysis task outputs found. Downloading from S3")
|
468 |
|
469 |
# If job_df is not empty
|
|
|
470 |
if not job_df.empty:
|
471 |
if "file_name" in job_df.columns:
|
472 |
matching_job_id_file_names = job_df.loc[job_df["job_id"] == job_id, "file_name"]
|
473 |
|
474 |
if pdf_filename and not matching_job_id_file_names.empty:
|
475 |
if pdf_filename == matching_job_id_file_names.iloc[0]:
|
476 |
-
|
|
|
|
|
477 |
|
478 |
if not matching_job_id_file_names.empty:
|
479 |
pdf_filename = matching_job_id_file_names.iloc[0]
|
|
|
1 |
import boto3
|
|
|
2 |
import os
|
3 |
import pandas as pd
|
4 |
import json
|
5 |
import logging
|
6 |
import datetime
|
7 |
+
import pymupdf
|
8 |
import gradio as gr
|
9 |
from gradio import FileData
|
10 |
from typing import List
|
11 |
from io import StringIO
|
12 |
from urllib.parse import urlparse
|
13 |
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
|
14 |
+
from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
|
15 |
from tools.aws_functions import download_file_from_s3
|
16 |
from tools.file_conversion import get_input_file_names
|
17 |
from tools.helper_functions import get_file_name_without_type
|
|
|
59 |
|
60 |
# This is a variable that is written to logs to indicate that a Textract API call was made
|
61 |
is_a_textract_api_call = True
|
62 |
+
task_textbox = "textract"
|
63 |
|
64 |
# Keep only latest pdf path if it's a list
|
65 |
if isinstance(local_pdf_path, list):
|
|
|
68 |
if not os.path.exists(local_pdf_path):
|
69 |
raise FileNotFoundError(f"Input document not found {local_pdf_path}")
|
70 |
|
71 |
+
file_extension = os.path.splitext(local_pdf_path)[1].lower()
|
72 |
+
|
73 |
+
# Load pdf to get page count if not provided
|
74 |
+
if not total_document_page_count and file_extension in ['.pdf']:
|
75 |
+
print("Page count not provided. Loading PDF to get page count")
|
76 |
+
try:
|
77 |
+
pymupdf_doc = pymupdf.open(local_pdf_path)
|
78 |
+
total_document_page_count = pymupdf_doc.page_count
|
79 |
+
pymupdf_doc.close()
|
80 |
+
print("Page count:", total_document_page_count)
|
81 |
+
except Exception as e:
|
82 |
+
print("Failed to load PDF to get page count:", e, "setting page count to 1")
|
83 |
+
total_document_page_count = 1
|
84 |
+
#raise Exception(f"Failed to load PDF to get page count: {e}")
|
85 |
+
else:
|
86 |
+
total_document_page_count = 1
|
87 |
+
|
88 |
if not os.path.exists(local_output_dir):
|
89 |
os.makedirs(local_output_dir)
|
90 |
log_message = f"Created local output directory: {local_output_dir}"
|
|
|
114 |
#logging.error(log_message)
|
115 |
raise
|
116 |
|
117 |
+
# Filter job_df to include rows only where the analysis date is after the current date - DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
|
118 |
if not job_df.empty:
|
119 |
+
job_df = job_df.loc[job_df["job_date_time"] > (datetime.datetime.now() - datetime.timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)),:]
|
120 |
+
|
121 |
+
# If job_df is not empty
|
122 |
+
if not job_df.empty:
|
123 |
+
|
124 |
if "file_name" in job_df.columns:
|
125 |
matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "file_name"]
|
126 |
+
matching_job_id_file_names_dates = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "job_date_time"]
|
127 |
+
matching_job_id = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "job_id"]
|
128 |
+
matching_handwrite_signature = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "signature_extraction"]
|
129 |
|
130 |
+
if len(matching_job_id) > 0:
|
131 |
+
pass
|
132 |
+
else:
|
133 |
+
matching_job_id = "unknown_job_id"
|
134 |
+
|
135 |
+
if len(matching_job_id_file_names) > 0 and len(matching_handwrite_signature) > 0:
|
136 |
+
out_message = f"Existing Textract outputs found for file {pdf_filename} from date {matching_job_id_file_names_dates.iloc[0]}. No need to re-analyse. Please download existing results from the list with job ID {matching_job_id.iloc[0]}"
|
137 |
+
gr.Warning(out_message)
|
138 |
+
raise Exception(out_message)
|
139 |
|
140 |
# --- 2. Start Textract Document Analysis ---
|
141 |
message = "Starting Textract document analysis job..."
|
142 |
print(message)
|
|
|
143 |
|
144 |
try:
|
145 |
if "Extract signatures" in analyse_signatures:
|
|
|
175 |
'S3Bucket': s3_bucket_name,
|
176 |
'S3Prefix': s3_output_prefix
|
177 |
}
|
|
|
|
178 |
)
|
179 |
job_type="document_text_detection"
|
180 |
|
181 |
job_id = response['JobId']
|
182 |
print(f"Textract job started with JobId: {job_id}")
|
|
|
183 |
|
|
|
184 |
# Prepare CSV in memory
|
185 |
log_csv_key_location = f"{s3_output_prefix}/textract_document_jobs.csv"
|
186 |
job_location_full = f"s3://{s3_bucket_name}/{s3_output_prefix}/{job_id}/"
|
|
|
191 |
'file_name': pdf_filename,
|
192 |
'job_type': job_type,
|
193 |
'signature_extraction':analyse_signatures,
|
194 |
+
'job_date_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
|
|
195 |
}])
|
196 |
|
197 |
# File path
|
198 |
log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
|
199 |
+
log_file_path_job_id = os.path.join(local_output_dir, pdf_filename + "_textract_document_jobs_job_id.txt")
|
200 |
+
|
201 |
+
# Write latest job ID to local text file
|
202 |
+
with open(log_file_path_job_id, 'w') as f:
|
203 |
+
f.write(job_id)
|
204 |
|
205 |
# Check if file exists
|
206 |
file_exists = os.path.exists(log_file_path)
|
|
|
227 |
successful_job_number += 1
|
228 |
total_number_of_textract_page_calls = total_document_page_count
|
229 |
|
230 |
+
return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call, total_number_of_textract_page_calls, task_textbox
|
231 |
|
232 |
def return_job_status(job_id:str,
|
233 |
response:dict,
|
|
|
496 |
progress(0.5, "Document analysis task outputs found. Downloading from S3")
|
497 |
|
498 |
# If job_df is not empty
|
499 |
+
|
500 |
+
# if not job_df.empty:
|
501 |
+
# job_df = job_df.loc[job_df["job_date_time"] > (datetime.datetime.now() - datetime.timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)),:]
|
502 |
+
|
503 |
if not job_df.empty:
|
504 |
if "file_name" in job_df.columns:
|
505 |
matching_job_id_file_names = job_df.loc[job_df["job_id"] == job_id, "file_name"]
|
506 |
|
507 |
if pdf_filename and not matching_job_id_file_names.empty:
|
508 |
if pdf_filename == matching_job_id_file_names.iloc[0]:
|
509 |
+
out_message = f"Existing Textract outputs found for file {pdf_filename}. No need to re-download."
|
510 |
+
gr.Warning(out_message)
|
511 |
+
raise Exception(out_message)
|
512 |
|
513 |
if not matching_job_id_file_names.empty:
|
514 |
pdf_filename = matching_job_id_file_names.iloc[0]
|
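The page-count fallback added near the top of analyse_document_with_textract_api can be exercised on its own as below. pymupdf.open and page_count are used exactly as in the diff; the wrapper function name and the sample path are assumptions for illustration.

import pymupdf

def count_pdf_pages(pdf_path: str) -> int:
    try:
        doc = pymupdf.open(pdf_path)
        page_count = doc.page_count
        doc.close()
        return page_count
    except Exception as e:
        print("Failed to load PDF to get page count:", e, "- defaulting to 1")
        return 1

print(count_pdf_pages("input/example_document.pdf"))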