seanpedrickcase committed on
Commit d60759d · 1 Parent(s): 64ab318

Added example data files. Greatly revised the CLI for redaction, deduplication, and AWS Textract batch calls. Various minor fixes and package updates.
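As a rough illustration of the direct-mode wiring this commit revises, the sketch below assembles a settings dictionary and hands it to the CLI's main() function, mirroring a few of the direct_mode_args keys visible in the app.py diff further down. The import path, key names, and values here are assumptions for illustration only; the actual interface in cli_redact.py may differ.

# Illustrative sketch only - mirrors some of the direct_mode_args keys seen in app.py below.
# The import location of main() and the exact accepted keys/values are assumptions.
from cli_redact import main  # assumed entry point

direct_mode_args = {
    'task': 'redact',                     # DIRECT_MODE_TASK; other tasks (e.g. deduplication) exist
    'input_file': 'example_data/example_complaint_letter.jpg',  # placeholder input path
    'output_dir': 'output/',
    'language': 'en',
    'pii_detector': 'Local',              # assumed value of LOCAL_PII_OPTION
    'similarity_threshold': 0.95,         # used by the duplicate-detection task
}

main(direct_mode_args=direct_mode_args)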

Files changed (42)
  1. .dockerignore +0 -4
  2. .gitattributes +7 -0
  3. .gitignore +2 -5
  4. app.py +153 -62
  5. cli_redact.py +655 -118
  6. example_data/Bold minimalist professional cover letter.docx +3 -0
  7. example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv +0 -0
  8. example_data/Partnership-Agreement-Toolkit_0_0.pdf +3 -0
  9. example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv +2 -0
  10. example_data/combined_case_notes.csv +19 -0
  11. example_data/doubled_output_joined.pdf +3 -0
  12. example_data/example_complaint_letter.jpg +3 -0
  13. example_data/example_of_emails_sent_to_a_professor_before_applying.pdf +3 -0
  14. example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv +277 -0
  15. example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv +77 -0
  16. example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_results_with_words_textract.csv +0 -0
  17. example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv +0 -0
  18. example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv +40 -0
  19. example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_results_with_words_textract.csv +432 -0
  20. example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv +15 -0
  21. example_data/graduate-job-example-cover-letter.pdf +3 -0
  22. example_data/partnership_toolkit_redact_custom_deny_list.csv +4 -0
  23. example_data/partnership_toolkit_redact_some_pages.csv +2 -0
  24. example_data/test_allow_list_graduate.csv +1 -0
  25. example_data/test_allow_list_partnership.csv +1 -0
  26. lambda_entrypoint.py +61 -19
  27. pyproject.toml +2 -2
  28. requirements.txt +2 -2
  29. tools/aws_functions.py +2 -2
  30. tools/aws_textract.py +2 -0
  31. tools/cli_usage_logger.py +302 -0
  32. tools/config.py +18 -8
  33. tools/custom_csvlogger.py +2 -4
  34. tools/custom_image_analyser_engine.py +10 -4
  35. tools/data_anonymise.py +67 -47
  36. tools/example_cli_calls.txt +0 -30
  37. tools/file_redaction.py +118 -43
  38. tools/find_duplicate_pages.py +13 -5
  39. tools/find_duplicate_tabular.py +268 -75
  40. tools/helper_functions.py +9 -3
  41. tools/load_spacy_model_custom_recognisers.py +5 -2
  42. tools/textract_batch_call.py +52 -17
.dockerignore CHANGED
@@ -1,8 +1,4 @@
- *.csv
- *.pdf
*.url
- *.jpg
- *.png
*.ipynb
*.pyc
examples/*
.gitattributes ADDED
@@ -0,0 +1,7 @@
+ *.pdf filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.xls filter=lfs diff=lfs merge=lfs -text
+ *.xlsx filter=lfs diff=lfs merge=lfs -text
+ *.docx filter=lfs diff=lfs merge=lfs -text
+ *.doc filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,8 +1,4 @@
- *.csv
- *.pdf
*.url
- *.jpg
- *.png
*.ipynb
*.pyc
examples/*
@@ -17,8 +13,9 @@ build/*
dist/*
build_deps/*
logs/*
+ usage/*
+ feedback/*
config/*
- doc_redaction_amplify_app/*
user_guide/*
cdk/config/*
cdk/cdk.out/*
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
- from tools.config import DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_LANGUAGE_FULL_NAME, SHOW_LANGUAGE_SELECTION, DO_INITIAL_TABULAR_DATA_CLEAN, DEFAULT_DUPLICATE_DETECTION_THRESHOLD, DIRECT_MODE_TASK, DIRECT_MODE_INPUT_FILE, DIRECT_MODE_OUTPUT_DIR, DIRECT_MODE_DUPLICATE_TYPE, LOCAL_PII_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, DOCUMENT_REDACTION_BUCKET, IMAGES_DPI, PREPROCESS_LOCAL_OCR_IMAGES, COMPRESS_REDACTED_PDF, RETURN_PDF_END_OF_REDACTION, OUTPUT_DENY_LIST_PATH, OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH, DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, DEFAULT_PAGE_MIN, DEFAULT_PAGE_MAX, DEFAULT_EXCEL_SHEETS, USE_GREEDY_DUPLICATE_DETECTION, DEFAULT_MIN_CONSECUTIVE_PAGES, DEFAULT_COMBINE_PAGES, DEFAULT_MIN_WORD_COUNT, DEFAULT_TEXT_COLUMNS, DEFAULT_SEARCH_QUERY
6
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
@@ -14,6 +14,7 @@ from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
  from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates, run_full_search_and_analysis
16
  from tools.find_duplicate_tabular import update_tabular_column_choices, run_tabular_duplicate_detection, handle_tabular_row_selection, clean_tabular_duplicates
 
17
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
18
 
19
  # Suppress downcasting warnings
@@ -48,6 +49,8 @@ if USE_GREEDY_DUPLICATE_DETECTION == "True": USE_GREEDY_DUPLICATE_DETECTION = Tr
else: USE_GREEDY_DUPLICATE_DETECTION = False
if DEFAULT_COMBINE_PAGES == "True": DEFAULT_COMBINE_PAGES = True
else: DEFAULT_COMBINE_PAGES = False

if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)
@@ -73,6 +76,58 @@ FULL_COMPREHEND_ENTITY_LIST.extend(custom_entities)
73
 
74
  FILE_INPUT_HEIGHT = int(FILE_INPUT_HEIGHT)
75
 
76
  # Create the gradio interface
77
  app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True) #gr.themes.Base()
78
 
@@ -185,7 +240,7 @@ with app:
185
  # S3 settings for default allow list load
186
  s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=DOCUMENT_REDACTION_BUCKET, visible=False)
187
  s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
188
- default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
189
 
190
  s3_whole_document_textract_default_bucket = gr.Textbox(label = "Default Textract whole_document S3 bucket", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, visible=False)
191
  s3_whole_document_textract_input_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
@@ -255,6 +310,7 @@ with app:
255
  textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
256
  selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
257
  is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
 
258
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
259
  job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
260
 
@@ -336,13 +392,13 @@ with app:
336
  textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
337
  with gr.Column():
338
  job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
339
- convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results", variant="secondary", visible=True)
340
 
341
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
342
  document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
343
 
344
- with gr.Row(equal_height=True):
345
- redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
346
  output_file = gr.File(label="Output files", scale = 2)#, height=FILE_INPUT_HEIGHT)
347
  latest_file_completed_num = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
348
 
@@ -558,13 +614,13 @@ with app:
558
 
559
  with gr.Accordion("Anonymisation output format - by default will replace PII with a blank space", open = False):
560
  with gr.Row():
561
- anon_strat = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask"], label="Select an anonymisation method.", value = DEFAULT_TABULAR_ANONYMISATION_STRATEGY) # , "encrypt", "fake_first_name" are also available, but are not currently included as not that useful in current form
562
  do_initial_clean = gr.Checkbox(label="Do initial clean of text (remove URLs, HTML tags, and non-ASCII characters)", value=DO_INITIAL_TABULAR_DATA_CLEAN)
563
 
564
  tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")
565
 
566
- with gr.Row(equal_height=True):
567
- text_output_summary = gr.Textbox(label="Output result")
568
  text_output_file = gr.File(label="Output files")
569
  text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False, visible=False)
570
 
@@ -574,21 +630,21 @@ with app:
574
  # TABULAR DUPLICATE DETECTION TAB
575
  ###
576
  with gr.Accordion(label="Find duplicate cells in tabular data", open=False):
577
- gr.Markdown("""Find duplicate cells or rows in CSV and Excel files. This tool analyzes text content across all columns to identify similar or identical entries that may be duplicates. You can review the results and choose to remove duplicate rows from your files.""")
578
 
579
  with gr.Accordion("Step 1: Upload files and configure analysis", open=True):
580
  in_tabular_duplicate_files = gr.File(
581
- label="Upload CSV or Excel files to find duplicate cells/rows",
582
  file_count="multiple",
583
  file_types=['.csv', '.xlsx', '.xls', '.parquet'],
584
  height=FILE_INPUT_HEIGHT
585
  )
586
 
587
- with gr.Row():
588
  tabular_duplicate_threshold = gr.Number(
589
  value=DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
590
  label="Similarity threshold",
591
- info="Score (0-1) to consider cells a match. Higher values = more strict matching."
592
  )
593
  tabular_min_word_count = gr.Number(
594
  value=DEFAULT_MIN_WORD_COUNT,
@@ -596,13 +652,17 @@ with app:
596
  info="Cells with fewer words than this are ignored."
597
  )
598
  do_initial_clean_dup = gr.Checkbox(label="Do initial clean of text (remove URLs, HTML tags, and non-ASCII characters)", value=DO_INITIAL_TABULAR_DATA_CLEAN)
 
599
 
600
- tabular_text_columns = gr.Dropdown(
601
- choices=DEFAULT_TEXT_COLUMNS,
602
- multiselect=True,
603
- label="Select specific columns to analyse (leave empty to analyse all text columns)",
604
- info="If no columns selected, all text columns will be analyzed"
605
- )
606
 
607
  find_tabular_duplicates_btn = gr.Button(
608
  value="Find duplicate cells/rows",
@@ -621,7 +681,7 @@ with app:
621
  show_copy_button=True
622
  )
623
 
624
- with gr.Row():
625
  tabular_selected_row_index = gr.Number(value=None, visible=False)
626
  tabular_text1_preview = gr.Textbox(
627
  label="Text from File 1",
@@ -775,11 +835,11 @@ with app:
775
  document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
776
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
777
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
778
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], api_name="redact_doc", show_progress_on=[redaction_output_summary_textbox])
779
 
780
  # If a file has been completed, the function will continue onto the next document
781
  latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
782
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], show_progress_on=[redaction_output_summary_textbox]).\
783
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state], show_progress_on=[annotator]).\
784
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
785
  success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
@@ -791,7 +851,7 @@ with app:
791
  all_page_line_level_ocr_results_with_words_df_base.change(reset_ocr_with_words_base_dataframe, inputs=[all_page_line_level_ocr_results_with_words_df_base, page_entity_dropdown_redaction], outputs=[all_page_line_level_ocr_results_with_words_df, backup_all_page_line_level_ocr_results_with_words_df_base])
792
 
793
  # Send whole document to Textract for text extraction
794
- send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number, total_pdf_page_count], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call, textract_query_number]).\
795
  success(check_for_provided_job_id, inputs=[job_id_textbox]).\
796
  success(poll_whole_document_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_whole_document_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df, doc_file_name_no_extension_textbox]).\
797
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
@@ -809,7 +869,7 @@ with app:
809
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
810
  success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
811
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
812
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], show_progress_on=[redaction_output_summary_textbox]).\
813
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state], show_progress_on=[annotator])
814
 
815
  ###
@@ -1021,10 +1081,10 @@ with app:
1021
  success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list, total_pdf_page_count])
1022
 
1023
  tabular_data_redact_btn.click(reset_data_vars, outputs=[actual_time_taken_number, log_files_output_list_state, comprehend_query_number]).\
1024
- success(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number, do_initial_clean, chosen_language_drop], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number], api_name="redact_data")
1025
 
1026
  # If the output file count text box changes, keep going with redacting each data file until done
1027
- text_tabular_files_done.change(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number, do_initial_clean, chosen_language_drop], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number]).\
1028
  success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
1029
 
1030
  ###
@@ -1039,12 +1099,15 @@ with app:
1039
  min_word_count_input,
1040
  min_consecutive_pages_input,
1041
  greedy_match_input,
1042
- combine_page_text_for_duplicates_bool
 
1043
  ],
1044
  outputs=[
1045
  results_df_preview,
1046
  duplicate_files_out,
1047
- full_duplicate_data_by_file
 
 
1048
  ]
1049
  )
1050
 
@@ -1076,15 +1139,13 @@ with app:
1076
 
1077
  # Event handlers
1078
  in_tabular_duplicate_files.upload(
1079
- fn=update_tabular_column_choices,
1080
- inputs=[in_tabular_duplicate_files],
1081
- outputs=[tabular_text_columns]
1082
- )
1083
 
1084
  find_tabular_duplicates_btn.click(
1085
  fn=run_tabular_duplicate_detection,
1086
- inputs=[in_tabular_duplicate_files, tabular_duplicate_threshold, tabular_min_word_count, tabular_text_columns, output_folder_textbox, do_initial_clean_dup],
1087
- outputs=[tabular_results_df, tabular_cleaned_file, tabular_file_to_clean], api_name="tabular_clean_duplicates", show_progress_on=[tabular_results_df]
1088
  )
1089
 
1090
  tabular_results_df.select(
@@ -1095,7 +1156,7 @@ with app:
1095
 
1096
  clean_duplicates_btn.click(
1097
  fn=clean_tabular_duplicates,
1098
- inputs=[tabular_file_to_clean, tabular_results_df, output_folder_textbox],
1099
  outputs=[tabular_cleaned_file]
1100
  )
1101
 
@@ -1182,15 +1243,15 @@ with app:
1182
  pdf_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
1183
  data_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
1184
 
1185
- if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
1186
  # User submitted feedback for pdf redactions
1187
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
1188
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
1189
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
1190
 
1191
  # User submitted feedback for data redactions
1192
- data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
1193
- data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
1194
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
1195
  else:
1196
  # User submitted feedback for pdf redactions
@@ -1199,7 +1260,7 @@ with app:
1199
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
1200
 
1201
  # User submitted feedback for data redactions
1202
- data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
1203
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
1204
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
1205
 
@@ -1207,27 +1268,41 @@ with app:
1207
  # Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
1208
  usage_callback = CSVLogger_custom(dataset_file_name=USAGE_LOG_FILE_NAME)
1209
 
1210
- if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
1211
- usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
1212
 
1213
- latest_file_completed_num.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False, api_name="usage_logs").\
1214
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1215
 
1216
- text_tabular_files_done.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
1217
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1218
 
1219
- successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
1220
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1221
  else:
1222
- usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
1223
 
1224
- latest_file_completed_num.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
1225
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1226
 
1227
- text_tabular_files_done.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, placeholder_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
1228
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1229
 
1230
- successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
1231
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1232
 
1233
  if __name__ == "__main__":
@@ -1252,46 +1327,62 @@ if __name__ == "__main__":
'task': DIRECT_MODE_TASK,
'input_file': DIRECT_MODE_INPUT_FILE,
'output_dir': DIRECT_MODE_OUTPUT_DIR,
'language': DEFAULT_LANGUAGE,
'allow_list': ALLOW_LIST_PATH,
'pii_detector': LOCAL_PII_OPTION,
'aws_access_key': AWS_ACCESS_KEY,
'aws_secret_key': AWS_SECRET_KEY,
'aws_region': AWS_REGION,
's3_bucket': DOCUMENT_REDACTION_BUCKET,
'do_initial_clean': DO_INITIAL_TABULAR_DATA_CLEAN,
'save_logs_to_csv': SAVE_LOGS_TO_CSV,
'display_file_names_in_logs': DISPLAY_FILE_NAMES_IN_LOGS,
'ocr_method': TESSERACT_TEXT_EXTRACT_OPTION,
'page_min': DEFAULT_PAGE_MIN,
'page_max': DEFAULT_PAGE_MAX,
- 'prepare_for_review': False,
- 'prepare_images': True,
- 'no_images': False,
'images_dpi': IMAGES_DPI,
- 'max_image_pixels': None,
- 'load_truncated_images': True,
'chosen_local_ocr_model': CHOSEN_LOCAL_OCR_MODEL,
'preprocess_local_ocr_images': PREPROCESS_LOCAL_OCR_IMAGES,
'compress_redacted_pdf': COMPRESS_REDACTED_PDF,
'return_pdf_end_of_redaction': RETURN_PDF_END_OF_REDACTION,
- 'in_allow_list': OUTPUT_ALLOW_LIST_PATH,
- 'in_deny_list': OUTPUT_DENY_LIST_PATH,
- 'redact_whole_page_list': OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH,
- 'handwrite_signature_checkbox': DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
- 'anon_strat': DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
- 'columns': DEFAULT_TEXT_COLUMNS,
'excel_sheets': DEFAULT_EXCEL_SHEETS,
- 'deny_list': OUTPUT_DENY_LIST_PATH,
'fuzzy_mistakes': DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
'duplicate_type': DIRECT_MODE_DUPLICATE_TYPE,
'similarity_threshold': DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
'min_word_count': DEFAULT_MIN_WORD_COUNT,
'min_consecutive_pages': DEFAULT_MIN_CONSECUTIVE_PAGES,
'greedy_match': USE_GREEDY_DUPLICATE_DETECTION,
'combine_pages': DEFAULT_COMBINE_PAGES,
- 'search_query': DEFAULT_SEARCH_QUERY if DEFAULT_SEARCH_QUERY else None,
- 'text_columns': DEFAULT_TEXT_COLUMNS.split(',') if DEFAULT_TEXT_COLUMNS else []
}

print(f"Running in direct mode with task: {DIRECT_MODE_TASK}")
@@ -1306,6 +1397,6 @@ if __name__ == "__main__":
print(f"Search query: {DEFAULT_SEARCH_QUERY}")
if DEFAULT_TEXT_COLUMNS:
    print(f"Text columns: {DEFAULT_TEXT_COLUMNS}")
-
# Run the CLI main function with direct mode arguments
main(direct_mode_args=direct_mode_args)
 
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
+ from tools.config import DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_LANGUAGE_FULL_NAME, SHOW_LANGUAGE_SELECTION, DO_INITIAL_TABULAR_DATA_CLEAN, DEFAULT_DUPLICATE_DETECTION_THRESHOLD, DIRECT_MODE_TASK, DIRECT_MODE_INPUT_FILE, DIRECT_MODE_OUTPUT_DIR, DIRECT_MODE_DUPLICATE_TYPE, DIRECT_MODE_DEFAULT_USER, LOCAL_PII_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, DOCUMENT_REDACTION_BUCKET, IMAGES_DPI, PREPROCESS_LOCAL_OCR_IMAGES, COMPRESS_REDACTED_PDF, RETURN_PDF_END_OF_REDACTION, DENY_LIST_PATH, WHOLE_PAGE_REDACTION_LIST_PATH, DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, DEFAULT_PAGE_MIN, DEFAULT_PAGE_MAX, DEFAULT_EXCEL_SHEETS, USE_GREEDY_DUPLICATE_DETECTION, DEFAULT_MIN_CONSECUTIVE_PAGES, DEFAULT_COMBINE_PAGES, DEFAULT_MIN_WORD_COUNT, DEFAULT_TEXT_COLUMNS, DEFAULT_SEARCH_QUERY, REMOVE_DUPLICATE_ROWS
6
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
 
14
  from tools.custom_csvlogger import CSVLogger_custom
15
  from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates, run_full_search_and_analysis
16
  from tools.find_duplicate_tabular import update_tabular_column_choices, run_tabular_duplicate_detection, handle_tabular_row_selection, clean_tabular_duplicates
17
+ import time
18
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
19
 
20
  # Suppress downcasting warnings
 
else: USE_GREEDY_DUPLICATE_DETECTION = False
if DEFAULT_COMBINE_PAGES == "True": DEFAULT_COMBINE_PAGES = True
else: DEFAULT_COMBINE_PAGES = False
+ if REMOVE_DUPLICATE_ROWS == "True": REMOVE_DUPLICATE_ROWS = True
+ else: REMOVE_DUPLICATE_ROWS = False

if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)
 
FILE_INPUT_HEIGHT = int(FILE_INPUT_HEIGHT)

+ # Wrapper functions to add timing to deduplication functions
+ def run_duplicate_analysis_with_timing(files, threshold, min_words, min_consecutive, greedy_match, combine_pages, output_folder):
+     """
+     Wrapper for run_duplicate_analysis that adds timing and returns time taken.
+     """
+     start_time = time.time()
+     results_df, output_paths, full_data_by_file = run_duplicate_analysis(
+         files=files,
+         threshold=threshold,
+         min_words=min_words,
+         min_consecutive=min_consecutive,
+         greedy_match=greedy_match,
+         combine_pages=combine_pages,
+         output_folder=output_folder
+     )
+     end_time = time.time()
+     processing_time = end_time - start_time
+ 
+     # Store the time taken in a global variable for logging
+     global duplicate_analysis_time_taken
+     duplicate_analysis_time_taken = processing_time
+ 
+     return results_df, output_paths, full_data_by_file
+ 
+ def run_tabular_duplicate_detection_with_timing(files, threshold, min_words, text_columns, output_folder, do_initial_clean_dup, in_excel_tabular_sheets, remove_duplicate_rows):
+     """
+     Wrapper for run_tabular_duplicate_detection that adds timing and returns time taken.
+     """
+     start_time = time.time()
+     results_df, output_paths, file_choices = run_tabular_duplicate_detection(
+         files=files,
+         threshold=threshold,
+         min_words=min_words,
+         text_columns=text_columns,
+         output_folder=output_folder,
+         do_initial_clean_dup=do_initial_clean_dup,
+         in_excel_tabular_sheets=in_excel_tabular_sheets,
+         remove_duplicate_rows=remove_duplicate_rows
+     )
+     end_time = time.time()
+     processing_time = end_time - start_time
+ 
+     # Store the time taken in a global variable for logging
+     global tabular_duplicate_analysis_time_taken
+     tabular_duplicate_analysis_time_taken = processing_time
+ 
+     return results_df, output_paths, file_choices
+ 
+ # Initialize global variables for timing
+ duplicate_analysis_time_taken = 0.0
+ tabular_duplicate_analysis_time_taken = 0.0
+ 
# Create the gradio interface
app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True) #gr.themes.Base()
 
 
240
  # S3 settings for default allow list load
241
  s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=DOCUMENT_REDACTION_BUCKET, visible=False)
242
  s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
243
+ default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=ALLOW_LIST_PATH, visible=False)
244
 
245
  s3_whole_document_textract_default_bucket = gr.Textbox(label = "Default Textract whole_document S3 bucket", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, visible=False)
246
  s3_whole_document_textract_input_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
 
310
  textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
311
  selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
312
  is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
313
+ task_textbox = gr.Textbox(value="redact", label="task", visible=False) # Track the task being performed
314
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
315
  job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
316
 
 
392
  textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
393
  with gr.Column():
394
  job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
395
+ convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results", variant="secondary", visible=True)
396
 
397
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
398
  document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
399
 
400
+ with gr.Row():
401
+ redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1, lines=4)
402
  output_file = gr.File(label="Output files", scale = 2)#, height=FILE_INPUT_HEIGHT)
403
  latest_file_completed_num = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
404
 
 
614
 
615
  with gr.Accordion("Anonymisation output format - by default will replace PII with a blank space", open = False):
616
  with gr.Row():
617
+ anon_strategy = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask"], label="Select an anonymisation method.", value = DEFAULT_TABULAR_ANONYMISATION_STRATEGY) # , "encrypt", "fake_first_name" are also available, but are not currently included as not that useful in current form
618
  do_initial_clean = gr.Checkbox(label="Do initial clean of text (remove URLs, HTML tags, and non-ASCII characters)", value=DO_INITIAL_TABULAR_DATA_CLEAN)
619
 
620
  tabular_data_redact_btn = gr.Button("Redact text/data files", variant="primary")
621
 
622
+ with gr.Row():
623
+ text_output_summary = gr.Textbox(label="Output result", lines=4)
624
  text_output_file = gr.File(label="Output files")
625
  text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False, visible=False)
626
 
 
# TABULAR DUPLICATE DETECTION TAB
###
with gr.Accordion(label="Find duplicate cells in tabular data", open=False):
+ gr.Markdown("""Find duplicate cells or rows in CSV, Excel, or Parquet files. This tool analyzes text content across all columns to identify similar or identical entries that may be duplicates. You can review the results and choose to remove duplicate rows from your files.""")

with gr.Accordion("Step 1: Upload files and configure analysis", open=True):
in_tabular_duplicate_files = gr.File(
+     label="Upload CSV, Excel, or Parquet files to find duplicate cells/rows. Note that the app will remove duplicates from later cells/files that are found in earlier cells/files and not vice versa.",
    file_count="multiple",
    file_types=['.csv', '.xlsx', '.xls', '.parquet'],
    height=FILE_INPUT_HEIGHT
)

+ with gr.Row(equal_height=True):
tabular_duplicate_threshold = gr.Number(
    value=DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
    label="Similarity threshold",
+     info="Score (0-1) to consider cells a match. 1 = perfect match."
)
tabular_min_word_count = gr.Number(
    value=DEFAULT_MIN_WORD_COUNT,

    info="Cells with fewer words than this are ignored."
)
do_initial_clean_dup = gr.Checkbox(label="Do initial clean of text (remove URLs, HTML tags, and non-ASCII characters)", value=DO_INITIAL_TABULAR_DATA_CLEAN)
+ remove_duplicate_rows = gr.Checkbox(label="Remove duplicate rows from deduplicated files", value=REMOVE_DUPLICATE_ROWS)

+ with gr.Row():
+ in_excel_tabular_sheets = gr.Dropdown(choices=list(), multiselect = True, label="Select Excel sheet names that you want to deduplicate (showing sheets present across all Excel files).", visible=True, allow_custom_value=True)
+ 
+ tabular_text_columns = gr.Dropdown(
+     choices=DEFAULT_TEXT_COLUMNS,
+     multiselect=True,
+     label="Select specific columns to analyse (leave empty to analyse all text columns simultaneously - i.e. all text is joined together)",
+     info="If no columns selected, all text columns will be combined together and analysed"
+ )

find_tabular_duplicates_btn = gr.Button(
    value="Find duplicate cells/rows",

    show_copy_button=True
)

+ with gr.Row(equal_height=True):
tabular_selected_row_index = gr.Number(value=None, visible=False)
tabular_text1_preview = gr.Textbox(
    label="Text from File 1",
 
835
  document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
836
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
837
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
838
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state, task_textbox], api_name="redact_doc", show_progress_on=[redaction_output_summary_textbox])
839
 
840
  # If a file has been completed, the function will continue onto the next document
841
  latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
842
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state, task_textbox], show_progress_on=[redaction_output_summary_textbox]).\
843
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state], show_progress_on=[annotator]).\
844
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
845
  success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
 
851
  all_page_line_level_ocr_results_with_words_df_base.change(reset_ocr_with_words_base_dataframe, inputs=[all_page_line_level_ocr_results_with_words_df_base, page_entity_dropdown_redaction], outputs=[all_page_line_level_ocr_results_with_words_df, backup_all_page_line_level_ocr_results_with_words_df_base])
852
 
853
  # Send whole document to Textract for text extraction
854
+ send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number, total_pdf_page_count], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call, textract_query_number, task_textbox]).\
855
  success(check_for_provided_job_id, inputs=[job_id_textbox]).\
856
  success(poll_whole_document_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_whole_document_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_whole_document_textract_default_bucket, output_folder_textbox, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df, doc_file_name_no_extension_textbox]).\
857
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
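(Illustrative only: the chain above submits a whole document to Textract and then polls for the result. A minimal boto3 sketch of that submit-and-poll pattern, assuming your own bucket, key and region rather than the app's configured values.)

import time
import boto3

textract = boto3.client("textract", region_name="eu-west-2")  # assumed region

# Start an asynchronous analysis job for a document already uploaded to S3
job = textract.start_document_analysis(
    DocumentLocation={"S3Object": {"Bucket": "my-bucket", "Name": "input/document.pdf"}},
    FeatureTypes=["SIGNATURES"],
)
job_id = job["JobId"]
print(f"Submitted Textract job {job_id}")

# Poll until the job finishes; the paged results can then be downloaded
while True:
    status = textract.get_document_analysis(JobId=job_id)["JobStatus"]
    if status in ("SUCCEEDED", "FAILED", "PARTIAL_SUCCESS"):
        break
    time.sleep(30)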
 
869
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
870
  success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
871
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
872
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state, task_textbox], show_progress_on=[redaction_output_summary_textbox]).\
873
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state], show_progress_on=[annotator])
874
 
875
  ###
 
1081
  success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list, total_pdf_page_count])
1082
 
1083
  tabular_data_redact_btn.click(reset_data_vars, outputs=[actual_time_taken_number, log_files_output_list_state, comprehend_query_number]).\
1084
+ success(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strategy, in_colnames, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number, do_initial_clean, chosen_language_drop], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number, comprehend_query_number], api_name="redact_data")
1085
 
1086
  # If the output file count text box changes, keep going with redacting each data file until done
1087
+ text_tabular_files_done.change(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strategy, in_colnames, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number, do_initial_clean, chosen_language_drop], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number, comprehend_query_number]).\
1088
  success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
1089
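(Illustrative only: the .change() wiring above is a counter-driven loop - each time the files-done counter updates, the anonymiser is invoked again until every file in the batch has been processed. A stripped-down sketch of the same pattern, with hypothetical component and function names.)

import gradio as gr

def process_next(files, done_count):
    done_count = int(done_count)
    if not files or done_count >= len(files):
        return done_count, "All files processed"
    # ... process files[done_count] here ...
    return done_count + 1, f"Processed file {done_count + 1} of {len(files)}"

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple")
    files_done = gr.Number(value=0, visible=False)
    status = gr.Textbox(label="Progress")
    run_btn = gr.Button("Run")
    run_btn.click(process_next, [in_files, files_done], [files_done, status])
    # Updating files_done from the callback re-fires this listener, looping until the early return above
    files_done.change(process_next, [in_files, files_done], [files_done, status])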
 
1090
  ###
 
1099
  min_word_count_input,
1100
  min_consecutive_pages_input,
1101
  greedy_match_input,
1102
+ combine_page_text_for_duplicates_bool,
1103
+ output_folder_textbox
1104
  ],
1105
  outputs=[
1106
  results_df_preview,
1107
  duplicate_files_out,
1108
+ full_duplicate_data_by_file,
1109
+ actual_time_taken_number,
1110
+ task_textbox
1111
  ]
1112
  )
1113
 
 
1139
 
1140
  # Event handlers
1141
  in_tabular_duplicate_files.upload(
1142
+ fn=put_columns_in_df, inputs=[in_tabular_duplicate_files], outputs=[tabular_text_columns, in_excel_tabular_sheets])
1143
+
 
 
1144
 
1145
  find_tabular_duplicates_btn.click(
1146
  fn=run_tabular_duplicate_detection,
1147
+ inputs=[in_tabular_duplicate_files, tabular_duplicate_threshold, tabular_min_word_count, tabular_text_columns, output_folder_textbox, do_initial_clean_dup, in_excel_tabular_sheets, remove_duplicate_rows],
1148
+ outputs=[tabular_results_df, tabular_cleaned_file, tabular_file_to_clean, actual_time_taken_number, task_textbox], api_name="tabular_clean_duplicates", show_progress_on=[tabular_results_df]
1149
  )
1150
 
1151
  tabular_results_df.select(
 
1156
 
1157
  clean_duplicates_btn.click(
1158
  fn=clean_tabular_duplicates,
1159
+ inputs=[tabular_file_to_clean, tabular_results_df, output_folder_textbox, in_excel_tabular_sheets],
1160
  outputs=[tabular_cleaned_file]
1161
  )
1162
 
 
1243
  pdf_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
1244
  data_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
1245
 
1246
+ if DISPLAY_FILE_NAMES_IN_LOGS == True:
1247
  # User submitted feedback for pdf redactions
1248
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
1249
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
1250
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
1251
 
1252
  # User submitted feedback for data redactions
1253
+ data_callback.setup([data_feedback_radio, data_further_details_text, data_file_name_with_extension_textbox], FEEDBACK_LOGS_FOLDER)
1254
+ data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_file_name_with_extension_textbox], None, preprocess=False).\
1255
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
1256
  else:
1257
  # User submitted feedback for pdf redactions
 
1260
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
1261
 
1262
  # User submitted feedback for data redactions
1263
+ data_callback.setup([data_feedback_radio, data_further_details_text, data_file_name_with_extension_textbox], FEEDBACK_LOGS_FOLDER)
1264
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
1265
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
1266
 
 
1268
  # Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
1269
  usage_callback = CSVLogger_custom(dataset_file_name=USAGE_LOG_FILE_NAME)
1270
 
1271
+ if DISPLAY_FILE_NAMES_IN_LOGS == True:
1272
+ usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_with_extension_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], USAGE_LOGS_FOLDER)
1273
+
1274
+ latest_file_completed_num.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_with_extension_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False, api_name="usage_logs").\
1275
+ success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1276
 
1277
+ text_tabular_files_done.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_with_extension_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
1278
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1279
 
1280
+ successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_with_extension_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
1281
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1282
 
1283
+ # Deduplication usage logging
1284
+ duplicate_files_out.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
1285
+ success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1286
+
1287
+ tabular_results_df.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
1288
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1289
  else:
1290
+ usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], USAGE_LOGS_FOLDER)
1291
+
1292
+ latest_file_completed_num.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
1293
+ success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1294
 
1295
+ text_tabular_files_done.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, placeholder_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
1296
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1297
 
1298
+ successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
1299
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1300
 
1301
+ # Deduplication usage logging (when file names not displayed)
1302
+ duplicate_files_out.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
1303
+ success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1304
+
1305
+ tabular_results_df.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop_tabular, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call, task_textbox], None, preprocess=False).\
1306
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
1307
 
1308
  if __name__ == "__main__":
 
1327
  'task': DIRECT_MODE_TASK,
1328
  'input_file': DIRECT_MODE_INPUT_FILE,
1329
  'output_dir': DIRECT_MODE_OUTPUT_DIR,
1330
+ 'input_dir': INPUT_FOLDER,
1331
  'language': DEFAULT_LANGUAGE,
1332
  'allow_list': ALLOW_LIST_PATH,
1333
  'pii_detector': LOCAL_PII_OPTION,
1334
+ 'username': DIRECT_MODE_DEFAULT_USER,
1335
+ 'save_to_user_folders': SESSION_OUTPUT_FOLDER,
1336
  'aws_access_key': AWS_ACCESS_KEY,
1337
  'aws_secret_key': AWS_SECRET_KEY,
1338
  'aws_region': AWS_REGION,
1339
  's3_bucket': DOCUMENT_REDACTION_BUCKET,
1340
  'do_initial_clean': DO_INITIAL_TABULAR_DATA_CLEAN,
1341
  'save_logs_to_csv': SAVE_LOGS_TO_CSV,
1342
+ 'save_logs_to_dynamodb': SAVE_LOGS_TO_DYNAMODB,
1343
  'display_file_names_in_logs': DISPLAY_FILE_NAMES_IN_LOGS,
1344
+ 'upload_logs_to_s3': RUN_AWS_FUNCTIONS == "1",
1345
+ 's3_logs_prefix': S3_USAGE_LOGS_FOLDER,
1346
  'ocr_method': TESSERACT_TEXT_EXTRACT_OPTION,
1347
  'page_min': DEFAULT_PAGE_MIN,
1348
  'page_max': DEFAULT_PAGE_MAX,
 
 
 
1349
  'images_dpi': IMAGES_DPI,
 
 
1350
  'chosen_local_ocr_model': CHOSEN_LOCAL_OCR_MODEL,
1351
  'preprocess_local_ocr_images': PREPROCESS_LOCAL_OCR_IMAGES,
1352
  'compress_redacted_pdf': COMPRESS_REDACTED_PDF,
1353
  'return_pdf_end_of_redaction': RETURN_PDF_END_OF_REDACTION,
1354
+ 'allow_list_file': ALLOW_LIST_PATH,
1355
+ 'deny_list_file': DENY_LIST_PATH,
1356
+ 'redact_whole_page_file': WHOLE_PAGE_REDACTION_LIST_PATH,
1357
+ 'handwrite_signature_extraction': DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
1358
+ 'anon_strategy': DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
 
1359
  'excel_sheets': DEFAULT_EXCEL_SHEETS,
 
1360
  'fuzzy_mistakes': DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
1361
+ 'match_fuzzy_whole_phrase_bool': 'True', # Default value
1362
  'duplicate_type': DIRECT_MODE_DUPLICATE_TYPE,
1363
  'similarity_threshold': DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
1364
  'min_word_count': DEFAULT_MIN_WORD_COUNT,
1365
  'min_consecutive_pages': DEFAULT_MIN_CONSECUTIVE_PAGES,
1366
  'greedy_match': USE_GREEDY_DUPLICATE_DETECTION,
1367
  'combine_pages': DEFAULT_COMBINE_PAGES,
1368
+ 'search_query': DEFAULT_SEARCH_QUERY,
1369
+ 'text_columns': DEFAULT_TEXT_COLUMNS,
1370
+ 'remove_duplicate_rows': REMOVE_DUPLICATE_ROWS,
1371
+ # Textract specific arguments (with defaults)
1372
+ 'textract_action': '',
1373
+ 'job_id': '',
1374
+ 'extract_signatures': False,
1375
+ 'textract_bucket': TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
1376
+ 'textract_input_prefix': TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
1377
+ 'textract_output_prefix': TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
1378
+ 's3_textract_document_logs_subfolder': TEXTRACT_JOBS_S3_LOC,
1379
+ 'local_textract_document_logs_subfolder': TEXTRACT_JOBS_LOCAL_LOC,
1380
+ 'poll_interval': 30,
1381
+ 'max_poll_attempts': 120,
1382
+ # General arguments that might be missing
1383
+ 'local_redact_entities': CHOSEN_REDACT_ENTITIES,
1384
+ 'aws_redact_entities': CHOSEN_COMPREHEND_ENTITIES,
1385
+ 'cost_code': DEFAULT_COST_CODE
1386
  }
1387
 
1388
  print(f"Running in direct mode with task: {DIRECT_MODE_TASK}")
 
1397
  print(f"Search query: {DEFAULT_SEARCH_QUERY}")
1398
  if DEFAULT_TEXT_COLUMNS:
1399
  print(f"Text columns: {DEFAULT_TEXT_COLUMNS}")
1400
+ print(f"Remove duplicate rows: {REMOVE_DUPLICATE_ROWS}")
1401
  # Run the CLI main function with direct mode arguments
1402
  main(direct_mode_args=direct_mode_args)
cli_redact.py CHANGED
@@ -1,15 +1,63 @@
1
  import argparse
2
  import os
3
  import pandas as pd
4
- from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, INPUT_FOLDER, OUTPUT_FOLDER, DEFAULT_LANGUAGE, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST
 
 
 
5
  from tools.helper_functions import ensure_output_folder_exists
6
- from tools.file_conversion import prepare_image_or_pdf
7
- from tools.file_redaction import choose_and_run_redactor
8
- from tools.data_anonymise import anonymise_files_with_open_text
9
- from tools.helper_functions import _get_env_list
10
- from tools.load_spacy_model_custom_recognisers import custom_entities
11
- from tools.find_duplicate_pages import run_duplicate_analysis, run_full_search_and_analysis
12
- from tools.find_duplicate_tabular import run_tabular_duplicate_analysis
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # --- Constants and Configuration ---
15
 
@@ -17,18 +65,21 @@ if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN
17
  if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = _get_env_list(FULL_COMPREHEND_ENTITY_LIST)
18
  if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = _get_env_list(CHOSEN_REDACT_ENTITIES)
19
  if FULL_ENTITY_LIST: FULL_ENTITY_LIST = _get_env_list(FULL_ENTITY_LIST)
 
 
20
 
21
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
22
- CHOSEN_COMPREHEND_ENTITIES.extend(custom_entities)
23
- FULL_COMPREHEND_ENTITY_LIST.extend(custom_entities)
24
 
25
  chosen_redact_entities = CHOSEN_REDACT_ENTITIES
26
  full_entity_list = FULL_ENTITY_LIST
27
  chosen_comprehend_entities = CHOSEN_COMPREHEND_ENTITIES
28
  full_comprehend_entity_list = FULL_COMPREHEND_ENTITY_LIST
 
29
 
30
  # --- Main CLI Function ---
31
- def main(direct_mode_args=None):
32
  """
33
  A unified command-line interface to prepare, redact, and anonymise various document types.
34
 
@@ -40,99 +91,159 @@ def main(direct_mode_args=None):
40
  description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
41
  formatter_class=argparse.RawTextHelpFormatter,
42
  epilog='''
43
- Examples:
44
- # Redact a PDF with default settings:
45
- python cli_redact.py --input_file document.pdf
 
 
 
46
 
47
- # Redact specific pages with custom OCR:
48
- python cli_redact.py --input_file document.pdf --page_min 1 --page_max 10 --ocr_method "AWS Textract service - all PDF types"
49
 
50
- # Anonymize Excel file with specific columns:
51
- python cli_redact.py --input_file data.xlsx --columns "Name" "Email" --anon_strat "replace with 'REDACTED'"
52
 
53
- # Use AWS services with custom settings:
54
- python cli_redact.py --input_file document.pdf --pii_detector "AWS Comprehend" --aws_access_key YOUR_KEY --aws_secret_key YOUR_SECRET
55
 
56
- # Advanced redaction with custom word list:
57
- python cli_redact.py --input_file document.pdf --in_deny_list "CompanyName" "ProjectCode" --deny_list custom_terms.csv
58
 
59
- # Find duplicate pages in OCR files:
60
- python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --similarity_threshold 0.95
61
 
62
- # Find duplicate content with search query:
63
- python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --search_query "confidential information"
64
 
65
- # Find duplicate rows in tabular data:
66
- python cli_redact.py --task deduplicate --input_file data.csv --duplicate_type tabular --text_columns "Name" "Description"
67
- '''
 
 
 
68
  )
69
 
70
  # --- Task Selection ---
71
  task_group = parser.add_argument_group('Task Selection')
72
  task_group.add_argument('--task',
73
- choices=['redact', 'deduplicate'],
74
  default='redact',
75
- help='Task to perform: redact (PII redaction/anonymization) or deduplicate (find duplicate content).')
76
 
77
  # --- General Arguments (apply to all file types) ---
78
  general_group = parser.add_argument_group('General Options')
79
- general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
80
  general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
 
81
  general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
82
- general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
83
- general_group.add_argument('--pii_detector',
84
- choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
85
- default=LOCAL_PII_OPTION,
86
- help='Core PII detection method (Local or AWS).')
87
- general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
88
- general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')
89
- general_group.add_argument('--aws_region', default='', help='AWS region for cloud services.')
90
- general_group.add_argument('--s3_bucket', default='', help='S3 bucket name for cloud operations.')
91
- general_group.add_argument('--do_initial_clean', action='store_true', help='Perform initial text cleaning for tabular data.')
92
- general_group.add_argument('--save_logs_to_csv', action='store_true', help='Save processing logs to CSV files.')
93
- general_group.add_argument('--display_file_names_in_logs', action='store_true', help='Include file names in log outputs.')
 
 
 
94
 
95
  # --- PDF/Image Redaction Arguments ---
96
  pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
97
- pdf_group.add_argument('--ocr_method',
98
- choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
99
- default=TESSERACT_TEXT_EXTRACT_OPTION,
100
- help='OCR method for text extraction from images.')
101
  pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
102
- pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
103
- pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
104
- pdf_group.add_argument('--prepare_images', action='store_true', default=True, help='Enable image creation for PDF pages.')
105
- pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')
106
- pdf_group.add_argument('--images_dpi', type=float, default=300.0, help='DPI for image processing.')
107
- pdf_group.add_argument('--max_image_pixels', type=int, help='Maximum image pixels for processing.')
108
- pdf_group.add_argument('--load_truncated_images', action='store_true', help='Load truncated images during processing.')
109
- pdf_group.add_argument('--chosen_local_ocr_model', choices=['tesseract', 'hybrid', 'paddle'], default='tesseract', help='Local OCR model to use.')
110
- pdf_group.add_argument('--preprocess_local_ocr_images', action='store_true', help='Preprocess images before OCR.')
111
- pdf_group.add_argument('--compress_redacted_pdf', action='store_true', help='Compress the final redacted PDF.')
112
- pdf_group.add_argument('--return_pdf_end_of_redaction', action='store_true', default=True, help='Return PDF at end of redaction process.')
113
- pdf_group.add_argument('--in_deny_list', nargs='+', default=list(), help='Custom words to recognize for redaction.')
114
- pdf_group.add_argument('--redact_whole_page_list', nargs='+', default=list(), help='Pages to redact completely.')
115
- pdf_group.add_argument('--handwrite_signature_checkbox', nargs='+', default=['Extract handwriting', 'Extract signatures'], help='Handwriting and signature extraction options.')
116
 
117
  # --- Word/Tabular Anonymisation Arguments ---
118
  tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
119
- tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash', 'replace with \'REDACTED\'', 'replace with <ENTITY_NAME>', 'redact completely', 'mask', 'fake_first_name'], default='redact', help='The anonymisation strategy to apply.')
120
- tabular_group.add_argument('--columns', nargs='+', default=list(), help='A list of column names to anonymise in tabular data.')
121
  tabular_group.add_argument('--excel_sheets', nargs='+', default=list(), help='Specific Excel sheet names to process.')
122
- tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
123
- tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')
124
-
125
  # --- Duplicate Detection Arguments ---
126
  duplicate_group = parser.add_argument_group('Duplicate Detection Options')
127
  duplicate_group.add_argument('--duplicate_type', choices=['pages', 'tabular'], default='pages', help='Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).')
128
- duplicate_group.add_argument('--similarity_threshold', type=float, default=0.95, help='Similarity threshold (0-1) to consider content as duplicates.')
129
- duplicate_group.add_argument('--min_word_count', type=int, default=3, help='Minimum word count for text to be considered in duplicate analysis.')
130
- duplicate_group.add_argument('--min_consecutive_pages', type=int, default=1, help='Minimum number of consecutive pages to consider as a match.')
131
- duplicate_group.add_argument('--greedy_match', action='store_true', default=True, help='Use greedy matching strategy for consecutive pages.')
132
- duplicate_group.add_argument('--combine_pages', action='store_true', default=True, help='Combine text from the same page number within a file.')
133
- duplicate_group.add_argument('--search_query', help='Search query text to find specific duplicate content (for page duplicates).')
134
- duplicate_group.add_argument('--text_columns', nargs='+', default=list(), help='Specific text columns to analyze for duplicates (for tabular data).')
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  # Parse arguments - either from command line or direct mode
137
  if direct_mode_args:
138
  # Use direct mode arguments
@@ -142,54 +253,161 @@ Examples:
142
  args = parser.parse_args()
143
 
144
  # --- Initial Setup ---
145
- ensure_output_folder_exists(args.output_dir)
146
- _, file_extension = os.path.splitext(args.input_file)
147
- file_extension = file_extension.lower()
 
 
 
148
 
149
- # Load allow/deny lists
150
- allow_list = pd.read_csv(args.allow_list) if args.allow_list else pd.DataFrame()
151
- deny_list = pd.read_csv(args.deny_list).iloc[:, 0].tolist() if args.deny_list else []
 
 
 
152
 
153
  # --- Route to the Correct Workflow Based on Task and File Type ---
154
 
155
- # Task 1: Redaction/Anonymization
 
 
 
156
  if args.task == 'redact':
 
157
  # Workflow 1: PDF/Image Redaction
158
  if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
159
  print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
 
160
  try:
 
 
161
  # Step 1: Prepare the document
162
  print("\nStep 1: Preparing document...")
163
  (
164
  prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
165
- image_annotations, _, original_cropboxes, page_sizes, textract_output_found, _, _, _, _
166
  ) = prepare_image_or_pdf(
167
- file_paths=[args.input_file], text_extract_method=args.ocr_method,
168
- all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
169
- first_loop_state=True, prepare_for_review=args.prepare_for_review,
170
- output_folder=args.output_dir, prepare_images=args.prepare_images
171
  )
172
  print(f"Preparation complete. {prep_summary}")
173
 
174
  # Step 2: Redact the prepared document
175
  print("\nStep 2: Running redaction...")
176
  (
177
- output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _
178
  ) = choose_and_run_redactor(
179
- file_paths=[args.input_file], prepared_pdf_file_paths=prepared_pdf_paths,
180
- pdf_image_file_paths=image_file_paths, chosen_redact_entities=chosen_redact_entities,
181
- chosen_redact_comprehend_entities=chosen_comprehend_entities, text_extraction_method=args.ocr_method,
182
- in_allow_list=allow_list, in_deny_list=args.in_deny_list,
183
- redact_whole_page_list=args.redact_whole_page_list, first_loop_state=True,
184
- page_min=args.page_min, page_max=args.page_max, handwrite_signature_checkbox=args.handwrite_signature_checkbox,
185
  pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
186
  document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
187
  aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
188
- language=args.language, output_folder=args.output_dir
189
  )
 
 
 
190
 
191
  print("\n--- Redaction Process Complete ---")
192
  print(f"Summary: {output_summary}")
 
193
  print(f"\nOutput files saved to: {args.output_dir}")
194
  print("Generated Files:", sorted(output_files))
195
  if log_files: print("Log Files:", sorted(log_files))
@@ -200,30 +418,83 @@ Examples:
200
  # Workflow 2: Word/Tabular Data Anonymisation
201
  elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
202
  print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
 
203
  try:
 
 
204
  # Run the anonymisation function directly
205
- output_summary, output_files, _, _, log_files, _, _ = anonymise_files_with_open_text(
206
- file_paths=[args.input_file],
 
207
  in_text="", # Not used for file-based operations
208
- anon_strat=args.anon_strat,
209
- chosen_cols=args.columns,
210
- chosen_redact_entities=chosen_redact_entities,
211
- in_allow_list=allow_list,
212
  in_excel_sheets=args.excel_sheets,
213
  first_loop_state=True,
214
  output_folder=args.output_dir,
215
- in_deny_list=deny_list,
216
  max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
217
  pii_identification_method=args.pii_detector,
218
- chosen_redact_comprehend_entities=chosen_comprehend_entities,
219
  aws_access_key_textbox=args.aws_access_key,
220
  aws_secret_key_textbox=args.aws_secret_key,
221
  language=args.language,
222
  do_initial_clean=args.do_initial_clean
223
  )
224
 
 
 
225
  print("\n--- Anonymisation Process Complete ---")
226
  print(f"Summary: {output_summary}")
 
227
  print(f"\nOutput files saved to: {args.output_dir}")
228
  print("Generated Files:", sorted(output_files))
229
  if log_files: print("Log Files:", sorted(log_files))
@@ -240,29 +511,33 @@ Examples:
240
  elif args.task == 'deduplicate':
241
  print("--- Starting Duplicate Detection Workflow... ---")
242
  try:
 
243
  if args.duplicate_type == 'pages':
244
  # Page duplicate detection
245
  if file_extension == '.csv':
246
  print("--- Detected OCR CSV file. Starting Page Duplicate Detection... ---")
247
 
248
- if args.search_query:
249
- # Use search-based duplicate detection
250
- print(f"Searching for duplicates of: '{args.search_query}'")
251
- # Note: This would require the OCR data to be loaded first
252
- # For now, we'll use the general duplicate analysis
253
- print("Note: Search-based duplicate detection requires OCR data preparation.")
254
- print("Using general duplicate analysis instead.")
255
-
256
  # Load the CSV file as a list for the duplicate analysis function
257
- results_df, output_paths, full_data_by_file = run_duplicate_analysis(
258
- files=[args.input_file],
259
  threshold=args.similarity_threshold,
260
  min_words=args.min_word_count,
261
  min_consecutive=args.min_consecutive_pages,
262
  greedy_match=args.greedy_match,
263
- combine_pages=args.combine_pages
 
264
  )
265
 
 
 
 
266
  print("\n--- Page Duplicate Detection Complete ---")
267
  print(f"Found {len(results_df)} duplicate matches")
268
  print(f"\nOutput files saved to: {args.output_dir}")
@@ -271,19 +546,116 @@ Examples:
271
  else:
272
  print(f"Error: Page duplicate detection requires CSV files with OCR data.")
273
  print("Please provide a CSV file containing OCR output data.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
  elif args.duplicate_type == 'tabular':
276
  # Tabular duplicate detection
 
277
  if file_extension in ['.csv', '.xlsx', '.xls', '.parquet']:
278
  print("--- Detected tabular file. Starting Tabular Duplicate Detection... ---")
 
 
279
 
280
- results_df, output_paths, full_data_by_file = run_tabular_duplicate_analysis(
281
- files=[args.input_file],
282
  threshold=args.similarity_threshold,
283
  min_words=args.min_word_count,
284
- text_columns=args.text_columns if args.text_columns else None,
285
- output_folder=args.output_dir
 
 
 
286
  )
 
 
 
287
 
288
  print("\n--- Tabular Duplicate Detection Complete ---")
289
  print(f"Found {len(results_df)} duplicate matches")
@@ -299,10 +671,175 @@ Examples:
299
 
300
  except Exception as e:
301
  print(f"\nAn error occurred during the duplicate detection workflow: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  else:
304
  print(f"Error: Invalid task '{args.task}'.")
305
- print("Valid options: 'redact' or 'deduplicate'")
306
 
307
  if __name__ == "__main__":
308
  main()
 
1
  import argparse
2
  import os
3
  import pandas as pd
4
+ import time
5
+ import uuid
6
+ from tools.config import LOCAL_PII_OPTION, AWS_PII_OPTION, OUTPUT_FOLDER, DEFAULT_LANGUAGE, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, CUSTOM_ENTITIES, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, DOCUMENT_REDACTION_BUCKET, DEFAULT_COST_CODE, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, DISPLAY_FILE_NAMES_IN_LOGS, DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, DO_INITIAL_TABULAR_DATA_CLEAN, ALLOW_LIST_PATH, DENY_LIST_PATH, WHOLE_PAGE_REDACTION_LIST_PATH, PREPROCESS_LOCAL_OCR_IMAGES, IMAGES_DPI, RETURN_PDF_END_OF_REDACTION, COMPRESS_REDACTED_PDF, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_TABULAR_ANONYMISATION_STRATEGY, DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, DEFAULT_DUPLICATE_DETECTION_THRESHOLD, DEFAULT_MIN_WORD_COUNT, DEFAULT_MIN_CONSECUTIVE_PAGES, USE_GREEDY_DUPLICATE_DETECTION, DEFAULT_COMBINE_PAGES, REMOVE_DUPLICATE_ROWS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, INPUT_FOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SESSION_OUTPUT_FOLDER, DIRECT_MODE_DEFAULT_USER, RUN_AWS_FUNCTIONS, S3_USAGE_LOGS_FOLDER
7
+
8
  from tools.helper_functions import ensure_output_folder_exists
9
+
10
+
11
+ def _generate_session_hash() -> str:
12
+ """Generate a unique session hash for logging purposes."""
13
+ return str(uuid.uuid4())[:8]
14
+
15
+ def get_username_and_folders(username:str = "",
16
+ output_folder_textbox:str=OUTPUT_FOLDER,
17
+ input_folder_textbox:str=INPUT_FOLDER,
18
+ session_output_folder:str=SESSION_OUTPUT_FOLDER,
19
+ textract_document_upload_input_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
20
+ textract_document_upload_output_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
21
+ s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
22
+ local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):
23
+
24
+
25
+ # Generate a session hash for logging, either from the supplied username or a newly generated value
26
+ if username:
27
+ out_session_hash = username
28
+ else:
29
+ out_session_hash = _generate_session_hash()
30
+
31
+
32
+ if session_output_folder == 'True' or session_output_folder == True:
33
+ output_folder = output_folder_textbox + out_session_hash + "/"
34
+ input_folder = input_folder_textbox + out_session_hash + "/"
35
+
36
+ textract_document_upload_input_folder = textract_document_upload_input_folder + "/" + out_session_hash
37
+ textract_document_upload_output_folder = textract_document_upload_output_folder + "/" + out_session_hash
38
+
39
+ s3_textract_document_logs_subfolder = s3_textract_document_logs_subfolder + "/" + out_session_hash
40
+ local_textract_document_logs_subfolder = local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
41
+
42
+ else:
43
+ output_folder = output_folder_textbox
44
+ input_folder = input_folder_textbox
45
+
46
+ if not os.path.exists(output_folder): os.mkdir(output_folder)
47
+ if not os.path.exists(input_folder): os.mkdir(input_folder)
48
+
49
+ return out_session_hash, output_folder, out_session_hash, input_folder, textract_document_upload_input_folder, textract_document_upload_output_folder, s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder
50
+
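# Illustrative call of the helper above (hypothetical values): with a username of "jsmith" and
# session folders enabled, per-user sub-folders are created and returned, e.g.
#   session, out_dir, _, in_dir, *_ = get_username_and_folders(
#       username="jsmith", output_folder_textbox="output/",
#       input_folder_textbox="input/", session_output_folder=True)
#   # session == "jsmith", out_dir == "output/jsmith/", in_dir == "input/jsmith/"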
51
+
52
+ def _get_env_list(env_var_name: str) -> list[str]:
53
+ """Parses a comma-separated environment variable into a list of strings."""
54
+ value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
55
+ if not value:
56
+ return []
57
+ # Split by comma and filter out any empty strings that might result from extra commas
58
+ return [s.strip() for s in value.split(',') if s.strip()]
59
+
60
+
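# Example of the parsing above (illustrative value): an entity list read from config as the
# bracketed string "['PERSON', 'EMAIL_ADDRESS']" becomes a plain Python list:
#   _get_env_list("['PERSON', 'EMAIL_ADDRESS']")  # -> ['PERSON', 'EMAIL_ADDRESS']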
61
 
62
  # --- Constants and Configuration ---
63
 
 
65
  if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = _get_env_list(FULL_COMPREHEND_ENTITY_LIST)
66
  if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = _get_env_list(CHOSEN_REDACT_ENTITIES)
67
  if FULL_ENTITY_LIST: FULL_ENTITY_LIST = _get_env_list(FULL_ENTITY_LIST)
68
+ if CUSTOM_ENTITIES: CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
69
+ if DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX: DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = _get_env_list(DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX)
70
 
71
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
72
+ CHOSEN_COMPREHEND_ENTITIES.extend(CUSTOM_ENTITIES)
73
+ FULL_COMPREHEND_ENTITY_LIST.extend(CUSTOM_ENTITIES)
74
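# For instance (illustrative values only), if CUSTOM_ENTITIES resolves to something like
# ['TITLES', 'UKPOSTCODE', 'STREETNAME', 'CUSTOM'], those local spaCy labels become selectable
# alongside the standard AWS Comprehend entity types.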
 
75
  chosen_redact_entities = CHOSEN_REDACT_ENTITIES
76
  full_entity_list = FULL_ENTITY_LIST
77
  chosen_comprehend_entities = CHOSEN_COMPREHEND_ENTITIES
78
  full_comprehend_entity_list = FULL_COMPREHEND_ENTITY_LIST
79
+ default_handwrite_signature_checkbox = DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX
80
 
81
  # --- Main CLI Function ---
82
+ def main(direct_mode_args={}):
83
  """
84
  A unified command-line interface to prepare, redact, and anonymise various document types.
85
 
 
91
  description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
92
  formatter_class=argparse.RawTextHelpFormatter,
93
  epilog='''
94
+ Examples (look in the output/ folder to see output files):
95
+
96
+ # Redaction
97
+
98
+ ## Redact a PDF with default settings (local OCR):
99
+ python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
100
+
101
+ ## Extract text from a PDF only (i.e. no redaction), using local OCR:
102
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector None
103
+
104
+ ## Redact only the whole pages listed in a CSV file, using local OCR:
105
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector Local --local_redact_entities CUSTOM
106
+
107
+ ## Redact a PDF with allow list (local OCR) and custom list of redaction entities:
108
+ python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --allow_list_file example_data/test_allow_list_graduate.csv --local_redact_entities TITLES PERSON DATE_TIME
109
 
110
+ ## Redact a limited page range of a PDF using the local text extraction method, with custom fuzzy matching:
111
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv --local_redact_entities CUSTOM_FUZZY --page_min 1 --page_max 3 --ocr_method "Local text" --fuzzy_mistakes 3
112
 
113
+ ## Redaction with custom deny list, allow list, and whole page redaction list:
114
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/partnership_toolkit_redact_custom_deny_list.csv --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --allow_list_file example_data/test_allow_list_partnership.csv
115
 
116
+ ## Redact an image:
117
+ python cli_redact.py --input_file example_data/example_complaint_letter.jpg
118
 
119
+ ## Anonymise a CSV file with specific columns:
120
+ python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy replace_redacted
121
 
122
+ ## Anonymise a CSV file with a different strategy (remove text completely):
123
+ python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy redact
124
 
125
+ ## Anonymise a Word document:
126
+ python cli_redact.py --input_file "example_data/Bold minimalist professional cover letter.docx" --anon_strategy replace_redacted
127
 
128
+ # Redaction with AWS services:
129
+
130
+ ## Use Textract and Comprehend:
131
+ python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --ocr_method "AWS Textract" --pii_detector "AWS Comprehend"
132
+
133
+ ## Redact specific pages with AWS OCR and signature extraction:
134
+ python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"
135
+
136
+ # Duplicate page detection
137
+
138
+ ## Find duplicate pages in OCR files:
139
+ python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95
140
+
141
+ ## Find duplicates in OCR files at the line level:
142
+ python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --combine_pages False --min_word_count 3
143
+
144
+ ## Find duplicate rows in tabular data:
145
+ python cli_redact.py --task deduplicate --input_file example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv --duplicate_type tabular --text_columns "text" --similarity_threshold 0.95
146
+
147
+ # AWS Textract whole document analysis
148
+
149
+ ## Submit document to Textract for basic text analysis:
150
+ python cli_redact.py --task textract --textract_action submit --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
151
+
152
+ ## Submit document to Textract for analysis with signature extraction (the Job ID is printed to the console; you need it to retrieve the results):
153
+ python cli_redact.py --task textract --textract_action submit --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --extract_signatures
154
+
155
+ ## Retrieve Textract results by job ID (returns a .json file output):
156
+ python cli_redact.py --task textract --textract_action retrieve --job_id 12345678-1234-1234-1234-123456789012
157
+
158
+ ## List recent Textract jobs:
159
+ python cli_redact.py --task textract --textract_action list
160
+
161
+ '''
162
  )
163
 
164
  # --- Task Selection ---
165
  task_group = parser.add_argument_group('Task Selection')
166
  task_group.add_argument('--task',
167
+ choices=['redact', 'deduplicate', 'textract'],
168
  default='redact',
169
+ help='Task to perform: redact (PII redaction/anonymisation), deduplicate (find duplicate content), or textract (AWS Textract batch operations).')
170
 
171
  # --- General Arguments (apply to all file types) ---
172
  general_group = parser.add_argument_group('General Options')
173
+ general_group.add_argument('--input_file', nargs='+', help='Path to the input file(s) to process. Separate multiple files with a space, and use quotes if there are spaces in the file name.')
174
  general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
175
+ general_group.add_argument('--input_dir', default=INPUT_FOLDER, help='Directory for all input files.')
176
  general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
177
+ general_group.add_argument('--allow_list', default=ALLOW_LIST_PATH, help='Path to a CSV file with words to exclude from redaction.')
178
+ general_group.add_argument('--pii_detector', choices=[LOCAL_PII_OPTION, AWS_PII_OPTION, "None"], default=LOCAL_PII_OPTION,
179
+ help='Core PII detection method (Local, AWS Comprehend, or None).')
180
+ general_group.add_argument('--username', default=DIRECT_MODE_DEFAULT_USER, help='Username for the session.')
181
+ general_group.add_argument('--save_to_user_folders', default=SESSION_OUTPUT_FOLDER, help='Whether to save to user folders or not.')
182
+
183
+ general_group.add_argument('--local_redact_entities', nargs='+', choices=full_entity_list, default=chosen_redact_entities,
184
+ help=f'Local redaction entities to use. Default: {chosen_redact_entities}. Full list: {full_entity_list}.')
185
+
186
+ general_group.add_argument('--aws_redact_entities', nargs='+', choices=full_comprehend_entity_list, default=chosen_comprehend_entities,
187
+ help=f'AWS redaction entities to use. Default: {chosen_comprehend_entities}. Full list: {full_comprehend_entity_list}.')
188
+
189
+ general_group.add_argument('--aws_access_key', default=AWS_ACCESS_KEY, help='Your AWS Access Key ID.')
190
+ general_group.add_argument('--aws_secret_key', default=AWS_SECRET_KEY, help='Your AWS Secret Access Key.')
191
+ general_group.add_argument('--cost_code', default=DEFAULT_COST_CODE, help='Cost code for tracking usage.')
192
+ general_group.add_argument('--aws_region', default=AWS_REGION, help='AWS region for cloud services.')
193
+ general_group.add_argument('--s3_bucket', default=DOCUMENT_REDACTION_BUCKET, help='S3 bucket name for cloud operations.')
194
+ general_group.add_argument('--do_initial_clean', default=DO_INITIAL_TABULAR_DATA_CLEAN, help='Perform initial text cleaning for tabular data.')
195
+ general_group.add_argument('--save_logs_to_csv', default=SAVE_LOGS_TO_CSV, help='Save processing logs to CSV files.')
196
+ general_group.add_argument('--save_logs_to_dynamodb', default=SAVE_LOGS_TO_DYNAMODB, help='Save processing logs to DynamoDB.')
197
+ general_group.add_argument('--display_file_names_in_logs', default=DISPLAY_FILE_NAMES_IN_LOGS, help='Include file names in log outputs.')
198
+ general_group.add_argument('--upload_logs_to_s3', default=RUN_AWS_FUNCTIONS == "1", help='Upload log files to S3 after processing.')
199
+ general_group.add_argument('--s3_logs_prefix', default=S3_USAGE_LOGS_FOLDER, help='S3 prefix for usage log files.')
200
 
201
  # --- PDF/Image Redaction Arguments ---
202
  pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
203
+ pdf_group.add_argument('--ocr_method', choices=["AWS Textract", "Local OCR", "Local text"], default="Local OCR", help='Text extraction method: "AWS Textract" or "Local OCR" for image-based OCR, "Local text" for documents with a selectable text layer.')
 
 
 
204
  pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
205
+ pdf_group.add_argument('--page_max', type=int, default=0, help='Last page to redact.')
206
+ pdf_group.add_argument('--images_dpi', type=float, default=float(IMAGES_DPI), help='DPI for image processing.')
207
+ pdf_group.add_argument('--chosen_local_ocr_model', choices=['tesseract', 'hybrid', 'paddle'], default=CHOSEN_LOCAL_OCR_MODEL, help='Local OCR model to use.')
208
+ pdf_group.add_argument('--preprocess_local_ocr_images', default=PREPROCESS_LOCAL_OCR_IMAGES, help='Preprocess images before OCR.')
209
+ pdf_group.add_argument('--compress_redacted_pdf', default=COMPRESS_REDACTED_PDF, help='Compress the final redacted PDF.')
210
+ pdf_group.add_argument('--return_pdf_end_of_redaction', default=RETURN_PDF_END_OF_REDACTION, help='Return PDF at end of redaction process.')
211
+ pdf_group.add_argument('--deny_list_file', default=DENY_LIST_PATH, help='Custom words file (CSV) of terms that should always be redacted (deny list).')
212
+ pdf_group.add_argument('--allow_list_file', default=ALLOW_LIST_PATH, help='Custom words file (CSV) of terms that should never be redacted (allow list).')
213
+ pdf_group.add_argument('--redact_whole_page_file', default=WHOLE_PAGE_REDACTION_LIST_PATH, help='File for pages to redact completely.')
214
+ pdf_group.add_argument('--handwrite_signature_extraction', nargs='+', default=default_handwrite_signature_checkbox, help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".')
 
 
 
 
215
 
216
  # --- Word/Tabular Anonymisation Arguments ---
217
  tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
218
+ tabular_group.add_argument('--anon_strategy', choices=['redact', 'redact completely', 'replace_redacted', 'entity_type', 'encrypt', 'hash', 'replace with \'REDACTED\'', 'replace with <ENTITY_NAME>', 'mask', 'fake_first_name'], default=DEFAULT_TABULAR_ANONYMISATION_STRATEGY, help='The anonymisation strategy to apply.')
219
+ tabular_group.add_argument('--text_columns', nargs='+', default=list(), help='A list of column names to anonymise or deduplicate in tabular data.')
220
  tabular_group.add_argument('--excel_sheets', nargs='+', default=list(), help='Specific Excel sheet names to process.')
221
+ tabular_group.add_argument('--fuzzy_mistakes', type=int, default=DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, help='Number of allowed spelling mistakes for fuzzy matching.')
222
+ tabular_group.add_argument('--match_fuzzy_whole_phrase_bool', default=True, help='Whether fuzzy matching should match the whole phrase rather than individual words.')
 
223
  # --- Duplicate Detection Arguments ---
224
  duplicate_group = parser.add_argument_group('Duplicate Detection Options')
225
  duplicate_group.add_argument('--duplicate_type', choices=['pages', 'tabular'], default='pages', help='Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).')
226
+ duplicate_group.add_argument('--similarity_threshold', type=float, default=DEFAULT_DUPLICATE_DETECTION_THRESHOLD, help='Similarity threshold (0-1) to consider content as duplicates.')
227
+ duplicate_group.add_argument('--min_word_count', type=int, default=DEFAULT_MIN_WORD_COUNT, help='Minimum word count for text to be considered in duplicate analysis.')
228
+ duplicate_group.add_argument('--min_consecutive_pages', type=int, default=DEFAULT_MIN_CONSECUTIVE_PAGES, help='Minimum number of consecutive pages to consider as a match.')
229
+ duplicate_group.add_argument('--greedy_match', default=USE_GREEDY_DUPLICATE_DETECTION, help='Use greedy matching strategy for consecutive pages.')
230
+ duplicate_group.add_argument('--combine_pages', default=DEFAULT_COMBINE_PAGES, help='Combine text from the same page number within a file. If set to False, line-level duplicate detection is used instead.')
231
+ duplicate_group.add_argument('--remove_duplicate_rows', default=REMOVE_DUPLICATE_ROWS, help='Remove duplicate rows from the output.')
 
232
 
233
+ # --- Textract Batch Operations Arguments ---
234
+ textract_group = parser.add_argument_group('Textract Batch Operations Options')
235
+ textract_group.add_argument('--textract_action',
236
+ choices=['submit', 'retrieve', 'list'],
237
+ help='Textract action to perform: submit (submit document for analysis), retrieve (get results by job ID), or list (show recent jobs).')
238
+ textract_group.add_argument('--job_id', help='Textract job ID for retrieve action.')
239
+ textract_group.add_argument('--extract_signatures', action='store_true', help='Extract signatures during Textract analysis (for submit action).')
240
+ textract_group.add_argument('--textract_bucket', default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, help='S3 bucket name for Textract operations (overrides default).')
241
+ textract_group.add_argument('--textract_input_prefix', default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, help='S3 prefix for input files in Textract operations.')
242
+ textract_group.add_argument('--textract_output_prefix', default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, help='S3 prefix for output files in Textract operations.')
243
+ textract_group.add_argument('--s3_textract_document_logs_subfolder', default=TEXTRACT_JOBS_S3_LOC, help='S3 prefix for logs in Textract operations.')
244
+ textract_group.add_argument('--local_textract_document_logs_subfolder', default=TEXTRACT_JOBS_LOCAL_LOC, help='Local prefix for logs in Textract operations.')
245
+ textract_group.add_argument('--poll_interval', type=int, default=30, help='Polling interval in seconds for Textract job status.')
246
+ textract_group.add_argument('--max_poll_attempts', type=int, default=120, help='Maximum number of polling attempts for Textract job completion.')
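+ # With the defaults above (30-second interval, 120 attempts), result retrieval polls for up to about 60 minutes before it stops.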
247
  # Parse arguments - either from command line or direct mode
248
  if direct_mode_args:
249
  # Use direct mode arguments
 
253
  args = parser.parse_args()
254
 
255
  # --- Initial Setup ---
256
+ # Convert string boolean variables to boolean
257
+ if args.preprocess_local_ocr_images == "True": args.preprocess_local_ocr_images = True
258
+ else: args.preprocess_local_ocr_images = False
259
+ if args.greedy_match == "True": args.greedy_match = True
260
+ else: args.greedy_match = False
261
+ if args.combine_pages == "True": args.combine_pages = True
262
+ else: args.combine_pages = False
263
+ if args.remove_duplicate_rows == "True": args.remove_duplicate_rows = True
264
+ else: args.remove_duplicate_rows = False
265
+ if args.return_pdf_end_of_redaction == "True": args.return_pdf_end_of_redaction = True
266
+ else: args.return_pdf_end_of_redaction = False
267
+ if args.compress_redacted_pdf == "True": args.compress_redacted_pdf = True
268
+ else: args.compress_redacted_pdf = False
269
+ if args.do_initial_clean == "True": args.do_initial_clean = True
270
+ else: args.do_initial_clean = False
271
+ if args.save_logs_to_csv == "True": args.save_logs_to_csv = True
272
+ else: args.save_logs_to_csv = False
273
+ if args.save_logs_to_dynamodb == "True": args.save_logs_to_dynamodb = True
274
+ else: args.save_logs_to_dynamodb = False
275
+ if args.display_file_names_in_logs == "True": args.display_file_names_in_logs = True
276
+ else: args.display_file_names_in_logs = False
277
+ if args.match_fuzzy_whole_phrase_bool in (True, "True"): args.match_fuzzy_whole_phrase_bool = True
278
+ else: args.match_fuzzy_whole_phrase_bool = False
279
+ if args.save_to_user_folders == "True": args.save_to_user_folders = True
280
+ else: args.save_to_user_folders = False
281
+
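+ # The block above could be written more compactly; a sketch of an equivalent normalisation
+ # pass (values that are already booleans are left untouched):
+ # for name in ("preprocess_local_ocr_images", "greedy_match", "combine_pages", "remove_duplicate_rows",
+ #              "return_pdf_end_of_redaction", "compress_redacted_pdf", "do_initial_clean", "save_logs_to_csv",
+ #              "save_logs_to_dynamodb", "display_file_names_in_logs", "match_fuzzy_whole_phrase_bool", "save_to_user_folders"):
+ #     value = getattr(args, name)
+ #     setattr(args, name, value if isinstance(value, bool) else value == "True")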
282
+ if args.task in ['redact', 'deduplicate']:
283
+ if args.input_file:
284
+ if isinstance(args.input_file, str):
285
+ args.input_file = [args.input_file]
286
+
287
+ _, file_extension = os.path.splitext(args.input_file[0])
288
+ file_extension = file_extension.lower()
289
+ else:
290
+ raise ValueError("Error: --input_file is required for 'redact' task.")
291
 
292
+ # Initialise usage logger if logging is enabled
293
+ usage_logger = None
294
+ if args.save_logs_to_csv or args.save_logs_to_dynamodb:
295
+ from tools.cli_usage_logger import create_cli_usage_logger
296
+ try:
297
+ usage_logger = create_cli_usage_logger()
298
+ except Exception as e:
299
+ print(f"Warning: Could not initialise usage logger: {e}")
300
+
301
+ print(f"Argument args.save_to_user_folders: {args.save_to_user_folders} will be used to determine if outputs will be saved to user folders.")
302
+
303
+ # Get username and folders
304
+ session_hash, args.output_dir, _, args.input_dir, args.textract_input_prefix, args.textract_output_prefix, args.s3_textract_document_logs_subfolder, args.local_textract_document_logs_subfolder = get_username_and_folders(username=args.username, output_folder_textbox=args.output_dir, input_folder_textbox=args.input_dir, session_output_folder=args.save_to_user_folders, textract_document_upload_input_folder=args.textract_input_prefix, textract_document_upload_output_folder=args.textract_output_prefix, s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder)
305
+
306
+ print(f"Conducting analyses with user {args.username}. Outputs will be saved to {args.output_dir}.")
307
 
308
  # --- Route to the Correct Workflow Based on Task and File Type ---
309
 
310
+ # Validate input_file requirement for tasks that need it
311
+ if args.task in ['redact', 'deduplicate'] and not args.input_file:
312
+ print(f"Error: --input_file is required for '{args.task}' task.")
313
+ return
314
+
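+ # Page images only need to be rendered for the OCR-based extraction methods ("Local OCR", "AWS Textract").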
315
+ if args.ocr_method in ["Local OCR", "AWS Textract"]:
316
+ args.prepare_images = True
317
+ else:
318
+ args.prepare_images = False
319
+
320
+ from tools.cli_usage_logger import create_cli_usage_logger, log_redaction_usage
321
+ # Task 1: Redaction/Anonymisation
322
  if args.task == 'redact':
323
+
324
  # Workflow 1: PDF/Image Redaction
325
  if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
326
  print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
327
+ start_time = time.time()
328
  try:
329
+ from tools.file_conversion import prepare_image_or_pdf
330
+ from tools.file_redaction import choose_and_run_redactor
331
  # Step 1: Prepare the document
332
  print("\nStep 1: Preparing document...")
333
  (
334
  prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
335
+ image_annotations, _, original_cropboxes, page_sizes, _, _, _, _, _
336
  ) = prepare_image_or_pdf(
337
+ file_paths=args.input_file, text_extract_method=args.ocr_method, all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
338
+ first_loop_state=True, prepare_for_review=False,
339
+ output_folder=args.output_dir, input_folder=args.input_dir, prepare_images=args.prepare_images
 
340
  )
341
  print(f"Preparation complete. {prep_summary}")
342
 
343
  # Step 2: Redact the prepared document
344
  print("\nStep 2: Running redaction...")
345
  (
346
+ output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, comprehend_query_number, _, _, _, _, _, _, page_sizes, _, _, _, total_textract_query_number, _, _, _, _, _, _
347
  ) = choose_and_run_redactor(
348
+ file_paths=args.input_file, prepared_pdf_file_paths=prepared_pdf_paths,
349
+ pdf_image_file_paths=image_file_paths, chosen_redact_entities=args.local_redact_entities,
350
+ chosen_redact_comprehend_entities=args.aws_redact_entities, text_extraction_method=args.ocr_method,
351
+ in_allow_list=args.allow_list_file, in_deny_list=args.deny_list_file,
352
+ redact_whole_page_list=args.redact_whole_page_file, first_loop_state=True,
353
+ page_min=args.page_min, page_max=args.page_max, handwrite_signature_checkbox=args.handwrite_signature_extraction, max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes, match_fuzzy_whole_phrase_bool=args.match_fuzzy_whole_phrase_bool,
354
  pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
355
  document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
356
  aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
357
+ language=args.language, output_folder=args.output_dir, input_folder=args.input_dir
358
  )
359
+
360
+ # Calculate processing time
361
+ end_time = time.time()
362
+ processing_time = end_time - start_time
363
+
364
+ # Log usage data if logger is available
365
+ if usage_logger:
366
+ try:
367
+ # Extract file name for logging
368
+ print("Saving logs to CSV")
369
+ doc_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "document"
370
+ data_file_name = "" # Not applicable for PDF/image redaction
371
+
372
+ # Determine if this was a Textract API call
373
+ is_textract_call = args.ocr_method == "AWS Textract"
374
+
375
+ # Count pages (approximate from page_sizes if available)
376
+ total_pages = len(page_sizes) if page_sizes else 1
377
+
378
+ # Count API calls (approximate - would need to be tracked in the redaction function)
379
+ textract_queries = int(total_textract_query_number) if is_textract_call else 0
380
+ comprehend_queries = int(comprehend_query_number) if args.pii_detector == "AWS Comprehend" else 0
381
+
382
+ # Format handwriting/signature options
383
+ handwriting_signature = ", ".join(args.handwrite_signature_extraction) if args.handwrite_signature_extraction else ""
384
+
385
+ log_redaction_usage(
386
+ logger=usage_logger,
387
+ session_hash=session_hash,
388
+ doc_file_name=doc_file_name,
389
+ data_file_name=data_file_name,
390
+ time_taken=processing_time,
391
+ total_pages=total_pages,
392
+ textract_queries=textract_queries,
393
+ pii_method=args.pii_detector,
394
+ comprehend_queries=comprehend_queries,
395
+ cost_code=args.cost_code,
396
+ handwriting_signature=handwriting_signature,
397
+ text_extraction_method=args.ocr_method,
398
+ is_textract_call=is_textract_call,
399
+ task=args.task,
400
+ save_to_dynamodb=args.save_logs_to_dynamodb,
401
+ save_to_s3=args.upload_logs_to_s3,
402
+ s3_bucket=args.s3_bucket,
403
+ s3_key_prefix=args.s3_logs_prefix
404
+ )
405
+ except Exception as e:
406
+ print(f"Warning: Could not log usage data: {e}")
407
 
408
  print("\n--- Redaction Process Complete ---")
409
  print(f"Summary: {output_summary}")
410
+ print(f"Processing time: {processing_time:.2f} seconds")
411
  print(f"\nOutput files saved to: {args.output_dir}")
412
  print("Generated Files:", sorted(output_files))
413
  if log_files: print("Log Files:", sorted(log_files))
 
418
  # Workflow 2: Word/Tabular Data Anonymisation
419
  elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
420
  print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
421
+ start_time = time.time()
422
  try:
423
+ from tools.data_anonymise import anonymise_files_with_open_text
424
+
425
  # Run the anonymisation function directly
426
+
427
+ output_summary, output_files, _, _, log_files, _, processing_time, comprehend_query_number = anonymise_files_with_open_text(
428
+ file_paths=args.input_file,
429
  in_text="", # Not used for file-based operations
430
+ anon_strategy=args.anon_strategy,
431
+ chosen_cols=args.text_columns,
432
+ chosen_redact_entities=args.local_redact_entities,
433
+ in_allow_list=args.allow_list_file,
434
  in_excel_sheets=args.excel_sheets,
435
  first_loop_state=True,
436
  output_folder=args.output_dir,
437
+ in_deny_list=args.deny_list_file,
438
  max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
439
  pii_identification_method=args.pii_detector,
440
+ chosen_redact_comprehend_entities=args.aws_redact_entities,
441
  aws_access_key_textbox=args.aws_access_key,
442
  aws_secret_key_textbox=args.aws_secret_key,
443
  language=args.language,
444
  do_initial_clean=args.do_initial_clean
445
  )
446
 
447
+ # Calculate processing time
448
+ end_time = time.time()
449
+ processing_time = end_time - start_time
450
+
451
+ # Log usage data if logger is available
452
+ if usage_logger:
453
+ try:
454
+ print("Saving logs to CSV")
455
+ # Extract file name for logging
456
+ doc_file_name = "" # Not applicable for tabular data
457
+ data_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "data_file"
458
+
459
+ # Determine if this was a Textract API call (not applicable for tabular)
460
+ is_textract_call = False
461
+
462
+ # Count pages (not applicable for tabular data)
463
+ total_pages = 0
464
+
465
+ # Count API calls (approximate - would need to be tracked in the anonymisation function)
466
+ textract_queries = 0 # Not applicable for tabular data
467
+ comprehend_queries = comprehend_query_number if args.pii_detector == "AWS Comprehend" else 0
468
+
469
+ # Format handwriting/signature options (not applicable for tabular)
470
+ handwriting_signature = ""
471
+
472
+ log_redaction_usage(
473
+ logger=usage_logger,
474
+ session_hash=session_hash,
475
+ doc_file_name=doc_file_name,
476
+ data_file_name=data_file_name,
477
+ time_taken=processing_time,
478
+ total_pages=total_pages,
479
+ textract_queries=textract_queries,
480
+ pii_method=args.pii_detector,
481
+ comprehend_queries=comprehend_queries,
482
+ cost_code=args.cost_code,
483
+ handwriting_signature=handwriting_signature,
484
+ text_extraction_method="tabular", # Indicate this is tabular processing
485
+ is_textract_call=is_textract_call,
486
+ task=args.task,
487
+ save_to_dynamodb=args.save_logs_to_dynamodb,
488
+ save_to_s3=args.upload_logs_to_s3,
489
+ s3_bucket=args.s3_bucket,
490
+ s3_key_prefix=args.s3_logs_prefix
491
+ )
492
+ except Exception as e:
493
+ print(f"Warning: Could not log usage data: {e}")
494
+
495
  print("\n--- Anonymisation Process Complete ---")
496
  print(f"Summary: {output_summary}")
497
+ print(f"Processing time: {processing_time:.2f} seconds")
498
  print(f"\nOutput files saved to: {args.output_dir}")
499
  print("Generated Files:", sorted(output_files))
500
  if log_files: print("Log Files:", sorted(log_files))
 
511
  elif args.task == 'deduplicate':
512
  print("--- Starting Duplicate Detection Workflow... ---")
513
  try:
514
+ from tools.find_duplicate_pages import run_duplicate_analysis
515
  if args.duplicate_type == 'pages':
516
  # Page duplicate detection
517
  if file_extension == '.csv':
518
  print("--- Detected OCR CSV file. Starting Page Duplicate Detection... ---")
519
 
520
+ start_time = time.time()
521
+
522
+ if args.combine_pages == True:
523
+ print("Combining pages...")
524
+ else:
525
+ print("Using line-level duplicate detection...")
526
+
 
527
  # Load the CSV file as a list for the duplicate analysis function
528
+ results_df, output_paths, full_data_by_file, processing_time, task_textbox = run_duplicate_analysis(
529
+ files=args.input_file,
530
  threshold=args.similarity_threshold,
531
  min_words=args.min_word_count,
532
  min_consecutive=args.min_consecutive_pages,
533
  greedy_match=args.greedy_match,
534
+ combine_pages=args.combine_pages,
535
+ output_folder=args.output_dir
536
  )
537
 
538
+ end_time = time.time()
539
+ processing_time = end_time - start_time
540
+
541
  print("\n--- Page Duplicate Detection Complete ---")
542
  print(f"Found {len(results_df)} duplicate matches")
543
  print(f"\nOutput files saved to: {args.output_dir}")
 
546
  else:
547
  print(f"Error: Page duplicate detection requires CSV files with OCR data.")
548
  print("Please provide a CSV file containing OCR output data.")
549
+
550
+ # Log usage data if logger is available
551
+ if usage_logger:
552
+ try:
553
+ # Extract file name for logging
554
+ print("Saving logs to CSV")
555
+ doc_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "document"
556
+ data_file_name = "" # Not applicable for PDF/image redaction
557
+
558
+ # Determine if this was a Textract API call
559
+ is_textract_call = False
560
+
561
+ # Page counts are not tracked for duplicate detection
562
+ total_pages = 0
563
+
564
+ # Count API calls (approximate - would need to be tracked in the redaction function)
565
+ textract_queries = 0
566
+ comprehend_queries = 0
567
+
568
+ # Format handwriting/signature options
569
+ handwriting_signature = ""
570
+
571
+ log_redaction_usage(
572
+ logger=usage_logger,
573
+ session_hash=session_hash,
574
+ doc_file_name=doc_file_name,
575
+ data_file_name=data_file_name,
576
+ time_taken=processing_time,
577
+ total_pages=total_pages,
578
+ textract_queries=textract_queries,
579
+ pii_method=args.pii_detector,
580
+ comprehend_queries=comprehend_queries,
581
+ cost_code=args.cost_code,
582
+ handwriting_signature=handwriting_signature,
583
+ text_extraction_method=args.ocr_method,
584
+ is_textract_call=is_textract_call,
585
+ task=args.task,
586
+ save_to_dynamodb=args.save_logs_to_dynamodb,
587
+ save_to_s3=args.upload_logs_to_s3,
588
+ s3_bucket=args.s3_bucket,
589
+ s3_key_prefix=args.s3_logs_prefix
590
+ )
591
+ except Exception as e:
592
+ print(f"Warning: Could not log usage data: {e}")
593
 
594
  elif args.duplicate_type == 'tabular':
595
  # Tabular duplicate detection
596
+ from tools.find_duplicate_tabular import run_tabular_duplicate_detection
597
  if file_extension in ['.csv', '.xlsx', '.xls', '.parquet']:
598
  print("--- Detected tabular file. Starting Tabular Duplicate Detection... ---")
599
+
600
+ start_time = time.time()
601
 
602
+ results_df, output_paths, full_data_by_file, processing_time, task_textbox = run_tabular_duplicate_detection(
603
+ files=args.input_file,
604
  threshold=args.similarity_threshold,
605
  min_words=args.min_word_count,
606
+ text_columns=args.text_columns,
607
+ output_folder=args.output_dir,
608
+ do_initial_clean_dup=args.do_initial_clean,
609
+ in_excel_tabular_sheets=args.excel_sheets,
610
+ remove_duplicate_rows=args.remove_duplicate_rows
611
  )
612
+
613
+ end_time = time.time()
614
+ processing_time = end_time - start_time
615
+
616
+ # Log usage data if logger is available
617
+ if usage_logger:
618
+ try:
619
+ # Extract file name for logging
620
+ print("Saving logs to CSV")
621
+ doc_file_name = ""
622
+ data_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "data_file"
623
+
624
+ # Determine if this was a Textract API call
625
+ is_textract_call = False
626
+
627
+ # Page counts are not applicable to tabular duplicate detection
628
+ total_pages = 0
629
+
630
+ # Count API calls (approximate - would need to be tracked in the redaction function)
631
+ textract_queries = 0
632
+ comprehend_queries = 0
633
+
634
+ # Format handwriting/signature options
635
+ handwriting_signature = ""
636
+
637
+ log_redaction_usage(
638
+ logger=usage_logger,
639
+ session_hash=session_hash,
640
+ doc_file_name=doc_file_name,
641
+ data_file_name=data_file_name,
642
+ time_taken=processing_time,
643
+ total_pages=total_pages,
644
+ textract_queries=textract_queries,
645
+ pii_method=args.pii_detector,
646
+ comprehend_queries=comprehend_queries,
647
+ cost_code=args.cost_code,
648
+ handwriting_signature=handwriting_signature,
649
+ text_extraction_method=args.ocr_method,
650
+ is_textract_call=is_textract_call,
651
+ task=args.task,
652
+ save_to_dynamodb=args.save_logs_to_dynamodb,
653
+ save_to_s3=args.upload_logs_to_s3,
654
+ s3_bucket=args.s3_bucket,
655
+ s3_key_prefix=args.s3_logs_prefix
656
+ )
657
+ except Exception as e:
658
+ print(f"Warning: Could not log usage data: {e}")
659
 
660
  print("\n--- Tabular Duplicate Detection Complete ---")
661
  print(f"Found {len(results_df)} duplicate matches")
 
671
 
672
  except Exception as e:
673
  print(f"\nAn error occurred during the duplicate detection workflow: {e}")
674
+
675
+ # Task 3: Textract Batch Operations
676
+ elif args.task == 'textract':
677
+ print("--- Starting Textract Batch Operations Workflow... ---")
678
+
679
+ if not args.textract_action:
680
+ print("Error: --textract_action is required for textract task.")
681
+ print("Valid options: 'submit', 'retrieve', or 'list'")
682
+ return
683
+
684
+ try:
685
+ if args.textract_action == 'submit':
686
+ from tools.textract_batch_call import analyse_document_with_textract_api, load_in_textract_job_details
687
+ # Submit document to Textract for analysis
688
+ if not args.input_file:
689
+ print("Error: --input_file is required for submit action.")
690
+ return
691
+
692
+ print(f"--- Submitting document to Textract: {args.input_file} ---")
693
+
694
+ start_time = time.time()
695
+
696
+ # Load existing job details
697
+ job_df = load_in_textract_job_details(load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, load_local_jobs_loc=args.local_textract_document_logs_subfolder)
698
+
699
+ # Determine signature extraction options
700
+ signature_options = ['Extract handwriting', 'Extract signatures'] if args.extract_signatures else ['Extract handwriting']
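+ # Handwriting analysis is always requested; signature analysis is added only when --extract_signatures is passed.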
701
+
702
+ # Use configured bucket or override
703
+ textract_bucket = args.textract_bucket if args.textract_bucket else ""
704
+
705
+ # Submit the job
706
+ result_message, job_id, job_type, successful_job_number, is_textract_call, total_pages, task_textbox = analyse_document_with_textract_api(
707
+ local_pdf_path=args.input_file,
708
+ s3_input_prefix=args.textract_input_prefix,
709
+ s3_output_prefix=args.textract_output_prefix,
710
+ job_df=job_df,
711
+ s3_bucket_name=textract_bucket,
712
+ general_s3_bucket_name=args.s3_bucket,
713
+ local_output_dir=args.output_dir,
714
+ analyse_signatures=signature_options,
715
+ aws_region=args.aws_region
716
+ )
717
+
718
+ end_time = time.time()
719
+ processing_time = end_time - start_time
720
+
721
+ print(f"\n--- Textract Job Submitted Successfully ---")
722
+ print(f"Job ID: {job_id}")
723
+ print(f"Job Type: {job_type}")
724
+ print(f"Message: {result_message}")
725
+ print(f"Results will be available in: {args.output_dir}")
726
+
727
+ # Log usage data if logger is available
728
+ if usage_logger:
729
+ try:
730
+ # Extract file name for logging
731
+ print("Saving logs to CSV")
732
+ doc_file_name = os.path.basename(args.input_file[0]) if args.display_file_names_in_logs else "document"
733
+ data_file_name = ""
734
+
735
+ # Determine if this was a Textract API call
736
+ is_textract_call = True
737
+ args.ocr_method == "AWS Textract"
738
+
739
+ # Count API calls (approximate - would need to be tracked in the redaction function)
740
+ textract_queries = total_pages
741
+ comprehend_queries = 0
742
+
743
+ # Format handwriting/signature options
744
+ handwriting_signature = ""
745
+
746
+ log_redaction_usage(
747
+ logger=usage_logger,
748
+ session_hash=session_hash,
749
+ doc_file_name=doc_file_name,
750
+ data_file_name=data_file_name,
751
+ time_taken=processing_time,
752
+ total_pages=total_pages,
753
+ textract_queries=textract_queries,
754
+ pii_method=args.pii_detector,
755
+ comprehend_queries=comprehend_queries,
756
+ cost_code=args.cost_code,
757
+ handwriting_signature=handwriting_signature,
758
+ text_extraction_method=args.ocr_method,
759
+ is_textract_call=is_textract_call,
760
+ task=args.task,
761
+ save_to_dynamodb=args.save_logs_to_dynamodb,
762
+ save_to_s3=args.upload_logs_to_s3,
763
+ s3_bucket=args.s3_bucket,
764
+ s3_key_prefix=args.s3_logs_prefix
765
+ )
766
+ except Exception as e:
767
+ print(f"Warning: Could not log usage data: {e}")
768
+
769
+ elif args.textract_action == 'retrieve':
770
+ print(f"--- Retrieving Textract results for Job ID: {args.job_id} ---")
771
+
772
+ from tools.textract_batch_call import poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details
773
+ # Retrieve results by job ID
774
+ if not args.job_id:
775
+ print("Error: --job_id is required for retrieve action.")
776
+ return
777
+
778
+ # Load existing job details to get job type
779
+ print("Loading existing job details...")
780
+ job_df = load_in_textract_job_details(load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, load_local_jobs_loc=args.local_textract_document_logs_subfolder)
781
+
782
+ # Find job type from the dataframe
783
+ job_type = "document_text_detection" # default
784
+ if not job_df.empty and "job_id" in job_df.columns:
785
+ matching_jobs = job_df.loc[job_df["job_id"] == args.job_id]
786
+ if not matching_jobs.empty and "job_type" in matching_jobs.columns:
787
+ job_type = matching_jobs.iloc[0]["job_type"]
788
+
789
+ # Use configured bucket or override
790
+ textract_bucket = args.textract_bucket if args.textract_bucket else ""
791
+
792
+ # Poll for completion and download results
793
+ print("Polling for completion and downloading results...")
794
+ downloaded_file_path, job_status, updated_job_df, output_filename = poll_whole_document_textract_analysis_progress_and_download(
795
+ job_id=args.job_id,
796
+ job_type_dropdown=job_type,
797
+ s3_output_prefix=args.textract_output_prefix,
798
+ pdf_filename="", # Will be determined from job details
799
+ job_df=job_df,
800
+ s3_bucket_name=textract_bucket,
801
+ load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
802
+ load_local_jobs_loc=args.local_textract_document_logs_subfolder,
803
+ local_output_dir=args.output_dir,
804
+ poll_interval_seconds=args.poll_interval,
805
+ max_polling_attempts=args.max_poll_attempts
806
+ )
807
+
808
+ print(f"\n--- Textract Results Retrieved Successfully ---")
809
+ print(f"Job Status: {job_status}")
810
+ print(f"Downloaded File: {downloaded_file_path}")
811
+ #print(f"Output Filename: {output_filename}")
812
+
813
+ elif args.textract_action == 'list':
814
+ from tools.textract_batch_call import load_in_textract_job_details
815
+ # List recent Textract jobs
816
+ print("--- Listing Recent Textract Jobs ---")
817
+
818
+ job_df = load_in_textract_job_details(load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, load_local_jobs_loc=args.local_textract_document_logs_subfolder)
819
+
820
+ if job_df.empty:
821
+ print("No recent Textract jobs found.")
822
+ else:
823
+ print(f"\nFound {len(job_df)} recent Textract jobs:")
824
+ print("-" * 80)
825
+ for _, job in job_df.iterrows():
826
+ print(f"Job ID: {job.get('job_id', 'N/A')}")
827
+ print(f"File: {job.get('file_name', 'N/A')}")
828
+ print(f"Type: {job.get('job_type', 'N/A')}")
829
+ print(f"Signatures: {job.get('signature_extraction', 'N/A')}")
830
+ print(f"Date: {job.get('job_date_time', 'N/A')}")
831
+ print("-" * 80)
832
+
833
+ else:
834
+ print(f"Error: Invalid textract_action '{args.textract_action}'.")
835
+ print("Valid options: 'submit', 'retrieve', or 'list'")
836
+
837
+ except Exception as e:
838
+ print(f"\nAn error occurred during the Textract workflow: {e}")
839
 
840
  else:
841
  print(f"Error: Invalid task '{args.task}'.")
842
+ print("Valid options: 'redact', 'deduplicate', or 'textract'")
843
 
844
  if __name__ == "__main__":
845
  main()
example_data/Bold minimalist professional cover letter.docx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c8551ac157f350b2093e5d8c89f68474f613350074201cff6d52d5ed5ec28ff
3
+ size 23992
example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv ADDED
The diff for this file is too large to render. See raw diff
 
example_data/Partnership-Agreement-Toolkit_0_0.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0db46a784d7aaafb8d02acf8686523dd376400117d07926a5dcb51ceb69e3236
3
+ size 426602
example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ another country or territory sign a formel agreement on behalf? of their communities endorsing a
2
+ soster citues international
example_data/combined_case_notes.csv ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Date,Social Worker,Client,Case Note
2
+ "January 3, 2023",Jane Smith,Alex D.,"Met with Alex at school following reports of increased absences and declining grades. Alex appeared sullen and avoided eye contact. When prompted about school, Alex expressed feelings of isolation and stated, ""No one gets me."" Scheduled a follow-up meeting to further explore these feelings."
3
+ "January 17, 2023",Jane Smith,Alex D.,"Met with Alex at the community center. Alex displayed sudden outbursts of anger when discussing home life, particularly in relation to a new stepfather. Alex mentioned occasional substance use, but did not specify which substances. Recommended a comprehensive assessment."
4
+ "February 5, 2023",Jane Smith,Alex D.,Home visit conducted. Alex's mother reported frequent arguments at home. She expressed concerns about Alex's new group of friends and late-night outings. Noted potential signs of substance abuse. Suggested family counseling.
5
+ "February 21, 2023",Jane Smith,Alex D.,"Met with Alex alone at my office. Alex appeared more agitated than in previous meetings. There were visible signs of self-harm on Alex's arms. When questioned, Alex became defensive. Immediate referral made to a mental health professional."
6
+ "March 10, 2023",Jane Smith,Alex D.,Attended joint session with Alex and a therapist. Alex shared feelings of hopelessness and admitted to occasional thoughts of self-harm. Therapist recommended a comprehensive mental health evaluation and ongoing therapy.
7
+ "March 25, 2023",Jane Smith,Alex D.,"Received a call from Alex's school about a physical altercation with another student. Met with Alex, who displayed high levels of frustration and admitted to the use of alcohol. Discussed the importance of seeking help and finding positive coping mechanisms. Recommended enrollment in an anger management program."
8
+ "April 15, 2023",Jane Smith,Alex D.,Met with Alex and mother to discuss progress. Alex's mother expressed concerns about Alex's increasing aggression at home. Alex acknowledged the issues but blamed others for provoking the behavior. It was decided that a more intensive intervention may be needed.
9
+ "April 30, 2023",Jane Smith,Alex D.,"Met with Alex and a psychiatrist. Psychiatrist diagnosed Alex with Oppositional Defiant Disorder (ODD) and co-morbid substance use disorder. A treatment plan was discussed, including medication, therapy, and family counseling."
10
+ "May 20, 2023",Jane Smith,Alex D.,"Met with Alex to discuss progress. Alex has started attending group therapy and has shown slight improvements in behavior. Still, concerns remain about substance use. Discussed potential for a short-term residential treatment program."
11
+ "January 3, 2023",Jane Smith,Jamie L.,"Met with Jamie at school after receiving reports of consistent tardiness and decreased participation in class. Jamie appeared withdrawn and exhibited signs of sadness. When asked about feelings, Jamie expressed feeling ""empty"" and ""hopeless"" at times. Scheduled a follow-up meeting to further explore these feelings."
12
+ "January 17, 2023",Jane Smith,Jamie L.,"Met with Jamie at the community center. Jamie shared feelings of low self-worth, mentioning that it's hard to find motivation for daily tasks. Discussed potential triggers and learned about recent family financial struggles. Recommended counseling and possible group therapy for peer support."
13
+ "February 5, 2023",Jane Smith,Jamie L.,Home visit conducted. Jamie's parents shared concerns about Jamie's increasing withdrawal from family activities and lack of interest in hobbies. Parents mentioned that Jamie spends a lot of time alone in the room. Suggested family therapy to open communication channels.
14
+ "February 21, 2023",Jane Smith,Jamie L.,Met with Jamie in my office. Jamie opened up about feelings of isolation and mentioned difficulty sleeping. No signs of self-harm or suicidal ideation were noted. Recommended a comprehensive mental health assessment to better understand the depth of the depression.
15
+ "March 10, 2023",Jane Smith,Jamie L.,"Attended a joint session with Jamie and a therapist. The therapist noted signs of moderate depression. Together, we discussed coping strategies and potential interventions. Jamie showed interest in art therapy."
16
+ "March 25, 2023",Jane Smith,Jamie L.,"Received feedback from Jamie's school that academic performance has slightly improved. However, social interactions remain limited. Encouraged Jamie to join school clubs or groups to foster connection."
17
+ "April 15, 2023",Jane Smith,Jamie L.,"Met with Jamie and parents to discuss progress. Parents have observed slight improvements in mood on some days, but overall, Jamie still appears to struggle. It was decided to explore medication as a potential aid alongside therapy."
18
+ "April 30, 2023",Jane Smith,Jamie L.,Met with Jamie and a psychiatrist. The psychiatrist diagnosed Jamie with Major Depressive Disorder (MDD) and suggested considering antidepressant medication. Discussed the potential benefits and side effects. Jamie and parents will think it over.
19
+ "May 20, 2023",Jane Smith,Jamie L.,"Jamie has started on a low dose of an antidepressant. Initial feedback is positive, with some improvement in mood and energy levels. Will continue monitoring and adjusting as necessary."
example_data/doubled_output_joined.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eeac353164447c2aa429196e1a6ffae4c095d7171e63c2d1cd1966fdf32d1ed
3
+ size 1274719
example_data/example_complaint_letter.jpg ADDED

Git LFS Details

  • SHA256: db33b67ebe685132a589593e4a3ca05f2dbce358b63de9142c2f2a36202e3f15
  • Pointer size: 131 Bytes
  • Size of remote file: 118 kB
example_data/example_of_emails_sent_to_a_professor_before_applying.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed0cd82b5b5826b851ca0e7c102d2d4d27580f7a90de4211a33178a6664d008d
3
+ size 8848
example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ page,text,left,top,width,height
2
+ 1,Partnership Agreement,0.516078,0.027879,0.440784,0.032424
3
+ 1,SisterCities,0.169804,0.033333,0.238431,0.028182
4
+ 1,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788
5
+ 1,Toolkit,0.830588,0.07303,0.126667,0.025152
6
+ 1,Connect globally. Thrive locally.,0.169804,0.08697,0.238824,0.01303
7
+ 1,Types of Affiliations,0.117255,0.157576,0.241961,0.02
8
+ 1,Sister City Relationship,0.117647,0.187273,0.196863,0.013939
9
+ 1,"A Sister City relationship is formed when the mayor or highest elected official (or, if elections",0.117255,0.211212,0.738824,0.013636
10
+ 1,"do not take place, highest appointed official) from a U.S. community and a community in",0.117647,0.227273,0.70902,0.013939
11
+ 1,another country or territory sign a formal agreement on behalf of their communities endorsing a,0.117647,0.243636,0.761961,0.013636
12
+ 1,"""sister city/sister cities"" relationship. Sister city agreements shall be considered active/valid",0.118039,0.259697,0.731373,0.013939
13
+ 1,unless otherwise indicated by one or both of the respective communities.,0.118039,0.276061,0.58549,0.013636
14
+ 1,Sister Cities International shall formally recognize only those relationships by cities/members in,0.118039,0.299697,0.758824,0.013636
15
+ 1,good standing (i.e. who are current on membership dues) in its Membership Directory or on its,0.117647,0.316061,0.754902,0.013636
16
+ 1,"website. However, Sister Cities International shall not assert as invalid or otherwise impugn the",0.116863,0.332121,0.760784,0.013636
17
+ 1,legitimacy of those relationships formed by non-members.,0.118039,0.348485,0.466275,0.013636
18
+ 1,Friendship City,0.118039,0.372121,0.127059,0.013939
19
+ 1,"A Friendship City or Friendship Cities relationship is often formed by cities as a ""stepping",0.117255,0.395758,0.714118,0.013636
20
+ 1,"stone"" to a more formal ""Sister City"" agreement. Typically Friendship City agreements are",0.117647,0.411515,0.720392,0.014242
21
+ 1,referred to as such in the formal documents that are signed. Sister Cities International shall,0.118039,0.428182,0.72549,0.013636
22
+ 1,recognize Friendship City relationships by members in its Membership Directory and website.,0.118039,0.444242,0.747843,0.013636
23
+ 1,As per Sister Cities International Board of Directors:,0.117255,0.467879,0.413333,0.013636
24
+ 1,Sister Cities International will recognize a new sister cities affiliation between a,0.169412,0.492121,0.626667,0.013333
25
+ 1,"U.S. and an international community, even though another affiliation may exist",0.169412,0.507879,0.625098,0.013636
26
+ 1,"between that international community and a different U.S. community, only if a",0.169412,0.524545,0.62902,0.013636
27
+ 1,cooperative agreement among all involved communities is filed with Sister Cities,0.16902,0.540606,0.643137,0.013636
28
+ 1,"International. If a cooperative agreement is denied, or no response to the request",0.170196,0.556667,0.647843,0.013333
29
+ 1,"is received within a reasonable amount of time, Sister Cities International will",0.169412,0.57303,0.612157,0.012727
30
+ 1,recognize the partnership as a friendship city and it will be delineated as such,0.169412,0.589091,0.621176,0.013636
31
+ 1,with a symbol in the membership directories.,0.168627,0.605455,0.358824,0.013333
32
+ 1,The cooperative agreement must be sent by the Mayor/County,0.168627,0.628788,0.509412,0.013939
33
+ 1,"Executive/Governor of the requesting community, and must be sent to the",0.169804,0.645152,0.595294,0.014242
34
+ 1,Mayor/County Executive/Governor of each of the existing partnership,0.169804,0.661212,0.555294,0.013636
35
+ 1,communities. Although the Mayor/County Executive/Governor may request input,0.16902,0.677879,0.647451,0.013636
36
+ 1,"from, or may be given input by, the sister cities program, it is up to the discretion",0.168627,0.693939,0.647059,0.013939
37
+ 1,of the Mayor/County Executive/Governor to sign the cooperative agreement.,0.16902,0.709697,0.612941,0.013939
38
+ 1,Although Sister Cities International will help with the cooperative agreement,0.168627,0.726364,0.605882,0.013636
39
+ 1,"process, it is up to the requesting community to get the agreement signed. Sister",0.169412,0.742121,0.650196,0.013939
40
+ 1,"Cities International will not, in any way, force a community to ""share"" and sign",0.16902,0.758182,0.623922,0.014242
41
+ 1,the cooperative agreement.,0.168627,0.774848,0.219216,0.013333
42
+ 1,"To place a relationship into Emeritus status, the mayor or highest elected official of the U.S.",0.117255,0.798485,0.736471,0.013939
43
+ 1,community must write a letter to the mayor of the foreign city indicating that they wish to,0.118039,0.814545,0.70902,0.013636
44
+ 1,"remain sister cities, but understand that the relationship will remain inactive until such time as",0.118039,0.831212,0.747451,0.013333
45
+ 1,both cities are able to sustain an active relationship. Sister Cities International should be,0.118039,0.847273,0.705098,0.013636
46
+ 1,informed in writing by the mayor of the U.S. city of the situation. Sister Cities International will,0.118039,0.863333,0.746275,0.013636
47
+ 2,Partnership Agreement,0.516078,0.027879,0.440784,0.032424
48
+ 2,SisterCities,0.169804,0.033333,0.238824,0.028182
49
+ 2,INTERNATIONAL,0.170196,0.06697,0.237647,0.008788
50
+ 2,Toolkit,0.83098,0.072727,0.127059,0.025455
51
+ 2,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303
52
+ 2,then place the partnership into Emeritus Status and will reflect this status in directories and all,0.117255,0.132424,0.751373,0.013333
53
+ 2,lists of sister city programs.,0.118039,0.148788,0.218431,0.013333
54
+ 2,"If a community wishes to terminate a sister city relationship, then a letter from the mayor or",0.118431,0.172424,0.732549,0.013333
55
+ 2,highest elected official of the U.S. city should be sent to the mayor of the sister city. Sister,0.118039,0.188485,0.721569,0.013636
56
+ 2,Cities International should be informed of this action in writing by the mayor of the U.S. city,0.118039,0.204848,0.72902,0.013333
57
+ 2,and Sister Cities International will then remove the partnership from its directories and all lists,0.117647,0.221212,0.746275,0.013333
58
+ 2,of sister city programs. We do not recommend terminating a relationship simply because it is,0.117647,0.237273,0.743529,0.013333
59
+ 2,"dormant. Many partnerships wax and wane over the years, and in many cases a dormant",0.117647,0.253939,0.713333,0.013333
60
+ 2,partnership may be reinvigorated by local members years after it has been inactive.,0.118039,0.269697,0.664314,0.013636
61
+ 2,General Guidelines,0.118039,0.295152,0.231765,0.016061
62
+ 2,In order for a sister city/county/state partnership to be recognized by Sister Cities International,0.118431,0.324242,0.754902,0.013636
63
+ 2,"(SCI), the two communities must sign formal documents which clearly endorse the link. This",0.118039,0.340606,0.74,0.013636
64
+ 2,presumes several key items: that the U.S. community is already a member of SCI and has,0.118039,0.35697,0.718039,0.013636
65
+ 2,followed proper procedures (e.g. passed a city council resolution declaring the intent to twin,0.117255,0.373333,0.737647,0.013636
66
+ 2,with the specific city); that both communities share a mutual commitment to the relationship;,0.117255,0.389394,0.740784,0.013636
67
+ 2,and that both have secured the necessary support structure to build a lasting relationship. You,0.117647,0.405455,0.758039,0.013333
68
+ 2,should check with your local sister city program to see if they have any additional requirements,0.117647,0.421818,0.760784,0.013636
69
+ 2,before pursuing a sister city relationship.,0.118039,0.437879,0.323137,0.013636
70
+ 2,"SCI often refers to these agreements as a ""Sister City Agreement"" or ""Memorandum of",0.118039,0.461515,0.696863,0.013939
71
+ 2,"Understanding."" However, as the following examples show, the actual name and format of",0.118039,0.477576,0.729804,0.013636
72
+ 2,your documents is left up to you.,0.117255,0.494242,0.262745,0.013636
73
+ 2,A few things to keep in mind as you draft your agreement:,0.117255,0.517879,0.463137,0.013636
74
+ 2,"Your agreement can range from the ceremonial, with language focusing on each city's",0.176471,0.542121,0.69098,0.013939
75
+ 2,"commitment to fostering understanding, cooperation, and mutual benefit to the precise,",0.176471,0.558485,0.701961,0.013333
76
+ 2,"with particular areas of interest, specific programs/activities, or more concrete goals",0.176078,0.574848,0.673725,0.013636
77
+ 2,related to anything from numbers of exchanges to economic development.,0.176863,0.591212,0.596863,0.013636
78
+ 2,"Don't try to include everything you plan to do. Some specifics, like particular areas of",0.177255,0.620303,0.681176,0.013939
79
+ 2,"interest or participating institutions are good to include. However, there's no need to",0.176471,0.636667,0.675686,0.013636
80
+ 2,include all the programs you plan to do if it makes the document too lengthy or limits,0.176863,0.652727,0.678824,0.013939
81
+ 2,the scope of projects. This is a formal document to establish the relationship; specific,0.176078,0.668788,0.684706,0.013636
82
+ 2,"tasks, responsibilities, or other nuts-and-bolts text related to implementation or",0.176078,0.685455,0.635686,0.013333
83
+ 2,administration of the partnership can be expressed more fully in a separate,0.176471,0.701212,0.600392,0.013636
84
+ 2,memorandum between the respective sister city committees. Your partnership,0.177255,0.717576,0.626667,0.013636
85
+ 2,agreement is a historical document and should not be dated or limited by being aligned,0.176471,0.733636,0.699216,0.013636
86
+ 2,with very specific tasks.,0.176078,0.750606,0.190196,0.013333
87
+ 2,Work with your counterparts. Remember that this is signed by both cities. You should,0.176078,0.779697,0.68549,0.013636
88
+ 2,share drafts of your agreement with your international partners and solicit feedback on,0.176471,0.795758,0.691765,0.013333
89
+ 2,what they'd like to see in the agreement. Be flexible to cultural or municipal priorities.,0.176471,0.811818,0.679216,0.013939
90
+ 2,Ask your counterparts to translate the agreement if it is drafted in English. It is,0.176078,0.841515,0.623137,0.013636
91
+ 2,important for the citizens of your partner community to be able to read and understand,0.176863,0.857576,0.693725,0.013939
92
+ 2,the commitment their city has made. Have someone in your own community who,0.176078,0.873939,0.649804,0.013636
93
+ 3,Partnership Agreement,0.516078,0.027879,0.441176,0.032121
94
+ 3,SisterCities,0.169804,0.033333,0.239216,0.028182
95
+ 3,INTERNATIONAL,0.170196,0.06697,0.237255,0.008788
96
+ 3,Toolkit,0.83098,0.07303,0.126667,0.025152
97
+ 3,Connect globally. Thrive locally.,0.169804,0.08697,0.239216,0.01303
98
+ 3,speaks that language check the foreign-language version to make sure it mirrors what,0.176471,0.132424,0.688235,0.013333
99
+ 3,you have in your own agreement.,0.176471,0.148788,0.264706,0.013333
100
+ 3,Keep it to one page. Ceremonial documents such as these partnership agreements,0.176863,0.178485,0.66549,0.013636
101
+ 3,work best if they can be posted in their entirety.,0.176078,0.194545,0.380392,0.013636
102
+ 3,Most sister city agreements include some acknowledgement of the founding principles,0.177255,0.224242,0.694902,0.013636
103
+ 3,"of the sister city movement- to promote peace through mutual respect, understanding,",0.176471,0.240303,0.698431,0.013333
104
+ 3,and cooperation.,0.176471,0.25697,0.13451,0.013333
105
+ 3,Consider using official letterhead and/or other embellishments such as city seals or,0.176863,0.286061,0.665882,0.013333
106
+ 3,logos to reflect your enhance the document. Sister city agreements are often posted at,0.176863,0.302121,0.695686,0.013636
107
+ 3,city hall or other municipal offices and should reflect their historical importance,0.176471,0.318485,0.630588,0.013333
108
+ 3,Look at other agreements your city has signed. These agreements may give you an idea,0.177255,0.347879,0.705098,0.013636
109
+ 3,"of what is acceptable or possible, and they may be in an easily replicable format. If you",0.176471,0.364242,0.695686,0.013636
110
+ 3,"cannot access older agreements please contact Sister Cities International, we may",0.176863,0.380303,0.663137,0.013636
111
+ 3,"have them on file, although we do not have copies of all partnership agreements.",0.176863,0.396667,0.64549,0.013636
112
+ 3,Documents must be signed by the top elected official of both communities.,0.177255,0.426364,0.601569,0.013333
113
+ 3,"Check with your mayor, city council, town clerk, et al. to make sure that the agreement",0.176863,0.455758,0.694118,0.013636
114
+ 3,"is OK with them. The mayor is the one putting his or her name on the paper, and you",0.176863,0.471818,0.677255,0.013333
115
+ 3,don't want to spend time developing an agreement which will never be signed.,0.176863,0.488182,0.629412,0.013636
116
+ 3,Official documents are usually signed during a formal ceremony recognizing the,0.176863,0.517576,0.638431,0.013636
117
+ 3,partnership. Be sure both communities receive a signed set of the official documents,0.177255,0.533939,0.683922,0.013636
118
+ 3,for their records.,0.176078,0.550606,0.131373,0.010606
119
+ 3,Remember to send your signed agreement to Sister Cities International. After we,0.177255,0.579697,0.645098,0.013636
120
+ 3,receive your agreement we will post the relationship in the City Directory and make sure,0.176863,0.595758,0.703137,0.013636
121
+ 3,it is included in our Annual Membership Directory.,0.176863,0.612121,0.398039,0.013333
122
+ 3,Remember that each city's sister city program is independent and can impose requirements,0.118431,0.640606,0.736471,0.013939
123
+ 3,"like the establishment of a committee, a review period, sustainability/funding plan, among",0.118039,0.65697,0.715686,0.013636
124
+ 3,"others, before sanctioning a sister city agreement. Check with your local program or mayor's",0.117647,0.672727,0.743529,0.014242
125
+ 3,office to see if this is the case.,0.117647,0.689091,0.241176,0.011515
126
+ 3,On the following pages you'll find a series of partnership agreements to give you an idea of,0.118039,0.717879,0.728627,0.013939
127
+ 3,"what is possible. While you should feel free to use some of the formatting and language, we",0.117255,0.734242,0.73451,0.013636
128
+ 3,encourage you to make your agreement your own and be creative with what you produce. If,0.117647,0.750606,0.737647,0.013636
129
+ 3,you are unsure about your agreement or want advice you can always solicit feedback by,0.117647,0.766667,0.708627,0.013636
130
+ 3,sending it to our Membership Director at [email protected] or contacting us at (202),0.117647,0.782727,0.732157,0.013636
131
+ 3,347-8630.,0.117647,0.799394,0.080392,0.010303
132
+ 4,Partnership Agreement,0.516471,0.027879,0.440784,0.032727
133
+ 4,SisterCities,0.169412,0.033333,0.239608,0.028485
134
+ 4,INTERNATIONAL,0.170196,0.066667,0.238431,0.009091
135
+ 4,Toolkit,0.830588,0.072727,0.127843,0.025758
136
+ 4,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333
137
+ 4,"jull bubzig 2000 3,312",0.378039,0.291212,0.32549,0.019394
138
+ 4,ABU DHABI MUNICIPALITY & TOWN PLANNING,0.376471,0.316667,0.327451,0.016667
139
+ 4,AN AGREEMENT FOR THE ESTABLISHMENT OF,0.260784,0.373636,0.52549,0.012727
140
+ 4,SISTER CITIES RELATIONSHIP,0.337647,0.393636,0.342745,0.012121
141
+ 4,BETWEEN,0.454902,0.413636,0.110588,0.011212
142
+ 4,THE CITY OF ABU DHABI ( U. A.E),0.337255,0.432727,0.375686,0.013939
143
+ 4,AND,0.487843,0.452727,0.048235,0.011212
144
+ 4,"HOUSTON, TEXAS ( U.S.A)",0.385882,0.471515,0.298039,0.014848
145
+ 4,"The Sister City Program, administered by Sister Cities International, was initiated",0.221961,0.525455,0.597255,0.01303
146
+ 4,By the President of the United States of America in 1956 to encourage greater,0.222745,0.539394,0.561961,0.012727
147
+ 4,Friendship and understanding between the United States and other nations through,0.222745,0.553333,0.608235,0.012727
148
+ 4,Direct personal contact: and,0.222745,0.567576,0.20549,0.012424
149
+ 4,"In order to foster those goals, the people of Abu Dhabi and Houston, in a gesture of",0.222353,0.594242,0.603529,0.012424
150
+ 4,"Friendship and goodwill, agree to collaborate for the mutual benefit of their",0.222745,0.608182,0.547843,0.01303
151
+ 4,"Communities by exploring education, economic and cultural opportunities.",0.222353,0.622121,0.541961,0.012121
152
+ 4,"Abu Dhabi and Houston, sharing a common interest in energy, technology and",0.221569,0.648788,0.574118,0.012424
153
+ 4,"medicine, and the desire to promote mutual understanding among our citizens do",0.222353,0.66303,0.588235,0.012121
154
+ 4,"hereby proclaim themselves Sister Cities beginning on the 13th day of March 2001,",0.221961,0.673636,0.594118,0.015758
155
+ 4,the date of Houston City Council resolution estatblishing the Sister City,0.221961,0.690303,0.519608,0.01303
156
+ 4,relationship became effective.,0.221569,0.705152,0.217647,0.012424
157
+ 4,"Signed on this 26 of October 2002, in duplicate in the Arabic and English",0.221569,0.732121,0.533333,0.01303
158
+ 4,"Languages, both text being equally authentic.",0.221961,0.746667,0.328627,0.012727
159
+ 4,A,0.344314,0.768485,0.084706,0.030303
160
+ 4,Sheikh Mohammed bin Butti AI Hamed,0.245882,0.806364,0.366275,0.010909
161
+ 4,Lee P.Brown,0.729412,0.806364,0.118824,0.010303
162
+ 4,Mayor of Houston,0.704706,0.823333,0.166667,0.012424
163
+ 4,Chairman of Abu Dhabi Municipality,0.24549,0.823636,0.342353,0.012727
164
+ 4,&Town Planning,0.324314,0.841212,0.155686,0.012424
165
+ 5,Partnership Agreement,0.516078,0.027879,0.441176,0.032424
166
+ 5,SisterCities,0.169412,0.033333,0.239608,0.028485
167
+ 5,INTERNATIONAL,0.17098,0.066667,0.237255,0.009091
168
+ 5,Toolkit,0.83098,0.072727,0.127059,0.025758
169
+ 5,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333
170
+ 5,THE CITY OF NEW YORK,0.438824,0.262121,0.240784,0.009697
171
+ 5,OFFICE OF THE MAYOR,0.450196,0.27697,0.220392,0.009697
172
+ 5,"NEW YORK, N.Y. 10007",0.461176,0.29303,0.196863,0.010303
173
+ 5,THE NEW YORK CITY-LONDON SISTER CITY PARTNERSHIP,0.267451,0.355758,0.582745,0.011818
174
+ 5,Memorandum of Understanding,0.420392,0.371212,0.274902,0.013333
175
+ 5,The Sister City partnership between New York City and London will foster mutually,0.201176,0.402121,0.674118,0.014242
176
+ 5,beneficial solutions to common challenges for these two great cosmopolitan entities.,0.201176,0.417273,0.66902,0.013636
177
+ 5,"Consequently, the Sister City relationship between the two will be one of the most",0.201176,0.432727,0.652549,0.015152
178
+ 5,"important in their network of global partnerships, as it strives to:",0.201176,0.448182,0.50902,0.015455
179
+ 5,Encourage and publicize existing exchanges between London and New York City so,0.230588,0.480303,0.671373,0.015152
180
+ 5,that they can flourish to benefit a wider cross-section of the citizens of both;,0.230588,0.496061,0.602353,0.015152
181
+ 5,"Support and promote the development of new social, economic, academic and",0.230196,0.512424,0.618431,0.015455
182
+ 5,community programs to encourage both cities' citizens to share their experiences as a,0.229804,0.527879,0.678039,0.014848
183
+ 5,medium for learning from one another;,0.229804,0.543636,0.309412,0.013939
184
+ 5,Generate an improvement of the operation of the cities' various government agencies,0.229804,0.56,0.676078,0.014545
185
+ 5,by serving as a conduit of information;,0.22902,0.575758,0.307843,0.014848
186
+ 5,"Identify themes, common to both, that can generate new initiatives to further and",0.229412,0.591818,0.640784,0.015152
187
+ 5,"nurture the increasingly powerful financial, social and cultural relationships between",0.22902,0.607576,0.671373,0.014242
188
+ 5,the cities;,0.22902,0.624545,0.076471,0.012424
189
+ 5,Promote key mayoral priorities relevant to both London and New York City;,0.228627,0.639394,0.608627,0.015152
190
+ 5,Provide financial or in kind support to community-led programs that advance the,0.228627,0.656061,0.641569,0.013636
191
+ 5,aims of the Sister City partnership;,0.22902,0.672121,0.275294,0.013636
192
+ 5,"With the above purposes in mind, the Mayor of the City of New York and the Mayor of",0.198824,0.702424,0.697647,0.014848
193
+ 5,London solemnly confirm that these two cities are united by an official partnership by the,0.198824,0.718182,0.710196,0.014545
194
+ 5,protocol of this Memorandum of Understanding.,0.198431,0.733939,0.384314,0.015152
195
+ 5,This agreement will go into effect from the date of signatures.,0.310196,0.780606,0.488235,0.014545
196
+ 5,Thedder Rudolph W. Giuliani,0.178824,0.795455,0.244314,0.100909
197
+ 5,Signed in March of 2001,0.455686,0.796364,0.19451,0.013636
198
+ 5,Ken Mayor Livingstone,0.672157,0.877576,0.132941,0.029091
199
+ 5,Mayor,0.311373,0.894848,0.053333,0.012727
200
+ 5,New York City,0.287843,0.909091,0.121176,0.013333
201
+ 5,London,0.701961,0.909091,0.061569,0.010606
202
+ 6,Partnership Agreement,0.515686,0.027576,0.441961,0.03303
203
+ 6,SisterCities,0.169412,0.03303,0.24,0.028182
204
+ 6,INTERNATIONAL,0.169804,0.066667,0.238431,0.009091
205
+ 6,Toolkit,0.83098,0.072727,0.127451,0.025758
206
+ 6,Connect globally. Thrive locally.,0.169412,0.08697,0.239608,0.013333
207
+ 6,CHIC OF STATE,0.247451,0.190606,0.141961,0.036364
208
+ 6,City of Long Beach,0.388627,0.196667,0.476471,0.066364
209
+ 6,California,0.551373,0.257273,0.136471,0.033333
210
+ 6,Sister City Agreement,0.321961,0.305455,0.378431,0.035152
211
+ 6,between the,0.464706,0.352727,0.084314,0.009697
212
+ 6,City of Long Beach,0.38,0.378485,0.252549,0.01697
213
+ 6,"California, USA",0.4,0.397576,0.21098,0.016061
214
+ 6,and the,0.48,0.415152,0.053333,0.009091
215
+ 6,City of San Pablo de Manta,0.321569,0.428788,0.369804,0.01697
216
+ 6,"Ecuador, South America",0.347451,0.447879,0.317255,0.015152
217
+ 6,"In accordance with the authorization and approval expressed by the City of Long Beach,",0.261569,0.482121,0.536863,0.012121
218
+ 6,"California, USA, and the City of San Pablo de Manta, Ecundor, South America, it is declared",0.217647,0.492727,0.581176,0.01303
219
+ 6,"that a ""Sister City Agreement between the two cities is hereby established for the following",0.217647,0.502727,0.581569,0.012121
220
+ 6,purposes:,0.216863,0.516061,0.058039,0.009394
221
+ 6,(1) to promote and expand the effective and mutually beneficial cooperation between,0.278824,0.532727,0.520392,0.012424
222
+ 6,the people of Long Beach and the people of San Pablo de Manta; and,0.218039,0.543636,0.40549,0.012424
223
+ 6,"(2) to promote international goodwill, understanding, and expanded business",0.279216,0.56303,0.520784,0.012424
224
+ 6,"relations between the two cities and their respective nations by the exchange of people, ideas, and",0.218039,0.573636,0.581569,0.012121
225
+ 6,"information in a unide variety of economic, social, cultural, municipal, environmental,",0.218039,0.584242,0.581176,0.012121
226
+ 6,"professional, technical, youth, and other endeavors; and",0.217647,0.594848,0.333333,0.012121
227
+ 6,"(3) to foster and encourage charitable, scientific, trade and commerce, literary and",0.279608,0.613939,0.520784,0.012727
228
+ 6,educational activities between the two cities;,0.218039,0.625455,0.265882,0.009697
229
+ 6,This Sister City Agreement shall be officially established and shall become effective when,0.263137,0.644545,0.536863,0.012727
230
+ 6,"this document has been duly executed by the Mayor of Long Beach, California, USA, and the",0.218824,0.654848,0.581961,0.012424
231
+ 6,"Mayor of San Pablo de Manta, Ecundor, South America.",0.218431,0.665758,0.338824,0.012121
232
+ 6,STATE OFFICE,0.276471,0.713636,0.050588,0.048788
233
+ 6,Beverly 0 Neill,0.587451,0.736667,0.121961,0.013636
234
+ 6,"Mayor, City of Long Beach",0.542353,0.751212,0.21098,0.013636
235
+ 6,"California, USA",0.582745,0.765758,0.125098,0.01303
236
+ 6,10.2aulus,0.490588,0.771818,0.220392,0.062424
237
+ 6,Ing. Jorge O. Zambrano Cedeño,0.527059,0.825152,0.242745,0.013333
238
+ 6,"Mayor, City of San Pablo de Manta",0.505098,0.839394,0.277647,0.013636
239
+ 6,"Ecuador, South America",0.551765,0.854242,0.188235,0.011818
240
+ 6,"Dated: September 19, 2000",0.544706,0.883333,0.202745,0.01303
241
+ 7,Partnership Agreement,0.516078,0.027879,0.441176,0.032424
242
+ 7,SisterCities,0.169412,0.03303,0.24,0.028485
243
+ 7,INTERNATIONAL,0.170196,0.066667,0.237647,0.009091
244
+ 7,Toolkit,0.83098,0.072727,0.127451,0.025758
245
+ 7,Connect globally. Thrive locally.,0.169412,0.08697,0.239216,0.013333
246
+ 7,REAFFIRMATION OF SISTER CITIES DECLARATION,0.324706,0.165152,0.483529,0.013939
247
+ 7,adopted by,0.2,0.213333,0.080392,0.013636
248
+ 7,THE HONORABLE RICHARD M. DALEY,0.396078,0.214242,0.335686,0.012424
249
+ 7,MAYOR OF CHICAGO,0.472549,0.231212,0.18549,0.011515
250
+ 7,and,0.199608,0.260909,0.026275,0.010606
251
+ 7,THE HONORABLE ZHANG RONGMAO,0.401961,0.261212,0.323137,0.011212
252
+ 7,MAYOR OF SHENYANG,0.463529,0.273636,0.202353,0.011212
253
+ 7,ON,0.551765,0.298182,0.026667,0.011515
254
+ 7,"JUNE 5, 1995",0.500392,0.323636,0.128235,0.014848
255
+ 7,"On this the tenth anniversary of the signing of a sister city agreement, in order to further",0.255686,0.36303,0.67098,0.015152
256
+ 7,the traditional links of friendship between Chicago and Shenyang and to reaffirm their mutual,0.198824,0.378788,0.727843,0.015455
257
+ 7,"aspiration to work in unison for the benefit of their cities and nations, the Honorable Mayor",0.199608,0.394848,0.727843,0.014848
258
+ 7,"Richard M. Daley, Mayor of the City of Chicago, and the Honorable Zhang Rongmao, Mayor",0.199216,0.411212,0.727451,0.014242
259
+ 7,"of the City of Shenyang, on this fifth day of June 1995, do hereby acknowledge and reaffirm the",0.199216,0.42697,0.72549,0.014848
260
+ 7,sister cities agreement between the City of Chicago and the City of Shenyang.,0.199608,0.443636,0.57451,0.014242
261
+ 7,"The City of Chicago and the City of Shenyang on the basis of friendly cooperation,",0.256078,0.473939,0.665098,0.015152
262
+ 7,equality and mutual benefit will continue to develop a sister cities relationship to promote and,0.2,0.490303,0.724706,0.014242
263
+ 7,broaden economic cooperation and cultural exchanges between the two cities.,0.199216,0.506061,0.57451,0.014242
264
+ 7,The two cities do hereby declare their interest in exploring the establishment of business,0.255294,0.537273,0.668235,0.015455
265
+ 7,and trade relations between Chicago and Shenyang.,0.198824,0.554545,0.387843,0.013636
266
+ 7,"In addition, exchanges will be promoted in the area of the arts such as exhibits, music,",0.254118,0.583939,0.666667,0.015455
267
+ 7,dance and other cultural activities.,0.198431,0.601212,0.256471,0.010606
268
+ 7,"In addition, exchanges will be promoted in education and the establishment of contacts",0.254118,0.630303,0.668627,0.015758
269
+ 7,within educational institutions encouraged.,0.198824,0.647273,0.32,0.014242
270
+ 7,"In addition, we declare our intention to promote exchanges in such fields as science and",0.253725,0.678182,0.668627,0.014848
271
+ 7,"technology, sports, health, youth and any areas that will contribute to the prosperity and the",0.198039,0.693636,0.722745,0.015152
272
+ 7,further development of friendship between the people of our two cities.,0.194902,0.711515,0.525098,0.013636
273
+ 7,3h.5.,0.593725,0.750606,0.218039,0.06303
274
+ 7,THE HONORABLE ZHANG RONGMAO,0.588627,0.819394,0.287843,0.011818
275
+ 7,THE HONORABLE RICHARD M. DALEY,0.197255,0.821515,0.303529,0.010606
276
+ 7,MAYOR OF SHENYANG,0.587451,0.835455,0.177647,0.010303
277
+ 7,MAYOR OF CHICAGO,0.195686,0.835758,0.164706,0.010606
example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file.csv ADDED
@@ -0,0 +1,77 @@
1
+ image,page,label,color,xmin,ymin,xmax,ymax,id,text
2
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.598431,0.524545,0.63098,0.535455,EG3nykuwvxbk,U.S.
3
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.820392,0.798485,0.854118,0.809394,jy1R42e6phNz,U.S.
4
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_0.png,1,ADDRESS,"(0, 0, 0)",0.433333,0.863333,0.46549,0.873939,9sbrsroLfZy0,U.S.
5
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_1.png,2,ADDRESS,"(0, 0, 0)",0.354118,0.188788,0.386275,0.199697,k7bWBsQQchJZ,U.S.
6
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_1.png,2,ADDRESS,"(0, 0, 0)",0.780392,0.204848,0.812941,0.215758,peo6UqIxrjmR,U.S.
7
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,EMAIL,"(0, 0, 0)",0.447843,0.78303,0.648627,0.796667,DIfz0LenOtQv,[email protected]
8
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,PHONE,"(0, 0, 0)",0.809804,0.78303,0.850196,0.796667,odJdySe9XrAn,(202)
9
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_2.png,3,PHONE,"(0, 0, 0)",0.117647,0.799394,0.198431,0.809697,iURSkUM7BbUG,347-8630
10
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.637647,0.432727,0.712941,0.44697,fRxAD9qm856s,U. A.E
11
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.489412,0.43303,0.614902,0.444545,qzRFPlNbslpH,ABU DHABI
12
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.385882,0.472121,0.593725,0.486364,v1uLbGsofN1f,"HOUSTON, TEXAS"
13
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.392549,0.539697,0.573725,0.549394,MvbPQiHvSdL7,United States of America
14
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.539216,0.553333,0.635686,0.563333,05U3cgj5w9PY,United States
15
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.534902,0.594242,0.615294,0.603939,uHMikyBlMq5f,Abu Dhabi
16
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.651373,0.594242,0.717255,0.605455,XNUE0GopIBaf,Houston
17
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.221569,0.65,0.301176,0.659697,6FjbNu2CGA9n,Abu Dhabi
18
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.337647,0.65,0.404314,0.660606,Yvmm2225ityu,Houston
19
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,HANDWRITING,"(0, 0, 0)",0.344314,0.768485,0.42902,0.798788,EwTcqq7PENU8,A
20
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.245882,0.806364,0.612549,0.817576,Mj4gqwbgsZWp,Sheikh Mohammed bin Butti AI Hamed
21
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.52,0.806364,0.612549,0.81697,RXYOVgLwq8Ke,AI Hamed
22
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.729412,0.806364,0.848235,0.816667,REPZhwFWGoTc,Lee P.Brown
23
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,NAME,"(0, 0, 0)",0.245882,0.806667,0.51451,0.817576,rFdxMRFRWLRJ,Sheikh Mohammed bin Butti
24
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_3.png,4,ADDRESS,"(0, 0, 0)",0.366667,0.823939,0.465098,0.834242,5iYCxRGdPG1i,Abu Dhabi
25
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.577647,0.262121,0.68,0.271515,3ZR43H3yYNdy,NEW YORK
26
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.461176,0.29303,0.555294,0.303333,WNoitmR9A6lu,NEW YORK
27
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.461176,0.29303,0.658039,0.303333,HjrhxMQhovlF,NEW YORK N.Y. 10007
28
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.563137,0.29303,0.658039,0.302121,nPN7g7UcnX4u,N.Y. 10007
29
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.314118,0.356667,0.42549,0.367576,ZoJf29CB3Wrq,NEW YORK
30
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.655294,0.480909,0.718431,0.491515,iezAqmD2ilnb,London
31
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.708627,0.639394,0.837255,0.652727,tWAuJEQVpfhi,New York City
32
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.60902,0.64,0.67098,0.650606,NaW3mmmlhMW9,London
33
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.667059,0.702727,0.751373,0.713636,pgMiwuMiBp8B,New York
34
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.198824,0.720303,0.261569,0.731212,fPvElSFZFRoL,London
35
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,HANDWRITING,"(0, 0, 0)",0.178824,0.795455,0.281961,0.896364,DfniF7P2bXAw,Thedder
36
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME,"(0, 0, 0)",0.178824,0.795455,0.423529,0.896364,QwnWsAeslO5f,Thedder Rudolph W. Giuliani
37
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME - ADDRESS,"(0, 0, 0)",0.672157,0.877576,0.80549,0.891212,Vdp95SShYOEO,Ken Livingstone
38
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.710196,0.877576,0.80549,0.891212,H5DGqsucPAjc,Livingstone
39
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,NAME,"(0, 0, 0)",0.672157,0.877879,0.705098,0.888182,qotGtnMbhAJr,Ken
40
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.287843,0.909091,0.40902,0.922727,sFX0tNJJzpE5,New York City
41
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_4.png,5,ADDRESS,"(0, 0, 0)",0.701961,0.909091,0.763922,0.919697,2xFbVTbxiOhC,London
42
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.55451,0.203636,0.86549,0.258485,Nfe3WTBembGQ,Long Beach
43
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.551373,0.257273,0.687843,0.290606,kndQY5X4itc8,California
44
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.558824,0.397879,0.611373,0.410303,B5vq8yhWLeOg,USA
45
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425882,0.429091,0.691373,0.441818,OtNgqUkoEaZb,San Pablo de Manta
46
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.347451,0.447879,0.665098,0.46303,Q52VzBx2SWNF,"Ecuador, South America"
47
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.724314,0.482121,0.798431,0.493939,O7gd9ywvKsKh,"Long Beach,"
48
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425098,0.49303,0.506275,0.502727,DzYr3xrM8Tvv,San Pablo de
49
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.425098,0.49303,0.715294,0.50303,iZ0knpQD54UU,"San Pablo de Manta, Ecundor, South America"
50
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.509804,0.49303,0.715294,0.50303,pZnYGzr7Pwsl,"Manta, Ecundor, South America"
51
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.217647,0.493333,0.321961,0.504242,r7Aar8FNQF6D,"California, USA"
52
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.471765,0.543636,0.596863,0.553939,zg9uBDlSuuA1,San Pablo de Manta
53
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.295294,0.544242,0.36549,0.556061,A0OY6RjMEocW,Long Beach
54
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.563137,0.655152,0.748627,0.667576,HQlTdEUhOCgI,"Long Beach, California, USA"
55
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.463529,0.665758,0.557255,0.674848,bCN9b7kJw0Ik,South America
56
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.277647,0.666061,0.403529,0.676061,qffN3bDgWRMk,San Pablo de Manta
57
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.587451,0.736667,0.709804,0.750303,eqMENFw5mbnL,Beverly 0 Neill
58
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.663137,0.751212,0.753333,0.764545,POqPQVBCES8h,Long Beach
59
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.582745,0.765758,0.708235,0.779091,mjrjsSMOxwaY,"California, USA"
60
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,HANDWRITING,"(0, 0, 0)",0.490588,0.771818,0.71098,0.834242,xL8dSawihWuY,10.2aulus
61
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,NAME,"(0, 0, 0)",0.559608,0.825152,0.769804,0.838485,fHyvwmbOgLMJ,Jorge O. Zambrano Cedeño
62
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.624314,0.839394,0.782745,0.850303,zGhskyehufSv,San Pablo de Manta
63
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_5.png,6,ADDRESS,"(0, 0, 0)",0.551765,0.854242,0.74,0.866061,dSPXmtb8M4nt,"Ecuador, South America"
64
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.556471,0.215152,0.731765,0.226667,BEhuvaI5BVaR,RICHARD M. DALEY
65
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.563137,0.261212,0.725098,0.272424,coo8KK7q6A72,ZHANG RONGMAO
66
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.566275,0.273636,0.666275,0.285152,0P9rVSbeNdB4,SHENYANG
67
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.526667,0.380303,0.588235,0.394242,1GDArufutI5y,Chicago
68
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.628235,0.380606,0.702353,0.394242,QyD751r4fCU1,Shenyang
69
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.736863,0.411515,0.868235,0.424545,rntIekANI8BO,Zhang Rongmao
70
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.199216,0.411818,0.34,0.424848,96TaHazXGIM7,Richard M. Daley
71
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.514902,0.412424,0.580784,0.425758,kbyVj6qhZSPi,Chicago
72
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.696471,0.443939,0.774118,0.45697,rJpaMvepsNln,Shenyang
73
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.353725,0.474545,0.415686,0.489091,PokCVpLQmDki,Chicago
74
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,ADDRESS,"(0, 0, 0)",0.407451,0.554545,0.469804,0.568182,HqVr414KRg59,Chicago
75
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,HANDWRITING,"(0, 0, 0)",0.593725,0.750606,0.811765,0.813636,xdawEv0DUH6P,3h.5.
76
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.730196,0.819394,0.876471,0.830606,Gghr7ccN6lS2,ZHANG RONGMAO
77
+ C:\Users\spedrickcase\OneDrive - Lambeth Council\Apps\doc_redaction\input/Partnership-Agreement-Toolkit_0_0.pdf_6.png,7,NAME,"(0, 0, 0)",0.34,0.821515,0.501176,0.831515,vOMIv1RS5Sag,RICHARD M. DALEY
example_data/example_outputs/Partnership-Agreement-Toolkit_0_0_ocr_results_with_words_textract.csv ADDED
The diff for this file is too large to render. See raw diff
 
example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv ADDED
The diff for this file is too large to render. See raw diff
 
example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv ADDED
@@ -0,0 +1,40 @@
1
+ page,text,left,top,width,height,line
2
+ 1,Example of emails sent to a professor before applying:,0.147059,0.093434,0.426471,0.013889,1
3
+ 1,Fwd: Prospective Graduate Student,0.145425,0.128788,0.277778,0.013889,2
4
+ 1,"Dr. Kornbluth,",0.147059,0.162879,0.114379,0.012626,3
5
+ 1,I am a senior biology major at the University of Notre Dame. I am applying to the CMB,0.147059,0.198232,0.689542,0.013889,4
6
+ 1,program and am very interested in your work. After glancing at a few of your recent,0.145425,0.214646,0.660131,0.013889,5
7
+ 1,papers and your research summary I find your work with apoptosis very interesting. Will,0.145425,0.232323,0.697712,0.013889,6
8
+ 1,"you be taking on new students next year? If I am invited to interview, is there any way",0.145425,0.25,0.683007,0.013889,7
9
+ 1,you will be able to meet with me?,0.145425,0.267677,0.264706,0.013889,8
10
+ 1,I have worked on several different research projects as an undergraduate in Dr. David R.,0.147059,0.30303,0.69281,0.013889,9
11
+ 1,Hyde's lab at the University of Notre Dame. The Hyde lab is interested in the signals that,0.147059,0.320707,0.697712,0.013889,10
12
+ 1,initiate Muller glia division post-light damage. My first research project was,0.147059,0.338384,0.598039,0.013889,11
13
+ 1,characterizing the role of leukemia inhibitory factor (LIF) in the activation of cell,0.147059,0.354798,0.637255,0.013889,12
14
+ 1,proliferation in the undamaged zebrafish retina. I am also working on several,0.145425,0.372475,0.604575,0.013889,13
15
+ 1,experiments that are related to a genetic screen that the Hyde lab plans on performing to,0.145425,0.390152,0.689542,0.013889,14
16
+ 1,identify mutants in the regeneration pathway--I am developing a neuroD4:EGFP,0.147059,0.407828,0.635621,0.013889,15
17
+ 1,transgenic line for use in this screen and I am characterizing the extent of damage and,0.145425,0.425505,0.673203,0.013889,16
18
+ 1,"regeneration in sheer zebrafish retinas. Finally, I am characterizing the chx10:EGFP",0.145425,0.443182,0.661765,0.013889,17
19
+ 1,transgenic line during retinal development and regeneration.,0.145425,0.459596,0.472222,0.013889,18
20
+ 1,Please find my CV attached.,0.145425,0.496212,0.222222,0.013889,19
21
+ 1,"Thank you for your time,",0.145425,0.531566,0.196078,0.013889,20
22
+ 1,--Lauren Lilley,0.147059,0.566919,0.119281,0.013889,21
23
+ 1,"Dr. Poss,",0.145425,0.637626,0.070261,0.012626,22
24
+ 1,I am a senior biology major at the University of Notre Dame. I am applying to your,0.145425,0.671717,0.655229,0.013889,23
25
+ 1,graduate program and am very interested in your work. After glancing at a few of your,0.145425,0.689394,0.679739,0.013889,24
26
+ 1,recent papers and your research summary I find your research greatly coincides with my,0.145425,0.707071,0.69281,0.013889,25
27
+ 1,research experiences and interests. Will you be taking on new students next year?,0.145425,0.723485,0.643791,0.015152,26
28
+ 1,I have worked on several different research projects as an undergraduate in Dr. David R.,0.145425,0.760101,0.69281,0.013889,27
29
+ 1,Hyde's lab at the University of Notre Dame. The Hyde lab is interested in the signals that,0.145425,0.777778,0.699346,0.013889,28
30
+ 1,initiate Muller glia division post-light damage. My first research project was,0.145425,0.795455,0.598039,0.013889,29
31
+ 1,characterizing the role of leukemia inhibitory factor (LIF) in the activation of cell,0.145425,0.811869,0.638889,0.013889,30
32
+ 1,proliferation in the undamaged zebrafish retina. I am also working on several,0.145425,0.829545,0.604575,0.013889,31
33
+ 1,experiments that are related to a genetic screen that the Hyde lab plans on performing to,0.145425,0.847222,0.691176,0.013889,32
34
+ 1,identify mutants in the regeneration pathway--I am developing a neuroD4:EGFP,0.145425,0.864899,0.635621,0.013889,33
35
+ 1,transgenic line for use in this screen and I am characterizing the extent of damage and,0.145425,0.881313,0.673203,0.013889,34
36
+ 2,"regeneration in sheer zebrafish retinas. Finally, I am characterizing the chx10:EGFP",0.145425,0.093434,0.661765,0.013889,1
37
+ 2,transgenic line during retinal development and regeneration.,0.145425,0.111111,0.472222,0.013889,2
38
+ 2,Please find my CV attached.,0.145425,0.146465,0.222222,0.013889,3
39
+ 2,"Thank you for your time,",0.145425,0.181818,0.196078,0.013889,4
40
+ 2,--Lauren Lilley,0.147059,0.218434,0.119281,0.013889,5
example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_results_with_words_textract.csv ADDED
@@ -0,0 +1,432 @@
1
+ page,line,word_text,word_x0,word_y0,word_x1,word_y1,line_text,line_x0,line_y0,line_x1,line_y1
2
+ 1,1,Example,0.147059,0.093434,0.215686,0.107323,,,,,
3
+ 1,1,of,0.220588,0.093434,0.240196,0.104798,,,,,
4
+ 1,1,emails,0.24183,0.093434,0.292484,0.104798,,,,,
5
+ 1,1,sent,0.297386,0.094697,0.330065,0.104798,,,,,
6
+ 1,1,to,0.334967,0.094697,0.349673,0.104798,,,,,
7
+ 1,1,a,0.354575,0.097222,0.362745,0.104798,,,,,
8
+ 1,1,professor,0.367647,0.093434,0.441176,0.108586,,,,,
9
+ 1,1,before,0.446078,0.093434,0.496732,0.104798,,,,,
10
+ 1,1,applying:,0.501634,0.093434,0.573529,0.107323,,,,,
11
+ 1,2,Fwd:,0.145425,0.128788,0.184641,0.140152,,,,,
12
+ 1,2,Prospective,0.191176,0.128788,0.28268,0.142677,,,,,
13
+ 1,2,Graduate,0.287582,0.128788,0.359477,0.140152,,,,,
14
+ 1,2,Student,0.364379,0.128788,0.424837,0.140152,,,,,
15
+ 1,3,Dr.,0.147059,0.162879,0.171569,0.174242,,,,,
16
+ 1,3,"Kornbluth,",0.176471,0.162879,0.261438,0.176768,,,,,
17
+ 1,4,I,0.147059,0.198232,0.153595,0.209596,,,,,
18
+ 1,4,am,0.158497,0.200758,0.181373,0.209596,,,,,
19
+ 1,4,a,0.186275,0.20202,0.194444,0.209596,,,,,
20
+ 1,4,senior,0.199346,0.198232,0.248366,0.209596,,,,,
21
+ 1,4,biology,0.253268,0.198232,0.312092,0.212121,,,,,
22
+ 1,4,major,0.316993,0.198232,0.364379,0.212121,,,,,
23
+ 1,4,at,0.367647,0.199495,0.382353,0.209596,,,,,
24
+ 1,4,the,0.387255,0.198232,0.411765,0.209596,,,,,
25
+ 1,4,University,0.416667,0.198232,0.5,0.212121,,,,,
26
+ 1,4,of,0.504902,0.198232,0.522876,0.209596,,,,,
27
+ 1,4,Notre,0.52451,0.198232,0.570261,0.209596,,,,,
28
+ 1,4,Dame.,0.575163,0.198232,0.625817,0.209596,,,,,
29
+ 1,4,I,0.632353,0.198232,0.637255,0.209596,,,,,
30
+ 1,4,am,0.643791,0.200758,0.666667,0.209596,,,,,
31
+ 1,4,applying,0.671569,0.198232,0.740196,0.212121,,,,,
32
+ 1,4,to,0.745098,0.199495,0.759804,0.209596,,,,,
33
+ 1,4,the,0.764706,0.198232,0.789216,0.209596,,,,,
34
+ 1,4,CMB,0.794118,0.198232,0.836601,0.209596,,,,,
35
+ 1,5,program,0.145425,0.218434,0.212418,0.229798,,,,,
36
+ 1,5,and,0.21732,0.215909,0.245098,0.227273,,,,,
37
+ 1,5,am,0.25,0.218434,0.27451,0.227273,,,,,
38
+ 1,5,very,0.279412,0.218434,0.313725,0.229798,,,,,
39
+ 1,5,interested,0.320261,0.214646,0.395425,0.22601,,,,,
40
+ 1,5,in,0.400327,0.214646,0.416667,0.22601,,,,,
41
+ 1,5,your,0.419935,0.218434,0.457516,0.229798,,,,,
42
+ 1,5,work.,0.460784,0.214646,0.506536,0.227273,,,,,
43
+ 1,5,After,0.511438,0.214646,0.553922,0.227273,,,,,
44
+ 1,5,glancing,0.55719,0.215909,0.625817,0.229798,,,,,
45
+ 1,5,at,0.630719,0.217172,0.645425,0.227273,,,,,
46
+ 1,5,a,0.650327,0.218434,0.658497,0.227273,,,,,
47
+ 1,5,few,0.663399,0.214646,0.69281,0.22601,,,,,
48
+ 1,5,of,0.697712,0.214646,0.715686,0.227273,,,,,
49
+ 1,5,your,0.718954,0.218434,0.754902,0.229798,,,,,
50
+ 1,5,recent,0.759804,0.217172,0.80719,0.22601,,,,,
51
+ 1,6,papers,0.145425,0.236111,0.197712,0.247475,,,,,
52
+ 1,6,and,0.202614,0.232323,0.230392,0.243687,,,,,
53
+ 1,6,your,0.235294,0.236111,0.271242,0.247475,,,,,
54
+ 1,6,research,0.276144,0.232323,0.341503,0.243687,,,,,
55
+ 1,6,summary,0.346405,0.236111,0.419935,0.247475,,,,,
56
+ 1,6,I,0.424837,0.232323,0.431373,0.243687,,,,,
57
+ 1,6,find,0.436275,0.232323,0.46732,0.243687,,,,,
58
+ 1,6,your,0.472222,0.236111,0.50817,0.247475,,,,,
59
+ 1,6,work,0.513072,0.232323,0.553922,0.243687,,,,,
60
+ 1,6,with,0.558824,0.232323,0.593137,0.243687,,,,,
61
+ 1,6,apoptosis,0.598039,0.233586,0.671569,0.247475,,,,,
62
+ 1,6,very,0.678105,0.236111,0.712418,0.247475,,,,,
63
+ 1,6,interesting.,0.71732,0.232323,0.803922,0.247475,,,,,
64
+ 1,6,Will,0.810458,0.232323,0.844771,0.243687,,,,,
65
+ 1,7,you,0.145425,0.253788,0.174837,0.263889,,,,,
66
+ 1,7,be,0.179739,0.25,0.199346,0.261364,,,,,
67
+ 1,7,taking,0.204248,0.25,0.253268,0.265152,,,,,
68
+ 1,7,on,0.25817,0.253788,0.277778,0.261364,,,,,
69
+ 1,7,new,0.28268,0.253788,0.315359,0.261364,,,,,
70
+ 1,7,students,0.320261,0.25,0.383987,0.261364,,,,,
71
+ 1,7,next,0.388889,0.251263,0.423203,0.261364,,,,,
72
+ 1,7,year?,0.428105,0.25,0.470588,0.263889,,,,,
73
+ 1,7,If,0.480392,0.25,0.495098,0.261364,,,,,
74
+ 1,7,I,0.498366,0.25,0.504902,0.261364,,,,,
75
+ 1,7,am,0.509804,0.253788,0.534314,0.261364,,,,,
76
+ 1,7,invited,0.539216,0.25,0.593137,0.261364,,,,,
77
+ 1,7,to,0.598039,0.251263,0.612745,0.261364,,,,,
78
+ 1,7,"interview,",0.617647,0.25,0.696078,0.263889,,,,,
79
+ 1,7,is,0.702614,0.25,0.714052,0.261364,,,,,
80
+ 1,7,there,0.718954,0.25,0.759804,0.261364,,,,,
81
+ 1,7,any,0.763072,0.253788,0.792484,0.263889,,,,,
82
+ 1,7,way,0.797386,0.253788,0.830065,0.263889,,,,,
83
+ 1,8,you,0.145425,0.271465,0.176471,0.281566,,,,,
84
+ 1,8,will,0.179739,0.267677,0.210784,0.27904,,,,,
85
+ 1,8,be,0.215686,0.267677,0.235294,0.27904,,,,,
86
+ 1,8,able,0.238562,0.267677,0.272876,0.27904,,,,,
87
+ 1,8,to,0.276144,0.268939,0.292484,0.27904,,,,,
88
+ 1,8,meet,0.297386,0.268939,0.334967,0.27904,,,,,
89
+ 1,8,with,0.339869,0.267677,0.375817,0.27904,,,,,
90
+ 1,8,me?,0.380719,0.267677,0.411765,0.27904,,,,,
91
+ 1,9,I,0.147059,0.30303,0.151961,0.314394,,,,,
92
+ 1,9,have,0.156863,0.30303,0.194444,0.314394,,,,,
93
+ 1,9,worked,0.199346,0.30303,0.25817,0.314394,,,,,
94
+ 1,9,on,0.263072,0.306818,0.28268,0.314394,,,,,
95
+ 1,9,several,0.287582,0.30303,0.343137,0.314394,,,,,
96
+ 1,9,different,0.348039,0.30303,0.416667,0.314394,,,,,
97
+ 1,9,research,0.419935,0.30303,0.485294,0.314394,,,,,
98
+ 1,9,projects,0.490196,0.30303,0.552288,0.318182,,,,,
99
+ 1,9,as,0.558824,0.306818,0.573529,0.314394,,,,,
100
+ 1,9,an,0.580065,0.306818,0.598039,0.314394,,,,,
101
+ 1,9,undergraduate,0.602941,0.30303,0.714052,0.318182,,,,,
102
+ 1,9,in,0.718954,0.30303,0.735294,0.314394,,,,,
103
+ 1,9,Dr.,0.740196,0.30303,0.764706,0.314394,,,,,
104
+ 1,9,David,0.769608,0.30303,0.816993,0.314394,,,,,
105
+ 1,9,R.,0.823529,0.30303,0.839869,0.314394,,,,,
106
+ 1,10,Hyde's,0.147059,0.320707,0.199346,0.334596,,,,,
107
+ 1,10,lab,0.204248,0.320707,0.228758,0.332071,,,,,
108
+ 1,10,at,0.23366,0.32197,0.248366,0.332071,,,,,
109
+ 1,10,the,0.251634,0.320707,0.276144,0.332071,,,,,
110
+ 1,10,University,0.281046,0.320707,0.364379,0.334596,,,,,
111
+ 1,10,of,0.369281,0.320707,0.387255,0.332071,,,,,
112
+ 1,10,Notre,0.390523,0.320707,0.434641,0.332071,,,,,
113
+ 1,10,Dame.,0.439542,0.320707,0.490196,0.332071,,,,,
114
+ 1,10,The,0.496732,0.320707,0.527778,0.332071,,,,,
115
+ 1,10,Hyde,0.53268,0.320707,0.573529,0.334596,,,,,
116
+ 1,10,lab,0.580065,0.320707,0.602941,0.332071,,,,,
117
+ 1,10,is,0.607843,0.320707,0.620915,0.332071,,,,,
118
+ 1,10,interested,0.625817,0.320707,0.702614,0.332071,,,,,
119
+ 1,10,in,0.707516,0.320707,0.722222,0.332071,,,,,
120
+ 1,10,the,0.727124,0.320707,0.751634,0.332071,,,,,
121
+ 1,10,signals,0.756536,0.320707,0.810458,0.334596,,,,,
122
+ 1,10,that,0.815359,0.320707,0.844771,0.332071,,,,,
123
+ 1,11,initiate,0.147059,0.338384,0.20098,0.349747,,,,,
124
+ 1,11,Muller,0.205882,0.338384,0.259804,0.349747,,,,,
125
+ 1,11,glia,0.264706,0.338384,0.292484,0.352273,,,,,
126
+ 1,11,division,0.297386,0.338384,0.361111,0.349747,,,,,
127
+ 1,11,post-light,0.366013,0.338384,0.44281,0.352273,,,,,
128
+ 1,11,damage.,0.446078,0.338384,0.511438,0.352273,,,,,
129
+ 1,11,My,0.51634,0.338384,0.544118,0.352273,,,,,
130
+ 1,11,first,0.54902,0.338384,0.581699,0.349747,,,,,
131
+ 1,11,research,0.584967,0.338384,0.650327,0.349747,,,,,
132
+ 1,11,project,0.655229,0.338384,0.710784,0.353535,,,,,
133
+ 1,11,was,0.715686,0.340909,0.745098,0.349747,,,,,
134
+ 1,12,characterizing,0.147059,0.354798,0.256536,0.369949,,,,,
135
+ 1,12,the,0.261438,0.356061,0.285948,0.367424,,,,,
136
+ 1,12,role,0.29085,0.356061,0.321895,0.367424,,,,,
137
+ 1,12,of,0.326797,0.356061,0.344771,0.367424,,,,,
138
+ 1,12,leukemia,0.348039,0.356061,0.419935,0.367424,,,,,
139
+ 1,12,inhibitory,0.424837,0.354798,0.501634,0.369949,,,,,
140
+ 1,12,factor,0.506536,0.356061,0.553922,0.367424,,,,,
141
+ 1,12,(LIF),0.55719,0.354798,0.599673,0.369949,,,,,
142
+ 1,12,in,0.604575,0.356061,0.620915,0.367424,,,,,
143
+ 1,12,the,0.624183,0.356061,0.648693,0.366162,,,,,
144
+ 1,12,activation,0.653595,0.356061,0.732026,0.367424,,,,,
145
+ 1,12,of,0.735294,0.354798,0.754902,0.367424,,,,,
146
+ 1,12,cell,0.756536,0.356061,0.785948,0.367424,,,,,
147
+ 1,13,proliferation,0.145425,0.372475,0.243464,0.387626,,,,,
148
+ 1,13,in,0.25,0.373737,0.264706,0.383838,,,,,
149
+ 1,13,the,0.269608,0.373737,0.292484,0.383838,,,,,
150
+ 1,13,undamaged,0.297386,0.372475,0.388889,0.387626,,,,,
151
+ 1,13,zebrafish,0.393791,0.372475,0.465686,0.383838,,,,,
152
+ 1,13,retina.,0.470588,0.373737,0.519608,0.383838,,,,,
153
+ 1,13,I,0.52451,0.373737,0.531046,0.383838,,,,,
154
+ 1,13,am,0.535948,0.376263,0.560458,0.383838,,,,,
155
+ 1,13,also,0.565359,0.372475,0.596405,0.383838,,,,,
156
+ 1,13,working,0.601307,0.372475,0.666667,0.387626,,,,,
157
+ 1,13,on,0.671569,0.376263,0.691176,0.385101,,,,,
158
+ 1,13,several,0.696078,0.373737,0.751634,0.383838,,,,,
159
+ 1,14,experiments,0.145425,0.390152,0.24183,0.405303,,,,,
160
+ 1,14,that,0.246732,0.390152,0.276144,0.401515,,,,,
161
+ 1,14,are,0.281046,0.393939,0.305556,0.401515,,,,,
162
+ 1,14,related,0.308824,0.390152,0.362745,0.401515,,,,,
163
+ 1,14,to,0.367647,0.392677,0.383987,0.401515,,,,,
164
+ 1,14,a,0.388889,0.393939,0.397059,0.401515,,,,,
165
+ 1,14,genetic,0.401961,0.390152,0.45915,0.405303,,,,,
166
+ 1,14,screen,0.464052,0.393939,0.514706,0.401515,,,,,
167
+ 1,14,that,0.517974,0.390152,0.547386,0.401515,,,,,
168
+ 1,14,the,0.552288,0.390152,0.576797,0.401515,,,,,
169
+ 1,14,Hyde,0.581699,0.390152,0.624183,0.405303,,,,,
170
+ 1,14,lab,0.629085,0.390152,0.653595,0.401515,,,,,
171
+ 1,14,plans,0.658497,0.390152,0.699346,0.405303,,,,,
172
+ 1,14,on,0.704248,0.393939,0.723856,0.401515,,,,,
173
+ 1,14,performing,0.728758,0.390152,0.816993,0.405303,,,,,
174
+ 1,14,to,0.821895,0.391414,0.836601,0.401515,,,,,
175
+ 1,15,identify,0.147059,0.407828,0.207516,0.421717,,,,,
176
+ 1,15,mutants,0.212418,0.409091,0.272876,0.419192,,,,,
177
+ 1,15,in,0.279412,0.407828,0.294118,0.419192,,,,,
178
+ 1,15,the,0.29902,0.407828,0.323529,0.419192,,,,,
179
+ 1,15,regeneration,0.328431,0.407828,0.426471,0.42298,,,,,
180
+ 1,15,pathway--I,0.429739,0.407828,0.51634,0.42298,,,,,
181
+ 1,15,am,0.522876,0.411616,0.545752,0.419192,,,,,
182
+ 1,15,developing,0.550654,0.407828,0.638889,0.42298,,,,,
183
+ 1,15,a,0.643791,0.411616,0.651961,0.419192,,,,,
184
+ 1,15,neuroD4:EGFP,0.656863,0.407828,0.78268,0.419192,,,,,
185
+ 1,16,transgenic,0.145425,0.425505,0.227124,0.439394,,,,,
186
+ 1,16,line,0.232026,0.425505,0.261438,0.436869,,,,,
187
+ 1,16,for,0.26634,0.425505,0.289216,0.436869,,,,,
188
+ 1,16,use,0.294118,0.42803,0.320261,0.436869,,,,,
189
+ 1,16,in,0.325163,0.425505,0.339869,0.436869,,,,,
190
+ 1,16,this,0.344771,0.425505,0.372549,0.436869,,,,,
191
+ 1,16,screen,0.377451,0.42803,0.428105,0.436869,,,,,
192
+ 1,16,and,0.433007,0.425505,0.460784,0.436869,,,,,
193
+ 1,16,I,0.46732,0.425505,0.472222,0.436869,,,,,
194
+ 1,16,am,0.477124,0.42803,0.501634,0.436869,,,,,
195
+ 1,16,characterizing,0.506536,0.425505,0.617647,0.439394,,,,,
196
+ 1,16,the,0.622549,0.425505,0.647059,0.436869,,,,,
197
+ 1,16,extent,0.651961,0.426768,0.70098,0.436869,,,,,
198
+ 1,16,of,0.704248,0.425505,0.722222,0.436869,,,,,
199
+ 1,16,damage,0.72549,0.425505,0.787582,0.439394,,,,,
200
+ 1,16,and,0.79085,0.425505,0.820261,0.436869,,,,,
201
+ 1,17,regeneration,0.145425,0.443182,0.243464,0.457071,,,,,
202
+ 1,17,in,0.25,0.443182,0.264706,0.454545,,,,,
203
+ 1,17,sheer,0.267974,0.443182,0.312092,0.454545,,,,,
204
+ 1,17,zebrafish,0.316993,0.443182,0.388889,0.454545,,,,,
205
+ 1,17,retinas.,0.393791,0.443182,0.449346,0.454545,,,,,
206
+ 1,17,"Finally,",0.455882,0.443182,0.51634,0.457071,,,,,
207
+ 1,17,I,0.521242,0.443182,0.527778,0.454545,,,,,
208
+ 1,17,am,0.53268,0.445707,0.55719,0.454545,,,,,
209
+ 1,17,characterizing,0.560458,0.443182,0.671569,0.457071,,,,,
210
+ 1,17,the,0.676471,0.443182,0.70098,0.454545,,,,,
211
+ 1,17,chx10:EGFP,0.705882,0.443182,0.808824,0.454545,,,,,
212
+ 1,18,transgenic,0.145425,0.459596,0.227124,0.474747,,,,,
213
+ 1,18,line,0.232026,0.459596,0.261438,0.47096,,,,,
214
+ 1,18,during,0.26634,0.459596,0.316993,0.474747,,,,,
215
+ 1,18,retinal,0.321895,0.459596,0.372549,0.47096,,,,,
216
+ 1,18,development,0.377451,0.459596,0.478758,0.474747,,,,,
217
+ 1,18,and,0.48366,0.460859,0.511438,0.47096,,,,,
218
+ 1,18,regeneration.,0.51634,0.459596,0.619281,0.474747,,,,,
219
+ 1,19,Please,0.145425,0.496212,0.196078,0.507576,,,,,
220
+ 1,19,find,0.20098,0.496212,0.232026,0.507576,,,,,
221
+ 1,19,my,0.236928,0.5,0.263072,0.510101,,,,,
222
+ 1,19,CV,0.267974,0.496212,0.295752,0.507576,,,,,
223
+ 1,19,attached.,0.29902,0.496212,0.369281,0.507576,,,,,
224
+ 1,20,Thank,0.145425,0.531566,0.196078,0.542929,,,,,
225
+ 1,20,you,0.20098,0.535354,0.230392,0.546717,,,,,
226
+ 1,20,for,0.235294,0.531566,0.25817,0.542929,,,,,
227
+ 1,20,your,0.263072,0.535354,0.29902,0.546717,,,,,
228
+ 1,20,"time,",0.303922,0.531566,0.343137,0.545455,,,,,
229
+ 1,21,--Lauren,0.147059,0.568182,0.215686,0.579545,,,,,
230
+ 1,21,Lilley,0.218954,0.566919,0.26634,0.582071,,,,,
231
+ 1,22,Dr.,0.145425,0.637626,0.171569,0.64899,,,,,
232
+ 1,22,"Poss,",0.176471,0.637626,0.21732,0.651515,,,,,
233
+ 1,23,I,0.145425,0.671717,0.151961,0.683081,,,,,
234
+ 1,23,am,0.158497,0.675505,0.181373,0.684343,,,,,
235
+ 1,23,a,0.186275,0.675505,0.194444,0.684343,,,,,
236
+ 1,23,senior,0.199346,0.671717,0.248366,0.683081,,,,,
237
+ 1,23,biology,0.253268,0.671717,0.312092,0.686869,,,,,
238
+ 1,23,major,0.316993,0.671717,0.364379,0.686869,,,,,
239
+ 1,23,at,0.369281,0.674242,0.382353,0.683081,,,,,
240
+ 1,23,the,0.387255,0.671717,0.411765,0.684343,,,,,
241
+ 1,23,University,0.416667,0.671717,0.498366,0.686869,,,,,
242
+ 1,23,of,0.504902,0.671717,0.522876,0.683081,,,,,
243
+ 1,23,Notre,0.52451,0.671717,0.570261,0.684343,,,,,
244
+ 1,23,Dame.,0.575163,0.671717,0.625817,0.684343,,,,,
245
+ 1,23,I,0.630719,0.671717,0.637255,0.683081,,,,,
246
+ 1,23,am,0.643791,0.675505,0.666667,0.684343,,,,,
247
+ 1,23,applying,0.671569,0.67298,0.740196,0.686869,,,,,
248
+ 1,23,to,0.745098,0.67298,0.759804,0.683081,,,,,
249
+ 1,23,your,0.764706,0.675505,0.802288,0.686869,,,,,
250
+ 1,24,graduate,0.145425,0.689394,0.214052,0.704545,,,,,
251
+ 1,24,program,0.218954,0.693182,0.284314,0.703283,,,,,
252
+ 1,24,and,0.289216,0.689394,0.318627,0.700758,,,,,
253
+ 1,24,am,0.323529,0.693182,0.348039,0.700758,,,,,
254
+ 1,24,very,0.351307,0.693182,0.387255,0.703283,,,,,
255
+ 1,24,interested,0.392157,0.689394,0.46732,0.700758,,,,,
256
+ 1,24,in,0.473856,0.689394,0.488562,0.700758,,,,,
257
+ 1,24,your,0.493464,0.693182,0.529412,0.703283,,,,,
258
+ 1,24,work.,0.534314,0.689394,0.578431,0.700758,,,,,
259
+ 1,24,After,0.583333,0.689394,0.625817,0.700758,,,,,
260
+ 1,24,glancing,0.630719,0.689394,0.697712,0.703283,,,,,
261
+ 1,24,at,0.702614,0.690657,0.71732,0.700758,,,,,
262
+ 1,24,a,0.722222,0.693182,0.730392,0.700758,,,,,
263
+ 1,24,few,0.735294,0.689394,0.764706,0.700758,,,,,
264
+ 1,24,of,0.769608,0.689394,0.787582,0.700758,,,,,
265
+ 1,24,your,0.79085,0.693182,0.826797,0.703283,,,,,
266
+ 1,25,recent,0.145425,0.708333,0.194444,0.718434,,,,,
267
+ 1,25,papers,0.199346,0.710859,0.25,0.72096,,,,,
268
+ 1,25,and,0.254902,0.707071,0.28268,0.718434,,,,,
269
+ 1,25,your,0.287582,0.710859,0.325163,0.72096,,,,,
270
+ 1,25,research,0.328431,0.707071,0.393791,0.718434,,,,,
271
+ 1,25,summary,0.398693,0.709596,0.472222,0.72096,,,,,
272
+ 1,25,I,0.477124,0.707071,0.48366,0.718434,,,,,
273
+ 1,25,find,0.488562,0.707071,0.519608,0.718434,,,,,
274
+ 1,25,your,0.52451,0.710859,0.562092,0.72096,,,,,
275
+ 1,25,research,0.565359,0.707071,0.632353,0.718434,,,,,
276
+ 1,25,greatly,0.637255,0.707071,0.691176,0.72096,,,,,
277
+ 1,25,coincides,0.696078,0.707071,0.769608,0.718434,,,,,
278
+ 1,25,with,0.77451,0.707071,0.810458,0.718434,,,,,
279
+ 1,25,my,0.813725,0.710859,0.839869,0.72096,,,,,
280
+ 1,26,research,0.145425,0.724747,0.210784,0.736111,,,,,
281
+ 1,26,experiences,0.21732,0.724747,0.308824,0.738636,,,,,
282
+ 1,26,and,0.313725,0.723485,0.341503,0.736111,,,,,
283
+ 1,26,interests.,0.346405,0.723485,0.416667,0.736111,,,,,
284
+ 1,26,Will,0.426471,0.723485,0.462418,0.736111,,,,,
285
+ 1,26,you,0.465686,0.727273,0.496732,0.738636,,,,,
286
+ 1,26,be,0.5,0.723485,0.519608,0.736111,,,,,
287
+ 1,26,taking,0.52451,0.724747,0.573529,0.738636,,,,,
288
+ 1,26,on,0.578431,0.727273,0.598039,0.736111,,,,,
289
+ 1,26,new,0.602941,0.727273,0.635621,0.736111,,,,,
290
+ 1,26,students,0.640523,0.724747,0.704248,0.736111,,,,,
291
+ 1,26,next,0.70915,0.72601,0.745098,0.734848,,,,,
292
+ 1,26,year?,0.748366,0.724747,0.79085,0.738636,,,,,
293
+ 1,27,I,0.145425,0.760101,0.151961,0.771465,,,,,
294
+ 1,27,have,0.156863,0.760101,0.194444,0.771465,,,,,
295
+ 1,27,worked,0.199346,0.760101,0.25817,0.771465,,,,,
296
+ 1,27,on,0.263072,0.763889,0.28268,0.771465,,,,,
297
+ 1,27,several,0.287582,0.760101,0.343137,0.771465,,,,,
298
+ 1,27,different,0.348039,0.760101,0.416667,0.771465,,,,,
299
+ 1,27,research,0.419935,0.760101,0.485294,0.771465,,,,,
300
+ 1,27,projects,0.490196,0.760101,0.552288,0.775253,,,,,
301
+ 1,27,as,0.55719,0.763889,0.573529,0.771465,,,,,
302
+ 1,27,an,0.578431,0.763889,0.598039,0.771465,,,,,
303
+ 1,27,undergraduate,0.602941,0.760101,0.714052,0.775253,,,,,
304
+ 1,27,in,0.718954,0.760101,0.735294,0.771465,,,,,
305
+ 1,27,Dr.,0.740196,0.760101,0.764706,0.771465,,,,,
306
+ 1,27,David,0.769608,0.760101,0.818627,0.771465,,,,,
307
+ 1,27,R.,0.823529,0.760101,0.839869,0.771465,,,,,
308
+ 1,28,Hyde's,0.145425,0.777778,0.199346,0.791667,,,,,
309
+ 1,28,lab,0.204248,0.777778,0.228758,0.789141,,,,,
310
+ 1,28,at,0.23366,0.77904,0.248366,0.789141,,,,,
311
+ 1,28,the,0.251634,0.777778,0.276144,0.789141,,,,,
312
+ 1,28,University,0.281046,0.777778,0.364379,0.791667,,,,,
313
+ 1,28,of,0.369281,0.777778,0.387255,0.789141,,,,,
314
+ 1,28,Notre,0.390523,0.777778,0.434641,0.789141,,,,,
315
+ 1,28,Dame.,0.439542,0.777778,0.490196,0.789141,,,,,
316
+ 1,28,The,0.496732,0.777778,0.527778,0.789141,,,,,
317
+ 1,28,Hyde,0.53268,0.777778,0.573529,0.791667,,,,,
318
+ 1,28,lab,0.580065,0.777778,0.602941,0.789141,,,,,
319
+ 1,28,is,0.607843,0.777778,0.620915,0.789141,,,,,
320
+ 1,28,interested,0.625817,0.777778,0.702614,0.789141,,,,,
321
+ 1,28,in,0.707516,0.777778,0.722222,0.789141,,,,,
322
+ 1,28,the,0.727124,0.777778,0.751634,0.789141,,,,,
323
+ 1,28,signals,0.756536,0.777778,0.810458,0.791667,,,,,
324
+ 1,28,that,0.815359,0.777778,0.846405,0.789141,,,,,
325
+ 1,29,initiate,0.145425,0.795455,0.20098,0.806818,,,,,
326
+ 1,29,Muller,0.205882,0.795455,0.259804,0.806818,,,,,
327
+ 1,29,glia,0.264706,0.795455,0.292484,0.809343,,,,,
328
+ 1,29,division,0.297386,0.795455,0.361111,0.806818,,,,,
329
+ 1,29,post-light,0.366013,0.795455,0.44281,0.809343,,,,,
330
+ 1,29,damage.,0.446078,0.795455,0.511438,0.809343,,,,,
331
+ 1,29,My,0.51634,0.795455,0.544118,0.809343,,,,,
332
+ 1,29,first,0.54902,0.795455,0.581699,0.806818,,,,,
333
+ 1,29,research,0.584967,0.795455,0.651961,0.806818,,,,,
334
+ 1,29,project,0.655229,0.795455,0.710784,0.809343,,,,,
335
+ 1,29,was,0.715686,0.799242,0.745098,0.806818,,,,,
336
+ 1,30,characterizing,0.145425,0.811869,0.25817,0.82702,,,,,
337
+ 1,30,the,0.261438,0.811869,0.285948,0.823232,,,,,
338
+ 1,30,role,0.29085,0.813131,0.321895,0.823232,,,,,
339
+ 1,30,of,0.326797,0.811869,0.344771,0.824495,,,,,
340
+ 1,30,leukemia,0.348039,0.811869,0.419935,0.823232,,,,,
341
+ 1,30,inhibitory,0.424837,0.811869,0.501634,0.82702,,,,,
342
+ 1,30,factor,0.506536,0.811869,0.553922,0.823232,,,,,
343
+ 1,30,(LIF),0.55719,0.813131,0.599673,0.82702,,,,,
344
+ 1,30,in,0.604575,0.811869,0.620915,0.824495,,,,,
345
+ 1,30,the,0.624183,0.811869,0.648693,0.824495,,,,,
346
+ 1,30,activation,0.653595,0.813131,0.732026,0.824495,,,,,
347
+ 1,30,of,0.735294,0.811869,0.754902,0.824495,,,,,
348
+ 1,30,cell,0.756536,0.811869,0.785948,0.824495,,,,,
349
+ 1,31,proliferation,0.145425,0.829545,0.245098,0.844697,,,,,
350
+ 1,31,in,0.25,0.829545,0.264706,0.840909,,,,,
351
+ 1,31,the,0.267974,0.829545,0.292484,0.840909,,,,,
352
+ 1,31,undamaged,0.297386,0.830808,0.388889,0.844697,,,,,
353
+ 1,31,zebrafish,0.393791,0.829545,0.465686,0.842172,,,,,
354
+ 1,31,retina.,0.470588,0.830808,0.519608,0.842172,,,,,
355
+ 1,31,I,0.52451,0.830808,0.531046,0.840909,,,,,
356
+ 1,31,am,0.535948,0.833333,0.560458,0.842172,,,,,
357
+ 1,31,also,0.565359,0.829545,0.596405,0.840909,,,,,
358
+ 1,31,working,0.601307,0.830808,0.666667,0.844697,,,,,
359
+ 1,31,on,0.671569,0.833333,0.691176,0.840909,,,,,
360
+ 1,31,several,0.696078,0.829545,0.751634,0.840909,,,,,
361
+ 1,32,experiments,0.145425,0.847222,0.24183,0.862374,,,,,
362
+ 1,32,that,0.246732,0.847222,0.276144,0.858586,,,,,
363
+ 1,32,are,0.281046,0.85101,0.305556,0.858586,,,,,
364
+ 1,32,related,0.308824,0.847222,0.362745,0.858586,,,,,
365
+ 1,32,to,0.367647,0.848485,0.383987,0.858586,,,,,
366
+ 1,32,a,0.388889,0.85101,0.397059,0.858586,,,,,
367
+ 1,32,genetic,0.401961,0.847222,0.45915,0.861111,,,,,
368
+ 1,32,screen,0.464052,0.85101,0.514706,0.858586,,,,,
369
+ 1,32,that,0.517974,0.847222,0.54902,0.858586,,,,,
370
+ 1,32,the,0.552288,0.847222,0.576797,0.858586,,,,,
371
+ 1,32,Hyde,0.581699,0.847222,0.624183,0.861111,,,,,
372
+ 1,32,lab,0.629085,0.847222,0.653595,0.858586,,,,,
373
+ 1,32,plans,0.656863,0.847222,0.699346,0.861111,,,,,
374
+ 1,32,on,0.704248,0.85101,0.723856,0.858586,,,,,
375
+ 1,32,performing,0.728758,0.847222,0.816993,0.862374,,,,,
376
+ 1,32,to,0.821895,0.848485,0.836601,0.858586,,,,,
377
+ 1,33,identify,0.145425,0.864899,0.207516,0.878788,,,,,
378
+ 1,33,mutants,0.212418,0.866162,0.272876,0.876263,,,,,
379
+ 1,33,in,0.279412,0.864899,0.294118,0.876263,,,,,
380
+ 1,33,the,0.29902,0.864899,0.323529,0.876263,,,,,
381
+ 1,33,regeneration,0.328431,0.864899,0.426471,0.878788,,,,,
382
+ 1,33,pathway--I,0.431373,0.864899,0.51634,0.878788,,,,,
383
+ 1,33,am,0.522876,0.868687,0.545752,0.876263,,,,,
384
+ 1,33,developing,0.550654,0.864899,0.638889,0.878788,,,,,
385
+ 1,33,a,0.643791,0.868687,0.651961,0.876263,,,,,
386
+ 1,33,neuroD4:EGFP,0.655229,0.864899,0.78268,0.876263,,,,,
387
+ 1,34,transgenic,0.145425,0.882576,0.227124,0.896465,,,,,
388
+ 1,34,line,0.232026,0.882576,0.261438,0.893939,,,,,
389
+ 1,34,for,0.26634,0.881313,0.289216,0.893939,,,,,
390
+ 1,34,use,0.294118,0.885101,0.320261,0.893939,,,,,
391
+ 1,34,in,0.325163,0.882576,0.339869,0.893939,,,,,
392
+ 1,34,this,0.344771,0.882576,0.372549,0.893939,,,,,
393
+ 1,34,screen,0.379085,0.885101,0.428105,0.893939,,,,,
394
+ 1,34,and,0.433007,0.882576,0.460784,0.893939,,,,,
395
+ 1,34,I,0.46732,0.882576,0.472222,0.893939,,,,,
396
+ 1,34,am,0.478758,0.885101,0.501634,0.893939,,,,,
397
+ 1,34,characterizing,0.506536,0.882576,0.617647,0.896465,,,,,
398
+ 1,34,the,0.622549,0.882576,0.647059,0.893939,,,,,
399
+ 1,34,extent,0.651961,0.883838,0.699346,0.892677,,,,,
400
+ 1,34,of,0.704248,0.882576,0.722222,0.893939,,,,,
401
+ 1,34,damage,0.72549,0.882576,0.785948,0.896465,,,,,
402
+ 1,34,and,0.79085,0.882576,0.820261,0.893939,,,,,
403
+ 2,1,regeneration,0.145425,0.093434,0.243464,0.107323,,,,,
404
+ 2,1,in,0.248366,0.093434,0.264706,0.104798,,,,,
405
+ 2,1,sheer,0.267974,0.093434,0.312092,0.104798,,,,,
406
+ 2,1,zebrafish,0.316993,0.093434,0.387255,0.104798,,,,,
407
+ 2,1,retinas.,0.392157,0.093434,0.449346,0.104798,,,,,
408
+ 2,1,"Finally,",0.455882,0.093434,0.514706,0.107323,,,,,
409
+ 2,1,I,0.521242,0.093434,0.527778,0.104798,,,,,
410
+ 2,1,am,0.53268,0.097222,0.555556,0.104798,,,,,
411
+ 2,1,characterizing,0.560458,0.093434,0.671569,0.107323,,,,,
412
+ 2,1,the,0.676471,0.093434,0.70098,0.104798,,,,,
413
+ 2,1,chx10:EGFP,0.705882,0.093434,0.808824,0.104798,,,,,
414
+ 2,2,transgenic,0.145425,0.111111,0.227124,0.125,,,,,
415
+ 2,2,line,0.232026,0.111111,0.261438,0.122475,,,,,
416
+ 2,2,during,0.26634,0.111111,0.316993,0.125,,,,,
417
+ 2,2,retinal,0.321895,0.111111,0.372549,0.122475,,,,,
418
+ 2,2,development,0.377451,0.111111,0.478758,0.125,,,,,
419
+ 2,2,and,0.48366,0.111111,0.511438,0.122475,,,,,
420
+ 2,2,regeneration.,0.51634,0.111111,0.617647,0.125,,,,,
421
+ 2,3,Please,0.145425,0.146465,0.196078,0.157828,,,,,
422
+ 2,3,find,0.20098,0.146465,0.232026,0.157828,,,,,
423
+ 2,3,my,0.236928,0.150253,0.263072,0.160354,,,,,
424
+ 2,3,CV,0.267974,0.146465,0.295752,0.157828,,,,,
425
+ 2,3,attached.,0.29902,0.146465,0.369281,0.157828,,,,,
426
+ 2,4,Thank,0.145425,0.183081,0.196078,0.193182,,,,,
427
+ 2,4,you,0.20098,0.185606,0.230392,0.19697,,,,,
428
+ 2,4,for,0.235294,0.181818,0.25817,0.193182,,,,,
429
+ 2,4,your,0.263072,0.185606,0.29902,0.19697,,,,,
430
+ 2,4,"time,",0.303922,0.181818,0.343137,0.195707,,,,,
431
+ 2,5,--Lauren,0.147059,0.218434,0.215686,0.229798,,,,,
432
+ 2,5,Lilley,0.218954,0.218434,0.26634,0.232323,,,,,
example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ image,page,label,color,xmin,ymin,xmax,ymax,id,text
2
+ placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.147059,0.162879,0.171569,0.174242,oJIosRHGyCRn,Dr
3
+ placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.147059,0.162879,0.261438,0.176768,5C5tA6mfeL7T,Dr Kornbluth
4
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.176471,0.162879,0.261438,0.176768,UoYN48bc2ry5,Kornbluth
5
+ placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.740196,0.30303,0.764706,0.314394,cAsjVETPEisV,Dr
6
+ placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.740196,0.30303,0.839869,0.314394,yQ5HKn4tfT7L,Dr David R.
7
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.769608,0.30303,0.839869,0.314394,LR8phiOYnLWi,David R.
8
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.218954,0.566919,0.26634,0.582071,X8iObIauqZ9k,Lauren Lilley
9
+ placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.145425,0.637626,0.171569,0.64899,SvWjK2F7R3un,Dr
10
+ placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.145425,0.637626,0.21732,0.651515,zKJFVAOszwdM,Dr Poss
11
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.176471,0.637626,0.21732,0.651515,Iqda7ixkzcmg,Poss
12
+ placeholder_image_0.png,1,TITLES,"(0, 0, 0)",0.740196,0.760101,0.764706,0.771465,TWQD93bGI3B3,Dr
13
+ placeholder_image_0.png,1,TITLES - NAME,"(0, 0, 0)",0.740196,0.760101,0.839869,0.771465,vQuQQwqWjSES,Dr David R.
14
+ placeholder_image_0.png,1,NAME,"(0, 0, 0)",0.769608,0.760101,0.839869,0.771465,f8xf6ORJUSnG,David R.
15
+ placeholder_image_1.png,2,NAME,"(0, 0, 0)",0.218954,0.218434,0.26634,0.232323,N0nje9UiCzZK,Lauren Lilley
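
The review file above defines the redaction schema used by the app: one row per suggested redaction, with the source image, page number, entity label, box colour, bounding-box coordinates (apparently normalised to the page, 0-1), a row id, and the matched text. A minimal sketch of inspecting it, assuming pandas is installed and the repository root is the working directory:

import pandas as pd

# Load the example review file added in this commit
review = pd.read_csv(
    "example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_review_file.csv"
)

# Show the NAME redactions suggested for page 1 and their bounding boxes
names_page_1 = review[(review["label"] == "NAME") & (review["page"] == 1)]
print(names_page_1[["text", "xmin", "ymin", "xmax", "ymax"]])
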
example_data/graduate-job-example-cover-letter.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71cc851d41f80dd8b045af32657b76bf85dd8f72d39ae08fa43dc7a78256fe35
3
+ size 77045
example_data/partnership_toolkit_redact_custom_deny_list.csv ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Sister
2
+ Sister City
3
+ Sister Cities
4
+ Friendship City
example_data/partnership_toolkit_redact_some_pages.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2
2
+ 5
example_data/test_allow_list_graduate.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ Wilson
example_data/test_allow_list_partnership.csv ADDED
@@ -0,0 +1 @@
 
 
1
lambda_entrypoint.py CHANGED
@@ -9,7 +9,7 @@ print("Lambda entrypoint loading...")
9
 
10
  # Initialize S3 client outside the handler for connection reuse
11
  s3_client = boto3.client("s3", region_name=os.getenv("AWS_REGION", "eu-west-2"))
12
- print("S3 client initialized")
13
 
14
  # Lambda's only writable directory
15
  TMP_DIR = "/tmp"
@@ -84,48 +84,90 @@ def lambda_handler(event, context):
84
  'task': arguments.get('task', 'redact'),
85
  'input_file': input_file_path,
86
  'output_dir': OUTPUT_DIR,
87
- 'language': arguments.get('language', 'en_core_web_sm'),
88
- 'pii_detector': arguments.get('pii_detector', 'Local Spacy model'), # Default to local
 
 
 
89
  'ocr_method': arguments.get('ocr_method', 'Tesseract OCR - all PDF types'),
90
  'page_min': int(arguments.get('page_min', 0)),
91
- 'page_max': int(arguments.get('page_max', 999)),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  # Handle optional files like allow/deny lists
94
- 'allow_list': None,
95
- 'deny_list': None,
 
 
 
 
 
 
96
 
97
  # Deduplication specific arguments
98
  'duplicate_type': arguments.get('duplicate_type', 'pages'),
99
  'similarity_threshold': float(arguments.get('similarity_threshold', 0.95)),
100
  'min_word_count': int(arguments.get('min_word_count', 3)),
101
- 'search_query': arguments.get('search_query'),
 
 
 
102
  'text_columns': arguments.get('text_columns', []),
 
 
103
 
104
- # Add other arguments from your app.py as needed, using .get() for safety
105
- 'anon_strat': arguments.get('anon_strat', 'redact'),
106
- 'columns': arguments.get('columns', []),
107
- 'aws_access_key': None, # Best practice: use IAM Role instead of keys
 
 
 
 
 
 
 
 
 
 
108
  'aws_secret_key': None,
109
- 'aws_region': os.getenv("AWS_REGION", "eu-west-2"),
110
  's3_bucket': bucket_name,
 
111
  # Set defaults for boolean flags
112
- 'prepare_images': True,
113
- 'compress_redacted_pdf': False,
114
- 'return_pdf_end_of_redaction': True
115
  }
116
 
117
  # Download optional files if they are specified
118
- allow_list_key = arguments.get('allow_list')
119
  if allow_list_key:
120
  allow_list_path = os.path.join(INPUT_DIR, 'allow_list.csv')
121
  download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
122
- cli_args['allow_list'] = allow_list_path
123
 
124
- deny_list_key = arguments.get('deny_list')
125
  if deny_list_key:
126
  deny_list_path = os.path.join(INPUT_DIR, 'deny_list.csv')
127
  download_file_from_s3(bucket_name, deny_list_key, deny_list_path)
128
- cli_args['deny_list'] = deny_list_path
129
 
130
  # 5. Execute the main application logic
131
  try:
 
9
 
10
  # Initialize S3 client outside the handler for connection reuse
11
  s3_client = boto3.client("s3", region_name=os.getenv("AWS_REGION", "eu-west-2"))
12
+ print("S3 client initialised")
13
 
14
  # Lambda's only writable directory
15
  TMP_DIR = "/tmp"
 
84
  'task': arguments.get('task', 'redact'),
85
  'input_file': input_file_path,
86
  'output_dir': OUTPUT_DIR,
87
+ 'input_dir': INPUT_DIR,
88
+ 'language': arguments.get('language', 'en_core_web_lg'),
89
+ 'pii_detector': arguments.get('pii_detector', 'Local'), # Default to local
90
+ 'username': arguments.get('username', 'lambda_user'),
91
+ 'save_to_user_folders': arguments.get('save_to_user_folders', 'False'),
92
  'ocr_method': arguments.get('ocr_method', 'Tesseract OCR - all PDF types'),
93
  'page_min': int(arguments.get('page_min', 0)),
94
+ 'page_max': int(arguments.get('page_max', 0)),
95
+ 'handwrite_signature_extraction': arguments.get('handwrite_signature_checkbox', ['Extract handwriting', 'Extract signatures']),
96
+
97
+ # General arguments
98
+ 'local_redact_entities': arguments.get('local_redact_entities', []),
99
+ 'aws_redact_entities': arguments.get('aws_redact_entities', []),
100
+ 'cost_code': arguments.get('cost_code', ''),
101
+ 'save_logs_to_csv': arguments.get('save_logs_to_csv', 'False'),
102
+ 'save_logs_to_dynamodb': arguments.get('save_logs_to_dynamodb', 'False'),
103
+ 'display_file_names_in_logs': arguments.get('display_file_names_in_logs', 'True'),
104
+ 'upload_logs_to_s3': arguments.get('upload_logs_to_s3', 'False'),
105
+ 's3_logs_prefix': arguments.get('s3_logs_prefix', ''),
106
+ 'do_initial_clean': arguments.get('do_initial_clean', 'False'),
107
+
108
+ # PDF/Image specific arguments
109
+ 'images_dpi': float(arguments.get('images_dpi', 300.0)),
110
+ 'chosen_local_ocr_model': arguments.get('chosen_local_ocr_model', 'tesseract'),
111
+ 'preprocess_local_ocr_images': arguments.get('preprocess_local_ocr_images', 'False'),
112
 
113
  # Handle optional files like allow/deny lists
114
+ 'allow_list_file': arguments.get('allow_list_file', ""),
115
+ 'deny_list_file': arguments.get('deny_list_file', ""),
116
+ 'redact_whole_page_file': arguments.get('redact_whole_page_file', ""),
117
+
118
+ # Tabular/Anonymisation arguments
119
+ 'excel_sheets': arguments.get('excel_sheets', []),
120
+ 'fuzzy_mistakes': int(arguments.get('fuzzy_mistakes', 0)),
121
+ 'match_fuzzy_whole_phrase_bool': arguments.get('match_fuzzy_whole_phrase_bool', 'True'),
122
 
123
  # Deduplication specific arguments
124
  'duplicate_type': arguments.get('duplicate_type', 'pages'),
125
  'similarity_threshold': float(arguments.get('similarity_threshold', 0.95)),
126
  'min_word_count': int(arguments.get('min_word_count', 3)),
127
+ 'min_consecutive_pages': int(arguments.get('min_consecutive_pages', 1)),
128
+ 'greedy_match': arguments.get('greedy_match', 'False'),
129
+ 'combine_pages': arguments.get('combine_pages', 'True'),
130
+ 'search_query': arguments.get('search_query', ""),
131
  'text_columns': arguments.get('text_columns', []),
132
+ 'remove_duplicate_rows': arguments.get('remove_duplicate_rows', 'True'),
133
+ 'anon_strategy': arguments.get('anon_strategy', 'redact'),
134
 
135
+ # Textract specific arguments
136
+ 'textract_action': arguments.get('textract_action', ''),
137
+ 'job_id': arguments.get('job_id', ''),
138
+ 'extract_signatures': arguments.get('extract_signatures', False),
139
+ 'textract_bucket': arguments.get('textract_bucket', ''),
140
+ 'textract_input_prefix': arguments.get('textract_input_prefix', ''),
141
+ 'textract_output_prefix': arguments.get('textract_output_prefix', ''),
142
+ 's3_textract_document_logs_subfolder': arguments.get('s3_textract_document_logs_subfolder', ''),
143
+ 'local_textract_document_logs_subfolder': arguments.get('local_textract_document_logs_subfolder', ''),
144
+ 'poll_interval': int(arguments.get('poll_interval', 30)),
145
+ 'max_poll_attempts': int(arguments.get('max_poll_attempts', 120)),
146
+
147
+ # AWS credentials (use IAM Role instead of keys)
148
+ 'aws_access_key': None,
149
  'aws_secret_key': None,
150
+ 'aws_region': os.getenv("AWS_REGION", ""),
151
  's3_bucket': bucket_name,
152
+
153
  # Set defaults for boolean flags
154
+ 'prepare_images': arguments.get('prepare_images', True),
155
+ 'compress_redacted_pdf': arguments.get('compress_redacted_pdf', False),
156
+ 'return_pdf_end_of_redaction': arguments.get('return_pdf_end_of_redaction', True)
157
  }
158
 
159
  # Download optional files if they are specified
160
+ allow_list_key = arguments.get('allow_list_file')
161
  if allow_list_key:
162
  allow_list_path = os.path.join(INPUT_DIR, 'allow_list.csv')
163
  download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
164
+ cli_args['allow_list_file'] = allow_list_path
165
 
166
+ deny_list_key = arguments.get('deny_list_file')
167
  if deny_list_key:
168
  deny_list_path = os.path.join(INPUT_DIR, 'deny_list.csv')
169
  download_file_from_s3(bucket_name, deny_list_key, deny_list_path)
170
+ cli_args['deny_list_file'] = deny_list_path
171
 
172
  # 5. Execute the main application logic
173
  try:
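
The expanded cli_args mapping above shows every argument the Lambda entrypoint now forwards to the CLI, with the same defaults as the CLI itself. The exact event schema is not visible in this diff, so the payload below is a hypothetical sketch: only the argument names come from the arguments.get(...) calls, while the top-level "arguments" nesting, the function name, and the file-location key are assumptions.

import json
import boto3

lambda_client = boto3.client("lambda", region_name="eu-west-2")

# Hypothetical event payload; argument names match the handler's arguments.get(...) calls
event = {
    "arguments": {
        "task": "redact",
        "ocr_method": "Tesseract OCR - all PDF types",
        "pii_detector": "Local",
        "page_min": 0,
        "page_max": 0,
        "allow_list_file": "input/allow_list.csv",  # S3 key; the handler downloads it to /tmp
    }
}

response = lambda_client.invoke(
    FunctionName="doc-redaction-lambda",  # placeholder function name
    Payload=json.dumps(event).encode("utf-8"),
)
print(json.loads(response["Payload"].read()))
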
pyproject.toml CHANGED
@@ -23,8 +23,8 @@ dependencies = [
23
  "spacy==3.8.7",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
- "gradio==5.45.0",
27
- "boto3==1.40.10",
28
  "pyarrow==21.0.0",
29
  "openpyxl==3.1.5",
30
  "Faker==37.5.3",
 
23
  "spacy==3.8.7",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
+ "gradio==5.46.1",
27
+ "boto3==1.40.31",
28
  "pyarrow==21.0.0",
29
  "openpyxl==3.1.5",
30
  "Faker==37.5.3",
requirements.txt CHANGED
@@ -10,9 +10,9 @@ pandas==2.3.2
10
  scikit-learn==1.7.1
11
  spacy==3.8.7
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
- gradio==5.45.0
14
  polars==1.33.1
15
- boto3==1.40.10
16
  pyarrow==21.0.0
17
  openpyxl==3.1.5
18
  Faker==37.5.3
 
10
  scikit-learn==1.7.1
11
  spacy==3.8.7
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
+ gradio==5.46.1
14
  polars==1.33.1
15
+ boto3==1.40.31
16
  pyarrow==21.0.0
17
  openpyxl==3.1.5
18
  Faker==37.5.3
tools/aws_functions.py CHANGED
@@ -171,7 +171,7 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCU
171
  final_out_message_str = "Could not upload files to S3 due to: " + str(e)
172
  print(final_out_message_str)
173
  else:
174
- final_out_message_str = "App not set to run AWS functions"
175
 
176
  return final_out_message_str
177
 
@@ -227,7 +227,7 @@ def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=
227
  final_out_message_str = "Could not upload files to S3 due to: " + str(e)
228
  print(final_out_message_str)
229
  else:
230
- final_out_message_str = "App not set to run AWS functions"
231
  print(final_out_message_str)
232
 
233
  return final_out_message_str
 
171
  final_out_message_str = "Could not upload files to S3 due to: " + str(e)
172
  print(final_out_message_str)
173
  else:
174
+        final_out_message_str = "App config will not run AWS functions"
175
 
176
  return final_out_message_str
177
 
 
227
  final_out_message_str = "Could not upload files to S3 due to: " + str(e)
228
  print(final_out_message_str)
229
  else:
230
+        final_out_message_str = "App config will not run AWS functions"
231
  print(final_out_message_str)
232
 
233
  return final_out_message_str
tools/aws_textract.py CHANGED
@@ -24,6 +24,8 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
24
  '''
25
  Analyse page with AWS Textract
26
  '''
 
 
27
  if client == "":
28
  try:
29
  if AWS_ACCESS_KEY and AWS_SECRET_KEY:
 
24
  '''
25
  Analyse page with AWS Textract
26
  '''
27
+
28
+ print("handwrite_signature_checkbox in analyse_page_with_textract:", handwrite_signature_checkbox)
29
  if client == "":
30
  try:
31
  if AWS_ACCESS_KEY and AWS_SECRET_KEY:
tools/cli_usage_logger.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CLI Usage Logger - A simplified version of the Gradio CSVLogger_custom for CLI usage logging.
3
+ This module provides functionality to log usage data from CLI operations to CSV files and optionally DynamoDB.
4
+ """
5
+
6
+ import csv
7
+ import os
8
+ import time
9
+ import uuid
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+ from typing import Any, List, Optional
13
+ import boto3
14
+ import botocore
15
+ from tools.aws_functions import upload_log_file_to_s3
16
+ from tools.config import (
17
+ USAGE_LOGS_FOLDER,
18
+ SAVE_LOGS_TO_CSV,
19
+ SAVE_LOGS_TO_DYNAMODB,
20
+ USAGE_LOG_DYNAMODB_TABLE_NAME,
21
+ DYNAMODB_USAGE_LOG_HEADERS,
22
+ CSV_USAGE_LOG_HEADERS,
23
+ DISPLAY_FILE_NAMES_IN_LOGS,
24
+ HOST_NAME,
25
+ AWS_REGION,
26
+ AWS_ACCESS_KEY,
27
+ AWS_SECRET_KEY,
28
+ RUN_AWS_FUNCTIONS,
29
+ S3_USAGE_LOGS_FOLDER,
30
+ DOCUMENT_REDACTION_BUCKET
31
+ )
32
+
33
+
34
+ class CLIUsageLogger:
35
+ """
36
+ A simplified usage logger for CLI operations that mimics the functionality
37
+ of the Gradio CSVLogger_custom class.
38
+ """
39
+
40
+ def __init__(self, dataset_file_name: str = "usage_log.csv"):
41
+ """
42
+ Initialize the CLI usage logger.
43
+
44
+ Args:
45
+ dataset_file_name: Name of the CSV file to store logs
46
+ """
47
+ self.dataset_file_name = dataset_file_name
48
+ self.flagging_dir = Path(USAGE_LOGS_FOLDER)
49
+ self.dataset_filepath = None
50
+ self.headers = None
51
+
52
+ def setup(self, headers: List[str]):
53
+ """
54
+ Setup the logger with the specified headers.
55
+
56
+ Args:
57
+ headers: List of column headers for the CSV file
58
+ """
59
+ self.headers = headers
60
+ self._create_dataset_file()
61
+
62
+ def _create_dataset_file(self):
63
+ """Create the dataset CSV file with headers if it doesn't exist."""
64
+ os.makedirs(self.flagging_dir, exist_ok=True)
65
+
66
+ # Add ID and timestamp to headers (matching custom_csvlogger.py structure)
67
+ full_headers = self.headers + ["id", "timestamp"]
68
+
69
+ self.dataset_filepath = self.flagging_dir / self.dataset_file_name
70
+
71
+ if not Path(self.dataset_filepath).exists():
72
+ with open(self.dataset_filepath, "w", newline="", encoding="utf-8") as csvfile:
73
+ writer = csv.writer(csvfile)
74
+ writer.writerow(full_headers)
75
+ print(f"Created usage log file at: {self.dataset_filepath}")
76
+ else:
77
+ print(f"Using existing usage log file at: {self.dataset_filepath}")
78
+
79
+ def log_usage(
80
+ self,
81
+ data: List[Any],
82
+ save_to_csv: bool = None,
83
+ save_to_dynamodb: bool = None,
84
+ save_to_s3: bool = None,
85
+ s3_bucket: str = None,
86
+ s3_key_prefix: str = None,
87
+ dynamodb_table_name: str = None,
88
+ dynamodb_headers: List[str] = None,
89
+ replacement_headers: List[str] = None
90
+ ) -> int:
91
+ """
92
+ Log usage data to CSV and optionally DynamoDB and S3.
93
+
94
+ Args:
95
+ data: List of data values to log
96
+ save_to_csv: Whether to save to CSV (defaults to config setting)
97
+ save_to_dynamodb: Whether to save to DynamoDB (defaults to config setting)
98
+ save_to_s3: Whether to save to S3 (defaults to config setting)
99
+ s3_bucket: S3 bucket name (defaults to config setting)
100
+ s3_key_prefix: S3 key prefix (defaults to config setting)
101
+ dynamodb_table_name: DynamoDB table name (defaults to config setting)
102
+ dynamodb_headers: DynamoDB headers (defaults to config setting)
103
+ replacement_headers: Replacement headers for CSV (defaults to config setting)
104
+
105
+ Returns:
106
+ Number of lines written
107
+ """
108
+ # Use config defaults if not specified
109
+ if save_to_csv is None:
110
+ save_to_csv = SAVE_LOGS_TO_CSV == 'True'
111
+ if save_to_dynamodb is None:
112
+ save_to_dynamodb = SAVE_LOGS_TO_DYNAMODB == 'True'
113
+ if save_to_s3 is None:
114
+ save_to_s3 = RUN_AWS_FUNCTIONS == "1" and SAVE_LOGS_TO_CSV == 'True'
115
+ if s3_bucket is None:
116
+ s3_bucket = DOCUMENT_REDACTION_BUCKET
117
+ if s3_key_prefix is None:
118
+ s3_key_prefix = S3_USAGE_LOGS_FOLDER
119
+ if dynamodb_table_name is None:
120
+ dynamodb_table_name = USAGE_LOG_DYNAMODB_TABLE_NAME
121
+ if dynamodb_headers is None:
122
+ dynamodb_headers = DYNAMODB_USAGE_LOG_HEADERS
123
+ if replacement_headers is None:
124
+ replacement_headers = CSV_USAGE_LOG_HEADERS
125
+
126
+ # Generate unique ID and add timestamp (matching custom_csvlogger.py structure)
127
+ generated_id = str(uuid.uuid4())
128
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
129
+ csv_data = data + [generated_id, timestamp]
130
+
131
+ line_count = 0
132
+
133
+ # Save to CSV
134
+ if save_to_csv and self.dataset_filepath:
135
+ try:
136
+ with open(self.dataset_filepath, "a", newline="", encoding="utf-8-sig") as csvfile:
137
+ writer = csv.writer(csvfile)
138
+ writer.writerow(csv_data)
139
+ line_count = 1
140
+ print(f"Logged usage data to CSV: {self.dataset_filepath}")
141
+ except Exception as e:
142
+ print(f"Error writing to CSV: {e}")
143
+
144
+ # Upload to S3 if enabled
145
+ if save_to_s3 and self.dataset_filepath and s3_bucket and s3_key_prefix:
146
+ try:
147
+ # Upload the log file to S3
148
+ upload_result = upload_log_file_to_s3(
149
+ local_file_paths=[str(self.dataset_filepath)],
150
+ s3_key=s3_key_prefix,
151
+ s3_bucket=s3_bucket,
152
+ RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS,
153
+ SAVE_LOGS_TO_CSV=SAVE_LOGS_TO_CSV
154
+ )
155
+ print(f"S3 upload result: {upload_result}")
156
+ except Exception as e:
157
+ print(f"Error uploading log file to S3: {e}")
158
+
159
+ # Save to DynamoDB
160
+ if save_to_dynamodb and dynamodb_table_name and dynamodb_headers:
161
+ try:
162
+ # Initialize DynamoDB client
163
+ if AWS_ACCESS_KEY and AWS_SECRET_KEY:
164
+ dynamodb = boto3.resource(
165
+ 'dynamodb',
166
+ region_name=AWS_REGION,
167
+ aws_access_key_id=AWS_ACCESS_KEY,
168
+ aws_secret_access_key=AWS_SECRET_KEY
169
+ )
170
+ else:
171
+ dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)
172
+
173
+ table = dynamodb.Table(dynamodb_table_name)
174
+
175
+ # Generate unique ID
176
+ generated_id = str(uuid.uuid4())
177
+
178
+ # Prepare the DynamoDB item
179
+ item = {
180
+ 'id': generated_id,
181
+ 'timestamp': timestamp,
182
+ }
183
+
184
+ # Map the headers to values
185
+ item.update({header: str(value) for header, value in zip(dynamodb_headers, data)})
186
+
187
+ table.put_item(Item=item)
188
+ print("Successfully uploaded usage log to DynamoDB")
189
+
190
+ except Exception as e:
191
+ print(f"Could not upload usage log to DynamoDB: {e}")
192
+
193
+ return line_count
194
+
195
+
196
+ def create_cli_usage_logger() -> CLIUsageLogger:
197
+ """
198
+ Create and setup a CLI usage logger with the standard headers.
199
+
200
+ Returns:
201
+ Configured CLIUsageLogger instance
202
+ """
203
+ # Parse CSV headers from config
204
+ import json
205
+ try:
206
+ headers = json.loads(CSV_USAGE_LOG_HEADERS)
207
+    except Exception:
208
+ # Fallback headers if parsing fails
209
+ headers = [
210
+ "session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox",
211
+ "actual_time_taken_number", "total_page_count", "textract_query_number",
212
+ "pii_detection_method", "comprehend_query_number", "cost_code",
213
+ "textract_handwriting_signature", "host_name_textbox", "text_extraction_method",
214
+ "is_this_a_textract_api_call", "task"
215
+ ]
216
+
217
+ logger = CLIUsageLogger()
218
+ logger.setup(headers)
219
+ return logger
220
+
221
+
222
+ def log_redaction_usage(
223
+ logger: CLIUsageLogger,
224
+ session_hash: str,
225
+ doc_file_name: str,
226
+ data_file_name: str,
227
+ time_taken: float,
228
+ total_pages: int,
229
+ textract_queries: int,
230
+ pii_method: str,
231
+ comprehend_queries: int,
232
+ cost_code: str,
233
+ handwriting_signature: str,
234
+ text_extraction_method: str,
235
+ is_textract_call: bool,
236
+ task: str,
237
+ save_to_dynamodb: bool = None,
238
+ save_to_s3: bool = None,
239
+ s3_bucket: str = None,
240
+ s3_key_prefix: str = None
241
+ ):
242
+ """
243
+ Log redaction usage data using the provided logger.
244
+
245
+ Args:
246
+ logger: CLIUsageLogger instance
247
+ session_hash: Session identifier
248
+ doc_file_name: Document file name (or placeholder if not displaying names)
249
+ data_file_name: Data file name (or placeholder if not displaying names)
250
+ time_taken: Time taken for processing in seconds
251
+ total_pages: Total number of pages processed
252
+ textract_queries: Number of Textract API calls made
253
+ pii_method: PII detection method used
254
+ comprehend_queries: Number of Comprehend API calls made
255
+ cost_code: Cost code for the operation
256
+ handwriting_signature: Handwriting/signature extraction options
257
+ text_extraction_method: Text extraction method used
258
+ is_textract_call: Whether this was a Textract API call
259
+ task: The task performed (redact, deduplicate, textract)
260
+ save_to_dynamodb: Whether to save to DynamoDB (overrides config default)
261
+ save_to_s3: Whether to save to S3 (overrides config default)
262
+ s3_bucket: S3 bucket name (overrides config default)
263
+ s3_key_prefix: S3 key prefix (overrides config default)
264
+ """
265
+ # Use placeholder names if not displaying file names in logs
266
+ if DISPLAY_FILE_NAMES_IN_LOGS != 'True':
267
+ if doc_file_name:
268
+ doc_file_name = "document"
269
+ data_file_name = ""
270
+ if data_file_name:
271
+ data_file_name = "data_file"
272
+ doc_file_name = ""
273
+ else:
274
+ doc_file_name = doc_file_name
275
+ data_file_name = data_file_name
276
+
277
+ rounded_time_taken = round(time_taken, 2)
278
+
279
+ data = [
280
+ session_hash,
281
+ doc_file_name,
282
+ data_file_name,
283
+ rounded_time_taken,
284
+ total_pages,
285
+ textract_queries,
286
+ pii_method,
287
+ comprehend_queries,
288
+ cost_code,
289
+ handwriting_signature,
290
+ HOST_NAME,
291
+ text_extraction_method,
292
+ is_textract_call,
293
+ task
294
+ ]
295
+
296
+ logger.log_usage(
297
+ data,
298
+ save_to_dynamodb=save_to_dynamodb,
299
+ save_to_s3=save_to_s3,
300
+ s3_bucket=s3_bucket,
301
+ s3_key_prefix=s3_key_prefix
302
+ )
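
A minimal sketch of driving the new logger from a CLI run, assuming the repository root is on the Python path; the values passed are illustrative only and mirror the parameters defined by log_redaction_usage above.

from tools.cli_usage_logger import create_cli_usage_logger, log_redaction_usage

# Creates (or reuses) the usage CSV under USAGE_LOGS_FOLDER with the configured headers
logger = create_cli_usage_logger()

log_redaction_usage(
    logger,
    session_hash="cli_1234",
    doc_file_name="example.pdf",
    data_file_name="",
    time_taken=12.3,
    total_pages=5,
    textract_queries=0,
    pii_method="Local",
    comprehend_queries=0,
    cost_code="",
    handwriting_signature="",
    text_extraction_method="Tesseract OCR - all PDF types",
    is_textract_call=False,
    task="redact",
)
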
tools/config.py CHANGED
@@ -34,7 +34,7 @@ def add_folder_to_path(folder_path: str):
34
  '''
35
 
36
  if os.path.exists(folder_path) and os.path.isdir(folder_path):
37
- print(folder_path, "folder exists.")
38
 
39
  # Resolve relative path to absolute path
40
  absolute_path = os.path.abspath(folder_path)
@@ -45,7 +45,8 @@ def add_folder_to_path(folder_path: str):
45
  os.environ['PATH'] = full_path_extension
46
  #print(f"Updated PATH with: ", full_path_extension)
47
  else:
48
- print(f"Directory {folder_path} already exists in PATH.")
 
49
  else:
50
  print(f"Folder not found at {folder_path} - not added to PATH")
51
 
@@ -88,13 +89,16 @@ AWS_CLIENT_SECRET = get_or_create_env_var('AWS_CLIENT_SECRET', '')
88
  AWS_USER_POOL_ID = get_or_create_env_var('AWS_USER_POOL_ID', '')
89
 
90
  AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
91
- if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
92
 
93
  AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
94
- if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
95
 
96
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
97
 
 
 
 
98
  # Custom headers e.g. if routing traffic through Cloudfront
99
  # Retrieving or setting CUSTOM_HEADER
100
  CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
@@ -164,7 +168,7 @@ DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS',
164
  # Further customisation options for CSV logs
165
  CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
166
  CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
167
- CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
168
 
169
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
170
  SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'False')
@@ -275,7 +279,7 @@ DEFAULT_TABULAR_ANONYMISATION_STRATEGY = get_or_create_env_var('DEFAULT_TABULAR_
275
  ### Local OCR model - Tesseract vs PaddleOCR
276
  CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var('CHOSEN_LOCAL_OCR_MODEL', "tesseract") # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence.
277
 
278
- PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var('PREPROCESS_LOCAL_OCR_IMAGES', "False") # Whether to try and preprocess images before extracting text. NOTE: I have found in testing that this often results in WORSE results for scanned pages, so it is default False
279
 
280
  # Entities for redaction
281
  CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
@@ -287,6 +291,10 @@ CHOSEN_REDACT_ENTITIES = get_or_create_env_var('CHOSEN_REDACT_ENTITIES', "['TITL
287
 
288
  FULL_ENTITY_LIST = get_or_create_env_var('FULL_ENTITY_LIST', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']")
289
 
 
 
 
 
290
  DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var('DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX', "['Extract handwriting']")
291
 
292
  DEFAULT_SEARCH_QUERY = get_or_create_env_var('DEFAULT_SEARCH_QUERY', '')
@@ -327,8 +335,9 @@ LANGUAGE_CHOICES = get_or_create_env_var("LANGUAGE_CHOICES", "['en', 'fr', 'de',
327
  DEFAULT_DUPLICATE_DETECTION_THRESHOLD = float(get_or_create_env_var("DEFAULT_DUPLICATE_DETECTION_THRESHOLD", "0.95"))
328
  DEFAULT_MIN_CONSECUTIVE_PAGES = int(get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1"))
329
  USE_GREEDY_DUPLICATE_DETECTION = get_or_create_env_var("USE_GREEDY_DUPLICATE_DETECTION", "True")
330
- DEFAULT_COMBINE_PAGES = get_or_create_env_var("DEFAULT_COMBINE_PAGES", "True")
331
  DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
 
332
 
333
 
334
  ###
@@ -352,6 +361,7 @@ COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
352
  RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
353
 
354
  # Direct mode configuration options
 
355
  DIRECT_MODE_TASK = get_or_create_env_var('DIRECT_MODE_TASK', 'redact') # 'redact' or 'deduplicate'
356
  DIRECT_MODE_INPUT_FILE = get_or_create_env_var('DIRECT_MODE_INPUT_FILE', '') # Path to input file
357
  DIRECT_MODE_OUTPUT_DIR = get_or_create_env_var('DIRECT_MODE_OUTPUT_DIR', OUTPUT_FOLDER) # Output directory
@@ -447,4 +457,4 @@ TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_INPUT_LOC',
447
 
448
  TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
449
 
450
- DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '7') # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
 
34
  '''
35
 
36
  if os.path.exists(folder_path) and os.path.isdir(folder_path):
37
+ #print(folder_path, "folder exists.")
38
 
39
  # Resolve relative path to absolute path
40
  absolute_path = os.path.abspath(folder_path)
 
45
  os.environ['PATH'] = full_path_extension
46
  #print(f"Updated PATH with: ", full_path_extension)
47
  else:
48
+ pass
49
+ #print(f"Directory {folder_path} already exists in PATH.")
50
  else:
51
  print(f"Folder not found at {folder_path} - not added to PATH")
52
 
 
89
  AWS_USER_POOL_ID = get_or_create_env_var('AWS_USER_POOL_ID', '')
90
 
91
  AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
92
+ #if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
93
 
94
  AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
95
+ #if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
96
 
97
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
98
 
99
+ # Should the app prioritise using AWS SSO over using API keys stored in environment variables/secrets (defaults to yes)
100
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS = get_or_create_env_var('PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS', '1')
101
+
102
  # Custom headers e.g. if routing traffic through Cloudfront
103
  # Retrieving or setting CUSTOM_HEADER
104
  CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
 
168
  # Further customisation options for CSV logs
169
  CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
170
  CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
171
+ CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call", "task"]') # If blank, uses component labels
172
 
173
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
174
  SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'False')
 
279
  ### Local OCR model - Tesseract vs PaddleOCR
280
  CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var('CHOSEN_LOCAL_OCR_MODEL', "tesseract") # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence.
281
 
282
+ PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var('PREPROCESS_LOCAL_OCR_IMAGES', "True") # Whether to try and preprocess images before extracting text. NOTE: I have found in testing that this often results in WORSE results for scanned pages, so you may want to set this to False for scanned documents.
283
 
284
  # Entities for redaction
285
  CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
 
291
 
292
  FULL_ENTITY_LIST = get_or_create_env_var('FULL_ENTITY_LIST', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']")
293
 
294
+ CUSTOM_ENTITIES = get_or_create_env_var('CUSTOM_ENTITIES', "['TITLES', 'UKPOSTCODE', 'STREETNAME', 'CUSTOM']")
295
+
296
+
297
+
298
  DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var('DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX', "['Extract handwriting']")
299
 
300
  DEFAULT_SEARCH_QUERY = get_or_create_env_var('DEFAULT_SEARCH_QUERY', '')
 
335
  DEFAULT_DUPLICATE_DETECTION_THRESHOLD = float(get_or_create_env_var("DEFAULT_DUPLICATE_DETECTION_THRESHOLD", "0.95"))
336
  DEFAULT_MIN_CONSECUTIVE_PAGES = int(get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1"))
337
  USE_GREEDY_DUPLICATE_DETECTION = get_or_create_env_var("USE_GREEDY_DUPLICATE_DETECTION", "True")
338
+ DEFAULT_COMBINE_PAGES = get_or_create_env_var("DEFAULT_COMBINE_PAGES", "True") # Combine text from the same page number within a file. Setting this to False enables line-level duplicate detection.
339
  DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
340
+ REMOVE_DUPLICATE_ROWS = get_or_create_env_var("REMOVE_DUPLICATE_ROWS", "False")
341
 
342
 
343
  ###
 
361
  RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
362
 
363
  # Direct mode configuration options
364
+ DIRECT_MODE_DEFAULT_USER = get_or_create_env_var('DIRECT_MODE_DEFAULT_USER', '') # Default username for cli/direct mode requests
365
  DIRECT_MODE_TASK = get_or_create_env_var('DIRECT_MODE_TASK', 'redact') # 'redact' or 'deduplicate'
366
  DIRECT_MODE_INPUT_FILE = get_or_create_env_var('DIRECT_MODE_INPUT_FILE', '') # Path to input file
367
  DIRECT_MODE_OUTPUT_DIR = get_or_create_env_var('DIRECT_MODE_OUTPUT_DIR', OUTPUT_FOLDER) # Output directory
 
457
 
458
  TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
459
 
460
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '7')) # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
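
Because tools/config.py resolves each setting at import time through get_or_create_env_var, overrides have to be in the environment before the first import of tools.config. A minimal sketch, assuming get_or_create_env_var reads the process environment first (as its name suggests); the values shown are illustrative.

import os

# Set overrides before tools.config is imported anywhere in the process
os.environ["PREPROCESS_LOCAL_OCR_IMAGES"] = "False"   # skip OCR image preprocessing
os.environ["DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS"] = "14"

from tools.config import PREPROCESS_LOCAL_OCR_IMAGES, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS

print(PREPROCESS_LOCAL_OCR_IMAGES, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
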
tools/custom_csvlogger.py CHANGED
@@ -15,7 +15,7 @@ from pathlib import Path
15
  from typing import TYPE_CHECKING, Any
16
  from gradio_client import utils as client_utils
17
  import gradio as gr
18
- from gradio import utils, wasm_utils
19
  from tools.config import AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
20
 
21
 
@@ -56,9 +56,7 @@ class CSVLogger_custom(FlaggingCallback):
56
  self.simplify_file_data = simplify_file_data
57
  self.verbose = verbose
58
  self.dataset_file_name = dataset_file_name
59
- self.lock = (
60
- Lock() if not wasm_utils.IS_WASM else contextlib.nullcontext()
61
- ) # The multiprocessing module doesn't work on Lite.
62
 
63
  def setup(
64
  self,
 
15
  from typing import TYPE_CHECKING, Any
16
  from gradio_client import utils as client_utils
17
  import gradio as gr
18
+ from gradio import utils
19
  from tools.config import AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
20
 
21
 
 
56
  self.simplify_file_data = simplify_file_data
57
  self.verbose = verbose
58
  self.dataset_file_name = dataset_file_name
59
+ self.lock = Lock()
 
 
60
 
61
  def setup(
62
  self,
tools/custom_image_analyser_engine.py CHANGED
@@ -260,10 +260,12 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
260
  adjusted_contrast = contrast
261
  return adjusted_image, contrast, adjusted_contrast
262
 
263
- def preprocess_image(self, image: Image.Image) -> Tuple[Image.Image, dict]:
264
  """
265
  A corrected, logical pipeline for OCR preprocessing.
266
  Order: Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
 
 
267
  """
268
  # 1. Convert to greyscale NumPy array
269
  image_np = self.convert_image_to_array(image)
@@ -278,9 +280,13 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
278
  adjusted_image_np, _, _ = self._improve_contrast(filtered_image_np)
279
 
280
  # 5. Adaptive Thresholding (Binarization) - This is the final step
281
- final_image_np, threshold_metadata = self.adaptive_threshold.preprocess_image(
282
- adjusted_image_np
283
- )
 
 
 
 
284
 
285
  # Combine metadata
286
  final_metadata = {**scale_metadata, **threshold_metadata}
 
260
  adjusted_contrast = contrast
261
  return adjusted_image, contrast, adjusted_contrast
262
 
263
+ def preprocess_image(self, image: Image.Image, perform_binarization: bool = False) -> Tuple[Image.Image, dict]:
264
  """
265
  A corrected, logical pipeline for OCR preprocessing.
266
  Order: Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
267
+
268
+ I have found that binarization is not always helpful with Tesseract, and can sometimes degrade results. So it is off by default.
269
  """
270
  # 1. Convert to greyscale NumPy array
271
  image_np = self.convert_image_to_array(image)
 
280
  adjusted_image_np, _, _ = self._improve_contrast(filtered_image_np)
281
 
282
  # 5. Adaptive Thresholding (Binarization) - This is the final step
283
+ if perform_binarization:
284
+ final_image_np, threshold_metadata = self.adaptive_threshold.preprocess_image(
285
+ adjusted_image_np
286
+ )
287
+ else:
288
+ final_image_np = adjusted_image_np
289
+ threshold_metadata = {}
290
 
291
  # Combine metadata
292
  final_metadata = {**scale_metadata, **threshold_metadata}
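
A minimal sketch of the new optional binarization step, assuming ContrastSegmentedImageEnhancer can be constructed without arguments (its constructor is not shown in this diff) and using one of the example images added in this commit.

from PIL import Image
from tools.custom_image_analyser_engine import ContrastSegmentedImageEnhancer

enhancer = ContrastSegmentedImageEnhancer()
image = Image.open("example_data/example_complaint_letter.jpg")

# Default behaviour after this change: greyscale, rescale, denoise and contrast only
cleaned, metadata = enhancer.preprocess_image(image)

# Opt back in to adaptive thresholding where it helps
binarized, metadata_bin = enhancer.preprocess_image(image, perform_binarization=True)
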
tools/data_anonymise.py CHANGED
@@ -18,15 +18,19 @@ from botocore.client import BaseClient
18
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
19
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
20
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
21
- from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN
22
- from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
23
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities, create_nlp_analyser, load_spacy_model
24
  # Use custom version of analyze_dict to be able to track progress
25
  from tools.presidio_analyzer_custom import analyze_dict
26
 
27
  if DO_INITIAL_TABULAR_DATA_CLEAN == "True": DO_INITIAL_TABULAR_DATA_CLEAN = True
28
  else: DO_INITIAL_TABULAR_DATA_CLEAN = False
29
 
 
 
 
 
30
  fake = Faker("en_UK")
31
  def fake_first_name(x):
32
  return fake.first_name()
@@ -233,7 +237,7 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
233
  def handle_docx_anonymisation(
234
  file_path: str,
235
  output_folder: str,
236
- anon_strat: str,
237
  chosen_redact_entities: List[str],
238
  in_allow_list: List[str],
239
  in_deny_list: List[str],
@@ -274,15 +278,15 @@ def handle_docx_anonymisation(
274
  # If there's no text to process, return early
275
  if not original_texts:
276
  print(f"No text found in {file_path}. Skipping.")
277
- return None, None
278
 
279
  # 2. Convert to a DataFrame for the existing anonymisation script
280
  df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})
281
 
282
  # 3. Call the core anonymisation script
283
- anonymised_df, _, decision_log = anonymise_script(
284
  df=df_to_anonymise,
285
- anon_strat=anon_strat,
286
  language=language,
287
  chosen_redact_entities=chosen_redact_entities,
288
  in_allow_list=in_allow_list,
@@ -322,11 +326,11 @@ def handle_docx_anonymisation(
322
  with open(log_file_path, "w", encoding="utf-8-sig") as f:
323
  f.write(decision_log)
324
 
325
- return output_docx_path, log_file_path, output_xlsx_path
326
 
327
  def anonymise_files_with_open_text(file_paths: List[str],
328
  in_text: str,
329
- anon_strat: str,
330
  chosen_cols: List[str],
331
  chosen_redact_entities: List[str],
332
  in_allow_list: List[str] = None,
@@ -354,7 +358,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
354
  Parameters:
355
  - file_paths (List[str]): A list of file paths to anonymise: '.xlsx', '.xls', '.csv', '.parquet', or '.docx'.
356
  - in_text (str): The text to anonymise if file_paths is 'open_text'.
357
- - anon_strat (str): The anonymisation strategy to use.
358
  - chosen_cols (List[str]): A list of column names to anonymise.
359
  - language (str): The language of the text to anonymise.
360
  - chosen_redact_entities (List[str]): A list of entities to redact.
@@ -381,6 +385,9 @@ def anonymise_files_with_open_text(file_paths: List[str],
381
 
382
  tic = time.perf_counter()
383
  comprehend_client = ""
 
 
 
384
 
385
  # Use provided language or default
386
  language = language or DEFAULT_LANGUAGE
@@ -427,7 +434,10 @@ def anonymise_files_with_open_text(file_paths: List[str],
427
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
428
  if pii_identification_method == "AWS Comprehend":
429
  print("Trying to connect to AWS Comprehend service")
430
- if aws_access_key_textbox and aws_secret_key_textbox:
 
 
 
431
  print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
432
  print("aws_access_key_textbox:", aws_access_key_textbox)
433
  print("aws_secret_access_key:", aws_secret_key_textbox)
@@ -459,14 +469,18 @@ def anonymise_files_with_open_text(file_paths: List[str],
459
  if latest_file_completed >= len(file_paths):
460
  print("Last file reached") #, returning files:", str(latest_file_completed))
461
  # Set to a very high number so as not to mess with subsequent file processing by the user
462
- latest_file_completed = 99
463
  final_out_message = '\n'.join(out_message)
464
- return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number
465
 
466
  file_path_loop = [file_paths[int(latest_file_completed)]]
467
-
468
  for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "files"):
469
 
 
 
 
 
470
  if anon_file=='open_text':
471
  anon_df = pd.DataFrame(data={'text':[in_text]})
472
  chosen_cols=['text']
@@ -474,19 +488,19 @@ def anonymise_files_with_open_text(file_paths: List[str],
474
  sheet_name = ""
475
  file_type = ""
476
 
477
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=OUTPUT_FOLDER, do_initial_clean=do_initial_clean)
478
  else:
479
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
480
- file_type = detect_file_type(anon_file)
481
  print("File type is:", file_type)
482
 
483
- out_file_part = get_file_name_without_type(anon_file.name)
484
 
485
  if file_type == 'docx':
486
- output_path, log_path, output_xlsx_path = handle_docx_anonymisation(
487
- file_path=anon_file.name, # .name if it's a temp file object
488
  output_folder=output_folder,
489
- anon_strat=anon_strat,
490
  chosen_redact_entities=chosen_redact_entities,
491
  in_allow_list=in_allow_list_flat,
492
  in_deny_list=in_deny_list,
@@ -512,7 +526,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
512
  continue
513
 
514
  # Create xlsx file:
515
- anon_xlsx = pd.ExcelFile(anon_file)
516
  anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
517
 
518
 
@@ -523,16 +537,16 @@ def anonymise_files_with_open_text(file_paths: List[str],
523
  if sheet_name not in anon_xlsx.sheet_names:
524
  continue
525
 
526
- anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
527
 
528
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean)
529
 
530
  else:
531
  sheet_name = ""
532
- anon_df = read_file(anon_file)
533
- out_file_part = get_file_name_without_type(anon_file.name)
534
 
535
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean)
536
 
537
  # Increase latest file completed count unless we are at the last file
538
  if latest_file_completed != len(file_paths):
@@ -554,14 +568,14 @@ def anonymise_files_with_open_text(file_paths: List[str],
554
  out_message_out = '\n'.join(out_message)
555
  out_message_out = out_message_out + " " + out_time
556
 
557
- if anon_strat == "encrypt":
558
  out_message_out.append(". Your decryption key is " + key_string)
559
 
560
  out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
561
 
562
  out_message_out = re.sub(r'^\n+|^\. ', '', out_message_out).strip()
563
 
564
- return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number
565
 
566
  def tabular_anonymise_wrapper_func(
567
  anon_file: str,
@@ -571,7 +585,7 @@ def tabular_anonymise_wrapper_func(
571
  out_file_part: str,
572
  out_message: str,
573
  excel_sheet_name: str,
574
- anon_strat: str,
575
  language: str,
576
  chosen_redact_entities: List[str],
577
  in_allow_list: List[str],
@@ -600,7 +614,7 @@ def tabular_anonymise_wrapper_func(
600
  - out_file_part: A part of the output file name.
601
  - out_message: A message to be displayed during the anonymization process.
602
  - excel_sheet_name: The name of the Excel sheet where the anonymized data will be exported.
603
- - anon_strat: The anonymization strategy to be applied.
604
  - language: The language of the data to be anonymized.
605
  - chosen_redact_entities: A list of entities to be redacted.
606
  - in_allow_list: A list of allowed values.
@@ -648,7 +662,7 @@ def tabular_anonymise_wrapper_func(
648
  out_message = "No chosen columns found in dataframe: " + out_file_part
649
  key_string = ""
650
  print(out_message)
651
- return out_file_paths, out_message, key_string, log_files_output_paths
652
  else:
653
  chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
654
 
@@ -663,7 +677,7 @@ def tabular_anonymise_wrapper_func(
663
 
664
 
665
  # Anonymise the selected columns
666
- anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser, do_initial_clean=do_initial_clean)
667
 
668
  anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
669
 
@@ -673,10 +687,10 @@ def tabular_anonymise_wrapper_func(
673
 
674
  # Export file
675
  # Rename anonymisation strategy for file path naming
676
- if anon_strat == "replace with 'REDACTED'": anon_strat_txt = "redact_replace"
677
- elif anon_strat == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
678
- elif anon_strat == "redact completely": anon_strat_txt = "redact_remove"
679
- else: anon_strat_txt = anon_strat
680
 
681
  # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
682
  if file_type == 'xlsx':
@@ -716,10 +730,10 @@ def tabular_anonymise_wrapper_func(
716
  if anon_file=='open_text':
717
  out_message = ["'" + anon_df_out['text'][0] + "'"]
718
 
719
- return out_file_paths, out_message, key_string, log_files_output_paths
720
 
721
  def anonymise_script(df:pd.DataFrame,
722
- anon_strat:str,
723
  language:str,
724
  chosen_redact_entities:List[str],
725
  in_allow_list:List[str]=list(),
@@ -738,7 +752,7 @@ def anonymise_script(df:pd.DataFrame,
738
 
739
  Args:
740
  df (pd.DataFrame): The input DataFrame containing text to be anonymised.
741
- anon_strat (str): The anonymisation strategy to apply (e.g., "replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely").
742
  language (str): The language of the text for analysis (e.g., "en", "es").
743
  chosen_redact_entities (List[str]): A list of entity types to redact using the local (Presidio) method.
744
  in_allow_list (List[str], optional): A list of terms to explicitly allow and not redact. Defaults to an empty list.
@@ -948,12 +962,15 @@ def anonymise_script(df:pd.DataFrame,
948
  people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
949
  fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
950
 
951
- if anon_strat == "replace with 'REDACTED'": chosen_mask_config = simple_replace_config
952
- if anon_strat == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
953
- if anon_strat == "redact completely": chosen_mask_config = redact_config
954
- if anon_strat == "hash": chosen_mask_config = hash_config
955
- if anon_strat == "mask": chosen_mask_config = mask_config
956
- if anon_strat == "encrypt":
 
 
 
957
  chosen_mask_config = people_encrypt_config
958
  key = secrets.token_bytes(16) # 128 bits = 16 bytes
959
  key_string = base64.b64encode(key).decode('utf-8')
@@ -962,7 +979,10 @@ def anonymise_script(df:pd.DataFrame,
962
  for entity, operator in chosen_mask_config.items():
963
  if operator.operator_name == "encrypt":
964
  operator.params = {"key": key_string}
965
- elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
 
 
 
966
 
967
  # I think in general people will want to keep date / times - removed Mar 2025 as I don't want to assume for people.
968
  #keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
@@ -973,4 +993,4 @@ def anonymise_script(df:pd.DataFrame,
973
 
974
  scrubbed_df = pd.DataFrame(anonymizer_results)
975
 
976
- return scrubbed_df, key_string, decision_process_output_str
 
18
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
19
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
20
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
21
+ from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS, AWS_REGION
22
+ from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type, _get_env_list
23
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, create_nlp_analyser, load_spacy_model
24
  # Use custom version of analyze_dict to be able to track progress
25
  from tools.presidio_analyzer_custom import analyze_dict
26
 
27
  if DO_INITIAL_TABULAR_DATA_CLEAN == "True": DO_INITIAL_TABULAR_DATA_CLEAN = True
28
  else: DO_INITIAL_TABULAR_DATA_CLEAN = False
29
 
30
+ if CUSTOM_ENTITIES: CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
31
+
32
+ custom_entities = CUSTOM_ENTITIES
33
+
34
  fake = Faker("en_UK")
35
  def fake_first_name(x):
36
  return fake.first_name()
 
237
  def handle_docx_anonymisation(
238
  file_path: str,
239
  output_folder: str,
240
+ anon_strategy: str,
241
  chosen_redact_entities: List[str],
242
  in_allow_list: List[str],
243
  in_deny_list: List[str],
 
278
  # If there's no text to process, return early
279
  if not original_texts:
280
  print(f"No text found in {file_path}. Skipping.")
281
+ return None, None, None, 0
282
 
283
  # 2. Convert to a DataFrame for the existing anonymisation script
284
  df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})
285
 
286
  # 3. Call the core anonymisation script
287
+ anonymised_df, _, decision_log, comprehend_query_number = anonymise_script(
288
  df=df_to_anonymise,
289
+ anon_strategy=anon_strategy,
290
  language=language,
291
  chosen_redact_entities=chosen_redact_entities,
292
  in_allow_list=in_allow_list,
 
326
  with open(log_file_path, "w", encoding="utf-8-sig") as f:
327
  f.write(decision_log)
328
 
329
+ return output_docx_path, log_file_path, output_xlsx_path, comprehend_query_number
330
 
331
  def anonymise_files_with_open_text(file_paths: List[str],
332
  in_text: str,
333
+ anon_strategy: str,
334
  chosen_cols: List[str],
335
  chosen_redact_entities: List[str],
336
  in_allow_list: List[str] = None,
 
358
  Parameters:
359
  - file_paths (List[str]): A list of file paths to anonymise: '.xlsx', '.xls', '.csv', '.parquet', or '.docx'.
360
  - in_text (str): The text to anonymise if file_paths is 'open_text'.
361
+ - anon_strategy (str): The anonymisation strategy to use.
362
  - chosen_cols (List[str]): A list of column names to anonymise.
363
  - language (str): The language of the text to anonymise.
364
  - chosen_redact_entities (List[str]): A list of entities to redact.
 
385
 
386
  tic = time.perf_counter()
387
  comprehend_client = ""
388
+
389
+ # If output folder doesn't end with a forward slash, add one
390
+ if not output_folder.endswith('/'): output_folder = output_folder + '/'
391
 
392
  # Use provided language or default
393
  language = language or DEFAULT_LANGUAGE
 
434
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
435
  if pii_identification_method == "AWS Comprehend":
436
  print("Trying to connect to AWS Comprehend service")
437
+ if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
438
+ print("Connecting to Comprehend via existing SSO connection")
439
+ comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)
440
+ elif aws_access_key_textbox and aws_secret_key_textbox:
441
  print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
442
  print("aws_access_key_textbox:", aws_access_key_textbox)
443
  print("aws_secret_access_key:", aws_secret_key_textbox)
 
469
  if latest_file_completed >= len(file_paths):
470
  print("Last file reached") #, returning files:", str(latest_file_completed))
471
  # Set to a very high number so as not to mess with subsequent file processing by the user
472
+ #latest_file_completed = 99
473
  final_out_message = '\n'.join(out_message)
474
+ return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number, comprehend_query_number
475
 
476
  file_path_loop = [file_paths[int(latest_file_completed)]]
477
+
478
  for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "files"):
479
 
480
+ # Get a string file path
481
+ if isinstance(anon_file, str): file_path = anon_file
482
+ else: file_path = anon_file
483
+
484
  if anon_file=='open_text':
485
  anon_df = pd.DataFrame(data={'text':[in_text]})
486
  chosen_cols=['text']
 
488
  sheet_name = ""
489
  file_type = ""
490
 
491
+ out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number = tabular_anonymise_wrapper_func(file_path, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strategy, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=OUTPUT_FOLDER, do_initial_clean=do_initial_clean)
492
  else:
493
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
494
+ file_type = detect_file_type(file_path)
495
  print("File type is:", file_type)
496
 
497
+ out_file_part = get_file_name_without_type(file_path)
498
 
499
  if file_type == 'docx':
500
+ output_path, log_path, output_xlsx_path, comprehend_query_number = handle_docx_anonymisation(
501
+ file_path=file_path,
502
  output_folder=output_folder,
503
+ anon_strategy=anon_strategy,
504
  chosen_redact_entities=chosen_redact_entities,
505
  in_allow_list=in_allow_list_flat,
506
  in_deny_list=in_deny_list,
 
526
  continue
527
 
528
  # Create xlsx file:
529
+ anon_xlsx = pd.ExcelFile(file_path)
530
  anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
531
 
532
 
 
537
  if sheet_name not in anon_xlsx.sheet_names:
538
  continue
539
 
540
+ anon_df = pd.read_excel(file_path, sheet_name=sheet_name)
541
 
542
+ out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strategy, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean)
543
 
544
  else:
545
  sheet_name = ""
546
+ anon_df = read_file(file_path)
547
+ out_file_part = get_file_name_without_type(file_path)
548
 
549
+ out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strategy, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean)
550
 
551
  # Increase latest file completed count unless we are at the last file
552
  if latest_file_completed != len(file_paths):
 
568
  out_message_out = '\n'.join(out_message)
569
  out_message_out = out_message_out + " " + out_time
570
 
571
+ if anon_strategy == "encrypt":
572
  out_message_out = out_message_out + ". Your decryption key is " + key_string
573
 
574
  out_message_out = out_message_out + "\n\nGo to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
575
 
576
  out_message_out = re.sub(r'^\n+|^\. ', '', out_message_out).strip()
577
 
578
+ return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number, comprehend_query_number
579
 
580
  def tabular_anonymise_wrapper_func(
581
  anon_file: str,
 
585
  out_file_part: str,
586
  out_message: str,
587
  excel_sheet_name: str,
588
+ anon_strategy: str,
589
  language: str,
590
  chosen_redact_entities: List[str],
591
  in_allow_list: List[str],
 
614
  - out_file_part: A part of the output file name.
615
  - out_message: A message to be displayed during the anonymization process.
616
  - excel_sheet_name: The name of the Excel sheet where the anonymized data will be exported.
617
+ - anon_strategy: The anonymization strategy to be applied.
618
  - language: The language of the data to be anonymized.
619
  - chosen_redact_entities: A list of entities to be redacted.
620
  - in_allow_list: A list of allowed values.
 
662
  out_message = "No chosen columns found in dataframe: " + out_file_part
663
  key_string = ""
664
  print(out_message)
665
+ return out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number
666
  else:
667
  chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
668
 
 
677
 
678
 
679
  # Anonymise the selected columns
680
+ anon_df_part_out, key_string, decision_process_output_str, comprehend_query_number = anonymise_script(anon_df_part, anon_strategy, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser, do_initial_clean=do_initial_clean)
681
 
682
  anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
683
 
 
687
 
688
  # Export file
689
  # Rename anonymisation strategy for file path naming
690
+ if anon_strategy == "replace with 'REDACTED'": anon_strat_txt = "redact_replace"
691
+ elif anon_strategy == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
692
+ elif anon_strategy == "redact completely": anon_strat_txt = "redact_remove"
693
+ else: anon_strat_txt = anon_strategy
694
 
695
  # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
696
  if file_type == 'xlsx':
 
730
  if anon_file=='open_text':
731
  out_message = ["'" + anon_df_out['text'][0] + "'"]
732
 
733
+ return out_file_paths, out_message, key_string, log_files_output_paths, comprehend_query_number
734
 
735
  def anonymise_script(df:pd.DataFrame,
736
+ anon_strategy:str,
737
  language:str,
738
  chosen_redact_entities:List[str],
739
  in_allow_list:List[str]=list(),
 
752
 
753
  Args:
754
  df (pd.DataFrame): The input DataFrame containing text to be anonymised.
755
+ anon_strategy (str): The anonymisation strategy to apply (e.g., "replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely").
756
  language (str): The language of the text for analysis (e.g., "en", "es").
757
  chosen_redact_entities (List[str]): A list of entity types to redact using the local (Presidio) method.
758
  in_allow_list (List[str], optional): A list of terms to explicitly allow and not redact. Defaults to an empty list.
 
962
  people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
963
  fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
964
 
965
+ if anon_strategy == "replace with 'REDACTED'": chosen_mask_config = simple_replace_config
966
+ elif anon_strategy == "replace_redacted": chosen_mask_config = simple_replace_config
967
+ elif anon_strategy == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
968
+ elif anon_strategy == "entity_type": chosen_mask_config = replace_config
969
+ elif anon_strategy == "redact completely": chosen_mask_config = redact_config
970
+ elif anon_strategy == "redact": chosen_mask_config = redact_config
971
+ elif anon_strategy == "hash": chosen_mask_config = hash_config
972
+ elif anon_strategy == "mask": chosen_mask_config = mask_config
973
+ elif anon_strategy == "encrypt":
974
  chosen_mask_config = people_encrypt_config
975
  key = secrets.token_bytes(16) # 128 bits = 16 bytes
976
  key_string = base64.b64encode(key).decode('utf-8')
 
979
  for entity, operator in chosen_mask_config.items():
980
  if operator.operator_name == "encrypt":
981
  operator.params = {"key": key_string}
982
+ elif anon_strategy == "fake_first_name": chosen_mask_config = fake_first_name_config
983
+ else:
984
+ print("Anonymisation strategy not found. Redacting completely by default.")
985
+ chosen_mask_config = redact_config # Redact completely by default
986
 
987
  # I think in general people will want to keep date / times - removed Mar 2025 as I don't want to assume for people.
988
  #keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
 
993
 
994
  scrubbed_df = pd.DataFrame(anonymizer_results)
995
 
996
+ return scrubbed_df, key_string, decision_process_output_str, comprehend_query_number
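
The strategy names handled in anonymise_script above correspond to Presidio operator configurations. A minimal sketch of the equivalent mappings is shown below; the sample text, the default AnalyzerEngine setup (which assumes a spaCy model is installed) and the entity choices are illustrative assumptions, not code from this commit.

    import base64
    import secrets

    from presidio_analyzer import AnalyzerEngine
    from presidio_anonymizer import AnonymizerEngine
    from presidio_anonymizer.entities import OperatorConfig

    text = "Please contact Jane Smith about the contract."  # made-up sample text
    analyzer_results = AnalyzerEngine().analyze(text=text, language="en")

    # Key material for the AES (CBC) encrypt operator, generated the same way as above.
    key_string = base64.b64encode(secrets.token_bytes(16)).decode("utf-8")

    operators_by_strategy = {
        "replace_redacted": {"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})},
        "entity_type": {"DEFAULT": OperatorConfig("replace")},  # inserts <ENTITY_TYPE>
        "redact": {"DEFAULT": OperatorConfig("redact")},        # removes the matched text entirely
        "hash": {"DEFAULT": OperatorConfig("hash")},
        "mask": {"DEFAULT": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 100, "from_end": False})},
        "encrypt": {"PERSON": OperatorConfig("encrypt", {"key": key_string})},
    }

    anonymizer = AnonymizerEngine()
    for strategy, operators in operators_by_strategy.items():
        result = anonymizer.anonymize(text=text, analyzer_results=analyzer_results, operators=operators)
        print(f"{strategy}: {result.text}")

The unknown-strategy fallback added above behaves like the "redact" entry in this sketch: the matched text is removed completely.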
tools/example_cli_calls.txt DELETED
@@ -1,30 +0,0 @@
1
- python cli_redact.py --help
2
-
3
- python cli_redact.py \
4
- --input_file "documents/confidential-report.pdf" \
5
- --output_dir "output/redacted_reports/" \
6
- --ocr_method "Local OCR model - PDFs without selectable text" \
7
- --pii_detector "Local" \
8
- --page_min 2 \
9
- --page_max 10 \
10
- --allow_list "config/project_allowlist.csv"
11
-
12
- python your_cli_script.py \
13
- --input_file "data/customer_data.xlsx" \
14
- --output_dir "output/anonymised_data/" \
15
- --anon_strat "redact" \
16
- --columns "Customer Name" "Email" \
17
- --excel_sheets "Q3-Data"
18
-
19
- python your_cli_script.py \
20
- --input_file "legal_docs/legal_agreement.docx" \
21
- --output_dir "output/anonymised_docs/" \
22
- --anon_strat "encrypt" \
23
- --deny_list "config/codenames.csv" \
24
- --language "en"
25
-
26
- python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --min_word_count 5
27
-
28
- python cli_redact.py --task deduplicate --input_file data.csv --duplicate_type tabular --text_columns "Name" "Email" "Description"
29
-
30
- python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --search_query "confidential information"
tools/file_redaction.py CHANGED
@@ -20,11 +20,11 @@ import gradio as gr
20
  from gradio import Progress
21
  from collections import defaultdict # For efficient grouping
22
 
23
- from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices
24
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
25
  from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
26
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
27
- from tools.helper_functions import get_file_name_without_type, clean_unicode_text
28
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
29
 
30
  ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
@@ -34,6 +34,10 @@ image_dpi = float(IMAGES_DPI)
34
 
35
  RETURN_PDF_END_OF_REDACTION = RETURN_PDF_END_OF_REDACTION.lower() == "true"
36
 
 
 
 
 
37
  def bounding_boxes_overlap(box1, box2):
38
  """Check if two bounding boxes overlap."""
39
  return (box1[0] < box2[2] and box2[0] < box1[2] and
@@ -91,7 +95,7 @@ def choose_and_run_redactor(file_paths:List[str],
91
  chosen_redact_entities:List[str],
92
  chosen_redact_comprehend_entities:List[str],
93
  text_extraction_method:str,
94
- in_allow_list:List[List[str]]=list(),
95
  in_deny_list:List[str]=list(),
96
  redact_whole_page_list:List[str]=list(),
97
  latest_file_completed:int=0,
@@ -146,9 +150,9 @@ def choose_and_run_redactor(file_paths:List[str],
146
  - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
147
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
148
  - text_extraction_method (str): The method to use to extract text from documents.
149
- - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
150
- - in_deny_list (List[List[str]], optional): A list of denied terms for redaction. Defaults to None.
151
- - redact_whole_page_list (List[List[str]], optional): A list of whole page numbers for redaction. Defaults to None.
152
  - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
153
  - combined_out_message (list, optional): A list to store output messages. Defaults to an empty list.
154
  - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
@@ -202,8 +206,19 @@ def choose_and_run_redactor(file_paths:List[str],
202
  pdf_file_name_without_ext = ""
203
  page_break_return = False
204
  blank_request_metadata = list()
 
205
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
206
  review_out_file_paths = [prepared_pdf_file_paths[0]]
 
 
 
 
 
 
 
 
 
 
207
 
208
  # Use provided language or default
209
  language = language or DEFAULT_LANGUAGE
@@ -284,6 +299,8 @@ def choose_and_run_redactor(file_paths:List[str],
284
  progress(0.95, "Completed last file, performing final checks")
285
  current_loop_page = 0
286
 
 
 
287
  if isinstance(out_message, list) and out_message:
288
  combined_out_message = combined_out_message + '\n'.join(out_message)
289
  elif out_message:
@@ -312,7 +329,7 @@ def choose_and_run_redactor(file_paths:List[str],
312
 
313
  page_break_return = True
314
 
315
- return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state
316
 
317
  #if first_loop_state == False:
318
  # Prepare documents and images as required if they don't already exist
@@ -346,10 +363,11 @@ def choose_and_run_redactor(file_paths:List[str],
346
  file_paths_loop, text_extraction_method, all_page_line_level_ocr_results_df, all_page_line_level_ocr_results_with_words_df, 0, out_message, True,
347
  annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
348
  output_folder=output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, pymupdf_doc=pymupdf_doc, input_folder=input_folder
349
- )
350
-
351
- page_sizes_df = pd.DataFrame(page_sizes)
352
 
 
 
353
  if page_sizes_df.empty:
354
  page_sizes_df=pd.DataFrame(columns=["page", "image_path", "image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height", "original_cropbox"])
355
  page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
@@ -378,32 +396,45 @@ def choose_and_run_redactor(file_paths:List[str],
378
 
379
  page_break_return = False
380
 
381
- return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state
382
 
383
- # Load/create allow list
 
 
384
  # If string, assume file path
385
- if isinstance(in_allow_list, str): in_allow_list = pd.read_csv(in_allow_list)
 
 
386
  # Now, should be a pandas dataframe format
387
- if not in_allow_list.empty:
388
- in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
 
 
 
389
  else:
390
  in_allow_list_flat = list()
391
 
 
392
  # If string, assume file path
393
  if isinstance(in_deny_list, str):
394
- in_deny_list = pd.read_csv(in_deny_list)
 
 
395
  if isinstance(in_deny_list, pd.DataFrame):
396
  if not in_deny_list.empty:
397
  custom_recogniser_word_list_flat = in_deny_list.iloc[:, 0].tolist()
398
  else:
399
  custom_recogniser_word_list_flat = list()
400
-
401
  # Sort the strings in order from the longest string to the shortest
402
  custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
 
 
403
 
 
404
  # If string, assume file path
405
  if isinstance(redact_whole_page_list, str):
406
- redact_whole_page_list = pd.read_csv(redact_whole_page_list)
 
407
  if isinstance(redact_whole_page_list, pd.DataFrame):
408
  if not redact_whole_page_list.empty:
409
  try:
@@ -412,13 +443,18 @@ def choose_and_run_redactor(file_paths:List[str],
412
  print("Could not convert whole page redaction data to number list due to:", e)
413
  redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
414
  else:
415
- redact_whole_page_list_flat = list()
 
 
416
 
417
-
418
 
419
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
420
  if pii_identification_method == AWS_PII_OPTION:
421
- if aws_access_key_textbox and aws_secret_key_textbox:
 
 
 
422
  print("Connecting to Comprehend using AWS access key and secret keys from user input.")
423
  comprehend_client = boto3.client('comprehend',
424
  aws_access_key_id=aws_access_key_textbox,
@@ -441,7 +477,10 @@ def choose_and_run_redactor(file_paths:List[str],
441
 
442
  # Try to connect to AWS Textract Client if using that text extraction method
443
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
444
- if aws_access_key_textbox and aws_secret_key_textbox:
 
 
 
445
  print("Connecting to Textract using AWS access key and secret keys from user input.")
446
  textract_client = boto3.client('textract',
447
  aws_access_key_id=aws_access_key_textbox,
@@ -647,7 +686,10 @@ def choose_and_run_redactor(file_paths:List[str],
647
  print("Saving redacted PDF file:", out_redacted_pdf_file_path)
648
  save_pdf_with_or_without_compression(pymupdf_doc, out_redacted_pdf_file_path)
649
 
650
- out_file_paths.append(out_redacted_pdf_file_path)
 
 
 
651
 
652
  if not all_page_line_level_ocr_results_df.empty:
653
  all_page_line_level_ocr_results_df = all_page_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height", "line"]]
@@ -658,8 +700,12 @@ def choose_and_run_redactor(file_paths:List[str],
658
  all_page_line_level_ocr_results_df.sort_values(["page", "line"], inplace=True)
659
  all_page_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8-sig")
660
 
661
- out_file_paths.append(ocr_file_path)
662
- duplication_file_path_outputs.append(ocr_file_path)
 
 
 
 
663
 
664
  if all_page_line_level_ocr_results_with_words:
665
  all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
@@ -670,14 +716,12 @@ def choose_and_run_redactor(file_paths:List[str],
670
  all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
671
 
672
  all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
673
- # all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
 
674
 
675
  if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
676
  # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
677
  if not all_page_line_level_ocr_results_with_words_df.empty:
678
-
679
- # all_page_line_level_ocr_results_with_words_df['line_y0'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'line_y0')
680
- # all_page_line_level_ocr_results_with_words_df['line_y1'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'line_y1')
681
  all_page_line_level_ocr_results_with_words_df['word_y0'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'word_y0')
682
  all_page_line_level_ocr_results_with_words_df['word_y1'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'word_y1')
683
 
@@ -685,20 +729,32 @@ def choose_and_run_redactor(file_paths:List[str],
685
  all_page_line_level_ocr_results_with_words_df['line_x0'] = ""
686
  all_page_line_level_ocr_results_with_words_df['line_x1'] = ""
687
  all_page_line_level_ocr_results_with_words_df['line_y0'] = ""
688
- all_page_line_level_ocr_results_with_words_df['line_y1'] = ""
689
 
690
  all_page_line_level_ocr_results_with_words_df.sort_values(["page", "line", "word_x0"], inplace=True)
691
  all_page_line_level_ocr_results_with_words_df_file_path = all_page_line_level_ocr_results_with_words_json_file_path.replace(".json", ".csv")
692
  all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path, index = None, encoding="utf-8-sig")
693
 
694
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
 
 
 
 
695
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
696
 
697
  if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
698
- log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
 
 
 
699
 
700
  if all_page_line_level_ocr_results_with_words_df_file_path not in out_file_paths:
701
- out_file_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
 
 
 
 
 
702
 
703
  # Convert the gradio annotation boxes to relative coordinates
704
  progress(0.93, "Creating review file output")
@@ -711,18 +767,27 @@ def choose_and_run_redactor(file_paths:List[str],
711
  # Save the gradio_annotation_boxes to a review csv file
712
  review_file_state = convert_annotation_json_to_review_df(annotations_all_pages_divide, all_pages_decision_process_table, page_sizes=page_sizes)
713
 
 
714
  # Don't need page sizes in outputs
715
  review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
716
 
717
- review_file_state.to_csv(review_file_path, index=None, encoding="utf-8-sig")
 
 
 
718
 
719
- if pii_identification_method != NO_REDACTION_PII_OPTION: out_file_paths.append(review_file_path)
 
 
 
 
 
 
 
 
720
 
721
- # Make a combined message for the file
722
- if isinstance(out_message, list) and out_message:
723
- combined_out_message = combined_out_message + '\n'.join(out_message) # Ensure out_message is a list of strings
724
- elif out_message:
725
- combined_out_message = combined_out_message + '\n' + out_message
726
 
727
  toc = time.perf_counter()
728
  time_taken = toc - tic
@@ -747,7 +812,11 @@ def choose_and_run_redactor(file_paths:List[str],
747
  with open(all_textract_request_metadata_file_path, "w") as f: f.write(all_request_metadata_str)
748
 
749
  # Add the request metadata to the log outputs if not there already
750
- if all_textract_request_metadata_file_path not in log_files_output_paths: log_files_output_paths.append(all_textract_request_metadata_file_path)
 
 
 
 
751
 
752
  new_textract_query_numbers = len(all_textract_request_metadata)
753
  total_textract_query_number += new_textract_query_numbers
@@ -764,7 +833,7 @@ def choose_and_run_redactor(file_paths:List[str],
764
 
765
  page_break_return = True
766
 
767
- return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state
768
 
769
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
770
  '''
@@ -1266,10 +1335,13 @@ def merge_img_bboxes(bboxes: list, combined_results: Dict, page_signature_recogn
1266
 
1267
  # Process signature and handwriting results
1268
  if page_signature_recogniser_results or page_handwriting_recogniser_results:
 
1269
  if "Extract handwriting" in handwrite_signature_checkbox:
 
1270
  merged_bboxes.extend(copy.deepcopy(page_handwriting_recogniser_results))
1271
 
1272
  if "Extract signatures" in handwrite_signature_checkbox:
 
1273
  merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
1274
 
1275
  # Reconstruct bounding boxes for substrings of interest
@@ -2385,7 +2457,10 @@ def redact_text_pdf(
2385
  all_page_line_text_extraction_characters.extend(line_characters)
2386
  all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
2387
 
2388
- page_text_ocr_outputs = pd.concat(page_text_ocr_outputs_list)
 
 
 
2389
 
2390
  ### REDACTION
2391
  if pii_identification_method != NO_REDACTION_PII_OPTION:
 
20
  from gradio import Progress
21
  from collections import defaultdict # For efficient grouping
22
 
23
+ from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices, CUSTOM_ENTITIES, PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS
24
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
25
  from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
26
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
27
+ from tools.helper_functions import get_file_name_without_type, clean_unicode_text, _get_env_list
28
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
29
 
30
  ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
 
34
 
35
  RETURN_PDF_END_OF_REDACTION = RETURN_PDF_END_OF_REDACTION.lower() == "true"
36
 
37
+ if CUSTOM_ENTITIES: CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
38
+
39
+ custom_entities = CUSTOM_ENTITIES
40
+
41
  def bounding_boxes_overlap(box1, box2):
42
  """Check if two bounding boxes overlap."""
43
  return (box1[0] < box2[2] and box2[0] < box1[2] and
 
95
  chosen_redact_entities:List[str],
96
  chosen_redact_comprehend_entities:List[str],
97
  text_extraction_method:str,
98
+ in_allow_list:List[str]=list(),
99
  in_deny_list:List[str]=list(),
100
  redact_whole_page_list:List[str]=list(),
101
  latest_file_completed:int=0,
 
150
  - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
151
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
152
  - text_extraction_method (str): The method to use to extract text from documents.
153
+ - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to an empty list. Can also be provided as a string path to a CSV file, or as a single-column pandas DataFrame.
154
+ - in_deny_list (List[List[str]], optional): A list of denied terms for redaction. Defaults to an empty list. Can also be provided as a string path to a CSV file, or as a single-column pandas DataFrame.
155
+ - redact_whole_page_list (List[List[str]], optional): A list of whole page numbers for redaction. Defaults to an empty list. Can also be provided as a string path to a CSV file, or as a single-column pandas DataFrame.
156
  - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
157
  - combined_out_message (list, optional): A list to store output messages. Defaults to an empty list.
158
  - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
 
206
  pdf_file_name_without_ext = ""
207
  page_break_return = False
208
  blank_request_metadata = list()
209
+ custom_recogniser_word_list_flat = list()
210
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
211
  review_out_file_paths = [prepared_pdf_file_paths[0]]
212
+ task_textbox = "redact"
213
+
214
+ # CLI mode may provide options to enter method names in a different format
215
+ if text_extraction_method == "AWS Textract": text_extraction_method = TEXTRACT_TEXT_EXTRACT_OPTION
216
+ if text_extraction_method == "Local OCR": text_extraction_method = TESSERACT_TEXT_EXTRACT_OPTION
217
+ if text_extraction_method == "Local text": text_extraction_method = SELECTABLE_TEXT_EXTRACT_OPTION
218
+ if pii_identification_method == "None": pii_identification_method = NO_REDACTION_PII_OPTION
219
+
220
+ # If output folder doesn't end with a forward slash, add one
221
+ if not output_folder.endswith('/'): output_folder = output_folder + '/'
222
 
223
  # Use provided language or default
224
  language = language or DEFAULT_LANGUAGE
 
299
  progress(0.95, "Completed last file, performing final checks")
300
  current_loop_page = 0
301
 
302
+ if isinstance(combined_out_message, list): combined_out_message = '\n'.join(combined_out_message)
303
+
304
  if isinstance(out_message, list) and out_message:
305
  combined_out_message = combined_out_message + '\n'.join(out_message)
306
  elif out_message:
 
329
 
330
  page_break_return = True
331
 
332
+ return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state, task_textbox
333
 
334
  #if first_loop_state == False:
335
  # Prepare documents and images as required if they don't already exist
 
363
  file_paths_loop, text_extraction_method, all_page_line_level_ocr_results_df, all_page_line_level_ocr_results_with_words_df, 0, out_message, True,
364
  annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
365
  output_folder=output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, pymupdf_doc=pymupdf_doc, input_folder=input_folder
366
+ )
367
+
 
368
 
369
+ page_sizes_df = pd.DataFrame(page_sizes)
370
+
371
  if page_sizes_df.empty:
372
  page_sizes_df=pd.DataFrame(columns=["page", "image_path", "image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height", "original_cropbox"])
373
  page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
 
396
 
397
  page_break_return = False
398
 
399
+ return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state, task_textbox
400
 
401
+ ### Load/create allow list, deny list, and whole page redaction list
402
+
403
+ ### Load/create allow list
404
  # If string, assume file path
405
+ if isinstance(in_allow_list, str):
406
+ if in_allow_list:
407
+ in_allow_list = pd.read_csv(in_allow_list, header=None)
408
  # Now, should be a pandas dataframe format
409
+ if isinstance(in_allow_list, pd.DataFrame):
410
+ if not in_allow_list.empty:
411
+ in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
412
+ else:
413
+ in_allow_list_flat = list()
414
  else:
415
  in_allow_list_flat = list()
416
 
417
+ ### Load/create deny list
418
  # If string, assume file path
419
  if isinstance(in_deny_list, str):
420
+ if in_deny_list:
421
+ in_deny_list = pd.read_csv(in_deny_list, header=None)
422
+
423
  if isinstance(in_deny_list, pd.DataFrame):
424
  if not in_deny_list.empty:
425
  custom_recogniser_word_list_flat = in_deny_list.iloc[:, 0].tolist()
426
  else:
427
  custom_recogniser_word_list_flat = list()
 
428
  # Sort the strings in order from the longest string to the shortest
429
  custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
430
+ else:
431
+ custom_recogniser_word_list_flat = list()
432
 
433
+ ### Load/create whole page redaction list
434
  # If string, assume file path
435
  if isinstance(redact_whole_page_list, str):
436
+ if redact_whole_page_list:
437
+ redact_whole_page_list = pd.read_csv(redact_whole_page_list, header=None)
438
  if isinstance(redact_whole_page_list, pd.DataFrame):
439
  if not redact_whole_page_list.empty:
440
  try:
 
443
  print("Could not convert whole page redaction data to number list due to:", e)
444
  redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
445
  else:
446
+ redact_whole_page_list_flat = list()
447
+ else:
448
+ redact_whole_page_list_flat = list()
449
 
450
+ ### Load/create PII identification method
451
 
452
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
453
  if pii_identification_method == AWS_PII_OPTION:
454
+ if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
455
+ print("Connecting to Comprehend via existing SSO connection")
456
+ comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)
457
+ elif aws_access_key_textbox and aws_secret_key_textbox:
458
  print("Connecting to Comprehend using AWS access key and secret keys from user input.")
459
  comprehend_client = boto3.client('comprehend',
460
  aws_access_key_id=aws_access_key_textbox,
 
477
 
478
  # Try to connect to AWS Textract Client if using that text extraction method
479
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
480
+ if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
481
+ print("Connecting to Textract via existing SSO connection")
482
+ textract_client = boto3.client('textract', region_name=AWS_REGION)
483
+ elif aws_access_key_textbox and aws_secret_key_textbox:
484
  print("Connecting to Textract using AWS access key and secret keys from user input.")
485
  textract_client = boto3.client('textract',
486
  aws_access_key_id=aws_access_key_textbox,
 
686
  print("Saving redacted PDF file:", out_redacted_pdf_file_path)
687
  save_pdf_with_or_without_compression(pymupdf_doc, out_redacted_pdf_file_path)
688
 
689
+ if isinstance(out_redacted_pdf_file_path, str):
690
+ out_file_paths.append(out_redacted_pdf_file_path)
691
+ else:
692
+ out_file_paths.append(out_redacted_pdf_file_path[0])
693
 
694
  if not all_page_line_level_ocr_results_df.empty:
695
  all_page_line_level_ocr_results_df = all_page_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height", "line"]]
 
700
  all_page_line_level_ocr_results_df.sort_values(["page", "line"], inplace=True)
701
  all_page_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8-sig")
702
 
703
+
704
+
705
+ if isinstance(ocr_file_path, str):
706
+ out_file_paths.append(ocr_file_path)
707
+ else:
708
+ duplication_file_path_outputs.append(ocr_file_path[0])
709
 
710
  if all_page_line_level_ocr_results_with_words:
711
  all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
 
716
  all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
717
 
718
  all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
719
+
720
+
721
 
722
  if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
723
  # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
724
  if not all_page_line_level_ocr_results_with_words_df.empty:
 
 
 
725
  all_page_line_level_ocr_results_with_words_df['word_y0'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'word_y0')
726
  all_page_line_level_ocr_results_with_words_df['word_y1'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'word_y1')
727
 
 
729
  all_page_line_level_ocr_results_with_words_df['line_x0'] = ""
730
  all_page_line_level_ocr_results_with_words_df['line_x1'] = ""
731
  all_page_line_level_ocr_results_with_words_df['line_y0'] = ""
732
+ all_page_line_level_ocr_results_with_words_df['line_y1'] = ""
733
 
734
  all_page_line_level_ocr_results_with_words_df.sort_values(["page", "line", "word_x0"], inplace=True)
735
  all_page_line_level_ocr_results_with_words_df_file_path = all_page_line_level_ocr_results_with_words_json_file_path.replace(".json", ".csv")
736
  all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path, index = None, encoding="utf-8-sig")
737
 
738
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
739
+ if isinstance(all_page_line_level_ocr_results_with_words_json_file_path, str):
740
+ log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
741
+ else:
742
+ log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path[0])
744
 
745
  if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
746
+ if isinstance(all_page_line_level_ocr_results_with_words_df_file_path, str):
747
+ log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
748
+ else:
749
+ log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path[0])
750
 
751
  if all_page_line_level_ocr_results_with_words_df_file_path not in out_file_paths:
752
+ if isinstance(all_page_line_level_ocr_results_with_words_df_file_path, str):
753
+ out_file_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
754
+ else:
755
+ out_file_paths.append(all_page_line_level_ocr_results_with_words_df_file_path[0])
756
+
757
+
758
 
759
  # Convert the gradio annotation boxes to relative coordinates
760
  progress(0.93, "Creating review file output")
 
767
  # Save the gradio_annotation_boxes to a review csv file
768
  review_file_state = convert_annotation_json_to_review_df(annotations_all_pages_divide, all_pages_decision_process_table, page_sizes=page_sizes)
769
 
770
+
771
  # Don't need page sizes in outputs
772
  review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
773
 
774
+ if isinstance(review_file_path, str):
775
+ review_file_state.to_csv(review_file_path, index=None, encoding="utf-8-sig")
776
+ else:
777
+ review_file_state.to_csv(review_file_path[0], index=None, encoding="utf-8-sig")
778
 
779
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
780
+ if isinstance(review_file_path, str):
781
+ out_file_paths.append(review_file_path)
782
+ else:
783
+ out_file_paths.append(review_file_path[0])
784
+
785
+ # Make a combined message for the file
786
+ if isinstance(combined_out_message, list): combined_out_message = '\n'.join(combined_out_message)
787
+ elif combined_out_message == None: combined_out_message = ""
788
 
789
+ if isinstance(out_message, list) and out_message: combined_out_message = combined_out_message + '\n'.join(out_message)
790
+ elif isinstance(out_message, str) and out_message: combined_out_message = combined_out_message + '\n' + out_message
 
 
 
791
 
792
  toc = time.perf_counter()
793
  time_taken = toc - tic
 
812
  with open(all_textract_request_metadata_file_path, "w") as f: f.write(all_request_metadata_str)
813
 
814
  # Add the request metadata to the log outputs if not there already
815
+ if all_textract_request_metadata_file_path not in log_files_output_paths:
816
+ if isinstance(all_textract_request_metadata_file_path, str):
817
+ log_files_output_paths.append(all_textract_request_metadata_file_path)
818
+ else:
819
+ log_files_output_paths.append(all_textract_request_metadata_file_path[0])
820
 
821
  new_textract_query_numbers = len(all_textract_request_metadata)
822
  total_textract_query_number += new_textract_query_numbers
 
833
 
834
  page_break_return = True
835
 
836
+ return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state, task_textbox
837
 
838
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
839
  '''
 
1335
 
1336
  # Process signature and handwriting results
1337
  if page_signature_recogniser_results or page_handwriting_recogniser_results:
1338
+
1339
  if "Extract handwriting" in handwrite_signature_checkbox:
1340
+ print("Extracting handwriting in merge_img_bboxes function")
1341
  merged_bboxes.extend(copy.deepcopy(page_handwriting_recogniser_results))
1342
 
1343
  if "Extract signatures" in handwrite_signature_checkbox:
1344
+ print("Extracting signatures in merge_img_bboxes function")
1345
  merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
1346
 
1347
  # Reconstruct bounding boxes for substrings of interest
 
2457
  all_page_line_text_extraction_characters.extend(line_characters)
2458
  all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
2459
 
2460
+ if page_text_ocr_outputs_list:
2461
+ page_text_ocr_outputs = pd.concat(page_text_ocr_outputs_list)
2462
+ else:
2463
+ page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])
2464
 
2465
  ### REDACTION
2466
  if pii_identification_method != NO_REDACTION_PII_OPTION:
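
The allow list, deny list and whole-page list handled above can each be passed as a file path; they are read with pd.read_csv(path, header=None), flattened from the first column, and deny-list terms are sorted longest-first. A small sketch of preparing and reading such files follows; the file names and terms are made up for illustration.

    import pandas as pd

    # A deny list is a headerless, single-column CSV: one term per row.
    pd.DataFrame(["Jane Smith", "Project Falcon"]).to_csv("deny_list.csv", index=False, header=False)

    # A whole-page redaction list holds one page number per row.
    pd.DataFrame([2, 5]).to_csv("whole_page_list.csv", index=False, header=False)

    # Mirrors the flattening done in choose_and_run_redactor above.
    deny_terms = pd.read_csv("deny_list.csv", header=None).iloc[:, 0].tolist()
    deny_terms = sorted(deny_terms, key=len, reverse=True)  # longest terms matched first
    pages_to_redact = pd.read_csv("whole_page_list.csv", header=None).iloc[:, 0].tolist()
    print(deny_terms, pages_to_redact)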
tools/find_duplicate_pages.py CHANGED
@@ -1,7 +1,7 @@
1
  import pandas as pd
2
  import os
3
  import re
4
-
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  from typing import List, Tuple, Optional, Dict, Union
@@ -725,7 +725,7 @@ def identify_similar_text_sequences(
725
  do_text_clean:bool = True,
726
  file1_name: str = '',
727
  file2_name: str = '',
728
- output_folder: str = "output/",
729
  progress=Progress(track_tqdm=True)
730
  ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
731
  """
@@ -870,7 +870,7 @@ def handle_selection_and_preview(evt: gr.SelectData, results_df:pd.DataFrame, fu
870
  # 3. Return all three outputs in the correct order
871
  return selected_index, page1_data, page2_data
872
 
873
- def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder="./output/"):
874
  """
875
  Removes a selected row from the results DataFrame, regenerates output files,
876
  and clears the text preview panes.
@@ -895,12 +895,16 @@ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder="./
895
  # Return the updated dataframe, the new file list, and clear the preview panes
896
  return updated_df, new_output_paths, None, None
897
 
898
- def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
899
  """
900
  Wrapper function updated to include the 'greedy_match' boolean.
901
  """
902
  if not files:
903
  raise Warning("Please upload files to analyse.")
 
 
 
 
904
 
905
  progress(0, desc="Combining input files...")
906
  df_combined, _, full_out_ocr_df = combine_ocr_output_text(files, combine_pages=combine_pages)
@@ -916,6 +920,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
916
  min_consecutive_pages=int(min_consecutive),
917
  greedy_match=greedy_match,
918
  combine_pages=combine_pages,
 
919
  progress=progress
920
  )
921
 
@@ -929,8 +934,11 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
929
 
930
  if results_df.empty:
931
  gr.Info(f"No duplicate pages found, no results returned.")
 
 
 
932
 
933
- return results_df, output_paths, full_data_by_file
934
 
935
  def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
936
  """
 
1
  import pandas as pd
2
  import os
3
  import re
4
+ import time
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  from typing import List, Tuple, Optional, Dict, Union
 
725
  do_text_clean:bool = True,
726
  file1_name: str = '',
727
  file2_name: str = '',
728
+ output_folder: str = OUTPUT_FOLDER,
729
  progress=Progress(track_tqdm=True)
730
  ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
731
  """
 
870
  # 3. Return all three outputs in the correct order
871
  return selected_index, page1_data, page2_data
872
 
873
+ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder=OUTPUT_FOLDER):
874
  """
875
  Removes a selected row from the results DataFrame, regenerates output files,
876
  and clears the text preview panes.
 
895
  # Return the updated dataframe, the new file list, and clear the preview panes
896
  return updated_df, new_output_paths, None, None
897
 
898
+ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, output_folder:str=OUTPUT_FOLDER, progress=gr.Progress(track_tqdm=True)):
899
  """
900
  Wrapper function updated to include the 'greedy_match' boolean.
901
  """
902
  if not files:
903
  raise Warning("Please upload files to analyse.")
904
+
905
+ start_time = time.time()
906
+
907
+ task_textbox = "deduplicate"
908
 
909
  progress(0, desc="Combining input files...")
910
  df_combined, _, full_out_ocr_df = combine_ocr_output_text(files, combine_pages=combine_pages)
 
920
  min_consecutive_pages=int(min_consecutive),
921
  greedy_match=greedy_match,
922
  combine_pages=combine_pages,
923
+ output_folder=output_folder,
924
  progress=progress
925
  )
926
 
 
934
 
935
  if results_df.empty:
936
  gr.Info(f"No duplicate pages found, no results returned.")
937
+
938
+ end_time = time.time()
939
+ processing_time = round(end_time - start_time, 2)
940
 
941
+ return results_df, output_paths, full_data_by_file, processing_time, task_textbox
942
 
943
  def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
944
  """
tools/find_duplicate_tabular.py CHANGED
@@ -1,6 +1,7 @@
1
  import pandas as pd
2
  import os
3
  import re
 
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from typing import List, Tuple, Dict
@@ -10,9 +11,10 @@ from pathlib import Path
10
  from tools.helper_functions import OUTPUT_FOLDER, read_file
11
  from tools.data_anonymise import initial_clean
12
  from tools.load_spacy_model_custom_recognisers import nlp
13
- from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN
14
 
15
- similarity_threshold = 0.95
 
16
 
17
  def clean_and_stem_text_series(df: pd.DataFrame, column: str, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN):
18
  """
@@ -51,13 +53,17 @@ def convert_tabular_data_to_analysis_format(
51
  Returns:
52
  List[Tuple[str, pd.DataFrame]]: List containing (file_name, processed_df) tuple
53
  """
54
- if text_columns is None:
55
- # Auto-detect text columns (string type columns)
56
- text_columns = df.select_dtypes(include=['object', 'string']).columns.tolist()
 
 
 
 
57
 
58
  if not text_columns:
59
  print(f"No text columns found in {file_name}")
60
- return []
61
 
62
  # Create a copy to avoid modifying original
63
  df_copy = df.copy()
@@ -69,9 +75,9 @@ def convert_tabular_data_to_analysis_format(
69
  df_copy['row_id'] = df_copy.index
70
 
71
  # Create the format expected by the duplicate detection system
72
- # Using 'page' as row number and 'text' as the combined text
73
  processed_df = pd.DataFrame({
74
- 'page': df_copy['row_id'],
75
  'text': df_copy['combined_text'],
76
  'file': file_name
77
  })
@@ -86,13 +92,15 @@ def find_duplicate_cells_in_tabular_data(
86
  input_files: List[str],
87
  similarity_threshold: float = 0.95,
88
  min_word_count: int = 3,
89
- text_columns: List[str] = None,
90
  output_folder: str = OUTPUT_FOLDER,
91
  do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
 
 
92
  progress: Progress = Progress(track_tqdm=True)
93
  ) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
94
  """
95
- Find duplicate cells/text in tabular data files (CSV, XLSX).
96
 
97
  Args:
98
  input_files (List[str]): List of file paths to analyze
@@ -115,26 +123,49 @@ def find_duplicate_cells_in_tabular_data(
115
 
116
  progress(0.1, desc="Loading and processing files...")
117
 
118
- all_data_to_process = []
119
- full_data_by_file = {}
120
- file_paths = []
121
 
122
  # Process each file
123
  for file_path in input_files:
124
  try:
125
- df = read_file(file_path)
126
-
127
- file_name = os.path.basename(file_path)
128
- file_paths.append(file_path)
129
-
130
- # Convert to analysis format
131
- processed_data = convert_tabular_data_to_analysis_format(
132
- df, file_name, text_columns
133
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- if processed_data:
136
- all_data_to_process.extend(processed_data)
137
- full_data_by_file[file_name] = processed_data[0][1]
 
 
 
 
 
 
 
 
138
 
139
  except Exception as e:
140
  print(f"Error processing {file_path}: {e}")
@@ -147,6 +178,8 @@ def find_duplicate_cells_in_tabular_data(
147
 
148
  # Combine all data
149
  combined_df = pd.concat([data[1] for data in all_data_to_process], ignore_index=True)
 
 
150
 
151
  progress(0.3, desc="Cleaning and preparing text...")
152
 
@@ -188,9 +221,9 @@ def find_duplicate_cells_in_tabular_data(
188
 
189
  results_data.append({
190
  'File1': row1_data['file'],
191
- 'Row1': int(row1_data['page']),
192
  'File2': row2_data['file'],
193
- 'Row2': int(row2_data['page']),
194
  'Similarity_Score': round(similarity, 3),
195
  'Text1': row1_data['text'][:200] + '...' if len(row1_data['text']) > 200 else row1_data['text'],
196
  'Text2': row2_data['text'][:200] + '...' if len(row2_data['text']) > 200 else row2_data['text'],
@@ -204,13 +237,13 @@ def find_duplicate_cells_in_tabular_data(
204
  progress(0.9, desc="Saving results...")
205
 
206
  # Save results
207
- output_paths = save_tabular_duplicate_results(results_df, output_folder, file_paths, file_replaced_index=0)
208
 
209
  gr.Info(f"Found {len(results_df)} duplicate cell matches")
210
 
211
  return results_df, output_paths, full_data_by_file
212
 
213
- def save_tabular_duplicate_results(results_df: pd.DataFrame, output_folder: str, file_paths: List[str], file_replaced_index: int = 0) -> List[str]:
214
  """
215
  Save tabular duplicate detection results to files.
216
 
@@ -218,52 +251,163 @@ def save_tabular_duplicate_results(results_df: pd.DataFrame, output_folder: str,
218
  results_df (pd.DataFrame): Results DataFrame
219
  output_folder (str): Output folder path
220
  file_paths (List[str]): List of file paths
221
- file_replaced_index (int): Index of the file to replace with duplicate rows removed
222
- (0 is the first file in the list)
223
  Returns:
224
  List[str]: List of output file paths
225
  """
226
- output_paths = []
227
  output_folder_path = Path(output_folder)
228
  output_folder_path.mkdir(exist_ok=True)
229
 
230
  if results_df.empty:
231
  print("No duplicate matches to save.")
232
- return []
233
 
234
  # Save main results
235
  results_file = output_folder_path / 'tabular_duplicate_results.csv'
236
  results_df.to_csv(results_file, index=False, encoding="utf-8-sig")
237
  output_paths.append(str(results_file))
238
 
 
 
 
239
  # Save per-file duplicate lists
240
- for file_name, group in results_df.groupby('File1'):
241
- file_stem = Path(file_name).stem
242
- duplicate_rows_file = output_folder_path / f"{file_stem}_duplicate_rows.csv"
243
-
244
- # Get unique row numbers to remove
245
- rows_to_remove = sorted(group['Row1'].unique())
246
- duplicate_df = pd.DataFrame({'Row_to_Remove': rows_to_remove})
247
- duplicate_df.to_csv(duplicate_rows_file, index=False)
248
- output_paths.append(str(duplicate_rows_file))
249
-
250
- # Save also original file (first file in list) with duplicate rows removed
251
- file_path = file_paths[file_replaced_index]
252
- file_base_name = os.path.basename(file_path)
253
- df = read_file(file_path)
254
- df_cleaned = df.drop(index=rows_to_remove).reset_index(drop=True)
255
-
256
- output_path = os.path.join(output_folder, f"{file_base_name}_deduplicated.csv")
257
- df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")
258
-
259
- output_paths.append(str(output_path))
 
 
 
 
 
 
260
 
261
  return output_paths
262
 
263
  def remove_duplicate_rows_from_tabular_data(
264
  file_path: str,
265
  duplicate_rows: List[int],
266
- output_folder: str = OUTPUT_FOLDER
 
 
267
  ) -> str:
268
  """
269
  Remove duplicate rows from a tabular data file.
@@ -272,13 +416,14 @@ def remove_duplicate_rows_from_tabular_data(
272
  file_path (str): Path to the input file
273
  duplicate_rows (List[int]): List of row indices to remove
274
  output_folder (str): Output folder for cleaned file
275
-
 
276
  Returns:
277
  str: Path to the cleaned file
278
  """
279
  try:
280
  # Load the file
281
- df = read_file(file_path)
282
 
283
  # Remove duplicate rows (0-indexed)
284
  df_cleaned = df.drop(index=duplicate_rows).reset_index(drop=True)
@@ -286,12 +431,12 @@ def remove_duplicate_rows_from_tabular_data(
286
  # Save cleaned file
287
  file_name = os.path.basename(file_path)
288
  file_stem = os.path.splitext(file_name)[0]
289
- file_ext = os.path.splitext(file_name)[1]
290
 
291
  output_path = os.path.join(output_folder, f"{file_stem}_deduplicated{file_ext}")
292
 
293
  if file_ext in ['.xlsx', '.xls']:
294
- df_cleaned.to_excel(output_path, index=False)
295
  elif file_ext in ['.parquet']:
296
  df_cleaned.to_parquet(output_path, index=False)
297
  else:
@@ -307,9 +452,11 @@ def run_tabular_duplicate_analysis(
307
  files: List[str],
308
  threshold: float,
309
  min_words: int,
310
- text_columns: List[str] = None,
311
  output_folder: str = OUTPUT_FOLDER,
312
  do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
 
 
313
  progress: Progress = Progress(track_tqdm=True)
314
  ) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
315
  """
@@ -330,26 +477,37 @@ def run_tabular_duplicate_analysis(
330
  input_files=files,
331
  similarity_threshold=threshold,
332
  min_word_count=min_words,
333
- text_columns=text_columns,
334
  output_folder=output_folder,
335
  do_initial_clean_dup=do_initial_clean_dup,
336
- progress=progress
 
337
  )
338
 
339
 
340
 
341
  # Function to update column choices when files are uploaded
342
- def update_tabular_column_choices(files):
343
  if not files:
344
  return gr.update(choices=[])
345
 
346
  all_columns = set()
347
  for file in files:
348
  try:
349
- df = read_file(file.name)
350
-
 
 
 
 
 
 
 
 
 
351
  # Get text columns
352
  text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
 
353
  all_columns.update(text_cols)
354
  except Exception as e:
355
  print(f"Error reading {file.name}: {e}")
@@ -358,26 +516,59 @@ def update_tabular_column_choices(files):
358
  return gr.Dropdown(choices=sorted(list(all_columns)))
359
 
360
  # Function to handle tabular duplicate detection
361
- def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, output_folder: str = OUTPUT_FOLDER, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN):
362
  if not files:
363
- return pd.DataFrame(), [], gr.Dropdown(choices=[])
 
 
 
 
 
 
 
 
364
 
365
- file_paths = [f.name for f in files]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  results_df, output_paths, full_data = run_tabular_duplicate_analysis(
367
  files=file_paths,
368
  threshold=threshold,
369
  min_words=min_words,
370
- text_columns=text_columns if text_columns else None,
371
  output_folder=output_folder,
372
- do_initial_clean_dup=do_initial_clean_dup
 
 
373
  )
374
-
375
- print("output_paths:", output_paths)
376
 
377
  # Update file choices for cleaning
378
  file_choices = list(set([f for f in file_paths]))
 
 
 
379
 
380
- return results_df, output_paths, gr.Dropdown(choices=file_choices)
381
 
382
  # Function to handle row selection for preview
383
  def handle_tabular_row_selection(results_df, evt:gr.SelectData):
@@ -398,12 +589,12 @@ def handle_tabular_row_selection(results_df, evt:gr.SelectData):
398
  return selected_index, row['Text1'], row['Text2']
399
 
400
  # Function to clean duplicates from selected file
401
- def clean_tabular_duplicates(file_name, results_df, output_folder):
402
  if not file_name or results_df.empty:
403
  return None
404
 
405
  # Get duplicate rows for this file
406
- file_duplicates = results_df[results_df['File1'] == file_name]['Row1'].tolist()
407
 
408
  if not file_duplicates:
409
  return None
@@ -414,7 +605,9 @@ def clean_tabular_duplicates(file_name, results_df, output_folder):
414
  cleaned_file = remove_duplicate_rows_from_tabular_data(
415
  file_path=file_name,
416
  duplicate_rows=file_duplicates,
417
- output_folder=output_folder
 
 
418
  )
419
  return cleaned_file
420
  except Exception as e:
 
1
  import pandas as pd
2
  import os
3
  import re
4
+ import time
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  from typing import List, Tuple, Dict
 
11
  from tools.helper_functions import OUTPUT_FOLDER, read_file
12
  from tools.data_anonymise import initial_clean
13
  from tools.load_spacy_model_custom_recognisers import nlp
14
+ from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN, REMOVE_DUPLICATE_ROWS
15
 
16
+ if REMOVE_DUPLICATE_ROWS == "True": REMOVE_DUPLICATE_ROWS = True
17
+ else: REMOVE_DUPLICATE_ROWS = False
18
 
19
  def clean_and_stem_text_series(df: pd.DataFrame, column: str, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN):
20
  """
 
53
  Returns:
54
  List[Tuple[str, pd.DataFrame]]: List containing (file_name, processed_df) tuple
55
  """
56
+ # if text_columns is None:
57
+ # # Auto-detect text columns (string type columns)
58
+ # print(f"No text columns given for {file_name}")
59
+ # return []
60
+ # text_columns = df.select_dtypes(include=['object', 'string']).columns.tolist()
61
+
62
+ text_columns = [col for col in text_columns if col in df.columns]
63
 
64
  if not text_columns:
65
  print(f"No text columns found in {file_name}")
66
+ return list()
67
 
68
  # Create a copy to avoid modifying original
69
  df_copy = df.copy()
 
75
  df_copy['row_id'] = df_copy.index
76
 
77
  # Create the format expected by the duplicate detection system
78
+ # Using 'row_number' as row number and 'text' as the combined text
79
  processed_df = pd.DataFrame({
80
+ 'row_number': df_copy['row_id'],
81
  'text': df_copy['combined_text'],
82
  'file': file_name
83
  })
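To make the expected shape concrete, a small illustrative example of the conversion (the input data and file name below are invented):

import pandas as pd

df = pd.DataFrame({"title": ["Case A", "Case B"], "notes": ["Visited twice", "No action"]})
combined_text = df["title"].astype(str) + " " + df["notes"].astype(str)
processed_df = pd.DataFrame({
    "row_number": df.index,      # original row position, later reported as Row1/Row2
    "text": combined_text,       # combined text that gets vectorised
    "file": "case_notes.csv",    # file (or file + sheet) label
})
print(processed_df)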
 
92
  input_files: List[str],
93
  similarity_threshold: float = 0.95,
94
  min_word_count: int = 3,
95
+ text_columns: List[str] = [],
96
  output_folder: str = OUTPUT_FOLDER,
97
  do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
98
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
99
+ in_excel_tabular_sheets: List[str] = [],
100
  progress: Progress = Progress(track_tqdm=True)
101
  ) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
102
  """
103
+ Find duplicate cells/text in tabular data files (CSV, XLSX, Parquet).
104
 
105
  Args:
106
  input_files (List[str]): List of file paths to analyze
 
123
 
124
  progress(0.1, desc="Loading and processing files...")
125
 
126
+ all_data_to_process = list()
127
+ full_data_by_file = dict()
128
+ file_paths = list()
129
 
130
  # Process each file
131
  for file_path in input_files:
132
  try:
133
+ if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
134
+ temp_df = pd.DataFrame()
135
+
136
+ # Try finding each sheet in the given list until a match is found
137
+ for sheet_name in in_excel_tabular_sheets:
138
+ temp_df = read_file(file_path, excel_sheet_name=sheet_name)
139
+
140
+ # If the sheet was successfully loaded
141
+ if not temp_df.empty:
142
+ file_name = os.path.basename(file_path) + "_" + sheet_name
143
+ file_paths.append(file_path)
144
+
145
+ # Convert to analysis format
146
+ processed_data = convert_tabular_data_to_analysis_format(
147
+ temp_df, file_name, text_columns
148
+ )
149
+
150
+ if processed_data:
151
+ all_data_to_process.extend(processed_data)
152
+ full_data_by_file[file_name] = processed_data[0][1]
153
+
154
+ temp_df = pd.DataFrame()
155
+ else:
156
+ temp_df = read_file(file_path)
157
 
158
+ file_name = os.path.basename(file_path)
159
+ file_paths.append(file_path)
160
+
161
+ # Convert to analysis format
162
+ processed_data = convert_tabular_data_to_analysis_format(
163
+ temp_df, file_name, text_columns
164
+ )
165
+
166
+ if processed_data:
167
+ all_data_to_process.extend(processed_data)
168
+ full_data_by_file[file_name] = processed_data[0][1]
169
 
170
  except Exception as e:
171
  print(f"Error processing {file_path}: {e}")
 
178
 
179
  # Combine all data
180
  combined_df = pd.concat([data[1] for data in all_data_to_process], ignore_index=True)
181
+
182
+ combined_df = combined_df.drop_duplicates(subset=['row_number', 'file'])
183
 
184
  progress(0.3, desc="Cleaning and preparing text...")
185
 
 
221
 
222
  results_data.append({
223
  'File1': row1_data['file'],
224
+ 'Row1': int(row1_data['row_number']),
225
  'File2': row2_data['file'],
226
+ 'Row2': int(row2_data['row_number']),
227
  'Similarity_Score': round(similarity, 3),
228
  'Text1': row1_data['text'][:200] + '...' if len(row1_data['text']) > 200 else row1_data['text'],
229
  'Text2': row2_data['text'][:200] + '...' if len(row2_data['text']) > 200 else row2_data['text'],
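The similarity values compared against similarity_threshold come from cosine similarity over TF-IDF vectors of the combined row text, using the scikit-learn imports at the top of this module. A stripped-down sketch of that scoring step (toy data, no stemming or minimum-word-count filtering):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

texts = ["visited the property twice", "visited the property twice", "no further action needed"]
tfidf = TfidfVectorizer().fit_transform(texts)   # one sparse vector per row
similarity_matrix = cosine_similarity(tfidf)     # pairwise scores in [0, 1]

threshold = 0.95
candidate_pairs = [
    (i, j, round(float(similarity_matrix[i, j]), 3))
    for i in range(len(texts))
    for j in range(i + 1, len(texts))
    if similarity_matrix[i, j] >= threshold
]
print(candidate_pairs)  # the identical rows 0 and 1 score 1.0 and are reported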
 
237
  progress(0.9, desc="Saving results...")
238
 
239
  # Save results
240
+ output_paths = save_tabular_duplicate_results(results_df, output_folder, file_paths, remove_duplicate_rows=remove_duplicate_rows, in_excel_tabular_sheets=in_excel_tabular_sheets)
241
 
242
  gr.Info(f"Found {len(results_df)} duplicate cell matches")
243
 
244
  return results_df, output_paths, full_data_by_file
245
 
246
+ def save_tabular_duplicate_results(results_df: pd.DataFrame, output_folder: str, file_paths: List[str], remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS, in_excel_tabular_sheets: List[str] = []) -> List[str]:
247
  """
248
  Save tabular duplicate detection results to files.
249
 
 
251
  results_df (pd.DataFrame): Results DataFrame
252
  output_folder (str): Output folder path
253
  file_paths (List[str]): List of file paths
254
+ remove_duplicate_rows (bool): Whether to remove duplicate rows
255
+ in_excel_tabular_sheets (List[str]): Names of the Excel sheets that were included in the analysis
256
  Returns:
257
  List[str]: List of output file paths
258
  """
259
+ output_paths = list()
260
  output_folder_path = Path(output_folder)
261
  output_folder_path.mkdir(exist_ok=True)
262
 
263
  if results_df.empty:
264
  print("No duplicate matches to save.")
265
+ return list()
266
 
267
  # Save main results
268
  results_file = output_folder_path / 'tabular_duplicate_results.csv'
269
  results_df.to_csv(results_file, index=False, encoding="utf-8-sig")
270
  output_paths.append(str(results_file))
271
 
272
+ # Group results by original file to handle Excel files properly
273
+ excel_files_processed = dict() # Track which Excel files have been processed
274
+
275
  # Save per-file duplicate lists
276
+ for file_name, group in results_df.groupby('File2'):
277
+ # Check for matches with original file names
278
+ for original_file in file_paths:
279
+ original_file_name = os.path.basename(original_file)
280
+
281
+ if original_file_name in file_name:
282
+ original_file_extension = os.path.splitext(original_file)[-1]
283
+ if original_file_extension in ['.xlsx', '.xls']:
284
+
285
+ # Split the string using a regex to handle both .xlsx_ and .xls_ delimiters
286
+ # The regex r'\.xlsx_|\.xls_' correctly matches either ".xlsx_" or ".xls_" as a delimiter.
287
+ parts = re.split(r'\.xlsx_|\.xls_', os.path.basename(file_name))
288
+ # The sheet name is the last part after splitting
289
+ file_sheet_name = parts[-1]
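+ # Illustrative example (made-up name): "case_notes.xlsx_Sheet1" splits into
+ # ["case_notes", "Sheet1"], so file_sheet_name is "Sheet1" here.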
290
+
291
+ file_path = original_file
292
+
293
+ # Initialize Excel file tracking if not already done
294
+ if file_path not in excel_files_processed:
295
+ excel_files_processed[file_path] = {
296
+ 'sheets_data': dict(),
297
+ 'all_sheets': list(),
298
+ 'processed_sheets': set()
299
+ }
300
+
301
+ # Read the original Excel file to get all sheet names
302
+ if not excel_files_processed[file_path]['all_sheets']:
303
+ try:
304
+ excel_file = pd.ExcelFile(file_path)
305
+ excel_files_processed[file_path]['all_sheets'] = excel_file.sheet_names
306
+ except Exception as e:
307
+ print(f"Error reading Excel file {file_path}: {e}")
308
+ continue
309
+
310
+ # Read the current sheet
311
+ df = read_file(file_path, excel_sheet_name=file_sheet_name)
312
+
313
+ # Create duplicate rows file for this sheet
314
+ file_stem = Path(file_name).stem
315
+ duplicate_rows_file = output_folder_path / f"{file_stem}_{file_sheet_name}_duplicate_rows.csv"
316
+
317
+ # Get unique row numbers to remove
318
+ rows_to_remove = sorted(group['Row2'].unique())
319
+ duplicate_df = pd.DataFrame({'Row_to_Remove': rows_to_remove})
320
+ duplicate_df.to_csv(duplicate_rows_file, index=False)
321
+ output_paths.append(str(duplicate_rows_file))
322
+
323
+ # Process the sheet data
324
+ df_cleaned = df.copy()
325
+ df_cleaned["duplicated"] = False
326
+ df_cleaned.loc[rows_to_remove, "duplicated"] = True
327
+ if remove_duplicate_rows:
328
+ df_cleaned = df_cleaned.drop(index=rows_to_remove)
329
+
330
+ # Store the processed sheet data
331
+ excel_files_processed[file_path]['sheets_data'][file_sheet_name] = df_cleaned
332
+ excel_files_processed[file_path]['processed_sheets'].add(file_sheet_name)
333
+
334
+ else:
335
+ file_sheet_name = ""
336
+ file_path = original_file
337
+ print("file_path after match:", file_path)
338
+ file_base_name = os.path.basename(file_path)
339
+ df = read_file(file_path)
340
+
341
+ file_stem = Path(file_name).stem
342
+ duplicate_rows_file = output_folder_path / f"{file_stem}_duplicate_rows.csv"
343
+
344
+ # Get unique row numbers to remove
345
+ rows_to_remove = sorted(group['Row2'].unique())
346
+ duplicate_df = pd.DataFrame({'Row_to_Remove': rows_to_remove})
347
+ duplicate_df.to_csv(duplicate_rows_file, index=False)
348
+ output_paths.append(str(duplicate_rows_file))
349
+
350
+ df_cleaned = df.copy()
351
+ df_cleaned["duplicated"] = False
352
+ df_cleaned.loc[rows_to_remove, "duplicated"] = True
353
+ if remove_duplicate_rows:
354
+ df_cleaned = df_cleaned.drop(index=rows_to_remove)
355
+
356
+ file_ext = os.path.splitext(file_name)[-1]
357
+
358
+ if file_ext in ['.parquet']:
359
+ output_path = os.path.join(output_folder, f"{file_base_name}_deduplicated.parquet")
360
+ df_cleaned.to_parquet(output_path, index=False)
361
+ else:
362
+ output_path = os.path.join(output_folder, f"{file_base_name}_deduplicated.csv")
363
+ df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")
364
+
365
+ output_paths.append(str(output_path))
366
+ break
367
+
368
+ # Process Excel files to create complete deduplicated files
369
+ for file_path, file_data in excel_files_processed.items():
370
+ try:
371
+ # Create output filename
372
+ file_base_name = os.path.splitext(os.path.basename(file_path))[0]
373
+ file_ext = os.path.splitext(file_path)[-1]
374
+ output_path = os.path.join(output_folder, f"{file_base_name}_deduplicated{file_ext}")
375
+
376
+ # Create Excel writer
377
+ with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
378
+ # Write all sheets
379
+ for sheet_name in file_data['all_sheets']:
380
+ if sheet_name in file_data['processed_sheets']:
381
+ # Use the processed (deduplicated) version
382
+ file_data['sheets_data'][sheet_name].to_excel(
383
+ writer,
384
+ sheet_name=sheet_name,
385
+ index=False
386
+ )
387
+ else:
388
+ # Use the original sheet (no duplicates found)
389
+ original_df = read_file(file_path, excel_sheet_name=sheet_name)
390
+ original_df.to_excel(
391
+ writer,
392
+ sheet_name=sheet_name,
393
+ index=False
394
+ )
395
+
396
+ output_paths.append(str(output_path))
397
+ print(f"Created deduplicated Excel file: {output_path}")
398
+
399
+ except Exception as e:
400
+ print(f"Error creating deduplicated Excel file for {file_path}: {e}")
401
+ continue
402
 
403
  return output_paths
404
 
405
  def remove_duplicate_rows_from_tabular_data(
406
  file_path: str,
407
  duplicate_rows: List[int],
408
+ output_folder: str = OUTPUT_FOLDER,
409
+ in_excel_tabular_sheets: List[str] = [],
410
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS
411
  ) -> str:
412
  """
413
  Remove duplicate rows from a tabular data file.
 
416
  file_path (str): Path to the input file
417
  duplicate_rows (List[int]): List of row indices to remove
418
  output_folder (str): Output folder for cleaned file
419
+ in_excel_tabular_sheets (str): Name of the Excel sheet to read from and write the cleaned data to
420
+ remove_duplicate_rows (bool): Whether to remove duplicate rows
421
  Returns:
422
  str: Path to the cleaned file
423
  """
424
  try:
425
  # Load the file
426
+ df = read_file(file_path, excel_sheet_name=in_excel_tabular_sheets if in_excel_tabular_sheets else "")
427
 
428
  # Remove duplicate rows (0-indexed)
429
  df_cleaned = df.drop(index=duplicate_rows).reset_index(drop=True)
 
431
  # Save cleaned file
432
  file_name = os.path.basename(file_path)
433
  file_stem = os.path.splitext(file_name)[0]
434
+ file_ext = os.path.splitext(file_name)[-1]
435
 
436
  output_path = os.path.join(output_folder, f"{file_stem}_deduplicated{file_ext}")
437
 
438
  if file_ext in ['.xlsx', '.xls']:
439
+ df_cleaned.to_excel(output_path, index=False, sheet_name=in_excel_tabular_sheets if in_excel_tabular_sheets else "Sheet1")
440
  elif file_ext in ['.parquet']:
441
  df_cleaned.to_parquet(output_path, index=False)
442
  else:
 
452
  files: List[str],
453
  threshold: float,
454
  min_words: int,
455
+ text_columns: List[str] = [],
456
  output_folder: str = OUTPUT_FOLDER,
457
  do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
458
+ remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
459
+ in_excel_tabular_sheets: List[str] = [],
460
  progress: Progress = Progress(track_tqdm=True)
461
  ) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
462
  """
 
477
  input_files=files,
478
  similarity_threshold=threshold,
479
  min_word_count=min_words,
480
+ text_columns=text_columns if text_columns else [],
481
  output_folder=output_folder,
482
  do_initial_clean_dup=do_initial_clean_dup,
483
+ in_excel_tabular_sheets=in_excel_tabular_sheets if in_excel_tabular_sheets else [],
484
+ remove_duplicate_rows=remove_duplicate_rows
485
  )
486
 
487
 
488
 
489
  # Function to update column choices when files are uploaded
490
+ def update_tabular_column_choices(files, in_excel_tabular_sheets: List[str] = []):
491
  if not files:
492
  return gr.update(choices=[])
493
 
494
  all_columns = set()
495
  for file in files:
496
  try:
497
+ file_extension = os.path.splitext(file.name)[-1]
498
+ if file_extension in ['.xlsx', '.xls']:
499
+ for sheet_name in in_excel_tabular_sheets:
500
+ df = read_file(file.name, excel_sheet_name=sheet_name)
501
+ text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
502
+ all_columns.update(text_cols)
503
+ else:
504
+ df = read_file(file.name)
505
+ text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
506
+ all_columns.update(text_cols)
507
+
508
  # Get text columns
509
  text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
510
+
511
  all_columns.update(text_cols)
512
  except Exception as e:
513
  print(f"Error reading {file.name}: {e}")
 
516
  return gr.Dropdown(choices=sorted(list(all_columns)))
517
 
518
  # Function to handle tabular duplicate detection
519
+ def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, output_folder: str = OUTPUT_FOLDER, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN, in_excel_tabular_sheets: List[str] = [], remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS):
520
  if not files:
521
+ print("No files uploaded")
522
+ return pd.DataFrame(), [], gr.Dropdown(choices=[]), 0, "deduplicate"
523
+
524
+ start_time = time.time()
525
+
526
+ task_textbox = "deduplicate"
527
+
528
+ # If output folder doesn't end with a forward slash, add one
529
+ if not output_folder.endswith('/'): output_folder = output_folder + '/'
530
 
531
+ file_paths = []
532
+ if isinstance(files, str):
533
+ # If 'files' is a single string, treat it as a list with one element
534
+ file_paths.append(files)
535
+ elif isinstance(files, list):
536
+ # If 'files' is a list, iterate through its elements
537
+ for f_item in files:
538
+ if isinstance(f_item, str):
539
+ # If an element is a string, it's a direct file path
540
+ file_paths.append(f_item)
541
+ elif hasattr(f_item, 'name'):
542
+ # If an element has a '.name' attribute (e.g., a Gradio File object), use its name
543
+ file_paths.append(f_item.name)
544
+ else:
545
+ # Log a warning for unexpected element types within the list
546
+ print(f"Warning: Skipping an element in 'files' list that is neither a string nor has a '.name' attribute: {type(f_item)}")
547
+ elif hasattr(files, 'name'):
548
+ # Handle the case where a single file object (e.g., gr.File) is passed directly, not in a list
549
+ file_paths.append(files.name)
550
+ else:
551
+ # Raise an error for any other unexpected type of the 'files' argument itself
552
+ raise TypeError(f"Unexpected type for 'files' argument: {type(files)}. Expected str, list of str/file objects, or a single file object.")
553
+
554
  results_df, output_paths, full_data = run_tabular_duplicate_analysis(
555
  files=file_paths,
556
  threshold=threshold,
557
  min_words=min_words,
558
+ text_columns=text_columns if text_columns else [],
559
  output_folder=output_folder,
560
+ do_initial_clean_dup=do_initial_clean_dup,
561
+ in_excel_tabular_sheets=in_excel_tabular_sheets if in_excel_tabular_sheets else [],
562
+ remove_duplicate_rows=remove_duplicate_rows
563
  )
 
 
564
 
565
  # Update file choices for cleaning
566
  file_choices = list(set([f for f in file_paths]))
567
+
568
+ end_time = time.time()
569
+ processing_time = round(end_time - start_time, 2)
570
 
571
+ return results_df, output_paths, gr.Dropdown(choices=file_choices), processing_time, task_textbox
572
 
573
  # Function to handle row selection for preview
574
  def handle_tabular_row_selection(results_df, evt:gr.SelectData):
 
589
  return selected_index, row['Text1'], row['Text2']
590
 
591
  # Function to clean duplicates from selected file
592
+ def clean_tabular_duplicates(file_name, results_df, output_folder, in_excel_tabular_sheets: str = "", remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS):
593
  if not file_name or results_df.empty:
594
  return None
595
 
596
  # Get duplicate rows for this file
597
+ file_duplicates = results_df[results_df['File2'] == file_name]['Row2'].tolist()
598
 
599
  if not file_duplicates:
600
  return None
 
605
  cleaned_file = remove_duplicate_rows_from_tabular_data(
606
  file_path=file_name,
607
  duplicate_rows=file_duplicates,
608
+ output_folder=output_folder,
609
+ in_excel_tabular_sheets=in_excel_tabular_sheets,
610
+ remove_duplicate_rows=remove_duplicate_rows
611
  )
612
  return cleaned_file
613
  except Exception as e:
tools/helper_functions.py CHANGED
@@ -10,7 +10,6 @@ from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
  from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION, MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES, textract_language_choices, aws_comprehend_language_choices, DEFAULT_LANGUAGE
13
- # from tools.load_spacy_model_custom_recognisers import nlp_analyser
14
 
15
  def _get_env_list(env_var_name: str) -> List[str]:
16
  """Parses a comma-separated environment variable into a list of strings."""
@@ -147,14 +146,21 @@ def detect_file_type(filename:str):
147
  elif filename.endswith('.docx'): return 'docx'
148
  else: raise ValueError("Unsupported file type.")
149
 
150
- def read_file(filename:str):
151
  """Read the file based on its detected type."""
152
  file_type = detect_file_type(filename)
153
 
154
  if file_type == 'csv':
155
  return pd.read_csv(filename, low_memory=False)
156
  elif file_type == 'xlsx':
157
- return pd.read_excel(filename)
 
 
 
 
 
 
 
158
  elif file_type == 'parquet':
159
  return pd.read_parquet(filename)
160
 
 
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
  from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION, MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES, textract_language_choices, aws_comprehend_language_choices, DEFAULT_LANGUAGE
 
13
 
14
  def _get_env_list(env_var_name: str) -> List[str]:
15
  """Parses a comma-separated environment variable into a list of strings."""
 
146
  elif filename.endswith('.docx'): return 'docx'
147
  else: raise ValueError("Unsupported file type.")
148
 
149
+ def read_file(filename:str, excel_sheet_name: str = ""):
150
  """Read the file based on its detected type."""
151
  file_type = detect_file_type(filename)
152
 
153
  if file_type == 'csv':
154
  return pd.read_csv(filename, low_memory=False)
155
  elif file_type == 'xlsx':
156
+ if excel_sheet_name:
157
+ try:
158
+ return pd.read_excel(filename, sheet_name=excel_sheet_name)
159
+ except Exception as e:
160
+ print(f"Error reading {filename} with sheet name {excel_sheet_name}: {e}")
161
+ return pd.DataFrame()
162
+ else:
163
+ return pd.read_excel(filename)
164
  elif file_type == 'parquet':
165
  return pd.read_parquet(filename)
166
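A quick illustration of the new optional sheet argument (the workbook path and sheet name here are made up):

from tools.helper_functions import read_file

df_first_sheet = read_file("example_data/cases.xlsx")                      # unchanged behaviour: first sheet
df_notes = read_file("example_data/cases.xlsx", excel_sheet_name="Notes")  # load a named sheet
if df_notes.empty:
    print("Sheet not found or could not be read")  # read_file prints the error and returns an empty DataFrame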
 
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -11,10 +11,13 @@ import re
11
  import os
12
  import requests
13
  import gradio as gr
14
- from tools.config import DEFAULT_LANGUAGE, TESSERACT_DATA_FOLDER
 
15
 
16
  score_threshold = 0.001
17
- custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
 
 
18
 
19
  # Create a class inheriting from SpacyNlpEngine
20
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
 
11
  import os
12
  import requests
13
  import gradio as gr
14
+ from tools.config import DEFAULT_LANGUAGE, TESSERACT_DATA_FOLDER, CUSTOM_ENTITIES
15
+ from tools.helper_functions import _get_env_list
16
 
17
  score_threshold = 0.001
18
+
19
+ if CUSTOM_ENTITIES: CUSTOM_ENTITIES = _get_env_list(CUSTOM_ENTITIES)
20
+ custom_entities = CUSTOM_ENTITIES
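+ # e.g. (assumed config) CUSTOM_ENTITIES="TITLES,UKPOSTCODE,STREETNAME,CUSTOM" is parsed by
+ # _get_env_list into ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"], matching the previous hard-coded default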
21
 
22
  # Create a class inheriting from SpacyNlpEngine
23
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
tools/textract_batch_call.py CHANGED
@@ -1,17 +1,17 @@
1
  import boto3
2
- import time
3
  import os
4
  import pandas as pd
5
  import json
6
  import logging
7
  import datetime
 
8
  import gradio as gr
9
  from gradio import FileData
10
  from typing import List
11
  from io import StringIO
12
  from urllib.parse import urlparse
13
  from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
14
- from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, TEXTRACT_JOBS_S3_INPUT_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
15
  from tools.aws_functions import download_file_from_s3
16
  from tools.file_conversion import get_input_file_names
17
  from tools.helper_functions import get_file_name_without_type
@@ -59,6 +59,7 @@ def analyse_document_with_textract_api(
59
 
60
  # This is a variable that is written to logs to indicate that a Textract API call was made
61
  is_a_textract_api_call = True
 
62
 
63
  # Keep only latest pdf path if it's a list
64
  if isinstance(local_pdf_path, list):
@@ -67,6 +68,23 @@ def analyse_document_with_textract_api(
67
  if not os.path.exists(local_pdf_path):
68
  raise FileNotFoundError(f"Input document not found {local_pdf_path}")
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  if not os.path.exists(local_output_dir):
71
  os.makedirs(local_output_dir)
72
  log_message = f"Created local output directory: {local_output_dir}"
@@ -96,18 +114,32 @@ def analyse_document_with_textract_api(
96
  #logging.error(log_message)
97
  raise
98
 
99
- # If job_df is not empty
100
  if not job_df.empty:
 
 
 
 
 
101
  if "file_name" in job_df.columns:
102
  matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "file_name"]
 
 
 
103
 
104
- if len(matching_job_id_file_names) > 0:
105
- raise Exception("Existing Textract outputs found. No need to re-analyse. Please download existing results from the list")
 
 
 
 
 
 
 
106
 
107
  # --- 2. Start Textract Document Analysis ---
108
  message = "Starting Textract document analysis job..."
109
  print(message)
110
- #logging.info("Starting Textract document analysis job...")
111
 
112
  try:
113
  if "Extract signatures" in analyse_signatures:
@@ -143,19 +175,12 @@ def analyse_document_with_textract_api(
143
  'S3Bucket': s3_bucket_name,
144
  'S3Prefix': s3_output_prefix
145
  }
146
- # Optional: Add NotificationChannel for SNS topic notifications
147
- # NotificationChannel={
148
- # 'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
149
- # 'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
150
- # }
151
  )
152
  job_type="document_text_detection"
153
 
154
  job_id = response['JobId']
155
  print(f"Textract job started with JobId: {job_id}")
156
- #logging.info(f"Textract job started with JobId: {job_id}")
157
 
158
- # Write job_id to memory
159
  # Prepare CSV in memory
160
  log_csv_key_location = f"{s3_output_prefix}/textract_document_jobs.csv"
161
  job_location_full = f"s3://{s3_bucket_name}/{s3_output_prefix}/{job_id}/"
@@ -166,12 +191,16 @@ def analyse_document_with_textract_api(
166
  'file_name': pdf_filename,
167
  'job_type': job_type,
168
  'signature_extraction':analyse_signatures,
169
- #'s3_location': job_location_full,
170
- 'job_date_time': datetime.datetime.now()
171
  }])
172
 
173
  # File path
174
  log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
 
 
 
 
 
175
 
176
  # Check if file exists
177
  file_exists = os.path.exists(log_file_path)
@@ -198,7 +227,7 @@ def analyse_document_with_textract_api(
198
  successful_job_number += 1
199
  total_number_of_textract_page_calls = total_document_page_count
200
 
201
- return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call, total_number_of_textract_page_calls
202
 
203
  def return_job_status(job_id:str,
204
  response:dict,
@@ -467,13 +496,19 @@ def poll_whole_document_textract_analysis_progress_and_download(
467
  progress(0.5, "Document analysis task outputs found. Downloading from S3")
468
 
469
  # If job_df is not empty
 
 
 
 
470
  if not job_df.empty:
471
  if "file_name" in job_df.columns:
472
  matching_job_id_file_names = job_df.loc[job_df["job_id"] == job_id, "file_name"]
473
 
474
  if pdf_filename and not matching_job_id_file_names.empty:
475
  if pdf_filename == matching_job_id_file_names.iloc[0]:
476
- raise Exception("Existing Textract outputs found. No need to re-download.")
 
 
477
 
478
  if not matching_job_id_file_names.empty:
479
  pdf_filename = matching_job_id_file_names.iloc[0]
 
1
  import boto3
 
2
  import os
3
  import pandas as pd
4
  import json
5
  import logging
6
  import datetime
7
+ import pymupdf
8
  import gradio as gr
9
  from gradio import FileData
10
  from typing import List
11
  from io import StringIO
12
  from urllib.parse import urlparse
13
  from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
14
+ from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
15
  from tools.aws_functions import download_file_from_s3
16
  from tools.file_conversion import get_input_file_names
17
  from tools.helper_functions import get_file_name_without_type
 
59
 
60
  # This is a variable that is written to logs to indicate that a Textract API call was made
61
  is_a_textract_api_call = True
62
+ task_textbox = "textract"
63
 
64
  # Keep only latest pdf path if it's a list
65
  if isinstance(local_pdf_path, list):
 
68
  if not os.path.exists(local_pdf_path):
69
  raise FileNotFoundError(f"Input document not found {local_pdf_path}")
70
 
71
+ file_extension = os.path.splitext(local_pdf_path)[1].lower()
72
+
73
+ # Load pdf to get page count if not provided
74
+ if not total_document_page_count and file_extension in ['.pdf']:
75
+ print("Page count not provided. Loading PDF to get page count")
76
+ try:
77
+ pymupdf_doc = pymupdf.open(local_pdf_path)
78
+ total_document_page_count = pymupdf_doc.page_count
79
+ pymupdf_doc.close()
80
+ print("Page count:", total_document_page_count)
81
+ except Exception as e:
82
+ print("Failed to load PDF to get page count:", e, "setting page count to 1")
83
+ total_document_page_count = 1
84
+ #raise Exception(f"Failed to load PDF to get page count: {e}")
85
+ elif not total_document_page_count:
86
+ total_document_page_count = 1
87
+
88
  if not os.path.exists(local_output_dir):
89
  os.makedirs(local_output_dir)
90
  log_message = f"Created local output directory: {local_output_dir}"
 
114
  #logging.error(log_message)
115
  raise
116
 
117
+ # Filter job_df to include rows only where the analysis date is after the current date - DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
118
  if not job_df.empty:
119
+ job_df = job_df.loc[job_df["job_date_time"] > (datetime.datetime.now() - datetime.timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)),:]
120
+
121
+ # If job_df is not empty
122
+ if not job_df.empty:
123
+
124
  if "file_name" in job_df.columns:
125
  matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "file_name"]
126
+ matching_job_id_file_names_dates = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "job_date_time"]
127
+ matching_job_id = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "job_id"]
128
+ matching_handwrite_signature = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "signature_extraction"]
129
 
130
+ if len(matching_job_id) > 0:
131
+ pass
132
+ else:
133
+ matching_job_id = "unknown_job_id"
134
+
135
+ if len(matching_job_id_file_names) > 0 and len(matching_handwrite_signature) > 0:
136
+ out_message = f"Existing Textract outputs found for file {pdf_filename} from date {matching_job_id_file_names_dates.iloc[0]}. No need to re-analyse. Please download existing results from the list with job ID {matching_job_id.iloc[0]}"
137
+ gr.Warning(out_message)
138
+ raise Exception(out_message)
139
 
140
  # --- 2. Start Textract Document Analysis ---
141
  message = "Starting Textract document analysis job..."
142
  print(message)
 
143
 
144
  try:
145
  if "Extract signatures" in analyse_signatures:
 
175
  'S3Bucket': s3_bucket_name,
176
  'S3Prefix': s3_output_prefix
177
  }
 
 
 
 
 
178
  )
179
  job_type="document_text_detection"
180
 
181
  job_id = response['JobId']
182
  print(f"Textract job started with JobId: {job_id}")
 
183
 
 
184
  # Prepare CSV in memory
185
  log_csv_key_location = f"{s3_output_prefix}/textract_document_jobs.csv"
186
  job_location_full = f"s3://{s3_bucket_name}/{s3_output_prefix}/{job_id}/"
 
191
  'file_name': pdf_filename,
192
  'job_type': job_type,
193
  'signature_extraction':analyse_signatures,
194
+ 'job_date_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 
195
  }])
196
 
197
  # File path
198
  log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
199
+ log_file_path_job_id = os.path.join(local_output_dir, pdf_filename + "_textract_document_jobs_job_id.txt")
200
+
201
+ # Write latest job ID to local text file
202
+ with open(log_file_path_job_id, 'w') as f:
203
+ f.write(job_id)
204
 
205
  # Check if file exists
206
  file_exists = os.path.exists(log_file_path)
 
227
  successful_job_number += 1
228
  total_number_of_textract_page_calls = total_document_page_count
229
 
230
+ return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call, total_number_of_textract_page_calls, task_textbox
231
 
232
  def return_job_status(job_id:str,
233
  response:dict,
 
496
  progress(0.5, "Document analysis task outputs found. Downloading from S3")
497
 
498
  # If job_df is not empty
499
+
500
+ # if not job_df.empty:
501
+ # job_df = job_df.loc[job_df["job_date_time"] > (datetime.datetime.now() - datetime.timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)),:]
502
+
503
  if not job_df.empty:
504
  if "file_name" in job_df.columns:
505
  matching_job_id_file_names = job_df.loc[job_df["job_id"] == job_id, "file_name"]
506
 
507
  if pdf_filename and not matching_job_id_file_names.empty:
508
  if pdf_filename == matching_job_id_file_names.iloc[0]:
509
+ out_message = f"Existing Textract outputs found for file {pdf_filename}. No need to re-download."
510
+ gr.Warning(out_message)
511
+ raise Exception(out_message)
512
 
513
  if not matching_job_id_file_names.empty:
514
  pdf_filename = matching_job_id_file_names.iloc[0]