seanpedrickcase committed 9ae09da (1 parent: 003292d)

Added support for other languages. Improved DynamoDB download

Dockerfile CHANGED
@@ -54,7 +54,9 @@ ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
    ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
    USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
    CONFIG_FOLDER=$APP_HOME/app/config/ \
-   XDG_CACHE_HOME=/tmp/xdg_cache/user_1000
+   XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
+   TESSERACT_FOLDER=/usr/bin/tesseract \
+   TESSERACT_DATA_FOLDER=/usr/share/tessdata

# Create the base application directory and set its ownership
RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app

@@ -83,17 +85,23 @@ RUN mkdir -p \
    ${APP_HOME}/app/feedback \
    ${APP_HOME}/app/config

- # Now handle the /tmp and /var/tmp directories and their subdirectories
+ # Now handle the /tmp and /var/tmp directories and their subdirectories, paddle, spacy, tessdata
RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
    && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
    && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
-   && chmod 700 ${XDG_CACHE_HOME}
-
- RUN mkdir -p ${APP_HOME}/.paddlex/official_models \
+   && chmod 700 ${XDG_CACHE_HOME} \
+   && mkdir -p ${APP_HOME}/.paddlex/official_models \
    && chown user:user \
    ${APP_HOME}/.paddlex/official_models \
    && chmod 755 \
    ${APP_HOME}/.paddlex/official_models
+   && mkdir -p ${APP_HOME}/.local/share/spacy/data \
+   && chown user:user \
+   ${APP_HOME}/.local/share/spacy/data \
+   && chmod 755 \
+   ${APP_HOME}/.local/share/spacy/data \
+   mkdir -p /usr/share/tessdata && \
+   chmod 755 /usr/share/tessdata # Create tessdata directory and set permissions

# Copy installed packages from builder stage
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/

@@ -122,6 +130,8 @@ VOLUME ["/home/user/app/usage"]
VOLUME ["/home/user/app/feedback"]
VOLUME ["/home/user/app/config"]
VOLUME ["/home/user/.paddlex/official_models"]
+ VOLUME ["/home/user/.local/share/spacy/data"]
+ VOLUME ["/usr/share/tessdata"]
VOLUME ["/tmp"]
VOLUME ["/var/tmp"]

@@ -134,7 +144,8 @@ ENV PATH=$APP_HOME/.local/bin:$PATH \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860 \
-   GRADIO_ANALYTICS_ENABLED=False
+   GRADIO_ANALYTICS_ENABLED=False \
+

ENTRYPOINT ["/entrypoint.sh"]
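The new TESSERACT_FOLDER and TESSERACT_DATA_FOLDER variables tell the app where the Tesseract binary and its language packs live inside the container. A minimal sketch of how a process might consume them with pytesseract (illustrative only; the repository's actual wiring is in tools/config.py and is not reproduced here, and the helper name below is hypothetical):

import os
import pytesseract

TESSERACT_FOLDER = os.environ.get("TESSERACT_FOLDER", "/usr/bin/tesseract")
TESSERACT_DATA_FOLDER = os.environ.get("TESSERACT_DATA_FOLDER", "/usr/share/tessdata")

def configure_tesseract(lang: str = "fra") -> None:
    """Point pytesseract at the packaged binary and language data."""
    # pytesseract expects the path to the tesseract executable itself
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_FOLDER
    # Tesseract looks for <lang>.traineddata under TESSDATA_PREFIX
    os.environ["TESSDATA_PREFIX"] = TESSERACT_DATA_FOLDER
    traineddata = os.path.join(TESSERACT_DATA_FOLDER, f"{lang}.traineddata")
    if not os.path.exists(traineddata):
        # Tesseract trained-data files use ISO 639-2/3 codes such as eng, fra, deu
        print(f"Warning: {traineddata} not found; OCR for '{lang}' will fail until it is installed.")

Mounting /usr/share/tessdata as a VOLUME (added above) lets downloaded language packs persist across container restarts.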
 
app.py CHANGED
@@ -2,15 +2,15 @@ import os
import pandas as pd
import gradio as gr
from gradio_image_annotation import image_annotator
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL
- from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_LANGUAGE_FULL_NAME, SHOW_LANGUAGE_SELECTION
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
from tools.file_redaction import choose_and_run_redactor
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top, store_duplicate_selection, create_annotation_objects_from_filtered_ocr_results_with_words, df_select_callback_dataframe_row_ocr_with_words, update_redact_choice_df_from_page_dropdown, get_all_rows_with_same_text_redact
from tools.data_anonymise import anonymise_files_with_open_text
from tools.auth import authenticate_user
- from tools.load_spacy_model_custom_recognisers import custom_entities
+ from tools.load_spacy_model_custom_recognisers import custom_entities, load_spacy_model, download_tesseract_lang_pack
from tools.custom_csvlogger import CSVLogger_custom
from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates, run_full_search_and_analysis
from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs

@@ -33,6 +33,8 @@ if SAVE_LOGS_TO_CSV == "True": SAVE_LOGS_TO_CSV = True
else: SAVE_LOGS_TO_CSV = False
if SAVE_LOGS_TO_DYNAMODB == "True": SAVE_LOGS_TO_DYNAMODB = True
else: SAVE_LOGS_TO_DYNAMODB = False
+ if SHOW_LANGUAGE_SELECTION == "True": SHOW_LANGUAGE_SELECTION = True
+ else: SHOW_LANGUAGE_SELECTION = False

if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)

@@ -244,6 +246,10 @@ with app:
    ## Duplicate search object
    new_duplicate_search_annotation_object = gr.Dropdown(value=None, label="new_duplicate_search_annotation_object", allow_custom_value=True, visible=False)

+   # Spacy analyser state
+   updated_nlp_analyser_state = gr.State([])
+   tesseract_lang_data_file_path = gr.Textbox("", visible=False)
+
    ###
    # UI DESIGN
    ###

@@ -588,10 +594,18 @@ with app:
    page_min = gr.Number(value=0, precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
    page_max = gr.Number(value=0, precision=0,minimum=0,maximum=9999, label="Highest page to redact")

-   with gr.Accordion("AWS options", open = False):
-       #with gr.Row():
-       in_redact_language = gr.Dropdown(value = REDACTION_LANGUAGE, choices = [REDACTION_LANGUAGE], label="Redaction language", multiselect=False, visible=False)
+   if SHOW_LANGUAGE_SELECTION:
+       with gr.Accordion("Language selection", open=False):
+           gr.Markdown("""Note that AWS Textract is only compatible with English, Spanish, Italian, Portuguese, French, and German, and handwriting detection is only available in English. AWS Comprehend is additionally compatible with Arabic, Hindi, Japanese, Korean, Chinese, and Chinese (Traditional).
+           The local models (Tesseract and SpaCy) are compatible with the other languages in the list below. However, the language packs for these models need to be installed on your system. When you first run a document through the app, the language packs will be downloaded automatically, but please expect a delay as the models are large.""")
+           with gr.Row():
+               chosen_language_full_name_drop = gr.Dropdown(value = DEFAULT_LANGUAGE_FULL_NAME, choices = MAPPED_LANGUAGE_CHOICES, label="Chosen language", multiselect=False, visible=True)
+               chosen_language_drop = gr.Dropdown(value = DEFAULT_LANGUAGE, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False, visible=True)
+   else:
+       chosen_language_full_name_drop = gr.Dropdown(value = DEFAULT_LANGUAGE_FULL_NAME, choices = MAPPED_LANGUAGE_CHOICES, label="Chosen language", multiselect=False, visible=False)
+       chosen_language_drop = gr.Dropdown(value = DEFAULT_LANGUAGE, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False, visible=False)

+   with gr.Accordion("Use API keys for AWS services", open = False):
        with gr.Row():
            aws_access_key_textbox = gr.Textbox(value='', label="AWS access key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")
            aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")

@@ -651,15 +665,11 @@ with app:
    # Run redaction function
    document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
        success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
-       success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
+       success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
        outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], api_name="redact_doc")
-
-   # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
-   # current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
-   # outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state])

    # If a file has been completed, the function will continue onto the next document
-   latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
+   latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
        outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
        success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
        success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\

@@ -689,7 +699,7 @@ with app:
        success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
        success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
        success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
-       success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
+       success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
        outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
        success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])

@@ -889,13 +899,11 @@ with app:
        success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list, total_pdf_page_count])

    tabular_data_redact_btn.click(reset_data_vars, outputs=[actual_time_taken_number, log_files_output_list_state, comprehend_query_number]).\
-       success(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number], api_name="redact_data").\
-       success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
+       success(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number, chosen_language_drop], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number], api_name="redact_data")

-   # Currently only supports redacting one data file at a time, following code block not used
    # If the output file count text box changes, keep going with redacting each data file until done
-   # text_tabular_files_done.change(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number]).\
-   # success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
+   text_tabular_files_done.change(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames, chosen_language_drop, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number, chosen_language_drop], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number]).\
+       success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])

    ###
    # IDENTIFY DUPLICATE PAGES

@@ -966,7 +974,12 @@ with app:
    merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)

    #
-   all_output_files_btn.click(fn=load_all_output_files, inputs=output_folder_textbox, outputs=all_output_files)
+   all_output_files_btn.click(fn=load_all_output_files, inputs=output_folder_textbox, outputs=all_output_files)
+
+   # Language selection dropdown
+   chosen_language_full_name_drop.select(update_language_dropdown, inputs=[chosen_language_full_name_drop], outputs=[chosen_language_drop])#.\
+   #success(download_tesseract_lang_pack, inputs=[chosen_language_drop], outputs = [tesseract_lang_data_file_path]).\
+   #success(load_spacy_model, inputs=[chosen_language_drop], outputs=[updated_nlp_analyser_state])

    ###
    # APP LOAD AND LOGGING

@@ -1082,17 +1095,4 @@ if __name__ == "__main__":

    main(first_loop_state, latest_file_completed=0, redaction_output_summary_textbox="", output_file_list=None,
        log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
-       current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_page_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),CHOSEN_COMPREHEND_ENTITIES = CHOSEN_COMPREHEND_ENTITIES, CHOSEN_REDACT_ENTITIES = CHOSEN_REDACT_ENTITIES, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
-
-   # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
-   # with gr.Tab(label="Advanced options"):
-   # with gr.Accordion(label = "AWS data access", open = True):
-   # aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
-   # with gr.Row():
-   # in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
-   # load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
-
-   # aws_log_box = gr.Textbox(label="AWS data load status")
-
-   # ### Loading AWS data ###
-   # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_doc_files, aws_log_box])
+       current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_page_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),CHOSEN_COMPREHEND_ENTITIES = CHOSEN_COMPREHEND_ENTITIES, CHOSEN_REDACT_ENTITIES = CHOSEN_REDACT_ENTITIES, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
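The new language UI wires a full-name dropdown ("french") to a short-code dropdown ("fr") via update_language_dropdown, and the short code is what choose_and_run_redactor and anonymise_files_with_open_text now receive. A hypothetical sketch of that mapping, assuming the MAPPED_LANGUAGE_CHOICES / LANGUAGE_CHOICES pairing from tools/config.py; the real helper lives in tools/helper_functions.py and is not shown in this diff:

MAPPED_LANGUAGE_CHOICES = ['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean',
                           'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']
LANGUAGE_CHOICES = ['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']
FULL_NAME_TO_CODE = dict(zip(MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES))

def update_language_dropdown(chosen_full_name: str) -> str:
    # Map the user-facing name (e.g. "french") to the short code (e.g. "fr").
    # Returning the plain string updates the value of the short-code dropdown output.
    return FULL_NAME_TO_CODE.get(chosen_full_name, 'en')

The commented-out .success() calls in the diff suggest that downloading the Tesseract language pack and loading the matching SpaCy model are intended to happen later in the pipeline rather than on dropdown selection.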
load_dynamo_logs.py CHANGED
@@ -1,6 +1,7 @@
import boto3
import csv
from decimal import Decimal
+ import datetime
from boto3.dynamodb.conditions import Key

from tools.config import AWS_REGION, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, OUTPUT_FOLDER

@@ -16,11 +17,26 @@ table = dynamodb.Table(TABLE_NAME)

# Helper function to convert Decimal to float or int
def convert_types(item):
+   new_item = {}
    for key, value in item.items():
+       # Handle Decimals first
        if isinstance(value, Decimal):
-           # Convert to int if no decimal places, else float
-           item[key] = int(value) if value % 1 == 0 else float(value)
-   return item
+           new_item[key] = int(value) if value % 1 == 0 else float(value)
+       # Handle Strings that might be dates
+       elif isinstance(value, str):
+           try:
+               # Attempt to parse a common ISO 8601 format.
+               # The .replace() handles the 'Z' for Zulu/UTC time.
+               dt_obj = datetime.datetime.fromisoformat(value.replace('Z', '+00:00'))
+               # Now that we have a datetime object, format it as desired
+               new_item[key] = dt_obj.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
+           except (ValueError, TypeError):
+               # If it fails to parse, it's just a regular string
+               new_item[key] = value
+       # Handle all other types
+       else:
+           new_item[key] = value
+   return new_item

# Paginated scan
def scan_table():

@@ -35,22 +51,43 @@ def scan_table():
    return items

# Export to CSV
- def export_to_csv(items, output_path):
+ # Export to CSV
+ def export_to_csv(items, output_path, fields_to_drop: list = None):
    if not items:
        print("No items found.")
        return

-   fieldnames = sorted(items[0].keys())
+   # Use a set for efficient lookup
+   drop_set = set(fields_to_drop or [])
+
+   # Get a comprehensive list of all possible headers from all items
+   all_keys = set()
+   for item in items:
+       all_keys.update(item.keys())
+
+   # Determine the final fieldnames by subtracting the ones to drop
+   fieldnames = sorted(list(all_keys - drop_set))
+
+   print("Final CSV columns will be:", fieldnames)

-   with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
-       writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+   with open(output_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
+       # The key fix is here: extrasaction='ignore'
+       # restval='' is also good practice to handle rows that are missing a key
+       writer = csv.DictWriter(
+           csvfile,
+           fieldnames=fieldnames,
+           extrasaction='ignore',
+           restval=''
+       )
        writer.writeheader()

        for item in items:
+           # The convert_types function can now return the full dict,
+           # and the writer will simply ignore the extra fields.
            writer.writerow(convert_types(item))

    print(f"Exported {len(items)} items to {output_path}")

# Run export
items = scan_table()
- export_to_csv(items, CSV_OUTPUT)
+ export_to_csv(items, CSV_OUTPUT, fields_to_drop=[])
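The script relies on a paginated scan_table() whose body is unchanged by this commit and therefore not shown above. For context, a typical boto3 pagination loop for such a helper looks like the following sketch (illustrative only, not the repository's exact code; the region and table name are placeholders):

import boto3

dynamodb = boto3.resource("dynamodb", region_name="eu-west-2")  # example region
table = dynamodb.Table("redaction_usage_logs")                  # example table name

def scan_table():
    items = []
    response = table.scan()
    items.extend(response.get("Items", []))
    # A single Scan call returns at most 1 MB of data; keep requesting pages
    # while DynamoDB reports a LastEvaluatedKey.
    while "LastEvaluatedKey" in response:
        response = table.scan(ExclusiveStartKey=response["LastEvaluatedKey"])
        items.extend(response.get("Items", []))
    return items

With the new export_to_csv above, field names are unioned across every returned item and extrasaction='ignore' prevents DictWriter from raising when an item carries a key that was dropped from the header row.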
tools/aws_textract.py CHANGED
@@ -278,7 +278,6 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_

    return all_ocr_results_with_page, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_words_with_page

-
def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
    """
    Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
tools/config.py CHANGED
@@ -195,7 +195,8 @@ FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FIL
###

# Create Tesseract and Poppler folders if you have installed them locally
- TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
+ TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "/usr/bin/tesseract") # # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
+ TESSERACT_DATA_FOLDER = get_or_create_env_var('TESSERACT_DATA_FOLDER', "/usr/share/tessdata")
POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # If installing on Windows,install Poppler from here https://github.com/oschwartz10612/poppler-windows. This variable needs to point to the poppler bin folder e.g. poppler/poppler-24.02.0/Library/bin/

if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)

@@ -288,7 +289,26 @@ MAX_TIME_VALUE = get_or_create_env_var('MAX_TIME_VALUE', '999999')

CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour

- REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Currently only English is supported by the app
+ ### Language selection options
+
+ SHOW_LANGUAGE_SELECTION = get_or_create_env_var("SHOW_LANGUAGE_SELECTION", "False")
+
+ DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var("DEFAULT_LANGUAGE_FULL_NAME", "english")
+ DEFAULT_LANGUAGE = get_or_create_env_var("DEFAULT_LANGUAGE", "en") # For tesseract, ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system. You can find the relevant language packs here: https://github.com/tesseract-ocr/tessdata.
+ # For paddle, ensure the paddle language data (e.g., fra.traineddata) is installed on your system. You can find information on supported languages here: https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html
+ # For AWS Comprehend, ensure the language data is installed on your system. You can find the relevant language packs here: https://docs.aws.amazon.com/comprehend/latest/dg/supported-languages.html: ('en'|'es'|'fr'|'de'|'it'|'pt'|'ar'|'hi'|'ja'|'ko'|'zh'|'zh-TW')
+ # AWS Textract automatically detects the language of the document and supports the following languages: https://aws.amazon.com/textract/faqs/#topic-0. 'English, Spanish, Italian, Portuguese, French, German. Handwriting, Invoices and Receipts, Identity documents and Queries processing are in English only'
+
+ textract_language_choices = get_or_create_env_var("textract_language_choices", "['en', 'es', 'fr', 'de', 'it', 'pt']")
+ aws_comprehend_language_choices = get_or_create_env_var("aws_comprehend_language_choices", "['en', 'es', 'fr', 'de', 'it', 'pt', 'ar', 'hi', 'ja', 'ko', 'zh', 'zh-TW']")
+
+ # The choices that the user sees
+ MAPPED_LANGUAGE_CHOICES = get_or_create_env_var("MAPPED_LANGUAGE_CHOICES", "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']")
+ LANGUAGE_CHOICES = get_or_create_env_var("LANGUAGE_CHOICES", "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']")
+
+
+
+ ### File output options

RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
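The language settings above are stored as strings that look like Python list literals, e.g. "['en', 'fr', 'de', ...]". A sketch of how such a value can be turned back into a real list; the app's own helper for this (_get_env_list in tools/helper_functions.py, used elsewhere in app.py) is not shown in this diff and may be implemented differently:

import ast

def parse_env_list(value: str) -> list:
    # ast.literal_eval safely evaluates the bracketed, quoted list literal
    # without executing arbitrary code.
    return list(ast.literal_eval(value))

LANGUAGE_CHOICES = parse_env_list("['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']")
print(LANGUAGE_CHOICES[:3])  # ['en', 'fr', 'de']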
 
tools/custom_image_analyser_engine.py CHANGED
@@ -16,7 +16,7 @@ from typing import Optional, Tuple, Union
16
  from tools.helper_functions import clean_unicode_text
17
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
18
  from tools.load_spacy_model_custom_recognisers import custom_entities
19
- from tools.config import PREPROCESS_LOCAL_OCR_IMAGES
20
 
21
  if PREPROCESS_LOCAL_OCR_IMAGES == "True": PREPROCESS_LOCAL_OCR_IMAGES = True
22
  else: PREPROCESS_LOCAL_OCR_IMAGES = False
@@ -26,6 +26,86 @@ try:
26
  except ImportError:
27
  PaddleOCR = None
28
 
29
  @dataclass
30
  class OCRResult:
31
  text: str
@@ -234,6 +314,7 @@ def rescale_ocr_data(ocr_data, scale_factor:float):
234
  ocr_data['height'][i] = h_orig
235
 
236
  return ocr_data
 
237
  class CustomImageAnalyzerEngine:
238
  def __init__(
239
  self,
@@ -241,28 +322,38 @@ class CustomImageAnalyzerEngine:
241
  ocr_engine: str = "tesseract",
242
  tesseract_config: Optional[str] = None,
243
  paddle_kwargs: Optional[Dict[str, Any]] = None,
244
- image_preprocessor: Optional[ImagePreprocessor] = None
 
245
  ):
246
  """
247
  Initializes the CustomImageAnalyzerEngine.
248
 
249
- :param ocr_engine: The OCR engine to use ("tesseract" or "paddle").
250
  :param analyzer_engine: The Presidio AnalyzerEngine instance.
251
  :param tesseract_config: Configuration string for Tesseract.
252
  :param paddle_kwargs: Dictionary of keyword arguments for PaddleOCR constructor.
253
  :param image_preprocessor: Optional image preprocessor.
 
254
  """
255
  if ocr_engine not in ["tesseract", "paddle", "hybrid"]:
256
  raise ValueError("ocr_engine must be either 'tesseract', 'hybrid', or 'paddle'")
257
 
258
  self.ocr_engine = ocr_engine
 
 
 
 
 
259
 
260
  if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
261
  if PaddleOCR is None:
262
  raise ImportError("paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle'")
263
  # Default paddle configuration if none provided
264
  if paddle_kwargs is None:
265
- paddle_kwargs = {'use_textline_orientation': True, 'lang': 'en'}
 
 
 
266
  self.paddle_ocr = PaddleOCR(**paddle_kwargs)
267
 
268
  if not analyzer_engine:
@@ -394,7 +485,8 @@ class CustomImageAnalyzerEngine:
394
  tesseract_data = pytesseract.image_to_data(
395
  image,
396
  output_type=pytesseract.Output.DICT,
397
- config=self.tesseract_config
 
398
  )
399
 
400
  #tesseract_data['abs_line_id'] = tesseract_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
@@ -510,7 +602,8 @@ class CustomImageAnalyzerEngine:
510
  ocr_data = pytesseract.image_to_data(
511
  image,
512
  output_type=pytesseract.Output.DICT,
513
- config=self.tesseract_config
 
514
  )
515
 
516
  #ocr_data['abs_line_id'] = ocr_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
@@ -569,6 +662,7 @@ class CustomImageAnalyzerEngine:
569
  pii_identification_method: str = "Local",
570
  comprehend_client = "",
571
  custom_entities:List[str]=custom_entities,
 
572
  **text_analyzer_kwargs
573
  ) -> List[CustomImageRecognizerResult]:
574
 
@@ -586,10 +680,14 @@ class CustomImageAnalyzerEngine:
586
  # Note: We're not passing line_characters here since it's not needed for this use case
587
  page_text_mapping.append((start_pos, i, line_level_ocr_result, None))
588
 
 
 
 
589
  # Process using either Local or AWS Comprehend
590
  if pii_identification_method == "Local":
591
  analyzer_result = self.analyzer_engine.analyze(
592
  text=page_text,
 
593
  **text_analyzer_kwargs
594
  )
595
  all_text_line_results = map_back_entity_results(
@@ -609,6 +707,7 @@ class CustomImageAnalyzerEngine:
609
  text_analyzer_kwargs["entities"] = custom_redact_entities
610
  page_analyser_result = self.analyzer_engine.analyze(
611
  text=page_text,
 
612
  **text_analyzer_kwargs
613
  )
614
  all_text_line_results = map_back_entity_results(
@@ -641,7 +740,7 @@ class CustomImageAnalyzerEngine:
641
  current_batch,
642
  current_batch_mapping,
643
  comprehend_client,
644
- text_analyzer_kwargs["language"],
645
  text_analyzer_kwargs.get('allow_list', []),
646
  chosen_redact_comprehend_entities,
647
  all_text_line_results
@@ -676,7 +775,7 @@ class CustomImageAnalyzerEngine:
676
  current_batch,
677
  current_batch_mapping,
678
  comprehend_client,
679
- text_analyzer_kwargs["language"],
680
  text_analyzer_kwargs.get('allow_list', []),
681
  chosen_redact_comprehend_entities,
682
  all_text_line_results
@@ -988,7 +1087,7 @@ def run_page_text_redaction(
988
  comprehend_client = None,
989
  allow_list: List[str] = None,
990
  pii_identification_method: str = "Local",
991
- nlp_analyser = None,
992
  score_threshold: float = 0.0,
993
  custom_entities: List[str] = None,
994
  comprehend_query_number:int = 0#,
 
16
  from tools.helper_functions import clean_unicode_text
17
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
18
  from tools.load_spacy_model_custom_recognisers import custom_entities
19
+ from tools.config import PREPROCESS_LOCAL_OCR_IMAGES, DEFAULT_LANGUAGE
20
 
21
  if PREPROCESS_LOCAL_OCR_IMAGES == "True": PREPROCESS_LOCAL_OCR_IMAGES = True
22
  else: PREPROCESS_LOCAL_OCR_IMAGES = False
 
26
  except ImportError:
27
  PaddleOCR = None
28
 
29
+ # --- Language utilities ---
30
+ def _normalize_lang(language: str) -> str:
31
+ return language.strip().lower().replace("-", "_") if language else "en"
32
+
33
+
34
+ def _tesseract_lang_code(language: str) -> str:
35
+ """Map a user language input to a Tesseract traineddata code."""
36
+ lang = _normalize_lang(language)
37
+
38
+ mapping = {
39
+ # Common
40
+ "en": "eng", "eng": "eng",
41
+ "fr": "fra", "fre": "fra", "fra": "fra",
42
+ "de": "deu", "ger": "deu", "deu": "deu",
43
+ "es": "spa", "spa": "spa",
44
+ "it": "ita", "ita": "ita",
45
+ "nl": "nld", "dut": "nld", "nld": "nld",
46
+ "pt": "por", "por": "por",
47
+ "ru": "rus", "rus": "rus",
48
+ "ar": "ara", "ara": "ara",
49
+ # Nordics
50
+ "sv": "swe", "swe": "swe",
51
+ "no": "nor", "nb": "nor", "nn": "nor", "nor": "nor",
52
+ "fi": "fin", "fin": "fin",
53
+ "da": "dan", "dan": "dan",
54
+ # Eastern/Central
55
+ "pl": "pol", "pol": "pol",
56
+ "cs": "ces", "cz": "ces", "ces": "ces",
57
+ "hu": "hun", "hun": "hun",
58
+ "ro": "ron", "rum": "ron", "ron": "ron",
59
+ "bg": "bul", "bul": "bul",
60
+ "el": "ell", "gre": "ell", "ell": "ell",
61
+ # Asian
62
+ "ja": "jpn", "jp": "jpn", "jpn": "jpn",
63
+ "zh": "chi_sim", "zh_cn": "chi_sim", "zh_hans": "chi_sim", "chi_sim": "chi_sim",
64
+ "zh_tw": "chi_tra", "zh_hk": "chi_tra", "zh_tr": "chi_tra", "chi_tra": "chi_tra",
65
+ "hi": "hin", "hin": "hin",
66
+ "bn": "ben", "ben": "ben",
67
+ "ur": "urd", "urd": "urd",
68
+ "fa": "fas", "per": "fas", "fas": "fas",
69
+ }
70
+
71
+ return mapping.get(lang, "eng")
72
+
73
+
74
+ def _paddle_lang_code(language: str) -> str:
75
+ """Map a user language input to a PaddleOCR language code.
76
+
77
+ PaddleOCR supports codes like: 'en', 'ch', 'chinese_cht', 'korean', 'japan', 'german', 'fr', 'it', 'es',
78
+ as well as script packs like 'arabic', 'cyrillic', 'latin'.
79
+ """
80
+ lang = _normalize_lang(language)
81
+
82
+ mapping = {
83
+ "en": "en",
84
+ "fr": "fr",
85
+ "de": "german",
86
+ "es": "es",
87
+ "it": "it",
88
+ "pt": "pt",
89
+ "nl": "nl",
90
+ "ru": "cyrillic", # Russian is covered by cyrillic models
91
+ "uk": "cyrillic",
92
+ "bg": "cyrillic",
93
+ "sr": "cyrillic",
94
+ "ar": "arabic",
95
+ "tr": "tr",
96
+ "fa": "arabic", # fallback to arabic script pack
97
+ "zh": "ch",
98
+ "zh_cn": "ch",
99
+ "zh_tw": "chinese_cht",
100
+ "zh_hk": "chinese_cht",
101
+ "ja": "japan",
102
+ "jp": "japan",
103
+ "ko": "korean",
104
+ "hi": "latin", # fallback; dedicated Hindi not always available
105
+ }
106
+
107
+ return mapping.get(lang, "en")
108
+
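A few illustrative calls to the mapping helpers above:

_tesseract_lang_code("fr")       # -> 'fra' (requires fra.traineddata to be installed)
_tesseract_lang_code("zh-TW")    # -> 'chi_tra'
_paddle_lang_code("de")          # -> 'german'
_paddle_lang_code("xx")          # -> 'en' (unknown codes fall back to English)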
109
  @dataclass
110
  class OCRResult:
111
  text: str
 
314
  ocr_data['height'][i] = h_orig
315
 
316
  return ocr_data
317
+
318
  class CustomImageAnalyzerEngine:
319
  def __init__(
320
  self,
 
322
  ocr_engine: str = "tesseract",
323
  tesseract_config: Optional[str] = None,
324
  paddle_kwargs: Optional[Dict[str, Any]] = None,
325
+ image_preprocessor: Optional[ImagePreprocessor] = None,
326
+ language: Optional[str] = None
327
  ):
328
  """
329
  Initializes the CustomImageAnalyzerEngine.
330
 
331
+ :param ocr_engine: The OCR engine to use ("tesseract", "hybrid", or "paddle").
332
  :param analyzer_engine: The Presidio AnalyzerEngine instance.
333
  :param tesseract_config: Configuration string for Tesseract.
334
  :param paddle_kwargs: Dictionary of keyword arguments for PaddleOCR constructor.
335
  :param image_preprocessor: Optional image preprocessor.
336
+ :param language: Preferred OCR language (e.g., "en", "fr", "de"). Defaults to DEFAULT_LANGUAGE.
337
  """
338
  if ocr_engine not in ["tesseract", "paddle", "hybrid"]:
339
  raise ValueError("ocr_engine must be either 'tesseract', 'hybrid', or 'paddle'")
340
 
341
  self.ocr_engine = ocr_engine
342
+
343
+ # Language setup
344
+ self.language = language or DEFAULT_LANGUAGE or "en"
345
+ self.tesseract_lang = _tesseract_lang_code(self.language)
346
+ self.paddle_lang = _paddle_lang_code(self.language)
347
 
348
  if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
349
  if PaddleOCR is None:
350
  raise ImportError("paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle'")
351
  # Default paddle configuration if none provided
352
  if paddle_kwargs is None:
353
+ paddle_kwargs = {'use_textline_orientation': True, 'lang': self.paddle_lang}
354
+ else:
355
+ # Enforce language if not explicitly provided
356
+ paddle_kwargs.setdefault('lang', self.paddle_lang)
357
  self.paddle_ocr = PaddleOCR(**paddle_kwargs)
358
 
359
  if not analyzer_engine:
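For illustration, this mirrors how the engine is now constructed in tools/file_redaction.py, with values hard-coded here:

analyser = CustomImageAnalyzerEngine(
    analyzer_engine=nlp_analyser,  # a configured Presidio AnalyzerEngine
    ocr_engine="paddle",
    language="fr",                 # mapped internally to Paddle 'fr' and Tesseract 'fra'
)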
 
485
  tesseract_data = pytesseract.image_to_data(
486
  image,
487
  output_type=pytesseract.Output.DICT,
488
+ config=self.tesseract_config,
489
+ lang=self.tesseract_lang
490
  )
491
 
492
  #tesseract_data['abs_line_id'] = tesseract_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
 
602
  ocr_data = pytesseract.image_to_data(
603
  image,
604
  output_type=pytesseract.Output.DICT,
605
+ config=self.tesseract_config,
606
+ lang=self.tesseract_lang # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
607
  )
608
 
609
  #ocr_data['abs_line_id'] = ocr_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
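Because image_to_data is now called with lang=self.tesseract_lang, the matching traineddata file must be installed. A quick pre-flight check using pytesseract's own helper (a sketch):

import pytesseract

available = pytesseract.get_languages(config="")
if "fra" not in available:
    print("fra.traineddata not found - install it into your tessdata folder")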
 
662
  pii_identification_method: str = "Local",
663
  comprehend_client = "",
664
  custom_entities:List[str]=custom_entities,
665
+ language: Optional[str] = None,
666
  **text_analyzer_kwargs
667
  ) -> List[CustomImageRecognizerResult]:
668
 
 
680
  # Note: We're not passing line_characters here since it's not needed for this use case
681
  page_text_mapping.append((start_pos, i, line_level_ocr_result, None))
682
 
683
+ # Determine language for downstream services
684
+ aws_language = language or getattr(self, 'language', None) or 'en'
685
+
686
  # Process using either Local or AWS Comprehend
687
  if pii_identification_method == "Local":
688
  analyzer_result = self.analyzer_engine.analyze(
689
  text=page_text,
690
+ language=language,
691
  **text_analyzer_kwargs
692
  )
693
  all_text_line_results = map_back_entity_results(
 
707
  text_analyzer_kwargs["entities"] = custom_redact_entities
708
  page_analyser_result = self.analyzer_engine.analyze(
709
  text=page_text,
710
+ language=language,
711
  **text_analyzer_kwargs
712
  )
713
  all_text_line_results = map_back_entity_results(
 
740
  current_batch,
741
  current_batch_mapping,
742
  comprehend_client,
743
+ aws_language,
744
  text_analyzer_kwargs.get('allow_list', []),
745
  chosen_redact_comprehend_entities,
746
  all_text_line_results
 
775
  current_batch,
776
  current_batch_mapping,
777
  comprehend_client,
778
+ aws_language,
779
  text_analyzer_kwargs.get('allow_list', []),
780
  chosen_redact_comprehend_entities,
781
  all_text_line_results
 
1087
  comprehend_client = None,
1088
  allow_list: List[str] = None,
1089
  pii_identification_method: str = "Local",
1090
+ nlp_analyser: AnalyzerEngine = None,
1091
  score_threshold: float = 0.0,
1092
  custom_entities: List[str] = None,
1093
  comprehend_query_number:int = 0#,
tools/data_anonymise.py CHANGED
@@ -10,11 +10,11 @@ import docx
10
  from openpyxl import Workbook
11
  from faker import Faker
12
  from gradio import Progress
13
- from typing import List, Dict, Any
14
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
15
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
16
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
17
- from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER
18
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
19
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
20
  # Use custom version of analyze_dict to be able to track progress
@@ -119,7 +119,7 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
119
  #analyzer = AnalyzerEngine()
120
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
121
 
122
- analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
123
  analyzer_results = list(analyzer_results)
124
 
125
  # + tags=[]
@@ -208,7 +208,6 @@ def handle_docx_anonymisation(
208
  file_path: str,
209
  output_folder: str,
210
  anon_strat: str,
211
- language: str,
212
  chosen_redact_entities: List[str],
213
  in_allow_list: List[str],
214
  in_deny_list: List[str],
@@ -216,7 +215,8 @@ def handle_docx_anonymisation(
216
  pii_identification_method: str,
217
  chosen_redact_comprehend_entities: List[str],
218
  comprehend_query_number: int,
219
- comprehend_client # Assuming botocore.client.BaseClient type
 
220
  ):
221
  """
222
  Anonymises a .docx file by extracting text, processing it, and re-inserting it.
@@ -253,11 +253,14 @@ def handle_docx_anonymisation(
253
  # 2. Convert to a DataFrame for the existing anonymisation script
254
  df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})
255
 
 
 
 
256
  # 3. Call the core anonymisation script
257
  anonymised_df, _, decision_log = anonymise_script(
258
  df=df_to_anonymise,
259
  anon_strat=anon_strat,
260
- language=language,
261
  chosen_redact_entities=chosen_redact_entities,
262
  in_allow_list=in_allow_list,
263
  in_deny_list=in_deny_list,
@@ -307,7 +310,6 @@ def anonymise_files_with_open_text(file_paths: List[str],
307
  in_text: str,
308
  anon_strat: str,
309
  chosen_cols: List[str],
310
- language: str,
311
  chosen_redact_entities: List[str],
312
  in_allow_list: List[str] = None,
313
  latest_file_completed: int = 0,
@@ -325,7 +327,9 @@ def anonymise_files_with_open_text(file_paths: List[str],
325
  aws_access_key_textbox:str='',
326
  aws_secret_key_textbox:str='',
327
  actual_time_taken_number:float=0,
328
- progress: Progress = Progress(track_tqdm=True)):
 
 
329
  """
330
  This function anonymises data files based on the provided parameters.
331
 
@@ -352,11 +356,21 @@ def anonymise_files_with_open_text(file_paths: List[str],
352
  - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
353
  - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
354
  - actual_time_taken_number (float, optional): Time taken to do the redaction.
 
355
  - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
356
  """
357
 
358
  tic = time.perf_counter()
359
  comprehend_client = ""
 
 
 
 
 
 
 
 
 
360
 
361
  # If this is the first time around, set variables to 0/blank
362
  if first_loop_state==True:
@@ -455,7 +469,6 @@ def anonymise_files_with_open_text(file_paths: List[str],
455
  file_path=anon_file.name, # .name if it's a temp file object
456
  output_folder=output_folder,
457
  anon_strat=anon_strat,
458
- language=language,
459
  chosen_redact_entities=chosen_redact_entities,
460
  in_allow_list=in_allow_list_flat,
461
  in_deny_list=in_deny_list,
@@ -463,7 +476,8 @@ def anonymise_files_with_open_text(file_paths: List[str],
463
  pii_identification_method=pii_identification_method,
464
  chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
465
  comprehend_query_number=comprehend_query_number,
466
- comprehend_client=comprehend_client
 
467
  )
468
  if output_path:
469
  out_file_paths.append(output_path)
@@ -493,14 +507,14 @@ def anonymise_files_with_open_text(file_paths: List[str],
493
 
494
  anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
495
 
496
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
497
 
498
  else:
499
  sheet_name = ""
500
  anon_df = read_file(anon_file)
501
  out_file_part = get_file_name_without_type(anon_file.name)
502
 
503
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
504
 
505
  # Increase latest file completed count unless we are at the last file
506
  if latest_file_completed != len(file_paths):
@@ -537,7 +551,7 @@ def tabular_anonymise_wrapper_func(
537
  out_message: str,
538
  excel_sheet_name: str,
539
  anon_strat: str,
540
- language: str,
541
  chosen_redact_entities: List[str],
542
  in_allow_list: List[str],
543
  file_type: str,
@@ -546,6 +560,7 @@ def tabular_anonymise_wrapper_func(
546
  in_deny_list: List[str]=[],
547
  max_fuzzy_spelling_mistakes_num:int=0,
548
  pii_identification_method:str="Local",
 
549
  chosen_redact_comprehend_entities:List[str]=[],
550
  comprehend_query_number:int=0,
551
  comprehend_client:botocore.client.BaseClient="",
@@ -617,8 +632,11 @@ def tabular_anonymise_wrapper_func(
617
  anon_df_part = anon_df[chosen_cols_in_anon_df]
618
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
619
 
 
 
 
620
  # Anonymise the selected columns
621
- anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client)
622
 
623
  anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
624
 
@@ -681,6 +699,7 @@ def anonymise_script(df:pd.DataFrame,
681
  in_deny_list:List[str]=[],
682
  max_fuzzy_spelling_mistakes_num:int=0,
683
  pii_identification_method:str="Local",
 
684
  chosen_redact_comprehend_entities:List[str]=[],
685
  comprehend_query_number:int=0,
686
  comprehend_client:botocore.client.BaseClient="",
@@ -738,6 +757,9 @@ def anonymise_script(df:pd.DataFrame,
738
  batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
739
  analyzer_results = []
740
 
 
 
 
741
  if pii_identification_method == "Local":
742
 
743
  # Use custom analyzer to be able to track progress with Gradio
@@ -801,7 +823,7 @@ def anonymise_script(df:pd.DataFrame,
801
  try:
802
  response = comprehend_client.detect_pii_entities(
803
  Text=str(text),
804
- LanguageCode=language
805
  )
806
 
807
  comprehend_query_number += 1
 
10
  from openpyxl import Workbook
11
  from faker import Faker
12
  from gradio import Progress
13
+ from typing import List, Dict, Any, Optional
14
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
15
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
16
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
17
+ from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices
18
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
19
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
20
  # Use custom version of analyze_dict to be able to track progress
 
119
  #analyzer = AnalyzerEngine()
120
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
121
 
122
+ analyzer_results = batch_analyzer.analyze_dict(df_dict, language=DEFAULT_LANGUAGE)
123
  analyzer_results = list(analyzer_results)
124
 
125
  # + tags=[]
 
208
  file_path: str,
209
  output_folder: str,
210
  anon_strat: str,
 
211
  chosen_redact_entities: List[str],
212
  in_allow_list: List[str],
213
  in_deny_list: List[str],
 
215
  pii_identification_method: str,
216
  chosen_redact_comprehend_entities: List[str],
217
  comprehend_query_number: int,
218
+ comprehend_client, # Assuming botocore.client.BaseClient type
219
+ language: Optional[str] = None
220
  ):
221
  """
222
  Anonymises a .docx file by extracting text, processing it, and re-inserting it.
 
253
  # 2. Convert to a DataFrame for the existing anonymisation script
254
  df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})
255
 
256
+ # Use provided language or default
257
+ effective_language = language or DEFAULT_LANGUAGE
258
+
259
  # 3. Call the core anonymisation script
260
  anonymised_df, _, decision_log = anonymise_script(
261
  df=df_to_anonymise,
262
  anon_strat=anon_strat,
263
+ language=effective_language,
264
  chosen_redact_entities=chosen_redact_entities,
265
  in_allow_list=in_allow_list,
266
  in_deny_list=in_deny_list,
 
310
  in_text: str,
311
  anon_strat: str,
312
  chosen_cols: List[str],
 
313
  chosen_redact_entities: List[str],
314
  in_allow_list: List[str] = None,
315
  latest_file_completed: int = 0,
 
327
  aws_access_key_textbox:str='',
328
  aws_secret_key_textbox:str='',
329
  actual_time_taken_number:float=0,
330
+ language: Optional[str] = None,
331
+ progress: Progress = Progress(track_tqdm=True),
332
+ comprehend_language: Optional[str] = None):
333
  """
334
  This function anonymises data files based on the provided parameters.
335
 
 
356
  - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
357
  - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
358
  - actual_time_taken_number (float, optional): Time taken to do the redaction.
359
+ - language (str, optional): The language of the text to anonymise.
+ - comprehend_language (str, optional): The language code to use for AWS Comprehend calls. Falls back to language if not provided.
360
  - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
361
  """
362
 
363
  tic = time.perf_counter()
364
  comprehend_client = ""
365
+
366
+ # Use provided language or default
367
+ effective_language = language or DEFAULT_LANGUAGE
368
+ effective_comprehend_language = comprehend_language or effective_language
369
+
370
+ if pii_identification_method == "AWS Comprehend":
371
+ if effective_comprehend_language not in aws_comprehend_language_choices:
372
+ out_message = f"Please note that this language is not supported by AWS Comprehend: {effective_comprehend_language}"
373
+ raise Warning(out_message)
374
 
375
  # If this is the first time around, set variables to 0/blank
376
  if first_loop_state==True:
 
469
  file_path=anon_file.name, # .name if it's a temp file object
470
  output_folder=output_folder,
471
  anon_strat=anon_strat,
 
472
  chosen_redact_entities=chosen_redact_entities,
473
  in_allow_list=in_allow_list_flat,
474
  in_deny_list=in_deny_list,
 
476
  pii_identification_method=pii_identification_method,
477
  chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
478
  comprehend_query_number=comprehend_query_number,
479
+ comprehend_client=comprehend_client,
480
+ language=effective_language
481
  )
482
  if output_path:
483
  out_file_paths.append(output_path)
 
507
 
508
  anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
509
 
510
+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, effective_language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, effective_comprehend_language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
511
 
512
  else:
513
  sheet_name = ""
514
  anon_df = read_file(anon_file)
515
  out_file_part = get_file_name_without_type(anon_file.name)
516
 
517
+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, effective_language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, effective_comprehend_language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
518
 
519
  # Increase latest file completed count unless we are at the last file
520
  if latest_file_completed != len(file_paths):
 
551
  out_message: str,
552
  excel_sheet_name: str,
553
  anon_strat: str,
554
+ language: str,
555
  chosen_redact_entities: List[str],
556
  in_allow_list: List[str],
557
  file_type: str,
 
560
  in_deny_list: List[str]=[],
561
  max_fuzzy_spelling_mistakes_num:int=0,
562
  pii_identification_method:str="Local",
563
+ comprehend_language: Optional[str] = None,
564
  chosen_redact_comprehend_entities:List[str]=[],
565
  comprehend_query_number:int=0,
566
  comprehend_client:botocore.client.BaseClient="",
 
632
  anon_df_part = anon_df[chosen_cols_in_anon_df]
633
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
634
 
635
+ # Use provided comprehend language or fall back to main language
636
+ effective_comprehend_language = comprehend_language or language
637
+
638
  # Anonymise the selected columns
639
+ anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, effective_comprehend_language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client)
640
 
641
  anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
642
 
 
699
  in_deny_list:List[str]=[],
700
  max_fuzzy_spelling_mistakes_num:int=0,
701
  pii_identification_method:str="Local",
702
+ comprehend_language:Optional[str]=None,
703
  chosen_redact_comprehend_entities:List[str]=[],
704
  comprehend_query_number:int=0,
705
  comprehend_client:botocore.client.BaseClient="",
 
757
  batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
758
  analyzer_results = []
759
 
760
+ # Use provided comprehend language or fall back to main language
761
+ effective_comprehend_language = comprehend_language or language
762
+
763
  if pii_identification_method == "Local":
764
 
765
  # Use custom analyzer to be able to track progress with Gradio
 
823
  try:
824
  response = comprehend_client.detect_pii_entities(
825
  Text=str(text),
826
+ LanguageCode=effective_comprehend_language
827
  )
828
 
829
  comprehend_query_number += 1
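For context, a direct boto3 call to detect_pii_entities with an explicit LanguageCode looks like this (region and text are placeholders; offsets in the response refer to the input string):

import boto3

comprehend = boto3.client("comprehend", region_name="eu-west-2")  # illustrative region
response = comprehend.detect_pii_entities(
    Text="John Smith lives at 10 Downing Street.",
    LanguageCode="en",
)
for entity in response["Entities"]:
    print(entity["Type"], entity["Score"], entity["BeginOffset"], entity["EndOffset"])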
tools/file_conversion.py CHANGED
@@ -673,7 +673,11 @@ def prepare_image_or_pdf(
673
  all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
674
  json_from_csv = True
675
  elif '_ocr_output' in file_path_without_ext:
676
- all_line_level_ocr_results_df = read_file(file_path)
 
 
 
 
677
  json_from_csv = False
678
  elif '_ocr_results_with_words' in file_path_without_ext:
679
  all_page_line_level_ocr_results_with_words_df = read_file(file_path)
 
673
  all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
674
  json_from_csv = True
675
  elif '_ocr_output' in file_path_without_ext:
676
+ all_line_level_ocr_results_df = read_file(file_path)
677
+
678
+ if "line" not in all_line_level_ocr_results_df.columns:
679
+ all_line_level_ocr_results_df["line"] = ""
680
+
681
  json_from_csv = False
682
  elif '_ocr_results_with_words' in file_path_without_ext:
683
  all_page_line_level_ocr_results_with_words_df = read_file(file_path)
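The new check backfills a 'line' column for older '_ocr_output' files that were saved without one; in pandas terms (minimal illustration with made-up data):

import pandas as pd

ocr_df = pd.DataFrame({"page": [1, 1], "text": ["First line", "Second line"]})
if "line" not in ocr_df.columns:
    ocr_df["line"] = ""  # older OCR outputs lacked this column, so add an empty placeholder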
tools/file_redaction.py CHANGED
@@ -15,14 +15,15 @@ from pdfminer.high_level import extract_pages
15
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
16
  from pikepdf import Pdf, Dictionary, Name
17
  from pymupdf import Rect, Page, Document
 
18
  import gradio as gr
19
  from gradio import Progress
20
  from collections import defaultdict # For efficient grouping
21
 
22
- from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION
23
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
24
  from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
25
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
26
  from tools.helper_functions import get_file_name_without_type, clean_unicode_text
27
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
28
 
@@ -84,12 +85,9 @@ def merge_page_results(data:list):
84
 
85
  return list(merged.values())
86
 
87
-
88
-
89
  def choose_and_run_redactor(file_paths:List[str],
90
  prepared_pdf_file_paths:List[str],
91
- pdf_image_file_paths:List[str],
92
- language:str,
93
  chosen_redact_entities:List[str],
94
  chosen_redact_comprehend_entities:List[str],
95
  text_extraction_method:str,
@@ -112,7 +110,7 @@ def choose_and_run_redactor(file_paths:List[str],
112
  pymupdf_doc=list(),
113
  current_loop_page:int=0,
114
  page_break_return:bool=False,
115
- pii_identification_method:str="Local",
116
  comprehend_query_number:int=0,
117
  max_fuzzy_spelling_mistakes_num:int=1,
118
  match_fuzzy_whole_phrase_bool:bool=True,
@@ -134,7 +132,8 @@ def choose_and_run_redactor(file_paths:List[str],
134
  all_page_line_level_ocr_results_with_words:list[dict] = list(),
135
  all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
136
  chosen_local_model:str="tesseract",
137
- prepare_images:bool=True,
 
138
  RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
139
  progress=gr.Progress(track_tqdm=True)):
140
  '''
@@ -143,7 +142,7 @@ def choose_and_run_redactor(file_paths:List[str],
143
  - file_paths (List[str]): A list of paths to the files to be redacted.
144
  - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
145
  - pdf_image_file_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
146
- - language (str): The language of the text in the files.
147
  - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
148
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
149
  - text_extraction_method (str): The method to use to extract text from documents.
@@ -188,7 +187,9 @@ def choose_and_run_redactor(file_paths:List[str],
188
  - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
189
  - all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
190
  - chosen_local_model (str): Which local model is being used for OCR on images - "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both.
191
- - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
 
 
192
  - RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
193
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
194
 
@@ -203,6 +204,18 @@ def choose_and_run_redactor(file_paths:List[str],
203
  blank_request_metadata = []
204
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
205
  review_out_file_paths = [prepared_pdf_file_paths[0]]
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  if all_page_line_level_ocr_results_with_words_df is None:
208
  all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
@@ -452,6 +465,19 @@ def choose_and_run_redactor(file_paths:List[str],
452
  else:
453
  textract_client = ""
454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
  # Check if output_folder exists, create it if it doesn't
456
  if not os.path.exists(output_folder): os.makedirs(output_folder)
457
 
@@ -511,7 +537,7 @@ def choose_and_run_redactor(file_paths:List[str],
511
 
512
  pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words = redact_image_pdf(file_path,
513
  pdf_image_file_paths,
514
- language,
515
  chosen_redact_entities,
516
  chosen_redact_comprehend_entities,
517
  in_allow_list_flat,
@@ -538,7 +564,7 @@ def choose_and_run_redactor(file_paths:List[str],
538
  text_extraction_only,
539
  all_page_line_level_ocr_results,
540
  all_page_line_level_ocr_results_with_words,
541
- chosen_local_model,
542
  log_files_output_paths=log_files_output_paths,
543
  output_folder=output_folder)
544
 
@@ -560,7 +586,7 @@ def choose_and_run_redactor(file_paths:List[str],
560
 
561
  pymupdf_doc, all_pages_decision_process_table, all_page_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
562
  file_path,
563
- language,
564
  chosen_redact_entities,
565
  chosen_redact_comprehend_entities,
566
  in_allow_list_flat,
@@ -1352,6 +1378,7 @@ def redact_image_pdf(file_path:str,
1352
  log_files_output_paths:List=list(),
1353
  max_time:int=int(MAX_TIME_VALUE),
1354
  output_folder:str=OUTPUT_FOLDER,
 
1355
  progress=Progress(track_tqdm=True)):
1356
 
1357
  '''
@@ -1391,6 +1418,7 @@ def redact_image_pdf(file_path:str,
1391
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
1392
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1393
  - output_folder (str, optional): The folder for file outputs.
 
1394
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
1395
 
1396
  The function returns a redacted PDF document along with processing output objects.
@@ -1400,6 +1428,20 @@ def redact_image_pdf(file_path:str,
1400
 
1401
  file_name = get_file_name_without_type(file_path)
1402
  comprehend_query_number_new = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1403
 
1404
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1405
  if custom_recogniser_word_list:
@@ -1413,9 +1455,9 @@ def redact_image_pdf(file_path:str,
1413
 
1414
  # Only load in PaddleOCR models if not running Textract
1415
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1416
- image_analyser = CustomImageAnalyzerEngine(nlp_analyser, ocr_engine="tesseract")
1417
  else:
1418
- image_analyser = CustomImageAnalyzerEngine(nlp_analyser, ocr_engine=chosen_local_model)
1419
 
1420
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1421
  out_message = "Connection to AWS Comprehend service unsuccessful."
@@ -1635,7 +1677,7 @@ def redact_image_pdf(file_path:str,
1635
  chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
1636
  pii_identification_method = pii_identification_method,
1637
  comprehend_client=comprehend_client,
1638
- language=language,
1639
  entities=chosen_redact_entities,
1640
  allow_list=allow_list,
1641
  score_threshold=score_threshold
@@ -2155,7 +2197,7 @@ def redact_text_pdf(
2155
  all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
2156
  pymupdf_doc: List = list(), # List of PyMuPDF documents
2157
  all_page_line_level_ocr_results_with_words: List = list(),
2158
- pii_identification_method: str = "Local",
2159
  comprehend_query_number:int = 0,
2160
  comprehend_client="",
2161
  custom_recogniser_word_list:List[str]=list(),
@@ -2167,10 +2209,10 @@ def redact_text_pdf(
2167
  text_extraction_only:bool=False,
2168
  output_folder:str=OUTPUT_FOLDER,
2169
  page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
2170
- max_time: int = int(MAX_TIME_VALUE),
 
2171
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
2172
- ):
2173
-
2174
  '''
2175
  Redact chosen entities from a PDF that is made up of multiple pages that are not images.
2176
 
@@ -2199,13 +2241,15 @@ def redact_text_pdf(
2199
  - page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
2200
  - original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
2201
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
 
2202
  - output_folder (str, optional): The output folder for the function
2203
  - page_break_val: Value for page break
2204
- - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
 
2205
  - progress: Progress tracking object
2206
  '''
2207
 
2208
- tic = time.perf_counter()
2209
 
2210
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2211
  all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
@@ -2218,6 +2262,20 @@ def redact_text_pdf(
2218
  out_message = "Connection to AWS Comprehend service not found."
2219
  raise Exception(out_message)
2220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2221
  # Update custom word list analyser object with any new words that have been added to the custom deny list
2222
  if custom_recogniser_word_list:
2223
  nlp_analyser.registry.remove_recognizer("CUSTOM")
@@ -2228,6 +2286,8 @@ def redact_text_pdf(
2228
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
2229
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
2230
 
 
 
2231
  # Open with Pikepdf to get text lines
2232
  pikepdf_pdf = Pdf.open(file_path)
2233
  number_of_pages = len(pikepdf_pdf.pages)
@@ -2323,7 +2383,7 @@ def redact_text_pdf(
2323
 
2324
  if chosen_redact_entities or chosen_redact_comprehend_entities:
2325
  page_redaction_bounding_boxes = run_page_text_redaction(
2326
- language,
2327
  chosen_redact_entities,
2328
  chosen_redact_comprehend_entities,
2329
  all_page_line_level_text_extraction_results_list,
 
15
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
16
  from pikepdf import Pdf, Dictionary, Name
17
  from pymupdf import Rect, Page, Document
18
+ from presidio_analyzer import AnalyzerEngine
19
  import gradio as gr
20
  from gradio import Progress
21
  from collections import defaultdict # For efficient grouping
22
 
23
+ from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices
24
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
25
  from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
26
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
27
  from tools.helper_functions import get_file_name_without_type, clean_unicode_text
28
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
29
 
 
85
 
86
  return list(merged.values())
87
 
 
 
88
  def choose_and_run_redactor(file_paths:List[str],
89
  prepared_pdf_file_paths:List[str],
90
+ pdf_image_file_paths:List[str],
 
91
  chosen_redact_entities:List[str],
92
  chosen_redact_comprehend_entities:List[str],
93
  text_extraction_method:str,
 
110
  pymupdf_doc=list(),
111
  current_loop_page:int=0,
112
  page_break_return:bool=False,
113
+ pii_identification_method:str="Local",
114
  comprehend_query_number:int=0,
115
  max_fuzzy_spelling_mistakes_num:int=1,
116
  match_fuzzy_whole_phrase_bool:bool=True,
 
132
  all_page_line_level_ocr_results_with_words:list[dict] = list(),
133
  all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
134
  chosen_local_model:str="tesseract",
135
+ language:str=DEFAULT_LANGUAGE,
136
+ prepare_images:bool=True,
137
  RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
138
  progress=gr.Progress(track_tqdm=True)):
139
  '''
 
142
  - file_paths (List[str]): A list of paths to the files to be redacted.
143
  - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
144
  - pdf_image_file_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
145
+
146
  - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
147
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
148
  - text_extraction_method (str): The method to use to extract text from documents.
 
187
  - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
188
  - all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
189
  - chosen_local_model (str): Which local model is being used for OCR on images - "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both.
190
+ - language (str, optional): The language of the text in the files, also used for AWS Comprehend calls. Defaults to DEFAULT_LANGUAGE.
192
+ - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
193
  - RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
194
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
195
 
 
204
  blank_request_metadata = []
205
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
206
  review_out_file_paths = [prepared_pdf_file_paths[0]]
207
+
208
+ # Use provided language or default
209
+ effective_language = language or DEFAULT_LANGUAGE
210
+
211
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
212
+ if effective_language not in textract_language_choices:
213
+ out_message = f"Language '{effective_language}' is not supported by AWS Textract. Please select a different language."
214
+ raise Warning(out_message)
215
+ elif pii_identification_method == AWS_PII_OPTION:
216
+ if effective_language not in aws_comprehend_language_choices:
217
+ out_message = f"Language '{effective_language}' is not supported by AWS Comprehend. Please select a different language."
218
+ raise Warning(out_message)
219
 
220
  if all_page_line_level_ocr_results_with_words_df is None:
221
  all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
 
465
  else:
466
  textract_client = ""
467
 
468
+ ### Language check - check if selected language packs exist
469
+ try:
470
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION and chosen_local_model == "tesseract":
471
+ progress(0.1, desc=f"Downloading Tesseract language pack for {effective_language}")
472
+ download_tesseract_lang_pack(effective_language)
473
+
474
+ progress(0.1, desc=f"Loading SpaCy model for {effective_language}")
475
+ load_spacy_model(effective_language)
476
+
477
+ except Exception as e:
478
+ print(f"Error downloading language packs for {effective_language}: {e}")
479
+ raise Exception(f"Error downloading language packs for {effective_language}: {e}")
480
+
481
  # Check if output_folder exists, create it if it doesn't
482
  if not os.path.exists(output_folder): os.makedirs(output_folder)
483
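download_tesseract_lang_pack and load_spacy_model come from tools.load_spacy_model_custom_recognisers and are not shown in this diff. A rough sketch of what the Tesseract step is assumed to do - fetch the missing traineddata file into the tessdata folder (the URL pattern, target folder, and the prior mapping of 'fr' to 'fra' are assumptions):

import os
import urllib.request

def download_tesseract_lang_pack_sketch(lang: str, tessdata_dir: str = "/usr/share/tessdata") -> str:
    # Assumption: 'fr' has already been mapped to the Tesseract code 'fra'
    target = os.path.join(tessdata_dir, f"{lang}.traineddata")
    if not os.path.exists(target):
        url = f"https://github.com/tesseract-ocr/tessdata/raw/main/{lang}.traineddata"
        urllib.request.urlretrieve(url, target)  # download only if missing
    return target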
 
 
537
 
538
  pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words = redact_image_pdf(file_path,
539
  pdf_image_file_paths,
540
+ effective_language,
541
  chosen_redact_entities,
542
  chosen_redact_comprehend_entities,
543
  in_allow_list_flat,
 
564
  text_extraction_only,
565
  all_page_line_level_ocr_results,
566
  all_page_line_level_ocr_results_with_words,
567
+ chosen_local_model,
568
  log_files_output_paths=log_files_output_paths,
569
  output_folder=output_folder)
570
 
 
586
 
587
  pymupdf_doc, all_pages_decision_process_table, all_page_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
588
  file_path,
589
+ effective_language,
590
  chosen_redact_entities,
591
  chosen_redact_comprehend_entities,
592
  in_allow_list_flat,
 
1378
  log_files_output_paths:List=list(),
1379
  max_time:int=int(MAX_TIME_VALUE),
1380
  output_folder:str=OUTPUT_FOLDER,
1381
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
1382
  progress=Progress(track_tqdm=True)):
1383
 
1384
  '''
 
1418
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
1419
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1420
  - output_folder (str, optional): The folder for file outputs.
1421
+ - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
1422
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
1423
 
1424
  The function returns a redacted PDF document along with processing output objects.
 
1428
 
1429
  file_name = get_file_name_without_type(file_path)
1430
  comprehend_query_number_new = 0
1431
+
1432
+ # Use the provided language or fall back to the default
1433
+ effective_language = language or DEFAULT_LANGUAGE
1434
+
1435
+ # Try updating the supported languages for the spacy analyser
1436
+ try:
1437
+ nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
1438
+ # Check list of nlp_analyser recognisers and languages
1439
+ if language != "en":
1440
+ gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
1441
+
1442
+ except Exception as e:
1443
+ print(f"Error creating nlp_analyser for {language}: {e}")
1444
+ raise Exception(f"Error creating nlp_analyser for {language}: {e}")
1445
 
1446
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1447
  if custom_recogniser_word_list:
 
1455
 
1456
  # Only load in PaddleOCR models if not running Textract
1457
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1458
+ image_analyser = CustomImageAnalyzerEngine(analyzer_engine=nlp_analyser, ocr_engine="tesseract", language=language)
1459
  else:
1460
+ image_analyser = CustomImageAnalyzerEngine(analyzer_engine=nlp_analyser, ocr_engine=chosen_local_model, language=language)
1461
 
1462
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1463
  out_message = "Connection to AWS Comprehend service unsuccessful."
 
1677
  chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
1678
  pii_identification_method = pii_identification_method,
1679
  comprehend_client=comprehend_client,
1680
+ language=effective_language,
1681
  entities=chosen_redact_entities,
1682
  allow_list=allow_list,
1683
  score_threshold=score_threshold
 
2197
  all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
2198
  pymupdf_doc: List = list(), # List of PyMuPDF documents
2199
  all_page_line_level_ocr_results_with_words: List = list(),
2200
+ pii_identification_method: str = "Local",
2201
  comprehend_query_number:int = 0,
2202
  comprehend_client="",
2203
  custom_recogniser_word_list:List[str]=list(),
 
2209
  text_extraction_only:bool=False,
2210
  output_folder:str=OUTPUT_FOLDER,
2211
  page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
2212
+ max_time: int = int(MAX_TIME_VALUE),
2213
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
2214
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
2215
+ ):
 
2216
  '''
2217
  Redact chosen entities from a PDF that is made up of multiple pages that are not images.
2218
 
 
2241
  - page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
2242
  - original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
2243
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
2244
+ - language (str, optional): The language of the text in the document; also used for AWS Comprehend calls. Defaults to DEFAULT_LANGUAGE.
2245
  - output_folder (str, optional): The output folder for the function
2246
  - page_break_val: Value for page break
2247
+ - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
2248
+ - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
2249
  - progress: Progress tracking object
2250
  '''
2251
 
2252
+ tic = time.perf_counter()
2253
 
2254
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2255
  all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
 
2262
  out_message = "Connection to AWS Comprehend service not found."
2263
  raise Exception(out_message)
2264
 
2265
+ # Use provided comprehend language or fall back to main language
2266
+ effective_language = language or language
2267
+
2268
+ # Try updating the supported languages for the spacy analyser
2269
+ try:
2270
+ nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
2271
+ # Check list of nlp_analyser recognisers and languages
2272
+ if language != "en":
2273
+ gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
2274
+
2275
+ except Exception as e:
2276
+ print(f"Error creating nlp_analyser for {language}: {e}")
2277
+ raise Exception(f"Error creating nlp_analyser for {language}: {e}")
2278
+
2279
  # Update custom word list analyser object with any new words that have been added to the custom deny list
2280
  if custom_recogniser_word_list:
2281
  nlp_analyser.registry.remove_recognizer("CUSTOM")
 
2286
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
2287
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
2288
 
2289
+
2290
+
2291
  # Open with Pikepdf to get text lines
2292
  pikepdf_pdf = Pdf.open(file_path)
2293
  number_of_pages = len(pikepdf_pdf.pages)
 
2383
 
2384
  if chosen_redact_entities or chosen_redact_comprehend_entities:
2385
  page_redaction_bounding_boxes = run_page_text_redaction(
2386
+ effective_language,
2387
  chosen_redact_entities,
2388
  chosen_redact_comprehend_entities,
2389
  all_page_line_level_text_extraction_results_list,
tools/find_duplicate_pages.py CHANGED
@@ -14,9 +14,7 @@ from pathlib import Path
14
  from typing import List
15
  from tools.helper_functions import OUTPUT_FOLDER
16
  from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
17
- import en_core_web_lg
18
-
19
- nlp = en_core_web_lg.load()
20
 
21
  similarity_threshold = 0.95
22
  number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
 
14
  from typing import List
15
  from tools.helper_functions import OUTPUT_FOLDER
16
  from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
17
+ from tools.load_spacy_model_custom_recognisers import nlp
 
 
18
 
19
  similarity_threshold = 0.95
20
  number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
tools/helper_functions.py CHANGED
@@ -9,7 +9,24 @@ import unicodedata
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
- from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION
13
 
14
  def reset_state_vars():
15
  return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
@@ -157,13 +174,7 @@ def ensure_output_folder_exists(output_folder:str):
157
  else:
158
  print(f"The {output_folder} folder already exists.")
159
 
160
- def _get_env_list(env_var_name: str) -> List[str]:
161
- """Parses a comma-separated environment variable into a list of strings."""
162
- value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
163
- if not value:
164
- return []
165
- # Split by comma and filter out any empty strings that might result from extra commas
166
- return [s.strip() for s in value.split(',') if s.strip()]
167
 
168
  def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
169
  '''
@@ -189,7 +200,7 @@ def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
189
  print(output_text)
190
  else:
191
  output_text = "No file provided."
192
- print(output_text)
193
  return output_text, custom_regex_df
194
 
195
  return output_text, custom_regex_df
@@ -590,4 +601,25 @@ def reset_ocr_with_words_base_dataframe(df:pd.DataFrame, page_entity_dropdown_re
590
 
591
  output_df_filtered = df.loc[df["page"]==str(page_entity_dropdown_redaction_value), ["page", "line", "word_text", "word_x0", "word_y0", "word_x1", "word_y1", "index"]]
592
  return output_df_filtered, output_df
593
 
 
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION, MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES, textract_language_choices, aws_comprehend_language_choices, DEFAULT_LANGUAGE
13
+ # from tools.load_spacy_model_custom_recognisers import nlp_analyser
14
+
15
+ def _get_env_list(env_var_name: str) -> List[str]:
16
+ """Parses the string value of a list-style environment variable (e.g. "['en', 'fr']") into a list of strings, stripping the enclosing brackets and any quotes."""
17
+ value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
18
+ if not value:
19
+ return []
20
+ # Split by comma and filter out any empty strings that might result from extra commas
21
+ return [s.strip() for s in value.split(',') if s.strip()]
22
+
23
+ if textract_language_choices: textract_language_choices = _get_env_list(textract_language_choices)
24
+ if aws_comprehend_language_choices: aws_comprehend_language_choices = _get_env_list(aws_comprehend_language_choices)
25
+
26
+ if MAPPED_LANGUAGE_CHOICES: MAPPED_LANGUAGE_CHOICES = _get_env_list(MAPPED_LANGUAGE_CHOICES)
27
+ if LANGUAGE_CHOICES: LANGUAGE_CHOICES = _get_env_list(LANGUAGE_CHOICES)
28
+
29
+ LANGUAGE_MAP = dict(zip(MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES))
30
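A minimal sketch of how these list-style config values are expected to parse; the example strings below are illustrative rather than the real values in tools.config, and the full-name keys are assumed to be lowercase because update_language_dropdown lower-cases the selected name before the lookup.

# Illustrative values only, not the actual tools.config settings
example_mapped = "['english', 'french', 'german']"
example_codes = "['en', 'fr', 'de']"

mapped = _get_env_list(example_mapped)   # ['english', 'french', 'german']
codes = _get_env_list(example_codes)     # ['en', 'fr', 'de']
language_map = dict(zip(mapped, codes))  # {'english': 'en', 'french': 'fr', 'german': 'de'}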
 
31
  def reset_state_vars():
32
  return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
 
174
  else:
175
  print(f"The {output_folder} folder already exists.")
176
 
177
+
178
 
179
  def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
180
  '''
 
200
  print(output_text)
201
  else:
202
  output_text = "No file provided."
203
+ #print(output_text)
204
  return output_text, custom_regex_df
205
 
206
  return output_text, custom_regex_df
 
601
 
602
  output_df_filtered = df.loc[df["page"]==str(page_entity_dropdown_redaction_value), ["page", "line", "word_text", "word_x0", "word_y0", "word_x1", "word_y1", "index"]]
603
  return output_df_filtered, output_df
604
+
605
+ def update_language_dropdown(chosen_language_full_name_drop, textract_language_choices=textract_language_choices, aws_comprehend_language_choices=aws_comprehend_language_choices, LANGUAGE_MAP=LANGUAGE_MAP):
606
+
607
+ try:
608
+ full_language_name = chosen_language_full_name_drop.lower()
609
+ matched_language = LANGUAGE_MAP[full_language_name]
610
+
611
+ chosen_language_drop = gr.Dropdown(value = matched_language, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False, visible=True)
612
+
613
+ if matched_language not in aws_comprehend_language_choices and matched_language not in textract_language_choices:
614
+ gr.Info(f"Note that {full_language_name} is not supported by AWS Comprehend or AWS Textract")
615
+ elif matched_language not in aws_comprehend_language_choices:
616
+ gr.Info(f"Note that {full_language_name} is not supported by AWS Comprehend")
617
+ elif matched_language not in textract_language_choices:
618
+ gr.Info(f"Note that {full_language_name} is not supported by AWS Textract")
619
+ except Exception as e:
620
+ print(e)
621
+ gr.Info("Could not find language in list")
622
+ chosen_language_drop = gr.Dropdown(value = DEFAULT_LANGUAGE, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False)
623
+
624
+ return chosen_language_drop
625
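A rough sketch of how update_language_dropdown could be wired into the Gradio UI; the component names below are hypothetical and not the app's actual variables.

# Hypothetical Gradio components, for illustration only
chosen_language_full_name_drop = gr.Dropdown(choices=MAPPED_LANGUAGE_CHOICES, label="Language")
chosen_language_drop = gr.Dropdown(choices=LANGUAGE_CHOICES, label="Chosen language short code")

# When the full language name changes, refresh the short-code dropdown and surface any AWS support warnings
chosen_language_full_name_drop.change(
    fn=update_language_dropdown,
    inputs=chosen_language_full_name_drop,
    outputs=chosen_language_drop,
)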
 
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -1,48 +1,255 @@
1
  from typing import List
2
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
3
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts, NerModelConfiguration
4
-
5
  import spacy
6
- from spacy.matcher import Matcher, PhraseMatcher
7
  from spaczz.matcher import FuzzyMatcher
8
  spacy.prefer_gpu()
9
  from spacy.cli.download import download
10
  import Levenshtein
11
  import re
 
 
12
  import gradio as gr
 
13
 
14
- model_name = "en_core_web_lg" #"en_core_web_sm" #"en_core_web_trf"
15
  score_threshold = 0.001
16
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
17
 
18
- #Load spacy model
19
- try:
20
- import en_core_web_lg #en_core_web_sm
21
- nlp = en_core_web_lg.load() #en_core_web_sm.load()
22
- print("Successfully imported spaCy model")
23
-
24
- except:
25
- download(model_name)
26
- nlp = spacy.load(model_name)
27
- print("Successfully downloaded and imported spaCy model", model_name)
28
-
29
  # Create a class inheriting from SpacyNlpEngine
30
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
31
- def __init__(self, loaded_spacy_model):
32
  super().__init__(ner_model_configuration=NerModelConfiguration(labels_to_ignore=["CARDINAL", "ORDINAL"])) # Ignore non-relevant labels
33
- self.nlp = {"en": loaded_spacy_model}
34
 
35
- # Pass the loaded model to the new LoadedSpacyNlpEngine
36
- loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
 
 
37
 
 
 
38
 
39
- nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
40
- default_score_threshold=score_threshold,
41
- supported_languages=["en"],
42
- log_decision_process=False,
43
- ) # New custom recognisers based on the following functions are added at the end of this script
44
 
45
- # #### Custom recognisers
46
  def custom_word_list_recogniser(custom_list:List[str]=[]):
47
  # Create regex pattern, handling quotes carefully
48
 
@@ -297,7 +504,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
297
 
298
  return all_start_positions, all_end_positions
299
 
300
-
301
  class CustomWordFuzzyRecognizer(EntityRecognizer):
302
  def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
303
  super().__init__(supported_entities=supported_entities)
@@ -332,10 +538,79 @@ custom_list_default = []
332
  custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
333
 
334
 
335
- # Add custom recognisers to nlp_analyser
336
- nlp_analyser.registry.add_recognizer(street_recogniser)
337
- nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
338
- nlp_analyser.registry.add_recognizer(titles_recogniser)
339
- nlp_analyser.registry.add_recognizer(custom_recogniser)
340
- nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
341
 
 
1
  from typing import List
2
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
3
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts, NerModelConfiguration
 
4
  import spacy
5
+ from spacy.matcher import Matcher
6
  from spaczz.matcher import FuzzyMatcher
7
  spacy.prefer_gpu()
8
  from spacy.cli.download import download
9
  import Levenshtein
10
  import re
11
+ import os
12
+ import requests
13
  import gradio as gr
14
+ from tools.config import DEFAULT_LANGUAGE, TESSERACT_FOLDER
15
 
 
16
  score_threshold = 0.001
17
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
18
 
19
  # Create a class inheriting from SpacyNlpEngine
20
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
21
+ def __init__(self, loaded_spacy_model, language_code: str):
22
  super().__init__(ner_model_configuration=NerModelConfiguration(labels_to_ignore=["CARDINAL", "ORDINAL"])) # Ignore non-relevant labels
23
+ self.nlp = {language_code: loaded_spacy_model}
24
 
25
+ def _base_language_code(language: str) -> str:
26
+ lang = _normalize_language_input(language)
27
+ if "_" in lang:
28
+ return lang.split("_")[0]
29
+ return lang
30
+
31
+ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
32
+ """
33
+ Load a spaCy model for the requested language and return it as `nlp`.
34
+
35
+ Accepts common inputs like: "en", "en_lg", "en_sm", "de", "fr", "es", "it", "nl", "pt", "zh", "ja", "xx".
36
+ Falls back through sensible candidates and will download if missing.
37
+ """
38
+
39
+ synonyms = {
40
+ "english": "en",
41
+ "catalan": "ca",
42
+ "danish": "da",
43
+ "german": "de",
44
+ "french": "fr",
45
+ "greek": "el",
46
+ "finnish": "fi",
47
+ "croatian": "hr",
48
+ "lithuanian": "lt",
49
+ "macedonian": "mk",
50
+ "norwegian_bokmaal": "nb",
51
+ "polish": "pl",
52
+ "russian": "ru",
53
+ "slovenian": "sl",
54
+ "swedish": "sv",
55
+ "dutch": "nl",
56
+ "portuguese": "pt",
57
+ "chinese": "zh",
58
+ "japanese": "ja",
59
+ "multilingual": "xx",
60
+ }
61
+
62
+ lang_norm = _normalize_language_input(language)
63
+ lang_norm = synonyms.get(lang_norm, lang_norm)
64
+ base_lang = _base_language_code(lang_norm)
65
+
66
+ candidates_by_lang = {
67
+ # English
68
+ "en": [
69
+ "en_core_web_lg",
70
+ "en_core_web_trf",
71
+ "en_core_web_md",
72
+ "en_core_web_sm",
73
+ ],
74
+ "en_lg": ["en_core_web_lg"],
75
+ "en_trf": ["en_core_web_trf"],
76
+ "en_md": ["en_core_web_md"],
77
+ "en_sm": ["en_core_web_sm"],
78
+
79
+ # Major languages (news pipelines)
80
+ "ca": ["ca_core_news_lg", "ca_core_news_md", "ca_core_news_sm"], # Catalan
81
+ "da": ["da_core_news_lg", "da_core_news_md", "da_core_news_sm"], # Danish
82
+ "de": ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"], # German
83
+ "el": ["el_core_news_lg", "el_core_news_md", "el_core_news_sm"], # Greek
84
+ "es": ["es_core_news_lg", "es_core_news_md", "es_core_news_sm"], # Spanish
85
+ "fi": ["fi_core_news_lg", "fi_core_news_md", "fi_core_news_sm"], # Finnish
86
+ "fr": ["fr_core_news_lg", "fr_core_news_md", "fr_core_news_sm"], # French
87
+ "hr": ["hr_core_news_lg", "hr_core_news_md", "hr_core_news_sm"], # Croatian
88
+ "it": ["it_core_news_lg", "it_core_news_md", "it_core_news_sm"], # Italian
89
+ "ja": ["ja_core_news_lg", "ja_core_news_md", "ja_core_news_sm"], # Japanese
90
+ "ko": ["ko_core_news_lg", "ko_core_news_md", "ko_core_news_sm"], # Korean
91
+ "lt": ["lt_core_news_lg", "lt_core_news_md", "lt_core_news_sm"], # Lithuanian
92
+ "mk": ["mk_core_news_lg", "mk_core_news_md", "mk_core_news_sm"], # Macedonian
93
+ "nb": ["nb_core_news_lg", "nb_core_news_md", "nb_core_news_sm"], # Norwegian Bokmål
94
+ "nl": ["nl_core_news_lg", "nl_core_news_md", "nl_core_news_sm"], # Dutch
95
+ "pl": ["pl_core_news_lg", "pl_core_news_md", "pl_core_news_sm"], # Polish
96
+ "pt": ["pt_core_news_lg", "pt_core_news_md", "pt_core_news_sm"], # Portuguese
97
+ "ro": ["ro_core_news_lg", "ro_core_news_md", "ro_core_news_sm"], # Romanian
98
+ "ru": ["ru_core_news_lg", "ru_core_news_md", "ru_core_news_sm"], # Russian
99
+ "sl": ["sl_core_news_lg", "sl_core_news_md", "sl_core_news_sm"], # Slovenian
100
+ "sv": ["sv_core_news_lg", "sv_core_news_md", "sv_core_news_sm"], # Swedish
101
+ "uk": ["uk_core_news_lg", "uk_core_news_md", "uk_core_news_sm"], # Ukrainian
102
+ "zh": ["zh_core_web_lg", "zh_core_web_md", "zh_core_web_sm", "zh_core_web_trf"], # Chinese
103
+
104
+ # Multilingual NER
105
+ "xx": ["xx_ent_wiki_sm"],
106
+ }
107
+
108
+ if lang_norm in candidates_by_lang:
109
+ candidates = candidates_by_lang[lang_norm]
110
+ elif base_lang in candidates_by_lang:
111
+ candidates = candidates_by_lang[base_lang]
112
+ else:
113
+ # Fallback to multilingual if unknown
114
+ candidates = candidates_by_lang["xx"]
115
+
116
+ last_error = None
117
+ for candidate in candidates:
118
+ # Try importable package first (fast-path when installed as a package)
119
+ try:
120
+ module = __import__(candidate)
121
+ print(f"Successfully imported spaCy model: {candidate}")
122
+ return module.load()
123
+ except Exception as e:
124
+ last_error = e
125
+
126
+ # Try spacy.load if package is linked/installed
127
+ try:
128
+ nlp = spacy.load(candidate)
129
+ print(f"Successfully loaded spaCy model via spacy.load: {candidate}")
130
+ return nlp
131
+ except Exception as e:
132
+ last_error = e
133
+
134
+ # Check if model is already downloaded before attempting to download
135
+ try:
136
+ # Try to load the model to see if it's already available
137
+ nlp = spacy.load(candidate)
138
+ print(f"Model {candidate} is already available, skipping download")
139
+ return nlp
140
+ except OSError:
141
+ # Model not found, proceed with download
142
+ pass
143
+ except Exception as e:
144
+ last_error = e
145
+ continue
146
+
147
+ # Attempt to download then load
148
+ try:
149
+ print(f"Downloading spaCy model: {candidate}")
150
+ download(candidate)
151
+ nlp = spacy.load(candidate)
152
+ print(f"Successfully downloaded and loaded spaCy model: {candidate}")
153
+ return nlp
154
+ except Exception as e:
155
+ last_error = e
156
+ continue
157
+
158
+ raise RuntimeError(f"Failed to load spaCy model for language '{language}'. Last error: {last_error}")
159
+
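A short usage sketch for load_spacy_model; whether a download happens depends on which pipelines are already installed in the environment.

# Load the best available German pipeline (de_core_news_lg, then _md, then _sm), downloading if missing
nlp_de = load_spacy_model("de")
doc = nlp_de("Angela Merkel besuchte Berlin im Mai.")
print([(ent.text, ent.label_) for ent in doc.ents])

# Unrecognised codes fall back to the multilingual NER pipeline xx_ent_wiki_sm
nlp_fallback = load_spacy_model("sw")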
160
+ # Helper for normalising language inputs (used by the language-aware loader above)
161
+ def _normalize_language_input(language: str) -> str:
162
+ return language.strip().lower().replace("-", "_")
163
+
164
+ # Module-level defaults; the spaCy model itself is loaded lazily in create_nlp_analyser below
165
+ ACTIVE_LANGUAGE_CODE = _base_language_code(DEFAULT_LANGUAGE)
166
+ nlp = None # Placeholder, will be loaded in the create_nlp_analyser function below #load_spacy_model(DEFAULT_LANGUAGE)
167
+
168
+ def get_tesseract_lang_code(short_code:str):
169
+ """
170
+ Maps a two-letter language code to the corresponding Tesseract OCR code.
171
+
172
+ Args:
173
+ short_code (str): The two-letter language code (e.g., "en", "de").
174
+
175
+ Returns:
176
+ str or None: The Tesseract language code (e.g., "eng", "deu"),
177
+ or None if no mapping is found.
178
+ """
179
+ # Mapping from 2-letter codes to Tesseract 3-letter codes
180
+ # Based on ISO 639-2/T codes.
181
+ lang_map = {
182
+ "en": "eng",
183
+ "de": "deu",
184
+ "fr": "fra",
185
+ "es": "spa",
186
+ "it": "ita",
187
+ "nl": "nld",
188
+ "pt": "por",
189
+ "zh": "chi_sim", # Mapping to Simplified Chinese by default
190
+ "ja": "jpn",
191
+ "ko": "kor",
192
+ "lt": "lit",
193
+ "mk": "mkd",
194
+ "nb": "nor",
195
+ "pl": "pol",
196
+ "ro": "ron",
197
+ "ru": "rus",
198
+ "sl": "slv",
199
+ "sv": "swe",
200
+ "uk": "ukr"
201
+ }
202
+
203
+ return lang_map.get(short_code)
204
+
205
+ def download_tesseract_lang_pack(short_lang_code:str, tessdata_dir=TESSERACT_FOLDER + "/tessdata"):
206
+ """
207
+ Downloads a Tesseract language pack to a local directory.
208
+
209
+ Args:
210
+ short_lang_code (str): The two-letter language code (e.g., "en", "fr"), which is mapped to the corresponding Tesseract code (e.g., "eng", "fra").
211
+ tessdata_dir (str, optional): The directory to save the language pack.
212
+ Defaults to a "tessdata" subfolder of TESSERACT_FOLDER.
213
+ """
214
+
215
+ # Create the directory if it doesn't exist
216
+ if not os.path.exists(tessdata_dir):
217
+ os.makedirs(tessdata_dir)
218
+
219
+ # Get the Tesseract language code
220
+ lang_code = get_tesseract_lang_code(short_lang_code)
221
+
222
+ if lang_code is None:
223
+ raise ValueError(f"Language code {short_lang_code} not found in Tesseract language map")
224
+
225
+ # Set the local file path
226
+ file_path = os.path.join(tessdata_dir, f"{lang_code}.traineddata")
227
+
228
+ # Check if the file already exists
229
+ if os.path.exists(file_path):
230
+ print(f"Language pack {lang_code}.traineddata already exists at {file_path}")
231
+ return file_path
232
+
233
+ # Construct the URL for the language pack
234
+ url = f"https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/{lang_code}.traineddata"
235
+
236
+ # Download the file
237
+ try:
238
+ response = requests.get(url, stream=True)
239
+ response.raise_for_status() # Raise an exception for bad status codes
240
+
241
+ with open(file_path, "wb") as f:
242
+ for chunk in response.iter_content(chunk_size=8192):
243
+ f.write(chunk)
244
 
245
+ print(f"Successfully downloaded {lang_code}.traineddata to {file_path}")
246
+ return file_path
247
 
248
+ except requests.exceptions.RequestException as e:
249
+ print(f"Error downloading {lang_code}.traineddata: {e}")
250
+ return None
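As a usage sketch, a caller might download a pack once and point Tesseract at the containing directory via TESSDATA_PREFIX; how the returned path is actually consumed elsewhere in the app is an assumption here.

# Download the French pack (fra.traineddata) if it is not already present
traineddata_path = download_tesseract_lang_pack("fr")
if traineddata_path:
    # Assumed consumption pattern: tell Tesseract where the trained data lives
    os.environ["TESSDATA_PREFIX"] = os.path.dirname(traineddata_path)
    # e.g. pytesseract.image_to_string(image, lang="fra")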
 
 
251
 
252
+ #### Custom recognisers
253
  def custom_word_list_recogniser(custom_list:List[str]=[]):
254
  # Create regex pattern, handling quotes carefully
255
 
 
504
 
505
  return all_start_positions, all_end_positions
506
 
 
507
  class CustomWordFuzzyRecognizer(EntityRecognizer):
508
  def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
509
  super().__init__(supported_entities=supported_entities)
 
538
  custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
539
 
540
 
541
+ # Placeholder engine: `nlp` is still None at this point; per-language engines are built in create_nlp_analyser below
542
+ loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp, language_code = ACTIVE_LANGUAGE_CODE)
543
+
544
+
545
+ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str] = None,
546
+ spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None):
547
+ """
548
+ Create an nlp_analyser object based on the specified language input.
549
+
550
+ Args:
551
+ language (str): Language code (e.g., "en", "de", "fr", "es", etc.)
552
+ custom_list (List[str], optional): List of custom words to recognize. Defaults to None.
553
+ spelling_mistakes_max (int, optional): Maximum number of spelling mistakes for fuzzy matching. Defaults to 1.
554
+ search_whole_phrase (bool, optional): Whether to search for whole phrases or individual words. Defaults to True.
+ existing_nlp_analyser (AnalyzerEngine, optional): An already-configured analyser to reuse if it matches the requested language. Defaults to None.
555
+
556
+ Returns:
557
+ AnalyzerEngine: Configured nlp_analyser object with custom recognizers
558
+ """
559
+ print("existing_nlp_analyser:", existing_nlp_analyser)
+
+ # Reuse an existing analyser if it already supports the requested language
+ if existing_nlp_analyser is not None and existing_nlp_analyser.supported_languages[0] == language:
+ print(f"Using existing nlp_analyser for {language}")
+ return existing_nlp_analyser
568
+
569
+ # Load spaCy model for the specified language
570
+ nlp_model = load_spacy_model(language)
571
+
572
+ # Get base language code
573
+ base_lang_code = _base_language_code(language)
574
+
575
+ # Create custom recognizers
576
+ if custom_list is None:
577
+ custom_list = []
578
+
579
+ custom_recogniser = custom_word_list_recogniser(custom_list)
580
+ custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(
581
+ supported_entities=["CUSTOM_FUZZY"],
582
+ custom_list=custom_list,
583
+ spelling_mistakes_max=spelling_mistakes_max,
584
+ search_whole_phrase=search_whole_phrase
585
+ )
586
+
587
+ # Create NLP engine with loaded model
588
+ loaded_nlp_engine = LoadedSpacyNlpEngine(
589
+ loaded_spacy_model=nlp_model,
590
+ language_code=base_lang_code
591
+ )
592
+
593
+ # Create analyzer engine
594
+ nlp_analyser = AnalyzerEngine(
595
+ nlp_engine=loaded_nlp_engine,
596
+ default_score_threshold=score_threshold,
597
+ supported_languages=[base_lang_code],
598
+ log_decision_process=False,
599
+ )
600
+
601
+ # Add custom recognizers to nlp_analyser
602
+ nlp_analyser.registry.add_recognizer(custom_recogniser)
603
+ nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
604
+
605
+ # Add language-specific recognizers for English
606
+ if base_lang_code == "en":
607
+ nlp_analyser.registry.add_recognizer(street_recogniser)
608
+ nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
609
+ nlp_analyser.registry.add_recognizer(titles_recogniser)
610
+
611
+ return nlp_analyser
612
+
613
+ # Create the default nlp_analyser using the new function
614
+ nlp_analyser = create_nlp_analyser(DEFAULT_LANGUAGE)
615
+
616
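As a rough usage sketch of the new factory function (the text and custom phrase are made up, and entity availability depends on the loaded spaCy model and the recognisers registered for the language):

# Build a German analyser with a small custom deny list
analyser_de = create_nlp_analyser("de", custom_list=["Projekt Falke"])

results = analyser_de.analyze(
    text="Max Mustermann arbeitet am Projekt Falke in Berlin.",
    language="de",
    entities=["PERSON", "LOCATION", "CUSTOM"],
)
for res in results:
    print(res.entity_type, res.start, res.end, res.score)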