seanpedrickcase committed 9ae09da (1 parent: 003292d)

Added support for other languages. Improved DynamoDB download

Dockerfile CHANGED
@@ -54,7 +54,9 @@ ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
    ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
    USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
    CONFIG_FOLDER=$APP_HOME/app/config/ \
-   XDG_CACHE_HOME=/tmp/xdg_cache/user_1000
+   XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
+   TESSERACT_FOLDER=/usr/bin/tesseract \
+   TESSERACT_DATA_FOLDER=/usr/share/tessdata

# Create the base application directory and set its ownership
RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app

@@ -83,17 +85,23 @@ RUN mkdir -p \
    ${APP_HOME}/app/feedback \
    ${APP_HOME}/app/config

- # Now handle the /tmp and /var/tmp directories and their subdirectories
+ # Now handle the /tmp and /var/tmp directories and their subdirectories, paddle, spacy, tessdata
RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
    && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
    && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
-   && chmod 700 ${XDG_CACHE_HOME}
-
- RUN mkdir -p ${APP_HOME}/.paddlex/official_models \
+   && chmod 700 ${XDG_CACHE_HOME} \
+   && mkdir -p ${APP_HOME}/.paddlex/official_models \
    && chown user:user \
    ${APP_HOME}/.paddlex/official_models \
    && chmod 755 \
    ${APP_HOME}/.paddlex/official_models
+   && mkdir -p ${APP_HOME}/.local/share/spacy/data \
+   && chown user:user \
+   ${APP_HOME}/.local/share/spacy/data \
+   && chmod 755 \
+   ${APP_HOME}/.local/share/spacy/data \
+   mkdir -p /usr/share/tessdata && \
+   chmod 755 /usr/share/tessdata # Create tessdata directory and set permissions

# Copy installed packages from builder stage
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/

@@ -122,6 +130,8 @@ VOLUME ["/home/user/app/usage"]
VOLUME ["/home/user/app/feedback"]
VOLUME ["/home/user/app/config"]
VOLUME ["/home/user/.paddlex/official_models"]
+ VOLUME ["/home/user/.local/share/spacy/data"]
+ VOLUME ["/usr/share/tessdata"]
VOLUME ["/tmp"]
VOLUME ["/var/tmp"]

@@ -134,7 +144,8 @@ ENV PATH=$APP_HOME/.local/bin:$PATH \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860 \
-   GRADIO_ANALYTICS_ENABLED=False
+   GRADIO_ANALYTICS_ENABLED=False \
+

ENTRYPOINT ["/entrypoint.sh"]
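The new TESSERACT_FOLDER and TESSERACT_DATA_FOLDER variables tell the app where the Tesseract binary and its language packs live inside the container. A minimal sketch of how a process might consume them with pytesseract (illustrative only; the repository's actual wiring is in tools/config.py and is not reproduced here, and the helper name below is hypothetical):

import os
import pytesseract

TESSERACT_FOLDER = os.environ.get("TESSERACT_FOLDER", "/usr/bin/tesseract")
TESSERACT_DATA_FOLDER = os.environ.get("TESSERACT_DATA_FOLDER", "/usr/share/tessdata")

def configure_tesseract(lang: str = "fra") -> None:
    """Point pytesseract at the packaged binary and language data."""
    # pytesseract expects the path to the tesseract executable itself
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_FOLDER
    # Tesseract looks for <lang>.traineddata under TESSDATA_PREFIX
    os.environ["TESSDATA_PREFIX"] = TESSERACT_DATA_FOLDER
    traineddata = os.path.join(TESSERACT_DATA_FOLDER, f"{lang}.traineddata")
    if not os.path.exists(traineddata):
        # Tesseract trained-data files use ISO 639-2/3 codes such as eng, fra, deu
        print(f"Warning: {traineddata} not found; OCR for '{lang}' will fail until it is installed.")

Mounting /usr/share/tessdata as a VOLUME (added above) lets downloaded language packs persist across container restarts.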
 
app.py CHANGED
@@ -2,15 +2,15 @@ import os
import pandas as pd
import gradio as gr
from gradio_image_annotation import image_annotator
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL
- from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_LANGUAGE_FULL_NAME, SHOW_LANGUAGE_SELECTION
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
from tools.file_redaction import choose_and_run_redactor
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top, store_duplicate_selection, create_annotation_objects_from_filtered_ocr_results_with_words, df_select_callback_dataframe_row_ocr_with_words, update_redact_choice_df_from_page_dropdown, get_all_rows_with_same_text_redact
from tools.data_anonymise import anonymise_files_with_open_text
from tools.auth import authenticate_user
- from tools.load_spacy_model_custom_recognisers import custom_entities
+ from tools.load_spacy_model_custom_recognisers import custom_entities, load_spacy_model, download_tesseract_lang_pack
from tools.custom_csvlogger import CSVLogger_custom
from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates, run_full_search_and_analysis
from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs

@@ -33,6 +33,8 @@ if SAVE_LOGS_TO_CSV == "True": SAVE_LOGS_TO_CSV = True
else: SAVE_LOGS_TO_CSV = False
if SAVE_LOGS_TO_DYNAMODB == "True": SAVE_LOGS_TO_DYNAMODB = True
else: SAVE_LOGS_TO_DYNAMODB = False
+ if SHOW_LANGUAGE_SELECTION == "True": SHOW_LANGUAGE_SELECTION = True
+ else: SHOW_LANGUAGE_SELECTION = False

if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)

@@ -244,6 +246,10 @@ with app:
    ## Duplicate search object
    new_duplicate_search_annotation_object = gr.Dropdown(value=None, label="new_duplicate_search_annotation_object", allow_custom_value=True, visible=False)

+   # Spacy analyser state
+   updated_nlp_analyser_state = gr.State([])
+   tesseract_lang_data_file_path = gr.Textbox("", visible=False)
+
    ###
    # UI DESIGN
    ###

@@ -588,10 +594,18 @@ with app:
    page_min = gr.Number(value=0, precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
    page_max = gr.Number(value=0, precision=0,minimum=0,maximum=9999, label="Highest page to redact")

-   with gr.Accordion("AWS options", open = False):
-       #with gr.Row():
-       in_redact_language = gr.Dropdown(value = REDACTION_LANGUAGE, choices = [REDACTION_LANGUAGE], label="Redaction language", multiselect=False, visible=False)
+   if SHOW_LANGUAGE_SELECTION:
+       with gr.Accordion("Language selection", open=False):
+           gr.Markdown("""Note that AWS Textract is only compatible with English, Spanish, Italian, Portuguese, French, and German, and handwriting detection is only available in English. AWS Comprehend is additionally compatible with Arabic, Hindi, Japanese, Korean, Chinese, and Chinese (Traditional).
+           The local models (Tesseract and SpaCy) are compatible with the other languages in the list below. However, the language packs for these models need to be installed on your system. When you first run a document through the app, the language packs will be downloaded automatically, but please expect a delay as the models are large.""")
+           with gr.Row():
+               chosen_language_full_name_drop = gr.Dropdown(value = DEFAULT_LANGUAGE_FULL_NAME, choices = MAPPED_LANGUAGE_CHOICES, label="Chosen language", multiselect=False, visible=True)
+               chosen_language_drop = gr.Dropdown(value = DEFAULT_LANGUAGE, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False, visible=True)
+   else:
+       chosen_language_full_name_drop = gr.Dropdown(value = DEFAULT_LANGUAGE_FULL_NAME, choices = MAPPED_LANGUAGE_CHOICES, label="Chosen language", multiselect=False, visible=False)
+       chosen_language_drop = gr.Dropdown(value = DEFAULT_LANGUAGE, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False, visible=False)

+   with gr.Accordion("Use API keys for AWS services", open = False):
        with gr.Row():
            aws_access_key_textbox = gr.Textbox(value='', label="AWS access key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")
            aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")

@@ -651,15 +665,11 @@ with app:
    # Run redaction function
    document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
        success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
-       success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
+       success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
        outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], api_name="redact_doc")
-
-   # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
-   # current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
-   # outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state])

    # If a file has been completed, the function will continue onto the next document
-   latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
+   latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
        outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
        success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
        success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\

@@ -689,7 +699,7 @@ with app:
        success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
        success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
        success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
-       success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
+       success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
        outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
        success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])

@@ -889,13 +899,11 @@ with app:
        success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list, total_pdf_page_count])

    tabular_data_redact_btn.click(reset_data_vars, outputs=[actual_time_taken_number, log_files_output_list_state, comprehend_query_number]).\
-       success(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number], api_name="redact_data").\
-       success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
+       success(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number, chosen_language_drop], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number], api_name="redact_data")

-   # Currently only supports redacting one data file at a time, following code block not used
    # If the output file count text box changes, keep going with redacting each data file until done
-   # text_tabular_files_done.change(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number]).\
-   # success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
+   text_tabular_files_done.change(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames, chosen_language_drop, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number, chosen_language_drop], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number]).\
+       success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])

    ###
    # IDENTIFY DUPLICATE PAGES

@@ -966,7 +974,12 @@ with app:
    merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)

    #
-   all_output_files_btn.click(fn=load_all_output_files, inputs=output_folder_textbox, outputs=all_output_files)
+   all_output_files_btn.click(fn=load_all_output_files, inputs=output_folder_textbox, outputs=all_output_files)
+
+   # Language selection dropdown
+   chosen_language_full_name_drop.select(update_language_dropdown, inputs=[chosen_language_full_name_drop], outputs=[chosen_language_drop])#.\
+   #success(download_tesseract_lang_pack, inputs=[chosen_language_drop], outputs = [tesseract_lang_data_file_path]).\
+   #success(load_spacy_model, inputs=[chosen_language_drop], outputs=[updated_nlp_analyser_state])

    ###
    # APP LOAD AND LOGGING

@@ -1082,17 +1095,4 @@ if __name__ == "__main__":

    main(first_loop_state, latest_file_completed=0, redaction_output_summary_textbox="", output_file_list=None,
        log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
-       current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_page_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),CHOSEN_COMPREHEND_ENTITIES = CHOSEN_COMPREHEND_ENTITIES, CHOSEN_REDACT_ENTITIES = CHOSEN_REDACT_ENTITIES, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
-
-   # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
-   # with gr.Tab(label="Advanced options"):
-   # with gr.Accordion(label = "AWS data access", open = True):
-   # aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
-   # with gr.Row():
-   # in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
-   # load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
-
-   # aws_log_box = gr.Textbox(label="AWS data load status")
-
-   # ### Loading AWS data ###
-   # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_doc_files, aws_log_box])
+       current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_page_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),CHOSEN_COMPREHEND_ENTITIES = CHOSEN_COMPREHEND_ENTITIES, CHOSEN_REDACT_ENTITIES = CHOSEN_REDACT_ENTITIES, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
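The new language UI wires a full-name dropdown ("french") to a short-code dropdown ("fr") via update_language_dropdown, and the short code is what choose_and_run_redactor and anonymise_files_with_open_text now receive. A hypothetical sketch of that mapping, assuming the MAPPED_LANGUAGE_CHOICES / LANGUAGE_CHOICES pairing from tools/config.py; the real helper lives in tools/helper_functions.py and is not shown in this diff:

MAPPED_LANGUAGE_CHOICES = ['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean',
                           'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']
LANGUAGE_CHOICES = ['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']
FULL_NAME_TO_CODE = dict(zip(MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES))

def update_language_dropdown(chosen_full_name: str) -> str:
    # Map the user-facing name (e.g. "french") to the short code (e.g. "fr").
    # Returning the plain string updates the value of the short-code dropdown output.
    return FULL_NAME_TO_CODE.get(chosen_full_name, 'en')

The commented-out .success() calls in the diff suggest that downloading the Tesseract language pack and loading the matching SpaCy model are intended to happen later in the pipeline rather than on dropdown selection.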
load_dynamo_logs.py CHANGED
@@ -1,6 +1,7 @@
import boto3
import csv
from decimal import Decimal
+ import datetime
from boto3.dynamodb.conditions import Key

from tools.config import AWS_REGION, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, OUTPUT_FOLDER

@@ -16,11 +17,26 @@ table = dynamodb.Table(TABLE_NAME)

# Helper function to convert Decimal to float or int
def convert_types(item):
+   new_item = {}
    for key, value in item.items():
+       # Handle Decimals first
        if isinstance(value, Decimal):
-           # Convert to int if no decimal places, else float
-           item[key] = int(value) if value % 1 == 0 else float(value)
-   return item
+           new_item[key] = int(value) if value % 1 == 0 else float(value)
+       # Handle Strings that might be dates
+       elif isinstance(value, str):
+           try:
+               # Attempt to parse a common ISO 8601 format.
+               # The .replace() handles the 'Z' for Zulu/UTC time.
+               dt_obj = datetime.datetime.fromisoformat(value.replace('Z', '+00:00'))
+               # Now that we have a datetime object, format it as desired
+               new_item[key] = dt_obj.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
+           except (ValueError, TypeError):
+               # If it fails to parse, it's just a regular string
+               new_item[key] = value
+       # Handle all other types
+       else:
+           new_item[key] = value
+   return new_item

# Paginated scan
def scan_table():

@@ -35,22 +51,43 @@ def scan_table():
    return items

# Export to CSV
- def export_to_csv(items, output_path):
+ # Export to CSV
+ def export_to_csv(items, output_path, fields_to_drop: list = None):
    if not items:
        print("No items found.")
        return

-   fieldnames = sorted(items[0].keys())
+   # Use a set for efficient lookup
+   drop_set = set(fields_to_drop or [])
+
+   # Get a comprehensive list of all possible headers from all items
+   all_keys = set()
+   for item in items:
+       all_keys.update(item.keys())
+
+   # Determine the final fieldnames by subtracting the ones to drop
+   fieldnames = sorted(list(all_keys - drop_set))
+
+   print("Final CSV columns will be:", fieldnames)

-   with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
-       writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+   with open(output_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
+       # The key fix is here: extrasaction='ignore'
+       # restval='' is also good practice to handle rows that are missing a key
+       writer = csv.DictWriter(
+           csvfile,
+           fieldnames=fieldnames,
+           extrasaction='ignore',
+           restval=''
+       )
        writer.writeheader()

        for item in items:
+           # The convert_types function can now return the full dict,
+           # and the writer will simply ignore the extra fields.
            writer.writerow(convert_types(item))

    print(f"Exported {len(items)} items to {output_path}")

# Run export
items = scan_table()
- export_to_csv(items, CSV_OUTPUT)
+ export_to_csv(items, CSV_OUTPUT, fields_to_drop=[])
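The script relies on a paginated scan_table() whose body is unchanged by this commit and therefore not shown above. For context, a typical boto3 pagination loop for such a helper looks like the following sketch (illustrative only, not the repository's exact code; the region and table name are placeholders):

import boto3

dynamodb = boto3.resource("dynamodb", region_name="eu-west-2")  # example region
table = dynamodb.Table("redaction_usage_logs")                  # example table name

def scan_table():
    items = []
    response = table.scan()
    items.extend(response.get("Items", []))
    # A single Scan call returns at most 1 MB of data; keep requesting pages
    # while DynamoDB reports a LastEvaluatedKey.
    while "LastEvaluatedKey" in response:
        response = table.scan(ExclusiveStartKey=response["LastEvaluatedKey"])
        items.extend(response.get("Items", []))
    return items

With the new export_to_csv above, field names are unioned across every returned item and extrasaction='ignore' prevents DictWriter from raising when an item carries a key that was dropped from the header row.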
tools/aws_textract.py CHANGED
@@ -278,7 +278,6 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_

    return all_ocr_results_with_page, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_words_with_page

-
def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
    """
    Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
tools/config.py CHANGED
@@ -195,7 +195,8 @@ FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FIL
###

# Create Tesseract and Poppler folders if you have installed them locally
- TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
+ TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "/usr/bin/tesseract") # # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
+ TESSERACT_DATA_FOLDER = get_or_create_env_var('TESSERACT_DATA_FOLDER', "/usr/share/tessdata")
POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # If installing on Windows,install Poppler from here https://github.com/oschwartz10612/poppler-windows. This variable needs to point to the poppler bin folder e.g. poppler/poppler-24.02.0/Library/bin/

if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)

@@ -288,7 +289,26 @@ MAX_TIME_VALUE = get_or_create_env_var('MAX_TIME_VALUE', '999999')

CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour

- REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Currently only English is supported by the app
+ ### Language selection options
+
+ SHOW_LANGUAGE_SELECTION = get_or_create_env_var("SHOW_LANGUAGE_SELECTION", "False")
+
+ DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var("DEFAULT_LANGUAGE_FULL_NAME", "english")
+ DEFAULT_LANGUAGE = get_or_create_env_var("DEFAULT_LANGUAGE", "en") # For tesseract, ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system. You can find the relevant language packs here: https://github.com/tesseract-ocr/tessdata.
+ # For paddle, ensure the paddle language data (e.g., fra.traineddata) is installed on your system. You can find information on supported languages here: https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html
+ # For AWS Comprehend, ensure the language data is installed on your system. You can find the relevant language packs here: https://docs.aws.amazon.com/comprehend/latest/dg/supported-languages.html: ('en'|'es'|'fr'|'de'|'it'|'pt'|'ar'|'hi'|'ja'|'ko'|'zh'|'zh-TW')
+ # AWS Textract automatically detects the language of the document and supports the following languages: https://aws.amazon.com/textract/faqs/#topic-0. 'English, Spanish, Italian, Portuguese, French, German. Handwriting, Invoices and Receipts, Identity documents and Queries processing are in English only'
+
+ textract_language_choices = get_or_create_env_var("textract_language_choices", "['en', 'es', 'fr', 'de', 'it', 'pt']")
+ aws_comprehend_language_choices = get_or_create_env_var("aws_comprehend_language_choices", "['en', 'es', 'fr', 'de', 'it', 'pt', 'ar', 'hi', 'ja', 'ko', 'zh', 'zh-TW']")
+
+ # The choices that the user sees
+ MAPPED_LANGUAGE_CHOICES = get_or_create_env_var("MAPPED_LANGUAGE_CHOICES", "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']")
+ LANGUAGE_CHOICES = get_or_create_env_var("LANGUAGE_CHOICES", "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']")
+
+
+
+ ### File output options

RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
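The language settings above are stored as strings that look like Python list literals, e.g. "['en', 'fr', 'de', ...]". A sketch of how such a value can be turned back into a real list; the app's own helper for this (_get_env_list in tools/helper_functions.py, used elsewhere in app.py) is not shown in this diff and may be implemented differently:

import ast

def parse_env_list(value: str) -> list:
    # ast.literal_eval safely evaluates the bracketed, quoted list literal
    # without executing arbitrary code.
    return list(ast.literal_eval(value))

LANGUAGE_CHOICES = parse_env_list("['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']")
print(LANGUAGE_CHOICES[:3])  # ['en', 'fr', 'de']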
 
tools/custom_image_analyser_engine.py CHANGED
@@ -16,7 +16,7 @@ from typing import Optional, Tuple, Union
16
  from tools.helper_functions import clean_unicode_text
17
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
18
  from tools.load_spacy_model_custom_recognisers import custom_entities
19
- from tools.config import PREPROCESS_LOCAL_OCR_IMAGES
20
 
21
  if PREPROCESS_LOCAL_OCR_IMAGES == "True": PREPROCESS_LOCAL_OCR_IMAGES = True
22
  else: PREPROCESS_LOCAL_OCR_IMAGES = False
@@ -26,6 +26,86 @@ try:
26
  except ImportError:
27
  PaddleOCR = None
28
 
29
  @dataclass
30
  class OCRResult:
31
  text: str
@@ -234,6 +314,7 @@ def rescale_ocr_data(ocr_data, scale_factor:float):
234
  ocr_data['height'][i] = h_orig
235
 
236
  return ocr_data
 
237
  class CustomImageAnalyzerEngine:
238
  def __init__(
239
  self,
@@ -241,28 +322,38 @@ class CustomImageAnalyzerEngine:
241
  ocr_engine: str = "tesseract",
242
  tesseract_config: Optional[str] = None,
243
  paddle_kwargs: Optional[Dict[str, Any]] = None,
244
- image_preprocessor: Optional[ImagePreprocessor] = None
 
245
  ):
246
  """
247
  Initializes the CustomImageAnalyzerEngine.
248
 
249
- :param ocr_engine: The OCR engine to use ("tesseract" or "paddle").
250
  :param analyzer_engine: The Presidio AnalyzerEngine instance.
251
  :param tesseract_config: Configuration string for Tesseract.
252
  :param paddle_kwargs: Dictionary of keyword arguments for PaddleOCR constructor.
253
  :param image_preprocessor: Optional image preprocessor.
 
254
  """
255
  if ocr_engine not in ["tesseract", "paddle", "hybrid"]:
256
  raise ValueError("ocr_engine must be either 'tesseract', 'hybrid', or 'paddle'")
257
 
258
  self.ocr_engine = ocr_engine
 
 
 
 
 
259
 
260
  if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
261
  if PaddleOCR is None:
262
  raise ImportError("paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle'")
263
  # Default paddle configuration if none provided
264
  if paddle_kwargs is None:
265
- paddle_kwargs = {'use_textline_orientation': True, 'lang': 'en'}
 
 
 
266
  self.paddle_ocr = PaddleOCR(**paddle_kwargs)
267
 
268
  if not analyzer_engine:
@@ -394,7 +485,8 @@ class CustomImageAnalyzerEngine:
394
  tesseract_data = pytesseract.image_to_data(
395
  image,
396
  output_type=pytesseract.Output.DICT,
397
- config=self.tesseract_config
 
398
  )
399
 
400
  #tesseract_data['abs_line_id'] = tesseract_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
@@ -510,7 +602,8 @@ class CustomImageAnalyzerEngine:
510
  ocr_data = pytesseract.image_to_data(
511
  image,
512
  output_type=pytesseract.Output.DICT,
513
- config=self.tesseract_config
 
514
  )
515
 
516
  #ocr_data['abs_line_id'] = ocr_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
@@ -569,6 +662,7 @@ class CustomImageAnalyzerEngine:
569
  pii_identification_method: str = "Local",
570
  comprehend_client = "",
571
  custom_entities:List[str]=custom_entities,
 
572
  **text_analyzer_kwargs
573
  ) -> List[CustomImageRecognizerResult]:
574
 
@@ -586,10 +680,14 @@ class CustomImageAnalyzerEngine:
586
  # Note: We're not passing line_characters here since it's not needed for this use case
587
  page_text_mapping.append((start_pos, i, line_level_ocr_result, None))
588
 
 
 
 
589
  # Process using either Local or AWS Comprehend
590
  if pii_identification_method == "Local":
591
  analyzer_result = self.analyzer_engine.analyze(
592
  text=page_text,
 
593
  **text_analyzer_kwargs
594
  )
595
  all_text_line_results = map_back_entity_results(
@@ -609,6 +707,7 @@ class CustomImageAnalyzerEngine:
609
  text_analyzer_kwargs["entities"] = custom_redact_entities
610
  page_analyser_result = self.analyzer_engine.analyze(
611
  text=page_text,
 
612
  **text_analyzer_kwargs
613
  )
614
  all_text_line_results = map_back_entity_results(
@@ -641,7 +740,7 @@ class CustomImageAnalyzerEngine:
641
  current_batch,
642
  current_batch_mapping,
643
  comprehend_client,
644
- text_analyzer_kwargs["language"],
645
  text_analyzer_kwargs.get('allow_list', []),
646
  chosen_redact_comprehend_entities,
647
  all_text_line_results
@@ -676,7 +775,7 @@ class CustomImageAnalyzerEngine:
676
  current_batch,
677
  current_batch_mapping,
678
  comprehend_client,
679
- text_analyzer_kwargs["language"],
680
  text_analyzer_kwargs.get('allow_list', []),
681
  chosen_redact_comprehend_entities,
682
  all_text_line_results
@@ -988,7 +1087,7 @@ def run_page_text_redaction(
988
  comprehend_client = None,
989
  allow_list: List[str] = None,
990
  pii_identification_method: str = "Local",
991
- nlp_analyser = None,
992
  score_threshold: float = 0.0,
993
  custom_entities: List[str] = None,
994
  comprehend_query_number:int = 0#,
 
16
  from tools.helper_functions import clean_unicode_text
17
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
18
  from tools.load_spacy_model_custom_recognisers import custom_entities
19
+ from tools.config import PREPROCESS_LOCAL_OCR_IMAGES, DEFAULT_LANGUAGE
20
 
21
  if PREPROCESS_LOCAL_OCR_IMAGES == "True": PREPROCESS_LOCAL_OCR_IMAGES = True
22
  else: PREPROCESS_LOCAL_OCR_IMAGES = False
 
26
  except ImportError:
27
  PaddleOCR = None
28
 
29
+ # --- Language utilities ---
30
+ def _normalize_lang(language: str) -> str:
31
+ return language.strip().lower().replace("-", "_") if language else "en"
32
+
33
+
34
+ def _tesseract_lang_code(language: str) -> str:
35
+ """Map a user language input to a Tesseract traineddata code."""
36
+ lang = _normalize_lang(language)
37
+
38
+ mapping = {
39
+ # Common
40
+ "en": "eng", "eng": "eng",
41
+ "fr": "fra", "fre": "fra", "fra": "fra",
42
+ "de": "deu", "ger": "deu", "deu": "deu",
43
+ "es": "spa", "spa": "spa",
44
+ "it": "ita", "ita": "ita",
45
+ "nl": "nld", "dut": "nld", "nld": "nld",
46
+ "pt": "por", "por": "por",
47
+ "ru": "rus", "rus": "rus",
48
+ "ar": "ara", "ara": "ara",
49
+ # Nordics
50
+ "sv": "swe", "swe": "swe",
51
+ "no": "nor", "nb": "nor", "nn": "nor", "nor": "nor",
52
+ "fi": "fin", "fin": "fin",
53
+ "da": "dan", "dan": "dan",
54
+ # Eastern/Central
55
+ "pl": "pol", "pol": "pol",
56
+ "cs": "ces", "cz": "ces", "ces": "ces",
57
+ "hu": "hun", "hun": "hun",
58
+ "ro": "ron", "rum": "ron", "ron": "ron",
59
+ "bg": "bul", "bul": "bul",
60
+ "el": "ell", "gre": "ell", "ell": "ell",
61
+ # Asian
62
+ "ja": "jpn", "jp": "jpn", "jpn": "jpn",
63
+ "zh": "chi_sim", "zh_cn": "chi_sim", "zh_hans": "chi_sim", "chi_sim": "chi_sim",
64
+ "zh_tw": "chi_tra", "zh_hk": "chi_tra", "zh_tr": "chi_tra", "chi_tra": "chi_tra",
65
+ "hi": "hin", "hin": "hin",
66
+ "bn": "ben", "ben": "ben",
67
+ "ur": "urd", "urd": "urd",
68
+ "fa": "fas", "per": "fas", "fas": "fas",
69
+ }
70
+
71
+ return mapping.get(lang, "eng")
72
+
73
+
74
+ def _paddle_lang_code(language: str) -> str:
75
+ """Map a user language input to a PaddleOCR language code.
76
+
77
+ PaddleOCR supports codes like: 'en', 'ch', 'chinese_cht', 'korean', 'japan', 'german', 'fr', 'it', 'es',
78
+ as well as script packs like 'arabic', 'cyrillic', 'latin'.
79
+ """
80
+ lang = _normalize_lang(language)
81
+
82
+ mapping = {
83
+ "en": "en",
84
+ "fr": "fr",
85
+ "de": "german",
86
+ "es": "es",
87
+ "it": "it",
88
+ "pt": "pt",
89
+ "nl": "nl",
90
+ "ru": "cyrillic", # Russian is covered by cyrillic models
91
+ "uk": "cyrillic",
92
+ "bg": "cyrillic",
93
+ "sr": "cyrillic",
94
+ "ar": "arabic",
95
+ "tr": "tr",
96
+ "fa": "arabic", # fallback to arabic script pack
97
+ "zh": "ch",
98
+ "zh_cn": "ch",
99
+ "zh_tw": "chinese_cht",
100
+ "zh_hk": "chinese_cht",
101
+ "ja": "japan",
102
+ "jp": "japan",
103
+ "ko": "korean",
104
+ "hi": "latin", # fallback; dedicated Hindi not always available
105
+ }
106
+
107
+ return mapping.get(lang, "en")
108
+
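A few illustrative calls to the mapping helpers above:

_tesseract_lang_code("fr")       # -> 'fra' (requires fra.traineddata to be installed)
_tesseract_lang_code("zh-TW")    # -> 'chi_tra'
_paddle_lang_code("de")          # -> 'german'
_paddle_lang_code("xx")          # -> 'en' (unknown codes fall back to English)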
109
  @dataclass
110
  class OCRResult:
111
  text: str
 
314
  ocr_data['height'][i] = h_orig
315
 
316
  return ocr_data
317
+
318
  class CustomImageAnalyzerEngine:
319
  def __init__(
320
  self,
 
322
  ocr_engine: str = "tesseract",
323
  tesseract_config: Optional[str] = None,
324
  paddle_kwargs: Optional[Dict[str, Any]] = None,
325
+ image_preprocessor: Optional[ImagePreprocessor] = None,
326
+ language: Optional[str] = None
327
  ):
328
  """
329
  Initializes the CustomImageAnalyzerEngine.
330
 
331
+ :param ocr_engine: The OCR engine to use ("tesseract", "hybrid", or "paddle").
332
  :param analyzer_engine: The Presidio AnalyzerEngine instance.
333
  :param tesseract_config: Configuration string for Tesseract.
334
  :param paddle_kwargs: Dictionary of keyword arguments for PaddleOCR constructor.
335
  :param image_preprocessor: Optional image preprocessor.
336
+ :param language: Preferred OCR language (e.g., "en", "fr", "de"). Defaults to DEFAULT_LANGUAGE.
337
  """
338
  if ocr_engine not in ["tesseract", "paddle", "hybrid"]:
339
  raise ValueError("ocr_engine must be either 'tesseract', 'hybrid', or 'paddle'")
340
 
341
  self.ocr_engine = ocr_engine
342
+
343
+ # Language setup
344
+ self.language = language or DEFAULT_LANGUAGE or "en"
345
+ self.tesseract_lang = _tesseract_lang_code(self.language)
346
+ self.paddle_lang = _paddle_lang_code(self.language)
347
 
348
  if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
349
  if PaddleOCR is None:
350
  raise ImportError("paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle'")
351
  # Default paddle configuration if none provided
352
  if paddle_kwargs is None:
353
+ paddle_kwargs = {'use_textline_orientation': True, 'lang': self.paddle_lang}
354
+ else:
355
+ # Enforce language if not explicitly provided
356
+ paddle_kwargs.setdefault('lang', self.paddle_lang)
357
  self.paddle_ocr = PaddleOCR(**paddle_kwargs)
358
 
359
  if not analyzer_engine:
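For illustration, this mirrors how the engine is now constructed in tools/file_redaction.py, with values hard-coded here:

analyser = CustomImageAnalyzerEngine(
    analyzer_engine=nlp_analyser,  # a configured Presidio AnalyzerEngine
    ocr_engine="paddle",
    language="fr",                 # mapped internally to Paddle 'fr' and Tesseract 'fra'
)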
 
485
  tesseract_data = pytesseract.image_to_data(
486
  image,
487
  output_type=pytesseract.Output.DICT,
488
+ config=self.tesseract_config,
489
+ lang=self.tesseract_lang
490
  )
491
 
492
  #tesseract_data['abs_line_id'] = tesseract_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
 
602
  ocr_data = pytesseract.image_to_data(
603
  image,
604
  output_type=pytesseract.Output.DICT,
605
+ config=self.tesseract_config,
606
+ lang=self.tesseract_lang # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
607
  )
608
 
609
  #ocr_data['abs_line_id'] = ocr_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
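Because image_to_data is now called with lang=self.tesseract_lang, the matching traineddata file must be installed. A quick pre-flight check using pytesseract's own helper (a sketch):

import pytesseract

available = pytesseract.get_languages(config="")
if "fra" not in available:
    print("fra.traineddata not found - install it into your tessdata folder")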
 
662
  pii_identification_method: str = "Local",
663
  comprehend_client = "",
664
  custom_entities:List[str]=custom_entities,
665
+ language: Optional[str] = None,
666
  **text_analyzer_kwargs
667
  ) -> List[CustomImageRecognizerResult]:
668
 
 
680
  # Note: We're not passing line_characters here since it's not needed for this use case
681
  page_text_mapping.append((start_pos, i, line_level_ocr_result, None))
682
 
683
+ # Determine language for downstream services
684
+ aws_language = language or getattr(self, 'language', None) or 'en'
685
+
686
  # Process using either Local or AWS Comprehend
687
  if pii_identification_method == "Local":
688
  analyzer_result = self.analyzer_engine.analyze(
689
  text=page_text,
690
+ language=language,
691
  **text_analyzer_kwargs
692
  )
693
  all_text_line_results = map_back_entity_results(
 
707
  text_analyzer_kwargs["entities"] = custom_redact_entities
708
  page_analyser_result = self.analyzer_engine.analyze(
709
  text=page_text,
710
+ language=language,
711
  **text_analyzer_kwargs
712
  )
713
  all_text_line_results = map_back_entity_results(
 
740
  current_batch,
741
  current_batch_mapping,
742
  comprehend_client,
743
+ aws_language,
744
  text_analyzer_kwargs.get('allow_list', []),
745
  chosen_redact_comprehend_entities,
746
  all_text_line_results
 
775
  current_batch,
776
  current_batch_mapping,
777
  comprehend_client,
778
+ aws_language,
779
  text_analyzer_kwargs.get('allow_list', []),
780
  chosen_redact_comprehend_entities,
781
  all_text_line_results
 
1087
  comprehend_client = None,
1088
  allow_list: List[str] = None,
1089
  pii_identification_method: str = "Local",
1090
+ nlp_analyser: AnalyzerEngine = None,
1091
  score_threshold: float = 0.0,
1092
  custom_entities: List[str] = None,
1093
  comprehend_query_number:int = 0#,
tools/data_anonymise.py CHANGED
@@ -10,11 +10,11 @@ import docx
10
  from openpyxl import Workbook
11
  from faker import Faker
12
  from gradio import Progress
13
- from typing import List, Dict, Any
14
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
15
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
16
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
17
- from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER
18
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
19
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
20
  # Use custom version of analyze_dict to be able to track progress
@@ -119,7 +119,7 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
119
  #analyzer = AnalyzerEngine()
120
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
121
 
122
- analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
123
  analyzer_results = list(analyzer_results)
124
 
125
  # + tags=[]
@@ -208,7 +208,6 @@ def handle_docx_anonymisation(
208
  file_path: str,
209
  output_folder: str,
210
  anon_strat: str,
211
- language: str,
212
  chosen_redact_entities: List[str],
213
  in_allow_list: List[str],
214
  in_deny_list: List[str],
@@ -216,7 +215,8 @@ def handle_docx_anonymisation(
216
  pii_identification_method: str,
217
  chosen_redact_comprehend_entities: List[str],
218
  comprehend_query_number: int,
219
- comprehend_client # Assuming botocore.client.BaseClient type
 
220
  ):
221
  """
222
  Anonymises a .docx file by extracting text, processing it, and re-inserting it.
@@ -253,11 +253,14 @@ def handle_docx_anonymisation(
253
  # 2. Convert to a DataFrame for the existing anonymisation script
254
  df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})
255
 
 
 
 
256
  # 3. Call the core anonymisation script
257
  anonymised_df, _, decision_log = anonymise_script(
258
  df=df_to_anonymise,
259
  anon_strat=anon_strat,
260
- language=language,
261
  chosen_redact_entities=chosen_redact_entities,
262
  in_allow_list=in_allow_list,
263
  in_deny_list=in_deny_list,
@@ -307,7 +310,6 @@ def anonymise_files_with_open_text(file_paths: List[str],
307
  in_text: str,
308
  anon_strat: str,
309
  chosen_cols: List[str],
310
- language: str,
311
  chosen_redact_entities: List[str],
312
  in_allow_list: List[str] = None,
313
  latest_file_completed: int = 0,
@@ -325,7 +327,9 @@ def anonymise_files_with_open_text(file_paths: List[str],
325
  aws_access_key_textbox:str='',
326
  aws_secret_key_textbox:str='',
327
  actual_time_taken_number:float=0,
328
- progress: Progress = Progress(track_tqdm=True)):
 
 
329
  """
330
  This function anonymises data files based on the provided parameters.
331
 
@@ -352,11 +356,21 @@ def anonymise_files_with_open_text(file_paths: List[str],
352
  - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
353
  - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
354
  - actual_time_taken_number (float, optional): Time taken to do the redaction.
 
355
  - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
356
  """
357
 
358
  tic = time.perf_counter()
359
  comprehend_client = ""
 
 
 
 
 
 
 
 
 
360
 
361
  # If this is the first time around, set variables to 0/blank
362
  if first_loop_state==True:
@@ -455,7 +469,6 @@ def anonymise_files_with_open_text(file_paths: List[str],
455
  file_path=anon_file.name, # .name if it's a temp file object
456
  output_folder=output_folder,
457
  anon_strat=anon_strat,
458
- language=language,
459
  chosen_redact_entities=chosen_redact_entities,
460
  in_allow_list=in_allow_list_flat,
461
  in_deny_list=in_deny_list,
@@ -463,7 +476,8 @@ def anonymise_files_with_open_text(file_paths: List[str],
463
  pii_identification_method=pii_identification_method,
464
  chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
465
  comprehend_query_number=comprehend_query_number,
466
- comprehend_client=comprehend_client
 
467
  )
468
  if output_path:
469
  out_file_paths.append(output_path)
@@ -493,14 +507,14 @@ def anonymise_files_with_open_text(file_paths: List[str],
493
 
494
  anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
495
 
496
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
497
 
498
  else:
499
  sheet_name = ""
500
  anon_df = read_file(anon_file)
501
  out_file_part = get_file_name_without_type(anon_file.name)
502
 
503
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
504
 
505
  # Increase latest file completed count unless we are at the last file
506
  if latest_file_completed != len(file_paths):
@@ -537,7 +551,7 @@ def tabular_anonymise_wrapper_func(
537
  out_message: str,
538
  excel_sheet_name: str,
539
  anon_strat: str,
540
- language: str,
541
  chosen_redact_entities: List[str],
542
  in_allow_list: List[str],
543
  file_type: str,
@@ -546,6 +560,7 @@ def tabular_anonymise_wrapper_func(
546
  in_deny_list: List[str]=[],
547
  max_fuzzy_spelling_mistakes_num:int=0,
548
  pii_identification_method:str="Local",
 
549
  chosen_redact_comprehend_entities:List[str]=[],
550
  comprehend_query_number:int=0,
551
  comprehend_client:botocore.client.BaseClient="",
@@ -617,8 +632,11 @@ def tabular_anonymise_wrapper_func(
617
  anon_df_part = anon_df[chosen_cols_in_anon_df]
618
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
619
 
 
 
 
620
  # Anonymise the selected columns
621
- anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client)
622
 
623
  anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
624
 
@@ -681,6 +699,7 @@ def anonymise_script(df:pd.DataFrame,
681
  in_deny_list:List[str]=[],
682
  max_fuzzy_spelling_mistakes_num:int=0,
683
  pii_identification_method:str="Local",
 
684
  chosen_redact_comprehend_entities:List[str]=[],
685
  comprehend_query_number:int=0,
686
  comprehend_client:botocore.client.BaseClient="",
@@ -738,6 +757,9 @@ def anonymise_script(df:pd.DataFrame,
738
  batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
739
  analyzer_results = []
740
 
 
 
 
741
  if pii_identification_method == "Local":
742
 
743
  # Use custom analyzer to be able to track progress with Gradio
@@ -801,7 +823,7 @@ def anonymise_script(df:pd.DataFrame,
801
  try:
802
  response = comprehend_client.detect_pii_entities(
803
  Text=str(text),
804
- LanguageCode=language
805
  )
806
 
807
  comprehend_query_number += 1
 
10
  from openpyxl import Workbook
11
  from faker import Faker
12
  from gradio import Progress
13
+ from typing import List, Dict, Any, Optional
14
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
15
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
16
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
17
+ from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices
18
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
19
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
20
  # Use custom version of analyze_dict to be able to track progress
 
119
  #analyzer = AnalyzerEngine()
120
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
121
 
122
+ analyzer_results = batch_analyzer.analyze_dict(df_dict, language=DEFAULT_LANGUAGE)
123
  analyzer_results = list(analyzer_results)
124
 
125
  # + tags=[]
 
208
  file_path: str,
209
  output_folder: str,
210
  anon_strat: str,
 
211
  chosen_redact_entities: List[str],
212
  in_allow_list: List[str],
213
  in_deny_list: List[str],
 
215
  pii_identification_method: str,
216
  chosen_redact_comprehend_entities: List[str],
217
  comprehend_query_number: int,
218
+ comprehend_client, # Assuming botocore.client.BaseClient type
219
+ language: Optional[str] = None
220
  ):
221
  """
222
  Anonymises a .docx file by extracting text, processing it, and re-inserting it.
 
253
  # 2. Convert to a DataFrame for the existing anonymisation script
254
  df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})
255
 
256
+ # Use provided language or default
257
+ effective_language = language or DEFAULT_LANGUAGE
258
+
259
  # 3. Call the core anonymisation script
260
  anonymised_df, _, decision_log = anonymise_script(
261
  df=df_to_anonymise,
262
  anon_strat=anon_strat,
263
+ language=effective_language,
264
  chosen_redact_entities=chosen_redact_entities,
265
  in_allow_list=in_allow_list,
266
  in_deny_list=in_deny_list,
 
310
  in_text: str,
311
  anon_strat: str,
312
  chosen_cols: List[str],
 
313
  chosen_redact_entities: List[str],
314
  in_allow_list: List[str] = None,
315
  latest_file_completed: int = 0,
 
327
  aws_access_key_textbox:str='',
328
  aws_secret_key_textbox:str='',
329
  actual_time_taken_number:float=0,
330
+ language: Optional[str] = None,
331
+ progress: Progress = Progress(track_tqdm=True),
332
+ comprehend_language: Optional[str] = None):
333
  """
334
  This function anonymises data files based on the provided parameters.
335
 
 
356
  - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
357
  - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
358
  - actual_time_taken_number (float, optional): Time taken to do the redaction.
359
+ - language (str, optional): The language of the text to anonymise.
+ - comprehend_language (str, optional): The language code to use for AWS Comprehend calls. Falls back to language if not provided.
360
  - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
361
  """
362
 
363
  tic = time.perf_counter()
364
  comprehend_client = ""
365
+
366
+ # Use provided language or default
367
+ effective_language = language or DEFAULT_LANGUAGE
368
+ effective_comprehend_language = comprehend_language or effective_language
369
+
370
+ if pii_identification_method == "AWS Comprehend":
371
+ if effective_comprehend_language not in aws_comprehend_language_choices:
372
+ out_message = f"Please note that this language is not supported by AWS Comprehend: {effective_comprehend_language}"
373
+ raise Warning(out_message)
374
 
375
  # If this is the first time around, set variables to 0/blank
376
  if first_loop_state==True:
 
469
  file_path=anon_file.name, # .name if it's a temp file object
470
  output_folder=output_folder,
471
  anon_strat=anon_strat,
 
472
  chosen_redact_entities=chosen_redact_entities,
473
  in_allow_list=in_allow_list_flat,
474
  in_deny_list=in_deny_list,
 
476
  pii_identification_method=pii_identification_method,
477
  chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
478
  comprehend_query_number=comprehend_query_number,
479
+ comprehend_client=comprehend_client,
480
+ language=effective_language
481
  )
482
  if output_path:
483
  out_file_paths.append(output_path)
 
507
 
508
  anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
509
 
510
+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, effective_language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, effective_comprehend_language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
511
 
512
  else:
513
  sheet_name = ""
514
  anon_df = read_file(anon_file)
515
  out_file_part = get_file_name_without_type(anon_file.name)
516
 
517
+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, effective_language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, effective_comprehend_language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
518
 
519
  # Increase latest file completed count unless we are at the last file
520
  if latest_file_completed != len(file_paths):
 
551
  out_message: str,
552
  excel_sheet_name: str,
553
  anon_strat: str,
554
+ language: str,
555
  chosen_redact_entities: List[str],
556
  in_allow_list: List[str],
557
  file_type: str,
 
560
  in_deny_list: List[str]=[],
561
  max_fuzzy_spelling_mistakes_num:int=0,
562
  pii_identification_method:str="Local",
563
+ comprehend_language: Optional[str] = None,
564
  chosen_redact_comprehend_entities:List[str]=[],
565
  comprehend_query_number:int=0,
566
  comprehend_client:botocore.client.BaseClient="",
 
632
  anon_df_part = anon_df[chosen_cols_in_anon_df]
633
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
634
 
635
+ # Use provided comprehend language or fall back to main language
636
+ effective_comprehend_language = comprehend_language or language
637
+
638
  # Anonymise the selected columns
639
+ anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, effective_comprehend_language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client)
640
 
641
  anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
642
 
 
699
  in_deny_list:List[str]=[],
700
  max_fuzzy_spelling_mistakes_num:int=0,
701
  pii_identification_method:str="Local",
702
+ comprehend_language:Optional[str]=None,
703
  chosen_redact_comprehend_entities:List[str]=[],
704
  comprehend_query_number:int=0,
705
  comprehend_client:botocore.client.BaseClient="",
 
757
  batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
758
  analyzer_results = []
759
 
760
+ # Use provided comprehend language or fall back to main language
761
+ effective_comprehend_language = comprehend_language or language
762
+
763
  if pii_identification_method == "Local":
764
 
765
  # Use custom analyzer to be able to track progress with Gradio
 
823
  try:
824
  response = comprehend_client.detect_pii_entities(
825
  Text=str(text),
826
+ LanguageCode=effective_comprehend_language
827
  )
828
 
829
  comprehend_query_number += 1
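For context, a direct boto3 call to detect_pii_entities with an explicit LanguageCode looks like this (region and text are placeholders; offsets in the response refer to the input string):

import boto3

comprehend = boto3.client("comprehend", region_name="eu-west-2")  # illustrative region
response = comprehend.detect_pii_entities(
    Text="John Smith lives at 10 Downing Street.",
    LanguageCode="en",
)
for entity in response["Entities"]:
    print(entity["Type"], entity["Score"], entity["BeginOffset"], entity["EndOffset"])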
tools/file_conversion.py CHANGED
@@ -673,7 +673,11 @@ def prepare_image_or_pdf(
673
  all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
674
  json_from_csv = True
675
  elif '_ocr_output' in file_path_without_ext:
676
- all_line_level_ocr_results_df = read_file(file_path)
 
 
 
 
677
  json_from_csv = False
678
  elif '_ocr_results_with_words' in file_path_without_ext:
679
  all_page_line_level_ocr_results_with_words_df = read_file(file_path)
 
673
  all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
674
  json_from_csv = True
675
  elif '_ocr_output' in file_path_without_ext:
676
+ all_line_level_ocr_results_df = read_file(file_path)
677
+
678
+ if "line" not in all_line_level_ocr_results_df.columns:
679
+ all_line_level_ocr_results_df["line"] = ""
680
+
681
  json_from_csv = False
682
  elif '_ocr_results_with_words' in file_path_without_ext:
683
  all_page_line_level_ocr_results_with_words_df = read_file(file_path)
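The new check backfills a 'line' column for older '_ocr_output' files that were saved without one; in pandas terms (minimal illustration with made-up data):

import pandas as pd

ocr_df = pd.DataFrame({"page": [1, 1], "text": ["First line", "Second line"]})
if "line" not in ocr_df.columns:
    ocr_df["line"] = ""  # older OCR outputs lacked this column, so add an empty placeholder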
tools/file_redaction.py CHANGED
@@ -15,14 +15,15 @@ from pdfminer.high_level import extract_pages
15
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
16
  from pikepdf import Pdf, Dictionary, Name
17
  from pymupdf import Rect, Page, Document
 
18
  import gradio as gr
19
  from gradio import Progress
20
  from collections import defaultdict # For efficient grouping
21
 
22
- from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION
23
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
24
  from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
25
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
26
  from tools.helper_functions import get_file_name_without_type, clean_unicode_text
27
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
28
 
@@ -84,12 +85,9 @@ def merge_page_results(data:list):
84
 
85
  return list(merged.values())
86
 
87
-
88
-
89
  def choose_and_run_redactor(file_paths:List[str],
90
  prepared_pdf_file_paths:List[str],
91
- pdf_image_file_paths:List[str],
92
- language:str,
93
  chosen_redact_entities:List[str],
94
  chosen_redact_comprehend_entities:List[str],
95
  text_extraction_method:str,
@@ -112,7 +110,7 @@ def choose_and_run_redactor(file_paths:List[str],
112
  pymupdf_doc=list(),
113
  current_loop_page:int=0,
114
  page_break_return:bool=False,
115
- pii_identification_method:str="Local",
116
  comprehend_query_number:int=0,
117
  max_fuzzy_spelling_mistakes_num:int=1,
118
  match_fuzzy_whole_phrase_bool:bool=True,
@@ -134,7 +132,8 @@ def choose_and_run_redactor(file_paths:List[str],
134
  all_page_line_level_ocr_results_with_words:list[dict] = list(),
135
  all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
136
  chosen_local_model:str="tesseract",
137
- prepare_images:bool=True,
 
138
  RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
139
  progress=gr.Progress(track_tqdm=True)):
140
  '''
@@ -143,7 +142,7 @@ def choose_and_run_redactor(file_paths:List[str],
143
  - file_paths (List[str]): A list of paths to the files to be redacted.
144
  - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
145
  - pdf_image_file_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
146
- - language (str): The language of the text in the files.
147
  - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
148
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
149
  - text_extraction_method (str): The method to use to extract text from documents.
@@ -188,7 +187,9 @@ def choose_and_run_redactor(file_paths:List[str],
188
  - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
189
  - all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
190
  - chosen_local_model (str): Which local model is being used for OCR on images - "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both.
191
- - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
 
 
192
  - RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
193
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
194
 
@@ -203,6 +204,18 @@ def choose_and_run_redactor(file_paths:List[str],
203
  blank_request_metadata = []
204
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
205
  review_out_file_paths = [prepared_pdf_file_paths[0]]
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  if all_page_line_level_ocr_results_with_words_df is None:
208
  all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
@@ -452,6 +465,19 @@ def choose_and_run_redactor(file_paths:List[str],
452
  else:
453
  textract_client = ""
454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
  # Check if output_folder exists, create it if it doesn't
456
  if not os.path.exists(output_folder): os.makedirs(output_folder)
457
 
@@ -511,7 +537,7 @@ def choose_and_run_redactor(file_paths:List[str],
511
 
512
  pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words = redact_image_pdf(file_path,
513
  pdf_image_file_paths,
514
- language,
515
  chosen_redact_entities,
516
  chosen_redact_comprehend_entities,
517
  in_allow_list_flat,
@@ -538,7 +564,7 @@ def choose_and_run_redactor(file_paths:List[str],
538
  text_extraction_only,
539
  all_page_line_level_ocr_results,
540
  all_page_line_level_ocr_results_with_words,
541
- chosen_local_model,
542
  log_files_output_paths=log_files_output_paths,
543
  output_folder=output_folder)
544
 
@@ -560,7 +586,7 @@ def choose_and_run_redactor(file_paths:List[str],
560
 
561
  pymupdf_doc, all_pages_decision_process_table, all_page_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
562
  file_path,
563
- language,
564
  chosen_redact_entities,
565
  chosen_redact_comprehend_entities,
566
  in_allow_list_flat,
@@ -1352,6 +1378,7 @@ def redact_image_pdf(file_path:str,
1352
  log_files_output_paths:List=list(),
1353
  max_time:int=int(MAX_TIME_VALUE),
1354
  output_folder:str=OUTPUT_FOLDER,
 
1355
  progress=Progress(track_tqdm=True)):
1356
 
1357
  '''
@@ -1391,6 +1418,7 @@ def redact_image_pdf(file_path:str,
1391
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
1392
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1393
  - output_folder (str, optional): The folder for file outputs.
 
1394
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
1395
 
1396
  The function returns a redacted PDF document along with processing output objects.
@@ -1400,6 +1428,20 @@ def redact_image_pdf(file_path:str,
1400
 
1401
  file_name = get_file_name_without_type(file_path)
1402
  comprehend_query_number_new = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1403
 
1404
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1405
  if custom_recogniser_word_list:
@@ -1413,9 +1455,9 @@ def redact_image_pdf(file_path:str,
1413
 
1414
  # Only load in PaddleOCR models if not running Textract
1415
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1416
- image_analyser = CustomImageAnalyzerEngine(nlp_analyser, ocr_engine="tesseract")
1417
  else:
1418
- image_analyser = CustomImageAnalyzerEngine(nlp_analyser, ocr_engine=chosen_local_model)
1419
 
1420
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1421
  out_message = "Connection to AWS Comprehend service unsuccessful."
@@ -1635,7 +1677,7 @@ def redact_image_pdf(file_path:str,
1635
  chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
1636
  pii_identification_method = pii_identification_method,
1637
  comprehend_client=comprehend_client,
1638
- language=language,
1639
  entities=chosen_redact_entities,
1640
  allow_list=allow_list,
1641
  score_threshold=score_threshold
@@ -2155,7 +2197,7 @@ def redact_text_pdf(
2155
  all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
2156
  pymupdf_doc: List = list(), # List of PyMuPDF documents
2157
  all_page_line_level_ocr_results_with_words: List = list(),
2158
- pii_identification_method: str = "Local",
2159
  comprehend_query_number:int = 0,
2160
  comprehend_client="",
2161
  custom_recogniser_word_list:List[str]=list(),
@@ -2167,10 +2209,10 @@ def redact_text_pdf(
2167
  text_extraction_only:bool=False,
2168
  output_folder:str=OUTPUT_FOLDER,
2169
  page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
2170
- max_time: int = int(MAX_TIME_VALUE),
 
2171
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
2172
- ):
2173
-
2174
  '''
2175
  Redact chosen entities from a PDF that is made up of multiple pages that are not images.
2176
 
@@ -2199,13 +2241,15 @@ def redact_text_pdf(
2199
  - page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
2200
  - original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
2201
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
 
2202
  - output_folder (str, optional): The output folder for the function
2203
  - page_break_val: Value for page break
2204
- - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
 
2205
  - progress: Progress tracking object
2206
  '''
2207
 
2208
- tic = time.perf_counter()
2209
 
2210
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2211
  all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
@@ -2218,6 +2262,20 @@ def redact_text_pdf(
2218
  out_message = "Connection to AWS Comprehend service not found."
2219
  raise Exception(out_message)
2220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2221
  # Update custom word list analyser object with any new words that have been added to the custom deny list
2222
  if custom_recogniser_word_list:
2223
  nlp_analyser.registry.remove_recognizer("CUSTOM")
@@ -2228,6 +2286,8 @@ def redact_text_pdf(
2228
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
2229
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
2230
 
 
 
2231
  # Open with Pikepdf to get text lines
2232
  pikepdf_pdf = Pdf.open(file_path)
2233
  number_of_pages = len(pikepdf_pdf.pages)
@@ -2323,7 +2383,7 @@ def redact_text_pdf(
2323
 
2324
  if chosen_redact_entities or chosen_redact_comprehend_entities:
2325
  page_redaction_bounding_boxes = run_page_text_redaction(
2326
- language,
2327
  chosen_redact_entities,
2328
  chosen_redact_comprehend_entities,
2329
  all_page_line_level_text_extraction_results_list,
 
15
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
16
  from pikepdf import Pdf, Dictionary, Name
17
  from pymupdf import Rect, Page, Document
18
+ from presidio_analyzer import AnalyzerEngine
19
  import gradio as gr
20
  from gradio import Progress
21
  from collections import defaultdict # For efficient grouping
22
 
23
+ from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices
24
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
25
  from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
26
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
27
  from tools.helper_functions import get_file_name_without_type, clean_unicode_text
28
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
29
 
 
85
 
86
  return list(merged.values())
87
 
 
 
88
  def choose_and_run_redactor(file_paths:List[str],
89
  prepared_pdf_file_paths:List[str],
90
+ pdf_image_file_paths:List[str],
 
91
  chosen_redact_entities:List[str],
92
  chosen_redact_comprehend_entities:List[str],
93
  text_extraction_method:str,
 
110
  pymupdf_doc=list(),
111
  current_loop_page:int=0,
112
  page_break_return:bool=False,
113
+ pii_identification_method:str="Local",
114
  comprehend_query_number:int=0,
115
  max_fuzzy_spelling_mistakes_num:int=1,
116
  match_fuzzy_whole_phrase_bool:bool=True,
 
132
  all_page_line_level_ocr_results_with_words:list[dict] = list(),
133
  all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
134
  chosen_local_model:str="tesseract",
135
+ language:str=DEFAULT_LANGUAGE,
136
+ prepare_images:bool=True,
137
  RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
138
  progress=gr.Progress(track_tqdm=True)):
139
  '''
 
142
  - file_paths (List[str]): A list of paths to the files to be redacted.
143
  - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
144
  - pdf_image_file_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
145
+
146
  - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
147
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
148
  - text_extraction_method (str): The method to use to extract text from documents.
 
187
  - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
188
  - all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
189
  - chosen_local_model (str): Which local model is being used for OCR on images - "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both.
190
+ - language (str, optional): The language of the text in the files, also used for AWS Comprehend calls. Defaults to DEFAULT_LANGUAGE.
192
+ - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
193
  - RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
194
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
195
 
 
204
  blank_request_metadata = []
205
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
206
  review_out_file_paths = [prepared_pdf_file_paths[0]]
207
+
208
+ # Use provided language or default
209
+ effective_language = language or DEFAULT_LANGUAGE
210
+
211
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
212
+ if effective_language not in textract_language_choices:
213
+ out_message = f"Language '{effective_language}' is not supported by AWS Textract. Please select a different language."
214
+ raise Warning(out_message)
215
+ elif pii_identification_method == AWS_PII_OPTION:
216
+ if effective_language not in aws_comprehend_language_choices:
217
+ out_message = f"Language '{effective_language}' is not supported by AWS Comprehend. Please select a different language."
218
+ raise Warning(out_message)
219
 
220
  if all_page_line_level_ocr_results_with_words_df is None:
221
  all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
 
465
  else:
466
  textract_client = ""
467
 
468
+ ### Language check - check if selected language packs exist
469
+ try:
470
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION and chosen_local_model == "tesseract":
471
+ progress(0.1, desc=f"Downloading Tesseract language pack for {effective_language}")
472
+ download_tesseract_lang_pack(effective_language)
473
+
474
+ progress(0.1, desc=f"Loading SpaCy model for {effective_language}")
475
+ load_spacy_model(effective_language)
476
+
477
+ except Exception as e:
478
+ print(f"Error downloading language packs for {effective_language}: {e}")
479
+ raise Exception(f"Error downloading language packs for {effective_language}: {e}")
480
+
481
  # Check if output_folder exists, create it if it doesn't
482
  if not os.path.exists(output_folder): os.makedirs(output_folder)
483
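download_tesseract_lang_pack and load_spacy_model come from tools.load_spacy_model_custom_recognisers and are not shown in this diff. A rough sketch of what the Tesseract step is assumed to do - fetch the missing traineddata file into the tessdata folder (the URL pattern, target folder, and the prior mapping of 'fr' to 'fra' are assumptions):

import os
import urllib.request

def download_tesseract_lang_pack_sketch(lang: str, tessdata_dir: str = "/usr/share/tessdata") -> str:
    # Assumption: 'fr' has already been mapped to the Tesseract code 'fra'
    target = os.path.join(tessdata_dir, f"{lang}.traineddata")
    if not os.path.exists(target):
        url = f"https://github.com/tesseract-ocr/tessdata/raw/main/{lang}.traineddata"
        urllib.request.urlretrieve(url, target)  # download only if missing
    return target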
 
 
537
 
538
  pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words = redact_image_pdf(file_path,
539
  pdf_image_file_paths,
540
+ effective_language,
541
  chosen_redact_entities,
542
  chosen_redact_comprehend_entities,
543
  in_allow_list_flat,
 
564
  text_extraction_only,
565
  all_page_line_level_ocr_results,
566
  all_page_line_level_ocr_results_with_words,
567
+ chosen_local_model,
568
  log_files_output_paths=log_files_output_paths,
569
  output_folder=output_folder)
570
 
 
586
 
587
  pymupdf_doc, all_pages_decision_process_table, all_page_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
588
  file_path,
589
+ effective_language,
590
  chosen_redact_entities,
591
  chosen_redact_comprehend_entities,
592
  in_allow_list_flat,
 
1378
  log_files_output_paths:List=list(),
1379
  max_time:int=int(MAX_TIME_VALUE),
1380
  output_folder:str=OUTPUT_FOLDER,
1381
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
1382
  progress=Progress(track_tqdm=True)):
1383
 
1384
  '''
 
1418
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
1419
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1420
  - output_folder (str, optional): The folder for file outputs.
1421
+ - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
1422
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
1423
 
1424
  The function returns a redacted PDF document along with processing output objects.
 
1428
 
1429
  file_name = get_file_name_without_type(file_path)
1430
  comprehend_query_number_new = 0
1431
+
1432
+ # Use the provided language or fall back to the default
1433
+ effective_language = language or DEFAULT_LANGUAGE
1434
+
1435
+ # Try updating the supported languages for the spacy analyser
1436
+ try:
1437
+ nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
1438
+ # Check list of nlp_analyser recognisers and languages
1439
+ if language != "en":
1440
+ gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
1441
+
1442
+ except Exception as e:
1443
+ print(f"Error creating nlp_analyser for {language}: {e}")
1444
+ raise Exception(f"Error creating nlp_analyser for {language}: {e}")
1445
 
1446
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1447
  if custom_recogniser_word_list:
 
1455
 
1456
  # Only load in PaddleOCR models if not running Textract
1457
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1458
+ image_analyser = CustomImageAnalyzerEngine(analyzer_engine=nlp_analyser, ocr_engine="tesseract", language=language)
1459
  else:
1460
+ image_analyser = CustomImageAnalyzerEngine(analyzer_engine=nlp_analyser, ocr_engine=chosen_local_model, language=language)
1461
 
1462
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1463
  out_message = "Connection to AWS Comprehend service unsuccessful."
 
1677
  chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
1678
  pii_identification_method = pii_identification_method,
1679
  comprehend_client=comprehend_client,
1680
+ language=effective_language,
1681
  entities=chosen_redact_entities,
1682
  allow_list=allow_list,
1683
  score_threshold=score_threshold
 
2197
  all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
2198
  pymupdf_doc: List = list(), # List of PyMuPDF documents
2199
  all_page_line_level_ocr_results_with_words: List = list(),
2200
+ pii_identification_method: str = "Local",
2201
  comprehend_query_number:int = 0,
2202
  comprehend_client="",
2203
  custom_recogniser_word_list:List[str]=list(),
 
2209
  text_extraction_only:bool=False,
2210
  output_folder:str=OUTPUT_FOLDER,
2211
  page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
2212
+ max_time: int = int(MAX_TIME_VALUE),
2213
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
2214
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
2215
+ ):
 
2216
  '''
2217
  Redact chosen entities from a PDF that is made up of multiple pages that are not images.
2218
 
 
2241
  - page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
2242
  - original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
2243
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
2244
+ - language (str, optional): The language of the text in the document; also used for AWS Comprehend calls. Defaults to DEFAULT_LANGUAGE.
2245
  - output_folder (str, optional): The output folder for the function
2246
  - page_break_val: Value for page break
2247
+ - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
2248
+ - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
2249
  - progress: Progress tracking object
2250
  '''
2251
 
2252
+ tic = time.perf_counter()
2253
 
2254
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2255
  all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
 
2262
  out_message = "Connection to AWS Comprehend service not found."
2263
  raise Exception(out_message)
2264
 
2265
+ # Use provided comprehend language or fall back to main language
2266
+ effective_language = language or language
2267
+
2268
+ # Try updating the supported languages for the spacy analyser
2269
+ try:
2270
+ nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
2271
+ # Check list of nlp_analyser recognisers and languages
2272
+ if language != "en":
2273
+ gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
2274
+
2275
+ except Exception as e:
2276
+ print(f"Error creating nlp_analyser for {language}: {e}")
2277
+ raise Exception(f"Error creating nlp_analyser for {language}: {e}")
2278
+
2279
  # Update custom word list analyser object with any new words that have been added to the custom deny list
2280
  if custom_recogniser_word_list:
2281
  nlp_analyser.registry.remove_recognizer("CUSTOM")
 
2286
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
2287
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
2288
 
2289
+
2290
+
2291
  # Open with Pikepdf to get text lines
2292
  pikepdf_pdf = Pdf.open(file_path)
2293
  number_of_pages = len(pikepdf_pdf.pages)
 
2383
 
2384
  if chosen_redact_entities or chosen_redact_comprehend_entities:
2385
  page_redaction_bounding_boxes = run_page_text_redaction(
2386
+ effective_language,
2387
  chosen_redact_entities,
2388
  chosen_redact_comprehend_entities,
2389
  all_page_line_level_text_extraction_results_list,
tools/find_duplicate_pages.py CHANGED
@@ -14,9 +14,7 @@ from pathlib import Path
14
  from typing import List
15
  from tools.helper_functions import OUTPUT_FOLDER
16
  from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
17
- import en_core_web_lg
18
-
19
- nlp = en_core_web_lg.load()
20
 
21
  similarity_threshold = 0.95
22
  number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
 
14
  from typing import List
15
  from tools.helper_functions import OUTPUT_FOLDER
16
  from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
17
+ from tools.load_spacy_model_custom_recognisers import nlp
 
 
18
 
19
  similarity_threshold = 0.95
20
  number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
tools/helper_functions.py CHANGED
@@ -9,7 +9,24 @@ import unicodedata
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
- from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION
13
 
14
  def reset_state_vars():
15
  return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
@@ -157,13 +174,7 @@ def ensure_output_folder_exists(output_folder:str):
157
  else:
158
  print(f"The {output_folder} folder already exists.")
159
 
160
- def _get_env_list(env_var_name: str) -> List[str]:
161
- """Parses a comma-separated environment variable into a list of strings."""
162
- value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
163
- if not value:
164
- return []
165
- # Split by comma and filter out any empty strings that might result from extra commas
166
- return [s.strip() for s in value.split(',') if s.strip()]
167
 
168
  def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
169
  '''
@@ -189,7 +200,7 @@ def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
189
  print(output_text)
190
  else:
191
  output_text = "No file provided."
192
- print(output_text)
193
  return output_text, custom_regex_df
194
 
195
  return output_text, custom_regex_df
@@ -590,4 +601,25 @@ def reset_ocr_with_words_base_dataframe(df:pd.DataFrame, page_entity_dropdown_re
590
 
591
  output_df_filtered = df.loc[df["page"]==str(page_entity_dropdown_redaction_value), ["page", "line", "word_text", "word_x0", "word_y0", "word_x1", "word_y1", "index"]]
592
  return output_df_filtered, output_df
593
 
 
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION, MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES, textract_language_choices, aws_comprehend_language_choices, DEFAULT_LANGUAGE
13
+ # from tools.load_spacy_model_custom_recognisers import nlp_analyser
14
+
15
+ def _get_env_list(env_var_name: str) -> List[str]:
16
+ """Parses the string value of a list-style environment variable (e.g. "['en', 'fr']") into a list of strings, stripping the enclosing brackets and any quotes."""
17
+ value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
18
+ if not value:
19
+ return []
20
+ # Split by comma and filter out any empty strings that might result from extra commas
21
+ return [s.strip() for s in value.split(',') if s.strip()]
22
+
23
+ if textract_language_choices: textract_language_choices = _get_env_list(textract_language_choices)
24
+ if aws_comprehend_language_choices: aws_comprehend_language_choices = _get_env_list(aws_comprehend_language_choices)
25
+
26
+ if MAPPED_LANGUAGE_CHOICES: MAPPED_LANGUAGE_CHOICES = _get_env_list(MAPPED_LANGUAGE_CHOICES)
27
+ if LANGUAGE_CHOICES: LANGUAGE_CHOICES = _get_env_list(LANGUAGE_CHOICES)
28
+
29
+ LANGUAGE_MAP = dict(zip(MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES))
30
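A minimal sketch of how these list-style config values are expected to parse; the example strings below are illustrative rather than the real values in tools.config, and the full-name keys are assumed to be lowercase because update_language_dropdown lower-cases the selected name before the lookup.

# Illustrative values only, not the actual tools.config settings
example_mapped = "['english', 'french', 'german']"
example_codes = "['en', 'fr', 'de']"

mapped = _get_env_list(example_mapped)   # ['english', 'french', 'german']
codes = _get_env_list(example_codes)     # ['en', 'fr', 'de']
language_map = dict(zip(mapped, codes))  # {'english': 'en', 'french': 'fr', 'german': 'de'}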
 
31
  def reset_state_vars():
32
  return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
 
174
  else:
175
  print(f"The {output_folder} folder already exists.")
176
 
177
+
178
 
179
  def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
180
  '''
 
200
  print(output_text)
201
  else:
202
  output_text = "No file provided."
203
+ #print(output_text)
204
  return output_text, custom_regex_df
205
 
206
  return output_text, custom_regex_df
 
601
 
602
  output_df_filtered = df.loc[df["page"]==str(page_entity_dropdown_redaction_value), ["page", "line", "word_text", "word_x0", "word_y0", "word_x1", "word_y1", "index"]]
603
  return output_df_filtered, output_df
604
+
605
+ def update_language_dropdown(chosen_language_full_name_drop, textract_language_choices=textract_language_choices, aws_comprehend_language_choices=aws_comprehend_language_choices, LANGUAGE_MAP=LANGUAGE_MAP):
606
+
607
+ try:
608
+ full_language_name = chosen_language_full_name_drop.lower()
609
+ matched_language = LANGUAGE_MAP[full_language_name]
610
+
611
+ chosen_language_drop = gr.Dropdown(value = matched_language, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False, visible=True)
612
+
613
+ if matched_language not in aws_comprehend_language_choices and matched_language not in textract_language_choices:
614
+ gr.Info(f"Note that {full_language_name} is not supported by AWS Comprehend or AWS Textract")
615
+ elif matched_language not in aws_comprehend_language_choices:
616
+ gr.Info(f"Note that {full_language_name} is not supported by AWS Comprehend")
617
+ elif matched_language not in textract_language_choices:
618
+ gr.Info(f"Note that {full_language_name} is not supported by AWS Textract")
619
+ except Exception as e:
620
+ print(e)
621
+ gr.Info("Could not find language in list")
622
+ chosen_language_drop = gr.Dropdown(value = DEFAULT_LANGUAGE, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False)
623
+
624
+ return chosen_language_drop
625
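A rough sketch of how update_language_dropdown could be wired into the Gradio UI; the component names below are hypothetical and not the app's actual variables.

# Hypothetical Gradio components, for illustration only
chosen_language_full_name_drop = gr.Dropdown(choices=MAPPED_LANGUAGE_CHOICES, label="Language")
chosen_language_drop = gr.Dropdown(choices=LANGUAGE_CHOICES, label="Chosen language short code")

# When the full language name changes, refresh the short-code dropdown and surface any AWS support warnings
chosen_language_full_name_drop.change(
    fn=update_language_dropdown,
    inputs=chosen_language_full_name_drop,
    outputs=chosen_language_drop,
)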
 
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -1,48 +1,255 @@
1
  from typing import List
2
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
3
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts, NerModelConfiguration
4
-
5
  import spacy
6
- from spacy.matcher import Matcher, PhraseMatcher
7
  from spaczz.matcher import FuzzyMatcher
8
  spacy.prefer_gpu()
9
  from spacy.cli.download import download
10
  import Levenshtein
11
  import re
 
 
12
  import gradio as gr
 
13
 
14
- model_name = "en_core_web_lg" #"en_core_web_sm" #"en_core_web_trf"
15
  score_threshold = 0.001
16
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
17
 
18
- #Load spacy model
19
- try:
20
- import en_core_web_lg #en_core_web_sm
21
- nlp = en_core_web_lg.load() #en_core_web_sm.load()
22
- print("Successfully imported spaCy model")
23
-
24
- except:
25
- download(model_name)
26
- nlp = spacy.load(model_name)
27
- print("Successfully downloaded and imported spaCy model", model_name)
28
-
29
  # Create a class inheriting from SpacyNlpEngine
30
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
31
- def __init__(self, loaded_spacy_model):
32
  super().__init__(ner_model_configuration=NerModelConfiguration(labels_to_ignore=["CARDINAL", "ORDINAL"])) # Ignore non-relevant labels
33
- self.nlp = {"en": loaded_spacy_model}
34
 
35
- # Pass the loaded model to the new LoadedSpacyNlpEngine
36
- loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
 
 
37
 
 
 
38
 
39
- nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
40
- default_score_threshold=score_threshold,
41
- supported_languages=["en"],
42
- log_decision_process=False,
43
- ) # New custom recognisers based on the following functions are added at the end of this script
44
 
45
- # #### Custom recognisers
46
  def custom_word_list_recogniser(custom_list:List[str]=[]):
47
  # Create regex pattern, handling quotes carefully
48
 
@@ -297,7 +504,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
297
 
298
  return all_start_positions, all_end_positions
299
 
300
-
301
  class CustomWordFuzzyRecognizer(EntityRecognizer):
302
  def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
303
  super().__init__(supported_entities=supported_entities)
@@ -332,10 +538,79 @@ custom_list_default = []
332
  custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
333
 
334
 
335
- # Add custom recognisers to nlp_analyser
336
- nlp_analyser.registry.add_recognizer(street_recogniser)
337
- nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
338
- nlp_analyser.registry.add_recognizer(titles_recogniser)
339
- nlp_analyser.registry.add_recognizer(custom_recogniser)
340
- nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
341
 
 
1
  from typing import List
2
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
3
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts, NerModelConfiguration
 
4
  import spacy
5
+ from spacy.matcher import Matcher
6
  from spaczz.matcher import FuzzyMatcher
7
  spacy.prefer_gpu()
8
  from spacy.cli.download import download
9
  import Levenshtein
10
  import re
11
+ import os
12
+ import requests
13
  import gradio as gr
14
+ from tools.config import DEFAULT_LANGUAGE, TESSERACT_FOLDER
15
 
 
16
  score_threshold = 0.001
17
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
18
 
19
  # Create a class inheriting from SpacyNlpEngine
20
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
21
+ def __init__(self, loaded_spacy_model, language_code: str):
22
  super().__init__(ner_model_configuration=NerModelConfiguration(labels_to_ignore=["CARDINAL", "ORDINAL"])) # Ignore non-relevant labels
23
+ self.nlp = {language_code: loaded_spacy_model}
24
 
25
+ def _base_language_code(language: str) -> str:
26
+ lang = _normalize_language_input(language)
27
+ if "_" in lang:
28
+ return lang.split("_")[0]
29
+ return lang
30
+
31
+ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
32
+ """
33
+ Load a spaCy model for the requested language and return it as `nlp`.
34
+
35
+ Accepts common inputs like: "en", "en_lg", "en_sm", "de", "fr", "es", "it", "nl", "pt", "zh", "ja", "xx".
36
+ Falls back through sensible candidates and will download if missing.
37
+ """
38
+
39
+ synonyms = {
40
+ "english": "en",
41
+ "catalan": "ca",
42
+ "danish": "da",
43
+ "german": "de",
44
+ "french": "fr",
45
+ "greek": "el",
46
+ "finnish": "fi",
47
+ "croatian": "hr",
48
+ "lithuanian": "lt",
49
+ "macedonian": "mk",
50
+ "norwegian_bokmaal": "nb",
51
+ "polish": "pl",
52
+ "russian": "ru",
53
+ "slovenian": "sl",
54
+ "swedish": "sv",
55
+ "dutch": "nl",
56
+ "portuguese": "pt",
57
+ "chinese": "zh",
58
+ "japanese": "ja",
59
+ "multilingual": "xx",
60
+ }
61
+
62
+ lang_norm = _normalize_language_input(language)
63
+ lang_norm = synonyms.get(lang_norm, lang_norm)
64
+ base_lang = _base_language_code(lang_norm)
65
+
66
+ candidates_by_lang = {
67
+ # English
68
+ "en": [
69
+ "en_core_web_lg",
70
+ "en_core_web_trf",
71
+ "en_core_web_md",
72
+ "en_core_web_sm",
73
+ ],
74
+ "en_lg": ["en_core_web_lg"],
75
+ "en_trf": ["en_core_web_trf"],
76
+ "en_md": ["en_core_web_md"],
77
+ "en_sm": ["en_core_web_sm"],
78
+
79
+ # Major languages (news pipelines)
80
+ "ca": ["ca_core_news_lg", "ca_core_news_md", "ca_core_news_sm"], # Catalan
81
+ "da": ["da_core_news_lg", "da_core_news_md", "da_core_news_sm"], # Danish
82
+ "de": ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"], # German
83
+ "el": ["el_core_news_lg", "el_core_news_md", "el_core_news_sm"], # Greek
84
+ "es": ["es_core_news_lg", "es_core_news_md", "es_core_news_sm"], # Spanish
85
+ "fi": ["fi_core_news_lg", "fi_core_news_md", "fi_core_news_sm"], # Finnish
86
+ "fr": ["fr_core_news_lg", "fr_core_news_md", "fr_core_news_sm"], # French
87
+ "hr": ["hr_core_news_lg", "hr_core_news_md", "hr_core_news_sm"], # Croatian
88
+ "it": ["it_core_news_lg", "it_core_news_md", "it_core_news_sm"], # Italian
89
+ "ja": ["ja_core_news_lg", "ja_core_news_md", "ja_core_news_sm"], # Japanese
90
+ "ko": ["ko_core_news_lg", "ko_core_news_md", "ko_core_news_sm"], # Korean
91
+ "lt": ["lt_core_news_lg", "lt_core_news_md", "lt_core_news_sm"], # Lithuanian
92
+ "mk": ["mk_core_news_lg", "mk_core_news_md", "mk_core_news_sm"], # Macedonian
93
+ "nb": ["nb_core_news_lg", "nb_core_news_md", "nb_core_news_sm"], # Norwegian Bokmål
94
+ "nl": ["nl_core_news_lg", "nl_core_news_md", "nl_core_news_sm"], # Dutch
95
+ "pl": ["pl_core_news_lg", "pl_core_news_md", "pl_core_news_sm"], # Polish
96
+ "pt": ["pt_core_news_lg", "pt_core_news_md", "pt_core_news_sm"], # Portuguese
97
+ "ro": ["ro_core_news_lg", "ro_core_news_md", "ro_core_news_sm"], # Romanian
98
+ "ru": ["ru_core_news_lg", "ru_core_news_md", "ru_core_news_sm"], # Russian
99
+ "sl": ["sl_core_news_lg", "sl_core_news_md", "sl_core_news_sm"], # Slovenian
100
+ "sv": ["sv_core_news_lg", "sv_core_news_md", "sv_core_news_sm"], # Swedish
101
+ "uk": ["uk_core_news_lg", "uk_core_news_md", "uk_core_news_sm"], # Ukrainian
102
+ "zh": ["zh_core_web_lg", "zh_core_web_md", "zh_core_web_sm", "zh_core_web_trf"], # Chinese
103
+
104
+ # Multilingual NER
105
+ "xx": ["xx_ent_wiki_sm"],
106
+ }
107
+
108
+ if lang_norm in candidates_by_lang:
109
+ candidates = candidates_by_lang[lang_norm]
110
+ elif base_lang in candidates_by_lang:
111
+ candidates = candidates_by_lang[base_lang]
112
+ else:
113
+ # Fallback to multilingual if unknown
114
+ candidates = candidates_by_lang["xx"]
115
+
116
+ last_error = None
117
+ for candidate in candidates:
118
+ # Try importable package first (fast-path when installed as a package)
119
+ try:
120
+ module = __import__(candidate)
121
+ print(f"Successfully imported spaCy model: {candidate}")
122
+ return module.load()
123
+ except Exception as e:
124
+ last_error = e
125
+
126
+ # Try spacy.load if package is linked/installed
127
+ try:
128
+ nlp = spacy.load(candidate)
129
+ print(f"Successfully loaded spaCy model via spacy.load: {candidate}")
130
+ return nlp
131
+ except Exception as e:
132
+ last_error = e
133
+
134
+ # Check if model is already downloaded before attempting to download
135
+ try:
136
+ # Try to load the model to see if it's already available
137
+ nlp = spacy.load(candidate)
138
+ print(f"Model {candidate} is already available, skipping download")
139
+ return nlp
140
+ except OSError:
141
+ # Model not found, proceed with download
142
+ pass
143
+ except Exception as e:
144
+ last_error = e
145
+ continue
146
+
147
+ # Attempt to download then load
148
+ try:
149
+ print(f"Downloading spaCy model: {candidate}")
150
+ download(candidate)
151
+ nlp = spacy.load(candidate)
152
+ print(f"Successfully downloaded and loaded spaCy model: {candidate}")
153
+ return nlp
154
+ except Exception as e:
155
+ last_error = e
156
+ continue
157
+
158
+ raise RuntimeError(f"Failed to load spaCy model for language '{language}'. Last error: {last_error}")
159
+
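A short usage sketch for load_spacy_model; whether a download happens depends on which pipelines are already installed in the environment.

# Load the best available German pipeline (de_core_news_lg, then _md, then _sm), downloading if missing
nlp_de = load_spacy_model("de")
doc = nlp_de("Angela Merkel besuchte Berlin im Mai.")
print([(ent.text, ent.label_) for ent in doc.ents])

# Unrecognised codes fall back to the multilingual NER pipeline xx_ent_wiki_sm
nlp_fallback = load_spacy_model("sw")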
160
+ # Helper for normalising language inputs (used by the language-aware loader above)
161
+ def _normalize_language_input(language: str) -> str:
162
+ return language.strip().lower().replace("-", "_")
163
+
164
+ # Module-level defaults; the spaCy model itself is loaded lazily in create_nlp_analyser below
165
+ ACTIVE_LANGUAGE_CODE = _base_language_code(DEFAULT_LANGUAGE)
166
+ nlp = None # Placeholder, will be loaded in the create_nlp_analyser function below #load_spacy_model(DEFAULT_LANGUAGE)
167
+
168
+ def get_tesseract_lang_code(short_code:str):
169
+ """
170
+ Maps a two-letter language code to the corresponding Tesseract OCR code.
171
+
172
+ Args:
173
+ short_code (str): The two-letter language code (e.g., "en", "de").
174
+
175
+ Returns:
176
+ str or None: The Tesseract language code (e.g., "eng", "deu"),
177
+ or None if no mapping is found.
178
+ """
179
+ # Mapping from 2-letter codes to Tesseract 3-letter codes
180
+ # Based on ISO 639-2/T codes.
181
+ lang_map = {
182
+ "en": "eng",
183
+ "de": "deu",
184
+ "fr": "fra",
185
+ "es": "spa",
186
+ "it": "ita",
187
+ "nl": "nld",
188
+ "pt": "por",
189
+ "zh": "chi_sim", # Mapping to Simplified Chinese by default
190
+ "ja": "jpn",
191
+ "ko": "kor",
192
+ "lt": "lit",
193
+ "mk": "mkd",
194
+ "nb": "nor",
195
+ "pl": "pol",
196
+ "ro": "ron",
197
+ "ru": "rus",
198
+ "sl": "slv",
199
+ "sv": "swe",
200
+ "uk": "ukr"
201
+ }
202
+
203
+ return lang_map.get(short_code)
204
+
205
+ def download_tesseract_lang_pack(short_lang_code:str, tessdata_dir=TESSERACT_FOLDER + "/tessdata"):
206
+ """
207
+ Downloads a Tesseract language pack to a local directory.
208
+
209
+ Args:
210
+ short_lang_code (str): The two-letter language code (e.g., "en", "fr"), which is mapped to the corresponding Tesseract code (e.g., "eng", "fra").
211
+ tessdata_dir (str, optional): The directory to save the language pack.
212
+ Defaults to a "tessdata" subfolder of TESSERACT_FOLDER.
213
+ """
214
+
215
+ # Create the directory if it doesn't exist
216
+ if not os.path.exists(tessdata_dir):
217
+ os.makedirs(tessdata_dir)
218
+
219
+ # Get the Tesseract language code
220
+ lang_code = get_tesseract_lang_code(short_lang_code)
221
+
222
+ if lang_code is None:
223
+ raise ValueError(f"Language code {short_lang_code} not found in Tesseract language map")
224
+
225
+ # Set the local file path
226
+ file_path = os.path.join(tessdata_dir, f"{lang_code}.traineddata")
227
+
228
+ # Check if the file already exists
229
+ if os.path.exists(file_path):
230
+ print(f"Language pack {lang_code}.traineddata already exists at {file_path}")
231
+ return file_path
232
+
233
+ # Construct the URL for the language pack
234
+ url = f"https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/{lang_code}.traineddata"
235
+
236
+ # Download the file
237
+ try:
238
+ response = requests.get(url, stream=True)
239
+ response.raise_for_status() # Raise an exception for bad status codes
240
+
241
+ with open(file_path, "wb") as f:
242
+ for chunk in response.iter_content(chunk_size=8192):
243
+ f.write(chunk)
244
 
245
+ print(f"Successfully downloaded {lang_code}.traineddata to {file_path}")
246
+ return file_path
247
 
248
+ except requests.exceptions.RequestException as e:
249
+ print(f"Error downloading {lang_code}.traineddata: {e}")
250
+ return None
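As a usage sketch, a caller might download a pack once and point Tesseract at the containing directory via TESSDATA_PREFIX; how the returned path is actually consumed elsewhere in the app is an assumption here.

# Download the French pack (fra.traineddata) if it is not already present
traineddata_path = download_tesseract_lang_pack("fr")
if traineddata_path:
    # Assumed consumption pattern: tell Tesseract where the trained data lives
    os.environ["TESSDATA_PREFIX"] = os.path.dirname(traineddata_path)
    # e.g. pytesseract.image_to_string(image, lang="fra")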
 
 
251
 
252
+ #### Custom recognisers
253
  def custom_word_list_recogniser(custom_list:List[str]=[]):
254
  # Create regex pattern, handling quotes carefully
255
 
 
504
 
505
  return all_start_positions, all_end_positions
506
 
 
507
  class CustomWordFuzzyRecognizer(EntityRecognizer):
508
  def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
509
  super().__init__(supported_entities=supported_entities)
 
538
  custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
539
 
540
 
541
+ # Placeholder engine: `nlp` is still None at this point; per-language engines are built in create_nlp_analyser below
542
+ loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp, language_code = ACTIVE_LANGUAGE_CODE)
543
+
544
+
545
+ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str] = None,
546
+ spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None):
547
+ """
548
+ Create an nlp_analyser object based on the specified language input.
549
+
550
+ Args:
551
+ language (str): Language code (e.g., "en", "de", "fr", "es", etc.)
552
+ custom_list (List[str], optional): List of custom words to recognize. Defaults to None.
553
+ spelling_mistakes_max (int, optional): Maximum number of spelling mistakes for fuzzy matching. Defaults to 1.
554
+ search_whole_phrase (bool, optional): Whether to search for whole phrases or individual words. Defaults to True.
+ existing_nlp_analyser (AnalyzerEngine, optional): An already-configured analyser to reuse if it matches the requested language. Defaults to None.
555
+
556
+ Returns:
557
+ AnalyzerEngine: Configured nlp_analyser object with custom recognizers
558
+ """
559
+ print("existing_nlp_analyser:", existing_nlp_analyser)
+
+ # Reuse an existing analyser if it already supports the requested language
+ if existing_nlp_analyser is not None and existing_nlp_analyser.supported_languages[0] == language:
+ print(f"Using existing nlp_analyser for {language}")
+ return existing_nlp_analyser
568
+
569
+ # Load spaCy model for the specified language
570
+ nlp_model = load_spacy_model(language)
571
+
572
+ # Get base language code
573
+ base_lang_code = _base_language_code(language)
574
+
575
+ # Create custom recognizers
576
+ if custom_list is None:
577
+ custom_list = []
578
+
579
+ custom_recogniser = custom_word_list_recogniser(custom_list)
580
+ custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(
581
+ supported_entities=["CUSTOM_FUZZY"],
582
+ custom_list=custom_list,
583
+ spelling_mistakes_max=spelling_mistakes_max,
584
+ search_whole_phrase=search_whole_phrase
585
+ )
586
+
587
+ # Create NLP engine with loaded model
588
+ loaded_nlp_engine = LoadedSpacyNlpEngine(
589
+ loaded_spacy_model=nlp_model,
590
+ language_code=base_lang_code
591
+ )
592
+
593
+ # Create analyzer engine
594
+ nlp_analyser = AnalyzerEngine(
595
+ nlp_engine=loaded_nlp_engine,
596
+ default_score_threshold=score_threshold,
597
+ supported_languages=[base_lang_code],
598
+ log_decision_process=False,
599
+ )
600
+
601
+ # Add custom recognizers to nlp_analyser
602
+ nlp_analyser.registry.add_recognizer(custom_recogniser)
603
+ nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
604
+
605
+ # Add language-specific recognizers for English
606
+ if base_lang_code == "en":
607
+ nlp_analyser.registry.add_recognizer(street_recogniser)
608
+ nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
609
+ nlp_analyser.registry.add_recognizer(titles_recogniser)
610
+
611
+ return nlp_analyser
612
+
613
+ # Create the default nlp_analyser using the new function
614
+ nlp_analyser = create_nlp_analyser(DEFAULT_LANGUAGE)
615
+
616
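As a rough usage sketch of the new factory function (the text and custom phrase are made up, and entity availability depends on the loaded spaCy model and the recognisers registered for the language):

# Build a German analyser with a small custom deny list
analyser_de = create_nlp_analyser("de", custom_list=["Projekt Falke"])

results = analyser_de.analyze(
    text="Max Mustermann arbeitet am Projekt Falke in Berlin.",
    language="de",
    entities=["PERSON", "LOCATION", "CUSTOM"],
)
for res in results:
    print(res.entity_type, res.start, res.end, res.score)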