Commit 9ae09da · 1 parent: 003292d

Added support for other languages. Improved DynamoDB download

Files changed:
- Dockerfile +17 -6
- app.py +33 -33
- load_dynamo_logs.py +45 -8
- tools/aws_textract.py +0 -1
- tools/config.py +22 -2
- tools/custom_image_analyser_engine.py +108 -9
- tools/data_anonymise.py +37 -15
- tools/file_conversion.py +5 -1
- tools/file_redaction.py +83 -23
- tools/find_duplicate_pages.py +1 -3
- tools/helper_functions.py +41 -9
- tools/load_spacy_model_custom_recognisers.py +306 -31
Dockerfile
CHANGED

@@ -54,7 +54,9 @@ ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
    ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
    USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
    CONFIG_FOLDER=$APP_HOME/app/config/ \
-   XDG_CACHE_HOME=/tmp/xdg_cache/user_1000
+   XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
+   TESSERACT_FOLDER=/usr/bin/tesseract \
+   TESSERACT_DATA_FOLDER=/usr/share/tessdata

# Create the base application directory and set its ownership
RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app

@@ -83,17 +85,23 @@ RUN mkdir -p \
    ${APP_HOME}/app/feedback \
    ${APP_HOME}/app/config

- # Now handle the /tmp and /var/tmp directories and their subdirectories
+ # Now handle the /tmp and /var/tmp directories and their subdirectories, paddle, spacy, tessdata
RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
    && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
    && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
-   && chmod 700 ${XDG_CACHE_HOME}
-
- RUN mkdir -p ${APP_HOME}/.paddlex/official_models \
+   && chmod 700 ${XDG_CACHE_HOME} \
+   && mkdir -p ${APP_HOME}/.paddlex/official_models \
    && chown user:user \
    ${APP_HOME}/.paddlex/official_models \
    && chmod 755 \
    ${APP_HOME}/.paddlex/official_models
+   && mkdir -p ${APP_HOME}/.local/share/spacy/data \
+   && chown user:user \
+   ${APP_HOME}/.local/share/spacy/data \
+   && chmod 755 \
+   ${APP_HOME}/.local/share/spacy/data \
+   mkdir -p /usr/share/tessdata && \
+   chmod 755 /usr/share/tessdata # Create tessdata directory and set permissions

# Copy installed packages from builder stage
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/

@@ -122,6 +130,8 @@ VOLUME ["/home/user/app/usage"]
VOLUME ["/home/user/app/feedback"]
VOLUME ["/home/user/app/config"]
VOLUME ["/home/user/.paddlex/official_models"]
+ VOLUME ["/home/user/.local/share/spacy/data"]
+ VOLUME ["/usr/share/tessdata"]
VOLUME ["/tmp"]
VOLUME ["/var/tmp"]

@@ -134,7 +144,8 @@ ENV PATH=$APP_HOME/.local/bin:$PATH \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860 \
-   GRADIO_ANALYTICS_ENABLED=False
+   GRADIO_ANALYTICS_ENABLED=False \
+

ENTRYPOINT ["/entrypoint.sh"]
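The new TESSERACT_DATA_FOLDER (/usr/share/tessdata) is created, made world-readable, and exposed as a volume so that extra Tesseract language packs can live outside the image. app.py (below) imports a download_tesseract_lang_pack helper from tools.load_spacy_model_custom_recognisers, but its implementation is not part of the diffs captured here; the following is only a rough sketch of what such a helper could do, assuming the standard file layout of the tessdata repository that tools/config.py links to:

import os
import urllib.request

def download_tesseract_lang_pack(lang_code: str, tessdata_dir: str = "/usr/share/tessdata") -> str:
    """Hypothetical sketch: fetch <lang_code>.traineddata into TESSERACT_DATA_FOLDER
    if it is not already present, and return the path to the file."""
    os.makedirs(tessdata_dir, exist_ok=True)
    target_path = os.path.join(tessdata_dir, f"{lang_code}.traineddata")
    if not os.path.exists(target_path):
        # Assumed URL pattern for the official tessdata repository; the real helper may differ
        url = f"https://github.com/tesseract-ocr/tessdata/raw/main/{lang_code}.traineddata"
        urllib.request.urlretrieve(url, target_path)
    return target_path

Note that Tesseract expects three-letter codes (for example fra rather than fr), so the real helper presumably maps the short code first, much like the _tesseract_lang_code mapping added to tools/custom_image_analyser_engine.py further down.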
app.py
CHANGED

@@ -2,15 +2,15 @@ import os
import pandas as pd
import gradio as gr
from gradio_image_annotation import image_annotator
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER,
- from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, DEFAULT_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, CHOSEN_LOCAL_OCR_MODEL, DEFAULT_LANGUAGE_FULL_NAME, SHOW_LANGUAGE_SELECTION
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_relevant_ocr_output_with_words, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists, reset_ocr_with_words_base_dataframe, update_language_dropdown, LANGUAGE_CHOICES, MAPPED_LANGUAGE_CHOICES
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
from tools.file_redaction import choose_and_run_redactor
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top, store_duplicate_selection, create_annotation_objects_from_filtered_ocr_results_with_words, df_select_callback_dataframe_row_ocr_with_words, update_redact_choice_df_from_page_dropdown, get_all_rows_with_same_text_redact
from tools.data_anonymise import anonymise_files_with_open_text
from tools.auth import authenticate_user
- from tools.load_spacy_model_custom_recognisers import custom_entities
+ from tools.load_spacy_model_custom_recognisers import custom_entities, load_spacy_model, download_tesseract_lang_pack
from tools.custom_csvlogger import CSVLogger_custom
from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates, run_full_search_and_analysis
from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs

@@ -33,6 +33,8 @@ if SAVE_LOGS_TO_CSV == "True": SAVE_LOGS_TO_CSV = True
else: SAVE_LOGS_TO_CSV = False
if SAVE_LOGS_TO_DYNAMODB == "True": SAVE_LOGS_TO_DYNAMODB = True
else: SAVE_LOGS_TO_DYNAMODB = False
+ if SHOW_LANGUAGE_SELECTION == "True": SHOW_LANGUAGE_SELECTION = True
+ else: SHOW_LANGUAGE_SELECTION = False

if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)

@@ -244,6 +246,10 @@ with app:
## Duplicate search object
new_duplicate_search_annotation_object = gr.Dropdown(value=None, label="new_duplicate_search_annotation_object", allow_custom_value=True, visible=False)

+ # Spacy analyser state
+ updated_nlp_analyser_state = gr.State([])
+ tesseract_lang_data_file_path = gr.Textbox("", visible=False)
+
###
# UI DESIGN
###

@@ -588,10 +594,18 @@
page_min = gr.Number(value=0, precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
page_max = gr.Number(value=0, precision=0,minimum=0,maximum=9999, label="Highest page to redact")

-
-
-
+ if SHOW_LANGUAGE_SELECTION:
+     with gr.Accordion("Language selection", open=False):
+         gr.Markdown("""Note that AWS Textract is only compatible with English, Spanish, Italian, Portuguese, French, and German, and handwriting detection is only available in English. AWS Comprehend is additionally compatible with Arabic, Hindi, Japanese, Korean, Chinese, and Chinese (Traditional).
+         The local models (Tesseract and SpaCy) are compatible with the other languages in the list below. However, the language packs for these models need to be installed on your system. When you first run a document through the app, the language packs will be downloaded automatically, but please expect a delay as the models are large.""")
+         with gr.Row():
+             chosen_language_full_name_drop = gr.Dropdown(value = DEFAULT_LANGUAGE_FULL_NAME, choices = MAPPED_LANGUAGE_CHOICES, label="Chosen language", multiselect=False, visible=True)
+             chosen_language_drop = gr.Dropdown(value = DEFAULT_LANGUAGE, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False, visible=True)
+ else:
+     chosen_language_full_name_drop = gr.Dropdown(value = DEFAULT_LANGUAGE_FULL_NAME, choices = MAPPED_LANGUAGE_CHOICES, label="Chosen language", multiselect=False, visible=False)
+     chosen_language_drop = gr.Dropdown(value = DEFAULT_LANGUAGE, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False, visible=False)

+ with gr.Accordion("Use API keys for AWS services", open = False):
with gr.Row():
aws_access_key_textbox = gr.Textbox(value='', label="AWS access key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")
aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")

@@ -651,15 +665,11 @@
# Run redaction function
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
- success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state,
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], api_name="redact_doc")
-
- # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
- # current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox],
- # outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state])

# If a file has been completed, the function will continue onto the next document
- latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state,
+ latest_file_completed_num.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\

@@ -689,7 +699,7 @@
success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox]).\
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
- success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state,
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, chosen_local_model_textbox, chosen_language_drop],
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, input_pdf_for_review, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state]).\
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])

@@ -889,13 +899,11 @@
success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list, total_pdf_page_count])

tabular_data_redact_btn.click(reset_data_vars, outputs=[actual_time_taken_number, log_files_output_list_state, comprehend_query_number]).\
- success(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames,
- success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
+ success(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number, chosen_language_drop], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number], api_name="redact_data")

- # Currently only supports redacting one data file at a time, following code block not used
# If the output file count text box changes, keep going with redacting each data file until done
-
-
+ text_tabular_files_done.change(fn=anonymise_files_with_open_text, inputs=[in_data_files, in_text, anon_strat, in_colnames, chosen_language_drop, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number, chosen_language_drop], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number]).\
+ success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])

###
# IDENTIFY DUPLICATE PAGES

@@ -966,7 +974,12 @@
merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)

#
- all_output_files_btn.click(fn=load_all_output_files, inputs=output_folder_textbox, outputs=all_output_files)
+ all_output_files_btn.click(fn=load_all_output_files, inputs=output_folder_textbox, outputs=all_output_files)
+
+ # Language selection dropdown
+ chosen_language_full_name_drop.select(update_language_dropdown, inputs=[chosen_language_full_name_drop], outputs=[chosen_language_drop])#.\
+ #success(download_tesseract_lang_pack, inputs=[chosen_language_drop], outputs = [tesseract_lang_data_file_path]).\
+ #success(load_spacy_model, inputs=[chosen_language_drop], outputs=[updated_nlp_analyser_state])

###
# APP LOAD AND LOGGING

@@ -1082,17 +1095,4 @@ if __name__ == "__main__":

main(first_loop_state, latest_file_completed=0, redaction_output_summary_textbox="", output_file_list=None,
log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
- current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_page_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),CHOSEN_COMPREHEND_ENTITIES = CHOSEN_COMPREHEND_ENTITIES, CHOSEN_REDACT_ENTITIES = CHOSEN_REDACT_ENTITIES, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
-
- # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
- # with gr.Tab(label="Advanced options"):
- #     with gr.Accordion(label = "AWS data access", open = True):
- #         aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
- #         with gr.Row():
- #             in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
- #             load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
-
- #         aws_log_box = gr.Textbox(label="AWS data load status")
-
- #         ### Loading AWS data ###
- #         load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_doc_files, aws_log_box])
+ current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_page_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),CHOSEN_COMPREHEND_ENTITIES = CHOSEN_COMPREHEND_ENTITIES, CHOSEN_REDACT_ENTITIES = CHOSEN_REDACT_ENTITIES, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
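The full-name dropdown drives the short-code dropdown through update_language_dropdown, which is imported from tools.helper_functions together with the parallel LANGUAGE_CHOICES and MAPPED_LANGUAGE_CHOICES lists. That helper's implementation is not part of this diff, so the following is only a minimal sketch, assuming the two lists map onto each other position for position:

# Parallel lists as configured in tools/config.py: 'english' pairs with 'en', 'french' with 'fr', and so on
MAPPED_LANGUAGE_CHOICES = ['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese']
LANGUAGE_CHOICES = ['en', 'fr', 'de', 'es', 'it', 'nl', 'pt']

def update_language_dropdown(chosen_language_full_name: str) -> str:
    """Hypothetical sketch: return the short language code matching the selected full name,
    falling back to English if the name is not recognised."""
    try:
        return LANGUAGE_CHOICES[MAPPED_LANGUAGE_CHOICES.index(chosen_language_full_name)]
    except ValueError:
        return 'en'

Returning the plain string is enough for Gradio to update the value of the chosen_language_drop output component, and the commented-out chain in the diff shows the intended next steps: downloading the Tesseract language pack and reloading the spaCy model for the chosen language.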
load_dynamo_logs.py
CHANGED

@@ -1,6 +1,7 @@
import boto3
import csv
from decimal import Decimal
+ import datetime
from boto3.dynamodb.conditions import Key

from tools.config import AWS_REGION, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, OUTPUT_FOLDER

@@ -16,11 +17,26 @@ table = dynamodb.Table(TABLE_NAME)

# Helper function to convert Decimal to float or int
def convert_types(item):
+     new_item = {}
    for key, value in item.items():
+         # Handle Decimals first
        if isinstance(value, Decimal):
-
-
-
+             new_item[key] = int(value) if value % 1 == 0 else float(value)
+         # Handle Strings that might be dates
+         elif isinstance(value, str):
+             try:
+                 # Attempt to parse a common ISO 8601 format.
+                 # The .replace() handles the 'Z' for Zulu/UTC time.
+                 dt_obj = datetime.datetime.fromisoformat(value.replace('Z', '+00:00'))
+                 # Now that we have a datetime object, format it as desired
+                 new_item[key] = dt_obj.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
+             except (ValueError, TypeError):
+                 # If it fails to parse, it's just a regular string
+                 new_item[key] = value
+         # Handle all other types
+         else:
+             new_item[key] = value
+     return new_item

# Paginated scan
def scan_table():

@@ -35,22 +51,43 @@ def scan_table():
    return items

# Export to CSV
-
+ # Export to CSV
+ def export_to_csv(items, output_path, fields_to_drop: list = None):
    if not items:
        print("No items found.")
        return

+     # Use a set for efficient lookup
+     drop_set = set(fields_to_drop or [])
+
+     # Get a comprehensive list of all possible headers from all items
+     all_keys = set()
+     for item in items:
+         all_keys.update(item.keys())
+
+     # Determine the final fieldnames by subtracting the ones to drop
+     fieldnames = sorted(list(all_keys - drop_set))
+
+     print("Final CSV columns will be:", fieldnames)
-
-     with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
-
+     with open(output_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
+         # The key fix is here: extrasaction='ignore'
+         # restval='' is also good practice to handle rows that are missing a key
+         writer = csv.DictWriter(
+             csvfile,
+             fieldnames=fieldnames,
+             extrasaction='ignore',
+             restval=''
+         )
        writer.writeheader()

        for item in items:
+             # The convert_types function can now return the full dict,
+             # and the writer will simply ignore the extra fields.
            writer.writerow(convert_types(item))

    print(f"Exported {len(items)} items to {output_path}")

# Run export
items = scan_table()
- export_to_csv(items, CSV_OUTPUT)
+ export_to_csv(items, CSV_OUTPUT, fields_to_drop=[])
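With convert_types now returning a fresh dictionary, Decimals come back as int or float, ISO 8601 strings are rewritten to a millisecond-precision timestamp, and anything else passes through untouched. A small illustration of the expected behaviour, using convert_types as defined above (field names and values are made up for the example):

from decimal import Decimal

item = {
    "id": "abc123",                               # ordinary string: unchanged
    "number_of_pages": Decimal("3"),              # whole Decimal -> 3 (int)
    "estimated_time_taken": Decimal("2.5"),       # fractional Decimal -> 2.5 (float)
    "timestamp": "2024-05-01T12:30:45.123456Z",   # ISO 8601 -> "2024-05-01 12:30:45.123"
}

print(convert_types(item))
# {'id': 'abc123', 'number_of_pages': 3, 'estimated_time_taken': 2.5,
#  'timestamp': '2024-05-01 12:30:45.123'}

Because export_to_csv builds its header from the union of every item's keys and passes extrasaction='ignore' to csv.DictWriter, records with differing fields no longer raise an error, and anything listed in fields_to_drop is simply left out of the output.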
tools/aws_textract.py
CHANGED

@@ -278,7 +278,6 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_

    return all_ocr_results_with_page, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_words_with_page

-
def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
    """
    Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
tools/config.py
CHANGED

@@ -195,7 +195,8 @@ FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FIL
###

# Create Tesseract and Poppler folders if you have installed them locally
- TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
+ TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "/usr/bin/tesseract") # # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
+ TESSERACT_DATA_FOLDER = get_or_create_env_var('TESSERACT_DATA_FOLDER', "/usr/share/tessdata")
POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # If installing on Windows,install Poppler from here https://github.com/oschwartz10612/poppler-windows. This variable needs to point to the poppler bin folder e.g. poppler/poppler-24.02.0/Library/bin/

if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)

@@ -288,7 +289,26 @@ MAX_TIME_VALUE = get_or_create_env_var('MAX_TIME_VALUE', '999999')

CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour

-
+ ### Language selection options
+
+ SHOW_LANGUAGE_SELECTION = get_or_create_env_var("SHOW_LANGUAGE_SELECTION", "False")
+
+ DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var("DEFAULT_LANGUAGE_FULL_NAME", "english")
+ DEFAULT_LANGUAGE = get_or_create_env_var("DEFAULT_LANGUAGE", "en") # For tesseract, ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system. You can find the relevant language packs here: https://github.com/tesseract-ocr/tessdata.
+ # For paddle, ensure the paddle language data (e.g., fra.traineddata) is installed on your system. You can find information on supported languages here: https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html
+ # For AWS Comprehend, ensure the language data is installed on your system. You can find the relevant language packs here: https://docs.aws.amazon.com/comprehend/latest/dg/supported-languages.html: ('en'|'es'|'fr'|'de'|'it'|'pt'|'ar'|'hi'|'ja'|'ko'|'zh'|'zh-TW')
+ # AWS Textract automatically detects the language of the document and supports the following languages: https://aws.amazon.com/textract/faqs/#topic-0. 'English, Spanish, Italian, Portuguese, French, German. Handwriting, Invoices and Receipts, Identity documents and Queries processing are in English only'
+
+ textract_language_choices = get_or_create_env_var("textract_language_choices", "['en', 'es', 'fr', 'de', 'it', 'pt']")
+ aws_comprehend_language_choices = get_or_create_env_var("aws_comprehend_language_choices", "['en', 'es', 'fr', 'de', 'it', 'pt', 'ar', 'hi', 'ja', 'ko', 'zh', 'zh-TW']")
+
+ # The choices that the user sees
+ MAPPED_LANGUAGE_CHOICES = get_or_create_env_var("MAPPED_LANGUAGE_CHOICES", "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']")
+ LANGUAGE_CHOICES = get_or_create_env_var("LANGUAGE_CHOICES", "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']")
+
+
+
+ ### File output options

RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
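The new language options are stored in environment variables as string-encoded Python lists (for example LANGUAGE_CHOICES above). app.py imports LANGUAGE_CHOICES and MAPPED_LANGUAGE_CHOICES from tools.helper_functions rather than directly from config, which suggests they are parsed from these strings there, presumably via the existing _get_env_list helper; that helper is not shown in this commit, so the snippet below is only one plausible way of doing the parsing, using ast.literal_eval with a comma-split fallback:

import ast

def parse_env_list(value: str) -> list:
    """Hypothetical sketch of an env-list parser; the project's _get_env_list may differ."""
    try:
        parsed = ast.literal_eval(value)
        return list(parsed) if isinstance(parsed, (list, tuple)) else [str(parsed)]
    except (ValueError, SyntaxError):
        # Fall back to a plain comma-separated string such as "en,fr,de"
        return [item.strip() for item in value.split(',') if item.strip()]

print(parse_env_list("['en', 'fr', 'de', 'es', 'it', 'nl', 'pt']"))
# ['en', 'fr', 'de', 'es', 'it', 'nl', 'pt']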
tools/custom_image_analyser_engine.py
CHANGED
|
@@ -16,7 +16,7 @@ from typing import Optional, Tuple, Union
|
|
| 16 |
from tools.helper_functions import clean_unicode_text
|
| 17 |
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
| 18 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
| 19 |
-
from tools.config import PREPROCESS_LOCAL_OCR_IMAGES
|
| 20 |
|
| 21 |
if PREPROCESS_LOCAL_OCR_IMAGES == "True": PREPROCESS_LOCAL_OCR_IMAGES = True
|
| 22 |
else: PREPROCESS_LOCAL_OCR_IMAGES = False
|
|
@@ -26,6 +26,86 @@ try:
|
|
| 26 |
except ImportError:
|
| 27 |
PaddleOCR = None
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class OCRResult:
    text: str

@@ -234,6 +314,7 @@ def rescale_ocr_data(ocr_data, scale_factor:float):
        ocr_data['height'][i] = h_orig

    return ocr_data
class CustomImageAnalyzerEngine:
    def __init__(
        self,

@@ -241,28 +322,38 @@ class CustomImageAnalyzerEngine:
        ocr_engine: str = "tesseract",
        tesseract_config: Optional[str] = None,
        paddle_kwargs: Optional[Dict[str, Any]] = None,
-       image_preprocessor: Optional[ImagePreprocessor] = None
    ):
        """
        Initializes the CustomImageAnalyzerEngine.

-       :param ocr_engine: The OCR engine to use ("tesseract" or "paddle").
        :param analyzer_engine: The Presidio AnalyzerEngine instance.
        :param tesseract_config: Configuration string for Tesseract.
        :param paddle_kwargs: Dictionary of keyword arguments for PaddleOCR constructor.
        :param image_preprocessor: Optional image preprocessor.
        """
        if ocr_engine not in ["tesseract", "paddle", "hybrid"]:
            raise ValueError("ocr_engine must be either 'tesseract', 'hybrid', or 'paddle'")

        self.ocr_engine = ocr_engine

        if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
            if PaddleOCR is None:
                raise ImportError("paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle'")
            # Default paddle configuration if none provided
            if paddle_kwargs is None:
-               paddle_kwargs = {'use_textline_orientation': True, 'lang':
            self.paddle_ocr = PaddleOCR(**paddle_kwargs)

        if not analyzer_engine:

@@ -394,7 +485,8 @@ class CustomImageAnalyzerEngine:
        tesseract_data = pytesseract.image_to_data(
            image,
            output_type=pytesseract.Output.DICT,
-           config=self.tesseract_config
        )

        #tesseract_data['abs_line_id'] = tesseract_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()

@@ -510,7 +602,8 @@ class CustomImageAnalyzerEngine:
        ocr_data = pytesseract.image_to_data(
            image,
            output_type=pytesseract.Output.DICT,
-           config=self.tesseract_config
        )

        #ocr_data['abs_line_id'] = ocr_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()

@@ -569,6 +662,7 @@ class CustomImageAnalyzerEngine:
        pii_identification_method: str = "Local",
        comprehend_client = "",
        custom_entities:List[str]=custom_entities,
        **text_analyzer_kwargs
    ) -> List[CustomImageRecognizerResult]:

@@ -586,10 +680,14 @@ class CustomImageAnalyzerEngine:
            # Note: We're not passing line_characters here since it's not needed for this use case
            page_text_mapping.append((start_pos, i, line_level_ocr_result, None))

        # Process using either Local or AWS Comprehend
        if pii_identification_method == "Local":
            analyzer_result = self.analyzer_engine.analyze(
                text=page_text,
                **text_analyzer_kwargs
            )
            all_text_line_results = map_back_entity_results(

@@ -609,6 +707,7 @@ class CustomImageAnalyzerEngine:
                text_analyzer_kwargs["entities"] = custom_redact_entities
                page_analyser_result = self.analyzer_engine.analyze(
                    text=page_text,
                    **text_analyzer_kwargs
                )
                all_text_line_results = map_back_entity_results(

@@ -641,7 +740,7 @@ class CustomImageAnalyzerEngine:
                    current_batch,
                    current_batch_mapping,
                    comprehend_client,
-
                    text_analyzer_kwargs.get('allow_list', []),
                    chosen_redact_comprehend_entities,
                    all_text_line_results

@@ -676,7 +775,7 @@ class CustomImageAnalyzerEngine:
                    current_batch,
                    current_batch_mapping,
                    comprehend_client,
-
                    text_analyzer_kwargs.get('allow_list', []),
                    chosen_redact_comprehend_entities,
                    all_text_line_results

@@ -988,7 +1087,7 @@ def run_page_text_redaction(
    comprehend_client = None,
    allow_list: List[str] = None,
    pii_identification_method: str = "Local",
-   nlp_analyser = None,
    score_threshold: float = 0.0,
    custom_entities: List[str] = None,
    comprehend_query_number:int = 0#,
Updated lines 16-22:
from tools.helper_functions import clean_unicode_text
from tools.presidio_analyzer_custom import recognizer_result_from_dict
from tools.load_spacy_model_custom_recognisers import custom_entities
+from tools.config import PREPROCESS_LOCAL_OCR_IMAGES, DEFAULT_LANGUAGE

if PREPROCESS_LOCAL_OCR_IMAGES == "True": PREPROCESS_LOCAL_OCR_IMAGES = True
else: PREPROCESS_LOCAL_OCR_IMAGES = False

Updated lines 26-111:
except ImportError:
    PaddleOCR = None

+# --- Language utilities ---
+def _normalize_lang(language: str) -> str:
+    return language.strip().lower().replace("-", "_") if language else "en"
+
+
+def _tesseract_lang_code(language: str) -> str:
+    """Map a user language input to a Tesseract traineddata code."""
+    lang = _normalize_lang(language)
+
+    mapping = {
+        # Common
+        "en": "eng", "eng": "eng",
+        "fr": "fra", "fre": "fra", "fra": "fra",
+        "de": "deu", "ger": "deu", "deu": "deu",
+        "es": "spa", "spa": "spa",
+        "it": "ita", "ita": "ita",
+        "nl": "nld", "dut": "nld", "nld": "nld",
+        "pt": "por", "por": "por",
+        "ru": "rus", "rus": "rus",
+        "ar": "ara", "ara": "ara",
+        # Nordics
+        "sv": "swe", "swe": "swe",
+        "no": "nor", "nb": "nor", "nn": "nor", "nor": "nor",
+        "fi": "fin", "fin": "fin",
+        "da": "dan", "dan": "dan",
+        # Eastern/Central
+        "pl": "pol", "pol": "pol",
+        "cs": "ces", "cz": "ces", "ces": "ces",
+        "hu": "hun", "hun": "hun",
+        "ro": "ron", "rum": "ron", "ron": "ron",
+        "bg": "bul", "bul": "bul",
+        "el": "ell", "gre": "ell", "ell": "ell",
+        # Asian
+        "ja": "jpn", "jp": "jpn", "jpn": "jpn",
+        "zh": "chi_sim", "zh_cn": "chi_sim", "zh_hans": "chi_sim", "chi_sim": "chi_sim",
+        "zh_tw": "chi_tra", "zh_hk": "chi_tra", "zh_tr": "chi_tra", "chi_tra": "chi_tra",
+        "hi": "hin", "hin": "hin",
+        "bn": "ben", "ben": "ben",
+        "ur": "urd", "urd": "urd",
+        "fa": "fas", "per": "fas", "fas": "fas",
+    }
+
+    return mapping.get(lang, "eng")
+
+
+def _paddle_lang_code(language: str) -> str:
+    """Map a user language input to a PaddleOCR language code.
+
+    PaddleOCR supports codes like: 'en', 'ch', 'chinese_cht', 'korean', 'japan', 'german', 'fr', 'it', 'es',
+    as well as script packs like 'arabic', 'cyrillic', 'latin'.
+    """
+    lang = _normalize_lang(language)
+
+    mapping = {
+        "en": "en",
+        "fr": "fr",
+        "de": "german",
+        "es": "es",
+        "it": "it",
+        "pt": "pt",
+        "nl": "nl",
+        "ru": "cyrillic",  # Russian is covered by cyrillic models
+        "uk": "cyrillic",
+        "bg": "cyrillic",
+        "sr": "cyrillic",
+        "ar": "arabic",
+        "tr": "tr",
+        "fa": "arabic",  # fallback to arabic script pack
+        "zh": "ch",
+        "zh_cn": "ch",
+        "zh_tw": "chinese_cht",
+        "zh_hk": "chinese_cht",
+        "ja": "japan",
+        "jp": "japan",
+        "ko": "korean",
+        "hi": "latin",  # fallback; dedicated Hindi not always available
+    }
+
+    return mapping.get(lang, "en")
+
@dataclass
class OCRResult:
    text: str
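A minimal usage sketch of the two mapping helpers above (assuming they can be imported from tools.custom_image_analyser_engine; the inputs and expected outputs follow the mapping tables in this commit):

from tools.custom_image_analyser_engine import _tesseract_lang_code, _paddle_lang_code

print(_tesseract_lang_code("fr"))     # "fra" - the Tesseract traineddata code
print(_paddle_lang_code("zh-TW"))     # "chinese_cht" - normalisation turns "zh-TW" into "zh_tw" first
print(_tesseract_lang_code("xx"))     # "eng" - unknown languages fall back to English
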
Updated lines 314-359:
        ocr_data['height'][i] = h_orig

    return ocr_data
+
class CustomImageAnalyzerEngine:
    def __init__(
        self,
        ocr_engine: str = "tesseract",
        tesseract_config: Optional[str] = None,
        paddle_kwargs: Optional[Dict[str, Any]] = None,
+       image_preprocessor: Optional[ImagePreprocessor] = None,
+       language: Optional[str] = None
    ):
        """
        Initializes the CustomImageAnalyzerEngine.

+       :param ocr_engine: The OCR engine to use ("tesseract", "hybrid", or "paddle").
        :param analyzer_engine: The Presidio AnalyzerEngine instance.
        :param tesseract_config: Configuration string for Tesseract.
        :param paddle_kwargs: Dictionary of keyword arguments for PaddleOCR constructor.
        :param image_preprocessor: Optional image preprocessor.
+       :param language: Preferred OCR language (e.g., "en", "fr", "de"). Defaults to DEFAULT_LANGUAGE.
        """
        if ocr_engine not in ["tesseract", "paddle", "hybrid"]:
            raise ValueError("ocr_engine must be either 'tesseract', 'hybrid', or 'paddle'")

        self.ocr_engine = ocr_engine
+
+       # Language setup
+       self.language = language or DEFAULT_LANGUAGE or "en"
+       self.tesseract_lang = _tesseract_lang_code(self.language)
+       self.paddle_lang = _paddle_lang_code(self.language)

        if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
            if PaddleOCR is None:
                raise ImportError("paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle'")
            # Default paddle configuration if none provided
            if paddle_kwargs is None:
+               paddle_kwargs = {'use_textline_orientation': True, 'lang': self.paddle_lang}
+           else:
+               # Enforce language if not explicitly provided
+               paddle_kwargs.setdefault('lang', self.paddle_lang)
            self.paddle_ocr = PaddleOCR(**paddle_kwargs)

        if not analyzer_engine:
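For orientation, a sketch of how the extended constructor is intended to be called; this mirrors the call added to tools/file_redaction.py later in this commit, with "fr" as an illustrative language:

from tools.load_spacy_model_custom_recognisers import nlp_analyser
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine

# OCR in French: tesseract_lang resolves to "fra" and paddle_lang to "fr"
image_analyser = CustomImageAnalyzerEngine(
    analyzer_engine=nlp_analyser,
    ocr_engine="tesseract",
    language="fr"
)
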
Updated lines 485-492:
        tesseract_data = pytesseract.image_to_data(
            image,
            output_type=pytesseract.Output.DICT,
+           config=self.tesseract_config,
+           lang=self.tesseract_lang
        )

        #tesseract_data['abs_line_id'] = tesseract_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()

Updated lines 602-609:
        ocr_data = pytesseract.image_to_data(
            image,
            output_type=pytesseract.Output.DICT,
+           config=self.tesseract_config,
+           lang=self.tesseract_lang # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
        )

        #ocr_data['abs_line_id'] = ocr_data.groupby(['block_num', 'par_num', 'line_num']).ngroup()
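Outside the class, the same pytesseract call looks like the sketch below (the image path is illustrative, and the fra traineddata pack must already be installed, e.g. via apt-get install tesseract-ocr-fra, since Tesseract errors out when a requested pack is missing):

from PIL import Image
import pytesseract

image = Image.open("scanned_page.png")  # hypothetical input image
ocr_data = pytesseract.image_to_data(
    image,
    output_type=pytesseract.Output.DICT,
    lang="fra"  # the Tesseract code produced by _tesseract_lang_code("fr")
)
print(ocr_data["text"][:10])  # first few recognised words
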
Updated lines 662-668:
        pii_identification_method: str = "Local",
        comprehend_client = "",
        custom_entities:List[str]=custom_entities,
+       language: Optional[str] = None,
        **text_analyzer_kwargs
    ) -> List[CustomImageRecognizerResult]:

Updated lines 680-693:
            # Note: We're not passing line_characters here since it's not needed for this use case
            page_text_mapping.append((start_pos, i, line_level_ocr_result, None))

+       # Determine language for downstream services
+       aws_language = language or getattr(self, 'language', None) or 'en'
+
        # Process using either Local or AWS Comprehend
        if pii_identification_method == "Local":
            analyzer_result = self.analyzer_engine.analyze(
                text=page_text,
+               language=language,
                **text_analyzer_kwargs
            )
            all_text_line_results = map_back_entity_results(
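The analyzer call itself is standard Presidio; a small standalone sketch of the pattern used above, with hard-coded example text and entities:

from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()  # default English-only registry, shown for illustration
results = analyzer.analyze(
    text="Sarah Jones lives at 12 Example Street, London",
    language="en",                     # the new language argument being threaded through
    entities=["PERSON", "LOCATION"],
    score_threshold=0.3
)
print(results)
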
Updated lines 707-713:
                text_analyzer_kwargs["entities"] = custom_redact_entities
                page_analyser_result = self.analyzer_engine.analyze(
                    text=page_text,
+                   language=language,
                    **text_analyzer_kwargs
                )
                all_text_line_results = map_back_entity_results(

Updated lines 740-746:
                    current_batch,
                    current_batch_mapping,
                    comprehend_client,
+                   aws_language,
                    text_analyzer_kwargs.get('allow_list', []),
                    chosen_redact_comprehend_entities,
                    all_text_line_results

Updated lines 775-781:
                    current_batch,
                    current_batch_mapping,
                    comprehend_client,
+                   aws_language,
                    text_analyzer_kwargs.get('allow_list', []),
                    chosen_redact_comprehend_entities,
                    all_text_line_results

Updated lines 1087-1093:
    comprehend_client = None,
    allow_list: List[str] = None,
    pii_identification_method: str = "Local",
+   nlp_analyser: AnalyzerEngine = None,
    score_threshold: float = 0.0,
    custom_entities: List[str] = None,
    comprehend_query_number:int = 0#,
tools/data_anonymise.py
CHANGED
@@ -10,11 +10,11 @@ import docx
from openpyxl import Workbook
from faker import Faker
from gradio import Progress
-from typing import List, Dict, Any
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
-from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER
from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
# Use custom version of analyze_dict to be able to track progress

@@ -119,7 +119,7 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
    #analyzer = AnalyzerEngine()
    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)

-   analyzer_results = batch_analyzer.analyze_dict(df_dict, language=
    analyzer_results = list(analyzer_results)

    # + tags=[]

@@ -208,7 +208,6 @@ def handle_docx_anonymisation(
    file_path: str,
    output_folder: str,
    anon_strat: str,
-   language: str,
    chosen_redact_entities: List[str],
    in_allow_list: List[str],
    in_deny_list: List[str],

@@ -216,7 +215,8 @@ def handle_docx_anonymisation(
    pii_identification_method: str,
    chosen_redact_comprehend_entities: List[str],
    comprehend_query_number: int,
-   comprehend_client # Assuming botocore.client.BaseClient type
    ):
    """
    Anonymises a .docx file by extracting text, processing it, and re-inserting it.

@@ -253,11 +253,14 @@ def handle_docx_anonymisation(
    # 2. Convert to a DataFrame for the existing anonymisation script
    df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})

    # 3. Call the core anonymisation script
    anonymised_df, _, decision_log = anonymise_script(
        df=df_to_anonymise,
        anon_strat=anon_strat,
-       language=
        chosen_redact_entities=chosen_redact_entities,
        in_allow_list=in_allow_list,
        in_deny_list=in_deny_list,

@@ -307,7 +310,6 @@ def anonymise_files_with_open_text(file_paths: List[str],
    in_text: str,
    anon_strat: str,
    chosen_cols: List[str],
-   language: str,
    chosen_redact_entities: List[str],
    in_allow_list: List[str] = None,
    latest_file_completed: int = 0,

@@ -325,7 +327,9 @@ def anonymise_files_with_open_text(file_paths: List[str],
    aws_access_key_textbox:str='',
    aws_secret_key_textbox:str='',
    actual_time_taken_number:float=0,
-
    """
    This function anonymises data files based on the provided parameters.

@@ -352,11 +356,21 @@ def anonymise_files_with_open_text(file_paths: List[str],
    - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
    - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
    - actual_time_taken_number (float, optional): Time taken to do the redaction.
    - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
    """

    tic = time.perf_counter()
    comprehend_client = ""

    # If this is the first time around, set variables to 0/blank
    if first_loop_state==True:

@@ -455,7 +469,6 @@ def anonymise_files_with_open_text(file_paths: List[str],
        file_path=anon_file.name, # .name if it's a temp file object
        output_folder=output_folder,
        anon_strat=anon_strat,
-       language=language,
        chosen_redact_entities=chosen_redact_entities,
        in_allow_list=in_allow_list_flat,
        in_deny_list=in_deny_list,

@@ -463,7 +476,8 @@ def anonymise_files_with_open_text(file_paths: List[str],
        pii_identification_method=pii_identification_method,
        chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
        comprehend_query_number=comprehend_query_number,
-       comprehend_client=comprehend_client
    )
    if output_path:
        out_file_paths.append(output_path)

@@ -493,14 +507,14 @@ def anonymise_files_with_open_text(file_paths: List[str],

                anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)

-               out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat,

        else:
            sheet_name = ""
            anon_df = read_file(anon_file)
            out_file_part = get_file_name_without_type(anon_file.name)

-           out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat,

        # Increase latest file completed count unless we are at the last file
        if latest_file_completed != len(file_paths):

@@ -537,7 +551,7 @@ def tabular_anonymise_wrapper_func(
    out_message: str,
    excel_sheet_name: str,
    anon_strat: str,
-   language: str,
    chosen_redact_entities: List[str],
    in_allow_list: List[str],
    file_type: str,

@@ -546,6 +560,7 @@ def tabular_anonymise_wrapper_func(
    in_deny_list: List[str]=[],
    max_fuzzy_spelling_mistakes_num:int=0,
    pii_identification_method:str="Local",
    chosen_redact_comprehend_entities:List[str]=[],
    comprehend_query_number:int=0,
    comprehend_client:botocore.client.BaseClient="",

@@ -617,8 +632,11 @@ def tabular_anonymise_wrapper_func(
    anon_df_part = anon_df[chosen_cols_in_anon_df]
    anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)

    # Anonymise the selected columns
-   anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client)

    anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)

@@ -681,6 +699,7 @@ def anonymise_script(df:pd.DataFrame,
    in_deny_list:List[str]=[],
    max_fuzzy_spelling_mistakes_num:int=0,
    pii_identification_method:str="Local",
    chosen_redact_comprehend_entities:List[str]=[],
    comprehend_query_number:int=0,
    comprehend_client:botocore.client.BaseClient="",

@@ -738,6 +757,9 @@ def anonymise_script(df:pd.DataFrame,
    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
    analyzer_results = []

    if pii_identification_method == "Local":

        # Use custom analyzer to be able to track progress with Gradio

@@ -801,7 +823,7 @@ def anonymise_script(df:pd.DataFrame,
    try:
        response = comprehend_client.detect_pii_entities(
            Text=str(text),
-           LanguageCode=
        )

        comprehend_query_number += 1
Updated lines 10-20:
from openpyxl import Workbook
from faker import Faker
from gradio import Progress
+from typing import List, Dict, Any, Optional
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
+from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices
from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
# Use custom version of analyze_dict to be able to track progress

Updated lines 119-125:
    #analyzer = AnalyzerEngine()
    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)

+   analyzer_results = batch_analyzer.analyze_dict(df_dict, language=DEFAULT_LANGUAGE)
    analyzer_results = list(analyzer_results)

    # + tags=[]

Updated lines 208-213:
    file_path: str,
    output_folder: str,
    anon_strat: str,
    chosen_redact_entities: List[str],
    in_allow_list: List[str],
    in_deny_list: List[str],

Updated lines 215-222:
    pii_identification_method: str,
    chosen_redact_comprehend_entities: List[str],
    comprehend_query_number: int,
+   comprehend_client, # Assuming botocore.client.BaseClient type
+   language: Optional[str] = None
    ):
    """
    Anonymises a .docx file by extracting text, processing it, and re-inserting it.

Updated lines 253-266:
    # 2. Convert to a DataFrame for the existing anonymisation script
    df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})

+   # Use provided language or default
+   effective_language = language or DEFAULT_LANGUAGE
+
    # 3. Call the core anonymisation script
    anonymised_df, _, decision_log = anonymise_script(
        df=df_to_anonymise,
        anon_strat=anon_strat,
+       language=effective_language,
        chosen_redact_entities=chosen_redact_entities,
        in_allow_list=in_allow_list,
        in_deny_list=in_deny_list,

Updated lines 310-315:
    in_text: str,
    anon_strat: str,
    chosen_cols: List[str],
    chosen_redact_entities: List[str],
    in_allow_list: List[str] = None,
    latest_file_completed: int = 0,

Updated lines 327-335:
    aws_access_key_textbox:str='',
    aws_secret_key_textbox:str='',
    actual_time_taken_number:float=0,
+   language: Optional[str] = None,
+   progress: Progress = Progress(track_tqdm=True),
+   comprehend_language: Optional[str] = None):
    """
    This function anonymises data files based on the provided parameters.

Updated lines 356-376:
    - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
    - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
    - actual_time_taken_number (float, optional): Time taken to do the redaction.
+   - language (str, optional): The language of the text to anonymise.
    - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
    """

    tic = time.perf_counter()
    comprehend_client = ""
+
+   # Use provided language or default
+   effective_language = language or DEFAULT_LANGUAGE
+   effective_comprehend_language = comprehend_language or effective_language
+
+   if pii_identification_method == "AWS Comprehend":
+       if effective_comprehend_language not in aws_comprehend_language_choices:
+           out_message = f"Please note that this language is not supported by AWS Comprehend: {effective_comprehend_language}"
+           raise Warning(out_message)

    # If this is the first time around, set variables to 0/blank
    if first_loop_state==True:

Updated lines 469-483:
        file_path=anon_file.name, # .name if it's a temp file object
        output_folder=output_folder,
        anon_strat=anon_strat,
        chosen_redact_entities=chosen_redact_entities,
        in_allow_list=in_allow_list_flat,
        in_deny_list=in_deny_list,
        pii_identification_method=pii_identification_method,
        chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
        comprehend_query_number=comprehend_query_number,
+       comprehend_client=comprehend_client,
+       language=effective_language
    )
    if output_path:
        out_file_paths.append(output_path)

Updated lines 507-520:

                anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)

+               out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, effective_language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, effective_comprehend_language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)

        else:
            sheet_name = ""
            anon_df = read_file(anon_file)
            out_file_part = get_file_name_without_type(anon_file.name)

+           out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, effective_language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, effective_comprehend_language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)

        # Increase latest file completed count unless we are at the last file
        if latest_file_completed != len(file_paths):

Updated lines 551-557:
    out_message: str,
    excel_sheet_name: str,
    anon_strat: str,
+   language: str,
    chosen_redact_entities: List[str],
    in_allow_list: List[str],
    file_type: str,

Updated lines 560-566:
    in_deny_list: List[str]=[],
    max_fuzzy_spelling_mistakes_num:int=0,
    pii_identification_method:str="Local",
+   comprehend_language: Optional[str] = None,
    chosen_redact_comprehend_entities:List[str]=[],
    comprehend_query_number:int=0,
    comprehend_client:botocore.client.BaseClient="",

Updated lines 632-642:
    anon_df_part = anon_df[chosen_cols_in_anon_df]
    anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)

+   # Use provided comprehend language or fall back to main language
+   effective_comprehend_language = comprehend_language or language
+
    # Anonymise the selected columns
+   anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, effective_comprehend_language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client)

    anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)

Updated lines 699-705:
    in_deny_list:List[str]=[],
    max_fuzzy_spelling_mistakes_num:int=0,
    pii_identification_method:str="Local",
+   comprehend_language:Optional[str]=None,
    chosen_redact_comprehend_entities:List[str]=[],
    comprehend_query_number:int=0,
    comprehend_client:botocore.client.BaseClient="",

Updated lines 757-765:
    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
    analyzer_results = []

+   # Use provided comprehend language or fall back to main language
+   effective_comprehend_language = comprehend_language or language
+
    if pii_identification_method == "Local":

        # Use custom analyzer to be able to track progress with Gradio

Updated lines 823-829:
    try:
        response = comprehend_client.detect_pii_entities(
            Text=str(text),
+           LanguageCode=effective_comprehend_language
        )

        comprehend_query_number += 1
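For reference, a standalone sketch of the Comprehend call that the new comprehend_language plumbing feeds (boto3; the region and text are illustrative, and LanguageCode must be a code that Comprehend's PII detection supports, such as "en" or "es"):

import boto3

comprehend_client = boto3.client("comprehend", region_name="eu-west-2")  # illustrative region
response = comprehend_client.detect_pii_entities(
    Text="Sarah Jones lives at 12 Example Street, London.",
    LanguageCode="en"  # unsupported codes raise a client error
)
for entity in response["Entities"]:
    print(entity["Type"], entity["Score"])
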
tools/file_conversion.py
CHANGED
@@ -673,7 +673,11 @@ def prepare_image_or_pdf(
            all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
            json_from_csv = True
        elif '_ocr_output' in file_path_without_ext:
-           all_line_level_ocr_results_df = read_file(file_path)
            json_from_csv = False
        elif '_ocr_results_with_words' in file_path_without_ext:
            all_page_line_level_ocr_results_with_words_df = read_file(file_path)
Updated lines 673-683:
            all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
            json_from_csv = True
        elif '_ocr_output' in file_path_without_ext:
+           all_line_level_ocr_results_df = read_file(file_path)
+
+           if "line" not in all_line_level_ocr_results_df.columns:
+               all_line_level_ocr_results_df["line"] = ""
+
            json_from_csv = False
        elif '_ocr_results_with_words' in file_path_without_ext:
            all_page_line_level_ocr_results_with_words_df = read_file(file_path)
tools/file_redaction.py
CHANGED
@@ -15,14 +15,15 @@ from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
from pikepdf import Pdf, Dictionary, Name
from pymupdf import Rect, Page, Document
import gradio as gr
from gradio import Progress
from collections import defaultdict # For efficient grouping

-from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
-from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
from tools.helper_functions import get_file_name_without_type, clean_unicode_text
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json

@@ -84,12 +85,9 @@ def merge_page_results(data:list):

    return list(merged.values())

-
-
def choose_and_run_redactor(file_paths:List[str],
    prepared_pdf_file_paths:List[str],
-   pdf_image_file_paths:List[str],
-   language:str,
    chosen_redact_entities:List[str],
    chosen_redact_comprehend_entities:List[str],
    text_extraction_method:str,

@@ -112,7 +110,7 @@ def choose_and_run_redactor(file_paths:List[str],
    pymupdf_doc=list(),
    current_loop_page:int=0,
    page_break_return:bool=False,
-   pii_identification_method:str="Local",
    comprehend_query_number:int=0,
    max_fuzzy_spelling_mistakes_num:int=1,
    match_fuzzy_whole_phrase_bool:bool=True,

@@ -134,7 +132,8 @@ def choose_and_run_redactor(file_paths:List[str],
    all_page_line_level_ocr_results_with_words:list[dict] = list(),
    all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
    chosen_local_model:str="tesseract",
-
    RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
    progress=gr.Progress(track_tqdm=True)):
    '''

@@ -143,7 +142,7 @@ def choose_and_run_redactor(file_paths:List[str],
    - file_paths (List[str]): A list of paths to the files to be redacted.
    - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
    - pdf_image_file_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
-
    - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
    - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
    - text_extraction_method (str): The method to use to extract text from documents.

@@ -188,7 +187,9 @@ def choose_and_run_redactor(file_paths:List[str],
    - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
    - all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
    - chosen_local_model (str): Which local model is being used for OCR on images - "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both.
-
-
    - RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
    - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.

@@ -203,6 +204,18 @@ def choose_and_run_redactor(file_paths:List[str],
    blank_request_metadata = []
    all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
    review_out_file_paths = [prepared_pdf_file_paths[0]]

    if all_page_line_level_ocr_results_with_words_df is None:
        all_page_line_level_ocr_results_with_words_df = pd.DataFrame()

@@ -452,6 +465,19 @@ def choose_and_run_redactor(file_paths:List[str],
    else:
        textract_client = ""

    # Check if output_folder exists, create it if it doesn't
    if not os.path.exists(output_folder): os.makedirs(output_folder)

@@ -511,7 +537,7 @@ def choose_and_run_redactor(file_paths:List[str],

    pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words = redact_image_pdf(file_path,
        pdf_image_file_paths,
-
        chosen_redact_entities,
        chosen_redact_comprehend_entities,
        in_allow_list_flat,

@@ -538,7 +564,7 @@ def choose_and_run_redactor(file_paths:List[str],
        text_extraction_only,
        all_page_line_level_ocr_results,
        all_page_line_level_ocr_results_with_words,
-       chosen_local_model,
        log_files_output_paths=log_files_output_paths,
        output_folder=output_folder)

@@ -560,7 +586,7 @@ def choose_and_run_redactor(file_paths:List[str],

    pymupdf_doc, all_pages_decision_process_table, all_page_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
        file_path,
-
        chosen_redact_entities,
        chosen_redact_comprehend_entities,
        in_allow_list_flat,

@@ -1352,6 +1378,7 @@ def redact_image_pdf(file_path:str,
    log_files_output_paths:List=list(),
    max_time:int=int(MAX_TIME_VALUE),
    output_folder:str=OUTPUT_FOLDER,
    progress=Progress(track_tqdm=True)):

    '''

@@ -1391,6 +1418,7 @@ def redact_image_pdf(file_path:str,
    - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
    - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
    - output_folder (str, optional): The folder for file outputs.
    - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.

    The function returns a redacted PDF document along with processing output objects.

@@ -1400,6 +1428,20 @@ def redact_image_pdf(file_path:str,

    file_name = get_file_name_without_type(file_path)
    comprehend_query_number_new = 0

    # Update custom word list analyser object with any new words that have been added to the custom deny list
    if custom_recogniser_word_list:

@@ -1413,9 +1455,9 @@ def redact_image_pdf(file_path:str,

    # Only load in PaddleOCR models if not running Textract
    if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
-       image_analyser = CustomImageAnalyzerEngine(nlp_analyser, ocr_engine="tesseract")
    else:
-       image_analyser = CustomImageAnalyzerEngine(nlp_analyser, ocr_engine=chosen_local_model)

    if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
        out_message = "Connection to AWS Comprehend service unsuccessful."

@@ -1635,7 +1677,7 @@ def redact_image_pdf(file_path:str,
        chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
        pii_identification_method = pii_identification_method,
        comprehend_client=comprehend_client,
-       language=
        entities=chosen_redact_entities,
        allow_list=allow_list,
        score_threshold=score_threshold

@@ -2155,7 +2197,7 @@ def redact_text_pdf(
    all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
    pymupdf_doc: List = list(), # List of PyMuPDF documents
    all_page_line_level_ocr_results_with_words: List = list(),
-   pii_identification_method: str = "Local",
    comprehend_query_number:int = 0,
    comprehend_client="",
    custom_recogniser_word_list:List[str]=list(),

@@ -2167,10 +2209,10 @@ def redact_text_pdf(
    text_extraction_only:bool=False,
    output_folder:str=OUTPUT_FOLDER,
    page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
-   max_time: int = int(MAX_TIME_VALUE),
    progress: Progress = Progress(track_tqdm=True) # Progress tracking object
-   ):
-
    '''
    Redact chosen entities from a PDF that is made up of multiple pages that are not images.

@@ -2199,13 +2241,15 @@ def redact_text_pdf(
    - page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
    - original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
    - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
    - output_folder (str, optional): The output folder for the function
    - page_break_val: Value for page break
-   - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
    - progress: Progress tracking object
    '''

-   tic = time.perf_counter()

    if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
        all_line_level_ocr_results_list = [all_line_level_ocr_results_df]

@@ -2218,6 +2262,20 @@ def redact_text_pdf(
        out_message = "Connection to AWS Comprehend service not found."
        raise Exception(out_message)

    # Update custom word list analyser object with any new words that have been added to the custom deny list
    if custom_recogniser_word_list:
        nlp_analyser.registry.remove_recognizer("CUSTOM")

@@ -2228,6 +2286,8 @@ def redact_text_pdf(
        new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
        nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)

    # Open with Pikepdf to get text lines
    pikepdf_pdf = Pdf.open(file_path)
    number_of_pages = len(pikepdf_pdf.pages)

@@ -2323,7 +2383,7 @@ def redact_text_pdf(

    if chosen_redact_entities or chosen_redact_comprehend_entities:
        page_redaction_bounding_boxes = run_page_text_redaction(
-
            chosen_redact_entities,
            chosen_redact_comprehend_entities,
            all_page_line_level_text_extraction_results_list,
Updated lines 15-29:
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
from pikepdf import Pdf, Dictionary, Name
from pymupdf import Rect, Page, Document
+from presidio_analyzer import AnalyzerEngine
import gradio as gr
from gradio import Progress
from collections import defaultdict # For efficient grouping

+from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
from tools.helper_functions import get_file_name_without_type, clean_unicode_text
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json

Updated lines 85-95:

    return list(merged.values())

def choose_and_run_redactor(file_paths:List[str],
    prepared_pdf_file_paths:List[str],
+   pdf_image_file_paths:List[str],
    chosen_redact_entities:List[str],
    chosen_redact_comprehend_entities:List[str],
    text_extraction_method:str,

Updated lines 110-116:
    pymupdf_doc=list(),
    current_loop_page:int=0,
    page_break_return:bool=False,
+   pii_identification_method:str="Local",
    comprehend_query_number:int=0,
    max_fuzzy_spelling_mistakes_num:int=1,
    match_fuzzy_whole_phrase_bool:bool=True,

Updated lines 132-139:
    all_page_line_level_ocr_results_with_words:list[dict] = list(),
    all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
    chosen_local_model:str="tesseract",
+   language:str=DEFAULT_LANGUAGE,
+   prepare_images:bool=True,
    RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
    progress=gr.Progress(track_tqdm=True)):
    '''

Updated lines 142-148:
    - file_paths (List[str]): A list of paths to the files to be redacted.
    - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
    - pdf_image_file_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
+
    - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
    - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
    - text_extraction_method (str): The method to use to extract text from documents.

Updated lines 187-195:
    - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
    - all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
    - chosen_local_model (str): Which local model is being used for OCR on images - "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both.
+   - language (str, optional): The language of the text in the files. Defaults to English.
+   - language (str, optional): The language to do AWS Comprehend calls. Defaults to value of language if not provided.
+   - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
    - RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
    - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.

Updated lines 204-221:
    blank_request_metadata = []
    all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
    review_out_file_paths = [prepared_pdf_file_paths[0]]
+
+   # Use provided language or default
+   effective_language = language or DEFAULT_LANGUAGE
+
+   if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+       if effective_language not in textract_language_choices:
+           out_message = f"Language '{effective_language}' is not supported by AWS Textract. Please select a different language."
+           raise Warning(out_message)
+   elif pii_identification_method == AWS_PII_OPTION:
+       if effective_language not in aws_comprehend_language_choices:
+           out_message = f"Language '{effective_language}' is not supported by AWS Comprehend. Please select a different language."
+           raise Warning(out_message)

    if all_page_line_level_ocr_results_with_words_df is None:
        all_page_line_level_ocr_results_with_words_df = pd.DataFrame()

Updated lines 465-483:
    else:
        textract_client = ""

+   ### Language check - check if selected language packs exist
+   try:
+       if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION and chosen_local_model == "tesseract":
+           progress(0.1, desc=f"Downloading Tesseract language pack for {effective_language}")
+           download_tesseract_lang_pack(effective_language)
+
+       progress(0.1, desc=f"Loading SpaCy model for {effective_language}")
+       load_spacy_model(effective_language)
+
+   except Exception as e:
+       print(f"Error downloading language packs for {effective_language}: {e}")
+       raise Exception(f"Error downloading language packs for {effective_language}: {e}")
+
    # Check if output_folder exists, create it if it doesn't
    if not os.path.exists(output_folder): os.makedirs(output_folder)

Updated lines 537-543:

    pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words = redact_image_pdf(file_path,
        pdf_image_file_paths,
+       effective_language,
        chosen_redact_entities,
        chosen_redact_comprehend_entities,
        in_allow_list_flat,

Updated lines 564-570:
        text_extraction_only,
        all_page_line_level_ocr_results,
        all_page_line_level_ocr_results_with_words,
+       chosen_local_model,
        log_files_output_paths=log_files_output_paths,
        output_folder=output_folder)

Updated lines 586-592:

    pymupdf_doc, all_pages_decision_process_table, all_page_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
        file_path,
+       effective_language,
        chosen_redact_entities,
        chosen_redact_comprehend_entities,
        in_allow_list_flat,

Updated lines 1378-1384:
    log_files_output_paths:List=list(),
    max_time:int=int(MAX_TIME_VALUE),
    output_folder:str=OUTPUT_FOLDER,
+   nlp_analyser: AnalyzerEngine = nlp_analyser,
    progress=Progress(track_tqdm=True)):

    '''

Updated lines 1418-1424:
    - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
    - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
    - output_folder (str, optional): The folder for file outputs.
+   - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
    - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.

    The function returns a redacted PDF document along with processing output objects.

Updated lines 1428-1447:

    file_name = get_file_name_without_type(file_path)
    comprehend_query_number_new = 0
+
+   # Use provided comprehend language or fall back to main language
+   effective_language = language or language
+
+   # Try updating the supported languages for the spacy analyser
+   try:
+       nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
+       # Check list of nlp_analyser recognisers and languages
+       if language != "en":
+           gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
+
+   except Exception as e:
+       print(f"Error creating nlp_analyser for {language}: {e}")
+       raise Exception(f"Error creating nlp_analyser for {language}: {e}")

    # Update custom word list analyser object with any new words that have been added to the custom deny list
    if custom_recogniser_word_list:

Updated lines 1455-1463:

    # Only load in PaddleOCR models if not running Textract
    if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+       image_analyser = CustomImageAnalyzerEngine(analyzer_engine=nlp_analyser, ocr_engine="tesseract", language=language)
    else:
+       image_analyser = CustomImageAnalyzerEngine(analyzer_engine=nlp_analyser, ocr_engine=chosen_local_model, language=language)

    if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
        out_message = "Connection to AWS Comprehend service unsuccessful."

Updated lines 1677-1683:
        chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
        pii_identification_method = pii_identification_method,
        comprehend_client=comprehend_client,
+       language=effective_language,
        entities=chosen_redact_entities,
        allow_list=allow_list,
        score_threshold=score_threshold

Updated lines 2197-2203:
    all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
    pymupdf_doc: List = list(), # List of PyMuPDF documents
    all_page_line_level_ocr_results_with_words: List = list(),
+   pii_identification_method: str = "Local",
    comprehend_query_number:int = 0,
    comprehend_client="",
    custom_recogniser_word_list:List[str]=list(),

Updated lines 2209-2218:
    text_extraction_only:bool=False,
    output_folder:str=OUTPUT_FOLDER,
    page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
+   max_time: int = int(MAX_TIME_VALUE),
+   nlp_analyser: AnalyzerEngine = nlp_analyser,
    progress: Progress = Progress(track_tqdm=True) # Progress tracking object
+   ):
    '''
    Redact chosen entities from a PDF that is made up of multiple pages that are not images.

Updated lines 2241-2255:
    - page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
    - original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
    - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
+   - language (str, optional): The language to do AWS Comprehend calls. Defaults to value of language if not provided.
    - output_folder (str, optional): The output folder for the function
    - page_break_val: Value for page break
+   - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
+   - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
    - progress: Progress tracking object
    '''

+   tic = time.perf_counter()

    if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
        all_line_level_ocr_results_list = [all_line_level_ocr_results_df]

Updated lines 2262-2281:
        out_message = "Connection to AWS Comprehend service not found."
        raise Exception(out_message)

+   # Use provided comprehend language or fall back to main language
+   effective_language = language or language
+
+   # Try updating the supported languages for the spacy analyser
+   try:
+       nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
+       # Check list of nlp_analyser recognisers and languages
+       if language != "en":
+           gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
+
+   except Exception as e:
+       print(f"Error creating nlp_analyser for {language}: {e}")
+       raise Exception(f"Error creating nlp_analyser for {language}: {e}")
+
    # Update custom word list analyser object with any new words that have been added to the custom deny list
    if custom_recogniser_word_list:
        nlp_analyser.registry.remove_recognizer("CUSTOM")

Updated lines 2286-2293:
        new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
        nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)

+
+
    # Open with Pikepdf to get text lines
    pikepdf_pdf = Pdf.open(file_path)
    number_of_pages = len(pikepdf_pdf.pages)

Updated lines 2383-2389:

    if chosen_redact_entities or chosen_redact_comprehend_entities:
        page_redaction_bounding_boxes = run_page_text_redaction(
+           effective_language,
            chosen_redact_entities,
            chosen_redact_comprehend_entities,
            all_page_line_level_text_extraction_results_list,
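Taken together, the language plumbing added to this file follows one pattern: validate the requested language, fetch the matching OCR and spaCy resources, then rebuild the Presidio analyser. A condensed, illustrative sketch of that flow (the helper functions are the ones imported above from tools.load_spacy_model_custom_recognisers; the wrapper function itself is hypothetical):

from tools.load_spacy_model_custom_recognisers import (
    nlp_analyser, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
)

def prepare_language_resources(language: str, use_tesseract: bool = True):
    """Hypothetical wrapper: fetch OCR and NLP resources for one language."""
    if use_tesseract:
        download_tesseract_lang_pack(language)  # e.g. fetches the fra traineddata pack
    load_spacy_model(language)                  # presumably loads/downloads the matching spaCy model
    # Rebuild the Presidio analyser so its recognisers cover the requested language
    return create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)

analyser_fr = prepare_language_resources("fr")
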
tools/find_duplicate_pages.py
CHANGED
@@ -14,9 +14,7 @@ from pathlib import Path
from typing import List
from tools.helper_functions import OUTPUT_FOLDER
from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
-import en_core_web_lg
-
-nlp = en_core_web_lg.load()

similarity_threshold = 0.95
number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
| 14 |
from typing import List
|
| 15 |
from tools.helper_functions import OUTPUT_FOLDER
|
| 16 |
from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
|
| 17 |
+
from tools.load_spacy_model_custom_recognisers import nlp
|
|
|
|
|
|
|
| 18 |
|
| 19 |
similarity_threshold = 0.95
|
| 20 |
number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
|
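
This change drops a second, independent en_core_web_lg load and reuses the spaCy pipeline managed by tools/load_spacy_model_custom_recognisers. A rough sketch of the reuse it enables, assuming the shared nlp object has been initialised to a vector-bearing pipeline (such as en_core_web_lg) by the time this module runs; the sentences are invented:

    from tools.load_spacy_model_custom_recognisers import nlp

    doc_a = nlp("Payment is due within thirty days of the invoice date.")
    doc_b = nlp("Payments are due within 30 days of the date of the invoice.")

    # With word vectors available, Doc.similarity gives a score that can be
    # compared against similarity_threshold (0.95 in this module)
    print(doc_a.similarity(doc_b))
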
tools/helper_functions.py
CHANGED
@@ -9,7 +9,24 @@ import unicodedata
 from typing import List
 from math import ceil
 from gradio_image_annotation import image_annotator
-from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION
+from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION, MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES, textract_language_choices, aws_comprehend_language_choices, DEFAULT_LANGUAGE
+# from tools.load_spacy_model_custom_recognisers import nlp_analyser
+
+def _get_env_list(env_var_name: str) -> List[str]:
+    """Parses a comma-separated environment variable into a list of strings."""
+    value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
+    if not value:
+        return []
+    # Split by comma and filter out any empty strings that might result from extra commas
+    return [s.strip() for s in value.split(',') if s.strip()]
+
+if textract_language_choices: textract_language_choices = _get_env_list(textract_language_choices)
+if aws_comprehend_language_choices: aws_comprehend_language_choices = _get_env_list(aws_comprehend_language_choices)
+
+if MAPPED_LANGUAGE_CHOICES: MAPPED_LANGUAGE_CHOICES = _get_env_list(MAPPED_LANGUAGE_CHOICES)
+if LANGUAGE_CHOICES: LANGUAGE_CHOICES = _get_env_list(LANGUAGE_CHOICES)
+
+LANGUAGE_MAP = dict(zip(MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES))

 def reset_state_vars():
     return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(

@@ -157,13 +174,7 @@ def ensure_output_folder_exists(output_folder:str):
     else:
         print(f"The {output_folder} folder already exists.")

-def _get_env_list(env_var_name: str) -> List[str]:
-    """Parses a comma-separated environment variable into a list of strings."""
-    value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
-    if not value:
-        return []
-    # Split by comma and filter out any empty strings that might result from extra commas
-    return [s.strip() for s in value.split(',') if s.strip()]
+

 def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
     '''

@@ -189,7 +200,7 @@ def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
         print(output_text)
     else:
         output_text = "No file provided."
-        print(output_text)
+        #print(output_text)
     return output_text, custom_regex_df

     return output_text, custom_regex_df

@@ -590,4 +601,25 @@ def reset_ocr_with_words_base_dataframe(df:pd.DataFrame, page_entity_dropdown_re

    output_df_filtered = df.loc[df["page"]==str(page_entity_dropdown_redaction_value), ["page", "line", "word_text", "word_x0", "word_y0", "word_x1", "word_y1", "index"]]
    return output_df_filtered, output_df
+
+def update_language_dropdown(chosen_language_full_name_drop, textract_language_choices=textract_language_choices, aws_comprehend_language_choices=aws_comprehend_language_choices, LANGUAGE_MAP=LANGUAGE_MAP):
+
+    try:
+        full_language_name = chosen_language_full_name_drop.lower()
+        matched_language = LANGUAGE_MAP[full_language_name]
+
+        chosen_language_drop = gr.Dropdown(value = matched_language, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False, visible=True)
+
+        if matched_language not in aws_comprehend_language_choices and matched_language not in textract_language_choices:
+            gr.Info(f"Note that {full_language_name} is not supported by AWS Comprehend or AWS Textract")
+        elif matched_language not in aws_comprehend_language_choices:
+            gr.Info(f"Note that {full_language_name} is not supported by AWS Comprehend")
+        elif matched_language not in textract_language_choices:
+            gr.Info(f"Note that {full_language_name} is not supported by AWS Textract")
+    except Exception as e:
+        print(e)
+        gr.Info("Could not find language in list")
+        chosen_language_drop = gr.Dropdown(value = DEFAULT_LANGUAGE, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False)
+
+    return chosen_language_drop
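
The new module-level code parses the list-style language settings imported from tools.config and builds LANGUAGE_MAP, which update_language_dropdown uses to turn a full language name into its short code and to warn when AWS Comprehend or Textract cannot handle it. A small sketch of the parsing step alone, assuming the config values are stored as bracketed, comma-separated strings; the example values are invented:

    from tools.helper_functions import _get_env_list

    mapped = "['english', 'french', 'german']"   # hypothetical MAPPED_LANGUAGE_CHOICES value
    codes = "['en', 'fr', 'de']"                 # hypothetical LANGUAGE_CHOICES value

    full_names = _get_env_list(mapped)    # ['english', 'french', 'german']
    short_codes = _get_env_list(codes)    # ['en', 'fr', 'de']

    language_map = dict(zip(full_names, short_codes))
    assert language_map["french"] == "fr"
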
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -1,48 +1,255 @@
 from typing import List
 from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
 from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts, NerModelConfiguration
-
 import spacy
 from spacy.matcher import Matcher
 from spaczz.matcher import FuzzyMatcher
 spacy.prefer_gpu()
 from spacy.cli.download import download
 import Levenshtein
 import re
+import os
+import requests
 import gradio as gr
+from tools.config import DEFAULT_LANGUAGE, TESSERACT_FOLDER

-model_name = "en_core_web_lg" #"en_core_web_sm" #"en_core_web_trf"
 score_threshold = 0.001
 custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]

-#Load spacy model
-try:
-    import en_core_web_lg #en_core_web_sm
-    nlp = en_core_web_lg.load() #en_core_web_sm.load()
-    print("Successfully imported spaCy model")
-
-except:
-    download(model_name)
-    nlp = spacy.load(model_name)
-    print("Successfully downloaded and imported spaCy model", model_name)
-
 # Create a class inheriting from SpacyNlpEngine
 class LoadedSpacyNlpEngine(SpacyNlpEngine):
-    def __init__(self, loaded_spacy_model):
+    def __init__(self, loaded_spacy_model, language_code: str):
         super().__init__(ner_model_configuration=NerModelConfiguration(labels_to_ignore=["CARDINAL", "ORDINAL"])) # Ignore non-relevant labels
-        self.nlp = {
-        ...
-        log_decision_process=False,
-        ) # New custom recognisers based on the following functions are added at the end of this script
+        self.nlp = {language_code: loaded_spacy_model}

+def _base_language_code(language: str) -> str:
+    lang = _normalize_language_input(language)
+    if "_" in lang:
+        return lang.split("_")[0]
+    return lang
+
+def load_spacy_model(language: str = DEFAULT_LANGUAGE):
+    """
+    Load a spaCy model for the requested language and return it as `nlp`.
+
+    Accepts common inputs like: "en", "en_lg", "en_sm", "de", "fr", "es", "it", "nl", "pt", "zh", "ja", "xx".
+    Falls back through sensible candidates and will download if missing.
+    """
+
+    synonyms = {
+        "english": "en",
+        "catalan": "ca",
+        "danish": "da",
+        "german": "de",
+        "french": "fr",
+        "greek": "el",
+        "finnish": "fi",
+        "croatian": "hr",
+        "lithuanian": "lt",
+        "macedonian": "mk",
+        "norwegian_bokmaal": "nb",
+        "polish": "pl",
+        "russian": "ru",
+        "slovenian": "sl",
+        "swedish": "sv",
+        "dutch": "nl",
+        "portuguese": "pt",
+        "chinese": "zh",
+        "japanese": "ja",
+        "multilingual": "xx",
+    }
+
+    lang_norm = _normalize_language_input(language)
+    lang_norm = synonyms.get(lang_norm, lang_norm)
+    base_lang = _base_language_code(lang_norm)
+
+    candidates_by_lang = {
+        # English
+        "en": [
+            "en_core_web_lg",
+            "en_core_web_trf",
+            "en_core_web_md",
+            "en_core_web_sm",
+        ],
+        "en_lg": ["en_core_web_lg"],
+        "en_trf": ["en_core_web_trf"],
+        "en_md": ["en_core_web_md"],
+        "en_sm": ["en_core_web_sm"],
+
+        # Major languages (news pipelines)
+        "ca": ["ca_core_news_lg", "ca_core_news_md", "ca_core_news_sm"], # Catalan
+        "da": ["da_core_news_lg", "da_core_news_md", "da_core_news_sm"], # Danish
+        "de": ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"], # German
+        "el": ["el_core_news_lg", "el_core_news_md", "el_core_news_sm"], # Greek
+        "es": ["es_core_news_lg", "es_core_news_md", "es_core_news_sm"], # Spanish
+        "fi": ["fi_core_news_lg", "fi_core_news_md", "fi_core_news_sm"], # Finnish
+        "fr": ["fr_core_news_lg", "fr_core_news_md", "fr_core_news_sm"], # French
+        "hr": ["hr_core_news_lg", "hr_core_news_md", "hr_core_news_sm"], # Croatian
+        "it": ["it_core_news_lg", "it_core_news_md", "it_core_news_sm"], # Italian
+        "ja": ["ja_core_news_lg", "ja_core_news_md", "ja_core_news_sm"], # Japanese
+        "ko": ["ko_core_news_lg", "ko_core_news_md", "ko_core_news_sm"], # Korean
+        "lt": ["lt_core_news_lg", "lt_core_news_md", "lt_core_news_sm"], # Lithuanian
+        "mk": ["mk_core_news_lg", "mk_core_news_md", "mk_core_news_sm"], # Macedonian
+        "nb": ["nb_core_news_lg", "nb_core_news_md", "nb_core_news_sm"], # Norwegian Bokmål
+        "nl": ["nl_core_news_lg", "nl_core_news_md", "nl_core_news_sm"], # Dutch
+        "pl": ["pl_core_news_lg", "pl_core_news_md", "pl_core_news_sm"], # Polish
+        "pt": ["pt_core_news_lg", "pt_core_news_md", "pt_core_news_sm"], # Portuguese
+        "ro": ["ro_core_news_lg", "ro_core_news_md", "ro_core_news_sm"], # Romanian
+        "ru": ["ru_core_news_lg", "ru_core_news_md", "ru_core_news_sm"], # Russian
+        "sl": ["sl_core_news_lg", "sl_core_news_md", "sl_core_news_sm"], # Slovenian
+        "sv": ["sv_core_news_lg", "sv_core_news_md", "sv_core_news_sm"], # Swedish
+        "uk": ["uk_core_news_lg", "uk_core_news_md", "uk_core_news_sm"], # Ukrainian
+        "zh": ["zh_core_web_lg", "zh_core_web_mod", "zh_core_web_sm", "zh_core_web_trf"], # Chinese
+
+        # Multilingual NER
+        "xx": ["xx_ent_wiki_sm"],
+    }
+
+    if lang_norm in candidates_by_lang:
+        candidates = candidates_by_lang[lang_norm]
+    elif base_lang in candidates_by_lang:
+        candidates = candidates_by_lang[base_lang]
+    else:
+        # Fallback to multilingual if unknown
+        candidates = candidates_by_lang["xx"]
+
+    last_error = None
+    for candidate in candidates:
+        # Try importable package first (fast-path when installed as a package)
+        try:
+            module = __import__(candidate)
+            print(f"Successfully imported spaCy model: {candidate}")
+            return module.load()
+        except Exception as e:
+            last_error = e
+
+        # Try spacy.load if package is linked/installed
+        try:
+            nlp = spacy.load(candidate)
+            print(f"Successfully loaded spaCy model via spacy.load: {candidate}")
+            return nlp
+        except Exception as e:
+            last_error = e
+
+        # Check if model is already downloaded before attempting to download
+        try:
+            # Try to load the model to see if it's already available
+            nlp = spacy.load(candidate)
+            print(f"Model {candidate} is already available, skipping download")
+            return nlp
+        except OSError:
+            # Model not found, proceed with download
+            pass
+        except Exception as e:
+            last_error = e
+            continue
+
+        # Attempt to download then load
+        try:
+            print(f"Downloading spaCy model: {candidate}")
+            download(candidate)
+            nlp = spacy.load(candidate)
+            print(f"Successfully downloaded and loaded spaCy model: {candidate}")
+            return nlp
+        except Exception as e:
+            last_error = e
+            continue
+
+    raise RuntimeError(f"Failed to load spaCy model for language '{language}'. Last error: {last_error}")
+
+# Language-aware spaCy model loader
+def _normalize_language_input(language: str) -> str:
+    return language.strip().lower().replace("-", "_")
+
+# Update the global variables to use the new function
+ACTIVE_LANGUAGE_CODE = _base_language_code(DEFAULT_LANGUAGE)
+nlp = None # Placeholder, will be loaded in the create_nlp_analyser function below #load_spacy_model(DEFAULT_LANGUAGE)
+
+def get_tesseract_lang_code(short_code:str):
+    """
+    Maps a two-letter language code to the corresponding Tesseract OCR code.
+
+    Args:
+        short_code (str): The two-letter language code (e.g., "en", "de").
+
+    Returns:
+        str or None: The Tesseract language code (e.g., "eng", "deu"),
+                     or None if no mapping is found.
+    """
+    # Mapping from 2-letter codes to Tesseract 3-letter codes
+    # Based on ISO 639-2/T codes.
+    lang_map = {
+        "en": "eng",
+        "de": "deu",
+        "fr": "fra",
+        "es": "spa",
+        "it": "ita",
+        "nl": "nld",
+        "pt": "por",
+        "zh": "chi_sim", # Mapping to Simplified Chinese by default
+        "ja": "jpn",
+        "ko": "kor",
+        "lt": "lit",
+        "mk": "mkd",
+        "nb": "nor",
+        "pl": "pol",
+        "ro": "ron",
+        "ru": "rus",
+        "sl": "slv",
+        "sv": "swe",
+        "uk": "ukr"
+    }
+
+    return lang_map.get(short_code)
+
+def download_tesseract_lang_pack(short_lang_code:str, tessdata_dir=TESSERACT_FOLDER + "/tessdata"):
+    """
+    Downloads a Tesseract language pack to a local directory.
+
+    Args:
+        lang_code (str): The short code for the language (e.g., "eng", "fra").
+        tessdata_dir (str, optional): The directory to save the language pack.
+                                      Defaults to "tessdata".
+    """
+
+    # Create the directory if it doesn't exist
+    if not os.path.exists(tessdata_dir):
+        os.makedirs(tessdata_dir)
+
+    # Get the Tesseract language code
+    lang_code = get_tesseract_lang_code(short_lang_code)
+
+    if lang_code is None:
+        raise ValueError(f"Language code {short_lang_code} not found in Tesseract language map")
+
+    # Set the local file path
+    file_path = os.path.join(tessdata_dir, f"{lang_code}.traineddata")
+
+    # Check if the file already exists
+    if os.path.exists(file_path):
+        print(f"Language pack {lang_code}.traineddata already exists at {file_path}")
+        return file_path
+
+    # Construct the URL for the language pack
+    url = f"https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/{lang_code}.traineddata"
+
+    # Download the file
+    try:
+        response = requests.get(url, stream=True)
+        response.raise_for_status() # Raise an exception for bad status codes
+
+        with open(file_path, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)

+        print(f"Successfully downloaded {lang_code}.traineddata to {file_path}")
+        return file_path

+    except requests.exceptions.RequestException as e:
+        print(f"Error downloading {lang_code}.traineddata: {e}")
+        return None

+#### Custom recognisers
 def custom_word_list_recogniser(custom_list:List[str]=[]):
     # Create regex pattern, handling quotes carefully

@@ -297,7 +504,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista

     return all_start_positions, all_end_positions

-
 class CustomWordFuzzyRecognizer(EntityRecognizer):
     def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
         super().__init__(supported_entities=supported_entities)

@@ -332,10 +538,79 @@ custom_list_default = []
 custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)


-#
-...
+# Pass the loaded model to the new LoadedSpacyNlpEngine
+loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp, language_code = ACTIVE_LANGUAGE_CODE)
+
+
+def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str] = None,
+                        spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None):
+    """
+    Create an nlp_analyser object based on the specified language input.
+
+    Args:
+        language (str): Language code (e.g., "en", "de", "fr", "es", etc.)
+        custom_list (List[str], optional): List of custom words to recognize. Defaults to None.
+        spelling_mistakes_max (int, optional): Maximum number of spelling mistakes for fuzzy matching. Defaults to 1.
+        search_whole_phrase (bool, optional): Whether to search for whole phrases or individual words. Defaults to True.
+
+    Returns:
+        AnalyzerEngine: Configured nlp_analyser object with custom recognizers
+    """
+    print("existing_nlp_analyser:", existing_nlp_analyser)
+
+    if existing_nlp_analyser is None:
+        pass
+    else:
+        if existing_nlp_analyser.supported_languages[0] == language:
+            nlp_analyser = existing_nlp_analyser
+            print(f"Using existing nlp_analyser for {language}")
+            return nlp_analyser
+
+    # Load spaCy model for the specified language
+    nlp_model = load_spacy_model(language)
+
+    # Get base language code
+    base_lang_code = _base_language_code(language)
+
+    # Create custom recognizers
+    if custom_list is None:
+        custom_list = []
+
+    custom_recogniser = custom_word_list_recogniser(custom_list)
+    custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(
+        supported_entities=["CUSTOM_FUZZY"],
+        custom_list=custom_list,
+        spelling_mistakes_max=spelling_mistakes_max,
+        search_whole_phrase=search_whole_phrase
+    )
+
+    # Create NLP engine with loaded model
+    loaded_nlp_engine = LoadedSpacyNlpEngine(
+        loaded_spacy_model=nlp_model,
+        language_code=base_lang_code
+    )
+
+    # Create analyzer engine
+    nlp_analyser = AnalyzerEngine(
+        nlp_engine=loaded_nlp_engine,
+        default_score_threshold=score_threshold,
+        supported_languages=[base_lang_code],
+        log_decision_process=False,
+    )
+
+    # Add custom recognizers to nlp_analyser
+    nlp_analyser.registry.add_recognizer(custom_recogniser)
+    nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
+
+    # Add language-specific recognizers for English
+    if base_lang_code == "en":
+        nlp_analyser.registry.add_recognizer(street_recogniser)
+        nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
+        nlp_analyser.registry.add_recognizer(titles_recogniser)
+
+    return nlp_analyser
+
+# Create the default nlp_analyser using the new function
+nlp_analyser = create_nlp_analyser(DEFAULT_LANGUAGE)
+
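
Taken together, the language-aware spaCy loader, the Tesseract language-pack download and create_nlp_analyser are the pieces through which a selected language reaches OCR and PII detection. A short illustrative sketch of how they might be combined, not part of the commit itself; the chosen language is a placeholder value and network access is assumed for the model and traineddata downloads:

    from tools.load_spacy_model_custom_recognisers import (
        create_nlp_analyser,
        download_tesseract_lang_pack,
    )

    language = "de"  # hypothetical selection from the language dropdown

    # Fetch the matching traineddata file so local Tesseract OCR can run in German
    traineddata_path = download_tesseract_lang_pack(language)

    # Build a Presidio AnalyzerEngine backed by a German spaCy pipeline;
    # the model is downloaded on first use if it is not already installed
    nlp_analyser = create_nlp_analyser(language)

    # Entity coverage depends on the loaded model, so inspect what is available
    print(nlp_analyser.registry.get_supported_entities(languages=[language]))
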