Commit 4a5cee5 (unverified) · committed by Sean Pedrick-Case
Parents: 5203951 818efbc

Merge pull request #14 from seanpedrick-case/dev
Dockerfile CHANGED
@@ -1,5 +1,5 @@
# Stage 1: Build dependencies and download models
- FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder
+ FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder

# Install system dependencies. Need to specify -y for poppler to get it to install
RUN apt-get update \
@@ -27,7 +27,7 @@ COPY lambda_entrypoint.py .
COPY entrypoint.sh .

# Stage 2: Final runtime image
- FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
+ FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm

# Define a build argument with a default value
ARG APP_MODE=gradio
@@ -52,11 +52,7 @@ RUN apt-get update \
RUN useradd -m -u 1000 user

# Create required directories
- RUN mkdir -p /home/user/app/output \
- && mkdir -p /home/user/app/input \
- && mkdir -p /home/user/app/tld \
- && mkdir -p /home/user/app/logs \
- && mkdir -p /home/user/app/config \
+ RUN mkdir -p /home/user/app/{output,input,tld,logs,usage,feedback,config} \
&& chown -R user:user /home/user/app

# Copy installed packages from builder stage
@@ -73,10 +69,11 @@ RUN chmod +x /entrypoint.sh
# Switch to the "user" user
USER user

+ ENV APP_HOME=/home/user
+
# Set environmental variables
- ENV HOME=/home/user \
- PATH=/home/user/.local/bin:$PATH \
- PYTHONPATH=/home/user/app \
+ ENV PATH=$APP_HOME/.local/bin:$PATH \
+ PYTHONPATH=$APP_HOME/app \
PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
GRADIO_ALLOW_FLAGGING=never \
@@ -84,15 +81,17 @@ ENV HOME=/home/user \
GRADIO_SERVER_NAME=0.0.0.0 \
GRADIO_SERVER_PORT=7860 \
GRADIO_ANALYTICS_ENABLED=False \
- GRADIO_THEME=huggingface \
- TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
+ TLDEXTRACT_CACHE=$APP_HOME/app/tld/.tld_set_snapshot \
SYSTEM=spaces

# Set the working directory to the user's home directory
- WORKDIR $HOME/app
+ WORKDIR $APP_HOME/app

# Copy the app code to the container
- COPY --chown=user . $HOME/app
+ COPY --chown=user . $APP_HOME/app
+
+ # Ensure permissions are really user:user again after copying
+ RUN chown -R user:user $APP_HOME/app && chmod -R u+rwX $APP_HOME/app

ENTRYPOINT [ "/entrypoint.sh" ]
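One caveat worth flagging on the consolidated mkdir line above: Docker executes RUN commands with /bin/sh -c, and on Debian-based slim images /bin/sh is dash, which does not perform brace expansion. Unless the shell is switched (e.g. SHELL ["/bin/bash", "-c"]), mkdir -p /home/user/app/{output,input,tld,logs,usage,feedback,config} creates a single literal directory named {output,input,tld,logs,usage,feedback,config} rather than seven directories. A minimal sketch of a POSIX-safe equivalent (the paths come from the diff; the rewrite itself is a suggestion, not part of this commit):

# Create required directories without relying on bash brace expansion
RUN mkdir -p /home/user/app/output /home/user/app/input /home/user/app/tld \
    /home/user/app/logs /home/user/app/usage /home/user/app/feedback /home/user/app/config \
 && chown -R user:user /home/user/app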
app.py CHANGED
@@ -1,28 +1,25 @@
import os
+ import logging
import pandas as pd
import gradio as gr
from gradio_image_annotation import image_annotator

- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS, TEXTRACT_BULK_ANALYSIS_BUCKET, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
from tools.aws_functions import upload_file_to_s3, download_file_from_s3
from tools.file_redaction import choose_and_run_redactor
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
- from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr
+ from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
from tools.data_anonymise import anonymise_data_files
from tools.auth import authenticate_user
from tools.load_spacy_model_custom_recognisers import custom_entities
from tools.custom_csvlogger import CSVLogger_custom
from tools.find_duplicate_pages import identify_similar_pages
+ from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id

# Suppress downcasting warnings
pd.set_option('future.no_silent_downcasting', True)

- add_folder_to_path(TESSERACT_FOLDER)
- add_folder_to_path(POPPLER_FOLDER)
-
- ensure_output_folder_exists(OUTPUT_FOLDER)
-
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']

full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
@@ -58,14 +55,16 @@ with app:

# Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
pdf_doc_state = gr.State([])
- all_image_annotations_state = gr.State([])
+ all_image_annotations_state = gr.State([])


all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)

session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
+ host_name_textbox = gr.Textbox(label= "host_name_textbox", value=HOST_NAME, visible=False)
s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
+ session_output_folder_textbox = gr.Textbox(value = SESSION_OUTPUT_FOLDER, label="session_output_folder_textbox", visible=False)
output_folder_textbox = gr.Textbox(value = OUTPUT_FOLDER, label="output_folder_textbox", visible=False)
input_folder_textbox = gr.Textbox(value = INPUT_FOLDER, label="input_folder_textbox", visible=False)
@@ -133,6 +132,7 @@ with app:

clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
+ prepare_for_review_bool_false = gr.Checkbox(label="prepare_for_review_bool_false", value=False, visible=False)
prepare_images_bool_false = gr.Checkbox(label="prepare_images_bool_false", value=False, visible=False)

## Settings page variables
@@ -147,20 +147,31 @@ with app:
# S3 settings for default allow list load
s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=DOCUMENT_REDACTION_BUCKET, visible=False)
s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
- default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=ALLOW_LIST_PATH, visible=False)
+ default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
+
+ s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_BULK_ANALYSIS_BUCKET, visible=False)
+ s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
+ s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
+ successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
+
+ load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
+ s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
+ local_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)

s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
- default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=COST_CODES_PATH, visible=False)
+ default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)
enforce_cost_code_textbox = gr.Textbox(label = "Enforce cost code textbox", value=ENFORCE_COST_CODES, visible=False)
+ default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)

# Base tables that are not modified subsequent to load
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
+ all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)

# Duplicate page detection
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
- duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
+ duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)

# Tracking variables for current page (not visible)
current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
@@ -168,7 +179,7 @@ with app:

# Placeholders for elements that may be made visible later below depending on environment variables
cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
- cost_code_choice_drop = gr.Dropdown(value="", label="Choose cost code for analysis", choices=[], allow_custom_value=True, visible=False)
+ cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)

textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
@@ -177,6 +188,22 @@ with app:

only_extract_text_radio = gr.Checkbox(value=False, label="Only extract text (no redaction)", visible=False)

+ # Textract API call placeholders in case option not selected in config
+
+ job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=False)
+ send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=False)
+
+ job_id_textbox = gr.Textbox(label = "Latest job ID for bulk document analysis", value='', visible=False)
+ check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
+ job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
+ textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
+ selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
+ is_a_textract_api_call = gr.Checkbox(value=False, label="is_a_textract_api_call", visible=False)
+ job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
+
+ textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
+
###
# UI DESIGN
###
@@ -199,32 +226,21 @@ with app:
with gr.Accordion("Redact document", open = True):
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)

- text_extract_method_radio = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
-
- with gr.Row(equal_height=True):
- pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
+ text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Go to Redaction settings - AWS Textract options to remove signature detection.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])

with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
- handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])

- if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
- with gr.Accordion("AWS Textract bulk document API call", open = False, visible=True):
- with gr.Row(equal_height=True):
- job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=True)
- send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=True)
- with gr.Row(equal_height=True):
- check_state_of_textract_api__call_btn = gr.Button("Check state of Textract job", variant="secondary", visible=True)
- job_current_status = gr.Textbox(value="", label="job_current_status", visible=True)
- with gr.Row(equal_height=True):
- textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=True)
+ with gr.Row(equal_height=True):
+ pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])

if SHOW_COSTS == "True":
with gr.Accordion("Estimated costs and time taken", open = True, visible=True):
- with gr.Row(equal_height=True):
- textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
- total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
- estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0.00, precision=2, visible=True)
- estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
+ with gr.Row(equal_height=True):
+ textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
+ total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
+ estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
+ estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)

if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
with gr.Accordion("Apply cost code", open = True, visible=True):
@@ -232,19 +248,32 @@ with app:
cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
with gr.Column():
reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
- cost_code_choice_drop = gr.Dropdown(value="", label="Choose cost code for analysis", choices=[], allow_custom_value=True, visible=True)
+ cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
+
+ if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
+ with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
+ with gr.Row(equal_height=True):
+ gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
+ with gr.Row(equal_height=True):
+ send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract API call", variant="primary", visible=True)
+ with gr.Row(equal_height=False):
+ with gr.Column(scale=2):
+ textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(6,'fixed'), static_columns=[0,1,2,3,4,5])
+ with gr.Column(scale=1):
+ job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
+ check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
+ with gr.Row():
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
+ textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)

gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)

with gr.Row():
- output_summary = gr.Textbox(label="Output summary", scale=1)
+ redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
output_file = gr.File(label="Output files", scale = 2)#, height=file_input_height)
latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)

- with gr.Row():
- convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
-
# Feedback elements are invisible until revealed by redaction action
pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
pdf_feedback_radio = gr.Radio(label = "Quality of results", choices=["The results were good", "The results were not good"], visible=False)
@@ -263,21 +292,16 @@ with app:
annotate_zoom_in = gr.Button("Zoom in", visible=False)
annotate_zoom_out = gr.Button("Zoom out", visible=False)
with gr.Row():
- clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)

- with gr.Row(equal_height=True):
with gr.Column(scale=2):
with gr.Row(equal_height=True):
annotation_last_page_button = gr.Button("Previous page", scale = 4)
- annotate_current_page = gr.Number(value=1, label="Current page", precision=0, scale = 2, min_width=50)
- annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
annotation_next_page_button = gr.Button("Next page", scale = 4)
- with gr.Column(scale=1):
- annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
-

- with gr.Row():
- with gr.Column(scale=2):
zoom_str = str(annotator_zoom_number) + '%'

annotator = image_annotator(
@@ -297,7 +321,15 @@ with app:
handles_cursor=True,
interactive=False
)
with gr.Column(scale=1):
update_current_page_redactions_btn = gr.Button(value="Save changes on current page to file", variant="primary")
with gr.Accordion("Search suggested redactions", open=True):
with gr.Row(equal_height=True):
@@ -318,17 +350,7 @@ with app:

with gr.Accordion("Search all extracted text", open=True):
all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
- reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
-
- with gr.Row():
- with gr.Column(scale=2):
- with gr.Row(equal_height=True):
- annotation_last_page_button_bottom = gr.Button("Previous page", scale = 4)
- annotate_current_page_bottom = gr.Number(value=1, label="Current page", precision=0, interactive=True, scale = 2, min_width=50)
- annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
- annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
- with gr.Column(scale=1):
- blank_markdown_bot = gr.Markdown(value="", label="")

with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
@@ -342,8 +364,8 @@ with app:
with gr.Accordion("Identify duplicate pages to redact", open = True):
in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
with gr.Row():
- duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
- find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 5)

duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])

@@ -432,7 +454,9 @@ with app:
all_output_files_btn = gr.Button("Click here to view all output files", variant="secondary")
all_output_files = gr.File(label="All files in output folder", file_count='multiple', file_types=['.csv'], interactive=False)

### UI INTERACTION ###

###
# PDF/IMAGE REDACTION
@@ -440,7 +464,7 @@ with app:
# Recalculate estimated costs based on changes to inputs
if SHOW_COSTS == 'True':
# Calculate costs
- total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
text_extract_method_radio.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
@@ -460,31 +484,42 @@ with app:
cost_code_dataframe.select(df_select_callback_cost, inputs=[cost_code_dataframe], outputs=[cost_code_choice_drop])
reset_cost_code_dataframe_button.click(reset_base_dataframe, inputs=[cost_code_dataframe_base], outputs=[cost_code_dataframe])

in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])

# Run redaction function
- document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, output_summary]).\
- success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop]).\
- success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])

# If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])

# If a file has been completed, the function will continue onto the next document
- latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])

# If the line level ocr results are changed by load in by user or by a new redaction task, replace the ocr results displayed in the table
all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])

###
# REVIEW PDF REDACTIONS
@@ -493,7 +528,7 @@ with app:
# Upload previous files for modifying redactions
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base], api_name="prepare_doc").\
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])

# Page number controls
@@ -501,11 +536,11 @@ with app:
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])

- annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page])
- annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page])

- annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom, annotate_current_page_bottom])
- annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page_bottom, annotate_current_page_bottom])

annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page])

@@ -548,16 +583,16 @@ with app:

# Review OCR text buttom
all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
- reset_all_ocr_results_btn.click(reset_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])

# Convert review file to xfdf Adobe format
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])

# Convert xfdf Adobe file back to review_file.csv
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
- success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)

###
@@ -601,11 +636,20 @@ with app:
###

# Get connection details on app load
- app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox])

# If relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting S3 path, need to local path to give a download location
- if GET_DEFAULT_ALLOW_LIST == "True" and ALLOW_LIST_PATH:
- if not os.path.exists(ALLOW_LIST_PATH) and S3_ALLOW_LIST_PATH:
app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
print("Successfully loaded allow list from S3")
@@ -615,20 +659,24 @@ with app:
else: print("Could not load in default allow list")

# If relevant environment variable is set, load in the default cost code file from S3 or locally
- if GET_COST_CODES == "True" and COST_CODES_PATH:
- if not os.path.exists(COST_CODES_PATH) and S3_COST_CODES_PATH:
app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
- success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
print("Successfully loaded cost codes from S3")
elif os.path.exists(COST_CODES_PATH):
print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
- app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
else: print("Could not load in cost code data")

# Log usernames and times of access to file (to know who is using the app when running on AWS)
access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
- access_callback.setup([session_hash_textbox], ACCESS_LOGS_FOLDER)
- session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

# User submitted feedback for pdf redactions
@@ -647,16 +695,23 @@ with app:
usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)

if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
- usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
- latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], None, preprocess=False).\
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
else:
- usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
- latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], None, preprocess=False).\
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

- if __name__ == "__main__":

if RUN_DIRECT_MODE == "0":

if os.environ['COGNITO_AUTH'] == "1":
@@ -667,7 +722,7 @@ if __name__ == "__main__":
else:
from tools.cli_redact import main

- main(first_loop_state, latest_file_completed=0, output_summary="", output_file_list=None,
log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
 
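The cost-estimation wiring in the diff registers the same calculate_aws_costs callback on every input that can affect the price. A minimal self-contained sketch of that Gradio pattern, with hypothetical simplified names and the per-1,000-page rates quoted in the UI text above (illustrative only, not the repo's actual function):

import gradio as gr

def estimate_cost(pages: float, signature_detection: bool) -> float:
    # Assumed illustrative rates: $3.50 per 1,000 pages with signature
    # detection, $1.50 without, as quoted in the radio button label
    rate = 3.50 if signature_detection else 1.50
    return round(pages * rate / 1000, 2)

with gr.Blocks() as demo:
    pages = gr.Number(label="Total page count", value=0)
    signatures = gr.Checkbox(label="Signature detection", value=True)
    cost = gr.Number(label="Estimated cost ($)", value=0.00, precision=2)

    # Any change to any input recomputes the single output, mirroring the
    # repeated .change(calculate_aws_costs, ...) registrations in app.py
    for component in (pages, signatures):
        component.change(estimate_cost, inputs=[pages, signatures], outputs=[cost])

demo.launch()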
 
1
  import os
2
+ import logging
3
  import pandas as pd
4
  import gradio as gr
5
  from gradio_image_annotation import image_annotator
6
 
7
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS, TEXTRACT_BULK_ANALYSIS_BUCKET, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH
8
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
9
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3
10
  from tools.file_redaction import choose_and_run_redactor
11
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
12
+ from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
13
  from tools.data_anonymise import anonymise_data_files
14
  from tools.auth import authenticate_user
15
  from tools.load_spacy_model_custom_recognisers import custom_entities
16
  from tools.custom_csvlogger import CSVLogger_custom
17
  from tools.find_duplicate_pages import identify_similar_pages
18
+ from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id
19
 
20
  # Suppress downcasting warnings
21
  pd.set_option('future.no_silent_downcasting', True)
22
 
 
 
 
 
 
23
  chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
24
 
25
  full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
 
55
 
56
  # Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
57
  pdf_doc_state = gr.State([])
58
+ all_image_annotations_state = gr.State([])
59
 
60
 
61
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
62
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
63
 
64
  session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
65
+ host_name_textbox = gr.Textbox(label= "host_name_textbox", value=HOST_NAME, visible=False)
66
  s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
67
+ session_output_folder_textbox = gr.Textbox(value = SESSION_OUTPUT_FOLDER, label="session_output_folder_textbox", visible=False)
68
  output_folder_textbox = gr.Textbox(value = OUTPUT_FOLDER, label="output_folder_textbox", visible=False)
69
  input_folder_textbox = gr.Textbox(value = INPUT_FOLDER, label="input_folder_textbox", visible=False)
70
 
 
132
 
133
  clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
134
  prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
135
+ prepare_for_review_bool_false = gr.Checkbox(label="prepare_for_review_bool_false", value=False, visible=False)
136
  prepare_images_bool_false = gr.Checkbox(label="prepare_images_bool_false", value=False, visible=False)
137
 
138
  ## Settings page variables
 
147
  # S3 settings for default allow list load
148
  s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=DOCUMENT_REDACTION_BUCKET, visible=False)
149
  s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
150
+ default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
151
+
152
+ s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_BULK_ANALYSIS_BUCKET, visible=False)
153
+ s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
154
+ s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
155
+ successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
156
+
157
+ load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
158
+ s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
159
+ local_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
160
 
161
  s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
162
+ default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)
163
  enforce_cost_code_textbox = gr.Textbox(label = "Enforce cost code textbox", value=ENFORCE_COST_CODES, visible=False)
164
+ default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)
165
 
166
  # Base tables that are not modified subsequent to load
167
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
168
  all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
169
+ all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
170
  cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
171
 
172
  # Duplicate page detection
173
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
174
+ duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
175
 
176
  # Tracking variables for current page (not visible)
177
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
 
179
 
180
  # Placeholders for elements that may be made visible later below depending on environment variables
181
  cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
182
+ cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
183
 
184
  textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
185
  total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
 
188
 
189
  only_extract_text_radio = gr.Checkbox(value=False, label="Only extract text (no redaction)", visible=False)
190
 
191
+ # Textract API call placeholders in case option not selected in config
192
+
193
+ job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=False)
194
+ send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=False)
195
+
196
+ job_id_textbox = gr.Textbox(label = "Latest job ID for bulk document analysis", value='', visible=False)
197
+ check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
198
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
199
+ job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
200
+ textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
201
+ selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
202
+ is_a_textract_api_call = gr.Checkbox(value=False, label="is_a_textract_api_call", visible=False)
203
+ job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
204
+
205
+ textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
206
+
207
  ###
208
  # UI DESIGN
209
  ###
 
226
  with gr.Accordion("Redact document", open = True):
227
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
228
 
229
+ text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Go to Redaction settings - AWS Textract options to remove signature detection.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
 
 
 
230
 
231
  with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
232
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
233
 
234
+ with gr.Row(equal_height=True):
235
+ pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
 
 
 
 
 
 
 
 
236
 
237
  if SHOW_COSTS == "True":
238
  with gr.Accordion("Estimated costs and time taken", open = True, visible=True):
239
+ with gr.Row(equal_height=True):
240
+ textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
241
+ total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
242
+ estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
243
+ estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
244
 
245
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
246
  with gr.Accordion("Apply cost code", open = True, visible=True):
 
248
  cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
249
  with gr.Column():
250
reset_cost_code_dataframe_button = gr.Button(value="Reset cost code table filter")
251
+ cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
252
+
253
+ if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
254
+ with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
255
+ with gr.Row(equal_height=True):
256
+ gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
257
+ with gr.Row(equal_height=True):
258
+ send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract API call", variant="primary", visible=True)
259
+ with gr.Row(equal_height=False):
260
+ with gr.Column(scale=2):
261
+ textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(6,'fixed'), static_columns=[0,1,2,3,4,5])
262
+ with gr.Column(scale=1):
263
+ job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
264
+ check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
265
+ with gr.Row():
266
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
267
+ textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
268
 
269
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
270
  document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
271
 
272
  with gr.Row():
273
+ redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
274
  output_file = gr.File(label="Output files", scale = 2)#, height=file_input_height)
275
  latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
276
 
 
 
 
277
  # Feedback elements are invisible until revealed by redaction action
278
  pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
279
  pdf_feedback_radio = gr.Radio(label = "Quality of results", choices=["The results were good", "The results were not good"], visible=False)
 
292
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
293
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
294
  with gr.Row():
295
+ clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
296
 
297
+ with gr.Row():
298
  with gr.Column(scale=2):
299
  with gr.Row(equal_height=True):
300
  annotation_last_page_button = gr.Button("Previous page", scale = 4)
301
+ annotate_current_page = gr.Number(value=0, label="Current page", precision=0, scale = 2, min_width=50)
302
+ annotate_max_pages = gr.Number(value=0, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
303
  annotation_next_page_button = gr.Button("Next page", scale = 4)
 
 
 
304
 
 
 
305
  zoom_str = str(annotator_zoom_number) + '%'
306
 
307
  annotator = image_annotator(
 
321
  handles_cursor=True,
322
  interactive=False
323
  )
324
+
325
+ with gr.Row(equal_height=True):
326
+ annotation_last_page_button_bottom = gr.Button("Previous page", scale = 4)
327
+ annotate_current_page_bottom = gr.Number(value=0, label="Current page", precision=0, interactive=True, scale = 2, min_width=50)
328
+ annotate_max_pages_bottom = gr.Number(value=0, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
329
+ annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
330
+
331
  with gr.Column(scale=1):
332
+ annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
333
  update_current_page_redactions_btn = gr.Button(value="Save changes on current page to file", variant="primary")
334
  with gr.Accordion("Search suggested redactions", open=True):
335
  with gr.Row(equal_height=True):
 
350
 
351
  with gr.Accordion("Search all extracted text", open=True):
352
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
353
+ reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
 
 
 
 
 
 
 
 
 
 
354
 
355
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
356
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
 
364
  with gr.Accordion("Identify duplicate pages to redact", open = True):
365
  in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
366
  with gr.Row():
367
+ duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale=1)
368
+ find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
369
 
370
  duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
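For orientation, the duplicate-page handler itself is not part of this diff. A minimal sketch of how pages from the uploaded 'ocr_output.csv' files could be compared against the similarity threshold above, using the scikit-learn already pinned in requirements.txt (the function name and grouping logic are assumptions, not the app's actual implementation; the 'page'/'text' columns match the OCR results table defined earlier):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_duplicate_pages_sketch(ocr_df: pd.DataFrame, threshold: float = 0.9) -> pd.DataFrame:
    # Concatenate OCR lines per page, then compare pages pairwise by TF-IDF cosine similarity
    page_texts = ocr_df.groupby("page")["text"].apply(" ".join)
    tfidf = TfidfVectorizer().fit_transform(page_texts)
    sims = cosine_similarity(tfidf)
    pages = page_texts.index.tolist()
    matches = [(pages[i], pages[j], round(float(sims[i, j]), 3))
               for i in range(len(pages)) for j in range(i + 1, len(pages))
               if sims[i, j] >= threshold]
    return pd.DataFrame(matches, columns=["page_a", "page_b", "similarity"])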
371
 
 
454
  all_output_files_btn = gr.Button("Click here to view all output files", variant="secondary")
455
  all_output_files = gr.File(label="All files in output folder", file_count='multiple', file_types=['.csv'], interactive=False)
456
 
457
+ ###
458
  ### UI INTERACTION ###
459
+ ###
460
 
461
  ###
462
  # PDF/IMAGE REDACTION
 
464
  # Recalculate estimated costs based on changes to inputs
465
  if SHOW_COSTS == 'True':
466
  # Calculate costs
467
+ total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
468
  text_extract_method_radio.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
469
  pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
470
  handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
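As a reference point for the wiring above, `calculate_aws_costs` is defined elsewhere in the repo. A hedged sketch of the arithmetic implied by the rates quoted in the UI labels (£2.66 per 1,000 Textract pages with signature detection, £1.14 without, and roughly £0.0075 per 10,000 Comprehend characters); the default option strings and the characters-per-page figure are assumptions, not the app's actual values:

def calculate_aws_costs_sketch(page_count, text_extract_method, handwrite_signature_options,
                               pii_method, textract_output_found, only_extract_text,
                               textract_option="AWS Textract", aws_pii_option="AWS Comprehend",
                               chars_per_page=3000):
    cost = 0.0
    # Textract is only charged when no existing output file was found
    if text_extract_method == textract_option and not textract_output_found:
        per_page = 2.66 / 1000 if "Extract signatures" in handwrite_signature_options else 1.14 / 1000
        cost += page_count * per_page
    # Comprehend is skipped when the user only wants text extraction
    if pii_method == aws_pii_option and not only_extract_text:
        cost += (page_count * chars_per_page / 10000) * 0.0075
    return round(cost, 2)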
 
484
  cost_code_dataframe.select(df_select_callback_cost, inputs=[cost_code_dataframe], outputs=[cost_code_choice_drop])
485
  reset_cost_code_dataframe_button.click(reset_base_dataframe, inputs=[cost_code_dataframe_base], outputs=[cost_code_dataframe])
486
 
487
+ cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
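A plausible shape for the dropdown-select helper wired here, shown purely for illustration (that the cost code sits in the first column is an assumption):

import pandas as pd

def update_cost_code_df_from_dropdown_sketch(choice: str, base_df: pd.DataFrame) -> pd.DataFrame:
    if not choice:
        return base_df
    # Keep rows whose first column (the cost code) matches the chosen value
    return base_df[base_df.iloc[:, 0].astype(str) == str(choice)]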
488
+
489
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
490
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
491
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
492
 
493
  # Run redaction function
494
+ document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
495
+ success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
496
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
497
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
498
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
499
 
500
# If the app has completed a batch of pages, it will rerun the redaction process until all pages in the document are finished
501
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
502
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
503
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
504
 
505
# If a file has been completed, the function will continue on to the next document
506
+ latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
507
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
508
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
509
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
510
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
511
 
512
# If the line-level OCR results change (loaded in by the user or produced by a new redaction task), refresh the OCR results displayed in the table
513
  all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
514
+
515
+ # Send whole document to Textract for text extraction
516
+ send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call])
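`analyse_document_with_textract_api` is defined elsewhere in the repo; the asynchronous boto3 calls it presumably wraps look roughly like this sketch (the API names and parameters are real boto3 Textract calls, while the bucket/key handling and the preceding S3 upload step are assumptions):

import boto3

def start_bulk_textract_job_sketch(bucket: str, input_key: str, output_prefix: str,
                                   detect_signatures: bool = True) -> str:
    client = boto3.client("textract")
    if detect_signatures:
        # document_analysis job type, with the signatures feature enabled
        response = client.start_document_analysis(
            DocumentLocation={"S3Object": {"Bucket": bucket, "Name": input_key}},
            FeatureTypes=["SIGNATURES"],
            OutputConfig={"S3Bucket": bucket, "S3Prefix": output_prefix})
    else:
        # cheaper document_text_detection job type
        response = client.start_document_text_detection(
            DocumentLocation={"S3Object": {"Bucket": bucket, "Name": input_key}},
            OutputConfig={"S3Bucket": bucket, "S3Prefix": output_prefix})
    return response["JobId"]  # surfaced in job_id_textbox for later polling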
517
+
518
+ check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
519
+ success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
520
+ success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
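The polling side, sketched with the real get_document_* APIs; the actual helper also downloads the merged JSON from S3 into the output folder and updates the job table, which is omitted here:

import boto3

def poll_textract_job_sketch(job_id: str, job_type: str = "document_text_detection") -> dict:
    client = boto3.client("textract")
    getter = (client.get_document_analysis if job_type == "document_analysis"
              else client.get_document_text_detection)
    blocks, next_token = [], None
    while True:
        kwargs = {"JobId": job_id}
        if next_token:
            kwargs["NextToken"] = next_token
        response = getter(**kwargs)
        if response["JobStatus"] == "IN_PROGRESS":
            return {"JobStatus": "IN_PROGRESS", "Blocks": []}
        # SUCCEEDED (or FAILED/PARTIAL_SUCCESS): collect paginated blocks
        blocks.extend(response.get("Blocks", []))
        next_token = response.get("NextToken")
        if not next_token:
            return {"JobStatus": response["JobStatus"], "Blocks": blocks}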
521
+
522
+ textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
523
 
524
  ###
525
  # REVIEW PDF REDACTIONS
 
528
  # Upload previous files for modifying redactions
529
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
530
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
531
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base], api_name="prepare_doc").\
532
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
533
 
534
  # Page number controls
 
536
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
537
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
538
 
539
+ annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom])
540
+ annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom])
541
 
542
+ annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom])
543
+ annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom])
544
 
545
  annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page])
546
 
 
583
 
584
# Review OCR text button
585
  all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
586
+ reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
587
 
588
  # Convert review file to xfdf Adobe format
589
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
590
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder]).\
591
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
592
 
593
  # Convert xfdf Adobe file back to review_file.csv
594
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
595
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder]).\
596
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
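For the XFDF round-trip, a loose sketch of parsing Adobe comment annotations back into rows (the XFDF namespace and the rect/page attributes follow Adobe's spec; the output column names are assumptions, not the app's review-file schema):

import xml.etree.ElementTree as ET
import pandas as pd

def xfdf_to_dataframe_sketch(xfdf_path: str) -> pd.DataFrame:
    ns = {"xfdf": "http://ns.adobe.com/xfdf/"}
    root = ET.parse(xfdf_path).getroot()
    rows = []
    for annot in root.findall(".//xfdf:annots/*", ns):
        # rect is "x1,y1,x2,y2"; page is zero-based in XFDF
        rect = [float(v) for v in annot.get("rect", "0,0,0,0").split(",")]
        rows.append({"page": int(annot.get("page", 0)) + 1,
                     "xmin": rect[0], "ymin": rect[1],
                     "xmax": rect[2], "ymax": rect[3],
                     "label": annot.get("subject", "Redaction")})
    return pd.DataFrame(rows)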
597
 
598
  ###
 
636
  ###
637
 
638
  # Get connection details on app load
639
+
640
+ if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
641
+ app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder]).\
642
+ success(load_in_textract_job_details, inputs=[load_s3_bulk_textract_logs_bool, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[textract_job_detail_df])
643
+ else:
644
+ app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder])
645
+
646
+
647
+ # (Textract job details are loaded above when SHOW_BULK_TEXTRACT_CALL_OPTIONS is set to "True")
648
 
649
# If the relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting the S3 path, a local path is still needed to give a download location
650
+ if GET_DEFAULT_ALLOW_LIST == "True" and (ALLOW_LIST_PATH or S3_ALLOW_LIST_PATH):
651
+ if not os.path.exists(ALLOW_LIST_PATH) and S3_ALLOW_LIST_PATH and RUN_AWS_FUNCTIONS == "1":
652
+ print("Downloading allow list from S3")
653
  app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
654
  success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
655
  print("Successfully loaded allow list from S3")
 
659
  else: print("Could not load in default allow list")
660
 
661
  # If relevant environment variable is set, load in the default cost code file from S3 or locally
662
+ if GET_COST_CODES == "True" and (COST_CODES_PATH or S3_COST_CODES_PATH):
663
+ if not os.path.exists(COST_CODES_PATH) and S3_COST_CODES_PATH and RUN_AWS_FUNCTIONS == "1":
664
+ print("Downloading cost codes from S3")
665
  app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
666
+ success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
667
  print("Successfully loaded cost codes from S3")
668
  elif os.path.exists(COST_CODES_PATH):
669
  print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
670
+ app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
671
  else: print("Could not load in cost code data")
672
 
673
+ ### LOGGING
674
+
675
# Log usernames and times of access to a file (to know who is using the app when running on AWS)
676
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
677
+ access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
678
+
679
+ session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
680
  success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
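`CSVLogger_custom` is defined elsewhere in the repo; functionally, each flagged event appends a timestamped row to the dataset file, along these lines (a simplified stand-in for illustration, not the class itself):

import csv, os
from datetime import datetime

def log_access_row_sketch(dataset_path: str, session_hash: str, host_name: str) -> None:
    if os.path.dirname(dataset_path):
        os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
    new_file = not os.path.exists(dataset_path)
    with open(dataset_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if new_file:
            writer.writerow(["session_hash", "host_name", "timestamp"])  # header once
        writer.writerow([session_hash, host_name, datetime.now().isoformat()])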
681
 
682
  # User submitted feedback for pdf redactions
 
695
  usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
696
 
697
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
698
+ usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
699
+
700
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
701
+ success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
702
+
703
+ successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
704
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
705
  else:
706
+ usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
707
+
708
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
709
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
710
 
711
+ successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
712
+ success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
713
 
714
+ if __name__ == "__main__":
715
  if RUN_DIRECT_MODE == "0":
716
 
717
  if os.environ['COGNITO_AUTH'] == "1":
 
722
  else:
723
  from tools.cli_redact import main
724
 
725
+ main(first_loop_state, latest_file_completed=0, redaction_output_summary_textbox="", output_file_list=None,
726
  log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
727
current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(), chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
728
 
requirements.txt CHANGED
@@ -7,13 +7,13 @@ presidio_anonymizer==2.2.358
7
  presidio-image-redactor==0.0.56
8
  pikepdf==9.5.2
9
  pandas==2.2.3
10
- #nltk==3.9.1 # Not required
11
  scikit-learn==1.6.1
12
  spacy==3.8.4
13
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
#en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
- gradio==5.23.3
16
- boto3==1.37.17
 
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
19
  Faker==36.1.1
 
7
  presidio-image-redactor==0.0.56
8
  pikepdf==9.5.2
9
  pandas==2.2.3
 
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
#en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
14
+ #gradio==5.23.3 # Using latest version of Gradio 5.25.0 below as it fixes the table select issues while filtered
15
+ https://gradio-pypi-previews.s3.amazonaws.com/3e66dcbc9f3b1d106f5488fb1dca51f0787e6d79/gradio-5.25.0-py3-none-any.whl
16
+ boto3==1.37.29
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
19
  Faker==36.1.1
tools/aws_functions.py CHANGED
@@ -30,129 +30,101 @@ if RUN_AWS_FUNCTIONS == "1":
30
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
31
 
32
  print("Successfully assumed ARN role")
33
- print("Assumed Role ARN:", assumed_role_arn)
34
- print("Assumed Role Name:", assumed_role_name)
35
 
36
  except Exception as e:
37
  print("Could not get assumed role from STS:", e)
38
 
39
  # Download direct from S3 - requires login credentials
40
- def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str):
41
 
42
- s3 = boto3.client('s3', region_name=AWS_REGION)
43
- s3.download_file(bucket_name, key, local_file_path_and_name)
44
- print(f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}")
45
-
46
- def download_folder_from_s3(bucket_name:str, s3_folder:str, local_folder:str):
47
- """
48
- Download all files from an S3 folder to a local folder.
49
- """
50
- s3 = boto3.client('s3', region_name=AWS_REGION)
51
 
52
- # List objects in the specified S3 folder
53
- response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
54
-
55
- # Download each object
56
- for obj in response.get('Contents', []):
57
- # Extract object key and construct local file path
58
- object_key = obj['Key']
59
- local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
60
 
61
- # Create directories if necessary
62
- os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
63
 
64
- # Download the object
65
- try:
66
- s3.download_file(bucket_name, object_key, local_file_path)
67
- print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
68
  except Exception as e:
69
- print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
70
 
71
- def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, filenames:List[str]):
 
72
  """
73
- Download specific files from an S3 folder to a local folder.
74
  """
75
- s3 = boto3.client('s3', region_name=AWS_REGION)
76
-
77
- print("Trying to download file: ", filenames)
78
-
79
- if filenames == '*':
80
- # List all objects in the S3 folder
81
- print("Trying to download all files in AWS folder: ", s3_folder)
82
- response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
83
-
84
- print("Found files in AWS folder: ", response.get('Contents', []))
85
-
86
- filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
87
 
88
- print("Found filenames in AWS folder: ", filenames)
89
 
90
- for filename in filenames:
91
- object_key = os.path.join(s3_folder, filename)
92
- local_file_path = os.path.join(local_folder, filename)
93
 
94
- # Create directories if necessary
95
- os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
 
 
 
96
 
97
- # Download the object
98
- try:
99
- s3.download_file(bucket_name, object_key, local_file_path)
100
- print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
101
- except Exception as e:
102
- print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
103
 
104
- def load_data_from_aws(in_aws_keyword_file, aws_password:str="", bucket_name:str=DOCUMENT_REDACTION_BUCKET):
 
 
 
 
 
 
105
 
106
- temp_dir = tempfile.mkdtemp()
107
- local_address_stub = temp_dir + '/doc-redaction/'
108
- files = []
109
-
110
- if not 'LAMBETH_BOROUGH_PLAN_PASSWORD' in os.environ:
111
- out_message = "Can't verify password for dataset access. Do you have a valid AWS connection? Data not loaded."
112
- return files, out_message
113
-
114
- if aws_password:
115
- if "Lambeth borough plan" in in_aws_keyword_file and aws_password == os.environ['LAMBETH_BOROUGH_PLAN_PASSWORD']:
116
-
117
- s3_folder_stub = 'example-data/lambeth-borough-plan/latest/'
118
 
119
- local_folder_path = local_address_stub
 
120
 
121
- # Check if folder exists
122
- if not os.path.exists(local_folder_path):
123
- print(f"Folder {local_folder_path} does not exist! Making folder.")
124
 
125
- os.mkdir(local_folder_path)
126
 
127
- # Check if folder is empty
128
- if len(os.listdir(local_folder_path)) == 0:
129
- print(f"Folder {local_folder_path} is empty")
130
- # Download data
131
- download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')
132
 
133
- print("AWS data downloaded")
134
 
135
- else:
136
- print(f"Folder {local_folder_path} is not empty")
137
 
138
- #files = os.listdir(local_folder_stub)
139
- #print(files)
140
 
141
- files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]
 
 
142
 
143
- out_message = "Data successfully loaded from AWS"
144
- print(out_message)
145
 
146
- else:
147
- out_message = "Data not loaded from AWS"
148
- print(out_message)
149
- else:
150
- out_message = "No password provided. Please ask the data team for access if you need this."
151
- print(out_message)
152
 
153
- return files, out_message
154
 
155
- def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET):
156
  """
157
  Uploads a file from local machine to Amazon S3.
158
 
@@ -165,33 +137,44 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCU
165
  - Message as variable/printed to console
166
  """
167
  final_out_message = []
 
168
 
169
- s3_client = boto3.client('s3', region_name=AWS_REGION)
 
 
170
 
171
- if isinstance(local_file_paths, str):
172
- local_file_paths = [local_file_paths]
173
 
174
- for file in local_file_paths:
175
- if s3_client:
176
- #print(s3_client)
177
- try:
178
- # Get file name off file path
179
- file_name = os.path.basename(file)
180
 
181
- s3_key_full = s3_key + file_name
182
- print("S3 key: ", s3_key_full)
 
 
 
 
183
 
184
- s3_client.upload_file(file, s3_bucket, s3_key_full)
185
- out_message = "File " + file_name + " uploaded successfully!"
186
- print(out_message)
187
-
188
- except Exception as e:
189
- out_message = f"Error uploading file(s): {e}"
190
- print(out_message)
191
 
192
- final_out_message.append(out_message)
193
- final_out_message_str = '\n'.join(final_out_message)
 
 
 
 
 
194
 
195
- else: final_out_message_str = "Could not connect to AWS."
 
 
 
 
 
 
 
 
 
196
 
197
  return final_out_message_str
 
30
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
31
 
32
  print("Successfully assumed ARN role")
33
+ #print("Assumed Role ARN:", assumed_role_arn)
34
+ #print("Assumed Role Name:", assumed_role_name)
35
 
36
  except Exception as e:
37
  print("Could not get assumed role from STS:", e)
38
 
39
  # Download direct from S3 - requires login credentials
40
+ def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
41
 
42
+ if RUN_AWS_FUNCTIONS == "1":
 
 
 
 
 
 
 
 
43
 
44
+ try:
45
+ print("bucket_name:", bucket_name)
46
+ print("key:", key)
47
+ print("local_file_path_and_name:", local_file_path_and_name)
 
 
 
 
48
 
49
+ # Ensure the local directory exists
50
+ os.makedirs(os.path.dirname(local_file_path_and_name), exist_ok=True)
51
 
52
+ s3 = boto3.client('s3', region_name=AWS_REGION)
53
+ s3.download_file(bucket_name, key, local_file_path_and_name)
54
+ print(f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}")
 
55
  except Exception as e:
56
+ print("Could not download file:", key, "from s3 due to", e)
57
 
58
+
59
+ def download_folder_from_s3(bucket_name:str, s3_folder:str, local_folder:str, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
60
  """
61
+ Download all files from an S3 folder to a local folder.
62
  """
63
+ if RUN_AWS_FUNCTIONS == "1":
64
+ if bucket_name and s3_folder and local_folder:
 
 
 
 
 
 
 
 
 
 
65
 
66
+ s3 = boto3.client('s3', region_name=AWS_REGION)
67
 
68
+ # List objects in the specified S3 folder
69
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
 
70
 
71
+ # Download each object
72
+ for obj in response.get('Contents', []):
73
+ # Extract object key and construct local file path
74
+ object_key = obj['Key']
75
+ local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
76
 
77
+ # Create directories if necessary
78
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
 
 
 
 
79
 
80
+ # Download the object
81
+ try:
82
+ s3.download_file(bucket_name, object_key, local_file_path)
83
+ print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
84
+ except Exception as e:
85
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
86
+ else: print("One or more required variables are empty, could not download from S3")
87
 
88
+ def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, filenames:List[str], RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
89
+ """
90
+ Download specific files from an S3 folder to a local folder.
91
+ """
 
 
 
 
 
 
 
 
92
 
93
+ if RUN_AWS_FUNCTIONS == "1":
94
+ if bucket_name and s3_folder and local_folder and filenames:
95
 
96
+ s3 = boto3.client('s3', region_name=AWS_REGION)
 
 
97
 
98
+ print("Trying to download file: ", filenames)
99
 
100
+ if filenames == '*':
101
+ # List all objects in the S3 folder
102
+ print("Trying to download all files in AWS folder: ", s3_folder)
103
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
 
104
 
105
+ print("Found files in AWS folder: ", response.get('Contents', []))
106
 
107
+ filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
 
108
 
109
+ print("Found filenames in AWS folder: ", filenames)
 
110
 
111
+ for filename in filenames:
112
+ object_key = os.path.join(s3_folder, filename)
113
+ local_file_path = os.path.join(local_folder, filename)
114
 
115
+ # Create directories if necessary
116
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
117
 
118
+ # Download the object
119
+ try:
120
+ s3.download_file(bucket_name, object_key, local_file_path)
121
+ print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
122
+ except Exception as e:
123
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
124
 
125
+ else: print("One or more required variables are empty, could not download from S3")
126
 
127
+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
128
  """
129
  Uploads a file from local machine to Amazon S3.
130
 
 
137
  - Message as variable/printed to console
138
  """
139
  final_out_message = []
140
+ final_out_message_str = ""
141
 
142
+ if RUN_AWS_FUNCTIONS == "1":
143
+ try:
144
+ if s3_bucket and s3_key and local_file_paths:
145
 
146
+ s3_client = boto3.client('s3', region_name=AWS_REGION)
 
147
 
148
+ if isinstance(local_file_paths, str):
149
+ local_file_paths = [local_file_paths]
 
 
 
 
150
 
151
+ for file in local_file_paths:
152
+ if s3_client:
153
+ #print(s3_client)
154
+ try:
155
+ # Get file name off file path
156
+ file_name = os.path.basename(file)
157
 
158
+ s3_key_full = s3_key + file_name
159
+ print("S3 key: ", s3_key_full)
 
 
 
 
 
160
 
161
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
162
+ out_message = "File " + file_name + " uploaded successfully!"
163
+ print(out_message)
164
+
165
+ except Exception as e:
166
+ out_message = f"Error uploading file(s): {e}"
167
+ print(out_message)
168
 
169
+ final_out_message.append(out_message)
170
+ final_out_message_str = '\n'.join(final_out_message)
171
+
172
+ else: final_out_message_str = "Could not connect to AWS."
173
+ else: final_out_message_str = "At least one essential variable is empty, could not upload to S3"
174
+ except Exception as e:
175
+ final_out_message_str = "Could not upload files to S3 due to: " + str(e)
176
+ print(final_out_message_str)
177
+ else:
178
+ final_out_message_str = "App not set to run AWS functions"
179
 
180
  return final_out_message_str
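Example of how the rewritten uploader is called by the logging hooks in app.py (the paths are illustrative only):

# upload_file_to_s3(["output/example_redacted.pdf"], "usage/20250410/host/")
# -> prints "S3 key:  usage/20250410/host/example_redacted.pdf" and returns
#    "File example_redacted.pdf uploaded successfully!" when RUN_AWS_FUNCTIONS == "1"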
tools/aws_textract.py CHANGED
@@ -6,6 +6,7 @@ import json
6
  from collections import defaultdict
7
  import pikepdf
8
  import time
 
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
  from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
11
 
@@ -38,12 +39,10 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
38
  else:
39
  client = boto3.client('textract', region_name=AWS_REGION)
40
  except:
41
- print("Cannot connect to AWS Textract")
 
 
42
  return [], "" # Return an empty list and an empty string
43
-
44
- #print("Analysing page with AWS Textract")
45
- #print("pdf_page_bytes:", pdf_page_bytes)
46
- #print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
47
 
48
  # Redact signatures if specified
49
  if "Redact all identified signatures" in handwrite_signature_checkbox:
@@ -137,6 +136,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
137
  # This is a new page
138
  elif "page_no" in page_json_data:
139
  text_blocks = page_json_data["data"]["Blocks"]
 
140
 
141
  is_signature = False
142
  is_handwriting = False
@@ -275,7 +275,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
275
 
276
  return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
277
 
278
- def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str):
279
  """
280
  Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
281
  """
@@ -307,7 +307,7 @@ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output
307
  print("Need to convert Textract JSON to app format.")
308
  try:
309
 
310
- textract_data = restructure_textract_output(textract_data)
311
  return textract_data, False, log_files_output_paths # Successfully converted
312
 
313
  except Exception as e:
@@ -318,7 +318,7 @@ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output
318
  print("textract data:", textract_data)
319
  return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
320
 
321
- def restructure_textract_output(textract_output: dict):
322
  """
323
  Reorganise Textract output from the bulk Textract analysis option on AWS
324
  into a format that works in this redaction app, reducing size.
@@ -328,10 +328,62 @@ def restructure_textract_output(textract_output: dict):
328
  # Extract total pages from DocumentMetadata
329
  document_metadata = textract_output.get("DocumentMetadata", {})
330
 
 
 
 
 
331
  for block in textract_output.get("Blocks", []):
332
  page_no = block.get("Page", 1) # Default to 1 if missing
333
 
334
- # Initialize page structure if not already present
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  if page_no not in pages_dict:
336
  pages_dict[page_no] = {"page_no": str(page_no), "data": {"Blocks": []}}
337
 
 
6
  from collections import defaultdict
7
  import pikepdf
8
  import time
9
+ import pandas as pd
10
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
11
  from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
12
 
 
39
  else:
40
  client = boto3.client('textract', region_name=AWS_REGION)
41
  except:
42
+ out_message = "Cannot connect to AWS Textract"
43
+ print(out_message)
44
+ raise Exception(out_message)
45
  return [], "" # Return an empty list and an empty string
 
 
 
 
46
 
47
  # Redact signatures if specified
48
  if "Redact all identified signatures" in handwrite_signature_checkbox:
 
136
  # This is a new page
137
  elif "page_no" in page_json_data:
138
  text_blocks = page_json_data["data"]["Blocks"]
139
+ else: text_blocks = []
140
 
141
  is_signature = False
142
  is_handwriting = False
 
275
 
276
  return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
277
 
278
+ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
279
  """
280
  Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
281
  """
 
307
  print("Need to convert Textract JSON to app format.")
308
  try:
309
 
310
+ textract_data = restructure_textract_output(textract_data, page_sizes_df)
311
  return textract_data, False, log_files_output_paths # Successfully converted
312
 
313
  except Exception as e:
 
318
  print("textract data:", textract_data)
319
  return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
320
 
321
+ def restructure_textract_output(textract_output: dict, page_sizes_df:pd.DataFrame):
322
  """
323
  Reorganise Textract output from the bulk Textract analysis option on AWS
324
  into a format that works in this redaction app, reducing size.
 
328
  # Extract total pages from DocumentMetadata
329
  document_metadata = textract_output.get("DocumentMetadata", {})
330
 
331
+ # For efficient lookup, set 'page' as index if it's not already
332
+ if 'page' in page_sizes_df.columns:
333
+ page_sizes_df = page_sizes_df.set_index('page')
334
+
335
  for block in textract_output.get("Blocks", []):
336
  page_no = block.get("Page", 1) # Default to 1 if missing
337
 
338
+ # --- Geometry Conversion Logic ---
339
+ try:
340
+ page_info = page_sizes_df.loc[page_no]
341
+ cb_width = page_info['cropbox_width']
342
+ cb_height = page_info['cropbox_height']
343
+ mb_width = page_info['mediabox_width']
344
+ mb_height = page_info['mediabox_height']
345
+ cb_x_offset = page_info['cropbox_x_offset']
346
+ cb_y_offset_top = page_info['cropbox_y_offset_from_top']
347
+
348
+ # Check if conversion is needed (and avoid division by zero)
349
+ needs_conversion = (
350
+ abs(cb_width - mb_width) > 1e-6 or
351
+ abs(cb_height - mb_height) > 1e-6
352
+ ) and mb_width > 1e-6 and mb_height > 1e-6 # Avoid division by zero
353
+
354
+ if needs_conversion and 'Geometry' in block:
355
+ geometry = block['Geometry'] # Work directly on the block's geometry
356
+
357
+ # --- Convert BoundingBox ---
358
+ if 'BoundingBox' in geometry:
359
+ bbox = geometry['BoundingBox']
360
+ old_left = bbox['Left']
361
+ old_top = bbox['Top']
362
+ old_width = bbox['Width']
363
+ old_height = bbox['Height']
364
+
365
+ # Calculate absolute coordinates within CropBox
366
+ abs_cb_x = old_left * cb_width
367
+ abs_cb_y = old_top * cb_height
368
+ abs_cb_width = old_width * cb_width
369
+ abs_cb_height = old_height * cb_height
370
+
371
+ # Calculate absolute coordinates relative to MediaBox top-left
372
+ abs_mb_x = cb_x_offset + abs_cb_x
373
+ abs_mb_y = cb_y_offset_top + abs_cb_y
374
+
375
+ # Convert back to normalized coordinates relative to MediaBox
376
+ bbox['Left'] = abs_mb_x / mb_width
377
+ bbox['Top'] = abs_mb_y / mb_height
378
+ bbox['Width'] = abs_cb_width / mb_width
379
+ bbox['Height'] = abs_cb_height / mb_height
380
+ except KeyError:
381
+ print(f"Warning: Page number {page_no} not found in page_sizes_df. Skipping coordinate conversion for this block.")
382
+ # Decide how to handle missing page info: skip conversion, raise error, etc.
383
+ except ZeroDivisionError:
384
+ print(f"Warning: MediaBox width or height is zero for page {page_no}. Skipping coordinate conversion for this block.")
385
+
386
+ # Initialise page structure if not already present
387
  if page_no not in pages_dict:
388
  pages_dict[page_no] = {"page_no": str(page_no), "data": {"Blocks": []}}
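To sanity-check the conversion above, a worked example with illustrative page dimensions (not taken from a real document):

# MediaBox 612x792pt, CropBox 595x782pt offset by (8.5, 5) from the top-left.
# A Textract box at Left=0.0, Width=0.2 (normalised to the CropBox) becomes:
#   abs_cb_x = 0.0 * 595          = 0.0
#   abs_mb_x = 8.5 + 0.0          = 8.5
#   Left     = 8.5 / 612          ~ 0.0139  (nudged right of the page edge)
#   Width    = (0.2 * 595) / 612  ~ 0.1944  (slightly smaller share of the page)
# i.e. coordinates normalised to the CropBox are re-expressed against the MediaBox.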
389
 
tools/config.py CHANGED
@@ -1,12 +1,13 @@
1
  import os
2
  import tempfile
3
  import socket
 
4
  from datetime import datetime
5
  from dotenv import load_dotenv
6
  from tldextract import TLDExtract
7
 
8
  today_rev = datetime.now().strftime("%Y%m%d")
9
- host_name = socket.gethostname()
10
 
11
  # Set or retrieve configuration variables for the redaction app
12
 
@@ -27,29 +28,71 @@ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False)
27
 
28
  return value
29
 
 
 
30
 
31
- # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. '/env/app_config.env'
32
- APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', '')
 
 
 
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- if os.path.exists(APP_CONFIG_PATH):
36
- print(f"Loading APP variables from config file {APP_CONFIG_PATH}")
37
- load_dotenv(APP_CONFIG_PATH)
 
 
 
 
 
 
 
 
 
38
 
39
  ###
40
  # AWS CONFIG
41
  ###
42
 
43
- # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. '/env/aws_config.env'
44
- AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '')
45
 
46
- if os.path.exists(AWS_CONFIG_PATH):
47
- print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
48
- load_dotenv(AWS_CONFIG_PATH)
 
 
49
 
50
  RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
51
 
52
- AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
53
 
54
  AWS_CLIENT_ID = get_or_create_env_var('AWS_CLIENT_ID', '')
55
 
@@ -65,14 +108,28 @@ if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
65
 
66
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  # Custom headers e.g. if routing traffic through Cloudfront
69
  # Retrieving or setting CUSTOM_HEADER
70
  CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
71
- if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')
72
 
73
  # Retrieving or setting CUSTOM_HEADER_VALUE
74
  CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
75
- if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')
76
 
77
  ###
78
  # Images config
@@ -84,12 +141,14 @@ MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to No
84
  ###
85
  # File I/O config
86
  ###
87
-
88
- SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'True') # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders
89
 
90
  OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
91
  INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
92
 
 
 
 
93
  # Allow for files to be saved in a temporary folder for increased security in some instances
94
  if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
95
  # Create a temporary directory
@@ -99,22 +158,39 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
99
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
100
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
101
 
102
- FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + today_rev + '/' + host_name + '/')
103
 
104
- USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'logs/' + today_rev + '/' + host_name + '/')
 
 
105
 
106
- ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'usage/' + today_rev + '/' + host_name + '/')
 
 
107
 
 
108
  DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
109
 
110
  ###
111
  # REDACTION CONFIG
112
- ###
113
- TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "tesseract/")
114
 
115
- POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "poppler/poppler-24.02.0/Library/bin/")
 
 
 
 
 
116
 
117
- SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_BULK_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
118
 
119
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
120
  PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
@@ -130,7 +206,10 @@ REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Current
130
  ###
131
 
132
  TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')
133
- extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
 
 
 
134
 
135
  # Get some environment variables and Launch the Gradio app
136
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
@@ -153,15 +232,22 @@ ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_
153
 
154
  S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
155
 
156
- SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'True')
 
 
 
157
 
158
  GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
159
 
 
 
160
  COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', '') # 'config/COST_CENTRES.csv' # file should be a csv file with a single table in it that has two columns with a header. First column should contain cost codes, second column should contain a name or description for the cost code
161
 
162
  S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
163
 
 
 
 
164
  ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, is it compulsory to choose one before redacting?
165
 
166
- if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
167
- if GET_COST_CODES == 'True': ENFORCE_COST_CODES = 'False'
 
1
  import os
2
  import tempfile
3
  import socket
4
+ import logging
5
  from datetime import datetime
6
  from dotenv import load_dotenv
7
  from tldextract import TLDExtract
8
 
9
  today_rev = datetime.now().strftime("%Y%m%d")
10
+ HOST_NAME = socket.gethostname()
11
 
12
  # Set or retrieve configuration variables for the redaction app
13
 
 
28
 
29
  return value
30
 
31
+ def ensure_folder_exists(output_folder:str):
32
+ """Checks if the specified folder exists, creates it if not."""
33
 
34
+ if not os.path.exists(output_folder):
35
+ # Create the folder if it doesn't exist
36
+ os.makedirs(output_folder, exist_ok=True)
37
+ print(f"Created the {output_folder} folder.")
38
+ else:
39
+ print(f"The {output_folder} folder already exists.")
40
 
41
+ def add_folder_to_path(folder_path: str):
42
+ '''
43
+ Check if a folder exists on your system. If so, get the absolute path and add it to the system PATH variable if it is not already there. This function is only relevant for locally-created executables built from this app (PyInstaller creates an _internal folder containing Tesseract and Poppler, which needs to be added to the system PATH for the app to run).
44
+ '''
45
+
46
+ if os.path.exists(folder_path) and os.path.isdir(folder_path):
47
+ print(folder_path, "folder exists.")
48
+
49
+ # Resolve relative path to absolute path
50
+ absolute_path = os.path.abspath(folder_path)
51
+
52
+ current_path = os.environ['PATH']
53
+ if absolute_path not in current_path.split(os.pathsep):
54
+ full_path_extension = absolute_path + os.pathsep + current_path
55
+ os.environ['PATH'] = full_path_extension
56
+ #print(f"Updated PATH with: ", full_path_extension)
57
+ else:
58
+ print(f"Directory {folder_path} already exists in PATH.")
59
+ else:
60
+ print(f"Folder not found at {folder_path} - not added to PATH")
61
+
62
+ ensure_folder_exists("config/")
63
+
64
+ # If you have an app_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
65
+ APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', 'config/app_config.env') # e.g. config/app_config.env
66
 
67
+ if APP_CONFIG_PATH:
68
+ if os.path.exists(APP_CONFIG_PATH):
69
+ print(f"Loading app variables from config file {APP_CONFIG_PATH}")
70
+ load_dotenv(APP_CONFIG_PATH)
71
+ else: print("App config file not found at location:", APP_CONFIG_PATH)
72
+
73
+ # Report logging to console?
74
+ LOGGING = get_or_create_env_var('LOGGING', 'False')
75
+
76
+ if LOGGING == 'True':
77
+ # Configure logging
78
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
79
 
80
  ###
81
  # AWS CONFIG
82
  ###
83
 
84
+ # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'config/aws_config.env'
85
+ AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '') # e.g. config/aws_config.env
86
 
87
+ if AWS_CONFIG_PATH:
88
+ if os.path.exists(AWS_CONFIG_PATH):
89
+ print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
90
+ load_dotenv(AWS_CONFIG_PATH)
91
+ else: print("AWS config file not found at location:", AWS_CONFIG_PATH)
92
 
93
  RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
94
 
95
+ AWS_REGION = get_or_create_env_var('AWS_REGION', '')
96
 
97
  AWS_CLIENT_ID = get_or_create_env_var('AWS_CLIENT_ID', '')
98
 
 
108
 
109
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
110
 
111
+ SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_BULK_TEXTRACT_CALL_OPTIONS', 'False') # This feature is not currently implemented
112
+
113
+ TEXTRACT_BULK_ANALYSIS_BUCKET = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_BUCKET', '')
114
+
115
+ TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER', 'input')
116
+
117
+ TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER', 'output')
118
+
119
+ LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_JOBS_S3', 'False') # Whether or not to load previous Textract jobs from S3
120
+
121
+ TEXTRACT_JOBS_S3_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_LOC', 'output') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
122
+
123
+ TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
124
+
125
  # Custom headers e.g. if routing traffic through Cloudfront
126
  # Retrieving or setting CUSTOM_HEADER
127
  CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
128
+ #if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')
129
 
130
  # Retrieving or setting CUSTOM_HEADER_VALUE
131
  CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
132
+ #if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')
133
 
134
  ###
135
  # Images config
 
141
  ###
142
  # File I/O config
143
  ###
144
+ SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False') # i.e. should input and output files be saved in a subfolder, named after the session hash, within the output/input folders
 
145
 
146
  OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
147
  INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
148
 
149
+ ensure_folder_exists(OUTPUT_FOLDER)
150
+ ensure_folder_exists(INPUT_FOLDER)
151
+
152
  # Allow for files to be saved in a temporary folder for increased security in some instances
153
  if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
154
  # Create a temporary directory
 
158
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
159
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
160
 
161
+ # By default, logs are put into a subfolder named after today's date and the host name of the instance running the app. This is to avoid, as far as possible, log files from one instance overwriting the logs of another instance on S3. If the app always runs on one system, or just locally, it is not necessary to make the log folders so specific.
162
+ # Another way to address this issue would be to write logs to another type of storage, e.g. a database such as DynamoDB. I may look into this in future.
163
+
164
+ USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
165
+
166
+ if USE_LOG_SUBFOLDERS == "True":
167
+ day_log_subfolder = today_rev + '/'
168
+ host_name_subfolder = HOST_NAME + '/'
169
+ full_log_subfolder = day_log_subfolder + host_name_subfolder
170
+ else:
171
+ full_log_subfolder = ""
172
 
173
+ FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + full_log_subfolder)
174
+ ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/' + full_log_subfolder)
175
+ USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/' + full_log_subfolder)
176
 
177
+ ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
178
+ ensure_folder_exists(ACCESS_LOGS_FOLDER)
179
+ ensure_folder_exists(USAGE_LOGS_FOLDER)
180
 
181
+ # Should the redacted file name be included in the logs? In some instances the names of the files themselves could be sensitive and should not be disclosed beyond the app, so by default this is False.
182
  DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
183
 
184
  ###
185
  # REDACTION CONFIG
 
 
186
 
187
+ # Set Tesseract and Poppler folder locations if you have installed them locally
188
+ TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
189
+ POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # e.g. poppler/poppler-24.02.0/Library/bin/
190
+
191
+ if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
192
+ if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
193
 
 
194
 
195
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
196
  PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
 
206
  ###
207
 
208
  TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')
209
+ try:
210
+ extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
211
+ except Exception: # Fall back to the default cache if the custom cache directory cannot be used
212
+ extract = TLDExtract(cache_dir=None)
213
 
214
  # Get some environment variables and Launch the Gradio app
215
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 
232
 
233
  S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
234
 
235
+ if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
236
+ else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
237
+
238
+ SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
239
 
240
  GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
241
 
242
+ DEFAULT_COST_CODE = get_or_create_env_var('DEFAULT_COST_CODE', '')
243
+
244
  COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', '') # 'config/COST_CENTRES.csv' # file should be a csv file with a single table in it that has two columns with a header. First column should contain cost codes, second column should contain a name or description for the cost code
245
 
246
  S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
247
 
248
+ if COST_CODES_PATH: OUTPUT_COST_CODES_PATH = COST_CODES_PATH
249
+ else: OUTPUT_COST_CODES_PATH = 'config/COST_CENTRES.csv'
250
+
251
  ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, is it compulsory to choose one before redacting?
252
 
253
+ if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
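
The pattern used throughout this file is: read an environment variable, fall back to a default, and optionally override both from a dotenv file. A minimal self-contained sketch of that pattern (the helper body is a plausible reading, not the repo's exact code, and EXAMPLE_SETTING is an illustrative name):

import os
from dotenv import load_dotenv  # python-dotenv

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Use the existing value if set, otherwise set and return the default
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# Values in config/app_config.env override the hard-coded defaults
if os.path.exists("config/app_config.env"):
    load_dotenv("config/app_config.env")

EXAMPLE_SETTING = get_or_create_env_var("EXAMPLE_SETTING", "False")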
 
tools/file_conversion.py CHANGED
@@ -181,7 +181,7 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
181
  widths = [result[2] for result in results]
182
  heights = [result[3] for result in results]
183
 
184
- print("PDF has been converted to images.")
185
  return images, widths, heights, results
186
 
187
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
@@ -208,7 +208,7 @@ def process_file_for_image_creation(file_path:str, prepare_for_review:bool=False
208
 
209
  # Check if the file is a PDF
210
  elif file_extension == '.pdf':
211
- print(f"{file_path} is a PDF file. Converting to image set")
212
 
213
  # Run your function for processing PDF files here
214
  img_path, image_sizes_width, image_sizes_height, all_img_details = convert_pdf_to_images(file_path, prepare_for_review, input_folder=input_folder, create_images=create_images)
@@ -417,12 +417,29 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
417
  pymupdf_page = pymupdf_doc.load_page(page_no)
418
  original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
419
 
420
- # Create a page_sizes_object.
421
- # If images have been created, then image width an height come from this value. Otherwise, they are set to the cropbox size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
  if image_sizes_width and image_sizes_height:
423
- out_page_image_sizes = {"page":reported_page_no, "image_path":image_file_paths[page_no], "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height, "original_cropbox":original_cropboxes[-1]}
424
- else:
425
- out_page_image_sizes = {"page":reported_page_no, "image_path":image_file_paths[page_no], "image_width":pd.NA, "image_height":pd.NA, "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height, "original_cropbox":original_cropboxes[-1]}
426
 
427
  page_sizes.append(out_page_image_sizes)
428
 
@@ -434,7 +451,7 @@ def prepare_image_or_pdf(
434
  latest_file_completed: int = 0,
435
  out_message: List[str] = [],
436
  first_loop_state: bool = False,
437
- number_of_pages:int = 1,
438
  all_annotations_object:List = [],
439
  prepare_for_review:bool = False,
440
  in_fully_redacted_list:List[int]=[],
@@ -481,6 +498,9 @@ def prepare_image_or_pdf(
481
  all_img_details = []
482
  review_file_csv = pd.DataFrame()
483
  all_line_level_ocr_results_df = pd.DataFrame()
 
 
 
484
 
485
  if isinstance(in_fully_redacted_list, pd.DataFrame):
486
  if not in_fully_redacted_list.empty:
@@ -494,7 +514,7 @@ def prepare_image_or_pdf(
494
  else:
495
  print("Now redacting file", str(latest_file_completed))
496
 
497
- # If out message or converted_file_paths are blank, change to a list so it can be appended to
498
  if isinstance(out_message, str): out_message = [out_message]
499
 
500
  if not file_paths: file_paths = []
@@ -521,15 +541,9 @@ def prepare_image_or_pdf(
521
  file_paths_list = [file_paths]
522
  file_paths_loop = file_paths_list
523
  else:
524
- if prepare_for_review == False:
525
- file_paths_list = file_paths
526
- file_paths_loop = [file_paths_list[int(latest_file_completed)]]
527
- else:
528
- file_paths_list = file_paths
529
- file_paths_loop = file_paths
530
- # Sort files to prioritise PDF files first, then JSON files. This means that the pdf can be loaded in, and pdf page path locations can be added to the json
531
- file_paths_loop = sorted(file_paths_loop, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
532
-
533
  # Loop through files to load in
534
  for file in file_paths_loop:
535
  converted_file_path = []
@@ -592,7 +606,6 @@ def prepare_image_or_pdf(
592
 
593
  image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path_str, prepare_for_review, input_folder, create_images=True)
594
 
595
-
596
  # Create a page_sizes_object
597
  page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths)
598
 
@@ -612,7 +625,8 @@ def prepare_image_or_pdf(
612
  json_from_csv = False
613
 
614
  # NEW IF STATEMENT
615
- # If the file name ends with redactions.json, assume it is an annoations object, overwrite the current variable
 
616
  if (file_extension in ['.json']) | (json_from_csv == True):
617
 
618
  if (file_extension in ['.json']) & (prepare_for_review == True):
@@ -624,9 +638,14 @@ def prepare_image_or_pdf(
624
  all_annotations_object = json.loads(file_path) # Use loads for string content
625
 
626
  # Assume it's a textract json
627
- elif (file_extension == '.json') and (prepare_for_review is not True):
 
628
  # Copy it to the output folder so it can be used later.
629
- out_textract_path = os.path.join(output_folder, file_path_without_ext + "_textract.json")
 
 
 
 
630
 
631
  # Use shutil to copy the file directly
632
  shutil.copy2(file_path, out_textract_path) # Preserves metadata
@@ -748,11 +767,11 @@ def prepare_image_or_pdf(
748
  print(out_time)
749
 
750
  out_message.append(out_time)
751
- out_message_out = '\n'.join(out_message)
752
 
753
- number_of_pages = len(image_file_paths)
754
 
755
- return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df
756
 
757
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
758
  file_path_without_ext = get_file_name_without_type(in_file_path)
 
181
  widths = [result[2] for result in results]
182
  heights = [result[3] for result in results]
183
 
184
+ #print("PDF has been converted to images.")
185
  return images, widths, heights, results
186
 
187
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
 
208
 
209
  # Check if the file is a PDF
210
  elif file_extension == '.pdf':
211
+ # print(f"{file_path} is a PDF file. Converting to image set")
212
 
213
  # Run your function for processing PDF files here
214
  img_path, image_sizes_width, image_sizes_height, all_img_details = convert_pdf_to_images(file_path, prepare_for_review, input_folder=input_folder, create_images=create_images)
 
417
  pymupdf_page = pymupdf_doc.load_page(page_no)
418
  original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
419
 
420
+ # Create a page_sizes object. If images have been created, then image width and height come from the image files. Otherwise, they are left unset.
421
+ out_page_image_sizes = {
422
+ "page":reported_page_no,
423
+ "mediabox_width":pymupdf_page.mediabox.width,
424
+ "mediabox_height": pymupdf_page.mediabox.height,
425
+ "cropbox_width":pymupdf_page.cropbox.width,
426
+ "cropbox_height":pymupdf_page.cropbox.height,
427
+ "original_cropbox":original_cropboxes[-1],
428
+ "image_path":image_file_paths[page_no]}
429
+
430
+ # cropbox_x_offset: Distance from MediaBox left edge to CropBox left edge
431
+ # This is simply the difference in their x0 coordinates.
432
+ out_page_image_sizes['cropbox_x_offset'] = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
433
+
434
+ # cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
435
+ # MediaBox top y = mediabox.y1
436
+ # CropBox top y = cropbox.y1
437
+ # The difference is mediabox.y1 - cropbox.y1
438
+ out_page_image_sizes['cropbox_y_offset_from_top'] = pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1
439
+
440
  if image_sizes_width and image_sizes_height:
441
+ out_page_image_sizes["image_width"] = image_sizes_width[page_no]
442
+ out_page_image_sizes["image_height"] = image_sizes_height[page_no]
 
443
 
444
  page_sizes.append(out_page_image_sizes)
445
 
 
451
  latest_file_completed: int = 0,
452
  out_message: List[str] = [],
453
  first_loop_state: bool = False,
454
+ number_of_pages:int = 0,
455
  all_annotations_object:List = [],
456
  prepare_for_review:bool = False,
457
  in_fully_redacted_list:List[int]=[],
 
498
  all_img_details = []
499
  review_file_csv = pd.DataFrame()
500
  all_line_level_ocr_results_df = pd.DataFrame()
501
+ out_textract_path = ""
502
+ combined_out_message = ""
503
+ final_out_message = ""
504
 
505
  if isinstance(in_fully_redacted_list, pd.DataFrame):
506
  if not in_fully_redacted_list.empty:
 
514
  else:
515
  print("Now redacting file", str(latest_file_completed))
516
 
517
+ # If combined out message or converted_file_paths are blank, change to a list so it can be appended to
518
  if isinstance(out_message, str): out_message = [out_message]
519
 
520
  if not file_paths: file_paths = []
 
541
  file_paths_list = [file_paths]
542
  file_paths_loop = file_paths_list
543
  else:
544
+ file_paths_list = file_paths
545
+ file_paths_loop = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json')) # Sort so PDF files load first, then JSON files, so that PDF page locations exist before JSON annotations are added
546
+
547
  # Loop through files to load in
548
  for file in file_paths_loop:
549
  converted_file_path = []
 
606
 
607
  image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path_str, prepare_for_review, input_folder, create_images=True)
608
 
 
609
  # Create a page_sizes_object
610
  page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths)
611
 
 
625
  json_from_csv = False
626
 
627
  # NEW IF STATEMENT
628
+ # If the file name ends with .json, check if we are loading for review. If so, assume it is an annotations object and overwrite the current annotations object. If not, assume this is a Textract object and load it in as Textract output.
629
+
630
  if (file_extension in ['.json']) | (json_from_csv == True):
631
 
632
  if (file_extension in ['.json']) & (prepare_for_review == True):
 
638
  all_annotations_object = json.loads(file_path) # Use loads for string content
639
 
640
  # Assume it's a textract json
641
+ elif (file_extension in ['.json']) and (prepare_for_review != True):
642
+ print("Saving Textract output")
643
  # Copy it to the output folder so it can be used later.
644
+ # Ensure the output file name ends with "_textract.json"
645
+ if not file_path.endswith("_textract.json"): output_textract_json_file_name = file_path_without_ext + "_textract.json"
646
+ else: output_textract_json_file_name = file_path_without_ext + ".json"
647
+
648
+ out_textract_path = os.path.join(output_folder, output_textract_json_file_name)
649
 
650
  # Use shutil to copy the file directly
651
  shutil.copy2(file_path, out_textract_path) # Preserves metadata
 
767
  print(out_time)
768
 
769
  out_message.append(out_time)
770
+ combined_out_message = '\n'.join(out_message)
771
 
772
+ number_of_pages = len(page_sizes) # previously len(image_file_paths)
773
 
774
+ return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df
775
 
776
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
777
  file_path_without_ext = get_file_name_without_type(in_file_path)
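
The cropbox offsets recorded in create_page_size_objects above can be reproduced directly with PyMuPDF. A minimal sketch, assuming an example.pdf exists locally (the module is importable as fitz on older PyMuPDF releases):

import pymupdf  # PyMuPDF

doc = pymupdf.open("example.pdf")
page = doc.load_page(0)

# Distance from the MediaBox left edge to the CropBox left edge
cropbox_x_offset = page.cropbox.x0 - page.mediabox.x0
# PDF y-coordinates grow upwards, so the top edges are the y1 values
cropbox_y_offset_from_top = page.mediabox.y1 - page.cropbox.y1
print(cropbox_x_offset, cropbox_y_offset_from_top)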
tools/file_redaction.py CHANGED
@@ -205,7 +205,7 @@ def choose_and_run_redactor(file_paths:List[str],
205
  latest_file_completed = int(latest_file_completed)
206
 
207
  if isinstance(file_paths,str): number_of_files = 1
208
- else: number_of_files = len(file_paths)
209
 
210
  # If we have already redacted the last file, return the input out_message and file list to the relevant outputs
211
  if latest_file_completed >= number_of_files:
@@ -330,7 +330,7 @@ def choose_and_run_redactor(file_paths:List[str],
330
 
331
 
332
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
333
- if pii_identification_method == "AWS Comprehend":
334
  if aws_access_key_textbox and aws_secret_key_textbox:
335
  print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
336
  comprehend_client = boto3.client('comprehend',
@@ -349,7 +349,8 @@ def choose_and_run_redactor(file_paths:List[str],
349
  out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
350
  print(out_message)
351
  raise Exception(out_message)
352
- else: comprehend_client = ""
 
353
 
354
  # Try to connect to AWS Textract Client if using that text extraction method
355
  if text_extraction_method == textract_option:
@@ -365,13 +366,17 @@ def choose_and_run_redactor(file_paths:List[str],
365
  print("Getting Textract credentials from environment variables.")
366
  textract_client = boto3.client('textract',
367
  aws_access_key_id=AWS_ACCESS_KEY,
368
- aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
 
 
 
369
  else:
370
  textract_client = ""
371
- out_message_warning = "Cannot connect to AWS Textract service."
372
- print(out_message_warning)
373
- #raise Warning(out_message)
374
- else: textract_client = ""
 
375
 
376
  # Check if output_folder exists, create it if it doesn't
377
  if not os.path.exists(output_folder): os.makedirs(output_folder)
@@ -764,28 +769,66 @@ def move_page_info(file_path: str) -> str:
764
 
765
  return new_file_path
766
 
767
- def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict, image:Image):
768
  '''
769
  Prepare an image annotation box and coordinates based on a CustomImageRecogniserResult, PyMuPDF page, and PIL Image.
770
  '''
771
 
772
  img_annotation_box = {}
773
 
 
 
 
 
 
 
 
 
 
774
  if image:
775
  pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
 
776
  else:
777
- pymupdf_x1 = annot.left
778
- pymupdf_x2 = annot.left + annot.width
779
- pymupdf_y1 = annot.top
780
- pymupdf_y2 = annot.top + annot.height
781
-
782
- x1 = pymupdf_x1
783
- x2 = pymupdf_x2
784
-
785
- img_annotation_box["xmin"] = annot.left
786
- img_annotation_box["ymin"] = annot.top
787
- img_annotation_box["xmax"] = annot.left + annot.width
788
- img_annotation_box["ymax"] = annot.top + annot.height
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
789
  img_annotation_box["color"] = (0,0,0)
790
  try:
791
  img_annotation_box["label"] = str(annot.entity_type)
@@ -795,12 +838,11 @@ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict,
795
  if hasattr(annot, 'text') and annot.text:
796
  img_annotation_box["text"] = str(annot.text)
797
  else:
798
- img_annotation_box["text"] = ""
799
-
800
- rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
801
 
802
  return img_annotation_box, rect
803
 
 
804
  def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
805
  '''
806
  Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.
@@ -951,8 +993,9 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
951
  rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2) # Create the PyMuPDF Rect
952
 
953
  # Else should be CustomImageRecognizerResult
954
- elif isinstance(annot, CustomImageRecognizerResult):
955
- img_annotation_box, rect = prepare_custom_image_recogniser_result_annotation_box(page, annot, image)
 
956
 
957
  # Else it should be a pikepdf annotation object
958
  else:
@@ -1170,8 +1213,7 @@ def redact_image_pdf(file_path:str,
1170
 
1171
  tic = time.perf_counter()
1172
 
1173
- file_name = get_file_name_without_type(file_path)
1174
-
1175
  comprehend_query_number_new = 0
1176
 
1177
  # Update custom word list analyser object with any new words that have been added to the custom deny list
@@ -1211,7 +1253,7 @@ def redact_image_pdf(file_path:str,
1211
  # If running Textract, check if file already exists. If it does, load in existing data
1212
  if text_extraction_method == textract_option:
1213
  textract_json_file_path = output_folder + file_name + "_textract.json"
1214
- textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths)
1215
  original_textract_data = textract_data.copy()
1216
 
1217
  ###
@@ -1285,6 +1327,8 @@ def redact_image_pdf(file_path:str,
1285
 
1286
  # Check if page exists in existing textract data. If not, send to service to analyse
1287
  if text_extraction_method == textract_option:
 
 
1288
  if not textract_data:
1289
  try:
1290
  # Convert the image_path to bytes using an in-memory buffer
@@ -1327,12 +1371,15 @@ def redact_image_pdf(file_path:str,
1327
  textract_data["pages"].append(text_blocks)
1328
 
1329
  except Exception as e:
1330
- print("Textract extraction for page", reported_page_number, "failed due to:", e)
 
1331
  text_blocks = []
1332
- new_request_metadata = "Failed Textract API call"
1333
 
1334
  # Check if "pages" key exists, if not, initialise it as an empty list
1335
- if "pages" not in textract_data: textract_data["pages"] = []
 
 
1336
 
1337
  request_metadata = request_metadata + "\n" + new_request_metadata
1338
 
 
205
  latest_file_completed = int(latest_file_completed)
206
 
207
  if isinstance(file_paths,str): number_of_files = 1
208
+ else: number_of_files = len(file_paths_list)
209
 
210
  # If we have already redacted the last file, return the input out_message and file list to the relevant outputs
211
  if latest_file_completed >= number_of_files:
 
330
 
331
 
332
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
333
+ if pii_identification_method == aws_pii_detector:
334
  if aws_access_key_textbox and aws_secret_key_textbox:
335
  print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
336
  comprehend_client = boto3.client('comprehend',
 
349
  out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
350
  print(out_message)
351
  raise Exception(out_message)
352
+ else:
353
+ comprehend_client = ""
354
 
355
  # Try to connect to AWS Textract Client if using that text extraction method
356
  if text_extraction_method == textract_option:
 
366
  print("Getting Textract credentials from environment variables.")
367
  textract_client = boto3.client('textract',
368
  aws_access_key_id=AWS_ACCESS_KEY,
369
+ aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
370
+ elif textract_output_found==True:
371
+ print("Existing Textract data found for file, no need to connect to AWS Textract")
372
+ textract_client = boto3.client('textract', region_name=AWS_REGION)
373
  else:
374
  textract_client = ""
375
+ out_message = "Cannot connect to AWS Textract service."
376
+ print(out_message)
377
+ raise Exception(out_message)
378
+ else:
379
+ textract_client = ""
380
 
381
  # Check if output_folder exists, create it if it doesn't
382
  if not os.path.exists(output_folder): os.makedirs(output_folder)
 
769
 
770
  return new_file_path
771
 
772
+ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict, image:Image, page_sizes_df:pd.DataFrame):
773
  '''
774
  Prepare an image annotation box and coordinates based on a CustomImageRecogniserResult, PyMuPDF page, and PIL Image.
775
  '''
776
 
777
  img_annotation_box = {}
778
 
779
+ # For efficient lookup, set 'page' as index if it's not already
780
+ if 'page' in page_sizes_df.columns:
781
+ page_sizes_df = page_sizes_df.set_index('page')
782
+ # PyMuPDF page numbers are 0-based, DataFrame index assumed 1-based
783
+ page_num_one_based = page.number + 1
784
+
785
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = 0, 0, 0, 0 # Initialize defaults
786
+
787
+
788
  if image:
789
  pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
790
+
791
  else:
792
+ # --- Calculate coordinates when no image is present ---
793
+ # Assumes annot coords are normalized relative to MediaBox (top-left origin)
794
+ try:
795
+ # 1. Get MediaBox dimensions from the DataFrame
796
+ page_info = page_sizes_df.loc[page_num_one_based]
797
+ mb_width = page_info['mediabox_width']
798
+ mb_height = page_info['mediabox_height']
799
+ x_offset = page_info['cropbox_x_offset']
800
+ y_offset = page_info['cropbox_y_offset_from_top']
801
+
802
+
803
+ # Check for invalid dimensions
804
+ if mb_width <= 0 or mb_height <= 0:
805
+ print(f"Warning: Invalid MediaBox dimensions ({mb_width}x{mb_height}) for page {page_num_one_based}. Setting coords to 0.")
806
+ else:
807
+ pymupdf_x1 = annot.left - x_offset
808
+ pymupdf_x2 = annot.left + annot.width - x_offset
809
+ pymupdf_y1 = annot.top - y_offset
810
+ pymupdf_y2 = annot.top + annot.height - y_offset
811
+
812
+ except KeyError:
813
+ print(f"Warning: Page number {page_num_one_based} not found in page_sizes_df. Cannot get MediaBox dimensions. Setting coords to 0.")
814
+ except AttributeError as e:
815
+ print(f"Error accessing attributes ('left', 'top', etc.) on 'annot' object for page {page_num_one_based}: {e}")
816
+ except Exception as e:
817
+ print(f"Error during coordinate calculation for page {page_num_one_based}: {e}")
818
+
819
+ rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2) # Create the PyMuPDF Rect
820
+
821
+ # Now creating image annotation object
822
+ image_x1 = annot.left
823
+ image_x2 = annot.left + annot.width
824
+ image_y1 = annot.top
825
+ image_y2 = annot.top + annot.height
826
+
827
+ # Create image annotation boxes
828
+ img_annotation_box["xmin"] = image_x1
829
+ img_annotation_box["ymin"] = image_y1
830
+ img_annotation_box["xmax"] = image_x2 # annot.left + annot.width
831
+ img_annotation_box["ymax"] = image_y2 # annot.top + annot.height
832
  img_annotation_box["color"] = (0,0,0)
833
  try:
834
  img_annotation_box["label"] = str(annot.entity_type)
 
838
  if hasattr(annot, 'text') and annot.text:
839
  img_annotation_box["text"] = str(annot.text)
840
  else:
841
+ img_annotation_box["text"] = ""
 
 
842
 
843
  return img_annotation_box, rect
844
 
845
+
846
  def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
847
  '''
848
  Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.
 
993
  rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2) # Create the PyMuPDF Rect
994
 
995
  # Else should be CustomImageRecognizerResult
996
+ elif isinstance(annot, CustomImageRecognizerResult):
997
+ #print("annot is a CustomImageRecognizerResult")
998
+ img_annotation_box, rect = prepare_custom_image_recogniser_result_annotation_box(page, annot, image, page_sizes_df)
999
 
1000
  # Else it should be a pikepdf annotation object
1001
  else:
 
1213
 
1214
  tic = time.perf_counter()
1215
 
1216
+ file_name = get_file_name_without_type(file_path)
 
1217
  comprehend_query_number_new = 0
1218
 
1219
  # Update custom word list analyser object with any new words that have been added to the custom deny list
 
1253
  # If running Textract, check if file already exists. If it does, load in existing data
1254
  if text_extraction_method == textract_option:
1255
  textract_json_file_path = output_folder + file_name + "_textract.json"
1256
+ textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
1257
  original_textract_data = textract_data.copy()
1258
 
1259
  ###
 
1327
 
1328
  # Check if page exists in existing textract data. If not, send to service to analyse
1329
  if text_extraction_method == textract_option:
1330
+ text_blocks = []
1331
+
1332
  if not textract_data:
1333
  try:
1334
  # Convert the image_path to bytes using an in-memory buffer
 
1371
  textract_data["pages"].append(text_blocks)
1372
 
1373
  except Exception as e:
1374
+ out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
1375
+ print(out_message)
1376
  text_blocks = []
1377
+ new_request_metadata = "Failed Textract API call"
1378
 
1379
  # Check if "pages" key exists, if not, initialise it as an empty list
1380
+ if "pages" not in textract_data: textract_data["pages"] = []
1381
+
1382
+ raise Exception(out_message)
1383
 
1384
  request_metadata = request_metadata + "\n" + new_request_metadata
1385
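
The no-image branch of prepare_custom_image_recogniser_result_annotation_box above reduces to shifting the recogniser box by the CropBox offsets before building the PyMuPDF Rect. A standalone sketch with made-up values (pymupdf.Rect is fitz.Rect on older releases):

import pymupdf

# A recogniser box given relative to the MediaBox top-left, on a page whose
# CropBox is inset 20pt from the left and 30pt from the top (illustrative values)
left, top, width, height = 120.0, 200.0, 80.0, 14.0
x_offset, y_offset = 20.0, 30.0

rect = pymupdf.Rect(left - x_offset,
                    top - y_offset,
                    left + width - x_offset,
                    top + height - y_offset)
print(rect)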
 
tools/helper_functions.py CHANGED
@@ -9,7 +9,7 @@ import unicodedata
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
- from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID
13
 
14
  # Names for options labels
15
  text_ocr_option = "Local model - selectable text"
@@ -31,7 +31,7 @@ def reset_state_vars():
31
  show_share_button=False,
32
  show_remove_button=False,
33
  interactive=False
34
- ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], ""
35
 
36
  def reset_ocr_results_state():
37
  return pd.DataFrame(), pd.DataFrame(), []
@@ -44,23 +44,54 @@ def load_in_default_allow_list(allow_list_file_path):
44
  allow_list_file_path = [allow_list_file_path]
45
  return allow_list_file_path
46
 
47
- def load_in_default_cost_codes(cost_codes_path:str):
 
 
 
48
  cost_codes_df = pd.read_csv(cost_codes_path)
49
-
50
- dropdown_choices = cost_codes_df.iloc[:,0].to_list()
51
- dropdown_choices.insert(0, "")
52
-
53
-
54
- out_dropdown = gr.Dropdown(value="", label="Choose cost code for analysis", choices=dropdown_choices, allow_custom_value=True)
55
 
56
  return cost_codes_df, cost_codes_df, out_dropdown
57
 
58
- def enforce_cost_codes(enforce_cost_code_textbox, cost_code_choice):
 
 
 
 
59
  if enforce_cost_code_textbox == "True":
60
  if not cost_code_choice:
61
  raise Exception("Please choose a cost code before continuing")
 
 
 
 
 
 
 
 
 
62
  return
63
64
  def update_dataframe(df:pd.DataFrame):
65
  df_copy = df.copy()
66
  return df_copy
@@ -201,10 +232,10 @@ def check_for_existing_textract_file(doc_file_name_no_extension_textbox:str, out
201
  else:
202
  return False
203
 
204
- # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
205
  def add_folder_to_path(folder_path: str):
206
  '''
207
- Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist.
208
  '''
209
 
210
  if os.path.exists(folder_path) and os.path.isdir(folder_path):
@@ -271,7 +302,14 @@ def merge_csv_files(file_list:List[str], output_folder:str=OUTPUT_FOLDER):
271
 
272
  return output_files
273
 
274
- async def get_connection_params(request: gr.Request, output_folder_textbox:str=OUTPUT_FOLDER, input_folder_textbox:str=INPUT_FOLDER, session_output_folder:str=SESSION_OUTPUT_FOLDER):
275
 
276
  #print("Session hash:", request.session_hash)
277
 
@@ -323,6 +361,13 @@ async def get_connection_params(request: gr.Request, output_folder_textbox:str=O
323
  if session_output_folder == 'True':
324
  output_folder = output_folder_textbox + out_session_hash + "/"
325
  input_folder = input_folder_textbox + out_session_hash + "/"
326
  else:
327
  output_folder = output_folder_textbox
328
  input_folder = input_folder_textbox
@@ -330,8 +375,7 @@ async def get_connection_params(request: gr.Request, output_folder_textbox:str=O
330
  if not os.path.exists(output_folder): os.mkdir(output_folder)
331
  if not os.path.exists(input_folder): os.mkdir(input_folder)
332
 
333
-
334
- return out_session_hash, output_folder, out_session_hash, input_folder
335
 
336
  def clean_unicode_text(text:str):
337
  # Step 1: Normalise unicode characters to decompose any special forms
@@ -374,6 +418,8 @@ def calculate_aws_costs(number_of_pages:str,
374
  pii_identification_method:str,
375
  textract_output_found_checkbox:bool,
376
  only_extract_text_radio:bool,
 
 
377
  textract_page_cost:float=1.5/1000,
378
  textract_signature_cost:float=2.0/1000,
379
  comprehend_unit_cost:float=0.0001,
@@ -391,6 +437,8 @@ def calculate_aws_costs(number_of_pages:str,
391
  - pii_identification_method_drop: The method of personally-identifiable information removal.
392
  - textract_output_found_checkbox: Whether existing Textract results have been found in the output folder. Assumes that results exist for all pages and files in the output folder.
393
  - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
 
 
394
  - textract_page_cost (float, optional): AWS pricing for Textract text extraction per page ($).
395
  - textract_signature_cost (float, optional): Additional AWS cost above standard AWS Textract extraction for extracting signatures.
396
  - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
@@ -419,6 +467,9 @@ def calculate_aws_costs(number_of_pages:str,
419
 
420
  calculated_aws_cost = calculated_aws_cost + text_extraction_cost + pii_identification_cost
421
 
 
 
 
422
  return calculated_aws_cost
423
 
424
  def calculate_time_taken(number_of_pages:str,
 
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
13
 
14
  # Names for options labels
15
  text_ocr_option = "Local model - selectable text"
 
31
  show_share_button=False,
32
  show_remove_button=False,
33
  interactive=False
34
+ ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False
35
 
36
  def reset_ocr_results_state():
37
  return pd.DataFrame(), pd.DataFrame(), []
 
44
  allow_list_file_path = [allow_list_file_path]
45
  return allow_list_file_path
46
 
47
+ def load_in_default_cost_codes(cost_codes_path:str, default_cost_code:str=""):
48
+ '''
49
+ Load in the cost codes list from file.
50
+ '''
51
  cost_codes_df = pd.read_csv(cost_codes_path)
52
+ dropdown_choices = cost_codes_df.iloc[:, 0].astype(str).tolist()
53
+
54
+ # Avoid inserting duplicate or empty cost code values
55
+ if default_cost_code and default_cost_code not in dropdown_choices:
56
+ dropdown_choices.insert(0, default_cost_code)
57
+
58
+ # Always have a blank option at the top
59
+ if "" not in dropdown_choices:
60
+ dropdown_choices.insert(0, "")
61
+
62
+ out_dropdown = gr.Dropdown(
63
+ value=default_cost_code if default_cost_code in dropdown_choices else "",
64
+ label="Choose cost code for analysis",
65
+ choices=dropdown_choices,
66
+ allow_custom_value=False
67
+ )
68
 
69
  return cost_codes_df, cost_codes_df, out_dropdown
70
 
71
+ def enforce_cost_codes(enforce_cost_code_textbox:str, cost_code_choice:str, cost_code_df:pd.DataFrame, verify_cost_codes:bool=True):
72
+ '''
73
+ Check if the enforce cost codes variable is set to True, and then check that a cost code has been chosen. If not, raise an error. Then check against the values in the cost code dataframe to ensure that the cost code exists.
74
+ '''
75
+
76
  if enforce_cost_code_textbox == "True":
77
  if not cost_code_choice:
78
  raise Exception("Please choose a cost code before continuing")
79
+
80
+ if verify_cost_codes == True:
81
+ if cost_code_df.empty:
82
+ raise Exception("No cost codes present in dataframe for verification")
83
+ else:
84
+ valid_cost_codes_list = list(cost_code_df.iloc[:,0].unique())
85
+
86
+ if cost_code_choice not in valid_cost_codes_list:
87
+ raise Exception("Selected cost code not found in list. Please contact Finance if you cannot find the correct cost code from the given list of suggestions.")
88
  return
89
 
90
+ def update_cost_code_dataframe_from_dropdown_select(cost_dropdown_selection:str, cost_code_df:pd.DataFrame):
91
+ cost_code_df = cost_code_df.loc[cost_code_df.iloc[:,0] == cost_dropdown_selection, :]
92
+
93
+ return cost_code_df
94
+
95
  def update_dataframe(df:pd.DataFrame):
96
  df_copy = df.copy()
97
  return df_copy
 
232
  else:
233
  return False
234
 
235
+ #
236
  def add_folder_to_path(folder_path: str):
237
  '''
238
+ Check if a folder exists on your system. If so, get the absolute path and add it to the system PATH variable if it is not already there. This function is only relevant for locally-created executables built from this app (PyInstaller creates an _internal folder containing Tesseract and Poppler, which needs to be added to the system PATH for the app to run).
239
  '''
240
 
241
  if os.path.exists(folder_path) and os.path.isdir(folder_path):
 
302
 
303
  return output_files
304
 
305
+ async def get_connection_params(request: gr.Request,
306
+ output_folder_textbox:str=OUTPUT_FOLDER,
307
+ input_folder_textbox:str=INPUT_FOLDER,
308
+ session_output_folder:str=SESSION_OUTPUT_FOLDER,
309
+ textract_document_upload_input_folder:str=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER,
310
+ textract_document_upload_output_folder:str=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER,
311
+ s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
312
+ local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):
313
 
314
  #print("Session hash:", request.session_hash)
315
 
 
361
  if session_output_folder == 'True':
362
  output_folder = output_folder_textbox + out_session_hash + "/"
363
  input_folder = input_folder_textbox + out_session_hash + "/"
364
+
365
+ textract_document_upload_input_folder = textract_document_upload_input_folder + "/" + out_session_hash
366
+ textract_document_upload_output_folder = textract_document_upload_output_folder + "/" + out_session_hash
367
+
368
+ s3_textract_document_logs_subfolder = s3_textract_document_logs_subfolder + "/" + out_session_hash
369
+ local_textract_document_logs_subfolder = local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
370
+
371
  else:
372
  output_folder = output_folder_textbox
373
  input_folder = input_folder_textbox
 
375
  if not os.path.exists(output_folder): os.mkdir(output_folder)
376
  if not os.path.exists(input_folder): os.mkdir(input_folder)
377
 
378
+ return out_session_hash, output_folder, out_session_hash, input_folder, textract_document_upload_input_folder, textract_document_upload_output_folder, s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder
 
379
 
380
  def clean_unicode_text(text:str):
381
  # Step 1: Normalise unicode characters to decompose any special forms
 
418
  pii_identification_method:str,
419
  textract_output_found_checkbox:bool,
420
  only_extract_text_radio:bool,
421
+ convert_to_gbp:bool=True,
422
+ usd_gbp_conversion_rate:float=0.76,
423
  textract_page_cost:float=1.5/1000,
424
  textract_signature_cost:float=2.0/1000,
425
  comprehend_unit_cost:float=0.0001,
 
437
  - pii_identification_method_drop: The method of personally-identifiable information removal.
438
  - textract_output_found_checkbox: Whether existing Textract results have been found in the output folder. Assumes that results exist for all pages and files in the output folder.
439
  - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
440
+ - convert_to_gbp (bool, optional): Should suggested costs be converted from USD to GBP.
441
+ - usd_gbp_conversion_rate (float, optional): Conversion rate used for USD to GBP. Last changed 14th April 2025.
442
  - textract_page_cost (float, optional): AWS pricing for Textract text extraction per page ($).
443
  - textract_signature_cost (float, optional): Additional AWS cost above standard AWS Textract extraction for extracting signatures.
444
  - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
 
467
 
468
  calculated_aws_cost = calculated_aws_cost + text_extraction_cost + pii_identification_cost
469
 
470
+ if convert_to_gbp == True:
471
+ calculated_aws_cost *= usd_gbp_conversion_rate
472
+
473
  return calculated_aws_cost
474
 
475
  def calculate_time_taken(number_of_pages:str,
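
As a sanity check on the pricing defaults in calculate_aws_costs, the arithmetic can be reproduced by hand. Illustrative only: the per-page Comprehend unit count below is an assumption, and AWS prices and the USD/GBP rate change over time:

# 100 pages through Textract plus Comprehend PII detection
number_of_pages = 100
text_extraction_cost = number_of_pages * (1.5 / 1000)    # textract_page_cost -> $0.15
pii_identification_cost = number_of_pages * 3 * 0.0001   # assuming ~3 Comprehend units per page -> $0.03
total_usd = text_extraction_cost + pii_identification_cost
total_gbp = total_usd * 0.76                             # usd_gbp_conversion_rate default
print(round(total_usd, 4), round(total_gbp, 4))          # 0.18 0.1368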
tools/redaction_review.py CHANGED
@@ -577,7 +577,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
577
  output_files.append(orig_pdf_file_path)
578
 
579
  try:
580
- print("Saving review file.")
581
  review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
582
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
583
 
@@ -756,6 +756,18 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
756
 
757
  return row_value_page, row_value_df
758
759
  def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
760
 
761
  row_value_code = evt.row_value[0] # This is the value for cost code
 
577
  output_files.append(orig_pdf_file_path)
578
 
579
  try:
580
+ #print("Saving review file.")
581
  review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
582
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
583
 
 
756
 
757
  return row_value_page, row_value_df
758
 
759
+ def df_select_callback_textract_api(df: pd.DataFrame, evt: gr.SelectData):
760
+
761
+ #print("evt.data:", evt._data)
762
+
763
+ row_value_job_id = evt.row_value[0] # This is the job ID value
764
+ # row_value_label = evt.row_value[1] # This is the label value
765
+ row_value_job_type = evt.row_value[2] # This is the job type value
766
+
767
+ row_value_df = pd.DataFrame(data={"job_id":[row_value_job_id], "label":[row_value_job_type]})
768
+
769
+ return row_value_job_id, row_value_job_type, row_value_df
770
+
771
  def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
772
 
773
  row_value_code = evt.row_value[0] # This is the value for cost code
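
The new df_select_callback_textract_api above is designed to be wired to a Gradio Dataframe select event; the event object is injected via the gr.SelectData annotation. A hypothetical wiring sketch (component names are illustrative, not from app.py):

import gradio as gr
import pandas as pd
from tools.redaction_review import df_select_callback_textract_api

with gr.Blocks() as demo:
    jobs_df = gr.Dataframe(pd.DataFrame(columns=["job_id", "file", "job_type"]))
    job_id_box = gr.Textbox(label="Job ID")
    job_type_box = gr.Textbox(label="Job type")
    selection_df = gr.Dataframe(visible=False)

    # Clicking a row fills the textboxes and the hidden selection dataframe
    jobs_df.select(df_select_callback_textract_api,
                   inputs=[jobs_df],
                   outputs=[job_id_box, job_type_box, selection_df])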
tools/textract_batch_call.py CHANGED
@@ -1,22 +1,36 @@
1
  import boto3
2
  import time
3
  import os
 
4
  import json
5
  import logging
 
 
 
6
  from urllib.parse import urlparse
 
7
 
8
- # Configure logging
9
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
10
 
11
- def analyze_pdf_with_textract(
 
12
  local_pdf_path: str,
13
- s3_bucket_name: str,
14
  s3_input_prefix: str,
15
  s3_output_prefix: str,
16
- local_output_dir: str,
17
- aws_region: str = None, # Optional: specify region if not default
18
- poll_interval_seconds: int = 5,
19
- max_polling_attempts: int = 120 # ~10 minutes total wait time
 
 
 
20
  ):
21
  """
22
  Uploads a local PDF to S3, starts a Textract analysis job (detecting text & signatures),
@@ -27,10 +41,12 @@ def analyze_pdf_with_textract(
27
  s3_bucket_name (str): Name of the S3 bucket to use.
28
  s3_input_prefix (str): S3 prefix (folder) to upload the input PDF.
29
  s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
30
- local_output_dir (str): Local directory to save the downloaded JSON results.
 
 
 
 
31
  aws_region (str, optional): AWS region name. Defaults to boto3 default region.
32
- poll_interval_seconds (int): Seconds to wait between polling Textract status.
33
- max_polling_attempts (int): Maximum number of times to poll Textract status.
34
 
35
  Returns:
36
  str: Path to the downloaded local JSON output file, or None if failed.
@@ -41,12 +57,21 @@ def analyze_pdf_with_textract(
41
  Exception: For other AWS errors or job failures.
42
  """
43
44
  if not os.path.exists(local_pdf_path):
45
- raise FileNotFoundError(f"Input PDF not found: {local_pdf_path}")
46
 
47
  if not os.path.exists(local_output_dir):
48
  os.makedirs(local_output_dir)
49
- logging.info(f"Created local output directory: {local_output_dir}")
 
 
50
 
51
  # Initialize boto3 clients
52
  session = boto3.Session(region_name=aws_region)
@@ -57,216 +82,407 @@ def analyze_pdf_with_textract(
57
  pdf_filename = os.path.basename(local_pdf_path)
58
  s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace("\\", "/") # Ensure forward slashes for S3
59
 
60
- logging.info(f"Uploading '{local_pdf_path}' to 's3://{s3_bucket_name}/{s3_input_key}'...")
 
 
61
  try:
62
  s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
63
- logging.info("Upload successful.")
 
 
64
  except Exception as e:
65
- logging.error(f"Failed to upload PDF to S3: {e}")
 
 
66
  raise
67
68
  # --- 2. Start Textract Document Analysis ---
69
- logging.info("Starting Textract document analysis job...")
 
 
 
70
  try:
71
- response = textract_client.start_document_analysis(
72
- DocumentLocation={
73
- 'S3Object': {
74
- 'Bucket': s3_bucket_name,
75
- 'Name': s3_input_key
76
  }
77
- },
78
- FeatureTypes=['SIGNATURES', 'FORMS', 'TABLES'], # Analyze for signatures, forms, and tables
79
- OutputConfig={
80
- 'S3Bucket': s3_bucket_name,
81
- 'S3Prefix': s3_output_prefix
82
- }
83
- # Optional: Add NotificationChannel for SNS topic notifications
84
- # NotificationChannel={
85
- # 'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
86
- # 'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
87
- # }
88
- )
89
- job_id = response['JobId']
90
- logging.info(f"Textract job started with JobId: {job_id}")
91
-
92
- except Exception as e:
93
- logging.error(f"Failed to start Textract job: {e}")
94
- raise
95
-
96
- # --- 3. Poll for Job Completion ---
97
- job_status = 'IN_PROGRESS'
98
- attempts = 0
99
- logging.info("Polling Textract for job completion status...")
100
 
101
- while job_status == 'IN_PROGRESS' and attempts < max_polling_attempts:
102
- attempts += 1
103
- try:
104
- response = textract_client.get_document_analysis(JobId=job_id)
105
- job_status = response['JobStatus']
106
- logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
107
-
108
- if job_status == 'IN_PROGRESS':
109
- time.sleep(poll_interval_seconds)
110
- elif job_status == 'SUCCEEDED':
111
- logging.info("Textract job succeeded.")
112
- break
113
- elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
114
- status_message = response.get('StatusMessage', 'No status message provided.')
115
- warnings = response.get('Warnings', [])
116
- logging.error(f"Textract job ended with status: {job_status}. Message: {status_message}")
117
- if warnings:
118
- logging.warning(f"Warnings: {warnings}")
119
- # Decide if PARTIAL_SUCCESS should proceed or raise error
120
- # For simplicity here, we raise for both FAILED and PARTIAL_SUCCESS
121
- raise Exception(f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}")
122
- else:
123
- # Should not happen based on documentation, but handle defensively
124
- raise Exception(f"Unexpected Textract job status: {job_status}")
125
 
126
- except textract_client.exceptions.InvalidJobIdException:
127
- logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than 7 days) or never existed.")
128
- raise
129
- except Exception as e:
130
- logging.error(f"Error while polling Textract status for job {job_id}: {e}")
131
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
- if job_status != 'SUCCEEDED':
134
- raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")
135
 
136
- # --- 4. Download Output JSON from S3 ---
137
- # Textract typically creates output under s3_output_prefix/job_id/
138
- # There might be multiple JSON files if pagination occurred during writing.
139
- # Usually, for smaller docs, there's one file, often named '1'.
140
- # For robust handling, list objects and find the JSON(s).
141
 
142
- s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
143
- logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")
 
 
 
144
 
145
- downloaded_file_path = None
146
- try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  list_response = s3_client.list_objects_v2(
148
  Bucket=s3_bucket_name,
149
  Prefix=s3_output_key_prefix
150
  )
151
-
152
  output_files = list_response.get('Contents', [])
153
- if not output_files:
154
- # Sometimes Textract might take a moment longer to write the output after SUCCEEDED status
155
- logging.warning("No output files found immediately after job success. Waiting briefly and retrying list...")
156
- time.sleep(5)
157
- list_response = s3_client.list_objects_v2(
158
- Bucket=s3_bucket_name,
159
- Prefix=s3_output_key_prefix
160
- )
161
- output_files = list_response.get('Contents', [])
162
-
163
- if not output_files:
164
- logging.error(f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}")
165
- # You could alternatively try getting results via get_document_analysis pagination here
166
- # but sticking to the request to download from S3 output path.
167
- raise FileNotFoundError(f"Textract output files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
168
-
169
- # Usually, we only need the first/main JSON output file(s)
170
- # For simplicity, download the first one found. A more complex scenario might merge multiple files.
171
- # Filter out potential directory markers if any key ends with '/'
172
- json_files_to_download = [f for f in output_files if f['Key'] != s3_output_key_prefix and not f['Key'].endswith('/')]
173
-
174
- if not json_files_to_download:
175
- logging.error(f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}")
176
- raise FileNotFoundError(f"Textract output JSON files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
177
-
178
- # Let's download the first JSON found. Often it's the only one or the main one.
179
- s3_output_key = json_files_to_download[0]['Key']
180
- output_filename_base = os.path.basename(pdf_filename).replace('.pdf', '')
181
- local_output_filename = f"{output_filename_base}_textract_output_{job_id}.json"
182
- local_output_path = os.path.join(local_output_dir, local_output_filename)
183
-
184
- logging.info(f"Downloading Textract output from 's3://{s3_bucket_name}/{s3_output_key}' to '{local_output_path}'...")
185
- s3_client.download_file(s3_bucket_name, s3_output_key, local_output_path)
186
- logging.info("Download successful.")
187
- downloaded_file_path = local_output_path
188
-
189
- # Log if multiple files were found, as user might need to handle them
190
- if len(json_files_to_download) > 1:
191
- logging.warning(f"Multiple output files found in S3 output location. Downloaded the first: '{s3_output_key}'. Other files exist.")
192
 
193
- except Exception as e:
194
- logging.error(f"Failed to download or process Textract output from S3: {e}")
195
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  return downloaded_file_path
198
 
199
- # --- Example Usage ---
200
- if __name__ == '__main__':
201
- # --- Configuration --- (Replace with your actual values)
202
- MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
203
- MY_S3_BUCKET = "your-textract-demo-bucket-name" # MUST BE UNIQUE GLOBALLY
204
- MY_S3_INPUT_PREFIX = "textract-inputs" # Folder in the bucket for uploads
205
- MY_S3_OUTPUT_PREFIX = "textract-outputs" # Folder in the bucket for results
206
- MY_LOCAL_OUTPUT_DIR = "./textract_results" # Local folder to save JSON
207
- MY_AWS_REGION = "us-east-1" # e.g., 'us-east-1', 'eu-west-1'
208
-
209
- # --- Create a dummy PDF for testing if you don't have one ---
210
- # Requires 'reportlab' library: pip install reportlab
211
- try:
212
- from reportlab.pdfgen import canvas
213
- from reportlab.lib.pagesizes import letter
214
- if not os.path.exists(MY_LOCAL_PDF):
215
- print(f"Creating dummy PDF: {MY_LOCAL_PDF}")
216
- c = canvas.Canvas(MY_LOCAL_PDF, pagesize=letter)
217
- c.drawString(100, 750, "This is a test document for AWS Textract.")
218
- c.drawString(100, 700, "It includes some text and a placeholder for a signature.")
219
- c.drawString(100, 650, "Signed:")
220
- # Draw a simple line/scribble for signature placeholder
221
- c.line(150, 630, 250, 645)
222
- c.line(250, 645, 300, 620)
223
- c.save()
224
- print("Dummy PDF created.")
225
- except ImportError:
226
- if not os.path.exists(MY_LOCAL_PDF):
227
- print(f"Warning: reportlab not installed and '{MY_LOCAL_PDF}' not found. Cannot run example without an input PDF.")
228
- exit() # Exit if no PDF available for the example
229
- except Exception as e:
230
- print(f"Error creating dummy PDF: {e}")
231
- exit()
232
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
- # --- Run the analysis ---
235
- try:
236
- output_json_path = analyze_pdf_with_textract(
237
- local_pdf_path=MY_LOCAL_PDF,
238
- s3_bucket_name=MY_S3_BUCKET,
239
- s3_input_prefix=MY_S3_INPUT_PREFIX,
240
- s3_output_prefix=MY_S3_OUTPUT_PREFIX,
241
- local_output_dir=MY_LOCAL_OUTPUT_DIR,
242
- aws_region=MY_AWS_REGION
243
- )
244
 
245
- if output_json_path:
246
- print(f"\n--- Analysis Complete ---")
247
- print(f"Textract output JSON saved to: {output_json_path}")
248
 
249
- # Optional: Load and print some info from the JSON
250
- with open(output_json_path, 'r') as f:
251
- results = json.load(f)
252
- print(f"Detected {results.get('DocumentMetadata', {}).get('Pages', 'N/A')} page(s).")
253
- # Find signature blocks (Note: This is basic, real parsing might be more complex)
254
- signature_blocks = [block for block in results.get('Blocks', []) if block.get('BlockType') == 'SIGNATURE']
255
- print(f"Found {len(signature_blocks)} potential signature block(s).")
256
- if signature_blocks:
257
- print(f"First signature confidence: {signature_blocks[0].get('Confidence', 'N/A')}")
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
- except FileNotFoundError as e:
261
- print(f"\nError: Input file not found. {e}")
262
- except Exception as e:
263
- print(f"\nAn error occurred during the process: {e}")
264
 
265
- import boto3
266
- import time
267
- import os
268
 
269
- def download_textract_output(job_id, output_bucket, output_prefix, local_folder):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  """
271
  Checks the status of a Textract job and downloads the output ZIP file if the job is complete.
272
 
@@ -290,8 +506,8 @@ def download_textract_output(job_id, output_bucket, output_prefix, local_folder)
          print("Job failed:", response.get("StatusMessage", "No error message provided."))
          return
      else:
-         print(f"Job is still {status}, waiting...")
-         time.sleep(10) # Wait before checking again

      # Find output ZIP file in S3
      output_file_key = f"{output_prefix}/{job_id}.zip"
@@ -303,6 +519,3 @@ def download_textract_output(job_id, output_bucket, output_prefix, local_folder)
          print(f"Output file downloaded to: {local_file_path}")
      except Exception as e:
          print(f"Error downloading file: {e}")
-
- # Example usage:
- # download_textract_output("your-job-id", "your-output-bucket", "your-output-prefix", "/path/to/local/folder")

  import boto3
  import time
  import os
+ import pandas as pd
  import json
  import logging
+ import datetime
+ from typing import List
+ from io import StringIO
  from urllib.parse import urlparse
+ from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError

+ # MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
+ # MY_S3_BUCKET = TEXTRACT_BULK_ANALYSIS_BUCKET # MUST BE UNIQUE GLOBALLY
+ # MY_S3_INPUT_PREFIX = session_hash_textbox # Folder in the bucket for uploads
+ # MY_S3_OUTPUT_PREFIX = session_hash_textbox # Folder in the bucket for results
+ # MY_LOCAL_OUTPUT_DIR = OUTPUT_FOLDER # Local folder to save JSON
+ # MY_AWS_REGION = AWS_REGION # e.g., 'us-east-1', 'eu-west-1'
+ from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
+ from tools.aws_textract import json_to_ocrresult

+
+ def analyse_document_with_textract_api(
      local_pdf_path: str,
      s3_input_prefix: str,
      s3_output_prefix: str,
+     job_df: pd.DataFrame,
+     s3_bucket_name: str = TEXTRACT_BULK_ANALYSIS_BUCKET,
+     local_output_dir: str = OUTPUT_FOLDER,
+     analyse_signatures: List[str] = [],
+     successful_job_number: int = 0,
+     general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
+     aws_region: str = AWS_REGION # Optional: specify region if not default
  ):
      """
      Uploads a local PDF to S3, starts a Textract analysis job (detecting text & signatures),

          s3_bucket_name (str): Name of the S3 bucket to use.
          s3_input_prefix (str): S3 prefix (folder) to upload the input PDF.
          s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
+         job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
+         s3_bucket_name (str, optional): S3 bucket in which to save API call outputs.
+         local_output_dir (str, optional): Local directory to save the downloaded JSON results.
+         analyse_signatures (List[str], optional): Signature-analysis options; include "Extract signatures" to run signature detection. Defaults to no signature extraction.
+         successful_job_number (int): The number of successful jobs that have been submitted in this session.
          aws_region (str, optional): AWS region name. Defaults to boto3 default region.

      Returns:
          str: Path to the downloaded local JSON output file, or None if failed.

          Exception: For other AWS errors or job failures.
      """

+     # This is a variable that is written to logs to indicate that a Textract API call was made
+     is_a_textract_api_call = True
+
+     # Keep only the latest pdf path if a list is passed in
+     if isinstance(local_pdf_path, list):
+         local_pdf_path = local_pdf_path[-1]
+
      if not os.path.exists(local_pdf_path):
+         raise FileNotFoundError(f"Input document not found: {local_pdf_path}")

      if not os.path.exists(local_output_dir):
          os.makedirs(local_output_dir)
+         log_message = f"Created local output directory: {local_output_dir}"
+         print(log_message)
+         #logging.info(log_message)

      # Initialize boto3 clients
      session = boto3.Session(region_name=aws_region)

      pdf_filename = os.path.basename(local_pdf_path)
      s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace("\\", "/") # Ensure forward slashes for S3

+     log_message = f"Uploading '{local_pdf_path}' to 's3://{s3_bucket_name}/{s3_input_key}'..."
+     print(log_message)
+     #logging.info(log_message)
      try:
          s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
+         log_message = "Upload successful."
+         print(log_message)
+         #logging.info(log_message)
      except Exception as e:
+         log_message = f"Failed to upload PDF to S3: {e}"
+         print(log_message)
+         #logging.error(log_message)
          raise

+     # If job_df is not empty, check whether this file has already been analysed with the same signature option
+     if not job_df.empty:
+         if "file_name" in job_df.columns:
+             matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "file_name"]
+
+             if len(matching_job_id_file_names) > 0:
+                 raise Exception("Existing Textract outputs found. No need to re-analyse. Please download existing results from the list.")
+
      # --- 2. Start Textract Document Analysis ---
+     message = "Starting Textract document analysis job..."
+     print(message)
+     #logging.info("Starting Textract document analysis job...")
+
      try:
+         if "Extract signatures" in analyse_signatures:
+             response = textract_client.start_document_analysis(
+                 DocumentLocation={
+                     'S3Object': {
+                         'Bucket': s3_bucket_name,
+                         'Name': s3_input_key
+                     }
+                 },
+                 FeatureTypes=['SIGNATURES'], # Analyze for signatures only
+                 OutputConfig={
+                     'S3Bucket': s3_bucket_name,
+                     'S3Prefix': s3_output_prefix
                  }
+                 # Optional: Add NotificationChannel for SNS topic notifications
+                 # NotificationChannel={
+                 #     'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
+                 #     'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
+                 # }
+             )
+             job_type = "document_analysis"

+         else:
+             response = textract_client.start_document_text_detection(
+                 DocumentLocation={
+                     'S3Object': {
+                         'Bucket': s3_bucket_name,
+                         'Name': s3_input_key
+                     }
+                 },
+                 OutputConfig={
+                     'S3Bucket': s3_bucket_name,
+                     'S3Prefix': s3_output_prefix
+                 }
+                 # Optional: Add NotificationChannel for SNS topic notifications
+                 # NotificationChannel={
+                 #     'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
+                 #     'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
+                 # }
+             )
+             job_type = "document_text_detection"

+         job_id = response['JobId']
+         print(f"Textract job started with JobId: {job_id}")
+         #logging.info(f"Textract job started with JobId: {job_id}")
+
+         # Append the job details to a local log CSV, then push the log to S3
+         log_csv_key_location = f"{s3_output_prefix}/textract_document_jobs.csv"
+         job_location_full = f"s3://{s3_bucket_name}/{s3_output_prefix}/{job_id}/"
+
+         csv_buffer = StringIO()
+         log_df = pd.DataFrame([{
+             'job_id': job_id,
+             'file_name': pdf_filename,
+             'job_type': job_type,
+             'signature_extraction': analyse_signatures,
+             's3_location': job_location_full,
+             'job_date_time': datetime.datetime.now()
+         }])
+
+         # File path
+         log_file_path = os.path.join(local_output_dir, "textract_job_log_files.csv")
+
+         # Check if file exists
+         file_exists = os.path.exists(log_file_path)
+
+         # Append to CSV if it exists, otherwise write with header
+         log_df.to_csv(log_file_path, mode='a', index=False, header=not file_exists)
+
+         #log_df.to_csv(csv_buffer)

+         # Upload the log file
+         s3_client.upload_file(log_file_path, general_s3_bucket_name, log_csv_key_location)

+         # Upload to S3 (overwrite existing file)
+         #s3_client.put_object(Bucket=general_s3_bucket_name, Key=log_csv_key_location, Body=csv_buffer.getvalue())
+         print(f"Job ID written to {log_csv_key_location}")
+         #logging.info(f"Job ID written to s3://{s3_bucket_name}/{s3_output_prefix}/textract_document_jobs.csv")

+     except Exception as e:
+         error = f"Failed to start Textract job: {e}"
+         print(error)
+         #logging.error(error)
+         raise

+     successful_job_number += 1
+
+     return f"Textract analysis job submitted, job ID: {job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call
+
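[Note: a minimal usage sketch for the new submission function, with hypothetical file, bucket and prefix values; it assumes the tools.config constants resolve and AWS credentials are available. An empty job_df means no previous jobs are checked.]

    import pandas as pd
    from tools.textract_batch_call import analyse_document_with_textract_api

    message, job_id, job_type, n_jobs, is_api_call = analyse_document_with_textract_api(
        local_pdf_path="example.pdf",               # hypothetical input document
        s3_input_prefix="my-session/input",         # hypothetical prefix
        s3_output_prefix="my-session/output",       # hypothetical prefix
        job_df=pd.DataFrame(),                      # no previous jobs recorded
        analyse_signatures=["Extract signatures"],  # omit for plain text detection
    )
    print(message)  # "Textract analysis job submitted, job ID: ..."

The function deliberately raises when job_df already records the same file name with the same signature option, so accidental duplicate submissions are guarded against.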
+ def return_job_status(job_id: str,
+                       response: dict,
+                       attempts: int,
+                       poll_interval_seconds: int = 5,
+                       max_polling_attempts: int = 1 # Number of polling attempts before the caller gives up
+                       ):
+     job_status = response['JobStatus']
+     logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
+
+     if job_status == 'IN_PROGRESS':
+         time.sleep(poll_interval_seconds)
+     elif job_status == 'SUCCEEDED':
+         logging.info("Textract job succeeded.")
+     elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
+         status_message = response.get('StatusMessage', 'No status message provided.')
+         warnings = response.get('Warnings', [])
+         logging.error(f"Textract job ended with status: {job_status}. Message: {status_message}")
+         if warnings:
+             logging.warning(f"Warnings: {warnings}")
+         # Decide if PARTIAL_SUCCESS should proceed or raise error
+         # For simplicity here, we raise for both FAILED and PARTIAL_SUCCESS
+         raise Exception(f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}")
+     else:
+         # Should not happen based on documentation, but handle defensively
+         raise Exception(f"Unexpected Textract job status: {job_status}")
+
+     return job_status
+
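[Note: return_job_status only reads the JobStatus, StatusMessage and Warnings keys of the Get* response, so its branches can be exercised with a stub dict; this is a sketch, not the full Textract response shape.]

    ok = return_job_status("job-123", {"JobStatus": "SUCCEEDED"}, attempts=1)
    # ok == "SUCCEEDED"

    # A FAILED or PARTIAL_SUCCESS response raises instead of returning:
    # return_job_status("job-123", {"JobStatus": "FAILED", "StatusMessage": "Bad input"}, attempts=1)

An IN_PROGRESS status simply sleeps for poll_interval_seconds and returns 'IN_PROGRESS', leaving the retry decision to the caller.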
+ def download_textract_job_files(s3_client,  # boto3 S3 client object, not a string
+                                 s3_bucket_name: str,
+                                 s3_output_key_prefix: str,
+                                 pdf_filename: str,
+                                 job_id: str,
+                                 local_output_dir: str):
+     list_response = s3_client.list_objects_v2(
+         Bucket=s3_bucket_name,
+         Prefix=s3_output_key_prefix
+     )
+
+     output_files = list_response.get('Contents', [])
+     if not output_files:
+         # Sometimes Textract might take a moment longer to write the output after SUCCEEDED status
+         #logging.warning("No output files found immediately after job success. Waiting briefly and retrying list...")
+         #time.sleep(5)
          list_response = s3_client.list_objects_v2(
              Bucket=s3_bucket_name,
              Prefix=s3_output_key_prefix
          )
          output_files = list_response.get('Contents', [])

+     if not output_files:
+         logging.error(f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}")
+         # You could alternatively try getting results via get_document_analysis pagination here
+         # but sticking to the request to download from S3 output path.
+         raise FileNotFoundError(f"Textract output files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
+
+     # Textract may split one job's output across several JSON parts; collect them all and merge their blocks below.
+     # Filter out potential directory markers and the service's access_check object.
+     json_files_to_download = [
+         f for f in output_files
+         if f['Key'] != s3_output_key_prefix and not f['Key'].endswith('/') and 'access_check' not in f['Key']
+     ]
+
+     #print("json_files_to_download:", json_files_to_download)
+
+     if not json_files_to_download:
+         error = f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}"
+         print(error)
+         #logging.error(error)
+         raise FileNotFoundError(error)
+
+     combined_blocks = []
+
+     for f in sorted(json_files_to_download, key=lambda x: x['Key']): # Sort to keep page order consistent
+         obj = s3_client.get_object(Bucket=s3_bucket_name, Key=f['Key'])
+         data = json.loads(obj['Body'].read())
+
+         # Assuming Textract-style output with a "Blocks" key
+         if "Blocks" in data:
+             combined_blocks.extend(data["Blocks"])
+         else:
+             logging.warning(f"No 'Blocks' key in file: {f['Key']}")
+
+     # Build final combined JSON structure
+     combined_output = {
+         "DocumentMetadata": {
+             "Pages": len(set(block.get('Page', 1) for block in combined_blocks))
+         },
+         "Blocks": combined_blocks,
+         "JobStatus": "SUCCEEDED"
+     }
+
+     output_filename_base = os.path.basename(pdf_filename)
+     output_filename_base_no_ext = os.path.splitext(output_filename_base)[0]
+     local_output_filename = f"{output_filename_base_no_ext}_textract.json"
+     local_output_path = os.path.join(local_output_dir, local_output_filename)
+
+     with open(local_output_path, 'w') as f:
+         json.dump(combined_output, f)
+
+     print(f"Combined Textract output written to {local_output_path}")
+
+     # logging.info(f"Downloading Textract output from 's3://{s3_bucket_name}/{s3_output_key}' to '{local_output_path}'...")
+     # s3_client.download_file(s3_bucket_name, s3_output_key, local_output_path)
+     # logging.info("Download successful.")
+     downloaded_file_path = local_output_path
+
+     # Log if multiple files were found, as user might need to handle them
+     #if len(json_files_to_download) > 1:
+     #    logging.warning(f"Multiple output files found in S3 output location. Downloaded the first: '{s3_output_key}'. Other files exist.")

      return downloaded_file_path
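[Note: because the per-page JSON parts are merged into one file, downstream checks like the removed __main__ example still work on the combined output. A sketch, with the file name as produced above ("<pdf stem>_textract.json"):]

    import json

    with open("example_textract.json", "r") as f:
        results = json.load(f)

    print(f"Detected {results['DocumentMetadata']['Pages']} page(s).")
    signature_blocks = [b for b in results["Blocks"] if b.get("BlockType") == "SIGNATURE"]
    print(f"Found {len(signature_blocks)} potential signature block(s).")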
+ def check_for_provided_job_id(job_id: str):
+     if not job_id:
+         raise Exception("Please provide a job ID.")
+     return

+ def poll_bulk_textract_analysis_progress_and_download(
+     job_id: str,
+     job_type_dropdown: str,
+     s3_output_prefix: str,
+     pdf_filename: str,
+     job_df: pd.DataFrame,
+     s3_bucket_name: str = TEXTRACT_BULK_ANALYSIS_BUCKET,
+     local_output_dir: str = OUTPUT_FOLDER,
+     load_s3_jobs_loc: str = TEXTRACT_JOBS_S3_LOC,
+     load_local_jobs_loc: str = TEXTRACT_JOBS_LOCAL_LOC,
+     aws_region: str = AWS_REGION, # Optional: specify region if not default
+     poll_interval_seconds: int = 1,
+     max_polling_attempts: int = 1 # One check per call; the caller is expected to poll again if still in progress
+ ):

+     if job_id:
+         # Initialize boto3 clients
+         session = boto3.Session(region_name=aws_region)
+         s3_client = session.client('s3')
+         textract_client = session.client('textract')

+         # --- 3. Poll for Job Completion ---
+         job_status = 'IN_PROGRESS'
+         attempts = 0

+         message = "Polling Textract for job completion status..."
+         print(message)
+         #logging.info("Polling Textract for job completion status...")

+         # Update Textract document history df
+         try:
+             job_df = load_in_textract_job_details(load_s3_jobs=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+                                                   load_s3_jobs_loc=load_s3_jobs_loc,
+                                                   load_local_jobs_loc=load_local_jobs_loc)
+         except Exception as e:
+             #logging.error(f"Failed to update job details dataframe: {e}")
+             print(f"Failed to update job details dataframe: {e}")
+             #raise
+
+         while job_status == 'IN_PROGRESS' and attempts < max_polling_attempts:
+             attempts += 1
+             try:
+                 if job_type_dropdown == "document_analysis":
+                     response = textract_client.get_document_analysis(JobId=job_id)
+                     job_status = return_job_status(job_id, response, attempts, poll_interval_seconds, max_polling_attempts)
+                 elif job_type_dropdown == "document_text_detection":
+                     response = textract_client.get_document_text_detection(JobId=job_id)
+                     job_status = return_job_status(job_id, response, attempts, poll_interval_seconds, max_polling_attempts)
+                 else:
+                     error = "Unknown job type, cannot poll job"
+                     print(error)
+                     #logging.error(error)
+                     raise Exception(error) # a bare raise here would fail outside an active exception
+
+             except textract_client.exceptions.InvalidJobIdException:
+                 error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than 7 days) or never existed."
+                 print(error_message)
+                 logging.error(error_message)
+                 raise
+             except Exception as e:
+                 error_message = f"Error while polling Textract status for job {job_id}: {e}"
+                 print(error_message)
+                 logging.error(error_message)
+                 raise
+
+         downloaded_file_path = None
+         if job_status == 'SUCCEEDED':
+             #raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")
+             # 3b - Replace the provided PDF file name with the one recorded for this job, if present
+
+             # If job_df is not empty
+             if not job_df.empty:
+                 if "file_name" in job_df.columns:
+                     matching_job_id_file_names = job_df.loc[job_df["job_id"] == job_id, "file_name"]
+
+                     if pdf_filename and not matching_job_id_file_names.empty:
+                         if pdf_filename == matching_job_id_file_names.iloc[0]:
+                             raise Exception("Existing Textract outputs found. No need to re-download.")
+
+                     if not matching_job_id_file_names.empty:
+                         pdf_filename = matching_job_id_file_names.iloc[0]
+                     else:
+                         pdf_filename = "unknown_file"
+
+             # --- 4. Download Output JSON from S3 ---
+             # Textract typically creates output under s3_output_prefix/job_id/
+             # There might be multiple JSON files if pagination occurred during writing.
+             # Usually, for smaller docs, there's one file, often named '1'.
+             # For robust handling, list objects and find the JSON(s).
+
+             s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
+             logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")
+
+             try:
+                 downloaded_file_path = download_textract_job_files(s3_client,
+                                                                    s3_bucket_name,
+                                                                    s3_output_key_prefix,
+                                                                    pdf_filename,
+                                                                    job_id,
+                                                                    local_output_dir)
+
+             except Exception as e:
+                 #logging.error(f"Failed to download or process Textract output from S3: {e}")
+                 print(f"Failed to download or process Textract output from S3: {e}")
+                 raise
+
+     else:
+         raise Exception("No Job ID provided.")
+
+     return downloaded_file_path, job_status, job_df
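[Note: a usage sketch for the polling function, with hypothetical values; job_id and job_type come from analyse_document_with_textract_api. With the default max_polling_attempts of 1, an unfinished job returns (None, 'IN_PROGRESS', job_df) rather than blocking, so a UI can simply call this again later.]

    import pandas as pd

    path, status, job_df = poll_bulk_textract_analysis_progress_and_download(
        job_id="0123456789abcdef",                   # hypothetical job ID
        job_type_dropdown="document_text_detection",
        s3_output_prefix="my-session/output",        # hypothetical prefix
        pdf_filename="example.pdf",
        job_df=pd.DataFrame(),
    )
    if status == "SUCCEEDED":
        print(f"Combined output saved to {path}")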
+

+ def load_in_textract_job_details(load_s3_jobs: str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+                                  load_s3_jobs_loc: str = TEXTRACT_JOBS_S3_LOC,
+                                  load_local_jobs_loc: str = TEXTRACT_JOBS_LOCAL_LOC,
+                                  document_redaction_bucket: str = DOCUMENT_REDACTION_BUCKET,
+                                  aws_region: str = AWS_REGION):
+
+     job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])

+     # Initialize boto3 clients
+     session = boto3.Session(region_name=aws_region)
+     s3_client = session.client('s3')

+     local_output_path = f'{load_local_jobs_loc}/textract_job_log_files.csv'

+     if load_s3_jobs == 'True':
+
+         s3_output_key = f'{load_s3_jobs_loc}/textract_job_log_files.csv'
+
+         try:
+             s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
+             print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
+             s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
+             print("Download successful.")
+         except ClientError as e:
+             if e.response['Error']['Code'] == '404':
+                 print("Log file does not exist in S3.")
+             else:
+                 print(f"Unexpected error occurred: {e}")
+         except (NoCredentialsError, PartialCredentialsError, TokenRetrievalError) as e:
+             print(f"AWS credential issue encountered: {e}")
+             print("Skipping S3 log file download.")
+
+     # If the log path exists, load it in
+     if os.path.exists(local_output_path):
+         print("Found log file in local path")
+         job_df = pd.read_csv(local_output_path)
+
+         if "job_date_time" in job_df.columns:
+             job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors='coerce')
+             # Keep only jobs submitted in the last 7 days (Textract results expire after that)
+             cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=7)
+             job_df = job_df.loc[job_df["job_date_time"] >= cutoff_time, :]
+
+     return job_df
+
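[Note: the frame returned here mirrors the CSV appended by analyse_document_with_textract_api, so the columns are stable even when no log exists yet. A quick local-only check, skipping the S3 download:]

    job_df = load_in_textract_job_details(load_s3_jobs='False')
    print(job_df.columns.tolist())
    # ['job_id', 'file_name', 'job_type', 'signature_extraction', 's3_location', 'job_date_time']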
+ def download_textract_output(job_id: str,
+                              output_bucket: str,
+                              output_prefix: str,
+                              local_folder: str):
      """
      Checks the status of a Textract job and downloads the output ZIP file if the job is complete.

          print("Job failed:", response.get("StatusMessage", "No error message provided."))
          return
      else:
+         print(f"Job is still {status}.")
+         #time.sleep(10) # Wait before checking again

      # Find output ZIP file in S3
      output_file_key = f"{output_prefix}/{job_id}.zip"

          print(f"Output file downloaded to: {local_file_path}")
      except Exception as e:
          print(f"Error downloading file: {e}")
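[Note: the example-usage comment was removed from the end of the module in this diff; for reference, a call to the remaining helper looks like this, with all values hypothetical:]

    download_textract_output("your-job-id", "your-output-bucket", "your-output-prefix", "/path/to/local/folder")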