seanpedrickcase commited on
Commit
0ea8b9e
·
1 Parent(s): 0e1a4a7

Major update. General code revision. Improved config variables. Dataframe based review frame now includes text, items can be searched and excluded. Costs now estimated. Option for adding cost codes added. Option to extract text only.

Browse files
app.py CHANGED
@@ -1,33 +1,27 @@
1
  import os
2
- import socket
3
-
4
- # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
5
- #os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
6
-
7
- import gradio as gr
8
  import pandas as pd
9
- from datetime import datetime
10
  from gradio_image_annotation import image_annotator
11
- from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
- from tools.config import output_folder, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, DEFAULT_ALLOW_LIST_PATH
14
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe
15
- from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
16
  from tools.file_redaction import choose_and_run_redactor
17
- from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
18
- from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal
19
  from tools.data_anonymise import anonymise_data_files
20
  from tools.auth import authenticate_user
21
  from tools.load_spacy_model_custom_recognisers import custom_entities
22
  from tools.custom_csvlogger import CSVLogger_custom
23
  from tools.find_duplicate_pages import identify_similar_pages
24
 
25
- today_rev = datetime.now().strftime("%Y%m%d")
 
26
 
27
- add_folder_to_path("tesseract/")
28
- add_folder_to_path("poppler/poppler-24.02.0/Library/bin/")
29
 
30
- ensure_output_folder_exists()
31
 
32
  chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
33
 
@@ -42,12 +36,7 @@ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "
42
 
43
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']
44
 
45
- language = 'en'
46
-
47
- host_name = socket.gethostname()
48
- feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
49
- access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
50
- usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
51
 
52
  file_input_height = 200
53
 
@@ -71,13 +60,14 @@ with app:
71
  pdf_doc_state = gr.State([])
72
  all_image_annotations_state = gr.State([])
73
 
74
- all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas")
75
- all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas")
76
- review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas")
77
 
78
  session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
79
  s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
80
- output_folder_textbox = gr.Textbox(value = output_folder, label="output_folder_textbox", visible=False)
 
81
 
82
  first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False)
83
  second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False)
@@ -88,36 +78,38 @@ with app:
88
  document_cropboxes = gr.Dropdown(label = "document_cropboxes", value="", allow_custom_value=True,visible=False)
89
  page_sizes = gr.Dropdown(label = "page_sizes", value="", allow_custom_value=True, visible=False)
90
  images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
 
91
 
92
  output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
93
  output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False)
94
  text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False)
95
- log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False)
 
96
 
97
  # Backup versions of these objects in case you make a mistake
98
  backup_review_state = gr.Dataframe(visible=False)
99
  backup_image_annotations_state = gr.State([])
100
- backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
101
-
102
 
103
  # Logging state
104
- log_file_name = 'log.csv'
105
-
106
- feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=feedback_logs_folder + log_file_name, visible=False)
107
- feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=feedback_logs_folder, visible=False)
108
- access_logs_state = gr.Textbox(label= "access_logs_state", value=access_logs_folder + log_file_name, visible=False)
109
- access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=access_logs_folder, visible=False)
110
- usage_logs_state = gr.Textbox(label= "usage_logs_state", value=usage_logs_folder + log_file_name, visible=False)
111
- usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=usage_logs_folder, visible=False)
112
 
113
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
114
  textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
115
  comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)
116
-
 
117
  doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
118
  doc_file_name_no_extension_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
 
119
  doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
120
  doc_file_name_textbox_list = gr.Dropdown(label = "doc_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
 
121
 
122
  data_full_file_name_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
123
  data_file_name_no_extension_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
@@ -129,9 +121,8 @@ with app:
129
  text_name_const = gr.Textbox(label="text_name_const", value="text", visible=False)
130
  page_name_const = gr.Textbox(label="page_name_const", value="page", visible=False)
131
 
132
- estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
133
  annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
134
-
135
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
136
 
137
  ## Annotator zoom value
@@ -141,31 +132,47 @@ with app:
141
 
142
  clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
143
  prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
 
144
 
145
  ## Settings page variables
146
  default_deny_list_file_name = "default_deny_list.csv"
147
- default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
148
  in_deny_list_text_in = gr.Textbox(value="deny_list", visible=False)
149
 
150
  fully_redacted_list_file_name = "default_fully_redacted_list.csv"
151
- fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
152
  in_fully_redacted_text_in = gr.Textbox(value="fully_redacted_pages_list", visible=False)
153
 
154
  # S3 settings for default allow list load
155
- s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
 
 
156
 
157
- default_allow_list_file_name = "default_allow_list.csv"
158
- default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
159
-
160
- s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=DEFAULT_ALLOW_LIST_PATH, visible=False)
161
- default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
162
 
163
  # Base dataframe for recognisers that is not modified subsequent to load
164
- recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"])
165
 
166
  # Duplicate page detection
167
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
168
- duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  ###
171
  # UI DESIGN
@@ -174,35 +181,61 @@ with app:
174
  gr.Markdown(
175
  """# Document redaction
176
 
177
- Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use the app. Below is a very brief overview.
178
 
179
- To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
180
 
181
- After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
182
 
183
  NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
184
 
185
  ###
186
- # REDACTION PDF/IMAGES TABL
187
  ###
188
  with gr.Tab("Redact PDFs/images"):
189
  with gr.Accordion("Redact document", open = True):
190
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
191
- # if RUN_AWS_FUNCTIONS == "1":
192
- in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
193
- pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
194
- # else:
195
- # in_redaction_method = gr.Radio(label="Choose text extraction method.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option])
196
- # pii_identification_method_drop = gr.Radio(label = "Choose PII detection method.", value = default_pii_detector, choices=[local_pii_detector], visible=False)
197
-
198
- gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the redaction settings tab.""")
199
- document_redact_btn = gr.Button("Redact document", variant="primary")
200
- current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
201
- page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
  with gr.Row():
204
  output_summary = gr.Textbox(label="Output summary", scale=1)
205
- output_file = gr.File(label="Output files", scale = 2, height=file_input_height)
206
  latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
207
 
208
  with gr.Row():
@@ -221,7 +254,7 @@ with app:
221
 
222
  with gr.Accordion(label = "Review PDF redactions", open=True):
223
  output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions", file_count='multiple', height=file_input_height)
224
- upload_previous_review_file_btn = gr.Button("Review PDF and 'review file' csv provided above", variant="primary")
225
  with gr.Row():
226
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
227
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
@@ -236,12 +269,10 @@ with app:
236
  annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
237
  annotation_next_page_button = gr.Button("Next page", scale = 4)
238
  with gr.Column(scale=1):
239
- annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="secondary")
240
- #blank_markdown_top = gr.Markdown(value="", label="")
241
 
242
  with gr.Row():
243
  with gr.Column(scale=2):
244
-
245
  zoom_str = str(annotator_zoom_number) + '%'
246
 
247
  annotator = image_annotator(
@@ -266,11 +297,17 @@ with app:
266
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
267
  page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
268
  text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
269
- recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"])
 
 
 
 
270
  with gr.Row(equal_height=True):
271
  reset_dropdowns_btn = gr.Button(value="Reset filters")
272
- exclude_selected_btn = gr.Button(value="Exclude items in table from redactions")
273
  undo_last_removal_btn = gr.Button(value="Undo last element removal")
 
 
274
 
275
  with gr.Row():
276
  with gr.Column(scale=2):
@@ -285,17 +322,25 @@ with app:
285
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
286
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
287
  adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv', '.xfdf', '.pdf'])
288
- convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="secondary")
289
-
 
 
 
 
 
 
 
 
 
 
 
 
290
  ###
291
  # TEXT / TABULAR DATA TAB
292
  ###
293
  with gr.Tab(label="Open text or Excel/csv files"):
294
- gr.Markdown(
295
- """
296
- ### Choose open text or a tabular data file (xlsx or csv) to redact.
297
- """
298
- )
299
  with gr.Accordion("Paste open text", open = False):
300
  in_text = gr.Textbox(label="Enter open text", lines=10)
301
  with gr.Accordion("Upload xlsx or csv files", open = True):
@@ -321,15 +366,6 @@ with app:
321
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
322
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
323
 
324
- ###
325
- # IDENTIFY DUPLICATE PAGES TAB
326
- ###
327
- with gr.Tab(label="Identify duplicate pages"):
328
- with gr.Accordion("Identify duplicate pages to redact", open = True):
329
- in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
330
- find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary")
331
- duplicate_pages_out =gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
332
-
333
  ###
334
  # SETTINGS TAB
335
  ###
@@ -347,9 +383,9 @@ with app:
347
  in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
348
  with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists (NOTE: you need to press Enter after modifying/adding an entry to the lists to apply them)", open = False):
349
  with gr.Row():
350
- in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Allow list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True)
351
- in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["deny_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Deny list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True)
352
- in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number')
353
 
354
  with gr.Accordion("Select entity types to redact", open = True):
355
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
@@ -364,10 +400,9 @@ with app:
364
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
365
  page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
366
 
367
- with gr.Accordion("AWS Textract options", open = False):
368
- handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
369
  #with gr.Row():
370
- in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
371
 
372
  with gr.Row():
373
  aws_access_key_textbox = gr.Textbox(value='', label="AWS access key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")
@@ -384,31 +419,56 @@ with app:
384
 
385
  with gr.Accordion("View all output files from this session", open = False):
386
  all_output_files_btn = gr.Button("Click here to view all output files", variant="secondary")
387
- all_output_files = gr.File(label="All output files.", file_count='multiple', file_types=['.csv'], interactive=False)
388
-
389
 
390
  ### UI INTERACTION ###
391
 
392
  ###
393
  # PDF/IMAGE REDACTION
394
  ###
395
- in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
 
397
  # Run redaction function
398
- document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
399
- success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes],
400
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, document_cropboxes], api_name="redact_doc").\
401
- success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
402
-
 
403
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
404
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes],
405
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, document_cropboxes]).\
406
- success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
407
-
408
  # If a file has been completed, the function will continue onto the next document
409
- latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes],
410
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, document_cropboxes]).\
411
- success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
 
412
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
413
 
414
  ###
@@ -417,92 +477,75 @@ with app:
417
 
418
  # Upload previous files for modifying redactions
419
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
420
- success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
421
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes], api_name="prepare_doc").\
422
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
423
-
424
- # Page controls at top
425
- annotate_current_page.submit(
426
- modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
427
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
428
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
429
 
430
- annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
431
- success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
432
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
433
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
434
-
435
- annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
436
- success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
437
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
438
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
439
-
440
- # Zoom in and out on annotator
441
- annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
442
- success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_true_bool], outputs=[annotator_zoom_number, annotate_current_page])
443
-
444
- annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
445
- success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
446
-
447
- annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
448
-
449
- clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page]).\
450
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
451
 
452
- annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
 
453
 
454
- # Page controls at bottom
455
- annotate_current_page_bottom.submit(
456
- modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
457
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
458
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
459
 
460
- annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
461
- success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
462
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
463
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
464
-
465
- annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
466
- success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
467
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
468
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
469
 
470
  # Review table controls
471
  recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
472
  page_entity_dropdown.select(update_entities_df_page, inputs=[page_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, text_entity_dropdown])
473
  text_entity_dropdown.select(update_entities_df_text, inputs=[text_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, page_entity_dropdown])
474
 
475
- recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
476
- success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
477
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
478
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
479
 
480
- reset_dropdowns_btn.click(reset_dropdowns, outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
481
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
482
 
 
 
 
 
 
 
 
 
 
 
 
483
  exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
484
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
485
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
486
- # success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
487
 
488
  undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base]).\
489
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
 
 
 
 
 
490
 
491
  # Convert review file to xfdf Adobe format
492
- convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
493
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes]).\
494
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
495
 
496
  # Convert xfdf Adobe file back to review_file.csv
497
- convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
498
- success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes]).\
499
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
500
 
501
  ###
502
  # TABULAR DATA REDACTION
503
  ###
504
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
505
- success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
506
 
507
  tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
508
 
@@ -513,7 +556,7 @@ with app:
513
  ###
514
  # IDENTIFY DUPLICATE PAGES
515
  ###
516
- find_duplicate_pages_btn.click(fn=identify_similar_pages, inputs=[in_duplicate_pages], outputs=[duplicate_pages_df, duplicate_pages_out])
517
 
518
  ###
519
  # SETTINGS PAGE INPUT / OUTPUT
@@ -539,42 +582,55 @@ with app:
539
  ###
540
 
541
  # Get connection details on app load
542
- app.load(get_connection_params, inputs=[output_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox])
543
 
544
- # If relevant environment variable is set, load in the default allow list file from S3
545
- if GET_DEFAULT_ALLOW_LIST == "True" and DEFAULT_ALLOW_LIST_PATH:
546
- print("Loading allow list from default_allow_list_output_folder_location:", default_allow_list_loc)
547
- if not os.path.exists(default_allow_list_loc):
548
  app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
549
  success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
550
  else:
551
  app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
552
 
 
 
 
 
 
 
 
 
 
553
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
554
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
555
- access_callback.setup([session_hash_textbox], access_logs_folder)
556
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
557
  success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
558
 
559
  # User submitted feedback for pdf redactions
560
  pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
561
- pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], feedback_logs_folder)
562
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
563
  success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
564
 
565
  # User submitted feedback for data redactions
566
  data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
567
- data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], feedback_logs_folder)
568
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
569
  success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
570
 
571
  # Log processing time/token usage when making a query
572
- usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
573
- usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
574
- latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
575
- success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
576
-
577
 
 
 
 
 
 
 
 
 
578
 
579
  if __name__ == "__main__":
580
 
@@ -590,7 +646,7 @@ if __name__ == "__main__":
590
 
591
  main(first_loop_state, latest_file_completed=0, output_summary="", output_file_list=None,
592
  log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
593
- current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Redact all identified handwriting", "Redact all identified signatures"])
594
 
595
  # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
596
  # with gr.Tab(label="Advanced options"):
 
1
  import os
 
 
 
 
 
 
2
  import pandas as pd
3
+ import gradio as gr
4
  from gradio_image_annotation import image_annotator
 
5
 
6
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS
7
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken
8
+ from tools.aws_functions import upload_file_to_s3, download_file_from_s3
9
  from tools.file_redaction import choose_and_run_redactor
10
+ from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
11
+ from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df
12
  from tools.data_anonymise import anonymise_data_files
13
  from tools.auth import authenticate_user
14
  from tools.load_spacy_model_custom_recognisers import custom_entities
15
  from tools.custom_csvlogger import CSVLogger_custom
16
  from tools.find_duplicate_pages import identify_similar_pages
17
 
18
+ # Suppress downcasting warnings
19
+ pd.set_option('future.no_silent_downcasting', True)
20
 
21
+ add_folder_to_path(TESSERACT_FOLDER)
22
+ add_folder_to_path(POPPLER_FOLDER)
23
 
24
+ ensure_output_folder_exists(OUTPUT_FOLDER)
25
 
26
  chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
27
 
 
36
 
37
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']
38
 
39
+ log_file_name = 'log.csv'
 
 
 
 
 
40
 
41
  file_input_height = 200
42
 
 
60
  pdf_doc_state = gr.State([])
61
  all_image_annotations_state = gr.State([])
62
 
63
+ all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas", wrap=True)
64
+ all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
65
+ review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
66
 
67
  session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
68
  s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
69
+ output_folder_textbox = gr.Textbox(value = OUTPUT_FOLDER, label="output_folder_textbox", visible=False)
70
+ input_folder_textbox = gr.Textbox(value = INPUT_FOLDER, label="input_folder_textbox", visible=False)
71
 
72
  first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False)
73
  second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False)
 
78
  document_cropboxes = gr.Dropdown(label = "document_cropboxes", value="", allow_custom_value=True,visible=False)
79
  page_sizes = gr.Dropdown(label = "page_sizes", value="", allow_custom_value=True, visible=False)
80
  images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
81
+ all_img_details_state = gr.State([])
82
 
83
  output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
84
  output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False)
85
  text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False)
86
+ log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False)
87
+ duplication_file_path_outputs_list_state = gr.Dropdown(label = "duplication_file_path_outputs_list", value=[], multiselect=True, allow_custom_value=True,visible=False)
88
 
89
  # Backup versions of these objects in case you make a mistake
90
  backup_review_state = gr.Dataframe(visible=False)
91
  backup_image_annotations_state = gr.State([])
92
+ backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
 
93
 
94
  # Logging state
95
+ feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + log_file_name, visible=False)
96
+ feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
97
+ access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + log_file_name, visible=False)
98
+ access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
99
+ usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + log_file_name, visible=False)
100
+ usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
 
 
101
 
102
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
103
  textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
104
  comprehend_query_number = gr.Number(label = "comprehend_query_number", value=0, visible=False)
105
+ textract_query_number = gr.Number(label = "textract_query_number", value=0, visible=False)
106
+
107
  doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
108
  doc_file_name_no_extension_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
109
+ blank_doc_file_name_no_extension_textbox_for_logs = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False) # Left blank for when user does not want to report file names
110
  doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
111
  doc_file_name_textbox_list = gr.Dropdown(label = "doc_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
112
+ latest_review_file_path = gr.Textbox(label = "latest_review_file_path", value="", visible=False) # Latest review file path output from redaction
113
 
114
  data_full_file_name_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
115
  data_file_name_no_extension_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
 
121
  text_name_const = gr.Textbox(label="text_name_const", value="text", visible=False)
122
  page_name_const = gr.Textbox(label="page_name_const", value="page", visible=False)
123
 
124
+ actual_time_taken_number = gr.Number(label = "actual_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
125
  annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
 
126
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
127
 
128
  ## Annotator zoom value
 
132
 
133
  clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
134
  prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
135
+ prepare_images_bool_false = gr.Checkbox(label="prepare_images_bool_false", value=False, visible=False)
136
 
137
  ## Settings page variables
138
  default_deny_list_file_name = "default_deny_list.csv"
139
+ default_deny_list_loc = OUTPUT_FOLDER + "/" + default_deny_list_file_name
140
  in_deny_list_text_in = gr.Textbox(value="deny_list", visible=False)
141
 
142
  fully_redacted_list_file_name = "default_fully_redacted_list.csv"
143
+ fully_redacted_list_loc = OUTPUT_FOLDER + "/" + fully_redacted_list_file_name
144
  in_fully_redacted_text_in = gr.Textbox(value="fully_redacted_pages_list", visible=False)
145
 
146
  # S3 settings for default allow list load
147
+ s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=DOCUMENT_REDACTION_BUCKET, visible=False)
148
+ s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
149
+ default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=ALLOW_LIST_PATH, visible=False)
150
 
151
+ s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
152
+ default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=COST_CODES_PATH, visible=False)
153
+ enforce_cost_code_textbox = gr.Textbox(label = "Enforce cost code textbox", value=ENFORCE_COST_CODES, visible=False)
 
 
154
 
155
  # Base dataframe for recognisers that is not modified subsequent to load
156
+ recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
157
 
158
  # Duplicate page detection
159
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
160
+ duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
161
+
162
+ # Tracking variables for current page (not visible)
163
+ current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
164
+ page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
165
+
166
+ # Placeholders for elements that may be made visible later below depending on environment variables
167
+ cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
168
+ cost_code_choice_drop = gr.Dropdown(value="", label="Choose cost code for analysis", choices=[], allow_custom_value=True, visible=False)
169
+
170
+ textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
171
+ total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
172
+ estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0, visible=False, precision=2)
173
+ estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=False, precision=2)
174
+
175
+ only_extract_text_radio = gr.Checkbox(value=False, label="Only extract text (no redaction)", visible=False)
176
 
177
  ###
178
  # UI DESIGN
 
181
  gr.Markdown(
182
  """# Document redaction
183
 
184
+ Redact personally identifiable information (PII) from documents (PDF, images), open text, or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use the app. Below is a very brief overview.
185
 
186
+ To identify text in documents, the 'Local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
187
 
188
+ After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...review_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
189
 
190
  NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
191
 
192
  ###
193
+ # REDACTION PDF/IMAGES TABLE
194
  ###
195
  with gr.Tab("Redact PDFs/images"):
196
  with gr.Accordion("Redact document", open = True):
197
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
198
+
199
+ text_extract_method_radio = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
200
+
201
+ with gr.Row(equal_height=True):
202
+ pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
203
+
204
+ with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
205
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
206
+
207
+ if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
208
+ with gr.Accordion("AWS Textract bulk document API call", open = False, visible=True):
209
+ with gr.Row(equal_height=True):
210
+ job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=True)
211
+ send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=True)
212
+ with gr.Row(equal_height=True):
213
+ check_state_of_textract_api__call_btn = gr.Button("Check state of Textract job", variant="secondary", visible=True)
214
+ job_current_status = gr.Textbox(value="", label="job_current_status", visible=True)
215
+ with gr.Row(equal_height=True):
216
+ textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=True)
217
+
218
+ if SHOW_COSTS == "True":
219
+ with gr.Accordion("Estimated costs and time taken", open = False, visible=True):
220
+ with gr.Row(equal_height=True):
221
+ textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
222
+ total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
223
+ estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0.00, precision=2, visible=True)
224
+ estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
225
+
226
+ gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
227
+
228
+ document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
229
+
230
+ if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
231
+ with gr.Accordion("Apply cost code", open = True, visible=True):
232
+ with gr.Row(equal_height=True):
233
+ cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='search', visible=True, wrap=True, max_height=200)
234
+ cost_code_choice_drop = gr.Dropdown(value="", label="Choose cost code for analysis", choices=[], allow_custom_value=True, visible=True)
235
 
236
  with gr.Row():
237
  output_summary = gr.Textbox(label="Output summary", scale=1)
238
+ output_file = gr.File(label="Output files", scale = 2)#, height=file_input_height)
239
  latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
240
 
241
  with gr.Row():
 
254
 
255
  with gr.Accordion(label = "Review PDF redactions", open=True):
256
  output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions", file_count='multiple', height=file_input_height)
257
+ upload_previous_review_file_btn = gr.Button("Review PDF and 'review file' csv provided above", variant="secondary")
258
  with gr.Row():
259
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
260
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
 
269
  annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
270
  annotation_next_page_button = gr.Button("Next page", scale = 4)
271
  with gr.Column(scale=1):
272
+ annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
 
273
 
274
  with gr.Row():
275
  with gr.Column(scale=2):
 
276
  zoom_str = str(annotator_zoom_number) + '%'
277
 
278
  annotator = image_annotator(
 
297
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
298
  page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
299
  text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
300
+ recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
301
+
302
+ with gr.Row(equal_height=True):
303
+ exclude_selected_row_btn = gr.Button(value="Exclude specific row from redactions")
304
+ exclude_selected_btn = gr.Button(value="Exclude all items in table from redactions")
305
  with gr.Row(equal_height=True):
306
  reset_dropdowns_btn = gr.Button(value="Reset filters")
307
+
308
  undo_last_removal_btn = gr.Button(value="Undo last element removal")
309
+ update_current_page_redactions_btn = gr.Button(value="Save changes on current page to file", variant="primary")
310
+ selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="selected_entity_dataframe_row", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
311
 
312
  with gr.Row():
313
  with gr.Column(scale=2):
 
322
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
323
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
324
  adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv', '.xfdf', '.pdf'])
325
+ convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="secondary")
326
+
327
+ ###
328
+ # IDENTIFY DUPLICATE PAGES TAB
329
+ ###
330
+ with gr.Tab(label="Identify duplicate pages"):
331
+ with gr.Accordion("Identify duplicate pages to redact", open = True):
332
+ in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
333
+ with gr.Row():
334
+ duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
335
+ find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 5)
336
+
337
+ duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
338
+
339
  ###
340
  # TEXT / TABULAR DATA TAB
341
  ###
342
  with gr.Tab(label="Open text or Excel/csv files"):
343
+ gr.Markdown("""### Choose open text or a tabular data file (xlsx or csv) to redact.""")
 
 
 
 
344
  with gr.Accordion("Paste open text", open = False):
345
  in_text = gr.Textbox(label="Enter open text", lines=10)
346
  with gr.Accordion("Upload xlsx or csv files", open = True):
 
366
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
367
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
368
 
 
 
 
 
 
 
 
 
 
369
  ###
370
  # SETTINGS TAB
371
  ###
 
383
  in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
384
  with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists (NOTE: you need to press Enter after modifying/adding an entry to the lists to apply them)", open = False):
385
  with gr.Row():
386
+ in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Allow list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, wrap=True)
387
+ in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["deny_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Deny list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, wrap=True)
388
+ in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number', wrap=True)
389
 
390
  with gr.Accordion("Select entity types to redact", open = True):
391
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
 
400
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
401
  page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
402
 
403
+ with gr.Accordion("AWS options", open = False):
 
404
  #with gr.Row():
405
+ in_redact_language = gr.Dropdown(value = REDACTION_LANGUAGE, choices = [REDACTION_LANGUAGE], label="Redaction language", multiselect=False, visible=False)
406
 
407
  with gr.Row():
408
  aws_access_key_textbox = gr.Textbox(value='', label="AWS access key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")
 
419
 
420
  with gr.Accordion("View all output files from this session", open = False):
421
  all_output_files_btn = gr.Button("Click here to view all output files", variant="secondary")
422
+ all_output_files = gr.File(label="All files in output folder", file_count='multiple', file_types=['.csv'], interactive=False)
 
423
 
424
  ### UI INTERACTION ###
425
 
426
  ###
427
  # PDF/IMAGE REDACTION
428
  ###
429
+ # Recalculate estimated costs based on changes to inputs
430
+ if SHOW_COSTS == 'True':
431
+ # Calculate costs
432
+ total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
433
+ text_extract_method_radio.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
434
+ pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
435
+ handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
436
+ textract_output_found_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
437
+ only_extract_text_radio.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
438
+
439
+ # Calculate time taken
440
+ total_pdf_page_count.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_time_taken_number])
441
+ text_extract_method_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_time_taken_number])
442
+ pii_identification_method_drop.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_time_taken_number])
443
+ handwrite_signature_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_time_taken_number])
444
+ textract_output_found_checkbox.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_time_taken_number]) # dropped stray handwrite_signature_checkbox: calculate_time_taken takes 5 inputs, matching the other .change() wirings above
445
+ only_extract_text_radio.change(calculate_time_taken, inputs=[total_pdf_page_count, text_extract_method_radio, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_time_taken_number])
446
+
447
+ # Allow user to select items from cost code dataframe for cost code
448
+ if SHOW_COSTS=="True" and (GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True"):
449
+ cost_code_dataframe.select(df_select_callback_cost, inputs=[cost_code_dataframe], outputs=[cost_code_choice_drop])
450
+
451
+ in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
452
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state]).\
453
+ success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
454
 
455
  # Run redaction function
456
+ document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, output_summary]).\
457
+ success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop]).\
458
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number],
459
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number], api_name="redact_doc").\
460
+ success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
461
+
462
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
463
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number],
464
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number]).\
465
+ success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
466
+
467
  # If a file has been completed, the function will continue onto the next document
468
+ latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number],
469
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number]).\
470
+ success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
471
+ success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
472
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
473
 
474
  ###
 
477
 
478
  # Upload previous files for modifying redactions
479
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
480
+ success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
481
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state], api_name="prepare_doc").\
482
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
483
+
484
+ # Page number controls
485
+ annotate_current_page.change(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
486
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
487
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
 
488
 
489
+ annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page])
490
+ annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
491
 
492
+ annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom, annotate_current_page_bottom])
493
+ annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page_bottom, annotate_current_page_bottom])
494
 
495
+ annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page])
 
 
 
 
496
 
497
+ # Apply page redactions
498
+ annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state], scroll_to_output=True)
 
 
 
 
 
 
 
499
 
500
  # Review table controls
501
  recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
502
  page_entity_dropdown.select(update_entities_df_page, inputs=[page_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, text_entity_dropdown])
503
  text_entity_dropdown.select(update_entities_df_text, inputs=[text_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, page_entity_dropdown])
504
 
505
+ recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page, selected_entity_dataframe_row])#.\
506
+ #success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state], outputs=[review_file_state]).\
507
+ #success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, annotate_current_page, annotate_previous_page, all_image_annotations_state, annotator], outputs=[annotator, all_image_annotations_state])
 
508
 
 
 
509
 
510
+ reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
511
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
512
+
513
+ # Exclude current selection from annotator and outputs
514
+ # Exclude only row
515
+ exclude_selected_row_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, selected_entity_dataframe_row, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
516
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
517
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state]).\
518
+ success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
519
+
520
+ # Exclude everything visible in table
521
  exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
522
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
523
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state]).\
524
+ success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
525
 
526
  undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base]).\
527
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
528
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
529
+
530
+ update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
531
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
532
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
533
 
534
  # Convert review file to xfdf Adobe format
535
+ convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
536
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state]).\
537
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
538
 
539
  # Convert xfdf Adobe file back to review_file.csv
540
+ convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
541
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state]).\
542
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
543
 
544
  ###
545
  # TABULAR DATA REDACTION
546
  ###
547
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
548
+ success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list, total_pdf_page_count])
549
 
550
  tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
551
 
 
556
  ###
557
  # IDENTIFY DUPLICATE PAGES
558
  ###
559
+ find_duplicate_pages_btn.click(fn=identify_similar_pages, inputs=[in_duplicate_pages, duplicate_threshold_value, output_folder_textbox], outputs=[duplicate_pages_df, duplicate_pages_out])
560
 
561
  ###
562
  # SETTINGS PAGE INPUT / OUTPUT
 
582
  ###
583
 
584
  # Get connection details on app load
585
+ app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox])
586
 
587
+ # If relevant environment variable is set, load in the default allow list file from S3 or locally
588
+ if GET_DEFAULT_ALLOW_LIST == "True" and ALLOW_LIST_PATH:
589
+ print("Loading allow list from default_allow_list_output_path location:", ALLOW_LIST_PATH)
590
+ if not os.path.exists(ALLOW_LIST_PATH):
591
  app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
592
  success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
593
  else:
594
  app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
595
 
596
+ # If relevant environment variable is set, load in the default cost code file from S3 or locally
597
+ if GET_COST_CODES == "True" and COST_CODES_PATH:
598
+ print("Loading cost codes from default_cost_codes_path location:", COST_CODES_PATH)
599
+ if not os.path.exists(COST_CODES_PATH):
600
+ app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
601
+ success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_choice_drop])
602
+ else:
603
+ app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_choice_drop])
604
+
605
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
606
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
607
+ access_callback.setup([session_hash_textbox], ACCESS_LOGS_FOLDER)
608
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
609
  success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
610
 
611
  # User submitted feedback for pdf redactions
612
  pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
613
+ pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
614
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
615
  success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
616
 
617
  # User submitted feedback for data redactions
618
  data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
619
+ data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
620
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
621
  success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
622
 
623
  # Log processing time/token usage when making a query
624
+ usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
 
 
 
 
625
 
626
+ if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
627
+ usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
628
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], None, preprocess=False).\
629
+ success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
630
+ else:
631
+ usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
632
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], None, preprocess=False).\
633
+ success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
634
 
635
  if __name__ == "__main__":
636
 
 
646
 
647
  main(first_loop_state, latest_file_completed=0, output_summary="", output_file_list=None,
648
  log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
649
+ current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
650
 
651
  # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
652
  # with gr.Tab(label="Advanced options"):
requirements.txt CHANGED
@@ -2,17 +2,17 @@ pdfminer.six==20240706
2
  pdf2image==1.17.0
3
  pymupdf==1.25.3
4
  opencv-python==4.10.0.84
5
- presidio_analyzer==2.2.357
6
- presidio_anonymizer==2.2.357
7
- presidio-image-redactor==0.0.55
8
  pikepdf==9.5.2
9
  pandas==2.2.3
10
- nltk==3.9.1
11
  scikit-learn==1.6.1
12
  spacy==3.8.4
13
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
  #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
- gradio==5.22.0
16
  boto3==1.37.17
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
 
2
  pdf2image==1.17.0
3
  pymupdf==1.25.3
4
  opencv-python==4.10.0.84
5
+ presidio_analyzer==2.2.358
6
+ presidio_anonymizer==2.2.358
7
+ presidio-image-redactor==0.0.56
8
  pikepdf==9.5.2
9
  pandas==2.2.3
10
+ #nltk==3.9.1 # Not required
11
  scikit-learn==1.6.1
12
  spacy==3.8.4
13
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
  #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
+ gradio==5.23.3
16
  boto3==1.37.17
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
tools/auth.py CHANGED
@@ -4,7 +4,7 @@ import boto3
4
  import hmac
5
  import hashlib
6
  import base64
7
- from tools.config import client_id, client_secret, user_pool_id
8
 
9
  def calculate_secret_hash(client_id:str, client_secret:str, username:str):
10
  message = username + client_id
@@ -16,7 +16,7 @@ def calculate_secret_hash(client_id:str, client_secret:str, username:str):
16
  secret_hash = base64.b64encode(dig).decode()
17
  return secret_hash
18
 
19
- def authenticate_user(username:str, password:str, user_pool_id:str=user_pool_id, client_id:str=client_id, client_secret:str=client_secret):
20
  """Authenticates a user against an AWS Cognito user pool.
21
 
22
  Args:
@@ -30,7 +30,7 @@ def authenticate_user(username:str, password:str, user_pool_id:str=user_pool_id,
30
  bool: True if the user is authenticated, False otherwise.
31
  """
32
 
33
- client = boto3.client('cognito-idp') # Cognito Identity Provider client
34
 
35
  # Compute the secret hash
36
  secret_hash = calculate_secret_hash(client_id, client_secret, username)
 
4
  import hmac
5
  import hashlib
6
  import base64
7
+ from tools.config import AWS_CLIENT_ID, AWS_CLIENT_SECRET, AWS_USER_POOL_ID, AWS_REGION
8
 
9
  def calculate_secret_hash(client_id:str, client_secret:str, username:str):
10
  message = username + client_id
 
16
  secret_hash = base64.b64encode(dig).decode()
17
  return secret_hash
18
 
19
+ def authenticate_user(username:str, password:str, user_pool_id:str=AWS_USER_POOL_ID, client_id:str=AWS_CLIENT_ID, client_secret:str=AWS_CLIENT_SECRET):
20
  """Authenticates a user against an AWS Cognito user pool.
21
 
22
  Args:
 
30
  bool: True if the user is authenticated, False otherwise.
31
  """
32
 
33
+ client = boto3.client('cognito-idp', region_name=AWS_REGION) # Cognito Identity Provider client
34
 
35
  # Compute the secret hash
36
  secret_hash = calculate_secret_hash(client_id, client_secret, username)
tools/aws_functions.py CHANGED
@@ -4,13 +4,8 @@ import boto3
4
  import tempfile
5
  import os
6
  from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET
7
-
8
-
9
  PandasDataFrame = Type[pd.DataFrame]
10
 
11
- # Get AWS credentials
12
- bucket_name = DOCUMENT_REDACTION_BUCKET
13
-
14
  def get_assumed_role_info():
15
  sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
16
  sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
@@ -26,7 +21,7 @@ def get_assumed_role_info():
26
 
27
  if RUN_AWS_FUNCTIONS == "1":
28
  try:
29
- session = boto3.Session()
30
 
31
  except Exception as e:
32
  print("Could not start boto3 session:", e)
@@ -34,6 +29,7 @@ if RUN_AWS_FUNCTIONS == "1":
34
  try:
35
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
36
 
 
37
  print("Assumed Role ARN:", assumed_role_arn)
38
  print("Assumed Role Name:", assumed_role_name)
39
 
@@ -43,15 +39,15 @@ if RUN_AWS_FUNCTIONS == "1":
43
  # Download direct from S3 - requires login credentials
44
  def download_file_from_s3(bucket_name, key, local_file_path_and_name):
45
 
46
- s3 = boto3.client('s3')
47
  s3.download_file(bucket_name, key, local_file_path_and_name)
48
  print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path_and_name}")
49
 
50
- def download_folder_from_s3(bucket_name, s3_folder, local_folder):
51
  """
52
  Download all files from an S3 folder to a local folder.
53
  """
54
- s3 = boto3.client('s3')
55
 
56
  # List objects in the specified S3 folder
57
  response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
@@ -72,11 +68,11 @@ def download_folder_from_s3(bucket_name, s3_folder, local_folder):
72
  except Exception as e:
73
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
74
 
75
- def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
76
  """
77
  Download specific files from an S3 folder to a local folder.
78
  """
79
- s3 = boto3.client('s3')
80
 
81
  print("Trying to download file: ", filenames)
82
 
@@ -105,7 +101,7 @@ def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
105
  except Exception as e:
106
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
107
 
108
- def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
109
 
110
  temp_dir = tempfile.mkdtemp()
111
  local_address_stub = temp_dir + '/doc-redaction/'
@@ -156,7 +152,7 @@ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_
156
 
157
  return files, out_message
158
 
159
- def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
160
  """
161
  Uploads a file from local machine to Amazon S3.
162
 
@@ -170,7 +166,7 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=buck
170
  """
171
  final_out_message = []
172
 
173
- s3_client = boto3.client('s3')
174
 
175
  if isinstance(local_file_paths, str):
176
  local_file_paths = [local_file_paths]
 
4
  import tempfile
5
  import os
6
  from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET
 
 
7
  PandasDataFrame = Type[pd.DataFrame]
8
 
 
 
 
9
  def get_assumed_role_info():
10
  sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
11
  sts = boto3.client('sts', region_name=AWS_REGION, endpoint_url=sts_endpoint)
 
21
 
22
  if RUN_AWS_FUNCTIONS == "1":
23
  try:
24
+ session = boto3.Session(region_name=AWS_REGION)
25
 
26
  except Exception as e:
27
  print("Could not start boto3 session:", e)
 
29
  try:
30
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
31
 
32
+ print("Successfully assumed ARN role")
33
  print("Assumed Role ARN:", assumed_role_arn)
34
  print("Assumed Role Name:", assumed_role_name)
35
 
 
39
  # Download direct from S3 - requires login credentials
40
  def download_file_from_s3(bucket_name, key, local_file_path_and_name):
41
 
42
+ s3 = boto3.client('s3', region_name=AWS_REGION)
43
  s3.download_file(bucket_name, key, local_file_path_and_name)
44
  print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path_and_name}")
45
 
46
+ def download_folder_from_s3(bucket_name:str, s3_folder:str, local_folder:str):
47
  """
48
  Download all files from an S3 folder to a local folder.
49
  """
50
+ s3 = boto3.client('s3', region_name=AWS_REGION)
51
 
52
  # List objects in the specified S3 folder
53
  response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
 
68
  except Exception as e:
69
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
70
 
71
+ def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, filenames:List[str]):
72
  """
73
  Download specific files from an S3 folder to a local folder.
74
  """
75
+ s3 = boto3.client('s3', region_name=AWS_REGION)
76
 
77
  print("Trying to download file: ", filenames)
78
 
 
101
  except Exception as e:
102
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
103
 
104
+ def load_data_from_aws(in_aws_keyword_file, aws_password:str="", bucket_name:str=DOCUMENT_REDACTION_BUCKET):
105
 
106
  temp_dir = tempfile.mkdtemp()
107
  local_address_stub = temp_dir + '/doc-redaction/'
 
152
 
153
  return files, out_message
154
 
155
+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET):
156
  """
157
  Uploads a file from local machine to Amazon S3.
158
 
 
166
  """
167
  final_out_message = []
168
 
169
+ s3_client = boto3.client('s3', region_name=AWS_REGION)
170
 
171
  if isinstance(local_file_paths, str):
172
  local_file_paths = [local_file_paths]
tools/aws_textract.py CHANGED
@@ -7,7 +7,7 @@ from collections import defaultdict
7
  import pikepdf
8
  import time
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
- from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY
11
 
12
  def extract_textract_metadata(response:object):
13
  """Extracts metadata from an AWS Textract response."""
@@ -25,7 +25,7 @@ def extract_textract_metadata(response:object):
25
  #'NumberOfPages': number_of_pages
26
  })
27
 
28
- def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"]):
29
  '''
30
  Analyse page with AWS Textract
31
  '''
@@ -34,9 +34,9 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
34
  if AWS_ACCESS_KEY and AWS_SECRET_KEY:
35
  client = boto3.client('textract',
36
  aws_access_key_id=AWS_ACCESS_KEY,
37
- aws_secret_access_key=AWS_SECRET_KEY)
38
  else:
39
- client = boto3.client('textract')
40
  except:
41
  print("Cannot connect to AWS Textract")
42
  return [], "" # Return an empty list and an empty string
@@ -75,8 +75,12 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
75
  'data': response
76
  }
77
 
 
 
78
  request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
79
 
 
 
80
  # Return a list containing the wrapped response and the metadata
81
  return wrapped_response, request_metadata # Return as a list to match the desired structure
82
 
@@ -125,6 +129,8 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
125
  # Find the specific page data
126
  page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
127
 
 
 
128
  if "Blocks" in page_json_data:
129
  # Access the data for the specific page
130
  text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data
@@ -271,8 +277,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
271
 
272
  def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str):
273
  """
274
- Loads Textract JSON from a file, detects if conversion is needed,
275
- and converts if necessary.
276
  """
277
 
278
  if not os.path.exists(textract_json_file_path):
@@ -295,15 +300,16 @@ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output
295
 
296
  # Check if conversion is needed
297
  if "pages" in textract_data:
298
- print("JSON already in the new format. No changes needed.")
299
  return textract_data, False, log_files_output_paths # No conversion required
300
 
301
  if "Blocks" in textract_data:
302
  print("Need to convert Textract JSON to app format.")
303
  try:
304
- from tools.aws_textract import restructure_textract_output
305
  textract_data = restructure_textract_output(textract_data)
306
  return textract_data, False, log_files_output_paths # Successfully converted
 
307
  except Exception as e:
308
  print("Failed to convert JSON data to app format due to:", e)
309
  return {}, True, log_files_output_paths # Conversion failed
@@ -312,33 +318,35 @@ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output
312
  print("textract data:", textract_data)
313
  return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
314
 
315
- # Load Textract JSON output (assuming it's stored in a variable called `textract_output`)
316
- def restructure_textract_output(textract_output:object):
317
- '''
318
- Reorganise textract output that comes from the bulk textract analysis option on AWS to format that works in this app.
319
- '''
320
- pages_dict = defaultdict(lambda: {"page_no": None, "data": {"Blocks": []}})
321
 
322
- # Extract number of pages from DocumentMetadata
323
- total_pages = textract_output.get("DocumentMetadata", {}).get("Pages", 1)
324
 
325
  for block in textract_output.get("Blocks", []):
326
- page_no = block.get("Page", 1) # Default to 1 if not present
327
-
328
- # Ensure page metadata is only set once
329
- if pages_dict[page_no]["page_no"] is None:
330
- pages_dict[page_no]["page_no"] = str(page_no)
331
 
332
- # Add block to corresponding page
333
- pages_dict[page_no]["data"]["Blocks"].append(block)
 
334
 
335
- # Convert dictionary to sorted list of pages
 
 
 
 
 
 
 
 
336
  structured_output = {
 
337
  "pages": [pages_dict[page] for page in sorted(pages_dict.keys())]
338
  }
339
 
340
- # Add DocumentMetadata to the first page's data (optional)
341
- if structured_output["pages"]:
342
- structured_output["pages"][0]["data"]["DocumentMetadata"] = textract_output.get("DocumentMetadata", {})
343
-
344
  return structured_output
 
7
  import pikepdf
8
  import time
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
+ from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
11
 
12
  def extract_textract_metadata(response:object):
13
  """Extracts metadata from an AWS Textract response."""
 
25
  #'NumberOfPages': number_of_pages
26
  })
27
 
28
+ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Extract handwriting", "Redact all identified signatures"]):
29
  '''
30
  Analyse page with AWS Textract
31
  '''
 
34
  if AWS_ACCESS_KEY and AWS_SECRET_KEY:
35
  client = boto3.client('textract',
36
  aws_access_key_id=AWS_ACCESS_KEY,
37
+ aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
38
  else:
39
+ client = boto3.client('textract', region_name=AWS_REGION)
40
  except:
41
  print("Cannot connect to AWS Textract")
42
  return [], "" # Return an empty list and an empty string
 
75
  'data': response
76
  }
77
 
78
+ #print("response:", response)
79
+
80
  request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
81
 
82
+ #print("request_metadata:", request_metadata)
83
+
84
  # Return a list containing the wrapped response and the metadata
85
  return wrapped_response, request_metadata # Return as a list to match the desired structure
86
 
 
129
  # Find the specific page data
130
  page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
131
 
132
+ #print("page_json_data:", page_json_data)
133
+
134
  if "Blocks" in page_json_data:
135
  # Access the data for the specific page
136
  text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data
 
277
 
278
  def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str):
279
  """
280
+ Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
 
281
  """
282
 
283
  if not os.path.exists(textract_json_file_path):
 
300
 
301
  # Check if conversion is needed
302
  if "pages" in textract_data:
303
+ print("JSON already in the correct format for app. No changes needed.")
304
  return textract_data, False, log_files_output_paths # No conversion required
305
 
306
  if "Blocks" in textract_data:
307
  print("Need to convert Textract JSON to app format.")
308
  try:
309
+
310
  textract_data = restructure_textract_output(textract_data)
311
  return textract_data, False, log_files_output_paths # Successfully converted
312
+
313
  except Exception as e:
314
  print("Failed to convert JSON data to app format due to:", e)
315
  return {}, True, log_files_output_paths # Conversion failed
 
318
  print("textract data:", textract_data)
319
  return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
320
 
321
+ def restructure_textract_output(textract_output: dict):
322
+ """
323
+ Reorganise Textract output from the bulk Textract analysis option on AWS
324
+ into a format that works in this redaction app, reducing size.
325
+ """
326
+ pages_dict = {}
327
 
328
+ # Extract total pages from DocumentMetadata
329
+ document_metadata = textract_output.get("DocumentMetadata", {})
330
 
331
  for block in textract_output.get("Blocks", []):
332
+ page_no = block.get("Page", 1) # Default to 1 if missing
 
 
 
 
333
 
334
+ # Initialize page structure if not already present
335
+ if page_no not in pages_dict:
336
+ pages_dict[page_no] = {"page_no": str(page_no), "data": {"Blocks": []}}
337
 
338
+ # Keep only essential fields to reduce size
339
+ filtered_block = {
340
+ key: block[key] for key in ["BlockType", "Confidence", "Text", "Geometry", "Page", "Id", "Relationships"]
341
+ if key in block
342
+ }
343
+
344
+ pages_dict[page_no]["data"]["Blocks"].append(filtered_block)
345
+
346
+ # Convert pages dictionary to a sorted list
347
  structured_output = {
348
+ "DocumentMetadata": document_metadata, # Store metadata separately
349
  "pages": [pages_dict[page] for page in sorted(pages_dict.keys())]
350
  }
351
 
 
 
 
 
352
  return structured_output
tools/config.py CHANGED
@@ -1,5 +1,12 @@
1
  import os
 
 
 
2
  from dotenv import load_dotenv
 
 
 
 
3
 
4
  # Set or retrieve configuration variables for the redaction app
5
 
@@ -22,7 +29,7 @@ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False)
22
 
23
 
24
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. '/env/app_config.env'
25
- APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', '', print_val=True)
26
 
27
 
28
  if os.path.exists(APP_CONFIG_PATH):
@@ -34,7 +41,7 @@ if os.path.exists(APP_CONFIG_PATH):
34
  ###
35
 
36
  # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. '/env/aws_config.env'
37
- AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '', print_val=True)
38
 
39
  if os.path.exists(AWS_CONFIG_PATH):
40
  print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
@@ -44,11 +51,11 @@ RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
44
 
45
  AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
46
 
47
- client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
48
 
49
- client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
50
 
51
- user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
52
 
53
  AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
54
  if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
@@ -78,28 +85,53 @@ MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to No
78
  # File I/O config
79
  ###
80
 
81
- output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
82
- print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
85
- print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
86
 
87
- input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
88
- print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
 
89
 
90
  ###
91
  # REDACTION CONFIG
92
  ###
93
- # Number of pages to loop through before breaking the function and restarting from the last finished page.
94
- page_break_value = get_or_create_env_var('page_break_value', '50000')
 
 
 
95
 
96
- max_time_value = get_or_create_env_var('max_time_value', '999999')
 
 
 
97
 
98
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
99
 
 
 
100
  ###
101
  # APP RUN CONFIG
102
  ###
 
 
 
 
103
  # Get some environment variables and Launch the Gradio app
104
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
105
 
@@ -117,4 +149,19 @@ DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '
117
 
118
  GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', 'False')
119
 
120
- DEFAULT_ALLOW_LIST_PATH = get_or_create_env_var('DEFAULT_ALLOW_LIST_PATH', '')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import tempfile
3
+ import socket
4
+ from datetime import datetime
5
  from dotenv import load_dotenv
6
+ from tldextract import TLDExtract
7
+
8
+ today_rev = datetime.now().strftime("%Y%m%d")
9
+ host_name = socket.gethostname()
10
 
11
  # Set or retrieve configuration variables for the redaction app
12
 
 
29
 
30
 
31
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. '/env/app_config.env'
32
+ APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', '')
33
 
34
 
35
  if os.path.exists(APP_CONFIG_PATH):
 
41
  ###
42
 
43
  # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. '/env/aws_config.env'
44
+ AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '')
45
 
46
  if os.path.exists(AWS_CONFIG_PATH):
47
  print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
 
51
 
52
  AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
53
 
54
+ AWS_CLIENT_ID = get_or_create_env_var('AWS_CLIENT_ID', '')
55
 
56
+ AWS_CLIENT_SECRET = get_or_create_env_var('AWS_CLIENT_SECRET', '')
57
 
58
+ AWS_USER_POOL_ID = get_or_create_env_var('AWS_USER_POOL_ID', '')
59
 
60
  AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
61
  if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
 
85
  # File I/O config
86
  ###
87
 
88
+ SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False') # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders
89
+
90
+ OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
91
+ INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
92
+
93
+ # Allow for files to be saved in a temporary folder for increased security in some instances
94
+ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
95
+ # Create a temporary directory
96
+ with tempfile.TemporaryDirectory() as temp_dir:
97
+ print(f'Temporary directory created at: {temp_dir}')
98
+
99
+ if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
100
+ if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
101
+
102
+ FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + today_rev + '/' + host_name + '/')
103
 
104
+ USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'logs/' + today_rev + '/' + host_name + '/')
 
105
 
106
+ ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'usage/' + today_rev + '/' + host_name + '/')
107
+
108
+ DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
109
 
110
  ###
111
  # REDACTION CONFIG
112
  ###
113
+ TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "tesseract/")
114
+
115
+ POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "poppler/poppler-24.02.0/Library/bin/")
116
+
117
+ SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_BULK_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
118
 
119
+ # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
120
+ PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
121
+
122
+ MAX_TIME_VALUE = get_or_create_env_var('MAX_TIME_VALUE', '999999')
123
 
124
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
125
 
126
+ REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Currently only English is supported by the app
127
+
128
  ###
129
  # APP RUN CONFIG
130
  ###
131
+
132
+ TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')
133
+ extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
134
+
135
  # Get some environment variables and Launch the Gradio app
136
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
137
 
 
149
 
150
  GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', 'False')
151
 
152
+ ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', "config/default_allow_list.csv")
153
+
154
+ S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '')
155
+
156
+ SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'True')
157
+
158
+ GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
159
+
160
+ COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', 'config/COST_CENTRES.csv') # file should be a csv file with a single table in it that has two columns with a header. First column should contain cost codes, second column should contain a name or description for the cost code
161
+
162
+ S3_COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', '')
163
+
164
+ ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, are they compulsory?
165
+
166
+ if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
167
+ if GET_COST_CODES == 'True': ENFORCE_COST_CODES = 'False'
tools/data_anonymise.py CHANGED
@@ -14,7 +14,7 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerR
14
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
15
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
16
 
17
- from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, output_folder
18
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
19
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
20
  from tools.custom_image_analyser_engine import do_aws_comprehend_call
@@ -218,7 +218,7 @@ def anonymise_data_files(file_paths: List[str],
218
  log_files_output_paths: list = [],
219
  in_excel_sheets: list = [],
220
  first_loop_state: bool = False,
221
- output_folder: str = output_folder,
222
  in_deny_list:list[str]=[],
223
  max_fuzzy_spelling_mistakes_num:int=0,
224
  pii_identification_method:str="Local",
@@ -335,7 +335,7 @@ def anonymise_data_files(file_paths: List[str],
335
  file_type = ""
336
  out_file_part = anon_file
337
 
338
- out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
339
  else:
340
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
341
  file_type = detect_file_type(anon_file)
@@ -419,7 +419,7 @@ def anon_wrapper_func(
419
  chosen_redact_comprehend_entities:List[str]=[],
420
  comprehend_query_number:int=0,
421
  comprehend_client:botocore.client.BaseClient="",
422
- output_folder: str = output_folder
423
  ):
424
  """
425
  This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
 
14
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
15
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
16
 
17
+ from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER
18
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
19
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
20
  from tools.custom_image_analyser_engine import do_aws_comprehend_call
 
218
  log_files_output_paths: list = [],
219
  in_excel_sheets: list = [],
220
  first_loop_state: bool = False,
221
+ output_folder: str = OUTPUT_FOLDER,
222
  in_deny_list:list[str]=[],
223
  max_fuzzy_spelling_mistakes_num:int=0,
224
  pii_identification_method:str="Local",
 
335
  file_type = ""
336
  out_file_part = anon_file
337
 
338
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=OUTPUT_FOLDER)
339
  else:
340
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
341
  file_type = detect_file_type(anon_file)
 
419
  chosen_redact_comprehend_entities:List[str]=[],
420
  comprehend_query_number:int=0,
421
  comprehend_client:botocore.client.BaseClient="",
422
+ output_folder: str = OUTPUT_FOLDER
423
  ):
424
  """
425
  This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
tools/file_conversion.py CHANGED
@@ -5,23 +5,28 @@ import os
5
  import re
6
  import time
7
  import json
 
8
  import pymupdf
9
- from pymupdf import Document
10
  import pandas as pd
11
- #import numpy as np
12
  import shutil
13
- from pymupdf import Rect
14
- from fitz import Page
15
  from tqdm import tqdm
16
  from gradio import Progress
17
- from typing import List, Optional
18
  from concurrent.futures import ThreadPoolExecutor, as_completed
19
  from pdf2image import convert_from_path
20
  from PIL import Image
21
  from scipy.spatial import cKDTree
22
 
23
- from tools.config import output_folder, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR
 
 
 
 
24
  from tools.helper_functions import get_file_name_without_type, tesseract_ocr_option, text_ocr_option, textract_option, read_file
 
25
 
26
  image_dpi = float(IMAGES_DPI)
27
  if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
@@ -56,16 +61,16 @@ def is_pdf(filename):
56
  """
57
  return filename.lower().endswith(".pdf")
58
 
59
- # %%
60
  ## Convert pdf to image if necessary
61
 
62
-
63
-
64
  def check_image_size_and_reduce(out_path:str, image:Image):
65
  '''
66
  Check if a given image size is above around 4.5mb, and reduce size if necessary. 5mb is the maximum possible to submit to AWS Textract.
67
  '''
68
 
 
 
 
69
  # Check file size and resize if necessary
70
  max_size = 4.5 * 1024 * 1024 # 5 MB in bytes # 5
71
  file_size = os.path.getsize(out_path)
@@ -93,61 +98,82 @@ def check_image_size_and_reduce(out_path:str, image:Image):
93
  print(f"Resized to {new_width}x{new_height}, new file_size: {file_size}")
94
  else:
95
  new_width = width
96
- new_height = height
 
 
 
97
 
98
- return new_width, new_height
99
 
100
- def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
101
- try:
102
- # Construct the full output directory path
103
- output_dir = os.path.join(os.getcwd(), output_dir)
104
- out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
105
- os.makedirs(os.path.dirname(out_path), exist_ok=True)
106
-
107
- if os.path.exists(out_path):
108
- # Load existing image
109
- image = Image.open(out_path)
110
- else:
111
- # Convert PDF page to image
112
- image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
113
- dpi=image_dpi, use_cropbox=False, use_pdftocairo=False)
114
- image = image_l[0]
115
- image = image.convert("L")
116
- image.save(out_path, format="PNG")
117
 
118
- width, height = image.size
 
 
 
 
 
119
 
120
- # Check if image size too large and reduce if necessary
121
- width, height = check_image_size_and_reduce(out_path, image)
 
 
 
 
 
 
 
122
 
123
- return page_num, out_path, width, height
 
 
 
124
 
125
- except Exception as e:
126
- print(f"Error processing page {page_num + 1}: {e}")
127
- return page_num, "", width, height
128
 
129
- def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = image_dpi, num_threads: int = 8, output_dir: str = '/input'):
 
 
130
 
131
- # If preparing for review, just load the first page (not used)
 
 
 
 
 
 
 
 
 
 
 
132
  if prepare_for_review == True:
133
  page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
 
 
134
  else:
135
  page_count = pdfinfo_from_path(pdf_path)['Pages']
136
 
137
  print(f"Number of pages in PDF: {page_count}")
138
 
 
 
 
139
  results = []
140
  with ThreadPoolExecutor(max_workers=num_threads) as executor:
141
  futures = []
142
- for page_num in range(page_min, page_count):
143
- futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
144
 
145
- for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
146
- page_num, result, width, height = future.result()
147
- if result:
148
- results.append((page_num, result, width, height))
149
  else:
150
  print(f"Page {page_num + 1} failed to process.")
 
151
 
152
  # Sort results by page number
153
  results.sort(key=lambda x: x[0])
@@ -156,10 +182,10 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
156
  heights = [result[3] for result in results]
157
 
158
  print("PDF has been converted to images.")
159
- return images, widths, heights
160
 
161
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
162
- def process_file(file_path:str, prepare_for_review:bool=False):
163
  # Get the file extension
164
  file_extension = os.path.splitext(file_path)[1].lower()
165
 
@@ -171,21 +197,30 @@ def process_file(file_path:str, prepare_for_review:bool=False):
171
 
172
  # Load images from the file paths. Test to see if it is bigger than 4.5 mb and reduct if needed (Textract limit is 5mb)
173
  image = Image.open(file_path)
174
- img_object, image_sizes_width, image_sizes_height = check_image_size_and_reduce(file_path, image)
 
 
 
 
 
 
 
175
 
176
  # Check if the file is a PDF
177
  elif file_extension == '.pdf':
178
  print(f"{file_path} is a PDF file. Converting to image set")
 
179
  # Run your function for processing PDF files here
180
- img_object, image_sizes_width, image_sizes_height = convert_pdf_to_images(file_path, prepare_for_review)
181
 
182
  else:
183
  print(f"{file_path} is not an image or PDF file.")
184
- img_object = []
185
  image_sizes_width = []
186
  image_sizes_height = []
 
187
 
188
- return img_object, image_sizes_width, image_sizes_height
189
 
190
  def get_input_file_names(file_input:List[str]):
191
  '''
@@ -195,8 +230,8 @@ def get_input_file_names(file_input:List[str]):
195
  all_relevant_files = []
196
  file_name_with_extension = ""
197
  full_file_name = ""
 
198
 
199
- #print("file_input in input file names:", file_input)
200
  if isinstance(file_input, dict):
201
  file_input = os.path.abspath(file_input["name"])
202
 
@@ -215,23 +250,38 @@ def get_input_file_names(file_input:List[str]):
215
 
216
  file_extension = os.path.splitext(file_path)[1].lower()
217
 
218
- # Check if the file is an image type
219
  if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext):
220
  all_relevant_files.append(file_path_without_ext)
221
  file_name_with_extension = file_path_without_ext + file_extension
222
  full_file_name = file_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
  all_relevant_files_str = ", ".join(all_relevant_files)
225
 
226
- #print("all_relevant_files_str in input_file_names", all_relevant_files_str)
227
- #print("all_relevant_files in input_file_names", all_relevant_files)
228
-
229
- return all_relevant_files_str, file_name_with_extension, full_file_name, all_relevant_files
230
 
231
  def convert_color_to_range_0_1(color):
232
  return tuple(component / 255 for component in color)
233
 
234
  def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
 
 
 
 
235
  pymupdf_x1 = pymupdf_rect[0]
236
  pymupdf_y1 = pymupdf_rect[1]
237
  pymupdf_x2 = pymupdf_rect[2]
@@ -247,7 +297,6 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
247
  redact_bottom_y = middle_y - 1
248
  redact_top_y = middle_y + 1
249
 
250
- #print("Rect:", rect)
251
 
252
  rect_small_pixel_height = Rect(pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y) # Slightly smaller than outside box
253
 
@@ -274,8 +323,7 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
274
  #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
275
  shape.commit()
276
 
277
-
278
- def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
279
  '''
280
  Converts coordinates from pymupdf format to image coordinates,
281
  accounting for mediabox dimensions and offset.
@@ -291,22 +339,17 @@ def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
291
  mediabox_height = mediabox.height
292
 
293
  # Get target image dimensions
294
- image_page_width, image_page_height = image.size
 
 
 
 
 
295
 
296
  # Calculate scaling factors
297
  image_to_mediabox_x_scale = image_page_width / mediabox_width
298
  image_to_mediabox_y_scale = image_page_height / mediabox_height
299
 
300
- image_to_rect_scale_width = image_page_width / rect_width
301
- image_to_rect_scale_height = image_page_height / rect_height
302
-
303
- # Adjust for offsets (difference in position between mediabox and rect)
304
- x_offset = rect.x0 - mediabox.x0 # Difference in x position
305
- y_offset = rect.y0 - mediabox.y0 # Difference in y position
306
-
307
- #print("x_offset:", x_offset)
308
- #print("y_offset:", y_offset)
309
-
310
  # Adjust coordinates:
311
  # Apply scaling to match image dimensions
312
  x1_image = x1 * image_to_mediabox_x_scale
@@ -339,24 +382,24 @@ def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
339
 
340
  return x1_image, y1_image, x2_image, y2_image
341
 
342
- def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
343
  # Small border to page that remains white
344
  border = 5
345
  # Define the coordinates for the Rect
346
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
347
  whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
348
 
349
- whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
350
 
351
  # Create new image annotation element based on whole page coordinates
352
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
353
 
354
  # Write whole page annotation to annotation boxes
355
  whole_page_img_annotation_box = {}
356
- whole_page_img_annotation_box["xmin"] = whole_page_image_x1
357
- whole_page_img_annotation_box["ymin"] = whole_page_image_y1
358
- whole_page_img_annotation_box["xmax"] = whole_page_image_x2
359
- whole_page_img_annotation_box["ymax"] = whole_page_image_y2
360
  whole_page_img_annotation_box["color"] = (0,0,0)
361
  whole_page_img_annotation_box["label"] = "Whole page"
362
 
@@ -364,7 +407,7 @@ def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colou
364
 
365
  return whole_page_img_annotation_box
366
 
367
- def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float], image_sizes_height:List[float]):
368
  page_sizes = []
369
  original_cropboxes = []
370
 
@@ -377,9 +420,9 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
377
  # Create a page_sizes_object.
378
  # If images have been created, then image width an height come from this value. Otherwise, they are set to the cropbox size
379
  if image_sizes_width and image_sizes_height:
380
- out_page_image_sizes = {"page":reported_page_no, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
381
  else:
382
- out_page_image_sizes = {"page":reported_page_no, "image_width":pd.NA(), "image_height":pd.NA(), "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
383
 
384
  page_sizes.append(out_page_image_sizes)
385
 
@@ -395,8 +438,11 @@ def prepare_image_or_pdf(
395
  all_annotations_object:List = [],
396
  prepare_for_review:bool = False,
397
  in_fully_redacted_list:List[int]=[],
398
- output_folder:str=output_folder,
 
399
  prepare_images:bool=True,
 
 
400
  progress: Progress = Progress(track_tqdm=True)
401
  ) -> tuple[List[str], List[str]]:
402
  """
@@ -416,7 +462,9 @@ def prepare_image_or_pdf(
416
  prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
417
  in_fully_redacted_list(optional, List of int): A list of pages to fully redact
418
  output_folder (optional, str): The output folder for file save
419
- prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to true
 
 
420
  progress (optional, Progress): Progress tracker for the operation
421
 
422
 
@@ -430,6 +478,7 @@ def prepare_image_or_pdf(
430
  converted_file_paths = []
431
  image_file_paths = []
432
  pymupdf_doc = []
 
433
  review_file_csv = pd.DataFrame()
434
 
435
  if isinstance(in_fully_redacted_list, pd.DataFrame):
@@ -438,39 +487,21 @@ def prepare_image_or_pdf(
438
 
439
  # If this is the first time around, set variables to 0/blank
440
  if first_loop_state==True:
441
- print("first_loop_state is True")
442
  latest_file_completed = 0
443
  out_message = []
444
  all_annotations_object = []
445
  else:
446
- print("Now attempting file:", str(latest_file_completed))
447
-
448
- # This is only run when a new page is loaded, so can reset page loop values. If end of last file (99), current loop number set to 999
449
- # if latest_file_completed == 99:
450
- # current_loop_page_number = 999
451
- # page_break_return = False
452
- # else:
453
- # current_loop_page_number = 0
454
- # page_break_return = False
455
-
456
  # If out message or converted_file_paths are blank, change to a list so it can be appended to
457
- if isinstance(out_message, str):
458
- out_message = [out_message]
459
 
460
- if not file_paths:
461
- file_paths = []
462
 
463
- if isinstance(file_paths, dict):
464
- file_paths = os.path.abspath(file_paths["name"])
465
 
466
- if isinstance(file_paths, str):
467
- file_path_number = 1
468
- else:
469
- file_path_number = len(file_paths)
470
-
471
- #print("Current_loop_page_number at start of prepare_image_or_pdf function is:", current_loop_page_number)
472
- print("Number of file paths:", file_path_number)
473
- print("Latest_file_completed:", latest_file_completed)
474
 
475
  latest_file_completed = int(latest_file_completed)
476
 
@@ -481,9 +512,7 @@ def prepare_image_or_pdf(
481
  final_out_message = '\n'.join(out_message)
482
  else:
483
  final_out_message = out_message
484
- return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes
485
-
486
- #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
487
 
488
  progress(0.1, desc='Preparing file')
489
 
@@ -524,33 +553,14 @@ def prepare_image_or_pdf(
524
  pymupdf_doc = pymupdf.open(file_path)
525
  pymupdf_pages = pymupdf_doc.page_count
526
 
527
- # Load cropbox dimensions to use later
528
-
529
  converted_file_path = file_path
530
 
531
  if prepare_images==True:
532
- image_file_paths, image_sizes_width, image_sizes_height = process_file(file_path, prepare_for_review)
533
  else:
534
- print("Skipping image preparation")
535
- image_file_paths=[]
536
- image_sizes_width=[]
537
- image_sizes_height=[]
538
-
539
- # Create page sizes object
540
- # page_sizes = []
541
-
542
- # for i, page in enumerate(pymupdf_doc):
543
- # page_no = i
544
- # reported_page_no = i + 1
545
-
546
- # pymupdf_page = pymupdf_doc.load_page(page_no)
547
- # original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
548
-
549
- # # Create a page_sizes_object
550
- # out_page_image_sizes = {"page":reported_page_no, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
551
- # page_sizes.append(out_page_image_sizes)
552
-
553
- page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height)
554
 
555
  #Create base version of the annotation object that doesn't have any annotations in it
556
  if (not all_annotations_object) & (prepare_for_review == True):
@@ -577,22 +587,17 @@ def prepare_image_or_pdf(
577
  pymupdf_page.insert_image(rect, filename=file_path) # Insert the image into the page
578
  pymupdf_page = pymupdf_doc.load_page(0)
579
 
580
- original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
581
-
582
  file_path_str = str(file_path)
583
 
584
- image_file_paths, image_sizes_width, image_sizes_height = process_file(file_path_str, prepare_for_review)
 
585
 
586
- #print("image_file_paths:", image_file_paths)
587
  # Create a page_sizes_object
588
- out_page_image_sizes = {"page":1, "image_width":image_sizes_width[0], "image_height":image_sizes_height[0], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":original_cropboxes[-1].width, "cropbox_height":original_cropboxes[-1].height}
589
- page_sizes.append(out_page_image_sizes)
590
 
591
  converted_file_path = output_folder + file_name_with_ext
592
 
593
- pymupdf_doc.save(converted_file_path)
594
-
595
- print("Inserted image into PDF file")
596
 
597
  elif file_extension in ['.csv']:
598
  review_file_csv = read_file(file)
@@ -604,7 +609,6 @@ def prepare_image_or_pdf(
604
  if (file_extension in ['.json']) | (json_from_csv == True):
605
 
606
  if (file_extension in ['.json']) & (prepare_for_review == True):
607
- print("Preparing file for review")
608
  if isinstance(file_path, str):
609
  with open(file_path, 'r') as json_file:
610
  all_annotations_object = json.load(json_file)
@@ -614,18 +618,18 @@ def prepare_image_or_pdf(
614
 
615
  # Assume it's a textract json
616
  elif (file_extension == '.json') and (prepare_for_review is not True):
617
- # If the file ends with textract.json, assume it's a Textract response object.
618
  # Copy it to the output folder so it can be used later.
619
- out_folder = os.path.join(output_folder, file_path_without_ext + ".json")
620
 
621
  # Use shutil to copy the file directly
622
- shutil.copy2(file_path, out_folder) # Preserves metadata
 
 
623
 
624
  continue
625
 
626
  # If you have an annotations object from the above code
627
  if all_annotations_object:
628
- #print("out_annotations_object before reloading images:", all_annotations_object)
629
 
630
  # Get list of page numbers
631
  image_file_paths_pages = [
@@ -637,11 +641,6 @@ def prepare_image_or_pdf(
637
 
638
  # If PDF pages have been converted to image files, replace the current image paths in the json to this.
639
  if image_file_paths:
640
- #print("Image file paths found")
641
-
642
- #print("Image_file_paths:", image_file_paths)
643
-
644
- #for i, annotation in enumerate(all_annotations_object):
645
  for i, image_file_path in enumerate(image_file_paths):
646
 
647
  if i < len(all_annotations_object):
@@ -650,18 +649,15 @@ def prepare_image_or_pdf(
650
  annotation = {}
651
  all_annotations_object.append(annotation)
652
 
653
- #print("annotation:", annotation, "for page:", str(i))
654
  try:
655
  if not annotation:
656
  annotation = {"image":"", "boxes": []}
657
  annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
658
-
659
  else:
660
  annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
661
  except Exception as e:
662
  print("Extracting page number from image failed due to:", e)
663
  annotation_page_number = 0
664
- #print("Annotation page number:", annotation_page_number)
665
 
666
  # Check if the annotation page number exists in the image file paths pages
667
  if annotation_page_number in image_file_paths_pages:
@@ -674,26 +670,53 @@ def prepare_image_or_pdf(
674
 
675
  all_annotations_object[i] = annotation
676
 
677
- #print("all_annotations_object at end of json/csv load part:", all_annotations_object)
 
 
678
 
679
  # Get list of pages that are to be fully redacted and redact them
680
- # if not in_fully_redacted_list.empty:
681
- # print("Redacting whole pages")
682
 
683
- # for i, image in enumerate(image_file_paths):
684
- # page = pymupdf_doc.load_page(i)
685
- # rect_height = page.rect.height
686
- # rect_width = page.rect.width
687
- # whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours = False, border = 5)
688
 
689
- # all_annotations_object.append(whole_page_img_annotation_box)
690
 
691
  # Write the response to a JSON file in output folder
692
  out_folder = output_folder + file_path_without_ext + ".json"
693
- with open(out_folder, 'w') as json_file:
694
- json.dump(all_annotations_object, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
695
  continue
696
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  # Must be something else, return with error message
698
  else:
699
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
@@ -706,8 +729,7 @@ def prepare_image_or_pdf(
706
  if is_pdf(file_path) == False:
707
  out_message = "Please upload a PDF file for text analysis."
708
  print(out_message)
709
- raise Exception(out_message)
710
-
711
 
712
  converted_file_paths.append(converted_file_path)
713
  image_file_paths.extend(image_file_path)
@@ -722,29 +744,23 @@ def prepare_image_or_pdf(
722
 
723
  number_of_pages = len(image_file_paths)
724
 
725
- return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes
726
 
727
- def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
728
  file_path_without_ext = get_file_name_without_type(in_file_path)
729
 
730
  out_file_paths = out_text_file_path
731
 
732
- # Convert annotated text pdf back to image to give genuine redactions
733
- print("Creating image version of redacted PDF to embed redactions.")
734
-
735
- pdf_text_image_paths, image_sizes_width, image_sizes_height = process_file(out_text_file_path[0])
736
  out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
737
  pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=image_dpi, save_all=True, append_images=pdf_text_image_paths[1:])
738
 
739
- # out_file_paths.append(out_text_image_file_path)
740
-
741
  out_file_paths = [out_text_image_file_path]
742
 
743
  out_message = "PDF " + file_path_without_ext + " converted to image-based file."
744
  print(out_message)
745
 
746
- #print("Out file paths:", out_file_paths)
747
-
748
  return out_message, out_file_paths
749
 
750
  def join_values_within_threshold(df1:pd.DataFrame, df2:pd.DataFrame):
@@ -775,212 +791,595 @@ def join_values_within_threshold(df1:pd.DataFrame, df2:pd.DataFrame):
775
 
776
  # Clean up extra columns
777
  final_df = final_df.drop(columns=['key'])
778
- print(final_df)
779
-
780
 
781
- def convert_annotation_json_to_review_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame(), page_sizes:List[dict]=[]) -> pd.DataFrame:
782
  '''
783
- Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
784
  '''
785
- # Flatten the data
786
- flattened_annotation_data = []
787
- page_sizes_df = pd.DataFrame()
788
-
789
- if not isinstance(redaction_decision_output, pd.DataFrame):
790
- redaction_decision_output = pd.DataFrame()
791
-
792
- for annotation in all_annotations:
793
- #print("annotation:", annotation)
794
- #print("flattened_data:", flattened_data)
795
- image_path = annotation["image"]
796
-
797
- # Use regex to find the number before .png
798
- match = re.search(r'_(\d+)\.png$', image_path)
799
- if match:
800
- number = match.group(1) # Extract the number
801
- #print(number) # Output: 0
802
- reported_number = int(number) + 1
 
803
  else:
804
- print("No number found before .png. Returning page 1.")
805
- reported_number = 1
806
 
807
- # Check if 'boxes' is in the annotation, if not, add an empty list
808
- if 'boxes' not in annotation:
809
- annotation['boxes'] = []
810
 
811
- for box in annotation["boxes"]:
812
- if 'text' not in box:
813
- data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
814
- else:
815
- data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
816
- #print("data_to_add:", data_to_add)
817
- flattened_annotation_data.append(data_to_add)
818
 
819
- # Convert to a DataFrame
820
- review_file_df = pd.DataFrame(flattened_annotation_data)
821
 
822
- if page_sizes:
823
- page_sizes_df = pd.DataFrame(page_sizes)
824
- page_sizes_df["page"] = page_sizes_df["page"].astype(int)
825
 
826
- # Convert data to same coordinate system
827
- # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
828
- if "xmin" in review_file_df.columns:
829
- if review_file_df["xmin"].max() >= 1 and review_file_df["xmax"].max() >= 1 and review_file_df["ymin"].max() >= 1 and review_file_df["ymax"].max() >= 1:
830
- #print("review file df has large coordinates")
831
- review_file_df["page"] = review_file_df["page"].astype(int)
832
 
833
- if "image_width" not in review_file_df.columns and not page_sizes_df.empty:
834
- review_file_df = review_file_df.merge(page_sizes_df, on="page", how="left")
835
 
836
- if "image_width" in review_file_df.columns:
837
- #print("Dividing coordinates in review file")
838
- review_file_df["xmin"] = review_file_df["xmin"] / review_file_df["image_width"]
839
- review_file_df["xmax"] = review_file_df["xmax"] / review_file_df["image_width"]
840
- review_file_df["ymin"] = review_file_df["ymin"] / review_file_df["image_height"]
841
- review_file_df["ymax"] = review_file_df["ymax"] / review_file_df["image_height"]
842
 
843
- #print("review_file_df after coordinates divided:", review_file_df)
844
 
845
- if not redaction_decision_output.empty:
846
- # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
847
- if redaction_decision_output["xmin"].max() >= 1 and redaction_decision_output["xmax"].max() >= 1 and redaction_decision_output["ymin"].max() >= 1 and redaction_decision_output["ymax"].max() >= 1:
848
 
849
- redaction_decision_output["page"] = redaction_decision_output["page"].astype(int)
 
 
 
850
 
851
- if "image_width" not in redaction_decision_output.columns and not page_sizes_df.empty:
852
- redaction_decision_output = redaction_decision_output.merge(page_sizes_df, on="page", how="left")
853
 
854
- if "image_width" in redaction_decision_output.columns:
855
- redaction_decision_output["xmin"] = redaction_decision_output["xmin"] / redaction_decision_output["image_width"]
856
- redaction_decision_output["xmax"] = redaction_decision_output["xmax"] / redaction_decision_output["image_width"]
857
- redaction_decision_output["ymin"] = redaction_decision_output["ymin"] / redaction_decision_output["image_height"]
858
- redaction_decision_output["ymax"] = redaction_decision_output["ymax"] / redaction_decision_output["image_height"]
859
 
860
- #print("convert_review_json review_file_df before merges:", review_file_df[['xmin', 'ymin', 'xmax', 'ymax', 'label']])
861
- #print("review_file_df[xmin]", review_file_df["xmin"])
 
 
 
 
862
 
863
- #print("redaction_decision_output:", redaction_decision_output)
864
- #print("review_file_df:", review_file_df)
 
 
865
 
866
- # Join on additional text data from decision output results if included, if text not already there
867
- if not redaction_decision_output.empty:
868
- if not 'text' in redaction_decision_output.columns:
869
- redaction_decision_output['text'] = ''
870
 
871
- if not 'text' in review_file_df.columns:
872
- review_file_df['text'] = ''
873
 
874
- # Load DataFrames
875
- df1 = review_file_df.copy()
876
- df2 = redaction_decision_output.copy()
877
 
878
- #print("review_file before tolerance merge:", review_file_df)
879
- #print("redaction_decision_output before tolerance merge:", redaction_decision_output)
880
 
881
- # Create a unique key based on coordinates and label for exact merge
882
- merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
883
- df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
884
- df2['key'] = df2[merge_keys].astype(str).agg('_'.join, axis=1)
 
885
 
886
- # Attempt exact merge first
887
- #merged_df = df1.merge(df2[['key', 'text']], on='key', how='left')
 
888
 
889
- # Attempt exact merge first, renaming df2['text'] to avoid suffixes
890
- merged_df = df1.merge(df2[['key', 'text']], on='key', how='left', suffixes=('', '_duplicate'))
891
 
892
- # If a match is found, keep that text; otherwise, keep the original df1 text
893
- merged_df['text'] = merged_df['text'].combine_first(merged_df.pop('text_duplicate'))
894
 
895
- #print("merged_df['text']:", merged_df['text'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
896
 
897
- # Handle missing matches using a proximity-based approach
898
- #if merged_df['text'].isnull().sum() > 0:
899
- #print("Attempting tolerance-based merge for text")
900
- # Convert coordinates to numpy arrays for KDTree lookup
901
- tree = cKDTree(df2[['xmin', 'ymin', 'xmax', 'ymax']].values)
902
- query_coords = df1[['xmin', 'ymin', 'xmax', 'ymax']].values
903
-
904
  # Find nearest neighbors within a reasonable tolerance (e.g., 1% of page)
905
- tolerance = 0.01
906
  distances, indices = tree.query(query_coords, distance_upper_bound=tolerance)
907
 
908
  # Assign text values where matches are found
909
  for i, (dist, idx) in enumerate(zip(distances, indices)):
910
- if dist < tolerance and idx < len(df2):
911
- merged_df.at[i, 'text'] = df2.iloc[idx]['text']
 
 
 
 
 
 
 
912
 
913
- # Drop the temporary key column
914
- merged_df.drop(columns=['key'], inplace=True)
915
 
916
- review_file_df = merged_df
 
 
 
 
 
 
 
 
 
 
917
 
918
- review_file_df = review_file_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
919
 
920
  # Ensure required columns exist, filling with blank if they don't
921
- for col in ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]:
 
 
922
  if col not in review_file_df.columns:
923
  review_file_df[col] = ''
924
 
925
- #for col in ['xmin', 'xmax', 'ymin', 'ymax']:
926
- # review_file_df[col] = np.floor(review_file_df[col])
 
 
927
 
928
  # If colours are saved as list, convert to tuple
929
  review_file_df.loc[:,"color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
930
 
931
- # print("page_sizes:", page_sizes)
932
-
933
- # Convert page sizes to relative values
934
- # if page_sizes:
935
- # print("Checking page sizes")
936
-
937
- # page_sizes_df = pd.DataFrame(page_sizes)
938
-
939
- # if "image_width" not in review_file_df.columns:
940
- # review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
941
-
942
- # # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
943
- # if review_file_df["xmin"].max() > 1 and review_file_df["xmax"].max() > 1 and review_file_df["ymin"].max() > 1 and review_file_df["ymax"].max() > 1:
944
- # print("Dividing coordinates by image width and height.")
945
- # review_file_df["xmin"] = review_file_df["xmin"] / review_file_df["image_width"]
946
- # review_file_df["xmax"] = review_file_df["xmax"] / review_file_df["image_width"]
947
- # review_file_df["ymin"] = review_file_df["ymin"] / review_file_df["image_height"]
948
- # review_file_df["ymax"] = review_file_df["ymax"] / review_file_df["image_height"]
949
-
950
  review_file_df = review_file_df.sort_values(['page', 'ymin', 'xmin', 'label'])
951
 
952
- #review_file_df.to_csv(output_folder + "review_file_test.csv", index=None)
953
-
954
  return review_file_df
955
 
956
- def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame, image_paths:List[Image.Image], page_sizes:List[dict]=[]) -> List[dict]:
 
 
957
  '''
958
  Convert a review csv to a json file for use by the Gradio Annotation object.
959
  '''
 
 
 
 
960
 
961
  # Convert relative co-ordinates into image coordinates for the image annotation output object
962
  if page_sizes:
963
  page_sizes_df = pd.DataFrame(page_sizes)
 
964
 
965
- # If there are no image coordinates, then just convert the first page to image to be able to see this at least.
966
- if len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == len(page_sizes_df["image_width"]):
967
- print("No image dimensions found, converting first page.")
968
-
969
- # If no nulls, then can do image coordinate conversion
970
- elif len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == 0:
971
-
972
- if "image_width" not in review_file_df.columns:
973
- review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
974
-
975
- # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
976
- if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
977
- review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
978
- review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
979
- review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
980
- review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
981
-
982
  # Keep only necessary columns
983
- review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
984
 
985
  # If colours are saved as list, convert to tuple
986
  review_file_df.loc[:, "color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
@@ -991,16 +1390,15 @@ def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame, image_path
991
  # Create a list to hold the JSON data
992
  json_data = []
993
 
994
- for page_no, pdf_image_path in enumerate(image_paths):
995
- reported_page_number = int(page_no + 1)
 
996
 
997
  if reported_page_number in review_file_df["page"].values:
998
 
999
  # Convert each relevant group to a list of box dictionaries
1000
  selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
1001
  annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
1002
-
1003
- # If all bbox coordinates are below 1, then they are relative. Need to convert based on image size.
1004
 
1005
  annotation = {
1006
  "image": pdf_image_path,
 
5
  import re
6
  import time
7
  import json
8
+ import numpy as np
9
  import pymupdf
10
+ from pymupdf import Document, Page, Rect
11
  import pandas as pd
 
12
  import shutil
13
+ import zipfile
14
+ from collections import defaultdict
15
  from tqdm import tqdm
16
  from gradio import Progress
17
+ from typing import List, Optional, Dict, Any
18
  from concurrent.futures import ThreadPoolExecutor, as_completed
19
  from pdf2image import convert_from_path
20
  from PIL import Image
21
  from scipy.spatial import cKDTree
22
 
23
+ IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
24
+
25
+ pd.set_option('future.no_silent_downcasting', True)
26
+
27
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR
28
  from tools.helper_functions import get_file_name_without_type, tesseract_ocr_option, text_ocr_option, textract_option, read_file
29
+ # from tools.aws_textract import load_and_convert_textract_json
30
 
31
  image_dpi = float(IMAGES_DPI)
32
  if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
 
61
  """
62
  return filename.lower().endswith(".pdf")
63
 
 
64
  ## Convert pdf to image if necessary
65
 
 
 
66
  def check_image_size_and_reduce(out_path:str, image:Image):
67
  '''
68
  Check if a given image size is above around 4.5mb, and reduce size if necessary. 5mb is the maximum possible to submit to AWS Textract.
69
  '''
70
 
71
+ all_img_details = []
72
+ page_num = 0
73
+
74
  # Check file size and resize if necessary
75
  max_size = 4.5 * 1024 * 1024 # 5 MB in bytes # 5
76
  file_size = os.path.getsize(out_path)
 
98
  print(f"Resized to {new_width}x{new_height}, new file_size: {file_size}")
99
  else:
100
  new_width = width
101
+ new_height = height
102
+
103
+
104
+ all_img_details.append((page_num, image, new_width, new_height))
105
 
106
+ return image, new_width, new_height, all_img_details, out_path
107
 
108
def process_single_page_for_image_conversion(pdf_path:str, page_num:int, image_dpi:float=image_dpi, create_images:bool = True, input_folder: str = INPUT_FOLDER) -> tuple[int, str, float, float]:
    """
    Render one page of a PDF (or copy a single image file) to a PNG in input_folder.

    Args:
        pdf_path (str): Path to the source PDF or image file.
        page_num (int): Zero-based page index to convert.
        image_dpi (float): DPI used when rasterising a PDF page.
        create_images (bool): When False, skip all work and return a placeholder path.
        input_folder (str): Folder (relative to cwd) the PNG is written into.

    Returns:
        tuple[int, str, float, float]: (page_num, output image path, width, height).
            Width/height are pd.NA when no image was produced.
    """
    out_path_placeholder = "placeholder_image_" + str(page_num) + ".png"

    # Image creation disabled: return a placeholder with unknown dimensions
    if not create_images:
        return page_num, out_path_placeholder, pd.NA, pd.NA

    try:
        # Construct the full output path and make sure the folder exists
        image_output_dir = os.path.join(os.getcwd(), input_folder)
        out_path = os.path.join(image_output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

        lower_path = pdf_path.lower()

        if os.path.exists(out_path):
            # Reuse a previously converted page image
            image = Image.open(out_path)
        elif lower_path.endswith(".pdf"):
            # Rasterise just this one page of the PDF
            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
                                        dpi=image_dpi, use_cropbox=False, use_pdftocairo=False)
            image = image_l[0].convert("L")  # greyscale keeps the PNG small
            image.save(out_path, format="PNG")
        elif lower_path.endswith((".jpg", ".jpeg", ".png")):
            image = Image.open(pdf_path)
            image.save(out_path, format="PNG")
        else:
            # BUG FIX: unsupported extensions previously fell through with 'image'
            # unbound, relying on the resulting NameError being caught by the
            # broad except below. Return the placeholder explicitly instead.
            return page_num, out_path_placeholder, pd.NA, pd.NA

        width, height = image.size

        # Shrink the image if too large to submit to AWS Textract (~4.5MB limit)
        image, width, height, all_img_details, img_path = check_image_size_and_reduce(out_path, image)

        return page_num, out_path, width, height

    except Exception as e:
        print(f"Error processing page {page_num + 1}: {e}")
        return page_num, out_path_placeholder, pd.NA, pd.NA
148
+
149
+ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, page_max:int = 0, create_images:bool=True, image_dpi: float = image_dpi, num_threads: int = 8, input_folder: str = INPUT_FOLDER):
150
+
151
+ # If preparing for review, just load the first page (not currently used)
152
  if prepare_for_review == True:
153
  page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
154
+ page_min = 0
155
+ page_max = page_count
156
  else:
157
  page_count = pdfinfo_from_path(pdf_path)['Pages']
158
 
159
  print(f"Number of pages in PDF: {page_count}")
160
 
161
+ # Set page max to length of pdf if not specified
162
+ if page_max == 0: page_max = page_count
163
+
164
  results = []
165
  with ThreadPoolExecutor(max_workers=num_threads) as executor:
166
  futures = []
167
+ for page_num in range(page_min, page_max):
168
+ futures.append(executor.submit(process_single_page_for_image_conversion, pdf_path, page_num, image_dpi, create_images=create_images, input_folder=input_folder))
169
 
170
+ for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages to image"):
171
+ page_num, img_path, width, height = future.result()
172
+ if img_path:
173
+ results.append((page_num, img_path, width, height))
174
  else:
175
  print(f"Page {page_num + 1} failed to process.")
176
+ results.append((page_num, "placeholder_image_" + str(page_num) + ".png", pd.NA, pd.NA))
177
 
178
  # Sort results by page number
179
  results.sort(key=lambda x: x[0])
 
182
  heights = [result[3] for result in results]
183
 
184
  print("PDF has been converted to images.")
185
+ return images, widths, heights, results
186
 
187
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
188
+ def process_file_for_image_creation(file_path:str, prepare_for_review:bool=False, input_folder:str=INPUT_FOLDER, create_images:bool=True):
189
  # Get the file extension
190
  file_extension = os.path.splitext(file_path)[1].lower()
191
 
 
197
 
198
  # Load images from the file paths. Test to see if it is bigger than 4.5 mb and reduct if needed (Textract limit is 5mb)
199
  image = Image.open(file_path)
200
+ img_object, image_sizes_width, image_sizes_height, all_img_details, img_path = check_image_size_and_reduce(file_path, image)
201
+
202
+ if not isinstance(image_sizes_width, list):
203
+ img_path = [img_path]
204
+ image_sizes_width = [image_sizes_width]
205
+ image_sizes_height = [image_sizes_height]
206
+ all_img_details = [all_img_details]
207
+
208
 
209
  # Check if the file is a PDF
210
  elif file_extension == '.pdf':
211
  print(f"{file_path} is a PDF file. Converting to image set")
212
+
213
  # Run your function for processing PDF files here
214
+ img_path, image_sizes_width, image_sizes_height, all_img_details = convert_pdf_to_images(file_path, prepare_for_review, input_folder=input_folder, create_images=create_images)
215
 
216
  else:
217
  print(f"{file_path} is not an image or PDF file.")
218
+ img_path = []
219
  image_sizes_width = []
220
  image_sizes_height = []
221
+ all_img_details = []
222
 
223
+ return img_path, image_sizes_width, image_sizes_height, all_img_details
224
 
225
  def get_input_file_names(file_input:List[str]):
226
  '''
 
230
  all_relevant_files = []
231
  file_name_with_extension = ""
232
  full_file_name = ""
233
+ total_pdf_page_count = 0
234
 
 
235
  if isinstance(file_input, dict):
236
  file_input = os.path.abspath(file_input["name"])
237
 
 
250
 
251
  file_extension = os.path.splitext(file_path)[1].lower()
252
 
253
+ # Check if the file is in acceptable types
254
  if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext):
255
  all_relevant_files.append(file_path_without_ext)
256
  file_name_with_extension = file_path_without_ext + file_extension
257
  full_file_name = file_path
258
+
259
+ # If PDF, get number of pages
260
+ if (file_extension in ['.pdf']):
261
+ # Open the PDF file
262
+ pdf_document = pymupdf.open(file_path)
263
+ # Get the number of pages
264
+ page_count = pdf_document.page_count
265
+
266
+ # Close the document
267
+ pdf_document.close()
268
+ else:
269
+ page_count = 1
270
+
271
+ total_pdf_page_count += page_count
272
 
273
  all_relevant_files_str = ", ".join(all_relevant_files)
274
 
275
+ return all_relevant_files_str, file_name_with_extension, full_file_name, all_relevant_files, total_pdf_page_count
 
 
 
276
 
277
def convert_color_to_range_0_1(color):
    """Scale each 0-255 colour channel in *color* to a float in the range 0-1."""
    normalised_channels = (channel / 255 for channel in color)
    return tuple(normalised_channels)
279
 
280
  def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
281
+ '''
282
+ Commit redaction boxes to a PyMuPDF page.
283
+ '''
284
+
285
  pymupdf_x1 = pymupdf_rect[0]
286
  pymupdf_y1 = pymupdf_rect[1]
287
  pymupdf_x2 = pymupdf_rect[2]
 
297
  redact_bottom_y = middle_y - 1
298
  redact_top_y = middle_y + 1
299
 
 
300
 
301
  rect_small_pixel_height = Rect(pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y) # Slightly smaller than outside box
302
 
 
323
  #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
324
  shape.commit()
325
 
326
+ def convert_pymupdf_to_image_coords(pymupdf_page:Page, x1:float, y1:float, x2:float, y2:float, image: Image=None, image_dimensions:dict={}):
 
327
  '''
328
  Converts coordinates from pymupdf format to image coordinates,
329
  accounting for mediabox dimensions and offset.
 
339
  mediabox_height = mediabox.height
340
 
341
  # Get target image dimensions
342
+ if image:
343
+ image_page_width, image_page_height = image.size
344
+ elif image_dimensions:
345
+ image_page_width, image_page_height = image_dimensions['image_width'], image_dimensions['image_height']
346
+ else:
347
+ image_page_width, image_page_height = mediabox_width, mediabox_height
348
 
349
  # Calculate scaling factors
350
  image_to_mediabox_x_scale = image_page_width / mediabox_width
351
  image_to_mediabox_y_scale = image_page_height / mediabox_height
352
 
 
 
 
 
 
 
 
 
 
 
353
  # Adjust coordinates:
354
  # Apply scaling to match image dimensions
355
  x1_image = x1 * image_to_mediabox_x_scale
 
382
 
383
  return x1_image, y1_image, x2_image, y2_image
384
 
385
+ def redact_whole_pymupdf_page(rect_height:float, rect_width:float, image:Image, page:Page, custom_colours, border:float = 5, image_dimensions:dict={}):
386
  # Small border to page that remains white
387
  border = 5
388
  # Define the coordinates for the Rect
389
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
390
  whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
391
 
392
+ # whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image, image_dimensions=image_dimensions)
393
 
394
  # Create new image annotation element based on whole page coordinates
395
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
396
 
397
  # Write whole page annotation to annotation boxes
398
  whole_page_img_annotation_box = {}
399
+ whole_page_img_annotation_box["xmin"] = whole_page_x1 #whole_page_image_x1
400
+ whole_page_img_annotation_box["ymin"] = whole_page_y1 #whole_page_image_y1
401
+ whole_page_img_annotation_box["xmax"] = whole_page_x2 #whole_page_image_x2
402
+ whole_page_img_annotation_box["ymax"] = whole_page_y2 #whole_page_image_y2
403
  whole_page_img_annotation_box["color"] = (0,0,0)
404
  whole_page_img_annotation_box["label"] = "Whole page"
405
 
 
407
 
408
  return whole_page_img_annotation_box
409
 
410
+ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float], image_sizes_height:List[float], image_file_paths:List[str]):
411
  page_sizes = []
412
  original_cropboxes = []
413
 
 
420
  # Create a page_sizes_object.
421
  # If images have been created, then image width an height come from this value. Otherwise, they are set to the cropbox size
422
  if image_sizes_width and image_sizes_height:
423
+ out_page_image_sizes = {"page":reported_page_no, "image_path":image_file_paths[page_no], "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height, "original_cropbox":original_cropboxes[-1]}
424
  else:
425
+ out_page_image_sizes = {"page":reported_page_no, "image_path":image_file_paths[page_no], "image_width":pd.NA, "image_height":pd.NA, "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height, "original_cropbox":original_cropboxes[-1]}
426
 
427
  page_sizes.append(out_page_image_sizes)
428
 
 
438
  all_annotations_object:List = [],
439
  prepare_for_review:bool = False,
440
  in_fully_redacted_list:List[int]=[],
441
+ output_folder:str=OUTPUT_FOLDER,
442
+ input_folder:str=INPUT_FOLDER,
443
  prepare_images:bool=True,
444
+ page_sizes:list[dict]=[],
445
+ textract_output_found:bool = False,
446
  progress: Progress = Progress(track_tqdm=True)
447
  ) -> tuple[List[str], List[str]]:
448
  """
 
462
  prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
463
  in_fully_redacted_list(optional, List of int): A list of pages to fully redact
464
  output_folder (optional, str): The output folder for file save
465
+ prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
466
+ page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
467
+ textract_output_found (optional, bool): A boolean indicating whether textract output has already been found . Defaults to False.
468
  progress (optional, Progress): Progress tracker for the operation
469
 
470
 
 
478
  converted_file_paths = []
479
  image_file_paths = []
480
  pymupdf_doc = []
481
+ all_img_details = []
482
  review_file_csv = pd.DataFrame()
483
 
484
  if isinstance(in_fully_redacted_list, pd.DataFrame):
 
487
 
488
  # If this is the first time around, set variables to 0/blank
489
  if first_loop_state==True:
 
490
  latest_file_completed = 0
491
  out_message = []
492
  all_annotations_object = []
493
  else:
494
+ print("Now redacting file", str(latest_file_completed))
495
+
 
 
 
 
 
 
 
 
496
  # If out message or converted_file_paths are blank, change to a list so it can be appended to
497
+ if isinstance(out_message, str): out_message = [out_message]
 
498
 
499
+ if not file_paths: file_paths = []
 
500
 
501
+ if isinstance(file_paths, dict): file_paths = os.path.abspath(file_paths["name"])
 
502
 
503
+ if isinstance(file_paths, str): file_path_number = 1
504
+ else: file_path_number = len(file_paths)
 
 
 
 
 
 
505
 
506
  latest_file_completed = int(latest_file_completed)
507
 
 
512
  final_out_message = '\n'.join(out_message)
513
  else:
514
  final_out_message = out_message
515
+ return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details
 
 
516
 
517
  progress(0.1, desc='Preparing file')
518
 
 
553
  pymupdf_doc = pymupdf.open(file_path)
554
  pymupdf_pages = pymupdf_doc.page_count
555
 
 
 
556
  converted_file_path = file_path
557
 
558
  if prepare_images==True:
559
+ image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path, prepare_for_review, input_folder, create_images=True)
560
  else:
561
+ image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path, prepare_for_review, input_folder, create_images=False)
562
+
563
+ page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
 
565
  #Create base version of the annotation object that doesn't have any annotations in it
566
  if (not all_annotations_object) & (prepare_for_review == True):
 
587
  pymupdf_page.insert_image(rect, filename=file_path) # Insert the image into the page
588
  pymupdf_page = pymupdf_doc.load_page(0)
589
 
 
 
590
  file_path_str = str(file_path)
591
 
592
+ image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path_str, prepare_for_review, input_folder, create_images=True)
593
+
594
 
 
595
  # Create a page_sizes_object
596
+ page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths)
 
597
 
598
  converted_file_path = output_folder + file_name_with_ext
599
 
600
+ pymupdf_doc.save(converted_file_path, garbage=4, deflate=True, clean=True)
 
 
601
 
602
  elif file_extension in ['.csv']:
603
  review_file_csv = read_file(file)
 
609
  if (file_extension in ['.json']) | (json_from_csv == True):
610
 
611
  if (file_extension in ['.json']) & (prepare_for_review == True):
 
612
  if isinstance(file_path, str):
613
  with open(file_path, 'r') as json_file:
614
  all_annotations_object = json.load(json_file)
 
618
 
619
  # Assume it's a textract json
620
  elif (file_extension == '.json') and (prepare_for_review is not True):
 
621
  # Copy it to the output folder so it can be used later.
622
+ out_textract_path = os.path.join(output_folder, file_path_without_ext + "_textract.json")
623
 
624
  # Use shutil to copy the file directly
625
+ shutil.copy2(file_path, out_textract_path) # Preserves metadata
626
+
627
+ textract_output_found = True
628
 
629
  continue
630
 
631
  # If you have an annotations object from the above code
632
  if all_annotations_object:
 
633
 
634
  # Get list of page numbers
635
  image_file_paths_pages = [
 
641
 
642
  # If PDF pages have been converted to image files, replace the current image paths in the json to this.
643
  if image_file_paths:
 
 
 
 
 
644
  for i, image_file_path in enumerate(image_file_paths):
645
 
646
  if i < len(all_annotations_object):
 
649
  annotation = {}
650
  all_annotations_object.append(annotation)
651
 
 
652
  try:
653
  if not annotation:
654
  annotation = {"image":"", "boxes": []}
655
  annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
 
656
  else:
657
  annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
658
  except Exception as e:
659
  print("Extracting page number from image failed due to:", e)
660
  annotation_page_number = 0
 
661
 
662
  # Check if the annotation page number exists in the image file paths pages
663
  if annotation_page_number in image_file_paths_pages:
 
670
 
671
  all_annotations_object[i] = annotation
672
 
673
+
674
+ if isinstance(in_fully_redacted_list, list):
675
+ in_fully_redacted_list = pd.DataFrame(data={"fully_redacted_pages_list":in_fully_redacted_list})
676
 
677
  # Get list of pages that are to be fully redacted and redact them
678
+ if not in_fully_redacted_list.empty:
679
+ print("Redacting whole pages")
680
 
681
+ for i, image in enumerate(image_file_paths):
682
+ page = pymupdf_doc.load_page(i)
683
+ rect_height = page.rect.height
684
+ rect_width = page.rect.width
685
+ whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours = False, border = 5, image_dimensions={"image_width":image_sizes_width[i], "image_height":image_sizes_height[i]})
686
 
687
+ all_annotations_object.append(whole_page_img_annotation_box)
688
 
689
  # Write the response to a JSON file in output folder
690
  out_folder = output_folder + file_path_without_ext + ".json"
691
+ # with open(out_folder, 'w') as json_file:
692
+ # json.dump(all_annotations_object, json_file, separators=(",", ":"))
693
  continue
694
 
695
+ # If it's a zip, it could be extract from a Textract bulk API call. Check it's this, and load in json if found
696
+ elif file_extension in ['.zip']:
697
+
698
+ # Assume it's a Textract response object. Copy it to the output folder so it can be used later.
699
+ out_folder = os.path.join(output_folder, file_path_without_ext + "_textract.json")
700
+
701
+ # Use shutil to copy the file directly
702
+ # Open the ZIP file to check its contents
703
+ with zipfile.ZipFile(file_path, 'r') as zip_ref:
704
+ json_files = [f for f in zip_ref.namelist() if f.lower().endswith('.json')]
705
+
706
+ if len(json_files) == 1: # Ensure only one JSON file exists
707
+ json_filename = json_files[0]
708
+
709
+ # Extract the JSON file to the same directory as the ZIP file
710
+ extracted_path = os.path.join(os.path.dirname(file_path), json_filename)
711
+ zip_ref.extract(json_filename, os.path.dirname(file_path))
712
+
713
+ # Move the extracted JSON to the intended output location
714
+ shutil.move(extracted_path, out_folder)
715
+
716
+ textract_output_found = True
717
+ else:
718
+ print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
719
+
720
  # Must be something else, return with error message
721
  else:
722
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
 
729
  if is_pdf(file_path) == False:
730
  out_message = "Please upload a PDF file for text analysis."
731
  print(out_message)
732
+ raise Exception(out_message)
 
733
 
734
  converted_file_paths.append(converted_file_path)
735
  image_file_paths.extend(image_file_path)
 
744
 
745
  number_of_pages = len(image_file_paths)
746
 
747
+ return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details
748
 
749
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
    '''
    Convert a redacted text-based PDF into an image-based PDF so the redactions
    cannot be undone by selecting the underlying text.

    Args:
        in_file_path (str): Original input file path; used only to derive the output file name.
        out_text_file_path (List[str]): List whose first element is the redacted text-PDF to convert.
        image_dpi (float): Resolution passed to the PDF save call.
        output_folder (str): Folder the converted PDF is written to.
        input_folder (str): Folder passed through to the page-image creation step.

    Returns:
        tuple: (status message, [path to the image-based PDF]).
    '''
    file_path_without_ext = get_file_name_without_type(in_file_path)

    out_file_paths = out_text_file_path

    # Convert annotated text pdf back to image to give genuine redactions
    # NOTE(review): the .save(...) call below assumes these are PIL Image objects;
    # confirm process_file_for_image_creation returns images rather than file paths here.
    pdf_text_image_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(out_file_paths[0], input_folder=input_folder)

    out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"

    # Save all page images into a single multi-page PDF
    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF", resolution=image_dpi, save_all=True, append_images=pdf_text_image_paths[1:])

    out_file_paths = [out_text_image_file_path]

    out_message = "PDF " + file_path_without_ext + " converted to image-based file."
    print(out_message)

    return out_message, out_file_paths
765
 
766
  def join_values_within_threshold(df1:pd.DataFrame, df2:pd.DataFrame):
 
791
 
792
  # Clean up extra columns
793
  final_df = final_df.drop(columns=['key'])
 
 
794
 
795
def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
    '''
    Collapse annotator entries so each page image appears only once.

    When the same 'image' occurs more than once, an entry that actually has
    boxes wins over one with empty/missing boxes; the 'text' field is stripped
    from every box of the entries that have boxes.
    '''
    # Bucket the entries by their page image, preserving first-seen order
    grouped = defaultdict(list)
    for entry in data:
        grouped[entry['image']].append(entry)

    deduplicated = []
    for page_entries in grouped.values():
        # Entries for this page that carry at least one box
        with_boxes = [entry for entry in page_entries if entry.get('boxes')]

        # Drop the 'text' key from each box of every boxed entry (in place)
        for entry in with_boxes:
            if 'boxes' in entry:
                entry['boxes'] = [
                    {key: value for key, value in box.items() if key != 'text'}
                    for box in entry['boxes']
                ]

        # Prefer the first boxed entry; otherwise fall back to the first entry seen
        deduplicated.append(with_boxes[0] if with_boxes else page_entries[0])

    return deduplicated
 
 
823
 
824
def divide_coordinates_by_page_sizes(review_file_df:pd.DataFrame, page_sizes_df:pd.DataFrame, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax"):
    '''
    Normalise bounding-box coordinates to relative (0-1) page coordinates.

    Rows whose coordinates are already all <= 1 are assumed relative and pass
    through unchanged; rows with absolute (pixel) coordinates are divided by the
    page's image dimensions (falling back to mediabox dimensions when no image
    sizes exist). The size helper columns merged in from page_sizes_df are
    dropped from the result.

    Args:
        review_file_df (pd.DataFrame): Redaction boxes with 'page' and coordinate columns.
        page_sizes_df (pd.DataFrame): Per-page size info ('page', 'image_width',
            'image_height', 'mediabox_width', 'mediabox_height').
        xmin, xmax, ymin, ymax (str): Names of the coordinate columns.

    Returns:
        pd.DataFrame: Combined dataframe sorted by page/ymin/xmin with relative coordinates.
    '''
    review_file_df_out = review_file_df

    if xmin in review_file_df.columns and not review_file_df.empty:
        # Rows already in relative coordinates (all values <= 1) pass through untouched
        review_file_df_orig = review_file_df.copy().loc[(review_file_df[xmin] <= 1) & (review_file_df[xmax] <= 1) & (review_file_df[ymin] <= 1) & (review_file_df[ymax] <= 1),:]

        # Rows in absolute image coordinates that need dividing
        review_file_df = review_file_df.loc[(review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) & (review_file_df[ymin] > 1) & (review_file_df[ymax] > 1),:]

        review_file_df.loc[:, "page"] = pd.to_numeric(review_file_df["page"], errors="coerce")

        review_file_df_div = review_file_df

        if "image_width" not in review_file_df_div.columns and not page_sizes_df.empty:
            # Ensure proper NA handling before the merge
            page_sizes_df["image_width"] = page_sizes_df["image_width"].replace("<NA>", pd.NA)
            page_sizes_df["image_height"] = page_sizes_df["image_height"].replace("<NA>", pd.NA)
            review_file_df_div = review_file_df_div.merge(page_sizes_df[["page", "image_width", "image_height", "mediabox_width", "mediabox_height"]], on="page", how="left")

        if "image_width" in review_file_df_div.columns:
            # Check if all are NaN values. If so, assume we only have mediabox coordinates available
            if review_file_df_div["image_width"].isna().all():
                review_file_df_div["image_width"] = review_file_df_div["image_width"].fillna(review_file_df_div["mediabox_width"]).infer_objects()
                review_file_df_div["image_height"] = review_file_df_div["image_height"].fillna(review_file_df_div["mediabox_height"]).infer_objects()

            convert_type_cols = ["image_width", "image_height", xmin, xmax, ymin, ymax]
            review_file_df_div[convert_type_cols] = review_file_df_div[convert_type_cols].apply(pd.to_numeric, errors="coerce")

            # Scale absolute coordinates down to the 0-1 range
            review_file_df_div[xmin] = review_file_df_div[xmin] / review_file_df_div["image_width"]
            review_file_df_div[xmax] = review_file_df_div[xmax] / review_file_df_div["image_width"]
            review_file_df_div[ymin] = review_file_df_div[ymin] / review_file_df_div["image_height"]
            review_file_df_div[ymax] = review_file_df_div[ymax] / review_file_df_div["image_height"]

        # Concatenate the already-relative and newly-converted DataFrames
        dfs_to_concat = [df for df in [review_file_df_orig, review_file_df_div] if not df.empty]
        if dfs_to_concat:  # Ensure there's at least one non-empty DataFrame
            review_file_df_out = pd.concat(dfs_to_concat)
        else:
            review_file_df_out = review_file_df  # Return an original DataFrame instead of raising an error

        # Only sort if the DataFrame is not empty and contains the required columns
        required_sort_columns = {"page", xmin, ymin}
        if not review_file_df_out.empty and required_sort_columns.issubset(review_file_df_out.columns):
            review_file_df_out.sort_values(["page", ymin, xmin], inplace=True)

        # BUG FIX: drop() previously discarded its result, so the merged size
        # helper columns were never actually removed. Assign the result back.
        review_file_df_out = review_file_df_out.drop(["image_width", "image_height", "mediabox_width", "mediabox_height"], axis=1, errors="ignore")

    return review_file_df_out
 
873
 
874
def multiply_coordinates_by_page_sizes(review_file_df: pd.DataFrame, page_sizes_df: pd.DataFrame, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax"):
    '''
    Convert relative (0-1) bounding-box coordinates to absolute image coordinates.

    The inverse of divide_coordinates_by_page_sizes: rows whose coordinates are
    all <= 1 are multiplied by that page's image dimensions (merged in from
    page_sizes_df); rows already above 1 are treated as absolute and passed
    through unchanged.

    Args:
        review_file_df (pd.DataFrame): Redaction boxes with 'page' and coordinate columns.
        page_sizes_df (pd.DataFrame): Per-page size info including 'image_width'/'image_height'.
        xmin, xmax, ymin, ymax (str): Names of the coordinate columns.

    Returns:
        pd.DataFrame: Boxes with absolute coordinates, sorted by page/xmin/ymin.
    '''
    if xmin in review_file_df.columns and not review_file_df.empty:
        # Separate absolute vs relative coordinates
        review_file_df_orig = review_file_df.loc[
            (review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) &
            (review_file_df[ymin] > 1) & (review_file_df[ymax] > 1), :].copy()

        review_file_df = review_file_df.loc[
            (review_file_df[xmin] <= 1) & (review_file_df[xmax] <= 1) &
            (review_file_df[ymin] <= 1) & (review_file_df[ymax] <= 1), :].copy()

        if review_file_df.empty:
            return review_file_df_orig # If nothing is left, return the original absolute-coordinates DataFrame

        review_file_df.loc[:, "page"] = pd.to_numeric(review_file_df["page"], errors="coerce")

        # Bring in per-page image sizes if not already present
        # NOTE(review): this merges ALL page_sizes_df columns onto the result
        # (unlike the divide counterpart, nothing is dropped afterwards).
        if "image_width" not in review_file_df.columns and not page_sizes_df.empty:
            page_sizes_df[['image_width', 'image_height']] = page_sizes_df[['image_width','image_height']].replace("<NA>", pd.NA) # Ensure proper NA handling
            review_file_df = review_file_df.merge(page_sizes_df, on="page", how="left")

        if "image_width" in review_file_df.columns:
            # Split into rows with/without image size info
            review_file_df_not_na = review_file_df.loc[review_file_df["image_width"].notna()].copy()
            review_file_df_na = review_file_df.loc[review_file_df["image_width"].isna()].copy()

            if not review_file_df_not_na.empty:
                convert_type_cols = ["image_width", "image_height", xmin, xmax, ymin, ymax]
                review_file_df_not_na[convert_type_cols] = review_file_df_not_na[convert_type_cols].apply(pd.to_numeric, errors="coerce")

                # Multiply coordinates by image sizes
                review_file_df_not_na[xmin] *= review_file_df_not_na["image_width"]
                review_file_df_not_na[xmax] *= review_file_df_not_na["image_width"]
                review_file_df_not_na[ymin] *= review_file_df_not_na["image_height"]
                review_file_df_not_na[ymax] *= review_file_df_not_na["image_height"]

            # Concatenate the modified and unmodified data
            review_file_df = pd.concat([df for df in [review_file_df_not_na, review_file_df_na] if not df.empty])

        # Merge with the original absolute-coordinates DataFrame
        dfs_to_concat = [df for df in [review_file_df_orig, review_file_df] if not df.empty]
        if dfs_to_concat: # Ensure there's at least one non-empty DataFrame
            review_file_df = pd.concat(dfs_to_concat)
        else:
            review_file_df = pd.DataFrame() # Return an empty DataFrame instead of raising an error

    # Only sort if the DataFrame is not empty and contains the required columns
    required_sort_columns = {"page", "xmin", "ymin"}
    if not review_file_df.empty and required_sort_columns.issubset(review_file_df.columns):
        review_file_df.sort_values(["page", "xmin", "ymin"], inplace=True)

    return review_file_df
927
+
928
+
929
def do_proximity_match_by_page_for_text(df1:pd.DataFrame, df2:pd.DataFrame) -> pd.DataFrame:
    '''
    Match text from one dataframe to another based on proximity matching of coordinates page by page.

    An exact match on (xmin, ymin, xmax, ymax, label, page) is tried first; rows
    still lacking text are then matched to the nearest df2 box on the same page
    within a small coordinate tolerance using a KD-tree.

    NOTE: both input DataFrames are modified in place — a temporary 'key' column
    is added to each (and left on the inputs), and a 'text' column is created
    when missing.

    Args:
        df1 (pd.DataFrame): Target boxes that should receive text.
        df2 (pd.DataFrame): Source boxes that provide text.

    Returns:
        pd.DataFrame: df1 rows with the 'text' column filled where a match was found.
    '''
    if not 'text' in df2.columns: df2['text'] = ''
    if not 'text' in df1.columns: df1['text'] = ''

    # Create a unique key based on coordinates and label for exact merge
    merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
    df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
    df2['key'] = df2[merge_keys].astype(str).agg('_'.join, axis=1)

    # Attempt exact merge first
    merged_df = df1.merge(df2[['key', 'text']], on='key', how='left', suffixes=('', '_duplicate'))

    # If a match is found, keep that text; otherwise, keep the original df1 text
    merged_df['text'] = np.where(
        merged_df['text'].isna() | (merged_df['text'] == ''),
        merged_df.pop('text_duplicate'),
        merged_df['text']
    )

    # Define tolerance for proximity matching
    # (0.02 implies relative 0-1 coordinates — TODO confirm with callers)
    tolerance = 0.02

    # Precompute KDTree for each page in df2; pages with non-finite
    # coordinates are skipped entirely
    page_trees = {}
    for page in df2['page'].unique():
        df2_page = df2[df2['page'] == page]
        coords = df2_page[['xmin', 'ymin', 'xmax', 'ymax']].values
        if np.all(np.isfinite(coords)) and len(coords) > 0:
            page_trees[page] = (cKDTree(coords), df2_page)

    # Perform proximity matching row by row, overwriting text with the
    # nearest neighbour's text when one lies within the tolerance
    for i, row in df1.iterrows():
        page_number = row['page']

        if page_number in page_trees:
            tree, df2_page = page_trees[page_number]

            # Query KDTree for nearest neighbor
            dist, idx = tree.query([row[['xmin', 'ymin', 'xmax', 'ymax']].values], distance_upper_bound=tolerance)

            # idx equals len(df2_page) when no neighbour is within the bound
            if dist[0] < tolerance and idx[0] < len(df2_page):
                merged_df.at[i, 'text'] = df2_page.iloc[idx[0]]['text']

    # Drop the temporary key column (from the result only; inputs keep theirs)
    merged_df.drop(columns=['key'], inplace=True)

    return merged_df
980
+
981
+
982
def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, threshold:float=0.03):
    '''
    Match text from one dataframe to another based on proximity matching of coordinates across all pages.

    Args:
        df1: Box dataframe to receive text ('xmin', 'ymin', 'xmax', 'ymax',
            'label', 'page' columns required).
        df2: Box dataframe supplying text, same columns plus optional 'text'.
        threshold: Maximum coordinate distance for a proximity match.

    Returns:
        A copy of df1 with its 'text' column filled in, first by an exact
        key match and then by nearest-neighbour lookup over all pages.
        The input dataframes are not modified.
    '''
    # Copies: the previous implementation added helper columns to the
    # callers' frames in place. Reset the index so row labels line up with
    # the merged output (merge produces a fresh RangeIndex).
    df1 = df1.copy().reset_index(drop=True)
    df2 = df2.copy()

    if 'text' not in df2.columns: df2['text'] = ''
    if 'text' not in df1.columns: df1['text'] = ''

    # Create a unique key based on coordinates and label for exact merge
    merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
    df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
    df2['key'] = df2[merge_keys].astype(str).agg('_'.join, axis=1)

    # Attempt exact merge first
    merged_df = df1.merge(df2[['key', 'text']], on='key', how='left', suffixes=('', '_duplicate'))

    # If a match is found, keep that text; otherwise, keep the original df1 text
    merged_df['text'] = np.where(
        merged_df['text'].isna() | (merged_df['text'] == ''),
        merged_df.pop('text_duplicate'),
        merged_df['text']
    )

    # Handle remaining misses with a proximity-based nearest-neighbour lookup
    query_coords = np.array(df1[['xmin', 'ymin', 'xmax', 'ymax']].values, dtype=float)

    # Filter out rows with NaN/inf coordinates, remembering which merged_df
    # rows the surviving queries belong to. (Bug fix: the previous version
    # used positions within the *filtered* array as merged_df labels, so any
    # filtered-out row shifted every subsequent match onto the wrong row.)
    finite_mask = np.isfinite(query_coords).all(axis=1)
    query_positions = np.flatnonzero(finite_mask)
    if not finite_mask.all():
        print("Warning: query_coords contains non-finite values. Filtering out non-finite entries.")
        query_coords = query_coords[finite_mask]

    # Proceed only if there is at least one finite query row
    if query_coords.size > 0:
        # df2 must also be restricted to finite coordinates before building the KDTree
        finite_mask_df2 = np.isfinite(df2[['xmin', 'ymin', 'xmax', 'ymax']].values).all(axis=1)
        df2_finite = df2[finite_mask_df2]

        # Guard: cKDTree cannot be built from an empty dataset
        if not df2_finite.empty:
            tree = cKDTree(df2_finite[['xmin', 'ymin', 'xmax', 'ymax']].values)

            # Find nearest neighbours within the requested tolerance
            tolerance = threshold
            distances, indices = tree.query(query_coords, distance_upper_bound=tolerance)

            # idx == len(df2_finite) is cKDTree's "no neighbour found" sentinel
            for row_pos, dist, idx in zip(query_positions, distances, indices):
                if dist < tolerance and idx < len(df2_finite):
                    merged_df.at[row_pos, 'text'] = df2_finite.iloc[idx]['text']

    # Drop the temporary key column
    merged_df.drop(columns=['key'], inplace=True)

    return merged_df
1039
+
1040
+
1041
 
 
 
1042
 
1043
+ def _extract_page_number(image_path: Any) -> int:
1044
+ """Helper function to safely extract page number."""
1045
+ if not isinstance(image_path, str):
1046
+ return 1
1047
+ match = IMAGE_NUM_REGEX.search(image_path)
1048
+ if match:
1049
+ try:
1050
+ return int(match.group(1)) + 1
1051
+ except (ValueError, TypeError):
1052
+ return 1
1053
+ return 1
1054
 
1055
def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
    '''
    Convert annotation list to DataFrame using Pandas explode and json_normalize.

    Each annotation dict supplies an 'image' path and a list of box dicts;
    the result has one row per box, with pages that carry no boxes kept as
    all-NA rows.
    '''
    if not all_annotations:
        # Empty input: hand back an empty frame with the expected schema.
        return pd.DataFrame(columns=["image", "page", "xmin", "xmax", "ymin", "ymax", "text"])

    # Build the base frame; tolerate missing/None 'boxes' by substituting [].
    images = [entry.get("image") for entry in all_annotations]
    box_lists = [entry.get("boxes") if isinstance(entry.get("boxes"), list) else []
                 for entry in all_annotations]
    frame = pd.DataFrame({"image": images, "boxes": box_lists})

    # Derive the 1-based page number from each image path.
    frame['page'] = frame['image'].apply(_extract_page_number)

    # explode() drops rows whose list is empty, but pages without boxes must
    # survive as NA rows — substitute a single all-NA placeholder box first.
    empty_box = {"xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA, "text": pd.NA}
    frame['boxes'] = frame['boxes'].apply(lambda boxes: boxes if boxes else [empty_box])

    # One row per box.
    exploded = frame.explode('boxes', ignore_index=True)

    # Flatten the box dictionaries into columns; skip any non-dict stragglers
    # (the placeholder strategy above makes these rare).
    is_box = exploded['boxes'].notna() & exploded['boxes'].apply(isinstance, args=(dict,))
    box_columns = pd.json_normalize(exploded.loc[is_box, 'boxes'])

    # Re-attach image/page, aligning both sides on a fresh RangeIndex.
    result = exploded.loc[is_box, ['image', 'page']].reset_index(drop=True).join(box_columns)

    # Guarantee the essential box columns exist even if no box carried them.
    essential = ["xmin", "xmax", "ymin", "ymax", "text"]
    for name in essential:
        if name not in result.columns:
            result[name] = pd.NA

    # Fixed column order: image/page first, essential box columns next, any
    # extra box attributes (label, color, ...) sorted at the end.
    leading = ["image", "page"]
    trailing = sorted(c for c in result.columns if c not in leading and c not in essential)
    result = result.reindex(columns=leading + essential + trailing, fill_value=pd.NA)

    return result
1115
+
1116
+
1117
+ # def convert_annotation_data_to_dataframe(all_annotations:List[dict]):
1118
+ # '''
1119
+ # Convert an annotation list of dictionaries to a dataframe with all boxes on a separate row
1120
+ # '''
1121
+ # # Flatten the data
1122
+ # flattened_annotation_data = []
1123
+
1124
+ # for annotation in all_annotations:
1125
+ # image_path = annotation["image"]
1126
+
1127
+ # if image_path:
1128
+ # match = re.search(r'_(\d+)\.png$', image_path)
1129
+ # if match:
1130
+ # number = match.group(1)
1131
+ # reported_number = int(number) + 1
1132
+ # else:
1133
+ # reported_number = 1
1134
+ # else:
1135
+ # reported_number = 1
1136
+
1137
+ # # Check if 'boxes' is in the annotation, if not, add an empty list
1138
+ # if 'boxes' not in annotation:
1139
+ # annotation['boxes'] = []
1140
+
1141
+ # # If boxes are empty, create a row with blank values for xmin, xmax, ymin, ymax
1142
+ # if not annotation["boxes"]:
1143
+ # data_to_add = {"image": image_path, "page": reported_number, "xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA}
1144
+ # flattened_annotation_data.append(data_to_add)
1145
+ # else:
1146
+ # for box in annotation["boxes"]:
1147
+ # if 'xmin' not in box:
1148
+ # data_to_add = {"image": image_path, "page": reported_number, "xmin": pd.NA, 'xmax': pd.NA, 'ymin': pd.NA, 'ymax': pd.NA}
1149
+ # elif 'text' not in box:
1150
+ # data_to_add = {"image": image_path, "page": reported_number, **box}
1151
+ # else:
1152
+ # data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
1153
+ # flattened_annotation_data.append(data_to_add)
1154
+
1155
+ # # Convert to a DataFrame
1156
+ # review_file_df = pd.DataFrame(flattened_annotation_data)
1157
+
1158
+ # return review_file_df
1159
+
1160
+ # def create_annotation_dicts_from_annotation_df(all_image_annotations_df:pd.DataFrame, page_sizes:List[dict]):
1161
+ # '''
1162
+ # From an annotation object as a dataframe, convert back to a list of dictionaries that can be used in the Gradio Image Annotator component
1163
+ # '''
1164
+ # result = []
1165
+
1166
+ # # Ensure that every page has an entry in the resulting list of dicts
1167
+ # for image_path in page_sizes:
1168
+ # annotation = {}
1169
+ # annotation["image"] = image_path["image_path"]
1170
+ # annotation["boxes"] = []
1171
+
1172
+ # result.append(annotation)
1173
+
1174
+ # # Then add in all the filled in data
1175
+ # for image, group in all_image_annotations_df.groupby('image'):
1176
+ # boxes = group[['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']].to_dict(orient='records')
1177
+ # result.append({'image': image, 'boxes': boxes})
1178
+
1179
+ # return result
1180
+
1181
def create_annotation_dicts_from_annotation_df(
    all_image_annotations_df: pd.DataFrame,
    page_sizes: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    '''
    Convert annotation DataFrame back to list of dicts using dictionary lookup.
    Ensures all images from page_sizes are present without duplicates.
    '''
    # Seed one entry per page image so every page appears exactly once, even
    # when it has no annotations.
    annotations_by_image: Dict[str, Dict[str, Any]] = {}
    for page_entry in page_sizes:
        path = page_entry.get("image_path")
        if path:  # skip entries whose image_path is missing/None/empty
            annotations_by_image[path] = {"image": path, "boxes": []}

    # Without usable annotation data, return the page skeleton as-is.
    if all_image_annotations_df.empty or 'image' not in all_image_annotations_df.columns:
        print("Warning: Annotation DataFrame is empty or missing 'image' column.")
        return list(annotations_by_image.values())

    # Only export the box columns the frame actually has.
    box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']
    available_cols = [c for c in box_cols if c in all_image_annotations_df.columns]
    if not available_cols:
        print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
        return list(annotations_by_image.values())

    # Discard rows missing any available coordinate (NA coords are unusable).
    coord_cols = ['xmin', 'ymin', 'xmax', 'ymax']
    drop_subset = [c for c in coord_cols if c in available_cols]
    usable_rows = all_image_annotations_df.dropna(subset=drop_subset).copy()

    if usable_rows.empty:
        print("Warning: No valid annotation rows found in DataFrame after dropping NA coordinates.")
        return list(annotations_by_image.values())

    # Attach each image's boxes; images absent from page_sizes are ignored.
    try:
        for path, rows in usable_rows.groupby('image', observed=True, sort=False):
            if path in annotations_by_image:
                annotations_by_image[path]['boxes'] = rows[available_cols].to_dict(orient='records')
    except KeyError:
        # Defensive: grouping should not fail given the column check above.
        print("Error: Issue grouping DataFrame by 'image'.")
        return list(annotations_by_image.values())

    return list(annotations_by_image.values())
1246
+
1247
+ # import pandas as pd
1248
+ # from typing import List, Dict, Any
1249
+
1250
+ # def create_annotation_dicts_from_annotation_df(
1251
+ # all_image_annotations_df: pd.DataFrame,
1252
+ # page_sizes: List[Dict[str, Any]]
1253
+ # ) -> List[Dict[str, Any]]:
1254
+ # '''
1255
+ # Convert annotation DataFrame back to list of dicts using Pandas merge.
1256
+ # Ensures all images from page_sizes are present without duplicates.
1257
+ # '''
1258
+ # # 1. Create a DataFrame containing all required image paths from page_sizes
1259
+ # if not page_sizes:
1260
+ # return []
1261
+ # all_image_paths = [item.get("image_path") for item in page_sizes if item.get("image_path")]
1262
+ # if not all_image_paths:
1263
+ # return []
1264
+ # # Use unique paths
1265
+ # pages_df = pd.DataFrame({'image': list(set(all_image_paths))})
1266
+
1267
+ # # Check if the DataFrame is empty or lacks necessary columns
1268
+ # if all_image_annotations_df.empty or 'image' not in all_image_annotations_df.columns:
1269
+ # print("Warning: Annotation DataFrame is empty or missing 'image' column.")
1270
+ # # Add empty boxes column and return
1271
+ # pages_df['boxes'] = [[] for _ in range(len(pages_df))]
1272
+ # return pages_df.to_dict(orient='records')
1273
+
1274
+ # # 2. Define columns to extract and check availability
1275
+ # box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']
1276
+ # available_cols = [col for col in box_cols if col in all_image_annotations_df.columns]
1277
+
1278
+ # if not available_cols:
1279
+ # print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
1280
+ # pages_df['boxes'] = [[] for _ in range(len(pages_df))]
1281
+ # return pages_df.to_dict(orient='records')
1282
+
1283
+ # # 3. Prepare the annotation data: drop invalid rows and aggregate boxes
1284
+ # coord_cols = ['xmin', 'ymin', 'xmax', 'ymax']
1285
+ # valid_box_df = all_image_annotations_df.dropna(
1286
+ # subset=[col for col in coord_cols if col in available_cols]
1287
+ # ).copy() # Use .copy()
1288
+
1289
+ # if valid_box_df.empty:
1290
+ # print("Warning: No valid annotation rows found after dropping NA coordinates.")
1291
+ # pages_df['boxes'] = [[] for _ in range(len(pages_df))]
1292
+ # return pages_df.to_dict(orient='records')
1293
+
1294
+
1295
+ # # Aggregate boxes into lists of dictionaries per image
1296
+ # def aggregate_boxes(group):
1297
+ # return group[available_cols].to_dict(orient='records')
1298
+
1299
+ # # Group by image and apply the aggregation
1300
+ # grouped_boxes = valid_box_df.groupby('image', observed=True, sort=False).apply(aggregate_boxes).reset_index(name='boxes')
1301
+
1302
+ # # 4. Perform a left merge: keep all images from pages_df, add boxes where they exist
1303
+ # merged_df = pd.merge(pages_df, grouped_boxes, on='image', how='left')
1304
+
1305
+ # # 5. Fill NaN in 'boxes' column (for images with no annotations) with empty lists
1306
+ # # Ensure the column exists before trying to fillna
1307
+ # if 'boxes' in merged_df.columns:
1308
+ # # Use apply with a lambda for robust filling of NAs or potential None values
1309
+ # merged_df['boxes'] = merged_df['boxes'].apply(lambda x: [] if pd.isna(x) else x)
1310
+ # else:
1311
+ # # Should not happen with left merge, but handle defensively
1312
+ # merged_df['boxes'] = [[] for _ in range(len(merged_df))]
1313
+
1314
+
1315
+ # # 6. Convert the final DataFrame to the list of dictionaries format
1316
+ # result = merged_df.to_dict(orient='records')
1317
+
1318
+ # return result
1319
+
1320
def convert_annotation_json_to_review_df(all_annotations:List[dict],
                                         redaction_decision_output:pd.DataFrame=pd.DataFrame(),
                                         page_sizes:List[dict]=None,
                                         do_proximity_match:bool=True) -> pd.DataFrame:
    '''
    Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (if option selected).

    Args:
        all_annotations: List of per-page annotation dicts ('image' + 'boxes').
        redaction_decision_output: Optional dataframe of redaction decisions
            whose 'text' can be matched onto the annotations.
        page_sizes: Optional list of per-page size dicts used to convert
            coordinates to relative values. (Bug fix: this previously
            defaulted to an empty pd.DataFrame, and `if page_sizes:` raises
            ValueError for any DataFrame — truth value is ambiguous. The
            sibling converter already annotates this parameter as List[dict].)
        do_proximity_match: Whether to pull text across via proximity matching.

    Returns:
        Review dataframe with columns image, page, label, color,
        xmin/ymin/xmax/ymax and text, sorted by page then position.
    '''
    review_file_df = convert_annotation_data_to_dataframe(all_annotations)

    if page_sizes:
        page_sizes_df = pd.DataFrame(page_sizes)
        page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")

        review_file_df = divide_coordinates_by_page_sizes(review_file_df, page_sizes_df)

        redaction_decision_output = divide_coordinates_by_page_sizes(redaction_decision_output, page_sizes_df)

    # Join on additional text data from decision output results if included, if text not already there
    if not redaction_decision_output.empty and not review_file_df.empty and do_proximity_match:
        # Match text to review file to match on text
        review_file_df = do_proximity_match_all_pages_for_text(df1=review_file_df.copy(), df2=redaction_decision_output.copy())

    # Ensure required columns exist, filling with blank if they don't
    check_columns = ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]
    for col in check_columns:
        if col not in review_file_df.columns:
            review_file_df[col] = ''

    if not review_file_df.empty:
        review_file_df = review_file_df[check_columns]
    else:
        review_file_df = pd.DataFrame(columns=check_columns)

    # If colours are saved as list, convert to tuple
    review_file_df.loc[:, "color"] = review_file_df.loc[:, "color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)

    review_file_df = review_file_df.sort_values(['page', 'ymin', 'xmin', 'label'])

    return review_file_df
1362
 
1363
+ def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame,
1364
+ image_paths:List[Image.Image],
1365
+ page_sizes:List[dict]=[]) -> List[dict]:
1366
  '''
1367
  Convert a review csv to a json file for use by the Gradio Annotation object.
1368
  '''
1369
+ # Make sure all relevant cols are float
1370
+ float_cols = ["page", "xmin", "xmax", "ymin", "ymax"]
1371
+ for col in float_cols:
1372
+ review_file_df.loc[:, col] = pd.to_numeric(review_file_df.loc[:, col], errors='coerce')
1373
 
1374
  # Convert relative co-ordinates into image coordinates for the image annotation output object
1375
  if page_sizes:
1376
  page_sizes_df = pd.DataFrame(page_sizes)
1377
+ page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
1378
 
1379
+ review_file_df = multiply_coordinates_by_page_sizes(review_file_df, page_sizes_df)
1380
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1381
  # Keep only necessary columns
1382
+ review_file_df = review_file_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax"]].drop_duplicates(subset=["image", "page", "xmin", "ymin", "xmax", "ymax", "label"])
1383
 
1384
  # If colours are saved as list, convert to tuple
1385
  review_file_df.loc[:, "color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
 
1390
  # Create a list to hold the JSON data
1391
  json_data = []
1392
 
1393
+ for page_no, pdf_image_path in enumerate(page_sizes_df["image_path"]):
1394
+
1395
+ reported_page_number = int(page_no + 1)
1396
 
1397
  if reported_page_number in review_file_df["page"].values:
1398
 
1399
  # Convert each relevant group to a list of box dictionaries
1400
  selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
1401
  annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
 
 
1402
 
1403
  annotation = {
1404
  "image": pdf_image_path,
tools/file_redaction.py CHANGED
The diff for this file is too large to render. See raw diff
 
tools/find_duplicate_pages.py CHANGED
@@ -3,36 +3,32 @@ import pandas as pd
3
  #import glob
4
  import os
5
  import re
6
- from tools.helper_functions import output_folder
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
- import nltk
10
- from nltk.corpus import stopwords
11
- from nltk.tokenize import word_tokenize
12
- from nltk.stem import PorterStemmer
 
13
  import numpy as np
14
  import random
15
  import string
16
  from typing import List
 
17
 
18
- nltk.download('punkt')
19
- nltk.download('stopwords')
20
- nltk.download('punkt_tab')
21
 
22
- similarity_threshold = 0.9
 
 
23
 
24
- stop_words = set(stopwords.words('english'))
25
- # List of words to remove from the stopword set
26
- #words_to_remove = ['no', 'nor', 'not', 'don', 'don't', 'wasn', 'wasn't', 'weren', 'weren't', "don't", "wasn't", "weren't"]
27
 
28
- # Remove the specified words from the stopwords set
29
- #for word in words_to_remove:
30
- # stop_words.discard(word.lower())
31
-
32
- stemmer = PorterStemmer()
33
- vectorizer = TfidfVectorizer()
34
 
35
- def combine_ocr_output_text(input_files):
36
  """
37
  Combines text from multiple CSV files containing page and text columns.
38
  Groups text by file and page number, concatenating text within these groups.
@@ -92,7 +88,7 @@ def combine_ocr_output_text(input_files):
92
 
93
  return combined_df, output_files
94
 
95
- def process_data(df, column:str):
96
  '''
97
  Clean and stem text columns in a data frame
98
  '''
@@ -100,118 +96,130 @@ def process_data(df, column:str):
100
  def _clean_text(raw_text):
101
  # Remove HTML tags
102
  clean = re.sub(r'<.*?>', '', raw_text)
103
- clean = re.sub(r'&nbsp;', ' ', clean)
104
- clean = re.sub(r'\r\n', ' ', clean)
105
- clean = re.sub(r'&lt;', ' ', clean)
106
- clean = re.sub(r'&gt;', ' ', clean)
107
- clean = re.sub(r'<strong>', ' ', clean)
108
- clean = re.sub(r'</strong>', ' ', clean)
109
 
110
  # Replace non-breaking space \xa0 with a space
111
- clean = clean.replace(u'\xa0', u' ')
112
  # Remove extra whitespace
113
  clean = ' '.join(clean.split())
114
 
115
- # Tokenize the text
116
- words = word_tokenize(clean.lower())
117
 
118
- # Remove punctuation and numbers
119
- words = [word for word in words if word.isalpha()]
120
 
121
- # Remove stopwords
122
- words = [word for word in words if word not in stop_words]
123
 
124
  # Join the cleaned words back into a string
125
- return ' '.join(words)
126
-
127
- # Function to apply stemming
128
- def _apply_stemming(text):
129
- # Tokenize the text
130
- words = word_tokenize(text.lower())
131
-
132
- # Apply stemming to each word
133
- stemmed_words = [stemmer.stem(word) for word in words]
134
-
135
- # Join the stemmed words back into a single string
136
- return ' '.join(stemmed_words)
137
-
138
-
139
-
140
-
141
  df['text_clean'] = df[column].apply(_clean_text)
142
- df['text_clean'] = df['text_clean'].apply(_apply_stemming)
 
143
 
144
  return df
145
 
146
- def identify_similar_pages(input_files:List[str]):
147
-
148
  output_paths = []
 
 
149
 
 
150
  df, output_files = combine_ocr_output_text(input_files)
151
-
152
  output_paths.extend(output_files)
 
153
 
154
- # Clean text
155
- df = process_data(df, 'text')
156
-
157
- # Vectorise text
158
  tfidf_matrix = vectorizer.fit_transform(df['text_clean'])
159
 
160
- # Calculate cosine similarity
161
- similarity_matrix = cosine_similarity(tfidf_matrix)
162
 
163
- # Find the indices of the most similar pages
164
- np.fill_diagonal(similarity_matrix, 0) # Ignore self-comparisons
165
- similar_pages = np.argwhere(similarity_matrix > similarity_threshold) # Threshold of similarity
166
 
167
- #print(similar_pages)
 
 
168
 
169
- # Create a DataFrame for similar pairs and their scores
170
- similarity_df = pd.DataFrame({
171
- 'Page1_Index': similar_pages[:, 0],
172
- 'Page2_Index': similar_pages[:, 1],
173
- 'Page1_File': similar_pages[:, 0],
174
- 'Page2_File': similar_pages[:, 1],
175
- 'Similarity_Score': similarity_matrix[similar_pages[:, 0], similar_pages[:, 1]]
176
- })
177
 
178
- # Filter out duplicate pairs (keep only one direction)
 
 
 
179
  similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]
180
 
181
- # Map the indices to their corresponding text and metadata
182
- similarity_df['Page1_File'] = similarity_df['Page1_File'].map(df['file'])
183
- similarity_df['Page2_File'] = similarity_df['Page2_File'].map(df['file'])
 
 
 
 
 
 
184
 
185
- similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(df['page'])
186
- similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(df['page'])
187
 
188
- similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(df['text'])
189
- similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(df['text'])
 
190
 
 
 
 
 
 
 
 
 
 
 
 
191
  similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
192
  similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])
193
 
194
- # Save detailed results to a CSV file
 
 
 
 
 
195
  similarity_file_output_path = output_folder + 'page_similarity_results.csv'
196
  similarity_df_out.to_csv(similarity_file_output_path, index=False)
197
-
198
  output_paths.append(similarity_file_output_path)
199
 
200
- if not similarity_df_out.empty:
201
- unique_files = similarity_df_out['Page2_File'].unique()
202
- for redact_file in unique_files:
203
- output_file_name = output_folder + redact_file + "_whole_page.csv"
204
- whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File']==redact_file,:][['Page2_Page']]
205
- whole_pages_to_redact_df.to_csv(output_file_name, header=None, index=None)
206
-
207
- output_paths.append(output_file_name)
208
-
209
 
210
  return similarity_df_out, output_paths
211
 
212
  # Perturb text
213
  # Apply the perturbation function with a 10% error probability
214
- def perturb_text_with_errors(series):
215
 
216
  def _perturb_text(text, error_probability=0.1):
217
  words = text.split() # Split text into words
@@ -241,36 +249,3 @@ def perturb_text_with_errors(series):
241
  series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))
242
 
243
  return series
244
-
245
- # Run through command line
246
- # def main():
247
- # parser = argparse.ArgumentParser(description='Combine text from multiple CSV files by page')
248
- # parser.add_argument('input_pattern', help='Input file pattern (e.g., "input/*.csv")')
249
- # parser.add_argument('--output', '-o', default='combined_text.csv',
250
- # help='Output CSV file path (default: combined_text.csv)')
251
-
252
- # args = parser.parse_args()
253
-
254
- # # Get list of input files
255
- # input_files = glob.glob(args.input_pattern)
256
-
257
- # if not input_files:
258
- # print(f"No files found matching pattern: {args.input_pattern}")
259
- # return
260
-
261
- # print(f"Processing {len(input_files)} files...")
262
-
263
- # try:
264
- # # Combine the text from all files
265
- # combined_df = combine_ocr_output_text(input_files)
266
-
267
- # # Save to CSV
268
- # combined_df.to_csv(args.output, index=False)
269
- # print(f"Successfully created combined output: {args.output}")
270
- # print(f"Total pages processed: {len(combined_df)}")
271
-
272
- # except Exception as e:
273
- # print(f"Error processing files: {str(e)}")
274
-
275
- # if __name__ == "__main__":
276
- # main()
 
3
  #import glob
4
  import os
5
  import re
6
+ from tools.helper_functions import OUTPUT_FOLDER
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
+ # import nltk
10
+ # from nltk.corpus import stopwords
11
+ # from nltk.tokenize import word_tokenize
12
+ # from nltk.stem import PorterStemmer
13
+ #import spacy
14
  import numpy as np
15
  import random
16
  import string
17
  from typing import List
18
+ from gradio import Progress
19
 
20
+ import en_core_web_lg #en_core_web_sm
21
+ nlp = en_core_web_lg.load()
22
+ #from tqdm import tqdm
23
 
24
+ # nltk.download('punkt')
25
+ # nltk.download('stopwords')
26
+ # nltk.download('punkt_tab')
27
 
28
+ similarity_threshold = 0.9
 
 
29
 
 
 
 
 
 
 
30
 
31
+ def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
32
  """
33
  Combines text from multiple CSV files containing page and text columns.
34
  Groups text by file and page number, concatenating text within these groups.
 
88
 
89
  return combined_df, output_files
90
 
91
def process_data(df: pd.DataFrame, column: str):
    '''
    Clean and lemmatise a text column in a data frame.

    Adds a 'text_clean' column to df: HTML-like tags stripped and whitespace
    collapsed, then lemmatised via the module-level spaCy pipeline (`nlp`)
    keeping only alphabetic, non-stopword tokens. The frame is modified in
    place and also returned.
    '''

    def _strip_markup(raw_text):
        # Drop anything that looks like an HTML/XML tag, then collapse runs
        # of whitespace down to single spaces.
        without_tags = re.sub(r'<.*?>', '', raw_text)
        return ' '.join(without_tags.split())

    def _lemmatise(text):
        # Keep alphabetic, non-stopword tokens only, reduced to their lemmas.
        tokens = nlp(text)
        kept = [tok.lemma_ for tok in tokens if tok.is_alpha and not tok.is_stop]
        return ' '.join(kept)

    # Clean first, then lemmatise the cleaned text.
    df['text_clean'] = df[column].apply(_strip_markup).apply(_lemmatise)

    return df
135
 
136
+ def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9, output_folder:str=OUTPUT_FOLDER, progress=Progress(track_tqdm=True)):
 
137
  output_paths = []
138
+
139
+ progress(0.1, desc="Cleaning input texts")
140
 
141
+ # Load and clean data
142
  df, output_files = combine_ocr_output_text(input_files)
 
143
  output_paths.extend(output_files)
144
+ df = process_data(df, 'text') # Assume this returns 'text_clean', 'file', and 'page' columns
145
 
146
+ # Vectorize text
147
+ vectorizer = TfidfVectorizer()
 
 
148
  tfidf_matrix = vectorizer.fit_transform(df['text_clean'])
149
 
150
+ progress(0.3, desc="Calculating text similarity")
 
151
 
152
+ # Compute sparse cosine similarity
153
+ similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False) # Keep sparse format
 
154
 
155
+ # Extract indices of similar pages above threshold
156
+ coo_matrix = similarity_matrix.tocoo()
157
+ similar_pages = np.array([(i, j, v) for i, j, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data) if v > similarity_threshold])
158
 
159
+ if similar_pages.size == 0:
160
+ return pd.DataFrame(), output_paths # Return empty if no matches
161
+
162
+
 
 
 
 
163
 
164
+ # Create a DataFrame for similar pairs
165
+ similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
166
+
167
+ # Remove duplicate pairs (keep one direction)
168
  similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]
169
 
170
+ progress(0.8, desc="Mapping back results")
171
+ # Map indices to metadata
172
+ # index_map = df[['file', 'page', 'text']].to_dict(orient='index')
173
+ # similarity_df['Page1_File'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['file'])
174
+ # similarity_df['Page2_File'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['file'])
175
+ # similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['page'])
176
+ # similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['page'])
177
+ # similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['text'][0:200])
178
+ # similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['text'][0:200])
179
 
180
+ # Create a DataFrame with the metadata
181
+ metadata_df = df[['file', 'page', 'text']].reset_index()
182
 
183
+ # Merge to get the metadata for Page1
184
+ similarity_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_on='index', suffixes=('', '_Page1'))
185
+ similarity_df = similarity_df.rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
186
 
187
+ # Merge to get the metadata for Page2
188
+ similarity_df = similarity_df.merge(metadata_df, left_on='Page2_Index', right_on='index', suffixes=('', '_Page2'))
189
+ similarity_df = similarity_df.rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
190
+
191
+ # Optionally, drop the index columns if not needed
192
+ #similarity_df = similarity_df.drop(columns=['index_Page1', 'index_Page2'])
193
+
194
+
195
+ similarity_df["Similarity_Score"] = similarity_df["Similarity_Score"].round(3)
196
+
197
+ # Sort results
198
  similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
199
  similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])
200
 
201
+ similarity_df_out['Page1_Text'] = similarity_df_out['Page1_Text'][0:100]
202
+ similarity_df_out['Page2_Text'] = similarity_df_out['Page2_Text'][0:100]
203
+
204
+ progress(0.8, desc="Saving output files")
205
+
206
+ # Save results
207
  similarity_file_output_path = output_folder + 'page_similarity_results.csv'
208
  similarity_df_out.to_csv(similarity_file_output_path, index=False)
 
209
  output_paths.append(similarity_file_output_path)
210
 
211
+ # Save per-file redaction lists
212
+ for redact_file in similarity_df_out['Page2_File'].unique():
213
+ output_file_name = output_folder + redact_file + "_whole_page.csv"
214
+ whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, ['Page2_Page']].drop_duplicates(['Page2_Page']).sort_values('Page2_Page')
215
+ whole_pages_to_redact_df.to_csv(output_file_name, header=False, index=False)
216
+ output_paths.append(output_file_name)
 
 
 
217
 
218
  return similarity_df_out, output_paths
219
 
220
  # Perturb text
221
  # Apply the perturbation function with a 10% error probability
222
+ def perturb_text_with_errors(series:pd.Series):
223
 
224
  def _perturb_text(text, error_probability=0.1):
225
  words = text.split() # Split text into words
 
249
  series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))
250
 
251
  return series
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tools/helper_functions.py CHANGED
@@ -7,20 +7,21 @@ import pandas as pd
7
  import numpy as np
8
  import unicodedata
9
  from typing import List
 
10
  from gradio_image_annotation import image_annotator
11
- from tools.auth import user_pool_id
12
- from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, output_folder, session_output_folder
13
 
14
  # Names for options labels
15
  text_ocr_option = "Local model - selectable text"
16
  tesseract_ocr_option = "Local OCR model - PDFs without selectable text"
17
  textract_option = "AWS Textract service - all PDF types"
18
 
 
19
  local_pii_detector = "Local"
20
  aws_pii_detector = "AWS Comprehend"
21
 
22
  def reset_state_vars():
23
- return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
24
  label="Modify redaction boxes",
25
  label_list=["Redaction"],
26
  label_colors=[(0, 0, 0)],
@@ -30,7 +31,10 @@ def reset_state_vars():
30
  show_share_button=False,
31
  show_remove_button=False,
32
  interactive=False
33
- ), [], [], pd.DataFrame(), pd.DataFrame(), []
 
 
 
34
 
35
  def reset_review_vars():
36
  return pd.DataFrame(), pd.DataFrame()
@@ -40,6 +44,23 @@ def load_in_default_allow_list(allow_list_file_path):
40
  allow_list_file_path = [allow_list_file_path]
41
  return allow_list_file_path
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  def update_dataframe(df:pd.DataFrame):
44
  df_copy = df.copy()
45
  return df_copy
@@ -87,23 +108,20 @@ def read_file(filename):
87
  elif file_type == 'parquet':
88
  return pd.read_parquet(filename)
89
 
90
- def ensure_output_folder_exists():
91
- """Checks if the 'output/' folder exists, creates it if not."""
92
 
93
- folder_name = "output/"
94
-
95
- if not os.path.exists(folder_name):
96
  # Create the folder if it doesn't exist
97
- os.makedirs(folder_name)
98
- print(f"Created the 'output/' folder.")
99
  else:
100
- print(f"The 'output/' folder already exists.")
101
 
102
  def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
103
  '''
104
  When file is loaded, update the column dropdown choices and write to relevant data states.
105
  '''
106
-
107
  custom_regex_df = pd.DataFrame()
108
 
109
  if in_file:
@@ -113,7 +131,6 @@ def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
113
  if regex_file_names:
114
  regex_file_name = regex_file_names[0]
115
  custom_regex_df = pd.read_csv(regex_file_name, low_memory=False, header=None)
116
- #regex_file_name_no_ext = get_file_name_without_type(regex_file_name)
117
 
118
  # Select just first columns
119
  custom_regex_df = pd.DataFrame(custom_regex_df.iloc[:,[0]])
@@ -122,8 +139,6 @@ def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
122
  custom_regex_df.columns = custom_regex_df.columns.astype(str)
123
 
124
  output_text = file_type + " file loaded."
125
-
126
- print("Custom regex df:", custom_regex_df)
127
  print(output_text)
128
  else:
129
  output_text = "No file provided."
@@ -132,7 +147,7 @@ def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
132
 
133
  return output_text, custom_regex_df
134
 
135
- def put_columns_in_df(in_file):
136
  new_choices = []
137
  concat_choices = []
138
  all_sheet_names = []
@@ -176,6 +191,16 @@ def put_columns_in_df(in_file):
176
  else:
177
  return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(visible=False)
178
 
 
 
 
 
 
 
 
 
 
 
179
  # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
180
  def add_folder_to_path(folder_path: str):
181
  '''
@@ -202,7 +227,7 @@ def add_folder_to_path(folder_path: str):
202
  def reveal_feedback_buttons():
203
  return gr.Radio(visible=True, label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document."), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)
204
 
205
- def wipe_logs(feedback_logs_loc, usage_logs_loc):
206
  try:
207
  os.remove(feedback_logs_loc)
208
  except Exception as e:
@@ -212,7 +237,7 @@ def wipe_logs(feedback_logs_loc, usage_logs_loc):
212
  except Exception as e:
213
  print("Could not remove usage logs file", e)
214
 
215
- def merge_csv_files(file_list):
216
 
217
  # Initialise an empty list to hold DataFrames
218
  dataframes = []
@@ -246,9 +271,9 @@ def merge_csv_files(file_list):
246
 
247
  return output_files
248
 
249
- async def get_connection_params(request: gr.Request, output_folder_textbox:str='/output/'):
250
 
251
- print("Session hash:", request.session_hash)
252
 
253
  if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
254
  if CUSTOM_HEADER in request.headers:
@@ -266,11 +291,11 @@ async def get_connection_params(request: gr.Request, output_folder_textbox:str='
266
 
267
  if request.username:
268
  out_session_hash = request.username
269
- print("Request username found:", out_session_hash)
270
 
271
  elif 'x-cognito-id' in request.headers:
272
  out_session_hash = request.headers['x-cognito-id']
273
- print("Cognito ID found:", out_session_hash)
274
 
275
  elif 'x-amzn-oidc-identity' in request.headers:
276
  out_session_hash = request.headers['x-amzn-oidc-identity']
@@ -279,7 +304,7 @@ async def get_connection_params(request: gr.Request, output_folder_textbox:str='
279
  cognito_client = boto3.client('cognito-idp')
280
  try:
281
  response = cognito_client.admin_get_user(
282
- UserPoolId=user_pool_id, # Replace with your User Pool ID
283
  Username=out_session_hash
284
  )
285
  email = next(attr['Value'] for attr in response['UserAttributes'] if attr['Name'] == 'email')
@@ -297,19 +322,19 @@ async def get_connection_params(request: gr.Request, output_folder_textbox:str='
297
 
298
  if session_output_folder == 'True':
299
  output_folder = output_folder_textbox + out_session_hash + "/"
 
300
  else:
301
  output_folder = output_folder_textbox
 
302
 
303
- if not os.path.exists(output_folder):
304
- os.mkdir(output_folder)
305
 
306
- #if bucket_name:
307
- # print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
308
 
309
- return out_session_hash, output_folder, out_session_hash
310
 
311
- def clean_unicode_text(text):
312
- # Step 1: Normalize unicode characters to decompose any special forms
313
  normalized_text = unicodedata.normalize('NFKC', text)
314
 
315
  # Step 2: Replace smart quotes and special punctuation with standard ASCII equivalents
@@ -329,7 +354,7 @@ def clean_unicode_text(text):
329
 
330
  return cleaned_text
331
 
332
- def load_all_output_files(folder_path:str=output_folder) -> List[str]:
333
  """Get the file paths of all files in the given folder."""
334
  file_paths = []
335
 
@@ -343,4 +368,121 @@ def load_all_output_files(folder_path:str=output_folder) -> List[str]:
343
 
344
  return file_paths
345
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
 
 
7
  import numpy as np
8
  import unicodedata
9
  from typing import List
10
+ from math import ceil
11
  from gradio_image_annotation import image_annotator
12
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID
 
13
 
14
  # Names for options labels
15
  text_ocr_option = "Local model - selectable text"
16
  tesseract_ocr_option = "Local OCR model - PDFs without selectable text"
17
  textract_option = "AWS Textract service - all PDF types"
18
 
19
+ no_redaction_option = "Only extract text (no redaction)"
20
  local_pii_detector = "Local"
21
  aws_pii_detector = "AWS Comprehend"
22
 
23
  def reset_state_vars():
24
+ return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
25
  label="Modify redaction boxes",
26
  label_list=["Redaction"],
27
  label_colors=[(0, 0, 0)],
 
31
  show_share_button=False,
32
  show_remove_button=False,
33
  interactive=False
34
+ ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], ""
35
+
36
+ def reset_ocr_results_state():
37
+ return pd.DataFrame(), pd.DataFrame(), []
38
 
39
  def reset_review_vars():
40
  return pd.DataFrame(), pd.DataFrame()
 
44
  allow_list_file_path = [allow_list_file_path]
45
  return allow_list_file_path
46
 
47
+ def load_in_default_cost_codes(cost_codes_path:str):
48
+ cost_codes_df = pd.read_csv(cost_codes_path)
49
+
50
+ dropdown_choices = cost_codes_df.iloc[:,0].to_list()
51
+ dropdown_choices.insert(0, "")
52
+
53
+
54
+ out_dropdown = gr.Dropdown(value="", label="Choose cost code for analysis", choices=dropdown_choices, allow_custom_value=True)
55
+
56
+ return cost_codes_df, out_dropdown
57
+
58
+ def enforce_cost_codes(enforce_cost_code_textbox, cost_code_choice):
59
+ if enforce_cost_code_textbox == "True":
60
+ if not cost_code_choice:
61
+ raise Exception("Please choose a cost code before continuing")
62
+ return
63
+
64
  def update_dataframe(df:pd.DataFrame):
65
  df_copy = df.copy()
66
  return df_copy
 
108
  elif file_type == 'parquet':
109
  return pd.read_parquet(filename)
110
 
111
+ def ensure_output_folder_exists(output_folder:str):
112
+ """Checks if the specified folder exists, creates it if not."""
113
 
114
+ if not os.path.exists(output_folder):
 
 
115
  # Create the folder if it doesn't exist
116
+ os.makedirs(output_folder)
117
+ print(f"Created the {output_folder} folder.")
118
  else:
119
+ print(f"The {output_folder} folder already exists.")
120
 
121
  def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
122
  '''
123
  When file is loaded, update the column dropdown choices and write to relevant data states.
124
  '''
 
125
  custom_regex_df = pd.DataFrame()
126
 
127
  if in_file:
 
131
  if regex_file_names:
132
  regex_file_name = regex_file_names[0]
133
  custom_regex_df = pd.read_csv(regex_file_name, low_memory=False, header=None)
 
134
 
135
  # Select just first columns
136
  custom_regex_df = pd.DataFrame(custom_regex_df.iloc[:,[0]])
 
139
  custom_regex_df.columns = custom_regex_df.columns.astype(str)
140
 
141
  output_text = file_type + " file loaded."
 
 
142
  print(output_text)
143
  else:
144
  output_text = "No file provided."
 
147
 
148
  return output_text, custom_regex_df
149
 
150
+ def put_columns_in_df(in_file:List[str]):
151
  new_choices = []
152
  concat_choices = []
153
  all_sheet_names = []
 
191
  else:
192
  return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(visible=False)
193
 
194
+ def check_for_existing_textract_file(doc_file_name_no_extension_textbox:str, output_folder:str=OUTPUT_FOLDER):
195
+ textract_output_path = os.path.join(output_folder, doc_file_name_no_extension_textbox + "_textract.json")
196
+
197
+ if os.path.exists(textract_output_path):
198
+ print("Existing Textract file found.")
199
+ return True
200
+
201
+ else:
202
+ return False
203
+
204
  # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
205
  def add_folder_to_path(folder_path: str):
206
  '''
 
227
  def reveal_feedback_buttons():
228
  return gr.Radio(visible=True, label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document."), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)
229
 
230
+ def wipe_logs(feedback_logs_loc:str, usage_logs_loc:str):
231
  try:
232
  os.remove(feedback_logs_loc)
233
  except Exception as e:
 
237
  except Exception as e:
238
  print("Could not remove usage logs file", e)
239
 
240
+ def merge_csv_files(file_list:List[str], output_folder:str=OUTPUT_FOLDER):
241
 
242
  # Initialise an empty list to hold DataFrames
243
  dataframes = []
 
271
 
272
  return output_files
273
 
274
+ async def get_connection_params(request: gr.Request, output_folder_textbox:str=OUTPUT_FOLDER, input_folder_textbox:str=INPUT_FOLDER, session_output_folder:str=SESSION_OUTPUT_FOLDER):
275
 
276
+ #print("Session hash:", request.session_hash)
277
 
278
  if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
279
  if CUSTOM_HEADER in request.headers:
 
291
 
292
  if request.username:
293
  out_session_hash = request.username
294
+ #print("Request username found:", out_session_hash)
295
 
296
  elif 'x-cognito-id' in request.headers:
297
  out_session_hash = request.headers['x-cognito-id']
298
+ #print("Cognito ID found:", out_session_hash)
299
 
300
  elif 'x-amzn-oidc-identity' in request.headers:
301
  out_session_hash = request.headers['x-amzn-oidc-identity']
 
304
  cognito_client = boto3.client('cognito-idp')
305
  try:
306
  response = cognito_client.admin_get_user(
307
+ UserPoolId=AWS_USER_POOL_ID, # Replace with your User Pool ID
308
  Username=out_session_hash
309
  )
310
  email = next(attr['Value'] for attr in response['UserAttributes'] if attr['Name'] == 'email')
 
322
 
323
  if session_output_folder == 'True':
324
  output_folder = output_folder_textbox + out_session_hash + "/"
325
+ input_folder = input_folder_textbox + out_session_hash + "/"
326
  else:
327
  output_folder = output_folder_textbox
328
+ input_folder = input_folder_textbox
329
 
330
+ if not os.path.exists(output_folder): os.mkdir(output_folder)
331
+ if not os.path.exists(input_folder): os.mkdir(input_folder)
332
 
 
 
333
 
334
+ return out_session_hash, output_folder, out_session_hash, input_folder
335
 
336
+ def clean_unicode_text(text:str):
337
+ # Step 1: Normalise unicode characters to decompose any special forms
338
  normalized_text = unicodedata.normalize('NFKC', text)
339
 
340
  # Step 2: Replace smart quotes and special punctuation with standard ASCII equivalents
 
354
 
355
  return cleaned_text
356
 
357
+ def load_all_output_files(folder_path:str=OUTPUT_FOLDER) -> List[str]:
358
  """Get the file paths of all files in the given folder."""
359
  file_paths = []
360
 
 
368
 
369
  return file_paths
370
 
371
+ def calculate_aws_costs(number_of_pages:str,
372
+ text_extract_method_radio:str,
373
+ handwrite_signature_checkbox:List[str],
374
+ pii_identification_method:str,
375
+ textract_output_found_checkbox:bool,
376
+ only_extract_text_radio:bool,
377
+ textract_page_cost:float=1.5/1000,
378
+ textract_signature_cost:float=2.0/1000,
379
+ comprehend_unit_cost:float=0.0001,
380
+ comprehend_size_unit_average:float=250,
381
+ average_characters_per_page:float=2000,
382
+ textract_option:str=textract_option,
383
+ no_redaction_option:str=no_redaction_option,
384
+ aws_pii_detector:str=aws_pii_detector):
385
+ '''
386
+ Calculate the approximate cost of submitting a document to AWS Textract and/or AWS Comprehend, assuming that Textract outputs do not already exist in the output folder.
387
+
388
+ - number_of_pages: The number of pages in the uploaded document(s).
389
+ - text_extract_method_radio: The method of text extraction.
390
+ - handwrite_signature_checkbox: Whether signatures are being extracted or not.
391
+ - pii_identification_method_drop: The method of personally-identifiable information removal.
392
+ - textract_output_found_checkbox: Whether existing Textract results have been found in the output folder. Assumes that results exist for all pages and files in the output folder.
393
+ - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
394
+ - textract_page_cost (float, optional): AWS pricing for Textract text extraction per page ($).
395
+ - textract_signature_cost (float, optional): Additional AWS cost above standard AWS Textract extraction for extracting signatures.
396
+ - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
397
+ - comprehend_size_unit_average (float, optional): Average size of a 'unit' of text passed to AWS Comprehend by the app through the batching process
398
+ - average_characters_per_page (float, optional): Average number of characters on an A4 page.
399
+ - textract_option (str, optional): String label for the text_extract_method_radio button for AWS Textract.
400
+ - no_redaction_option (str, optional): String label for pii_identification_method_drop for no redaction.
401
+ - aws_pii_detector (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
402
+ '''
403
+ text_extraction_cost = 0
404
+ pii_identification_cost = 0
405
+ calculated_aws_cost = 0
406
+ number_of_pages = int(number_of_pages)
407
+
408
+ if textract_output_found_checkbox != True:
409
+ if text_extract_method_radio == textract_option:
410
+ text_extraction_cost = number_of_pages * textract_page_cost
411
+
412
+ if "Extract signatures" in handwrite_signature_checkbox:
413
+ text_extraction_cost += (textract_signature_cost * number_of_pages)
414
+
415
+ if pii_identification_method != no_redaction_option:
416
+ if pii_identification_method == aws_pii_detector:
417
+ comprehend_page_cost = ceil(average_characters_per_page / comprehend_size_unit_average) * comprehend_unit_cost
418
+ pii_identification_cost = comprehend_page_cost * number_of_pages
419
+
420
+ calculated_aws_cost = calculated_aws_cost + text_extraction_cost + pii_identification_cost
421
+
422
+ return calculated_aws_cost
423
+
424
+ def calculate_time_taken(number_of_pages:str,
425
+ text_extract_method_radio:str,
426
+ pii_identification_method:str,
427
+ textract_output_found_checkbox:bool,
428
+ only_extract_text_radio:bool,
429
+ convert_page_time:float=0.5,
430
+ textract_page_time:float=1,
431
+ comprehend_page_time:float=1,
432
+ local_text_extraction_page_time:float=0.3,
433
+ local_pii_redaction_page_time:float=0.5,
434
+ local_ocr_extraction_page_time:float=1.5,
435
+ textract_option:str=textract_option,
436
+ text_ocr_option:str=text_ocr_option,
437
+ local_ocr_option:str=tesseract_ocr_option,
438
+ no_redaction_option:str=no_redaction_option,
439
+ aws_pii_detector:str=aws_pii_detector):
440
+ '''
441
+ Calculate the approximate time to redact a document.
442
+
443
+ - number_of_pages: The number of pages in the uploaded document(s).
444
+ - text_extract_method_radio: The method of text extraction.
445
+ - pii_identification_method_drop: The method of personally-identifiable information removal.
446
+ - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
447
+ - textract_page_time (float, optional): Approximate time to query AWS Textract.
448
+ - comprehend_page_time (float, optional): Approximate time to query text on a page with AWS Comprehend.
449
+ - local_text_redaction_page_time (float, optional): Approximate time to extract text on a page with the local text redaction option.
450
+ - local_pii_redaction_page_time (float, optional): Approximate time to redact text on a page with the local text redaction option.
451
+ - local_ocr_extraction_page_time (float, optional): Approximate time to extract text from a page with the local OCR redaction option.
452
+ - textract_option (str, optional): String label for the text_extract_method_radio button for AWS Textract.
453
+ - text_ocr_option (str, optional): String label for text_extract_method_radio for text extraction.
454
+ - local_ocr_option (str, optional): String label for text_extract_method_radio for local OCR.
455
+ - no_redaction_option (str, optional): String label for pii_identification_method_drop for no redaction.
456
+ - aws_pii_detector (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
457
+ '''
458
+ calculated_time_taken = 0
459
+ page_conversion_time_taken = 0
460
+ page_extraction_time_taken = 0
461
+ page_redaction_time_taken = 0
462
+
463
+ number_of_pages = int(number_of_pages)
464
+
465
+ # Page preparation/conversion to image time
466
+ if (text_extract_method_radio != text_ocr_option) and (textract_output_found_checkbox != True):
467
+ page_conversion_time_taken = number_of_pages * convert_page_time
468
+
469
+ # Page text extraction time
470
+ if text_extract_method_radio == textract_option:
471
+ if textract_output_found_checkbox != True:
472
+ page_extraction_time_taken = number_of_pages * textract_page_time
473
+ elif text_extract_method_radio == local_ocr_option:
474
+ page_extraction_time_taken = number_of_pages * local_ocr_extraction_page_time
475
+ elif text_extract_method_radio == text_ocr_option:
476
+ page_conversion_time_taken = number_of_pages * local_text_extraction_page_time
477
+
478
+ # Page redaction time
479
+ if pii_identification_method != no_redaction_option:
480
+ if pii_identification_method == aws_pii_detector:
481
+ page_redaction_time_taken = number_of_pages * comprehend_page_time
482
+ else:
483
+ page_redaction_time_taken = number_of_pages * local_pii_redaction_page_time
484
+
485
+ calculated_time_taken = (page_conversion_time_taken + page_extraction_time_taken + page_redaction_time_taken)/60
486
+
487
+ return calculated_time_taken
488
 
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -1,4 +1,3 @@
1
- # %%
2
  from typing import List
3
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
 
 
1
  from typing import List
2
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
3
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
tools/redaction_review.py CHANGED
@@ -13,10 +13,9 @@ from pymupdf import Document, Rect
13
  import pymupdf
14
  #from fitz
15
  from PIL import ImageDraw, Image
16
- from collections import defaultdict
17
 
18
- from tools.config import output_folder, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS
19
- from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json
20
  from tools.helper_functions import get_file_name_without_type, detect_file_type
21
  from tools.file_redaction import redact_page_with_pymupdf
22
 
@@ -26,21 +25,20 @@ def decrease_page(number:int):
26
  '''
27
  Decrease page number for review redactions page.
28
  '''
29
- #print("number:", str(number))
30
  if number > 1:
31
  return number - 1, number - 1
32
  else:
33
  return 1, 1
34
 
35
- def increase_page(number:int, image_annotator_object:AnnotatedImageData):
36
  '''
37
  Increase page number for review redactions page.
38
  '''
39
 
40
- if not image_annotator_object:
41
  return 1, 1
42
 
43
- max_pages = len(image_annotator_object)
44
 
45
  if number < max_pages:
46
  return number + 1, number + 1
@@ -57,48 +55,35 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
57
 
58
  return current_zoom_level, annotate_current_page
59
 
60
- def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
61
- '''
62
- Remove items from the annotator object where the same page exists twice.
63
- '''
64
- # Group items by 'image'
65
- image_groups = defaultdict(list)
66
- for item in data:
67
- image_groups[item['image']].append(item)
68
-
69
- # Process each group to prioritize items with non-empty boxes
70
- result = []
71
- for image, items in image_groups.items():
72
- # Filter items with non-empty boxes
73
- non_empty_boxes = [item for item in items if item.get('boxes')]
74
-
75
- # Remove 'text' elements from boxes
76
- for item in non_empty_boxes:
77
- if 'boxes' in item:
78
- item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']]
79
-
80
- if non_empty_boxes:
81
- # Keep the first entry with non-empty boxes
82
- result.append(non_empty_boxes[0])
83
- else:
84
- # If all items have empty or missing boxes, keep the first item
85
- result.append(items[0])
86
-
87
- return result
88
 
89
  def update_dropdown_list_based_on_dataframe(df:pd.DataFrame, column:str) -> List["str"]:
90
  '''
91
  Gather unique elements from a string pandas Series, then append 'ALL' to the start and return the list.
92
  '''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
- entities = df[column].astype(str).unique().tolist()
95
- entities_for_drop = sorted(entities)
96
- entities_for_drop.insert(0, "ALL")
97
-
98
- return entities_for_drop
99
-
100
- def get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object:AnnotatedImageData,
101
- recogniser_dataframe_modified:pd.DataFrame,
102
  recogniser_dropdown_value:str,
103
  text_dropdown_value:str,
104
  page_dropdown_value:str,
@@ -109,10 +94,12 @@ def get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object:Annot
109
  '''
110
 
111
  recogniser_entities_list = ["Redaction"]
112
- recogniser_dataframe_out = recogniser_dataframe_modified
 
 
113
 
114
  try:
115
- review_dataframe = convert_annotation_json_to_review_df(image_annotator_object, review_df, page_sizes)
116
 
117
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
118
  recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
@@ -127,49 +114,96 @@ def get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object:Annot
127
  page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
128
  page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
129
 
130
- recogniser_dataframe_out = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"])
 
 
131
 
132
  except Exception as e:
133
  print("Could not extract recogniser information:", e)
134
- recogniser_dataframe_out = recogniser_dataframe_modified[["page", "label", "text"]]
135
 
136
- recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=recogniser_dataframe_out["label"].astype(str).unique().tolist(), allow_custom_value=True, interactive=True)
 
 
 
 
137
  recogniser_entities_list = ["Redaction"]
138
- text_entities_drop = gr.Dropdown(value=text_dropdown_value, choices=recogniser_dataframe_out["text"].astype(str).unique().tolist(), allow_custom_value=True, interactive=True)
139
- page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=recogniser_dataframe_out["page"].astype(str).unique().tolist(), allow_custom_value=True, interactive=True)
140
 
141
- return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
142
 
143
- def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, recogniser_dataframe_modified:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=[], page_sizes:list[str]=[]):
144
  '''
145
  Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
146
  '''
147
  recogniser_entities_list = ["Redaction"]
148
  recogniser_dataframe_out = pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
 
149
 
150
- if recogniser_dataframe_modified.empty:
151
- recogniser_dataframe_modified, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop = get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
152
- elif recogniser_dataframe_modified.iloc[0,0] == "":
153
- recogniser_dataframe_modified, recogniser_dataframe_out, recogniser_entities_dropdown_value, recogniser_entities_list, text_entities_drop, page_entities_drop = get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
154
- else:
155
- review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_modified, page_dropdown_value, text_dropdown_value)
156
- recogniser_dataframe_out = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"])
157
 
158
- recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_modified, "label")
159
  recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
160
 
161
- recogniser_entities_list_base = recogniser_dataframe_modified["label"].astype(str).unique().tolist()
162
 
163
  # Recogniser entities list is the list of choices that appear when you make a new redaction box
164
  recogniser_entities_list = [entity for entity in recogniser_entities_list_base if entity != 'Redaction']
165
  recogniser_entities_list.insert(0, 'Redaction')
166
 
167
- return recogniser_entities_list, recogniser_dataframe_out, recogniser_dataframe_modified, recogniser_entities_drop, text_entities_drop, page_entities_drop
168
 
169
  def undo_last_removal(backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base):
170
  return backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
171
 
172
- def exclude_selected_items_from_redaction(review_df: pd.DataFrame, selected_rows_df: pd.DataFrame, image_file_paths:List[str], page_sizes:List[dict], image_annotations_state:dict, recogniser_entity_dataframe_base:pd.DataFrame):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  '''
174
  Remove selected items from the review dataframe from the annotation object and review dataframe.
175
  '''
@@ -180,7 +214,7 @@ def exclude_selected_items_from_redaction(review_df: pd.DataFrame, selected_rows
180
 
181
  if not selected_rows_df.empty and not review_df.empty:
182
  # Ensure selected_rows_df has the same relevant columns
183
- selected_subset = selected_rows_df[['label', 'page', 'text']].drop_duplicates()
184
 
185
  # Perform anti-join using merge with an indicator column
186
  merged_df = review_df.merge(selected_subset, on=['label', 'page', 'text'], how='left', indicator=True)
@@ -189,6 +223,7 @@ def exclude_selected_items_from_redaction(review_df: pd.DataFrame, selected_rows
189
  out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
190
 
191
  out_image_annotations_state = convert_review_df_to_annotation_json(out_review_df, image_file_paths, page_sizes)
 
192
  out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]
193
 
194
  # Either there is nothing left in the selection dataframe, or the review dataframe
@@ -196,90 +231,118 @@ def exclude_selected_items_from_redaction(review_df: pd.DataFrame, selected_rows
196
  out_review_df = review_df
197
  out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
198
 
199
- out_image_annotations_state = []
200
-
201
- for page_no, page in enumerate(image_file_paths):
202
- annotation = {}
203
- annotation["image"] = image_file_paths[page_no]
204
- annotation["boxes"] = []
205
 
206
- out_image_annotations_state.append(annotation)
207
-
208
  return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
209
 
210
- def update_annotator(image_annotator_object:AnnotatedImageData,
211
- page_num:int,
212
- recogniser_entities_dropdown_value:str="ALL",
213
- page_dropdown_value:str="ALL",
214
- text_dropdown_value:str="ALL",
215
- recogniser_dataframe_modified=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), type="pandas", headers=["page", "label", "text"]), zoom:int=100,
216
- review_df:pd.DataFrame=[],
217
- page_sizes:List[dict]=[]):
 
 
 
 
218
  '''
219
  Update a gradio_image_annotation object with new annotation data.
220
  '''
221
- # First, update the dataframe containing the found recognisers
222
- recogniser_entities_list, recogniser_dataframe_out, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_entities_drop, page_entities_drop = update_recogniser_dataframes(image_annotator_object, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
 
223
 
224
- #print("Creating output annotator object in update_annotator function")
 
 
 
 
 
225
 
226
- zoom_str = str(zoom) + '%'
227
- recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
 
228
 
229
- #print("recogniser_entities_list:", recogniser_entities_list)
230
- #print("recogniser_colour_list:", recogniser_colour_list)
231
- #print("zoom_str:", zoom_str)
232
 
233
- if not image_annotator_object:
234
- page_num_reported = 1
235
 
236
- out_image_annotator = image_annotator(
237
- None,
238
- boxes_alpha=0.1,
239
- box_thickness=1,
240
- label_list=recogniser_entities_list,
241
- label_colors=recogniser_colour_list,
242
- show_label=False,
243
- height=zoom_str,
244
- width=zoom_str,
245
- box_min_size=1,
246
- box_selected_thickness=2,
247
- handle_size=4,
248
- sources=None,#["upload"],
249
- show_clear_button=False,
250
- show_share_button=False,
251
- show_remove_button=False,
252
- handles_cursor=True,
253
- interactive=True
254
- )
255
- number_reported = gr.Number(label = "Current page", value=page_num_reported, precision=0)
256
 
257
- return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out, recogniser_dataframe_modified, text_entities_drop, page_entities_drop
 
 
 
 
 
 
 
 
 
 
258
 
259
- #print("page_num at start of update_annotator function:", page_num)
 
 
 
 
 
 
 
260
 
261
- if page_num is None:
262
- page_num = 0
263
 
264
- # Check bounding values for current page and page max
265
- if page_num > 0:
266
- page_num_reported = page_num
267
 
268
- elif page_num == 0: page_num_reported = 1
269
 
270
- else:
271
- page_num = 0
272
- page_num_reported = 1
 
 
273
 
274
- page_max_reported = len(image_annotator_object)
 
 
 
275
 
276
- if page_num_reported > page_max_reported:
277
- page_num_reported = page_max_reported
278
 
279
- image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
 
 
 
280
 
281
- out_image_annotator = image_annotator(
282
- value = image_annotator_object[page_num_reported - 1],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  boxes_alpha=0.1,
284
  box_thickness=1,
285
  label_list=recogniser_entities_list,
@@ -296,47 +359,99 @@ def update_annotator(image_annotator_object:AnnotatedImageData,
296
  show_remove_button=False,
297
  handles_cursor=True,
298
  interactive=True
299
- )
300
 
301
- number_reported = gr.Number(label = "Current page", value=page_num_reported, precision=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
- return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out, recogniser_dataframe_modified, text_entities_drop, page_entities_drop
304
 
305
- def modify_existing_page_redactions(image_annotator_object:AnnotatedImageData,
 
306
  current_page:int,
307
  previous_page:int,
308
  all_image_annotations:List[AnnotatedImageData],
309
- recogniser_entities_dropdown_value="ALL",
310
- text_dropdown_value="ALL",
311
- page_dropdown_value="ALL",
312
- recogniser_dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"]),
313
- review_dataframe:pd.DataFrame=[],
314
  page_sizes:List[dict]=[],
315
  clear_all:bool=False
316
  ):
317
  '''
318
- Overwrite current image annotations with modifications
319
  '''
320
 
321
- if not current_page:
322
- current_page = 1
 
323
 
324
- image_annotator_object['image'] = all_image_annotations[previous_page - 1]["image"]
 
 
325
 
326
- if clear_all == False:
327
- all_image_annotations[previous_page - 1] = image_annotator_object
328
- else:
329
- all_image_annotations[previous_page - 1]["boxes"] = []
330
 
331
  return all_image_annotations, current_page, current_page
332
 
333
- def apply_redactions(image_annotator_object:AnnotatedImageData,
334
  file_paths:List[str],
335
  doc:Document,
336
  all_image_annotations:List[AnnotatedImageData],
337
  current_page:int,
338
  review_file_state:pd.DataFrame,
339
- output_folder:str = output_folder,
340
  save_pdf:bool=True,
341
  page_sizes:List[dict]=[],
342
  progress=gr.Progress(track_tqdm=True)):
@@ -347,22 +462,22 @@ def apply_redactions(image_annotator_object:AnnotatedImageData,
347
  output_files = []
348
  output_log_files = []
349
  pdf_doc = []
 
350
 
351
- #print("File paths in apply_redactions:", file_paths)
352
 
353
- image_annotator_object['image'] = all_image_annotations[current_page - 1]["image"]
 
 
354
 
355
- all_image_annotations[current_page - 1] = image_annotator_object
356
-
357
- if not image_annotator_object:
358
- print("No image annotations found")
359
- return doc, all_image_annotations
360
 
361
  if isinstance(file_paths, str):
362
  file_paths = [file_paths]
363
 
364
  for file_path in file_paths:
365
- #print("file_path:", file_path)
366
  file_name_without_ext = get_file_name_without_type(file_path)
367
  file_name_with_ext = os.path.basename(file_path)
368
 
@@ -373,11 +488,9 @@ def apply_redactions(image_annotator_object:AnnotatedImageData,
373
  if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
374
  image = Image.open(file_paths[-1])
375
 
376
- #image = pdf_doc
377
-
378
  draw = ImageDraw.Draw(image)
379
 
380
- for img_annotation_box in image_annotator_object['boxes']:
381
  coords = [img_annotation_box["xmin"],
382
  img_annotation_box["ymin"],
383
  img_annotation_box["xmax"],
@@ -385,6 +498,25 @@ def apply_redactions(image_annotator_object:AnnotatedImageData,
385
 
386
  fill = img_annotation_box["color"]
387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
  draw.rectangle(coords, fill=fill)
389
 
390
  output_image_path = output_folder + file_name_without_ext + "_redacted.png"
@@ -392,12 +524,10 @@ def apply_redactions(image_annotator_object:AnnotatedImageData,
392
 
393
  output_files.append(output_image_path)
394
 
395
- print("Redactions saved to image file")
396
-
397
  doc = [image]
398
 
399
  elif file_extension in '.csv':
400
- print("This is a csv")
401
  pdf_doc = []
402
 
403
  # If working with pdfs
@@ -410,46 +540,38 @@ def apply_redactions(image_annotator_object:AnnotatedImageData,
410
  number_of_pages = pdf_doc.page_count
411
  original_cropboxes = []
412
 
413
- print("Saving pages to file.")
 
414
 
415
  for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
416
-
417
- #print("Saving page", str(i))
418
-
419
  image_loc = all_image_annotations[i]['image']
420
- #print("Image location:", image_loc)
421
 
422
  # Load in image object
423
  if isinstance(image_loc, np.ndarray):
424
  image = Image.fromarray(image_loc.astype('uint8'))
425
- #all_image_annotations[i]['image'] = image_loc.tolist()
426
  elif isinstance(image_loc, Image.Image):
427
  image = image_loc
428
- #image_out_folder = output_folder + file_name_without_ext + "_page_" + str(i) + ".png"
429
- #image_loc.save(image_out_folder)
430
- #all_image_annotations[i]['image'] = image_out_folder
431
  elif isinstance(image_loc, str):
432
- image = Image.open(image_loc)
433
-
434
-
435
- #print("all_image_annotations for page:", all_image_annotations[i])
436
- #print("image:", image)
 
437
 
438
  pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
439
  original_cropboxes.append(pymupdf_page.cropbox.irect)
440
  pymupdf_page.set_cropbox = pymupdf_page.mediabox
441
- #print("pymupdf_page:", pymupdf_page)
442
- # print("original_cropboxes:", original_cropboxes)
443
-
444
- pymupdf_page = redact_page_with_pymupdf(page=pymupdf_page, page_annotations=all_image_annotations[i], image=image, original_cropbox=original_cropboxes[-1])
445
 
 
446
  else:
447
  print("File type not recognised.")
448
 
449
  #try:
450
  if pdf_doc:
451
  out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
452
- pdf_doc.save(out_pdf_file_path)
453
  output_files.append(out_pdf_file_path)
454
 
455
  else:
@@ -462,38 +584,60 @@ def apply_redactions(image_annotator_object:AnnotatedImageData,
462
  output_files.append(orig_pdf_file_path)
463
 
464
  try:
465
- #print("Saving annotations to JSON")
466
-
467
- # out_annotation_file_path = output_folder + file_name_with_ext + '_review_file.json'
468
- # with open(out_annotation_file_path, 'w') as f:
469
- # json.dump(all_image_annotations, f)
470
- # output_log_files.append(out_annotation_file_path)
471
-
472
- #print("Saving annotations to CSV review file")
473
- #print("all_image_annotations before conversion in apply redactions:", all_image_annotations)
474
- #print("review_file_state before conversion in apply redactions:", review_file_state)
475
- #print("page_sizes before conversion in apply redactions:", page_sizes)
476
-
477
- # Convert json to csv and also save this
478
- review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state, page_sizes=page_sizes)[["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"]]
479
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
480
 
481
- #print("Saving review file after convert_annotation_json_to_review_df function in apply redactions")
482
  review_df.to_csv(out_review_file_file_path, index=None)
483
  output_files.append(out_review_file_file_path)
484
 
485
  except Exception as e:
486
  print("In apply redactions function, could not save annotations to csv file:", e)
487
 
488
- return doc, all_image_annotations, output_files, output_log_files
489
 
490
  def get_boxes_json(annotations:AnnotatedImageData):
491
  return annotations["boxes"]
492
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  def update_entities_df_recogniser_entities(choice:str, df:pd.DataFrame, page_dropdown_value:str, text_dropdown_value:str):
494
  '''
495
  Update the rows in a dataframe depending on the user choice from a dropdown
496
  '''
 
497
  if isinstance(choice, str):
498
  choice = [choice]
499
  if isinstance(page_dropdown_value, str):
@@ -592,20 +736,86 @@ def update_entities_df_text(choice:str, df:pd.DataFrame, label_dropdown_value:st
592
 
593
  return filtered_df, recogniser_entities_drop, page_entities_drop
594
 
595
- def reset_dropdowns():
596
  '''
597
  Return Gradio dropdown objects with value 'ALL'.
598
  '''
599
- return gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True)
 
 
 
 
 
 
 
 
 
 
600
 
601
  def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
602
 
603
  row_value_page = evt.row_value[0] # This is the page number value
 
 
 
 
 
 
604
 
605
- if isinstance(row_value_page, list):
606
- row_value_page = row_value_page[0]
607
 
608
- return row_value_page
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609
 
610
  def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
611
  '''
@@ -644,27 +854,23 @@ def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, i
644
 
645
  return pdf_x1, pdf_y1, pdf_x2, pdf_y2
646
 
647
- def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float):
648
  """
649
  Converts coordinates from PyMuPDF (fitz) space to Adobe PDF space.
650
 
651
  Parameters:
652
- - pdf_page_width: Width of the PDF page
653
- - pdf_page_height: Height of the PDF page
654
  - x1, y1, x2, y2: Coordinates in PyMuPDF space
 
655
 
656
  Returns:
657
  - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
658
  """
659
-
660
- # PyMuPDF and Adobe PDF coordinates are similar, but ensure y1 is always the lower value
661
- pdf_x1, pdf_x2 = x1, x2
662
-
663
- # Ensure y1 is the bottom coordinate and y2 is the top
664
- pdf_y1, pdf_y2 = min(y1, y2), max(y1, y2)
665
-
666
- return pdf_x1, pdf_y1, pdf_x2, pdf_y2
667
 
 
 
 
 
 
668
 
669
  def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str], document_cropboxes:List=[], page_sizes:List[dict]=[]):
670
  '''
@@ -687,45 +893,49 @@ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, i
687
  page_sizes_df = pd.DataFrame(page_sizes)
688
 
689
  # If there are no image coordinates, then convert coordinates to pymupdf coordinates prior to export
690
- if len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == len(page_sizes_df["image_width"]):
691
- print("No image dimensions found, using pymupdf coordinates for conversion.")
692
 
693
- if "mediabox_width" not in review_file_df.columns:
694
- review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
695
-
696
- # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
697
- if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
698
- review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["mediabox_width"]
699
- review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["mediabox_width"]
700
- review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["mediabox_height"]
701
- review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["mediabox_height"]
702
 
703
- pages_are_images = False
 
 
 
 
 
 
 
 
 
 
 
 
 
704
 
705
- # If no nulls, then can do image coordinate conversion
706
- elif len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == 0:
707
 
708
- if "image_width" not in review_file_df.columns:
709
- review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
710
 
711
- # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
712
- if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
713
- review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
714
- review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
715
- review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
716
- review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
717
-
718
- pages_are_images = True
719
 
720
  # Go through each row of the review_file_df, create an entry in the output Adobe xfdf file.
721
  for _, row in review_file_df.iterrows():
 
722
  page_python_format = int(row["page"])-1
723
 
724
  pymupdf_page = pymupdf_doc.load_page(page_python_format)
725
 
726
  # Load cropbox sizes. Set cropbox to the original cropbox sizes from when the document was loaded into the app.
727
  if document_cropboxes:
728
- #print("Document cropboxes:", document_cropboxes)
729
 
730
  # Extract numbers safely using regex
731
  match = re.findall(r"[-+]?\d*\.\d+|\d+", document_cropboxes[page_python_format])
@@ -740,14 +950,30 @@ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, i
740
 
741
 
742
  pdf_page_height = pymupdf_page.mediabox.height
743
- pdf_page_width = pymupdf_page.mediabox.width
744
 
745
- image = image_paths[page_python_format]
 
746
 
747
- if isinstance(image, str):
748
- image = Image.open(image)
749
 
750
- image_page_width, image_page_height = image.size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751
 
752
  # Create redaction annotation
753
  redact_annot = SubElement(annots, 'redact')
@@ -759,23 +985,23 @@ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, i
759
  # Set page number (subtract 1 as PDF pages are 0-based)
760
  redact_annot.set('page', str(int(row['page']) - 1))
761
 
762
- # Convert coordinates
763
- if pages_are_images == True:
764
- x1, y1, x2, y2 = convert_image_coords_to_adobe(
765
- pdf_page_width,
766
- pdf_page_height,
767
- image_page_width,
768
- image_page_height,
769
- row['xmin'],
770
- row['ymin'],
771
- row['xmax'],
772
- row['ymax']
773
- )
774
- else:
775
- x1, y1, x2, y2 = convert_pymupdf_coords_to_adobe(row['xmin'],
776
- row['ymin'],
777
- row['xmax'],
778
- row['ymax'])
779
 
780
  if CUSTOM_BOX_COLOUR == "grey":
781
  colour_str = "0.5,0.5,0.5"
@@ -827,7 +1053,7 @@ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, i
827
 
828
  return xml_str
829
 
830
- def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = output_folder, document_cropboxes:List=[], page_sizes:List[dict]=[]):
831
  '''
832
  Load in files to convert a review file into an Adobe comment file format
833
  '''
@@ -934,8 +1160,6 @@ def parse_xfdf(xfdf_path:str):
934
  # Find all redact elements using the namespace
935
  for redact in root.findall('.//xfdf:redact', namespaces=namespace):
936
 
937
- #print("redact:", redact)
938
-
939
  redaction_info = {
940
  'image': '', # Image will be filled in later
941
  'page': int(redact.get('page')) + 1, # Convert to 1-based index
@@ -948,12 +1172,10 @@ def parse_xfdf(xfdf_path:str):
948
  'color': redact.get('border-color', '(0, 0, 0)') # Default to black if not specified
949
  }
950
  redactions.append(redaction_info)
951
-
952
- print("redactions:", redactions)
953
 
954
  return redactions
955
 
956
- def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_paths:List[str], output_folder:str=output_folder):
957
  '''
958
  Convert redaction annotations from XFDF and associated images into a DataFrame.
959
 
@@ -969,8 +1191,6 @@ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_path
969
  xfdf_paths = []
970
  df = pd.DataFrame()
971
 
972
- #print("Image paths:", image_paths)
973
-
974
  # Sort the file paths so that the pdfs come first
975
  file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
976
 
@@ -986,7 +1206,6 @@ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_path
986
 
987
  if file_path_end == "pdf":
988
  pdf_name = os.path.basename(file_path)
989
- #print("pymupdf_doc:", pymupdf_doc)
990
 
991
  # Add pdf to outputs
992
  output_paths.append(file_path)
@@ -997,18 +1216,10 @@ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_path
997
  message = "Original PDF needed to convert from .xfdf format"
998
  print(message)
999
  raise ValueError(message)
1000
-
1001
  xfdf_path = file
1002
 
1003
- # if isinstance(xfdf_paths, str):
1004
- # xfdf_path = xfdf_paths.name
1005
- # else:
1006
- # xfdf_path = xfdf_paths[0].name
1007
-
1008
  file_path_name = get_file_name_without_type(xfdf_path)
1009
 
1010
- #print("file_path_name:", file_path_name)
1011
-
1012
  # Parse the XFDF file
1013
  redactions = parse_xfdf(xfdf_path)
1014
 
@@ -1027,8 +1238,6 @@ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_path
1027
 
1028
  image_path = image_paths[page_python_format]
1029
 
1030
- #print("image_path:", image_path)
1031
-
1032
  if isinstance(image_path, str):
1033
  image = Image.open(image_path)
1034
 
@@ -1040,7 +1249,6 @@ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_path
1040
  df.loc[_, ['xmin', 'ymin', 'xmax', 'ymax']] = [image_x1, image_y1, image_x2, image_y2]
1041
 
1042
  # Optionally, you can add the image path or other relevant information
1043
- #print("Image path:", image_path)
1044
  df.loc[_, 'image'] = image_path
1045
 
1046
  #print('row:', row)
 
13
  import pymupdf
14
  #from fitz
15
  from PIL import ImageDraw, Image
 
16
 
17
+ from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER
18
+ from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes
19
  from tools.helper_functions import get_file_name_without_type, detect_file_type
20
  from tools.file_redaction import redact_page_with_pymupdf
21
 
 
25
  '''
26
  Decrease page number for review redactions page.
27
  '''
 
28
  if number > 1:
29
  return number - 1, number - 1
30
  else:
31
  return 1, 1
32
 
33
+ def increase_page(number:int, page_image_annotator_object:AnnotatedImageData):
34
  '''
35
  Increase page number for review redactions page.
36
  '''
37
 
38
+ if not page_image_annotator_object:
39
  return 1, 1
40
 
41
+ max_pages = len(page_image_annotator_object)
42
 
43
  if number < max_pages:
44
  return number + 1, number + 1
 
55
 
56
  return current_zoom_level, annotate_current_page
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
def update_dropdown_list_based_on_dataframe(df:pd.DataFrame, column:str) -> List["str"]:
    '''
    Gather unique elements from a string pandas Series, then append 'ALL' to the start and return the list.

    Returns ["ALL"] alone when the input is not a DataFrame, the column is
    missing/empty/all-NaN, or (for the "page" column) the values cannot be
    interpreted as integers.
    '''
    if not isinstance(df, pd.DataFrame):
        return ["ALL"]

    # No usable values for this column -> only the 'ALL' option
    if column not in df.columns or df[column].empty or df[column].isna().all():
        return ["ALL"]

    if column == "page":
        # Page numbers are sorted numerically, then rendered back as strings
        try:
            unique_pages = sorted(df[column].astype(int).unique())
        except ValueError:
            return ["ALL"]  # Handle case where conversion fails
        return ["ALL"] + [str(page) for page in unique_pages]

    # Any other column: unique string values in alphabetical order
    unique_values = sorted(df[column].astype(str).unique().tolist())
    return ["ALL"] + unique_values
84
 
85
+ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:AnnotatedImageData,
86
+ recogniser_dataframe_base:pd.DataFrame,
 
 
 
 
 
 
87
  recogniser_dropdown_value:str,
88
  text_dropdown_value:str,
89
  page_dropdown_value:str,
 
94
  '''
95
 
96
  recogniser_entities_list = ["Redaction"]
97
+ recogniser_dataframe_out = recogniser_dataframe_base
98
+ recogniser_dataframe_out_gr = gr.Dataframe()
99
+ review_dataframe = review_df
100
 
101
  try:
102
+ review_dataframe = convert_annotation_json_to_review_df(page_image_annotator_object, review_df, page_sizes)
103
 
104
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
105
  recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
 
114
  page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
115
  page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
116
 
117
+ recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
118
+
119
+ recogniser_dataframe_out = review_dataframe[["page", "label", "text"]]
120
 
121
  except Exception as e:
122
  print("Could not extract recogniser information:", e)
123
+ recogniser_dataframe_out = recogniser_dataframe_base[["page", "label", "text"]]
124
 
125
+ label_choices = review_dataframe["label"].astype(str).unique().tolist()
126
+ text_choices = review_dataframe["text"].astype(str).unique().tolist()
127
+ page_choices = review_dataframe["page"].astype(str).unique().tolist()
128
+
129
+ recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=label_choices, allow_custom_value=True, interactive=True)
130
  recogniser_entities_list = ["Redaction"]
131
+ text_entities_drop = gr.Dropdown(value=text_dropdown_value, choices=text_choices, allow_custom_value=True, interactive=True)
132
+ page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_choices, allow_custom_value=True, interactive=True)
133
 
134
+ return recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
135
 
136
def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData, recogniser_dataframe_base:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=[], page_sizes:list[str]=[]):
    '''
    Update recogniser dataframe information that appears alongside the pdf pages on the review screen.

    Parameters:
    - page_image_annotator_object: current annotation state for all pages.
    - recogniser_dataframe_base: the unfiltered page/label/text dataframe.
    - recogniser_entities_dropdown_value / text_dropdown_value / page_dropdown_value: current dropdown selections used to filter the review table.
    - review_df: review-file dataframe backing the annotations.
    - page_sizes: per-page size/path metadata dicts.

    Returns a tuple of (entity label list, gr.Dataframe, filtered pandas DataFrame,
    label dropdown, text dropdown, page dropdown).
    '''
    recogniser_entities_list = ["Redaction"]
    recogniser_dataframe_out = pd.DataFrame()
    recogniser_dataframe_out_gr = gr.Dataframe()

    # Previously this branched on whether recogniser_dataframe_base was empty or
    # blank, but every branch made exactly the same call — and the blank-string
    # branch accidentally overwrote the dropdown *value* string with a Dropdown
    # component, which was then reused as a value below. A single call covers
    # all cases; the dropdown outputs are rebuilt from the filtered data below.
    recogniser_dataframe_out_gr, recogniser_dataframe_out, _, recogniser_entities_list, _, _ = get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object, recogniser_dataframe_base, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)

    # Filter the review rows down to the current dropdown selections
    review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_out, page_dropdown_value, text_dropdown_value)

    recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)

    recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_out, "label")
    recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)

    recogniser_entities_list_base = recogniser_dataframe_out["label"].astype(str).unique().tolist()

    # Recogniser entities list is the list of choices that appear when you make a new redaction box
    recogniser_entities_list = [entity for entity in recogniser_entities_list_base if entity != 'Redaction']
    recogniser_entities_list.insert(0, 'Redaction')

    return recogniser_entities_list, recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, text_entities_drop, page_entities_drop
166
 
167
def undo_last_removal(backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base):
    '''Restore the review state, annotation state and recogniser dataframe from their pre-removal backups.'''
    restored_states = (backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base)
    return restored_states
169
 
170
def update_annotator_page_from_review_df(review_df: pd.DataFrame,
                                         image_file_paths:List[str],
                                         page_sizes:List[dict],
                                         current_page:int,
                                         previous_page:int,
                                         current_image_annotations_state:List[str],
                                         current_page_annotator:object):
    '''
    Update the visible annotation object with the latest review file information.

    Parameters:
    - review_df: review-file dataframe; when empty the current state is returned unchanged.
    - image_file_paths: page image paths used when rebuilding the annotation JSON.
    - page_sizes: per-page size/path metadata dicts.
    - current_page / previous_page: 1-based page numbers before and after navigation.
    - current_image_annotations_state: existing per-page annotation state.
    - current_page_annotator: annotation object currently shown on screen.

    Returns (page annotator object, full annotation state).
    '''
    out_image_annotations_state = current_image_annotations_state
    out_current_page_annotator = current_page_annotator

    # Previous revision printed page_sizes/state and wrote an unconditional
    # debug CSV to the output folder on every call; those side effects are removed.
    if not review_df.empty:
        # Rebuild the full annotation state from the review dataframe
        out_image_annotations_state = convert_review_df_to_annotation_json(review_df, image_file_paths, page_sizes)

        # Only refresh the on-screen annotator when the user has stayed on the
        # same page (pages are 1-based; the state list is 0-based)
        if previous_page == current_page:
            out_current_page_annotator = out_image_annotations_state[current_page-1]

    return out_current_page_annotator, out_image_annotations_state
197
+
198
+
199
+
200
+
201
+ def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
202
+ selected_rows_df: pd.DataFrame,
203
+ image_file_paths:List[str],
204
+ page_sizes:List[dict],
205
+ image_annotations_state:dict,
206
+ recogniser_entity_dataframe_base:pd.DataFrame):
207
  '''
208
  Remove selected items from the review dataframe from the annotation object and review dataframe.
209
  '''
 
214
 
215
  if not selected_rows_df.empty and not review_df.empty:
216
  # Ensure selected_rows_df has the same relevant columns
217
+ selected_subset = selected_rows_df[['label', 'page', 'text']].drop_duplicates(subset=['label', 'page', 'text'])
218
 
219
  # Perform anti-join using merge with an indicator column
220
  merged_df = review_df.merge(selected_subset, on=['label', 'page', 'text'], how='left', indicator=True)
 
223
  out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
224
 
225
  out_image_annotations_state = convert_review_df_to_annotation_json(out_review_df, image_file_paths, page_sizes)
226
+
227
  out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]
228
 
229
  # Either there is nothing left in the selection dataframe, or the review dataframe
 
231
  out_review_df = review_df
232
  out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
233
 
234
+ out_image_annotations_state = image_annotations_state
 
 
 
 
 
235
 
 
 
236
  return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
237
 
238
+ def update_annotator_object_and_filter_df(
239
+ all_image_annotations:List[AnnotatedImageData],
240
+ gradio_annotator_current_page_number:int,
241
+ recogniser_entities_dropdown_value:str="ALL",
242
+ page_dropdown_value:str="ALL",
243
+ text_dropdown_value:str="ALL",
244
+ recogniser_dataframe_base:gr.Dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True),
245
+ zoom:int=100,
246
+ review_df:pd.DataFrame=[],
247
+ page_sizes:List[dict]=[],
248
+ doc_full_file_name_textbox:str='',
249
+ input_folder:str=INPUT_FOLDER):
250
  '''
251
  Update a gradio_image_annotation object with new annotation data.
252
  '''
253
+ zoom_str = str(zoom) + '%'
254
+
255
+ if not gradio_annotator_current_page_number: gradio_annotator_current_page_number = 0
256
 
257
+ # Check bounding values for current page and page max
258
+ if gradio_annotator_current_page_number > 0: page_num_reported = gradio_annotator_current_page_number
259
+ elif gradio_annotator_current_page_number == 0: page_num_reported = 1 # minimum possible reported page is 1
260
+ else:
261
+ gradio_annotator_current_page_number = 0
262
+ page_num_reported = 1
263
 
264
+ # Ensure page displayed can't exceed number of pages in document
265
+ page_max_reported = len(all_image_annotations)
266
+ if page_num_reported > page_max_reported: page_num_reported = page_max_reported
267
 
268
+ page_num_reported_zero_indexed = page_num_reported - 1
 
 
269
 
270
+ # First, check that the image on the current page is valid, replace with what exists in page_sizes object if not
271
+ page_image_annotator_object, all_image_annotations = replace_images_in_image_annotation_object(all_image_annotations, all_image_annotations[page_num_reported_zero_indexed], page_sizes, page_num_reported)
272
 
273
+ all_image_annotations[page_num_reported_zero_indexed] = page_image_annotator_object
274
+
275
+ current_image_path = all_image_annotations[page_num_reported_zero_indexed]['image']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
+ # If image path is still not valid, load in a new image an overwrite it. Then replace all items in the image annotation object for all pages based on the updated information.
278
+ page_sizes_df = pd.DataFrame(page_sizes)
279
+
280
+ if not os.path.exists(current_image_path):
281
+
282
+ page_num, replaced_image_path, width, height = process_single_page_for_image_conversion(doc_full_file_name_textbox, page_num_reported_zero_indexed, input_folder=input_folder)
283
+
284
+ # Overwrite page_sizes values
285
+ page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_width"] = width
286
+ page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_height"] = height
287
+ page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_path"] = replaced_image_path
288
 
289
+ else:
290
+ if not page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_width"].isnull().all():
291
+ width = page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_width"].max()
292
+ height = page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_height"].max()
293
+ else:
294
+ image = Image.open(current_image_path)
295
+ width = image.width
296
+ height = image.height
297
 
298
+ page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_width"] = width
299
+ page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_height"] = height
300
 
301
+ page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_path"] = current_image_path
 
 
302
 
303
+ replaced_image_path = current_image_path
304
 
305
+ if review_df.empty: review_df = pd.DataFrame(columns=["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"])
306
+
307
+ ##
308
+
309
+ review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
310
 
311
+ # Update dropdowns and review selection dataframe with the updated annotator object
312
+ recogniser_entities_list, recogniser_dataframe_out_gr, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_entities_drop, page_entities_drop = update_recogniser_dataframes(all_image_annotations, recogniser_dataframe_base, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df.copy(), page_sizes)
313
+
314
+ recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
315
 
316
+ # page_sizes_df has been changed - save back to page_sizes_object
317
+ page_sizes = page_sizes_df.to_dict(orient='records')
318
 
319
+ images_list = list(page_sizes_df["image_path"])
320
+ images_list[page_num_reported_zero_indexed] = replaced_image_path
321
+
322
+ all_image_annotations[page_num_reported_zero_indexed]['image'] = replaced_image_path
323
 
324
+ # Multiply out image_annotation coordinates from relative to absolute if necessary
325
+ all_image_annotations_df = convert_annotation_data_to_dataframe(all_image_annotations)
326
+
327
+ all_image_annotations_df = multiply_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
328
+
329
+ all_image_annotations = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
330
+
331
+ # Remove blank duplicate entries
332
+ all_image_annotations = remove_duplicate_images_with_blank_boxes(all_image_annotations)
333
+
334
+ current_page_image_annotator_object = all_image_annotations[page_num_reported_zero_indexed]
335
+
336
+ page_number_reported_gradio = gr.Number(label = "Current page", value=page_num_reported, precision=0)
337
+
338
+ ###
339
+ # If no data, present a blank page
340
+ if not all_image_annotations:
341
+ print("No all_image_annotation object found")
342
+ page_num_reported = 1
343
+
344
+ out_image_annotator = image_annotator(
345
+ value = None,
346
  boxes_alpha=0.1,
347
  box_thickness=1,
348
  label_list=recogniser_entities_list,
 
359
  show_remove_button=False,
360
  handles_cursor=True,
361
  interactive=True
362
+ )
363
 
364
+ return out_image_annotator, page_number_reported_gradio, page_number_reported_gradio, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out_gr, recogniser_dataframe_modified, text_entities_drop, page_entities_drop, page_sizes, all_image_annotations
365
+
366
+ else:
367
+ ### Present image_annotator outputs
368
+ out_image_annotator = image_annotator(
369
+ value = current_page_image_annotator_object,
370
+ boxes_alpha=0.1,
371
+ box_thickness=1,
372
+ label_list=recogniser_entities_list,
373
+ label_colors=recogniser_colour_list,
374
+ show_label=False,
375
+ height=zoom_str,
376
+ width=zoom_str,
377
+ box_min_size=1,
378
+ box_selected_thickness=2,
379
+ handle_size=4,
380
+ sources=None,#["upload"],
381
+ show_clear_button=False,
382
+ show_share_button=False,
383
+ show_remove_button=False,
384
+ handles_cursor=True,
385
+ interactive=True
386
+ )
387
+
388
+ #print("all_image_annotations at end of update_annotator...:", all_image_annotations)
389
+ #print("review_df at end of update_annotator_object:", review_df)
390
+
391
+ return out_image_annotator, page_number_reported_gradio, page_number_reported_gradio, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out_gr, recogniser_dataframe_modified, text_entities_drop, page_entities_drop, page_sizes, all_image_annotations
392
+
393
def replace_images_in_image_annotation_object(
        all_image_annotations:List[dict],
        page_image_annotator_object:AnnotatedImageData,
        page_sizes:List[dict],
        page:int):
    '''
    Check if the image value in an AnnotatedImageData dict is a placeholder or np.array. If either of these, replace the value with the file path of the image that is hopefully already loaded into the app related to this page.

    Parameters:
    - all_image_annotations: annotation dicts for every page (0-indexed internally).
    - page_image_annotator_object: annotation dict for the page being checked.
    - page_sizes: per-page metadata dicts containing 'page' and 'image_path'.
    - page: 1-based page number.

    Returns the (possibly updated) page annotation dict and full annotation list.
    '''
    page_zero_index = page - 1

    current_image = all_image_annotations[page_zero_index]["image"]

    # Replace when either image is a raw numpy array, the stored value is a
    # placeholder, or it is not a usable path string at all. (Previously a
    # non-string, non-array value such as None raised a TypeError on the
    # `"placeholder_image" in ...` membership test.)
    needs_replacing = (
        isinstance(current_image, np.ndarray)
        or isinstance(page_image_annotator_object['image'], np.ndarray)
        or not isinstance(current_image, str)
        or "placeholder_image" in current_image
    )

    if needs_replacing:
        page_sizes_df = pd.DataFrame(page_sizes)
        page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")

        # Check for matching pages
        matching_paths = page_sizes_df.loc[page_sizes_df['page'] == page, "image_path"].unique()

        if matching_paths.size > 0:
            image_path = matching_paths[0]
            page_image_annotator_object['image'] = image_path
            all_image_annotations[page_zero_index]["image"] = image_path
        else:
            print(f"No image path found for page {page}.")

    return page_image_annotator_object, all_image_annotations
420
 
421
def update_all_page_annotation_object_based_on_previous_page(
        page_image_annotator_object:AnnotatedImageData,
        current_page:int,
        previous_page:int,
        all_image_annotations:List[AnnotatedImageData],
        page_sizes:List[dict]=[],
        clear_all:bool=False
    ):
    '''
    Overwrite image annotations on the page we are moving from with modifications.

    Parameters:
    - page_image_annotator_object: annotation state of the page being left.
    - current_page: 1-based page the user is navigating to (falsy -> treated as 1).
    - previous_page: 1-based page the user is leaving.
    - all_image_annotations: annotation dicts for every page.
    - page_sizes: per-page metadata used to restore valid image paths.
    - clear_all: when True, wipe the boxes on the previous page instead of saving them.

    Returns (updated annotation list, current page, current page).
    '''
    previous_page_zero_index = previous_page - 1

    if not current_page: current_page = 1

    # Make sure the stored image reference is a file path rather than a numpy
    # array or placeholder before saving the page state
    page_image_annotator_object, all_image_annotations = replace_images_in_image_annotation_object(all_image_annotations, page_image_annotator_object, page_sizes, previous_page)

    if clear_all == False:
        all_image_annotations[previous_page_zero_index] = page_image_annotator_object
    else:
        all_image_annotations[previous_page_zero_index]["boxes"] = []

    return all_image_annotations, current_page, current_page
447
 
448
+ def apply_redactions_to_review_df_and_files(page_image_annotator_object:AnnotatedImageData,
449
  file_paths:List[str],
450
  doc:Document,
451
  all_image_annotations:List[AnnotatedImageData],
452
  current_page:int,
453
  review_file_state:pd.DataFrame,
454
+ output_folder:str = OUTPUT_FOLDER,
455
  save_pdf:bool=True,
456
  page_sizes:List[dict]=[],
457
  progress=gr.Progress(track_tqdm=True)):
 
462
  output_files = []
463
  output_log_files = []
464
  pdf_doc = []
465
+ review_df = review_file_state
466
 
467
+ page_image_annotator_object = all_image_annotations[current_page - 1]
468
 
469
+ # This replaces the numpy array image object with the image file path
470
+ page_image_annotator_object, all_image_annotations = replace_images_in_image_annotation_object(all_image_annotations, page_image_annotator_object, page_sizes, current_page)
471
+ page_image_annotator_object['image'] = all_image_annotations[current_page - 1]["image"]
472
 
473
+ if not page_image_annotator_object:
474
+ print("No image annotations object found for page")
475
+ return doc, all_image_annotations, output_files, output_log_files, review_df
 
 
476
 
477
  if isinstance(file_paths, str):
478
  file_paths = [file_paths]
479
 
480
  for file_path in file_paths:
 
481
  file_name_without_ext = get_file_name_without_type(file_path)
482
  file_name_with_ext = os.path.basename(file_path)
483
 
 
488
  if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
489
  image = Image.open(file_paths[-1])
490
 
 
 
491
  draw = ImageDraw.Draw(image)
492
 
493
+ for img_annotation_box in page_image_annotator_object['boxes']:
494
  coords = [img_annotation_box["xmin"],
495
  img_annotation_box["ymin"],
496
  img_annotation_box["xmax"],
 
498
 
499
  fill = img_annotation_box["color"]
500
 
501
+ # Ensure fill is a valid RGB tuple
502
+ if isinstance(fill, tuple) and len(fill) == 3:
503
+ # Check if all elements are integers in the range 0-255
504
+ if all(isinstance(c, int) and 0 <= c <= 255 for c in fill):
505
+ pass
506
+ #print("fill:", fill)
507
+ else:
508
+ print(f"Invalid color values: {fill}. Defaulting to black.")
509
+ fill = (0, 0, 0) # Default to black if invalid
510
+ else:
511
+ print(f"Invalid fill format: {fill}. Defaulting to black.")
512
+ fill = (0, 0, 0) # Default to black if not a valid tuple
513
+
514
+ # Ensure the image is in RGB mode
515
+ if image.mode not in ("RGB", "RGBA"):
516
+ image = image.convert("RGB")
517
+
518
+ draw = ImageDraw.Draw(image)
519
+
520
  draw.rectangle(coords, fill=fill)
521
 
522
  output_image_path = output_folder + file_name_without_ext + "_redacted.png"
 
524
 
525
  output_files.append(output_image_path)
526
 
 
 
527
  doc = [image]
528
 
529
  elif file_extension in '.csv':
530
+ #print("This is a csv")
531
  pdf_doc = []
532
 
533
  # If working with pdfs
 
540
  number_of_pages = pdf_doc.page_count
541
  original_cropboxes = []
542
 
543
+ page_sizes_df = pd.DataFrame(page_sizes)
544
+ page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
545
 
546
  for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
547
+
 
 
548
  image_loc = all_image_annotations[i]['image']
 
549
 
550
  # Load in image object
551
  if isinstance(image_loc, np.ndarray):
552
  image = Image.fromarray(image_loc.astype('uint8'))
 
553
  elif isinstance(image_loc, Image.Image):
554
  image = image_loc
 
 
 
555
  elif isinstance(image_loc, str):
556
+ if not os.path.exists(image_loc):
557
+ image=page_sizes_df.loc[page_sizes_df['page']==i, "image_path"]
558
+ try:
559
+ image = Image.open(image_loc)
560
+ except Exception as e:
561
+ image = None
562
 
563
  pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
564
  original_cropboxes.append(pymupdf_page.cropbox.irect)
565
  pymupdf_page.set_cropbox = pymupdf_page.mediabox
 
 
 
 
566
 
567
+ pymupdf_page = redact_page_with_pymupdf(page=pymupdf_page, page_annotations=all_image_annotations[i], image=image, original_cropbox=original_cropboxes[-1], page_sizes_df= page_sizes_df) # image=image,
568
  else:
569
  print("File type not recognised.")
570
 
571
  #try:
572
  if pdf_doc:
573
  out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
574
+ pdf_doc.save(out_pdf_file_path, garbage=4, deflate=True, clean=True)
575
  output_files.append(out_pdf_file_path)
576
 
577
  else:
 
584
  output_files.append(orig_pdf_file_path)
585
 
586
  try:
587
+ review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
 
 
 
 
 
 
 
 
 
 
 
 
 
588
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
589
 
 
590
  review_df.to_csv(out_review_file_file_path, index=None)
591
  output_files.append(out_review_file_file_path)
592
 
593
  except Exception as e:
594
  print("In apply redactions function, could not save annotations to csv file:", e)
595
 
596
+ return doc, all_image_annotations, output_files, output_log_files, review_df
597
 
598
def get_boxes_json(annotations:AnnotatedImageData):
    '''Return just the list of box dictionaries from an annotation payload.'''
    box_list = annotations["boxes"]
    return box_list
600
 
601
def update_all_entity_df_dropdowns(df:pd.DataFrame, label_dropdown_value:str, page_dropdown_value:str, text_dropdown_value:str):
    '''
    Update all dropdowns based on rows that exist in a dataframe.

    The current selections are kept as the dropdown values while each choice
    list is refreshed from the dataframe contents. (A commented-out block that
    filtered the dataframe by the selections before refreshing was dead code
    and has been removed.)

    Returns (label dropdown, text dropdown, page dropdown) gr.Dropdown objects.
    '''
    # Normalise scalar selections to single-element lists so [0] below is safe
    if isinstance(label_dropdown_value, str):
        label_dropdown_value = [label_dropdown_value]
    if isinstance(page_dropdown_value, str):
        page_dropdown_value = [page_dropdown_value]
    if isinstance(text_dropdown_value, str):
        text_dropdown_value = [text_dropdown_value]

    filtered_df = df.copy()

    recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
    recogniser_entities_drop = gr.Dropdown(value=label_dropdown_value[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)

    text_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "text")
    text_entities_drop = gr.Dropdown(value=text_dropdown_value[0], choices=text_entities_for_drop, allow_custom_value=True, interactive=True)

    page_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "page")
    page_entities_drop = gr.Dropdown(value=page_dropdown_value[0], choices=page_entities_for_drop, allow_custom_value=True, interactive=True)

    return recogniser_entities_drop, text_entities_drop, page_entities_drop
635
+
636
  def update_entities_df_recogniser_entities(choice:str, df:pd.DataFrame, page_dropdown_value:str, text_dropdown_value:str):
637
  '''
638
  Update the rows in a dataframe depending on the user choice from a dropdown
639
  '''
640
+
641
  if isinstance(choice, str):
642
  choice = [choice]
643
  if isinstance(page_dropdown_value, str):
 
736
 
737
  return filtered_df, recogniser_entities_drop, page_entities_drop
738
 
739
def reset_dropdowns(df:pd.DataFrame):
    '''
    Return Gradio dropdown objects with value 'ALL'.

    One dropdown is produced per review column (label, text, page), each with
    its choice list refreshed from the dataframe and its value reset to "ALL".
    '''
    reset_drops = []
    for column in ("label", "text", "page"):
        choices = update_dropdown_list_based_on_dataframe(df, column)
        reset_drops.append(gr.Dropdown(value="ALL", choices=choices, allow_custom_value=True, interactive=True))

    return tuple(reset_drops)
754
 
755
def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
    '''
    Handle a user click on a row of the review dataframe: return the clicked
    page number and a one-row dataframe describing the full selection.
    '''
    page_value = evt.row_value[0]   # page number of the clicked row
    label_value = evt.row_value[1]  # recogniser label of the clicked row
    text_value = evt.row_value[2]   # redacted text of the clicked row

    row_value_df = pd.DataFrame(data={"page": [page_value], "label": [label_value], "text": [text_value]})

    return page_value, row_value_df
764
 
765
def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
    '''
    Handle a user click on a row of the cost-code dataframe: return the
    selected cost code (first column of the clicked row).
    '''
    # Previous revision also read the row label into an unused local and kept a
    # commented-out dataframe construction; both removed.
    row_value_code = evt.row_value[0]  # cost code of the clicked row

    return row_value_code
773
+
774
def update_selected_review_df_row_colour(redaction_row_selection:pd.DataFrame, review_df:pd.DataFrame, colour:tuple=(0,0,255)):
    '''
    Update the colour of a single redaction box based on the values in a selection row.

    Rows previously highlighted with `colour` are reset to black, then rows
    matching the selection (on page/label/text) are painted with `colour`.

    Fixes: the highlight colour was hard-coded as '(0, 0, 255)', silently
    ignoring the `colour` parameter; an unconditional debug CSV dump to the
    output folder has been removed. Default behaviour is unchanged.
    '''
    colour_tuple = str(tuple(colour))

    if "color" not in review_df.columns: review_df["color"] = None

    # Reset any previously highlighted rows back to black
    review_df.loc[review_df["color"] == colour_tuple, "color"] = '(0, 0, 0)'

    # Flag rows matching the selection and paint them with the highlight colour
    review_df = review_df.merge(redaction_row_selection, on=["page", "label", "text"], indicator=True, how="left")
    review_df.loc[review_df["_merge"] == "both", "color"] = colour_tuple

    review_df.drop("_merge", axis=1, inplace=True)

    return review_df
793
+
794
def update_boxes_color(images: list, redaction_row_selection: pd.DataFrame, colour: tuple = (0, 255, 0)):
    """
    Update the color of bounding boxes in the images list based on redaction_row_selection.

    Parameters:
    - images (list): List of dictionaries containing image paths and box metadata.
    - redaction_row_selection (pd.DataFrame): DataFrame with 'page', 'label', and optionally 'text' columns.
    - colour (tuple): RGB tuple for the new color.

    Returns:
    - Updated list with modified colors.
    """
    # Set of selected (page, label) pairs for O(1) membership tests.
    # NOTE(review): boxes are matched against each image's 0-based list index,
    # while the 'page' column elsewhere in this module is 1-based — confirm the
    # selection dataframe uses the same convention before relying on this.
    selected_pairs = {(page, label) for page, label in zip(redaction_row_selection["page"], redaction_row_selection["label"])}

    for image_index, image_entry in enumerate(images):
        if "boxes" in image_entry:
            for box in image_entry["boxes"]:
                if (image_index, box["label"]) in selected_pairs:
                    box["color"] = colour  # Update color

    return images
816
+
817
def update_other_annotator_number_from_current(page_number_first_counter: int):
    '''
    Mirror the current annotator's page number so that other annotator
    views are synchronised to the same page.
    '''
    synced_page_number = page_number_first_counter
    return synced_page_number
819
 
820
  def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
821
  '''
 
854
 
855
  return pdf_x1, pdf_y1, pdf_x2, pdf_y2
856
 
857
def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float, pdf_page_height: float):
    """
    Converts coordinates from PyMuPDF (fitz) space to Adobe PDF space.

    Parameters:
    - x1, y1, x2, y2: Coordinates in PyMuPDF space
    - pdf_page_height: Total height of the PDF page

    Returns:
    - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
    """
    # The two spaces share the x axis but run the y axis in opposite
    # directions, so each y value is mirrored about the page height.
    # y1 and y2 swap roles in the output to keep the rectangle normalised.
    mirrored_top = pdf_page_height - y2
    mirrored_bottom = pdf_page_height - y1

    return x1, mirrored_top, x2, mirrored_bottom
874
 
875
  def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str], document_cropboxes:List=[], page_sizes:List[dict]=[]):
876
  '''
 
893
  page_sizes_df = pd.DataFrame(page_sizes)
894
 
895
  # If there are no image coordinates, then convert coordinates to pymupdf coordinates prior to export
896
+ #if len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == len(page_sizes_df["image_width"]):
897
+ print("Using pymupdf coordinates for conversion.")
898
 
899
+ pages_are_images = False
 
 
 
 
 
 
 
 
900
 
901
+ if "mediabox_width" not in review_file_df.columns:
902
+ review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
903
+
904
+ # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
905
+ if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
906
+ review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["mediabox_width"]
907
+ review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["mediabox_width"]
908
+ review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["mediabox_height"]
909
+ review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["mediabox_height"]
910
+
911
+ # If all nulls, then can do image coordinate conversion
912
+ if len(page_sizes_df.loc[page_sizes_df["mediabox_width"].isnull(),"mediabox_width"]) == len(page_sizes_df["mediabox_width"]):
913
+
914
+ pages_are_images = True
915
 
916
+ review_file_df = multiply_coordinates_by_page_sizes(review_file_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
 
917
 
918
+ # if "image_width" not in review_file_df.columns:
919
+ # review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
920
 
921
+ # # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
922
+ # if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
923
+ # review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
924
+ # review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
925
+ # review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
926
+ # review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
927
+
928
+
929
 
930
  # Go through each row of the review_file_df, create an entry in the output Adobe xfdf file.
931
  for _, row in review_file_df.iterrows():
932
+ page_num_reported = row["page"]
933
  page_python_format = int(row["page"])-1
934
 
935
  pymupdf_page = pymupdf_doc.load_page(page_python_format)
936
 
937
  # Load cropbox sizes. Set cropbox to the original cropbox sizes from when the document was loaded into the app.
938
  if document_cropboxes:
 
939
 
940
  # Extract numbers safely using regex
941
  match = re.findall(r"[-+]?\d*\.\d+|\d+", document_cropboxes[page_python_format])
 
950
 
951
 
952
  pdf_page_height = pymupdf_page.mediabox.height
953
+ pdf_page_width = pymupdf_page.mediabox.width
954
 
955
+ # Check if image dimensions for page exist in page_sizes_df
956
+ # image_dimensions = {}
957
 
958
+ # image_dimensions['image_width'] = page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_width"].max()
959
+ # image_dimensions['image_height'] = page_sizes_df.loc[page_sizes_df['page']==page_num_reported, "image_height"].max()
960
 
961
+ # if pd.isna(image_dimensions['image_width']):
962
+ # image_dimensions = {}
963
+
964
+ # image = image_paths[page_python_format]
965
+
966
+ # if image_dimensions:
967
+ # image_page_width, image_page_height = image_dimensions["image_width"], image_dimensions["image_height"]
968
+ # if isinstance(image, str) and 'placeholder' not in image:
969
+ # image = Image.open(image)
970
+ # image_page_width, image_page_height = image.size
971
+ # else:
972
+ # try:
973
+ # image = Image.open(image)
974
+ # image_page_width, image_page_height = image.size
975
+ # except Exception as e:
976
+ # print("Could not get image sizes due to:", e)
977
 
978
  # Create redaction annotation
979
  redact_annot = SubElement(annots, 'redact')
 
985
  # Set page number (subtract 1 as PDF pages are 0-based)
986
  redact_annot.set('page', str(int(row['page']) - 1))
987
 
988
+ # # Convert coordinates
989
+ # if pages_are_images == True:
990
+ # x1, y1, x2, y2 = convert_image_coords_to_adobe(
991
+ # pdf_page_width,
992
+ # pdf_page_height,
993
+ # image_page_width,
994
+ # image_page_height,
995
+ # row['xmin'],
996
+ # row['ymin'],
997
+ # row['xmax'],
998
+ # row['ymax']
999
+ # )
1000
+ # else:
1001
+ x1, y1, x2, y2 = convert_pymupdf_coords_to_adobe(row['xmin'],
1002
+ row['ymin'],
1003
+ row['xmax'],
1004
+ row['ymax'], pdf_page_height)
1005
 
1006
  if CUSTOM_BOX_COLOUR == "grey":
1007
  colour_str = "0.5,0.5,0.5"
 
1053
 
1054
  return xml_str
1055
 
1056
+ def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = OUTPUT_FOLDER, document_cropboxes:List=[], page_sizes:List[dict]=[]):
1057
  '''
1058
  Load in files to convert a review file into an Adobe comment file format
1059
  '''
 
1160
  # Find all redact elements using the namespace
1161
  for redact in root.findall('.//xfdf:redact', namespaces=namespace):
1162
 
 
 
1163
  redaction_info = {
1164
  'image': '', # Image will be filled in later
1165
  'page': int(redact.get('page')) + 1, # Convert to 1-based index
 
1172
  'color': redact.get('border-color', '(0, 0, 0)') # Default to black if not specified
1173
  }
1174
  redactions.append(redaction_info)
 
 
1175
 
1176
  return redactions
1177
 
1178
+ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_paths:List[str], output_folder:str=OUTPUT_FOLDER):
1179
  '''
1180
  Convert redaction annotations from XFDF and associated images into a DataFrame.
1181
 
 
1191
  xfdf_paths = []
1192
  df = pd.DataFrame()
1193
 
 
 
1194
  # Sort the file paths so that the pdfs come first
1195
  file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
1196
 
 
1206
 
1207
  if file_path_end == "pdf":
1208
  pdf_name = os.path.basename(file_path)
 
1209
 
1210
  # Add pdf to outputs
1211
  output_paths.append(file_path)
 
1216
  message = "Original PDF needed to convert from .xfdf format"
1217
  print(message)
1218
  raise ValueError(message)
 
1219
  xfdf_path = file
1220
 
 
 
 
 
 
1221
  file_path_name = get_file_name_without_type(xfdf_path)
1222
 
 
 
1223
  # Parse the XFDF file
1224
  redactions = parse_xfdf(xfdf_path)
1225
 
 
1238
 
1239
  image_path = image_paths[page_python_format]
1240
 
 
 
1241
  if isinstance(image_path, str):
1242
  image = Image.open(image_path)
1243
 
 
1249
  df.loc[_, ['xmin', 'ymin', 'xmax', 'ymax']] = [image_x1, image_y1, image_x2, image_y2]
1250
 
1251
  # Optionally, you can add the image path or other relevant information
 
1252
  df.loc[_, 'image'] = image_path
1253
 
1254
  #print('row:', row)
tools/textract_batch_call.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import boto3
2
+ import time
3
+ import os
4
+ import json
5
+ import logging
6
+ from urllib.parse import urlparse
7
+
8
+ # Configure logging
9
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
10
+
11
def analyze_pdf_with_textract(
    local_pdf_path: str,
    s3_bucket_name: str,
    s3_input_prefix: str,
    s3_output_prefix: str,
    local_output_dir: str,
    aws_region: str = None, # Optional: specify region if not default
    poll_interval_seconds: int = 5,
    max_polling_attempts: int = 120 # ~10 minutes total wait time
):
    """
    Uploads a local PDF to S3, starts a Textract analysis job (detecting text & signatures),
    waits for completion, and downloads the output JSON from S3 to a local directory.

    Args:
        local_pdf_path (str): Path to the local PDF file.
        s3_bucket_name (str): Name of the S3 bucket to use.
        s3_input_prefix (str): S3 prefix (folder) to upload the input PDF.
        s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
        local_output_dir (str): Local directory to save the downloaded JSON results.
        aws_region (str, optional): AWS region name. Defaults to boto3 default region.
        poll_interval_seconds (int): Seconds to wait between polling Textract status.
        max_polling_attempts (int): Maximum number of times to poll Textract status.

    Returns:
        str: Path to the downloaded local JSON output file, or None if failed.

    Raises:
        FileNotFoundError: If the local_pdf_path does not exist.
        boto3.exceptions.NoCredentialsError: If AWS credentials are not found.
        Exception: For other AWS errors or job failures.
    """

    if not os.path.exists(local_pdf_path):
        raise FileNotFoundError(f"Input PDF not found: {local_pdf_path}")

    if not os.path.exists(local_output_dir):
        os.makedirs(local_output_dir)
        logging.info(f"Created local output directory: {local_output_dir}")

    # Initialize boto3 clients (a shared Session so both clients use the same region/credentials)
    session = boto3.Session(region_name=aws_region)
    s3_client = session.client('s3')
    textract_client = session.client('textract')

    # --- 1. Upload PDF to S3 ---
    pdf_filename = os.path.basename(local_pdf_path)
    s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace("\\", "/") # Ensure forward slashes for S3

    logging.info(f"Uploading '{local_pdf_path}' to 's3://{s3_bucket_name}/{s3_input_key}'...")
    try:
        s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
        logging.info("Upload successful.")
    except Exception as e:
        logging.error(f"Failed to upload PDF to S3: {e}")
        raise

    # --- 2. Start Textract Document Analysis ---
    logging.info("Starting Textract document analysis job...")
    try:
        response = textract_client.start_document_analysis(
            DocumentLocation={
                'S3Object': {
                    'Bucket': s3_bucket_name,
                    'Name': s3_input_key
                }
            },
            # NOTE(review): FORMS and TABLES are billed in addition to SIGNATURES —
            # confirm all three feature types are actually required here.
            FeatureTypes=['SIGNATURES', 'FORMS', 'TABLES'], # Analyze for signatures, forms, and tables
            OutputConfig={
                'S3Bucket': s3_bucket_name,
                'S3Prefix': s3_output_prefix
            }
            # Optional: Add NotificationChannel for SNS topic notifications
            # NotificationChannel={
            #     'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
            #     'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
            # }
        )
        job_id = response['JobId']
        logging.info(f"Textract job started with JobId: {job_id}")

    except Exception as e:
        logging.error(f"Failed to start Textract job: {e}")
        raise

    # --- 3. Poll for Job Completion ---
    # Polls get_document_analysis until the job leaves IN_PROGRESS or the
    # attempt budget is exhausted (TimeoutError raised below in that case).
    job_status = 'IN_PROGRESS'
    attempts = 0
    logging.info("Polling Textract for job completion status...")

    while job_status == 'IN_PROGRESS' and attempts < max_polling_attempts:
        attempts += 1
        try:
            response = textract_client.get_document_analysis(JobId=job_id)
            job_status = response['JobStatus']
            logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")

            if job_status == 'IN_PROGRESS':
                time.sleep(poll_interval_seconds)
            elif job_status == 'SUCCEEDED':
                logging.info("Textract job succeeded.")
                break
            elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
                status_message = response.get('StatusMessage', 'No status message provided.')
                warnings = response.get('Warnings', [])
                logging.error(f"Textract job ended with status: {job_status}. Message: {status_message}")
                if warnings:
                    logging.warning(f"Warnings: {warnings}")
                # Decide if PARTIAL_SUCCESS should proceed or raise error
                # For simplicity here, we raise for both FAILED and PARTIAL_SUCCESS
                raise Exception(f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}")
            else:
                # Should not happen based on documentation, but handle defensively
                raise Exception(f"Unexpected Textract job status: {job_status}")

        except textract_client.exceptions.InvalidJobIdException:
            logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than 7 days) or never existed.")
            raise
        except Exception as e:
            logging.error(f"Error while polling Textract status for job {job_id}: {e}")
            raise

    if job_status != 'SUCCEEDED':
        raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")

    # --- 4. Download Output JSON from S3 ---
    # Textract typically creates output under s3_output_prefix/job_id/
    # There might be multiple JSON files if pagination occurred during writing.
    # Usually, for smaller docs, there's one file, often named '1'.
    # For robust handling, list objects and find the JSON(s).

    s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
    logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")

    downloaded_file_path = None
    try:
        list_response = s3_client.list_objects_v2(
            Bucket=s3_bucket_name,
            Prefix=s3_output_key_prefix
        )

        output_files = list_response.get('Contents', [])
        if not output_files:
            # Sometimes Textract might take a moment longer to write the output after SUCCEEDED status
            logging.warning("No output files found immediately after job success. Waiting briefly and retrying list...")
            time.sleep(5)
            list_response = s3_client.list_objects_v2(
                Bucket=s3_bucket_name,
                Prefix=s3_output_key_prefix
            )
            output_files = list_response.get('Contents', [])

        if not output_files:
            logging.error(f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}")
            # You could alternatively try getting results via get_document_analysis pagination here
            # but sticking to the request to download from S3 output path.
            raise FileNotFoundError(f"Textract output files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")

        # Usually, we only need the first/main JSON output file(s)
        # For simplicity, download the first one found. A more complex scenario might merge multiple files.
        # Filter out potential directory markers if any key ends with '/'
        json_files_to_download = [f for f in output_files if f['Key'] != s3_output_key_prefix and not f['Key'].endswith('/')]

        if not json_files_to_download:
            logging.error(f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}")
            raise FileNotFoundError(f"Textract output JSON files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")

        # Let's download the first JSON found. Often it's the only one or the main one.
        # NOTE(review): pages beyond the first output part are silently ignored
        # apart from the warning below — confirm single-part output is acceptable.
        s3_output_key = json_files_to_download[0]['Key']
        output_filename_base = os.path.basename(pdf_filename).replace('.pdf', '')
        local_output_filename = f"{output_filename_base}_textract_output_{job_id}.json"
        local_output_path = os.path.join(local_output_dir, local_output_filename)

        logging.info(f"Downloading Textract output from 's3://{s3_bucket_name}/{s3_output_key}' to '{local_output_path}'...")
        s3_client.download_file(s3_bucket_name, s3_output_key, local_output_path)
        logging.info("Download successful.")
        downloaded_file_path = local_output_path

        # Log if multiple files were found, as user might need to handle them
        if len(json_files_to_download) > 1:
            logging.warning(f"Multiple output files found in S3 output location. Downloaded the first: '{s3_output_key}'. Other files exist.")

    except Exception as e:
        logging.error(f"Failed to download or process Textract output from S3: {e}")
        raise

    return downloaded_file_path
198
+
199
+ # --- Example Usage ---
200
+ if __name__ == '__main__':
201
+ # --- Configuration --- (Replace with your actual values)
202
+ MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
203
+ MY_S3_BUCKET = "your-textract-demo-bucket-name" # MUST BE UNIQUE GLOBALLY
204
+ MY_S3_INPUT_PREFIX = "textract-inputs" # Folder in the bucket for uploads
205
+ MY_S3_OUTPUT_PREFIX = "textract-outputs" # Folder in the bucket for results
206
+ MY_LOCAL_OUTPUT_DIR = "./textract_results" # Local folder to save JSON
207
+ MY_AWS_REGION = "us-east-1" # e.g., 'us-east-1', 'eu-west-1'
208
+
209
+ # --- Create a dummy PDF for testing if you don't have one ---
210
+ # Requires 'reportlab' library: pip install reportlab
211
+ try:
212
+ from reportlab.pdfgen import canvas
213
+ from reportlab.lib.pagesizes import letter
214
+ if not os.path.exists(MY_LOCAL_PDF):
215
+ print(f"Creating dummy PDF: {MY_LOCAL_PDF}")
216
+ c = canvas.Canvas(MY_LOCAL_PDF, pagesize=letter)
217
+ c.drawString(100, 750, "This is a test document for AWS Textract.")
218
+ c.drawString(100, 700, "It includes some text and a placeholder for a signature.")
219
+ c.drawString(100, 650, "Signed:")
220
+ # Draw a simple line/scribble for signature placeholder
221
+ c.line(150, 630, 250, 645)
222
+ c.line(250, 645, 300, 620)
223
+ c.save()
224
+ print("Dummy PDF created.")
225
+ except ImportError:
226
+ if not os.path.exists(MY_LOCAL_PDF):
227
+ print(f"Warning: reportlab not installed and '{MY_LOCAL_PDF}' not found. Cannot run example without an input PDF.")
228
+ exit() # Exit if no PDF available for the example
229
+ except Exception as e:
230
+ print(f"Error creating dummy PDF: {e}")
231
+ exit()
232
+
233
+
234
+ # --- Run the analysis ---
235
+ try:
236
+ output_json_path = analyze_pdf_with_textract(
237
+ local_pdf_path=MY_LOCAL_PDF,
238
+ s3_bucket_name=MY_S3_BUCKET,
239
+ s3_input_prefix=MY_S3_INPUT_PREFIX,
240
+ s3_output_prefix=MY_S3_OUTPUT_PREFIX,
241
+ local_output_dir=MY_LOCAL_OUTPUT_DIR,
242
+ aws_region=MY_AWS_REGION
243
+ )
244
+
245
+ if output_json_path:
246
+ print(f"\n--- Analysis Complete ---")
247
+ print(f"Textract output JSON saved to: {output_json_path}")
248
+
249
+ # Optional: Load and print some info from the JSON
250
+ with open(output_json_path, 'r') as f:
251
+ results = json.load(f)
252
+ print(f"Detected {results.get('DocumentMetadata', {}).get('Pages', 'N/A')} page(s).")
253
+ # Find signature blocks (Note: This is basic, real parsing might be more complex)
254
+ signature_blocks = [block for block in results.get('Blocks', []) if block.get('BlockType') == 'SIGNATURE']
255
+ print(f"Found {len(signature_blocks)} potential signature block(s).")
256
+ if signature_blocks:
257
+ print(f"First signature confidence: {signature_blocks[0].get('Confidence', 'N/A')}")
258
+
259
+
260
+ except FileNotFoundError as e:
261
+ print(f"\nError: Input file not found. {e}")
262
+ except Exception as e:
263
+ print(f"\nAn error occurred during the process: {e}")
264
+
265
+ import boto3
266
+ import time
267
+ import os
268
+
269
def download_textract_output(job_id, output_bucket, output_prefix, local_folder, poll_interval_seconds: int = 10, max_polling_attempts: int = 120):
    """
    Checks the status of a Textract job and downloads the output ZIP file if the job is complete.

    :param job_id: The Textract job ID.
    :param output_bucket: The S3 bucket where the output is stored.
    :param output_prefix: The prefix (folder path) in S3 where the output file is stored.
    :param local_folder: The local directory where the ZIP file should be saved.
    :param poll_interval_seconds: Seconds to wait between status checks.
    :param max_polling_attempts: Give up after this many checks. Bug fix: the
        original polled forever, hanging indefinitely on a stalled job.
    """
    textract_client = boto3.client('textract')
    s3_client = boto3.client('s3')

    # Check job status, bounded so a stuck job cannot loop forever
    for _ in range(max_polling_attempts):
        response = textract_client.get_document_analysis(JobId=job_id)
        status = response['JobStatus']

        if status == 'SUCCEEDED':
            print("Job completed successfully.")
            break
        elif status == 'FAILED':
            print("Job failed:", response.get("StatusMessage", "No error message provided."))
            return
        else:
            print(f"Job is still {status}, waiting...")
            time.sleep(poll_interval_seconds)  # Wait before checking again
    else:
        # Loop exhausted without SUCCEEDED/FAILED — give up rather than hang
        print(f"Job did not complete after {max_polling_attempts} status checks; giving up.")
        return

    # Find output ZIP file in S3
    # NOTE(review): assumes Textract wrote a single '<job_id>.zip' under output_prefix;
    # Textract normally writes numbered JSON parts under '<prefix>/<job_id>/' — confirm.
    output_file_key = f"{output_prefix}/{job_id}.zip"
    local_file_path = os.path.join(local_folder, f"{job_id}.zip")

    # Download file
    try:
        s3_client.download_file(output_bucket, output_file_key, local_file_path)
        print(f"Output file downloaded to: {local_file_path}")
    except Exception as e:
        print(f"Error downloading file: {e}")
306
+
307
+ # Example usage:
308
+ # download_textract_output("your-job-id", "your-output-bucket", "your-output-prefix", "/path/to/local/folder")