seanpedrickcase committed on
Commit
66e145d
·
1 Parent(s): 08a3ec3

Added features to the review dataframe to filter and exclude features based on text. Text should now appear consistently in review_df (for boxes that were not modified). The larger spaCy model has been returned to use. Upgraded Gradio.

Browse files
DocRedactApp_0.2.0.spec DELETED
@@ -1,66 +0,0 @@
1
- # -*- mode: python ; coding: utf-8 -*-
2
- from PyInstaller.utils.hooks import collect_data_files
3
- from PyInstaller.utils.hooks import collect_all
4
-
5
- datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
6
- binaries = []
7
- hiddenimports = ['gradio_image_annotation', 'pyarrow.vendored.version', 'pydicom.encoders', 'safehttpx', 'presidio_analyzer', 'presidio_anonymizer', 'presidio_image_redactor']
8
- datas += collect_data_files('gradio_client')
9
- datas += collect_data_files('gradio')
10
- datas += collect_data_files('gradio_image_annotation')
11
- tmp_ret = collect_all('gradio_image_annotation')
12
- datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
13
- tmp_ret = collect_all('safehttpx')
14
- datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
15
- tmp_ret = collect_all('presidio_analyzer')
16
- datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
17
- tmp_ret = collect_all('presidio_anonymizer')
18
- datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
19
- tmp_ret = collect_all('presidio_image_redactor')
20
- datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
21
-
22
-
23
- a = Analysis(
24
- ['app.py'],
25
- pathex=[],
26
- binaries=binaries,
27
- datas=datas,
28
- hiddenimports=hiddenimports,
29
- hookspath=['build_deps'],
30
- hooksconfig={},
31
- runtime_hooks=[],
32
- excludes=[],
33
- noarchive=False,
34
- optimize=0,
35
- module_collection_mode={
36
- 'gradio': 'py', # Collect gradio package as source .py files
37
- }
38
- )
39
- pyz = PYZ(a.pure)
40
-
41
- exe = EXE(
42
- pyz,
43
- a.scripts,
44
- [],
45
- exclude_binaries=True,
46
- name='DocRedactApp_0.2.0',
47
- debug=False,
48
- bootloader_ignore_signals=False,
49
- strip=False,
50
- upx=True,
51
- console=True,
52
- disable_windowed_traceback=False,
53
- argv_emulation=False,
54
- target_arch=None,
55
- codesign_identity=None,
56
- entitlements_file=None,
57
- )
58
- coll = COLLECT(
59
- exe,
60
- a.binaries,
61
- a.datas,
62
- strip=False,
63
- upx=True,
64
- upx_exclude=[],
65
- name='DocRedactApp_0.2.0',
66
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -14,7 +14,7 @@ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_pa
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
17
- from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe
18
  from tools.data_anonymise import anonymise_data_files
19
  from tools.auth import authenticate_user
20
  from tools.load_spacy_model_custom_recognisers import custom_entities
@@ -81,15 +81,22 @@ with app:
81
  first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False)
82
  second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False)
83
  do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False)
 
84
 
85
  prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False)
86
- document_cropboxes = gr.Dropdown(label = "document_cropboxes", value="", allow_custom_value=True,visible=False)
 
87
  images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
88
 
89
  output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
90
  output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False)
91
  text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False)
92
  log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False)
 
 
 
 
 
93
 
94
 
95
  # Logging state
@@ -115,6 +122,11 @@ with app:
115
  data_file_name_no_extension_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
116
  data_file_name_with_extension_textbox = gr.Textbox(label = "data_file_name_with_extension_textbox", value="", visible=False)
117
  data_file_name_textbox_list = gr.Dropdown(label = "data_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
 
 
 
 
 
118
 
119
  estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
120
  annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
@@ -131,17 +143,14 @@ with app:
131
 
132
  ## Settings page variables
133
  default_allow_list_file_name = "default_allow_list.csv"
134
- default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
135
- in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_allow_list_df", visible=False, type="pandas")
136
 
137
  default_deny_list_file_name = "default_deny_list.csv"
138
- default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
139
- in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
140
  in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
141
 
142
  fully_redacted_list_file_name = "default_fully_redacted_list.csv"
143
- fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
144
- in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_full_redacted_list_df", visible=False, type="pandas")
145
  in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
146
 
147
  # S3 settings for default allow list load
@@ -150,14 +159,12 @@ with app:
150
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
151
 
152
  # Base dataframe for recognisers that is not modified subsequent to load
153
- recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False, label="recogniser_entity_dataframe_base")
154
 
155
  # Duplicate page detection
156
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
157
  duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas")
158
 
159
-
160
-
161
  ###
162
  # UI DESIGN
163
  ###
@@ -178,7 +185,7 @@ with app:
178
  ###
179
  with gr.Tab("Redact PDFs/images"):
180
  with gr.Accordion("Redact document", open = True):
181
- in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
182
  # if RUN_AWS_FUNCTIONS == "1":
183
  in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
184
  pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
@@ -220,14 +227,19 @@ with app:
220
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
221
  with gr.Row():
222
  clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
223
- with gr.Row():
224
- annotation_last_page_button = gr.Button("Previous page", scale = 3)
225
- annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
226
- annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
227
- annotation_next_page_button = gr.Button("Next page", scale = 3)
 
 
 
 
 
228
 
229
  with gr.Row():
230
- with gr.Column(scale=3):
231
 
232
  zoom_str = str(annotator_zoom_number) + '%'
233
 
@@ -249,17 +261,25 @@ with app:
249
  interactive=False
250
  )
251
  with gr.Column(scale=1):
252
- #with gr.Row():
253
- recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
254
- recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=(2,"fixed"), type="pandas", label="Search results. Click to go to page")
255
-
256
- with gr.Row():
257
- annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
258
- annotate_current_page_bottom = gr.Number(value=1, label="Page (press enter to change)", precision=0, interactive=True, scale = 2)
259
- annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
260
- annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
261
-
262
-
 
 
 
 
 
 
 
 
263
 
264
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
265
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
@@ -306,9 +326,7 @@ with app:
306
  with gr.Tab(label="Identify duplicate pages"):
307
  with gr.Accordion("Identify duplicate pages to redact", open = True):
308
  in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
309
-
310
- find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary")
311
-
312
  duplicate_pages_out =gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
313
 
314
  ###
@@ -326,6 +344,11 @@ with app:
326
  with gr.Column():
327
  in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=file_input_height)
328
  in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
 
 
 
 
 
329
 
330
  with gr.Accordion("Select entity types to redact", open = True):
331
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
@@ -370,92 +393,106 @@ with app:
370
  ###
371
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
372
 
373
- document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
374
  success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
375
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
376
- success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
377
 
378
- # If the app has completed a batch of pages, it will run this until the end of all pages in the document
379
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
380
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
381
- success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
382
 
383
  # If a file has been completed, the function will continue onto the next document
384
- latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
385
- success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
 
 
386
 
387
  ###
388
  # REVIEW PDF REDACTIONS
389
  ###
390
 
391
  # Upload previous files for modifying redactions
392
- upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
393
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
394
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes], api_name="prepare_doc").\
395
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
396
 
397
  # Page controls at top
398
  annotate_current_page.submit(
399
- modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
400
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
401
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
402
 
403
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
404
- success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
405
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
406
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
407
 
408
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
409
- success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
410
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
411
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
412
 
413
  # Zoom in and out on annotator
414
- annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
415
  success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_true_bool], outputs=[annotator_zoom_number, annotate_current_page])
416
 
417
- annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
418
  success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
419
 
420
- annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
421
 
422
- clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
423
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
424
 
425
- annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
426
 
427
  # Page controls at bottom
428
  annotate_current_page_bottom.submit(
429
- modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
430
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
431
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
432
 
433
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
434
- success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
435
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
436
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
437
 
438
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
439
- success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
440
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
441
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
442
 
443
  # Review table controls
444
- recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
 
 
445
 
446
  recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
447
- success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
448
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
449
- success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
 
 
 
 
 
 
 
 
 
 
450
 
451
  # Convert review file to xfdf Adobe format
452
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
453
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes]).\
454
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes], outputs=[adobe_review_files_out])
455
 
456
  # Convert xfdf Adobe file back to review_file.csv
457
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
458
- success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes]).\
459
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
460
 
461
  ###
 
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
17
+ from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal
18
  from tools.data_anonymise import anonymise_data_files
19
  from tools.auth import authenticate_user
20
  from tools.load_spacy_model_custom_recognisers import custom_entities
 
81
  first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False)
82
  second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False)
83
  do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False)
84
+ save_pdf_state = gr.Checkbox(label="save_pdf_state", value=True, visible=False)
85
 
86
  prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False)
87
+ document_cropboxes = gr.Dropdown(label = "document_cropboxes", value="", allow_custom_value=True,visible=False)
88
+ page_sizes = gr.Dropdown(label = "page_sizes", value="", allow_custom_value=True, visible=False)
89
  images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
90
 
91
  output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
92
  output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False)
93
  text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False)
94
  log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False)
95
+
96
+ # Backup versions of these objects in case you make a mistake
97
+ backup_review_state = gr.Dataframe(visible=False)
98
+ backup_image_annotations_state = gr.State([])
99
+ backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
100
 
101
 
102
  # Logging state
 
122
  data_file_name_no_extension_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
123
  data_file_name_with_extension_textbox = gr.Textbox(label = "data_file_name_with_extension_textbox", value="", visible=False)
124
  data_file_name_textbox_list = gr.Dropdown(label = "data_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
125
+
126
+ # Constants just to use with the review dropdowns for filtering by various columns
127
+ label_name_const = gr.Textbox(label="label_name_const", value="label", visible=False)
128
+ text_name_const = gr.Textbox(label="text_name_const", value="text", visible=False)
129
+ page_name_const = gr.Textbox(label="page_name_const", value="page", visible=False)
130
 
131
  estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
132
  annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
 
143
 
144
  ## Settings page variables
145
  default_allow_list_file_name = "default_allow_list.csv"
146
+ default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
 
147
 
148
  default_deny_list_file_name = "default_deny_list.csv"
149
+ default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
 
150
  in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
151
 
152
  fully_redacted_list_file_name = "default_fully_redacted_list.csv"
153
+ fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
 
154
  in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
155
 
156
  # S3 settings for default allow list load
 
159
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
160
 
161
  # Base dataframe for recognisers that is not modified subsequent to load
162
+ recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"])
163
 
164
  # Duplicate page detection
165
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
166
  duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas")
167
 
 
 
168
  ###
169
  # UI DESIGN
170
  ###
 
185
  ###
186
  with gr.Tab("Redact PDFs/images"):
187
  with gr.Accordion("Redact document", open = True):
188
+ in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
189
  # if RUN_AWS_FUNCTIONS == "1":
190
  in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
191
  pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
 
227
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
228
  with gr.Row():
229
  clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
230
+
231
+ with gr.Row():
232
+ with gr.Column(scale=2):
233
+ with gr.Row(equal_height=True):
234
+ annotation_last_page_button = gr.Button("Previous page", scale = 4)
235
+ annotate_current_page = gr.Number(value=1, label="Current page", precision=0, scale = 2, min_width=50)
236
+ annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
237
+ annotation_next_page_button = gr.Button("Next page", scale = 4)
238
+ with gr.Column(scale=1):
239
+ blank_markdown_top = gr.Markdown(value="", label="")
240
 
241
  with gr.Row():
242
+ with gr.Column(scale=2):
243
 
244
  zoom_str = str(annotator_zoom_number) + '%'
245
 
 
261
  interactive=False
262
  )
263
  with gr.Column(scale=1):
264
+ with gr.Row():
265
+ recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
266
+ page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
267
+ text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
268
+ recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"])
269
+ with gr.Row():
270
+ reset_dropdowns_btn = gr.Button(value="Reset filters")
271
+ exclude_selected_btn = gr.Button(value="Exclude items in table from redactions")
272
+ undo_last_removal_btn = gr.Button(value="Undo last element removal")
273
+
274
+ with gr.Row():
275
+ with gr.Column(scale=2):
276
+ with gr.Row(equal_height=True):
277
+ annotation_last_page_button_bottom = gr.Button("Previous page", scale = 4)
278
+ annotate_current_page_bottom = gr.Number(value=1, label="Current page", precision=0, interactive=True, scale = 2, min_width=50)
279
+ annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
280
+ annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
281
+ with gr.Column(scale=1):
282
+ blank_markdown_bot = gr.Markdown(value="", label="")
283
 
284
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
285
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
 
326
  with gr.Tab(label="Identify duplicate pages"):
327
  with gr.Accordion("Identify duplicate pages to redact", open = True):
328
  in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
329
+ find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary")
 
 
330
  duplicate_pages_out =gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
331
 
332
  ###
 
344
  with gr.Column():
345
  in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=file_input_height)
346
  in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
347
+ with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists", open = False):
348
+ with gr.Row():
349
+ in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_allow_list_df", visible=True, type="pandas")
350
+ in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=True, type="pandas")
351
+ in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_full_redacted_list_df", visible=True, type="pandas")
352
 
353
  with gr.Accordion("Select entity types to redact", open = True):
354
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
 
393
  ###
394
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
395
 
396
+ document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
397
  success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
398
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes], api_name="redact_doc").\
399
+ success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
400
 
401
+ # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
402
+ # current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
403
+ # outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes]).\
404
+ # success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
405
 
406
  # If a file has been completed, the function will continue onto the next document
407
+ # latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
408
+ # outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes]).\
409
+ # success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
410
+ # success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
411
 
412
  ###
413
  # REVIEW PDF REDACTIONS
414
  ###
415
 
416
  # Upload previous files for modifying redactions
417
+ upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
418
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
419
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes], api_name="prepare_doc").\
420
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
421
 
422
  # Page controls at top
423
  annotate_current_page.submit(
424
+ modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
425
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
426
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
427
 
428
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
429
+ success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
430
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
431
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
432
 
433
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
434
+ success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
435
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
436
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
437
 
438
  # Zoom in and out on annotator
439
+ annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
440
  success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_true_bool], outputs=[annotator_zoom_number, annotate_current_page])
441
 
442
+ annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
443
  success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
444
 
445
# Redraw the annotator whenever the zoom level changes. Inputs and outputs mirror every other
# update_annotator call site: the unfiltered base dataframe goes in, and all nine outputs
# (including the text and page dropdowns) are listed so none of the returned values are dropped.
annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
446
 
447
+ clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page]).\
448
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
449
 
450
+ annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
451
 
452
  # Page controls at bottom
453
  annotate_current_page_bottom.submit(
454
+ modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
455
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
456
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
457
 
458
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
459
+ success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
460
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
461
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
462
 
463
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
464
+ success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
465
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
466
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
467
 
468
  # Review table controls
469
+ recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
470
+ page_entity_dropdown.select(update_entities_df_page, inputs=[page_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, text_entity_dropdown])
471
+ text_entity_dropdown.select(update_entities_df_text, inputs=[text_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, page_entity_dropdown])
472
 
473
  recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
474
+ success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
475
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
476
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
477
+
478
+ reset_dropdowns_btn.click(reset_dropdowns, outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
479
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
480
+
481
+ exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
482
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])#.\
483
+ #success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
484
+
485
+ undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base]).\
486
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
487
 
488
  # Convert review file to xfdf Adobe format
489
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
490
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes]).\
491
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes], outputs=[adobe_review_files_out])
492
 
493
  # Convert xfdf Adobe file back to review_file.csv
494
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
495
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes]).\
496
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
497
 
498
  ###
requirements.txt CHANGED
@@ -9,10 +9,10 @@ pikepdf==9.5.2
9
  pandas==2.2.3
10
  nltk==3.9.1
11
  scikit-learn==1.6.1
12
- spacy==3.8.3
13
- #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
14
- en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
- gradio==5.18.0
16
  boto3==1.36.26
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
 
9
  pandas==2.2.3
10
  nltk==3.9.1
11
  scikit-learn==1.6.1
12
+ spacy==3.8.4
13
+ en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
+ #en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
+ gradio==5.22.0
16
  boto3==1.36.26
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
tools/aws_textract.py CHANGED
@@ -2,7 +2,9 @@ import boto3
2
  #from PIL import Image
3
  from typing import List
4
  import io
5
- #import json
 
 
6
  import pikepdf
7
  import time
8
  # Example: converting this single page to an image
@@ -26,7 +28,7 @@ def extract_textract_metadata(response):
26
  #'NumberOfPages': number_of_pages
27
  })
28
 
29
- def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"]):
30
  '''
31
  Analyse page with AWS Textract
32
  '''
@@ -65,6 +67,11 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
65
  time.sleep(5)
66
  response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
67
 
 
 
 
 
 
68
  # Wrap the response with the page number in the desired format
69
  wrapped_response = {
70
  'page_no': page_no,
@@ -265,4 +272,80 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
265
 
266
  i += 1
267
 
268
- return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  #from PIL import Image
3
  from typing import List
4
  import io
5
+ import os
6
+ import json
7
+ from collections import defaultdict
8
  import pikepdf
9
  import time
10
  # Example: converting this single page to an image
 
28
  #'NumberOfPages': number_of_pages
29
  })
30
 
31
+ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"]):
32
  '''
33
  Analyse page with AWS Textract
34
  '''
 
67
  time.sleep(5)
68
  response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
69
 
70
+ # Add the 'Page' attribute to each block
71
+ if "Blocks" in response:
72
+ for block in response["Blocks"]:
73
+ block["Page"] = page_no # Inject the page number into each block
74
+
75
  # Wrap the response with the page number in the desired format
76
  wrapped_response = {
77
  'page_no': page_no,
 
272
 
273
  i += 1
274
 
275
+ return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
276
+
277
def load_and_convert_textract_json(textract_json_file_path, log_files_output_paths):
    """
    Load a saved Textract JSON results file and return it in the app's page-wise format.

    Args:
        textract_json_file_path: Path to the Textract JSON file on disk.
        log_files_output_paths: List of log/output file paths. When the file
            exists and is not already listed, its path is appended to this
            list in place.

    Returns:
        A tuple ``(textract_data, is_missing, log_files_output_paths)``.
        ``textract_data`` is the loaded (and, if necessary, converted) dict, or
        ``{}`` when nothing usable could be loaded. ``is_missing`` is ``True``
        when the file is absent, cannot be parsed, has an unrecognised
        structure, or cannot be converted; otherwise ``False``.
    """
    if not os.path.exists(textract_json_file_path):
        print("No existing Textract results file found.")
        return {}, True, log_files_output_paths

    print("Found existing Textract json results file.")

    # Track the results file alongside the other log files, without duplicates.
    if textract_json_file_path not in log_files_output_paths:
        log_files_output_paths.append(textract_json_file_path)

    try:
        with open(textract_json_file_path, 'r', encoding='utf-8') as json_file:
            textract_data = json.load(json_file)
    except json.JSONDecodeError:
        print("Error: Failed to parse Textract JSON file. Returning empty data.")
        return {}, True, log_files_output_paths

    # Only a JSON object can carry the "pages"/"Blocks" keys. Without this
    # guard a top-level null or number raises TypeError on the membership
    # test, and a top-level string containing "pages" is returned as data.
    is_json_object = isinstance(textract_data, dict)

    if is_json_object and "pages" in textract_data:
        print("JSON already in the new format. No changes needed.")
        return textract_data, False, log_files_output_paths

    if is_json_object and "Blocks" in textract_data:
        print("Need to convert Textract JSON to app format.")
        try:
            # restructure_textract_output lives in this same module, so it is
            # called directly rather than re-imported by package path.
            textract_data = restructure_textract_output(textract_data)
        except Exception as e:
            # Best-effort conversion: report the problem and fall back to empty data.
            print("Failed to convert JSON data to app format due to:", e)
            return {}, True, log_files_output_paths
        return textract_data, False, log_files_output_paths

    print("Invalid Textract JSON format: 'Blocks' missing.")
    print("textract data:", textract_data)
    return {}, True, log_files_output_paths
319
+
320
+
321
+
322
def restructure_textract_output(textract_output: dict):
    '''
    Reorganise Textract output that comes from the bulk Textract analysis option on AWS into the page-wise format used by this app.

    Blocks are grouped by their "Page" value (blocks without one are placed on
    page 1). The result has the form
    {"pages": [{"page_no": "<n>", "data": {"Blocks": [...]}}, ...]}, with pages
    sorted by page number and block order preserved within each page. The
    input's "DocumentMetadata" (or {} if absent) is attached to the first
    page's "data". An input with no blocks yields {"pages": []}.
    '''
    pages_by_number = {}

    for block in textract_output.get("Blocks", []):
        page_no = block.get("Page", 1)  # Default to 1 if not present

        # Create the page entry the first time this page number is seen
        if page_no not in pages_by_number:
            pages_by_number[page_no] = {"page_no": str(page_no), "data": {"Blocks": []}}

        pages_by_number[page_no]["data"]["Blocks"].append(block)

    # Convert the mapping to a list of pages ordered by page number
    structured_output = {
        "pages": [pages_by_number[page_no] for page_no in sorted(pages_by_number)]
    }

    # Keep the document-level metadata on the first page's data
    if structured_output["pages"]:
        structured_output["pages"][0]["data"]["DocumentMetadata"] = textract_output.get("DocumentMetadata", {})

    return structured_output
tools/file_conversion.py CHANGED
@@ -8,12 +8,16 @@ import json
8
  import pymupdf
9
  import pandas as pd
10
  import numpy as np
 
11
  from pymupdf import Rect
12
  from fitz import Page
13
  from tqdm import tqdm
14
  from gradio import Progress
15
  from typing import List, Optional
16
  from concurrent.futures import ThreadPoolExecutor, as_completed
 
 
 
17
 
18
  image_dpi = 300.0
19
  ImageFile.LOAD_TRUNCATED_IMAGES = True
@@ -53,9 +57,41 @@ def is_pdf(filename):
53
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
54
  print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
55
 
56
- import os
57
- from pdf2image import convert_from_path
58
- from PIL import Image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
61
  try:
@@ -75,38 +111,16 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
75
  image = image.convert("L")
76
  image.save(out_path, format="PNG")
77
 
78
- # Check file size and resize if necessary
79
- max_size = 4.5 * 1024 * 1024 # 5 MB in bytes # 5
80
- file_size = os.path.getsize(out_path)
81
-
82
- # Resize images if they are too big
83
- if file_size > max_size:
84
- # Start with the original image size
85
- width, height = image.size
86
-
87
- print(f"Image size before {width}x{height}, original file_size: {file_size}")
88
 
89
- while file_size > max_size:
90
- # Reduce the size by a factor (e.g., 50% of the current size)
91
- new_width = int(width * 0.5)
92
- new_height = int(height * 0.5)
93
- image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
94
-
95
- # Save the resized image
96
- image.save(out_path, format="PNG", optimize=True)
97
-
98
- # Update the file size
99
- file_size = os.path.getsize(out_path)
100
- print(f"Resized to {new_width}x{new_height}, new file_size: {file_size}")
101
-
102
- # Update the dimensions for the next iteration
103
- width, height = new_width, new_height
104
 
105
- return page_num, out_path
106
 
107
  except Exception as e:
108
  print(f"Error processing page {page_num + 1}: {e}")
109
- return page_num, None
110
 
111
  def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = image_dpi, num_threads: int = 8, output_dir: str = '/input'):
112
 
@@ -125,44 +139,49 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
125
  futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
126
 
127
  for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
128
- page_num, result = future.result()
129
  if result:
130
- results.append((page_num, result))
131
  else:
132
  print(f"Page {page_num + 1} failed to process.")
133
 
134
  # Sort results by page number
135
  results.sort(key=lambda x: x[0])
136
  images = [result[1] for result in results]
 
 
137
 
138
  print("PDF has been converted to images.")
139
- return images
140
-
141
-
142
 
143
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
144
  def process_file(file_path:str, prepare_for_review:bool=False):
145
  # Get the file extension
146
  file_extension = os.path.splitext(file_path)[1].lower()
147
-
148
  # Check if the file is an image type
149
  if file_extension in ['.jpg', '.jpeg', '.png']:
150
  print(f"{file_path} is an image file.")
151
  # Perform image processing here
152
  img_object = [file_path] #[Image.open(file_path)]
153
- # Load images from the file paths
 
 
 
154
 
155
  # Check if the file is a PDF
156
  elif file_extension == '.pdf':
157
  print(f"{file_path} is a PDF file. Converting to image set")
158
  # Run your function for processing PDF files here
159
- img_object = convert_pdf_to_images(file_path, prepare_for_review)
160
 
161
  else:
162
  print(f"{file_path} is not an image or PDF file.")
163
- img_object = ['']
 
 
164
 
165
- return img_object
166
 
167
  def get_input_file_names(file_input:List[str]):
168
  '''
@@ -351,6 +370,7 @@ def prepare_image_or_pdf(
351
  all_annotations_object:List = [],
352
  prepare_for_review:bool = False,
353
  in_fully_redacted_list:List[int]=[],
 
354
  progress: Progress = Progress(track_tqdm=True)
355
  ) -> tuple[List[str], List[str]]:
356
  """
@@ -369,7 +389,8 @@ def prepare_image_or_pdf(
369
  all_annotations_object(optional, List of annotation objects): All annotations for current document
370
  prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
371
  in_fully_redacted_list(optional, List of int): A list of pages to fully redact
372
- progress (optional, Progress): Progress tracker for the operation.
 
373
 
374
 
375
  Returns:
@@ -381,7 +402,8 @@ def prepare_image_or_pdf(
381
  original_cropboxes = [] # Store original CropBox values
382
 
383
  if isinstance(in_fully_redacted_list, pd.DataFrame):
384
- in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
 
385
 
386
  # If this is the first time around, set variables to 0/blank
387
  if first_loop_state==True:
@@ -433,7 +455,7 @@ def prepare_image_or_pdf(
433
  final_out_message = '\n'.join(out_message)
434
  else:
435
  final_out_message = out_message
436
- return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
437
 
438
  #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
439
 
@@ -475,13 +497,22 @@ def prepare_image_or_pdf(
475
  if is_pdf(file_path):
476
  pymupdf_doc = pymupdf.open(file_path)
477
 
478
- # Load cropbox dimensions to use later
479
-
480
- for page in pymupdf_doc:
481
- original_cropboxes.append(page.cropbox) # Save original CropBox
482
 
483
  converted_file_path = file_path
484
- image_file_paths = process_file(file_path, prepare_for_review)
 
 
 
 
 
 
 
 
 
 
 
 
485
 
486
  #Create base version of the annotation object that doesn't have any annotations in it
487
  if (not all_annotations_object) & (prepare_for_review == True):
@@ -503,14 +534,20 @@ def prepare_image_or_pdf(
503
 
504
  img = Image.open(file_path) # Open the image file
505
  rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
506
- page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
507
- page.insert_image(rect, filename=file_path) # Insert the image into the page
 
 
 
508
 
509
  file_path_str = str(file_path)
510
 
511
- image_file_paths = process_file(file_path_str, prepare_for_review)
512
 
513
  #print("image_file_paths:", image_file_paths)
 
 
 
514
 
515
  converted_file_path = output_folder + file_name_with_ext
516
 
@@ -520,7 +557,7 @@ def prepare_image_or_pdf(
520
 
521
  elif file_extension in ['.csv']:
522
  review_file_csv = read_file(file)
523
- all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths)
524
  json_from_csv = True
525
  print("Converted CSV review file to json")
526
 
@@ -537,13 +574,14 @@ def prepare_image_or_pdf(
537
  all_annotations_object = json.loads(file_path) # Use loads for string content
538
 
539
  # Assume it's a textract json
540
- elif (file_extension in ['.json']) & (prepare_for_review != True):
541
- # If the file loaded has end textract.json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
542
- json_contents = json.load(file_path)
543
- # Write the response to a JSON file in output folder
544
- out_folder = output_folder + file_path_without_ext + ".json"
545
- with open(out_folder, 'w') as json_file:
546
- json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
 
547
  continue
548
 
549
  # If you have an annotations object from the above code
@@ -600,16 +638,16 @@ def prepare_image_or_pdf(
600
  #print("all_annotations_object at end of json/csv load part:", all_annotations_object)
601
 
602
  # Get list of pages that are to be fully redacted and redact them
603
- if in_fully_redacted_list:
604
- print("Redacting whole pages")
605
 
606
- for i, image in enumerate(image_file_paths):
607
- page = pymupdf_doc.load_page(i)
608
- rect_height = page.rect.height
609
- rect_width = page.rect.width
610
- whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours = False, border = 5)
611
 
612
- all_annotations_object.append(whole_page_img_annotation_box)
613
 
614
  # Write the response to a JSON file in output folder
615
  out_folder = output_folder + file_path_without_ext + ".json"
@@ -645,7 +683,7 @@ def prepare_image_or_pdf(
645
 
646
  number_of_pages = len(image_file_paths)
647
 
648
- return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes
649
 
650
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
651
  file_path_without_ext = get_file_name_without_type(in_file_path)
@@ -655,7 +693,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
655
  # Convert annotated text pdf back to image to give genuine redactions
656
  print("Creating image version of redacted PDF to embed redactions.")
657
 
658
- pdf_text_image_paths = process_file(out_text_file_path[0])
659
  out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
660
  pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=image_dpi, save_all=True, append_images=pdf_text_image_paths[1:])
661
 
@@ -701,12 +739,13 @@ def join_values_within_threshold(df1, df2):
701
  print(final_df)
702
 
703
 
704
- def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame()) -> pd.DataFrame:
705
  '''
706
  Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
707
  '''
708
  # Flatten the data
709
  flattened_annotation_data = []
 
710
 
711
  if not isinstance(redaction_decision_output, pd.DataFrame):
712
  redaction_decision_output = pd.DataFrame()
@@ -739,54 +778,171 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
739
  flattened_annotation_data.append(data_to_add)
740
 
741
  # Convert to a DataFrame
742
- annotation_data_as_df = pd.DataFrame(flattened_annotation_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
743
 
744
  #print("redaction_decision_output:", redaction_decision_output)
745
- #print("annotation_data_as_df:", annotation_data_as_df)
746
 
747
  # Join on additional text data from decision output results if included, if text not already there
748
- if not redaction_decision_output.empty:
749
- #print("redaction_decision_output is not empty")
750
- #print("redaction_decision_output:", redaction_decision_output)
751
- #print("annotation_data_as_df:", annotation_data_as_df)
752
- redaction_decision_output['page'] = redaction_decision_output['page'].astype(str)
753
- annotation_data_as_df['page'] = annotation_data_as_df['page'].astype(str)
754
- redaction_decision_output = redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
755
 
756
- # Round to the closest number divisible by 5
757
- redaction_decision_output.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
758
 
759
- redaction_decision_output = redaction_decision_output.drop_duplicates(['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'])
 
 
 
 
 
760
 
761
- #annotation_data_as_df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
 
 
762
 
763
- annotation_data_as_df.loc[:, ['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
 
 
 
764
 
765
- annotation_data_as_df = annotation_data_as_df.merge(redaction_decision_output, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
 
766
 
767
- annotation_data_as_df = annotation_data_as_df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
768
 
769
- annotation_data_as_df = annotation_data_as_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
770
 
771
  # Ensure required columns exist, filling with blank if they don't
772
  for col in ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]:
773
- if col not in annotation_data_as_df.columns:
774
- annotation_data_as_df[col] = ''
775
 
776
- for col in ['xmin', 'xmax', 'ymin', 'ymax']:
777
- annotation_data_as_df[col] = np.floor(annotation_data_as_df[col])
778
 
779
- annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
 
780
 
781
- return annotation_data_as_df
782
 
783
- def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784
  '''
785
- Convert a review csv to a json file for use by the Gradio Annotation object
786
  '''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
787
  # Keep only necessary columns
788
  review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
789
 
 
 
 
790
  # Group the DataFrame by the 'image' column
791
  grouped_csv_pages = review_file_df.groupby('page')
792
 
@@ -795,6 +951,7 @@ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths:
795
 
796
  for n, pdf_image_path in enumerate(image_paths):
797
  reported_page_number = int(n + 1)
 
798
 
799
  if reported_page_number in review_file_df["page"].values:
800
 
@@ -802,6 +959,8 @@ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths:
802
  selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
803
  annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
804
 
 
 
805
  annotation = {
806
  "image": pdf_image_path,
807
  "boxes": annotation_boxes
 
8
  import pymupdf
9
  import pandas as pd
10
  import numpy as np
11
+ import shutil
12
  from pymupdf import Rect
13
  from fitz import Page
14
  from tqdm import tqdm
15
  from gradio import Progress
16
  from typing import List, Optional
17
  from concurrent.futures import ThreadPoolExecutor, as_completed
18
+ from pdf2image import convert_from_path
19
+ from PIL import Image
20
+ from scipy.spatial import cKDTree
21
 
22
  image_dpi = 300.0
23
  ImageFile.LOAD_TRUNCATED_IMAGES = True
 
57
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
58
  print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
59
 
60
def check_image_size_and_reduce(out_path:str, image:"Image.Image"):
    '''
    Check if the image file saved at out_path is above around 4.5mb, and reduce its size if necessary. 5mb is the maximum possible to submit to AWS Textract.

    The image is halved in both dimensions and re-saved to out_path (as an optimised PNG) until the file on disk is no larger than the limit, or until the image cannot be halved any further.

    Args:
        out_path (str): Path of the image file on disk. The file is overwritten in place if a resize is needed.
        image (Image.Image): The image object that was saved to out_path.

    Returns:
        tuple[int, int]: The (width, height) of the image as it is finally stored at out_path.
    '''

    # Keep a little headroom below the 5 MB limit
    max_size = 4.5 * 1024 * 1024
    file_size = os.path.getsize(out_path)

    width = image.width
    height = image.height

    if file_size > max_size:
        print(f"Image size before {width}x{height}, original file_size: {file_size}")

    # Halve repeatedly. The current dimensions are carried forward on every pass so that each
    # iteration really shrinks the image further; the dimension guard stops the loop once a
    # side has reached one pixel and no further reduction is possible.
    while file_size > max_size and width > 1 and height > 1:
        width = int(width * 0.5)
        height = int(height * 0.5)
        image = image.resize((width, height), Image.Resampling.LANCZOS)

        # Save the resized image over the original file
        image.save(out_path, format="PNG", optimize=True)

        # Update the file size for the next check
        file_size = os.path.getsize(out_path)
        print(f"Resized to {width}x{height}, new file_size: {file_size}")

    return width, height
95
 
96
  def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
97
  try:
 
111
  image = image.convert("L")
112
  image.save(out_path, format="PNG")
113
 
114
+ width, height = image.size
 
 
 
 
 
 
 
 
 
115
 
116
+ # Check if image size too large and reduce if necessary
117
+ width, height = check_image_size_and_reduce(out_path, image)
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
+ return page_num, out_path, width, height
120
 
121
  except Exception as e:
122
  print(f"Error processing page {page_num + 1}: {e}")
123
+ return page_num, "", width, height
124
 
125
  def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = image_dpi, num_threads: int = 8, output_dir: str = '/input'):
126
 
 
139
  futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
140
 
141
  for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
142
+ page_num, result, width, height = future.result()
143
  if result:
144
+ results.append((page_num, result, width, height))
145
  else:
146
  print(f"Page {page_num + 1} failed to process.")
147
 
148
  # Sort results by page number
149
  results.sort(key=lambda x: x[0])
150
  images = [result[1] for result in results]
151
+ widths = [result[2] for result in results]
152
+ heights = [result[3] for result in results]
153
 
154
  print("PDF has been converted to images.")
155
+ return images, widths, heights
 
 
156
 
157
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
158
  def process_file(file_path:str, prepare_for_review:bool=False):
159
  # Get the file extension
160
  file_extension = os.path.splitext(file_path)[1].lower()
161
+
162
  # Check if the file is an image type
163
  if file_extension in ['.jpg', '.jpeg', '.png']:
164
  print(f"{file_path} is an image file.")
165
  # Perform image processing here
166
  img_object = [file_path] #[Image.open(file_path)]
167
+
168
+ # Load images from the file paths. Test to see if it is bigger than 4.5 mb and reduct if needed (Textract limit is 5mb)
169
+ image = Image.open(file_path)
170
+ img_object, image_sizes_width, image_sizes_height = check_image_size_and_reduce(file_path, image)
171
 
172
  # Check if the file is a PDF
173
  elif file_extension == '.pdf':
174
  print(f"{file_path} is a PDF file. Converting to image set")
175
  # Run your function for processing PDF files here
176
+ img_object, image_sizes_width, image_sizes_height = convert_pdf_to_images(file_path, prepare_for_review)
177
 
178
  else:
179
  print(f"{file_path} is not an image or PDF file.")
180
+ img_object = []
181
+ image_sizes_width = []
182
+ image_sizes_height = []
183
 
184
+ return img_object, image_sizes_width, image_sizes_height
185
 
186
  def get_input_file_names(file_input:List[str]):
187
  '''
 
370
  all_annotations_object:List = [],
371
  prepare_for_review:bool = False,
372
  in_fully_redacted_list:List[int]=[],
373
+ output_folder:str=output_folder,
374
  progress: Progress = Progress(track_tqdm=True)
375
  ) -> tuple[List[str], List[str]]:
376
  """
 
389
  all_annotations_object(optional, List of annotation objects): All annotations for current document
390
  prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
391
  in_fully_redacted_list(optional, List of int): A list of pages to fully redact
392
+ output_folder (optional, str): The output folder for file save
393
+ progress (optional, Progress): Progress tracker for the operation
394
 
395
 
396
  Returns:
 
402
  original_cropboxes = [] # Store original CropBox values
403
 
404
  if isinstance(in_fully_redacted_list, pd.DataFrame):
405
+ if not in_fully_redacted_list.empty:
406
+ in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
407
 
408
  # If this is the first time around, set variables to 0/blank
409
  if first_loop_state==True:
 
455
  final_out_message = '\n'.join(out_message)
456
  else:
457
  final_out_message = out_message
458
+ return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes
459
 
460
  #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
461
 
 
497
  if is_pdf(file_path):
498
  pymupdf_doc = pymupdf.open(file_path)
499
 
500
+ # Load cropbox dimensions to use later
 
 
 
501
 
502
  converted_file_path = file_path
503
+ image_file_paths, image_sizes_width, image_sizes_height = process_file(file_path, prepare_for_review)
504
+ page_sizes = []
505
+
506
+ for i, page in enumerate(pymupdf_doc):
507
+ page_no = i
508
+ reported_page_no = i + 1
509
+
510
+ pymupdf_page = pymupdf_doc.load_page(page_no)
511
+ original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
512
+
513
+ # Create a page_sizes_object
514
+ out_page_image_sizes = {"page":reported_page_no, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
515
+ page_sizes.append(out_page_image_sizes)
516
 
517
  #Create base version of the annotation object that doesn't have any annotations in it
518
  if (not all_annotations_object) & (prepare_for_review == True):
 
534
 
535
  img = Image.open(file_path) # Open the image file
536
  rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
537
+ pymupdf_page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
538
+ pymupdf_page.insert_image(rect, filename=file_path) # Insert the image into the page
539
+ pymupdf_page = pymupdf_doc.load_page(0)
540
+
541
+ original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
542
 
543
  file_path_str = str(file_path)
544
 
545
+ image_file_paths, image_sizes_width, image_sizes_height = process_file(file_path_str, prepare_for_review)
546
 
547
  #print("image_file_paths:", image_file_paths)
548
+ # Create a page_sizes_object
549
+ out_page_image_sizes = {"page":1, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":original_cropboxes[-1].width, "cropbox_height":original_cropboxes[-1].height}
550
+ page_sizes.append(out_page_image_sizes)
551
 
552
  converted_file_path = output_folder + file_name_with_ext
553
 
 
557
 
558
  elif file_extension in ['.csv']:
559
  review_file_csv = read_file(file)
560
+ all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths, page_sizes)
561
  json_from_csv = True
562
  print("Converted CSV review file to json")
563
 
 
574
  all_annotations_object = json.loads(file_path) # Use loads for string content
575
 
576
  # Assume it's a textract json
577
+ elif (file_extension == '.json') and (prepare_for_review is not True):
578
+ # If the file ends with textract.json, assume it's a Textract response object.
579
+ # Copy it to the output folder so it can be used later.
580
+ out_folder = os.path.join(output_folder, file_path_without_ext + ".json")
581
+
582
+ # Use shutil to copy the file directly
583
+ shutil.copy2(file_path, out_folder) # Preserves metadata
584
+
585
  continue
586
 
587
  # If you have an annotations object from the above code
 
638
  #print("all_annotations_object at end of json/csv load part:", all_annotations_object)
639
 
640
  # Get list of pages that are to be fully redacted and redact them
641
+ # if not in_fully_redacted_list.empty:
642
+ # print("Redacting whole pages")
643
 
644
+ # for i, image in enumerate(image_file_paths):
645
+ # page = pymupdf_doc.load_page(i)
646
+ # rect_height = page.rect.height
647
+ # rect_width = page.rect.width
648
+ # whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours = False, border = 5)
649
 
650
+ # all_annotations_object.append(whole_page_img_annotation_box)
651
 
652
  # Write the response to a JSON file in output folder
653
  out_folder = output_folder + file_path_without_ext + ".json"
 
683
 
684
  number_of_pages = len(image_file_paths)
685
 
686
+ return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes
687
 
688
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
689
  file_path_without_ext = get_file_name_without_type(in_file_path)
 
693
  # Convert annotated text pdf back to image to give genuine redactions
694
  print("Creating image version of redacted PDF to embed redactions.")
695
 
696
+ pdf_text_image_paths, image_sizes_width, image_sizes_height = process_file(out_text_file_path[0])
697
  out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
698
  pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=image_dpi, save_all=True, append_images=pdf_text_image_paths[1:])
699
 
 
739
  print(final_df)
740
 
741
 
742
+ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame(), page_sizes:List[dict]=[]) -> pd.DataFrame:
743
  '''
744
  Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
745
  '''
746
  # Flatten the data
747
  flattened_annotation_data = []
748
+ page_sizes_df = pd.DataFrame()
749
 
750
  if not isinstance(redaction_decision_output, pd.DataFrame):
751
  redaction_decision_output = pd.DataFrame()
 
778
  flattened_annotation_data.append(data_to_add)
779
 
780
  # Convert to a DataFrame
781
+ review_file_df = pd.DataFrame(flattened_annotation_data)
782
+
783
+ if page_sizes:
784
+ page_sizes_df = pd.DataFrame(page_sizes)
785
+ page_sizes_df["page"] = page_sizes_df["page"].astype(int)
786
+
787
+ # Convert data to same coordinate system
788
+ # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
789
+ if "xmin" in review_file_df.columns:
790
+ if review_file_df["xmin"].max() >= 1 and review_file_df["xmax"].max() >= 1 and review_file_df["ymin"].max() >= 1 and review_file_df["ymax"].max() >= 1:
791
+ print("review file df has large coordinates")
792
+ review_file_df["page"] = review_file_df["page"].astype(int)
793
+
794
+ if "image_width" not in review_file_df.columns and not page_sizes_df.empty:
795
+ review_file_df = review_file_df.merge(page_sizes_df, on="page", how="left")
796
+
797
+ if "image_width" in review_file_df.columns:
798
+ print("Dividing coordinates in review file")
799
+ review_file_df["xmin"] = review_file_df["xmin"] / review_file_df["image_width"]
800
+ review_file_df["xmax"] = review_file_df["xmax"] / review_file_df["image_width"]
801
+ review_file_df["ymin"] = review_file_df["ymin"] / review_file_df["image_height"]
802
+ review_file_df["ymax"] = review_file_df["ymax"] / review_file_df["image_height"]
803
+
804
+ #print("review_file_df after coordinates divided:", review_file_df)
805
+
806
+ if not redaction_decision_output.empty:
807
+ # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
808
+ if redaction_decision_output["xmin"].max() >= 1 and redaction_decision_output["xmax"].max() >= 1 and redaction_decision_output["ymin"].max() >= 1 and redaction_decision_output["ymax"].max() >= 1:
809
+
810
+ redaction_decision_output["page"] = redaction_decision_output["page"].astype(int)
811
+
812
+ if "image_width" not in redaction_decision_output.columns and not page_sizes_df.empty:
813
+ redaction_decision_output = redaction_decision_output.merge(page_sizes_df, on="page", how="left")
814
+
815
+ if "image_width" in redaction_decision_output.columns:
816
+ redaction_decision_output["xmin"] = redaction_decision_output["xmin"] / redaction_decision_output["image_width"]
817
+ redaction_decision_output["xmax"] = redaction_decision_output["xmax"] / redaction_decision_output["image_width"]
818
+ redaction_decision_output["ymin"] = redaction_decision_output["ymin"] / redaction_decision_output["image_height"]
819
+ redaction_decision_output["ymax"] = redaction_decision_output["ymax"] / redaction_decision_output["image_height"]
820
+
821
+ #print("convert_review_json review_file_df before merges:", review_file_df[['xmin', 'ymin', 'xmax', 'ymax', 'label']])
822
+ #print("review_file_df[xmin]", review_file_df["xmin"])
823
 
824
  #print("redaction_decision_output:", redaction_decision_output)
825
+ #print("review_file_df:", review_file_df)
826
 
827
  # Join on additional text data from decision output results if included, if text not already there
828
+ if not redaction_decision_output.empty:
829
+ if not 'text' in redaction_decision_output.columns:
830
+ redaction_decision_output['text'] = ''
831
+
832
+ if not 'text' in review_file_df.columns:
833
+ review_file_df['text'] = ''
834
+
835
+ # Load DataFrames
836
+ df1 = review_file_df.copy()
837
+ df2 = redaction_decision_output.copy()
838
+
839
+ #print("review_file before tolerance merge:", review_file_df)
840
+ #print("redaction_decision_output before tolerance merge:", redaction_decision_output)
841
+
842
+ # Create a unique key based on coordinates and label for exact merge
843
+ merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
844
+ df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
845
+ df2['key'] = df2[merge_keys].astype(str).agg('_'.join, axis=1)
846
+
847
+ # Attempt exact merge first
848
+ #merged_df = df1.merge(df2[['key', 'text']], on='key', how='left')
849
+
850
+ # Attempt exact merge first, renaming df2['text'] to avoid suffixes
851
+ merged_df = df1.merge(df2[['key', 'text']], on='key', how='left', suffixes=('', '_duplicate'))
852
+
853
+ # If a match is found, keep that text; otherwise, keep the original df1 text
854
+ merged_df['text'] = merged_df['text'].combine_first(merged_df.pop('text_duplicate'))
855
 
856
+ #print("merged_df['text']:", merged_df['text'])
 
857
 
858
+ # Handle missing matches using a proximity-based approach
859
+ #if merged_df['text'].isnull().sum() > 0:
860
+ print("Attempting tolerance-based merge for text")
861
+ # Convert coordinates to numpy arrays for KDTree lookup
862
+ tree = cKDTree(df2[['xmin', 'ymin', 'xmax', 'ymax']].values)
863
+ query_coords = df1[['xmin', 'ymin', 'xmax', 'ymax']].values
864
 
865
+ # Find nearest neighbors within a reasonable tolerance (e.g., 1% of page)
866
+ tolerance = 0.01
867
+ distances, indices = tree.query(query_coords, distance_upper_bound=tolerance)
868
 
869
+ # Assign text values where matches are found
870
+ for i, (dist, idx) in enumerate(zip(distances, indices)):
871
+ if dist < tolerance and idx < len(df2):
872
+ merged_df.at[i, 'text'] = df2.iloc[idx]['text']
873
 
874
+ # Drop the temporary key column
875
+ merged_df.drop(columns=['key'], inplace=True)
876
 
877
+ review_file_df = merged_df
878
 
879
+ review_file_df = review_file_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
880
 
881
  # Ensure required columns exist, filling with blank if they don't
882
  for col in ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]:
883
+ if col not in review_file_df.columns:
884
+ review_file_df[col] = ''
885
 
886
+ #for col in ['xmin', 'xmax', 'ymin', 'ymax']:
887
+ # review_file_df[col] = np.floor(review_file_df[col])
888
 
889
+ # If colours are saved as list, convert to tuple
890
+ review_file_df["color"] = review_file_df["color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
891
 
892
+ # print("page_sizes:", page_sizes)
893
 
894
+ # Convert page sizes to relative values
895
+ # if page_sizes:
896
+ # print("Checking page sizes")
897
+
898
+ # page_sizes_df = pd.DataFrame(page_sizes)
899
+
900
+ # if "image_width" not in review_file_df.columns:
901
+ # review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
902
+
903
+ # # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
904
+ # if review_file_df["xmin"].max() > 1 and review_file_df["xmax"].max() > 1 and review_file_df["ymin"].max() > 1 and review_file_df["ymax"].max() > 1:
905
+ # print("Dividing coordinates by image width and height.")
906
+ # review_file_df["xmin"] = review_file_df["xmin"] / review_file_df["image_width"]
907
+ # review_file_df["xmax"] = review_file_df["xmax"] / review_file_df["image_width"]
908
+ # review_file_df["ymin"] = review_file_df["ymin"] / review_file_df["image_height"]
909
+ # review_file_df["ymax"] = review_file_df["ymax"] / review_file_df["image_height"]
910
+
911
+ review_file_df = review_file_df.sort_values(['page', 'ymin', 'xmin', 'label'])
912
+
913
+ review_file_df.to_csv(output_folder + "review_file_test.csv", index=None)
914
+
915
+ return review_file_df
916
+
917
+ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths: List[Image.Image], page_sizes:List[dict]=[]) -> List[dict]:
918
  '''
919
+ Convert a review csv to a json file for use by the Gradio Annotation object.
920
  '''
921
+
922
+ if page_sizes:
923
+
924
+ page_sizes_df = pd.DataFrame(page_sizes)
925
+
926
+ #print(page_sizes_df)
927
+
928
+ if "image_width" not in review_file_df.columns:
929
+ review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
930
+
931
+ #print("review_file_df in convert pandas df to review json function:", review_file_df[["xmin", "xmax", "ymin", "ymax"]])
932
+
933
+ # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
934
+ if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
935
+ review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
936
+ review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
937
+ review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
938
+ review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
939
+
940
  # Keep only necessary columns
941
  review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
942
 
943
+ # If colours are saved as list, convert to tuple
944
+ review_file_df.loc[:, "color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
945
+
946
  # Group the DataFrame by the 'image' column
947
  grouped_csv_pages = review_file_df.groupby('page')
948
 
 
951
 
952
  for n, pdf_image_path in enumerate(image_paths):
953
  reported_page_number = int(n + 1)
954
+
955
 
956
  if reported_page_number in review_file_df["page"].values:
957
 
 
959
  selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
960
  annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
961
 
962
+ # If all bbox coordinates are below 1, then they are relative. Need to convert based on image size.
963
+
964
  annotation = {
965
  "image": pdf_image_path,
966
  "boxes": annotation_boxes
tools/file_redaction.py CHANGED
@@ -30,7 +30,7 @@ from tools.file_conversion import process_file, image_dpi, convert_review_json_t
30
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
31
  from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image, prepare_image_or_pdf
33
- from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
34
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
35
 
36
  # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
@@ -100,7 +100,7 @@ def choose_and_run_redactor(file_paths:List[str],
100
  aws_access_key_textbox:str='',
101
  aws_secret_key_textbox:str='',
102
  annotate_max_pages:int=1,
103
- review_file_state=[],
104
  output_folder:str=output_folder,
105
  document_cropboxes:List=[],
106
  progress=gr.Progress(track_tqdm=True)):
@@ -139,7 +139,8 @@ def choose_and_run_redactor(file_paths:List[str],
139
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
140
  - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
141
  - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
142
- - annotate_max_pages (int, optional): Maximum page value for the annotation object
 
143
  - output_folder (str, optional): Output folder for results.
144
  - document_cropboxes (List, optional): List of document cropboxes for the PDF.
145
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
@@ -150,10 +151,29 @@ def choose_and_run_redactor(file_paths:List[str],
150
  tic = time.perf_counter()
151
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  # If there are no prepared PDF file paths, it is most likely that the prepare_image_or_pdf function has not been run. So do it here to get the outputs you need
154
  if not pymupdf_doc:
155
  print("Prepared PDF file not found, loading from file")
156
- out_message, prepared_pdf_file_paths, prepared_pdf_image_paths, annotate_max_pages, annotate_max_pages, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes = prepare_image_or_pdf(file_paths, in_redact_method, latest_file_completed, out_message, first_loop_state, annotate_max_pages, annotations_all_pages, document_cropboxes)
157
 
158
  #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
159
  review_out_file_paths = [prepared_pdf_file_paths[0]]
@@ -219,7 +239,7 @@ def choose_and_run_redactor(file_paths:List[str],
219
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
220
  print("Estimated total processing time:", str(estimate_total_processing_time))
221
 
222
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
223
 
224
  # If we have reached the last page, return message and outputs
225
  if current_loop_page >= number_of_pages:
@@ -235,7 +255,7 @@ def choose_and_run_redactor(file_paths:List[str],
235
 
236
  review_out_file_paths.extend(out_review_file_path)
237
 
238
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
239
 
240
  # Create allow list
241
  # If string, assume file path
@@ -306,17 +326,7 @@ def choose_and_run_redactor(file_paths:List[str],
306
 
307
  progress(0.5, desc="Redacting file")
308
 
309
- if isinstance(file_paths, str):
310
- file_paths_list = [os.path.abspath(file_paths)]
311
- file_paths_loop = file_paths_list
312
- elif isinstance(file_paths, dict):
313
- file_paths = file_paths["name"]
314
- file_paths_list = [os.path.abspath(file_paths)]
315
- file_paths_loop = file_paths_list
316
- else:
317
- file_paths_list = file_paths
318
- file_paths_loop = [file_paths_list[int(latest_file_completed)]]
319
-
320
  for file in file_paths_loop:
321
  if isinstance(file, str):
322
  file_path = file
@@ -336,7 +346,7 @@ def choose_and_run_redactor(file_paths:List[str],
336
  out_message = "No file selected"
337
  print(out_message)
338
  raise Exception(out_message)
339
-
340
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
341
 
342
  #Analyse and redact image-based pdf or image
@@ -346,7 +356,7 @@ def choose_and_run_redactor(file_paths:List[str],
346
 
347
  print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
348
 
349
- pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
350
  prepared_pdf_image_paths,
351
  language,
352
  chosen_redact_entities,
@@ -389,7 +399,7 @@ def choose_and_run_redactor(file_paths:List[str],
389
  # Analyse text-based pdf
390
  print('Redacting file as text-based PDF')
391
 
392
- pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number = redact_text_pdf(file_path,
393
  prepared_pdf_image_paths,language,
394
  chosen_redact_entities,
395
  chosen_redact_comprehend_entities,
@@ -416,6 +426,10 @@ def choose_and_run_redactor(file_paths:List[str],
416
  print(out_message)
417
  raise Exception(out_message)
418
 
 
 
 
 
419
  # If at last page, save to file
420
  if current_loop_page >= number_of_pages:
421
 
@@ -437,21 +451,61 @@ def choose_and_run_redactor(file_paths:List[str],
437
 
438
  out_file_paths.append(out_redacted_pdf_file_path)
439
 
440
- out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
441
-
442
  #logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
443
  #all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
444
  #log_files_output_paths.append(logs_output_file_name)
445
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
  all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
447
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
448
  out_file_paths.append(all_text_output_file_name)
449
 
450
- # Save the gradio_annotation_boxes to a review csv file
451
  try:
452
- review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
 
 
 
 
453
 
454
- out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
  review_df.to_csv(out_review_file_path, index=None)
456
  out_file_paths.append(out_review_file_path)
457
 
@@ -465,7 +519,7 @@ def choose_and_run_redactor(file_paths:List[str],
465
  #print("Saving annotations to JSON")
466
 
467
  except Exception as e:
468
- print("Could not save annotations to csv file:", e)
469
 
470
  # Make a combined message for the file
471
  if isinstance(out_message, list):
@@ -486,7 +540,6 @@ def choose_and_run_redactor(file_paths:List[str],
486
  time_taken = toc - tic
487
  estimated_time_taken_state = estimated_time_taken_state + time_taken
488
 
489
-
490
  # If textract requests made, write to logging file
491
  if all_request_metadata:
492
  all_request_metadata_str = '\n'.join(all_request_metadata).strip()
@@ -507,7 +560,7 @@ def choose_and_run_redactor(file_paths:List[str],
507
  out_file_paths = list(set(out_file_paths))
508
  review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
509
 
510
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
511
 
512
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
513
  '''
@@ -714,7 +767,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
714
  x2 = pymupdf_x2
715
 
716
  if hasattr(annot, 'text') and annot.text:
717
- img_annotation_box["text"] = annot.text
718
  else:
719
  img_annotation_box["text"] = ""
720
 
@@ -731,12 +784,12 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
731
  img_annotation_box["ymax"] = annot.top + annot.height
732
  img_annotation_box["color"] = (0,0,0)
733
  try:
734
- img_annotation_box["label"] = annot.entity_type
735
  except:
736
  img_annotation_box["label"] = "Redaction"
737
 
738
  if hasattr(annot, 'text') and annot.text:
739
- img_annotation_box["text"] = annot.text
740
  else:
741
  img_annotation_box["text"] = ""
742
 
@@ -771,7 +824,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
771
  img_annotation_box["label"] = str(annot["/T"])
772
 
773
  if hasattr(annot, 'Contents'):
774
- img_annotation_box["text"] = annot.Contents
775
  else:
776
  img_annotation_box["text"] = ""
777
  else:
@@ -797,7 +850,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
797
  }
798
 
799
  page.apply_redactions(images=0, graphics=0)
800
- page.set_cropbox(original_cropbox) # Set CropBox to original size
801
  page.clean_contents()
802
 
803
  return page, out_annotation_boxes
@@ -1006,9 +1059,9 @@ def redact_image_pdf(file_path:str,
1006
 
1007
 
1008
  if analysis_type == textract_option and textract_client == "":
1009
- print("Connection to AWS Textract service unsuccessful.")
1010
-
1011
- return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1012
 
1013
  tic = time.perf_counter()
1014
 
@@ -1016,7 +1069,7 @@ def redact_image_pdf(file_path:str,
1016
  out_message = "PDF does not exist as images. Converting pages to image"
1017
  print(out_message)
1018
 
1019
- prepared_pdf_file_paths = process_file(file_path)
1020
 
1021
  number_of_pages = len(prepared_pdf_file_paths)
1022
  print("Number of pages:", str(number_of_pages))
@@ -1033,21 +1086,10 @@ def redact_image_pdf(file_path:str,
1033
  # If running Textract, check if file already exists. If it does, load in existing data
1034
  if analysis_type == textract_option:
1035
 
1036
- json_file_path = output_folder + file_name + "_textract.json"
1037
 
1038
- if not os.path.exists(json_file_path):
1039
- print("No existing Textract results file found.")
1040
- textract_data = {}
1041
- else:
1042
- # Open the file and load the JSON data
1043
- no_textract_file = False
1044
- print("Found existing Textract json results file.")
1045
-
1046
- if json_file_path not in log_files_output_paths:
1047
- log_files_output_paths.append(json_file_path)
1048
-
1049
- with open(json_file_path, 'r') as json_file:
1050
- textract_data = json.load(json_file)
1051
 
1052
  ###
1053
  if current_loop_page == 0: page_loop_start = 0
@@ -1056,6 +1098,7 @@ def redact_image_pdf(file_path:str,
1056
  progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1057
 
1058
  original_cropboxes = []
 
1059
 
1060
  for page_no in progress_bar:
1061
 
@@ -1077,7 +1120,8 @@ def redact_image_pdf(file_path:str,
1077
  image_annotations = {"image": image, "boxes": []}
1078
  pymupdf_page = pymupdf_doc.load_page(page_no)
1079
 
1080
- original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
 
1081
  pymupdf_page.set_cropbox(pymupdf_page.mediabox) # Set CropBox to MediaBox
1082
 
1083
  if page_no >= page_min and page_no < page_max:
@@ -1085,10 +1129,15 @@ def redact_image_pdf(file_path:str,
1085
  #print("Image is in range of pages to redact")
1086
  if isinstance(image, str):
1087
  image = Image.open(image)
 
 
1088
 
1089
  # Need image size to convert textract OCR outputs to the correct sizes
1090
  page_width, page_height = image.size
1091
 
 
 
 
1092
  # Possibility to use different languages
1093
  if language == 'en': ocr_lang = 'eng'
1094
  else: ocr_lang = language
@@ -1110,8 +1159,8 @@ def redact_image_pdf(file_path:str,
1110
  try:
1111
  text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1112
 
1113
- if json_file_path not in log_files_output_paths:
1114
- log_files_output_paths.append(json_file_path)
1115
 
1116
  textract_data = {"pages":[text_blocks]}
1117
  except Exception as e:
@@ -1170,10 +1219,6 @@ def redact_image_pdf(file_path:str,
1170
  else:
1171
  redaction_bboxes = []
1172
 
1173
-
1174
- # if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
1175
- # elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
1176
-
1177
  # # Save decision making process
1178
  # bboxes_str = str(redaction_bboxes)
1179
  # with open(interim_results_file_path, "w") as f:
@@ -1282,17 +1327,17 @@ def redact_image_pdf(file_path:str,
1282
 
1283
  if analysis_type == textract_option:
1284
  # Write the updated existing textract data back to the JSON file
1285
- with open(json_file_path, 'w') as json_file:
1286
  json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1287
 
1288
- if json_file_path not in log_files_output_paths:
1289
- log_files_output_paths.append(json_file_path)
1290
 
1291
- print("At end of redact_image_pdf function where time over max.", json_file_path, "not found in log_files_output_paths, appended to list:", log_files_output_paths)
1292
 
1293
  current_loop_page += 1
1294
 
1295
- return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1296
 
1297
  if is_pdf(file_path) == False:
1298
  images.append(image)
@@ -1317,23 +1362,23 @@ def redact_image_pdf(file_path:str,
1317
 
1318
  if analysis_type == textract_option:
1319
  # Write the updated existing textract data back to the JSON file
1320
- with open(json_file_path, 'w') as json_file:
1321
  json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1322
 
1323
- if json_file_path not in log_files_output_paths:
1324
- log_files_output_paths.append(json_file_path)
1325
 
1326
- return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1327
 
1328
  if analysis_type == textract_option:
1329
  # Write the updated existing textract data back to the JSON file
1330
 
1331
- with open(json_file_path, 'w') as json_file:
1332
  json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1333
- if json_file_path not in log_files_output_paths:
1334
- log_files_output_paths.append(json_file_path)
1335
 
1336
- return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1337
 
1338
 
1339
  ###
@@ -1565,11 +1610,13 @@ def redact_text_pdf(
1565
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1566
  - progress: Progress tracking object
1567
  '''
 
 
1568
 
1569
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1570
  print("Connection to AWS Comprehend service not found.")
1571
 
1572
- return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
1573
 
1574
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1575
  if custom_recogniser_word_list:
@@ -1600,6 +1647,7 @@ def redact_text_pdf(
1600
  else: page_loop_start = current_loop_page
1601
 
1602
  original_cropboxes = []
 
1603
 
1604
  progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
1605
 
@@ -1620,7 +1668,7 @@ def redact_text_pdf(
1620
  image_annotations = {"image": image, "boxes": []}
1621
  pymupdf_page = pymupdf_doc.load_page(page_no)
1622
 
1623
- original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
1624
  pymupdf_page.set_cropbox(pymupdf_page.mediabox) # Set CropBox to MediaBox
1625
 
1626
  if page_min <= page_no < page_max:
@@ -1628,6 +1676,14 @@ def redact_text_pdf(
1628
  if isinstance(image, str):
1629
  image_path = image
1630
  image = Image.open(image_path)
 
 
 
 
 
 
 
 
1631
 
1632
  for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
1633
 
@@ -1749,7 +1805,7 @@ def redact_text_pdf(
1749
 
1750
  current_loop_page += 1
1751
 
1752
- return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
1753
 
1754
 
1755
  # Check if the image already exists in annotations_all_pages
@@ -1768,7 +1824,7 @@ def redact_text_pdf(
1768
  page_break_return = True
1769
  progress.close(_tqdm=progress_bar)
1770
 
1771
- return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
1772
 
1773
 
1774
- return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
 
30
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
31
  from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image, prepare_image_or_pdf
33
+ from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
34
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
35
 
36
  # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
 
100
  aws_access_key_textbox:str='',
101
  aws_secret_key_textbox:str='',
102
  annotate_max_pages:int=1,
103
+ review_file_state:pd.DataFrame=[],
104
  output_folder:str=output_folder,
105
  document_cropboxes:List=[],
106
  progress=gr.Progress(track_tqdm=True)):
 
139
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
140
  - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
141
  - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
142
+ - annotate_max_pages (int, optional): Maximum page value for the annotation object.
143
+ - review_file_state (pd.DataFrame, optional): Output review file dataframe.
144
  - output_folder (str, optional): Output folder for results.
145
  - document_cropboxes (List, optional): List of document cropboxes for the PDF.
146
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
 
151
  tic = time.perf_counter()
152
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
153
 
154
+ # Choose the correct file to prepare
155
+ if isinstance(file_paths, str):
156
+ file_paths_list = [os.path.abspath(file_paths)]
157
+ elif isinstance(file_paths, dict):
158
+ file_paths = file_paths["name"]
159
+ file_paths_list = [os.path.abspath(file_paths)]
160
+ else:
161
+ file_paths_list = file_paths
162
+
163
+ valid_extensions = {".pdf", ".jpg", ".jpeg", ".png"}
164
+ # Keep only files with a valid extension, then take the first match: only one file is currently redacted at a time
165
+ file_paths_list = [list([file for file in file_paths_list if os.path.splitext(file)[1].lower() in valid_extensions])[0]]
166
+
167
+ # If latest_file_completed is used, get the specific file
168
+ if not isinstance(file_paths, (str, dict)):
169
+ file_paths_loop = [file_paths_list[int(latest_file_completed)]] if len(file_paths_list) > latest_file_completed else []
170
+ else:
171
+ file_paths_loop = file_paths_list
172
+
173
  # If there are no prepared PDF file paths, it is most likely that the prepare_image_or_pdf function has not been run. So do it here to get the outputs you need
174
  if not pymupdf_doc:
175
  print("Prepared PDF file not found, loading from file")
176
+ out_message, prepared_pdf_file_paths, prepared_pdf_image_paths, annotate_max_pages, annotate_max_pages, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes = prepare_image_or_pdf(file_paths, in_redact_method, latest_file_completed, out_message, first_loop_state, annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list, output_folder)
177
 
178
  #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
179
  review_out_file_paths = [prepared_pdf_file_paths[0]]
 
239
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
240
  print("Estimated total processing time:", str(estimate_total_processing_time))
241
 
242
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes
243
 
244
  # If we have reached the last page, return message and outputs
245
  if current_loop_page >= number_of_pages:
 
255
 
256
  review_out_file_paths.extend(out_review_file_path)
257
 
258
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes
259
 
260
  # Create allow list
261
  # If string, assume file path
 
326
 
327
  progress(0.5, desc="Redacting file")
328
 
329
+ # Loop through the files, redacting one file at a time
 
 
 
 
 
 
 
 
 
 
330
  for file in file_paths_loop:
331
  if isinstance(file, str):
332
  file_path = file
 
346
  out_message = "No file selected"
347
  print(out_message)
348
  raise Exception(out_message)
349
+
350
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
351
 
352
  #Analyse and redact image-based pdf or image
 
356
 
357
  print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
358
 
359
+ pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, page_sizes = redact_image_pdf(file_path,
360
  prepared_pdf_image_paths,
361
  language,
362
  chosen_redact_entities,
 
399
  # Analyse text-based pdf
400
  print('Redacting file as text-based PDF')
401
 
402
+ pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, page_sizes = redact_text_pdf(file_path,
403
  prepared_pdf_image_paths,language,
404
  chosen_redact_entities,
405
  chosen_redact_comprehend_entities,
 
426
  print(out_message)
427
  raise Exception(out_message)
428
 
429
+ # Output file paths
430
+ out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
431
+ out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
432
+
433
  # If at last page, save to file
434
  if current_loop_page >= number_of_pages:
435
 
 
451
 
452
  out_file_paths.append(out_redacted_pdf_file_path)
453
 
 
 
454
  #logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
455
  #all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
456
  #log_files_output_paths.append(logs_output_file_name)
457
 
458
+ # Convert OCR result bounding boxes to relative values
459
+ #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
460
+ #print("page_sizes:", page_sizes)
461
+ #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
462
+
463
+ page_sizes_df = pd.DataFrame(page_sizes)
464
+
465
+ page_sizes_df["page"] = page_sizes_df["page"].astype(int)
466
+ all_line_level_ocr_results_df["page"] = all_line_level_ocr_results_df["page"].astype(int)
467
+
468
+ all_line_level_ocr_results_df = all_line_level_ocr_results_df.merge(page_sizes_df, on="page", how="left")
469
+
470
+ all_line_level_ocr_results_df["left"] = all_line_level_ocr_results_df["left"] / all_line_level_ocr_results_df["image_width"]
471
+ all_line_level_ocr_results_df["width"] = all_line_level_ocr_results_df["width"] / all_line_level_ocr_results_df["image_width"]
472
+ all_line_level_ocr_results_df["top"] = all_line_level_ocr_results_df["top"] / all_line_level_ocr_results_df["image_height"]
473
+ all_line_level_ocr_results_df["height"] = all_line_level_ocr_results_df["height"] / all_line_level_ocr_results_df["image_height"]
474
+
475
+ #print("all_line_level_ocr_results_df in choose and run redactor:", all_line_level_ocr_results_df)
476
+
477
  all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
478
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
479
  out_file_paths.append(all_text_output_file_name)
480
 
481
+ # Save the gradio_annotation_boxes to a review csv file
482
  try:
483
+ #print("annotations_all_pages before in choose and run redactor:", annotations_all_pages)
484
+ #print("all_decision_process_table before in choose and run redactor:", all_decision_process_table)
485
+ #print("page_sizes before in choose and run redactor:", page_sizes)
486
+
487
+ review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table, page_sizes)
488
 
489
+ #print("annotation_all_pages:", annotations_all_pages)
490
+ #print("all_decision_process_table after in choose and run redactor:", all_decision_process_table)
491
+ #print("review_df after in choose and run redactor:", review_df)
492
+
493
+ review_df["page"] = review_df["page"].astype(int)
494
+ if "image_height" not in review_df.columns:
495
+ review_df = review_df.merge(page_sizes_df, on="page", how="left")
496
+
497
+ # If the largest value in every coordinate column is one or more, these are absolute image coordinates - convert them back to relative coordinates
498
+ if review_df["xmin"].max() >= 1 and review_df["xmax"].max() >= 1 and review_df["ymin"].max() >= 1 and review_df["ymax"].max() >= 1:
499
+ review_df["xmin"] = review_df["xmin"] / review_df["image_width"]
500
+ review_df["xmax"] = review_df["xmax"] / review_df["image_width"]
501
+ review_df["ymin"] = review_df["ymin"] / review_df["image_height"]
502
+ review_df["ymax"] = review_df["ymax"] / review_df["image_height"]
503
+
504
+ # Don't need page sizes in outputs
505
+ review_df.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
506
+
507
+ #print("review_df:", review_df)
508
+
509
  review_df.to_csv(out_review_file_path, index=None)
510
  out_file_paths.append(out_review_file_path)
511
 
 
519
  #print("Saving annotations to JSON")
520
 
521
  except Exception as e:
522
+ print("Could not save annotations to csv file in choose and run redactor:", e)
523
 
524
  # Make a combined message for the file
525
  if isinstance(out_message, list):
 
540
  time_taken = toc - tic
541
  estimated_time_taken_state = estimated_time_taken_state + time_taken
542
 
 
543
  # If textract requests made, write to logging file
544
  if all_request_metadata:
545
  all_request_metadata_str = '\n'.join(all_request_metadata).strip()
 
560
  out_file_paths = list(set(out_file_paths))
561
  review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
562
 
563
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_df, page_sizes
564
 
565
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
566
  '''
 
767
  x2 = pymupdf_x2
768
 
769
  if hasattr(annot, 'text') and annot.text:
770
+ img_annotation_box["text"] = str(annot.text)
771
  else:
772
  img_annotation_box["text"] = ""
773
 
 
784
  img_annotation_box["ymax"] = annot.top + annot.height
785
  img_annotation_box["color"] = (0,0,0)
786
  try:
787
+ img_annotation_box["label"] = str(annot.entity_type)
788
  except:
789
  img_annotation_box["label"] = "Redaction"
790
 
791
  if hasattr(annot, 'text') and annot.text:
792
+ img_annotation_box["text"] = str(annot.text)
793
  else:
794
  img_annotation_box["text"] = ""
795
 
 
824
  img_annotation_box["label"] = str(annot["/T"])
825
 
826
  if hasattr(annot, 'Contents'):
827
+ img_annotation_box["text"] = str(annot.Contents)
828
  else:
829
  img_annotation_box["text"] = ""
830
  else:
 
850
  }
851
 
852
  page.apply_redactions(images=0, graphics=0)
853
+ page.set_cropbox = original_cropbox # Set CropBox to original size
854
  page.clean_contents()
855
 
856
  return page, out_annotation_boxes
 
1059
 
1060
 
1061
  if analysis_type == textract_option and textract_client == "":
1062
+ out_message = "Connection to AWS Textract service unsuccessful."
1063
+ print(out_message)
1064
+ raise Exception(out_message)
1065
 
1066
  tic = time.perf_counter()
1067
 
 
1069
  out_message = "PDF does not exist as images. Converting pages to image"
1070
  print(out_message)
1071
 
1072
+ prepared_pdf_file_paths, image_sizes = process_file(file_path)
1073
 
1074
  number_of_pages = len(prepared_pdf_file_paths)
1075
  print("Number of pages:", str(number_of_pages))
 
1086
  # If running Textract, check if file already exists. If it does, load in existing data
1087
  if analysis_type == textract_option:
1088
 
1089
+ textract_json_file_path = output_folder + file_name + "_textract.json"
1090
 
1091
+ # Usage
1092
+ textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths)
 
 
 
 
 
 
 
 
 
 
 
1093
 
1094
  ###
1095
  if current_loop_page == 0: page_loop_start = 0
 
1098
  progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1099
 
1100
  original_cropboxes = []
1101
+ page_sizes = []
1102
 
1103
  for page_no in progress_bar:
1104
 
 
1120
  image_annotations = {"image": image, "boxes": []}
1121
  pymupdf_page = pymupdf_doc.load_page(page_no)
1122
 
1123
+ # Set visible page size to biggest size (mediabox) for redaction
1124
+ original_cropboxes.append(pymupdf_page.cropbox.irect) # Save original CropBox
1125
  pymupdf_page.set_cropbox(pymupdf_page.mediabox) # Set CropBox to MediaBox
1126
 
1127
  if page_no >= page_min and page_no < page_max:
 
1129
  #print("Image is in range of pages to redact")
1130
  if isinstance(image, str):
1131
  image = Image.open(image)
1132
+ elif not isinstance(image, Image.Image):
1133
+ raise TypeError(f"Unexpected image type: {type(image)}") # Ensure image is valid
1134
 
1135
  # Need image size to convert textract OCR outputs to the correct sizes
1136
  page_width, page_height = image.size
1137
 
1138
+ out_page_image_sizes = {"page":(page_no+1), "image_width":page_width, "image_height":page_height, "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":original_cropboxes[-1].width, "cropbox_height":original_cropboxes[-1].height}
1139
+ page_sizes.append(out_page_image_sizes)
1140
+
1141
  # Possibility to use different languages
1142
  if language == 'en': ocr_lang = 'eng'
1143
  else: ocr_lang = language
 
1159
  try:
1160
  text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1161
 
1162
+ if textract_json_file_path not in log_files_output_paths:
1163
+ log_files_output_paths.append(textract_json_file_path)
1164
 
1165
  textract_data = {"pages":[text_blocks]}
1166
  except Exception as e:
 
1219
  else:
1220
  redaction_bboxes = []
1221
 
 
 
 
 
1222
  # # Save decision making process
1223
  # bboxes_str = str(redaction_bboxes)
1224
  # with open(interim_results_file_path, "w") as f:
 
1327
 
1328
  if analysis_type == textract_option:
1329
  # Write the updated existing textract data back to the JSON file
1330
+ with open(textract_json_file_path, 'w') as json_file:
1331
  json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1332
 
1333
+ if textract_json_file_path not in log_files_output_paths:
1334
+ log_files_output_paths.append(textract_json_file_path)
1335
 
1336
+ print("At end of redact_image_pdf function where time over max.", textract_json_file_path, "not found in log_files_output_paths, appended to list:", log_files_output_paths)
1337
 
1338
  current_loop_page += 1
1339
 
1340
+ return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, page_sizes
1341
 
1342
  if is_pdf(file_path) == False:
1343
  images.append(image)
 
1362
 
1363
  if analysis_type == textract_option:
1364
  # Write the updated existing textract data back to the JSON file
1365
+ with open(textract_json_file_path, 'w') as json_file:
1366
  json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1367
 
1368
+ if textract_json_file_path not in log_files_output_paths:
1369
+ log_files_output_paths.append(textract_json_file_path)
1370
 
1371
+ return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, page_sizes
1372
 
1373
  if analysis_type == textract_option:
1374
  # Write the updated existing textract data back to the JSON file
1375
 
1376
+ with open(textract_json_file_path, 'w') as json_file:
1377
  json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1378
+ if textract_json_file_path not in log_files_output_paths:
1379
+ log_files_output_paths.append(textract_json_file_path)
1380
 
1381
+ return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, page_sizes
1382
 
1383
 
1384
  ###
 
1610
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1611
  - progress: Progress tracking object
1612
  '''
1613
+ page_sizes = []
1614
+ out_page_image_sizes = {}
1615
 
1616
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1617
  print("Connection to AWS Comprehend service not found.")
1618
 
1619
+ return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, page_sizes
1620
 
1621
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1622
  if custom_recogniser_word_list:
 
1647
  else: page_loop_start = current_loop_page
1648
 
1649
  original_cropboxes = []
1650
+ page_sizes = []
1651
 
1652
  progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
1653
 
 
1668
  image_annotations = {"image": image, "boxes": []}
1669
  pymupdf_page = pymupdf_doc.load_page(page_no)
1670
 
1671
+ original_cropboxes.append(pymupdf_page.cropbox.irect) # Save original CropBox
1672
  pymupdf_page.set_cropbox(pymupdf_page.mediabox) # Set CropBox to MediaBox
1673
 
1674
  if page_min <= page_no < page_max:
 
1676
  if isinstance(image, str):
1677
  image_path = image
1678
  image = Image.open(image_path)
1679
+ elif not isinstance(image, Image.Image):
1680
+ raise TypeError(f"Unexpected image type: {type(image)}") # Ensure image is valid
1681
+
1682
+ # Need image size to convert textract OCR outputs to the correct sizes
1683
+ page_width, page_height = image.size
1684
+
1685
+ out_page_image_sizes = {"page":(page_no+1), "image_width":page_width, "image_height":page_height, "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":original_cropboxes[-1].width, "cropbox_height":original_cropboxes[-1].height}
1686
+ page_sizes.append(out_page_image_sizes)
1687
 
1688
  for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
1689
 
 
1805
 
1806
  current_loop_page += 1
1807
 
1808
+ return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, page_sizes
1809
 
1810
 
1811
  # Check if the image already exists in annotations_all_pages
 
1824
  page_break_return = True
1825
  progress.close(_tqdm=progress_bar)
1826
 
1827
+ return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, page_sizes
1828
 
1829
 
1830
+ return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, page_sizes
tools/helper_functions.py CHANGED
@@ -34,7 +34,7 @@ aws_pii_detector = "AWS Comprehend"
34
  output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
35
  print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
36
 
37
- session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'True')
38
  print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
39
 
40
  input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
@@ -60,10 +60,10 @@ def reset_state_vars():
60
  show_share_button=False,
61
  show_remove_button=False,
62
  interactive=False
63
- ), [], [], [], pd.DataFrame(), pd.DataFrame(), []
64
 
65
  def reset_review_vars():
66
- return [], pd.DataFrame(), pd.DataFrame()
67
 
68
  def load_in_default_allow_list(allow_list_file_path):
69
  if isinstance(allow_list_file_path, str):
 
34
  output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
35
  print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
36
 
37
+ session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
38
  print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
39
 
40
  input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
 
60
  show_share_button=False,
61
  show_remove_button=False,
62
  interactive=False
63
+ ), [], [], pd.DataFrame(), pd.DataFrame(), []
64
 
65
def reset_review_vars():
    '''
    Return a pair of new, empty pandas DataFrames.

    Each call builds two separate DataFrame objects, so the values are never shared between calls.
    '''
    first_empty = pd.DataFrame()
    second_empty = pd.DataFrame()
    return first_empty, second_empty
67
 
68
  def load_in_default_allow_list(allow_list_file_path):
69
  if isinstance(allow_list_file_path, str):
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -11,14 +11,14 @@ import Levenshtein
11
  import re
12
  import gradio as gr
13
 
14
- model_name = "en_core_web_sm" #"en_core_web_trf"
15
  score_threshold = 0.001
16
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
17
 
18
  #Load spacy model
19
  try:
20
- import en_core_web_sm
21
- nlp = en_core_web_sm.load()
22
  print("Successfully imported spaCy model")
23
 
24
  except:
 
11
  import re
12
  import gradio as gr
13
 
14
+ model_name = "en_core_web_lg" #"en_core_web_sm" #"en_core_web_trf"
15
  score_threshold = 0.001
16
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
17
 
18
  #Load spacy model
19
  try:
20
+ import en_core_web_lg #en_core_web_sm
21
+ nlp = en_core_web_lg.load() #en_core_web_sm.load()
22
  print("Successfully imported spaCy model")
23
 
24
  except:
tools/redaction_review.py CHANGED
@@ -7,7 +7,7 @@ import uuid
7
  from typing import List
8
  from gradio_image_annotation import image_annotator
9
  from gradio_image_annotation.image_annotator import AnnotatedImageData
10
- from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
11
  from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
12
  from tools.file_redaction import redact_page_with_pymupdf
13
  import json
@@ -84,56 +84,146 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
84
 
85
  return result
86
 
87
- def get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  recogniser_entities_list = ["Redaction"]
89
- recogniser_entities_drop = gr.Dropdown(value="", choices=[""], allow_custom_value=True, interactive=True)
90
- recogniser_dataframe_out = recogniser_dataframe_gr
91
 
92
  try:
93
- review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
94
- recogniser_entities = review_dataframe["label"].unique().tolist()
95
- recogniser_entities.append("ALL")
96
- recogniser_entities_for_drop = sorted(recogniser_entities)
 
 
 
 
 
 
97
 
 
 
98
 
99
- recogniser_dataframe_out = gr.Dataframe(review_dataframe)
100
- recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_for_drop[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
101
 
102
- recogniser_entities_list = [entity for entity in recogniser_entities_for_drop if entity != 'Redaction' and entity != 'ALL'] # Remove any existing 'Redaction'
103
- recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
104
 
105
  except Exception as e:
106
  print("Could not extract recogniser information:", e)
107
- recogniser_dataframe_out = recogniser_dataframe_gr
108
- recogniser_entities_drop = gr.Dropdown(value="", choices=[""], allow_custom_value=True, interactive=True)
 
109
  recogniser_entities_list = ["Redaction"]
 
 
 
 
110
 
111
- return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list
112
 
113
- def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
 
 
114
  '''
115
- Update a gradio_image_annotation object with new annotation data
116
- '''
117
  recogniser_entities_list = ["Redaction"]
118
  recogniser_dataframe_out = pd.DataFrame()
119
 
120
- if recogniser_dataframe_gr.empty:
121
- recogniser_dataframe_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list = get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr)
122
- elif recogniser_dataframe_gr.iloc[0,0] == "":
123
- recogniser_dataframe_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list = get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr)
124
  else:
125
- review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
126
- recogniser_dataframe_out = gr.Dataframe(review_dataframe)
127
- recogniser_entities_list = recogniser_dataframe_gr["label"].unique().tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
- recogniser_entities_list = sorted(recogniser_entities_list)
130
- recogniser_entities_list = [entity for entity in recogniser_entities_list if entity != 'Redaction'] # Remove any existing 'Redaction'
131
- recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  zoom_str = str(zoom) + '%'
135
  recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
136
 
 
 
 
 
137
  if not image_annotator_object:
138
  page_num_reported = 1
139
 
@@ -156,9 +246,9 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
156
  handles_cursor=True,
157
  interactive=True
158
  )
159
- number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
160
 
161
- return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
162
 
163
  #print("page_num at start of update_annotator function:", page_num)
164
 
@@ -181,9 +271,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
181
  page_num_reported = page_max_reported
182
 
183
  image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
184
-
185
-
186
-
187
  out_image_annotator = image_annotator(
188
  value = image_annotator_object[page_num_reported - 1],
189
  boxes_alpha=0.1,
@@ -204,11 +292,22 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
204
  interactive=True
205
  )
206
 
207
- number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
208
-
209
- return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
210
-
211
- def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True),recogniser_dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), clear_all:bool=False):
 
 
 
 
 
 
 
 
 
 
 
212
  '''
213
  Overwrite current image annotations with modifications
214
  '''
@@ -216,43 +315,30 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
216
  if not current_page:
217
  current_page = 1
218
 
219
- #If no previous page or is 0, i.e. first time run, then rewrite current page
220
- #if not previous_page:
221
- # previous_page = current_page
222
-
223
- #print("image_annotated:", image_annotated)
224
 
225
- image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
226
 
227
  if clear_all == False:
228
- all_image_annotations[previous_page - 1] = image_annotated
229
  else:
230
  all_image_annotations[previous_page - 1]["boxes"] = []
231
 
232
- #print("all_image_annotations:", all_image_annotations)
233
-
234
- # Rewrite all_image_annotations search dataframe with latest updates
235
- try:
236
- review_dataframe = convert_review_json_to_pandas_df(all_image_annotations)[["page", "label"]]
237
- #print("review_dataframe['label']", review_dataframe["label"])
238
- recogniser_entities = review_dataframe["label"].unique().tolist()
239
- recogniser_entities.append("ALL")
240
- recogniser_entities = sorted(recogniser_entities)
241
-
242
- recogniser_dataframe_out = gr.Dataframe(review_dataframe)
243
- #recogniser_dataframe_gr = gr.Dataframe(review_dataframe)
244
- recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_drop, choices=recogniser_entities, allow_custom_value=True, interactive=True)
245
- except Exception as e:
246
- print("Could not extract recogniser information:", e)
247
- recogniser_dataframe_out = recogniser_dataframe
248
-
249
- return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
250
-
251
- def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, output_folder:str = output_folder, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
252
  '''
253
  Apply modified redactions to a pymupdf and export review files
254
  '''
255
- #print("all_image_annotations:", all_image_annotations)
256
 
257
  output_files = []
258
  output_log_files = []
@@ -260,11 +346,11 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
260
 
261
  #print("File paths in apply_redactions:", file_paths)
262
 
263
- image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
264
 
265
- all_image_annotations[current_page - 1] = image_annotated
266
 
267
- if not image_annotated:
268
  print("No image annotations found")
269
  return doc, all_image_annotations
270
 
@@ -287,7 +373,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
287
 
288
  draw = ImageDraw.Draw(image)
289
 
290
- for img_annotation_box in image_annotated['boxes']:
291
  coords = [img_annotation_box["xmin"],
292
  img_annotation_box["ymin"],
293
  img_annotation_box["xmax"],
@@ -318,6 +404,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
318
  output_files.append(orig_pdf_file_path)
319
 
320
  number_of_pages = pdf_doc.page_count
 
321
 
322
  print("Saving pages to file.")
323
 
@@ -340,8 +427,17 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
340
  elif isinstance(image_loc, str):
341
  image = Image.open(image_loc)
342
 
 
 
 
 
343
  pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
344
- pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
 
 
 
 
 
345
 
346
  else:
347
  print("File type not recognised.")
@@ -370,31 +466,140 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
370
  # output_log_files.append(out_annotation_file_path)
371
 
372
  #print("Saving annotations to CSV review file")
373
-
374
- #print("review_file_state:", review_file_state)
 
375
 
376
  # Convert json to csv and also save this
377
- review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
378
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
 
 
379
  review_df.to_csv(out_review_file_file_path, index=None)
380
  output_files.append(out_review_file_file_path)
381
 
382
  except Exception as e:
383
- print("Could not save annotations to csv file:", e)
384
 
385
  return doc, all_image_annotations, output_files, output_log_files
386
 
387
def get_boxes_json(annotations:AnnotatedImageData):
    '''
    Return the value stored under the "boxes" key of an annotation object.
    '''
    boxes = annotations["boxes"]
    return boxes
389
 
390
- def update_entities_df(choice:str, df:pd.DataFrame):
391
- if choice=="ALL":
392
- return df
393
- else:
394
- return df.loc[df["label"]==choice,:]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
 
396
  def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
 
 
397
  row_value_page = evt.row_value[0] # This is the page number value
 
 
 
 
 
398
  return row_value_page
399
 
400
  def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
@@ -454,7 +659,7 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
454
 
455
  # Load cropbox sizes
456
  if document_cropboxes:
457
- print("Document cropboxes:", document_cropboxes)
458
 
459
  # Extract numbers safely using regex
460
  match = re.findall(r"[-+]?\d*\.\d+|\d+", document_cropboxes[page_python_format])
 
7
  from typing import List
8
  from gradio_image_annotation import image_annotator
9
  from gradio_image_annotation.image_annotator import AnnotatedImageData
10
+ from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, convert_pandas_df_to_review_json, CUSTOM_BOX_COLOUR
11
  from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
12
  from tools.file_redaction import redact_page_with_pymupdf
13
  import json
 
84
 
85
  return result
86
 
87
def update_dropdown_list_based_on_dataframe(df:pd.DataFrame, column:str) -> List[str]:
    '''
    Build the choice list for a dropdown from the unique values of one dataframe column.

    Args:
        df: Dataframe holding the column to summarise.
        column: Name of the column whose unique values become the choices.

    Returns:
        A list of strings starting with "ALL", followed by the unique values of the
        column converted to str. Numeric columns are ordered by value (so page 2 comes
        before page 10); all other columns are ordered alphabetically as strings.

    Raises:
        KeyError: If `column` is not present in `df`.
    '''
    values = df[column]

    if pd.api.types.is_numeric_dtype(values):
        # Sort before converting to text, otherwise "10" would be placed ahead of "2".
        # Missing values are kept (sorted last) so they still appear as a choice.
        entities_for_drop = values.drop_duplicates().sort_values().astype(str).tolist()
    else:
        entities_for_drop = sorted(values.astype(str).unique().tolist())

    entities_for_drop.insert(0, "ALL")

    return entities_for_drop
97
+
98
def get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object:AnnotatedImageData,
                                                    recogniser_dataframe_modified:pd.DataFrame,
                                                    recogniser_dropdown_value:str,
                                                    text_dropdown_value:str,
                                                    page_dropdown_value:str,
                                                    review_df:pd.DataFrame=None,
                                                    page_sizes:List[str]=None):
    '''
    Create a filtered recogniser dataframe and associated dropdowns based on current information in the image annotator and review data frame.

    Args:
        image_annotator_object: Annotation data passed to convert_review_json_to_pandas_df.
        recogniser_dataframe_modified: Existing recogniser dataframe, used as the fallback if conversion fails.
        recogniser_dropdown_value: Value to show in the label dropdown.
        text_dropdown_value: Value to show in the text dropdown.
        page_dropdown_value: Value to show in the page dropdown.
        review_df: Review dataframe forwarded to the converter. Defaults to an empty list.
        page_sizes: Page size records forwarded to the converter. Defaults to an empty list.

    Returns:
        A tuple of (recogniser dataframe, the same recogniser dataframe again, label dropdown,
        list of label choices for new redaction boxes, text dropdown, page dropdown).
    '''
    # Fresh lists per call rather than shared mutable defaults; the converter sees the same values as before
    if review_df is None:
        review_df = []
    if page_sizes is None:
        page_sizes = []

    display_columns = ["page", "label", "text"]

    try:
        review_dataframe = convert_review_json_to_pandas_df(image_annotator_object, review_df, page_sizes)

        print("in get_filtered_recogniser_dataframe_and_dropdowns, recogniser_dropdown_value:", recogniser_dropdown_value)

        recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
        recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)

        # This is the choice list for entities when creating a new redaction box
        recogniser_entities_list = [entity for entity in recogniser_entities_for_drop if entity != 'Redaction' and entity != 'ALL'] # Remove any existing 'Redaction'
        recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list

        text_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "text")
        text_entities_drop = gr.Dropdown(value=text_dropdown_value, choices=text_entities_for_drop, allow_custom_value=True, interactive=True)

        page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
        page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)

        recogniser_dataframe_out = gr.Dataframe(review_dataframe[display_columns], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"])

    except Exception as e:
        print("Could not extract recogniser information:", e)

        # reindex (rather than [[...]]) so a frame that lacks some of the columns
        # does not raise a second error from inside this handler
        recogniser_dataframe_out = recogniser_dataframe_modified.reindex(columns=display_columns)

        recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=recogniser_dataframe_out["label"].astype(str).unique().tolist(), allow_custom_value=True, interactive=True)
        recogniser_entities_list = ["Redaction"]
        text_entities_drop = gr.Dropdown(value=text_dropdown_value, choices=recogniser_dataframe_out["text"].astype(str).unique().tolist(), allow_custom_value=True, interactive=True)
        page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=recogniser_dataframe_out["page"].astype(str).unique().tolist(), allow_custom_value=True, interactive=True)

    return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
142
 
 
143
 
144
def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, recogniser_dataframe_modified:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=None, page_sizes:list[str]=None):
    '''
    Update recogniser dataframe information that appears alongside the pdf pages on the review screen.

    If the incoming recogniser dataframe is empty, or its first cell is an empty string, everything is
    rebuilt from the annotator object via get_filtered_recogniser_dataframe_and_dropdowns. Otherwise the
    existing dataframe is filtered with update_entities_df_recogniser_entities and the label dropdown
    and label choice list are derived from it.

    Args:
        image_annotator_object: Annotation data used when the dataframe has to be rebuilt.
        recogniser_dataframe_modified: Current recogniser dataframe.
        recogniser_entities_dropdown_value: Selected value of the label dropdown.
        text_dropdown_value: Selected value of the text dropdown.
        page_dropdown_value: Selected value of the page dropdown.
        review_df: Review dataframe forwarded on rebuild. Defaults to an empty list.
        page_sizes: Page size records forwarded on rebuild. Defaults to an empty list.

    Returns:
        A tuple of (label choices for new redaction boxes, recogniser dataframe for display,
        recogniser dataframe base, label dropdown, text dropdown, page dropdown).
    '''
    # Fresh lists per call rather than shared mutable defaults
    if review_df is None:
        review_df = []
    if page_sizes is None:
        page_sizes = []

    # `or` short-circuits, so iloc[0,0] is never evaluated on an empty frame
    needs_rebuild = recogniser_dataframe_modified.empty or recogniser_dataframe_modified.iloc[0,0] == ""

    if needs_rebuild:
        # The third returned item is the label dropdown component; it must be bound to
        # recogniser_entities_drop in every case so that the return statement below can use it
        recogniser_dataframe_modified, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop = get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
    else:
        print("recogniser dataframe is not empty")
        review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_modified, page_dropdown_value, text_dropdown_value)
        recogniser_dataframe_out = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"])

        recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_modified, "label")
        recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)

        recogniser_entities_list_base = recogniser_dataframe_modified["label"].astype(str).unique().tolist()

        # Recogniser entities list is the list of choices that appear when you make a new redaction box
        recogniser_entities_list = [entity for entity in recogniser_entities_list_base if entity != 'Redaction']
        recogniser_entities_list.insert(0, 'Redaction')

    return recogniser_entities_list, recogniser_dataframe_out, recogniser_dataframe_modified, recogniser_entities_drop, text_entities_drop, page_entities_drop
170
+
171
+
172
def undo_last_removal(backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base):
    '''
    Hand back the three backup objects unchanged, in the order they were given.
    '''
    restored_state = (
        backup_review_state,
        backup_image_annotations_state,
        backup_recogniser_entity_dataframe_base,
    )
    return restored_state
174
+
175
def exclude_selected_items_from_redaction(review_df: pd.DataFrame, selected_rows_df: pd.DataFrame, image_file_paths:List[str], page_sizes:List[dict], image_annotations_state:dict, recogniser_entity_dataframe_base:pd.DataFrame):
    '''
    Remove selected items from the review dataframe from the annotation object and review dataframe.

    Rows of review_df whose (label, page, text) combination appears in selected_rows_df are dropped,
    and the annotation object is rebuilt from the remaining rows. If nothing is selected, or the
    review dataframe is empty, the current state is returned unchanged.

    Args:
        review_df: Full review dataframe.
        selected_rows_df: Rows chosen for exclusion; must contain 'label', 'page' and 'text' columns when not empty.
        image_file_paths: Image paths forwarded to convert_pandas_df_to_review_json.
        page_sizes: Page size records forwarded to convert_pandas_df_to_review_json.
        image_annotations_state: Current annotation object.
        recogniser_entity_dataframe_base: Current recogniser dataframe.

    Returns:
        A tuple of (updated review dataframe, updated annotation object, updated recogniser dataframe,
        backup review dataframe, backup annotation object, backup recogniser dataframe). The backups
        are the objects that were passed in.
    '''
    # The inputs are never mutated below, so they can be returned as the backups directly
    backup_review_state = review_df
    backup_image_annotations_state = image_annotations_state
    backup_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base

    if selected_rows_df.empty or review_df.empty:
        # Nothing to exclude: keep the existing annotations and recogniser table rather than wiping them
        return review_df, image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base

    match_columns = ['label', 'page', 'text']

    # Ensure selected_rows_df has the same relevant columns
    selected_subset = selected_rows_df[match_columns].drop_duplicates()

    # Perform anti-join using merge with an indicator column
    merged_df = review_df.merge(selected_subset, on=match_columns, how='left', indicator=True)

    # Keep only the rows that do not have a match in selected_rows_df
    out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])

    out_image_annotations_state = convert_pandas_df_to_review_json(out_review_df, image_file_paths, page_sizes)
    out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]

    return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
203
+
204
+ def update_annotator(image_annotator_object:AnnotatedImageData,
205
+ page_num:int,
206
+ recogniser_entities_dropdown_value:str="ALL",
207
+ page_dropdown_value:str="ALL",
208
+ text_dropdown_value:str="ALL",
209
+ recogniser_dataframe_modified=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), type="pandas", headers=["page", "label", "text"]), zoom:int=100,
210
+ review_df:pd.DataFrame=[],
211
+ page_sizes:List[dict]=[]):
212
+ '''
213
+ Update a gradio_image_annotation object with new annotation data.
214
+ '''
215
+ # First, update the dataframe containing the found recognisers
216
+ recogniser_entities_list, recogniser_dataframe_out, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_entities_drop, page_entities_drop = update_recogniser_dataframes(image_annotator_object, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
217
+
218
+ #print("Creating output annotator object in update_annotator function")
219
 
220
  zoom_str = str(zoom) + '%'
221
  recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
222
 
223
+ #print("recogniser_entities_list:", recogniser_entities_list)
224
+ #print("recogniser_colour_list:", recogniser_colour_list)
225
+ #print("zoom_str:", zoom_str)
226
+
227
  if not image_annotator_object:
228
  page_num_reported = 1
229
 
 
246
  handles_cursor=True,
247
  interactive=True
248
  )
249
+ number_reported = gr.Number(label = "Current page", value=page_num_reported, precision=0)
250
 
251
+ return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out, recogniser_dataframe_modified, text_entities_drop, page_entities_drop
252
 
253
  #print("page_num at start of update_annotator function:", page_num)
254
 
 
271
  page_num_reported = page_max_reported
272
 
273
  image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
274
+
 
 
275
  out_image_annotator = image_annotator(
276
  value = image_annotator_object[page_num_reported - 1],
277
  boxes_alpha=0.1,
 
292
  interactive=True
293
  )
294
 
295
+ number_reported = gr.Number(label = "Current page", value=page_num_reported, precision=0)
296
+
297
+ return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out, recogniser_dataframe_modified, text_entities_drop, page_entities_drop
298
+
299
+ def modify_existing_page_redactions(image_annotator_object:AnnotatedImageData,
300
+ current_page:int,
301
+ previous_page:int,
302
+ all_image_annotations:List[AnnotatedImageData],
303
+ recogniser_entities_dropdown_value="ALL",
304
+ text_dropdown_value="ALL",
305
+ page_dropdown_value="ALL",
306
+ recogniser_dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"]),
307
+ review_dataframe:pd.DataFrame=[],
308
+ page_sizes:List[dict]=[],
309
+ clear_all:bool=False
310
+ ):
311
  '''
312
  Overwrite current image annotations with modifications
313
  '''
 
315
  if not current_page:
316
  current_page = 1
317
 
318
+ print("in modify_existing_page_redactions - recogniser_entities_dropdown_value:", recogniser_entities_dropdown_value)
 
 
 
 
319
 
320
+ image_annotator_object['image'] = all_image_annotations[previous_page - 1]["image"]
321
 
322
  if clear_all == False:
323
+ all_image_annotations[previous_page - 1] = image_annotator_object
324
  else:
325
  all_image_annotations[previous_page - 1]["boxes"] = []
326
 
327
+ return all_image_annotations, current_page, current_page
328
+
329
+ def apply_redactions(image_annotator_object:AnnotatedImageData,
330
+ file_paths:List[str],
331
+ doc:Document,
332
+ all_image_annotations:List[AnnotatedImageData],
333
+ current_page:int,
334
+ review_file_state:pd.DataFrame,
335
+ output_folder:str = output_folder,
336
+ save_pdf:bool=True,
337
+ page_sizes:List[dict]=[],
338
+ progress=gr.Progress(track_tqdm=True)):
 
 
 
 
 
 
 
 
339
  '''
340
  Apply modified redactions to a pymupdf and export review files
341
  '''
 
342
 
343
  output_files = []
344
  output_log_files = []
 
346
 
347
  #print("File paths in apply_redactions:", file_paths)
348
 
349
+ image_annotator_object['image'] = all_image_annotations[current_page - 1]["image"]
350
 
351
+ all_image_annotations[current_page - 1] = image_annotator_object
352
 
353
+ if not image_annotator_object:
354
  print("No image annotations found")
355
  return doc, all_image_annotations
356
 
 
373
 
374
  draw = ImageDraw.Draw(image)
375
 
376
+ for img_annotation_box in image_annotator_object['boxes']:
377
  coords = [img_annotation_box["xmin"],
378
  img_annotation_box["ymin"],
379
  img_annotation_box["xmax"],
 
404
  output_files.append(orig_pdf_file_path)
405
 
406
  number_of_pages = pdf_doc.page_count
407
+ original_cropboxes = []
408
 
409
  print("Saving pages to file.")
410
 
 
427
  elif isinstance(image_loc, str):
428
  image = Image.open(image_loc)
429
 
430
+
431
+ #print("all_image_annotations for page:", all_image_annotations[i])
432
+ #print("image:", image)
433
+
434
  pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
435
+ original_cropboxes.append(pymupdf_page.cropbox.irect)
436
+ pymupdf_page.set_cropbox = pymupdf_page.mediabox
437
+ #print("pymupdf_page:", pymupdf_page)
438
+ # print("original_cropboxes:", original_cropboxes)
439
+
440
+ pymupdf_page = redact_page_with_pymupdf(page=pymupdf_page, page_annotations=all_image_annotations[i], image=image, original_cropbox=original_cropboxes[-1])
441
 
442
  else:
443
  print("File type not recognised.")
 
466
  # output_log_files.append(out_annotation_file_path)
467
 
468
  #print("Saving annotations to CSV review file")
469
+ #print("all_image_annotations before conversion in apply redactions:", all_image_annotations)
470
+ #print("review_file_state before conversion in apply redactions:", review_file_state)
471
+ #print("page_sizes before conversion in apply redactions:", page_sizes)
472
 
473
  # Convert json to csv and also save this
474
+ review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state, page_sizes=page_sizes)
475
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
476
+
477
+ print("Saving review file after convert_review_json function in apply redactions")
478
  review_df.to_csv(out_review_file_file_path, index=None)
479
  output_files.append(out_review_file_file_path)
480
 
481
  except Exception as e:
482
+ print("In apply redactions function, could not save annotations to csv file:", e)
483
 
484
  return doc, all_image_annotations, output_files, output_log_files
485
 
486
def get_boxes_json(annotations:AnnotatedImageData):
    '''
    Return whatever is stored under the "boxes" key of the given annotation object.
    '''
    boxes = annotations["boxes"]
    return boxes
488
 
489
def update_entities_df_recogniser_entities(choice:str, df:pd.DataFrame, page_dropdown_value:str, text_dropdown_value:str):
    '''
    Update the rows in a dataframe depending on the user choice from a dropdown

    Filters df on the "page", "text" and "label" columns using the page, text and label (choice)
    selections. A selection containing "ALL" leaves its column unfiltered. The text and page
    dropdowns are then rebuilt from the rows that remain.

    Returns the filtered dataframe, the text dropdown and the page dropdown.
    '''
    # A selection may arrive as a single string or as a list. None or an empty list is
    # treated as "ALL" so that taking the first element further down cannot fail.
    normalised = []
    for selection in (choice, page_dropdown_value, text_dropdown_value):
        if isinstance(selection, str):
            selection = [selection]
        elif selection is None or len(selection) == 0:
            selection = ["ALL"]
        normalised.append(selection)
    choice, page_dropdown_value, text_dropdown_value = normalised

    filtered_df = df.copy()

    # Apply filtering based on dropdown selections (values are compared as strings)
    for column, selected_values in (("page", page_dropdown_value), ("text", text_dropdown_value), ("label", choice)):
        if "ALL" not in selected_values:
            filtered_df = filtered_df[filtered_df[column].astype(str).isin(selected_values)]

    # Only the text and page dropdowns are returned, so the label dropdown is not rebuilt here
    text_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "text")
    text_entities_drop = gr.Dropdown(value=text_dropdown_value[0], choices=text_entities_for_drop, allow_custom_value=True, interactive=True)

    page_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "page")
    page_entities_drop = gr.Dropdown(value=page_dropdown_value[0], choices=page_entities_for_drop, allow_custom_value=True, interactive=True)

    return filtered_df, text_entities_drop, page_entities_drop
522
+
523
def update_entities_df_page(choice:str, df:pd.DataFrame, label_dropdown_value:str, text_dropdown_value:str):
    '''
    Update the rows in a dataframe depending on the user choice from a dropdown

    Filters df on the "text", "label" and "page" columns using the text, label and page (choice)
    selections. A selection containing "ALL" leaves its column unfiltered. The label and text
    dropdowns are then rebuilt from the rows that remain.

    Returns the filtered dataframe, the label dropdown and the text dropdown.
    '''
    # A selection may arrive as a single string or as a list. None or an empty list is
    # treated as "ALL" so that taking the first element further down cannot fail.
    normalised = []
    for selection in (choice, label_dropdown_value, text_dropdown_value):
        if isinstance(selection, str):
            selection = [selection]
        elif selection is None or len(selection) == 0:
            selection = ["ALL"]
        normalised.append(selection)
    choice, label_dropdown_value, text_dropdown_value = normalised

    filtered_df = df.copy()

    # Apply filtering based on dropdown selections (values are compared as strings)
    for column, selected_values in (("text", text_dropdown_value), ("label", label_dropdown_value), ("page", choice)):
        if "ALL" not in selected_values:
            filtered_df = filtered_df[filtered_df[column].astype(str).isin(selected_values)]

    # Only the label and text dropdowns are returned, so the page dropdown is not rebuilt here
    recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
    recogniser_entities_drop = gr.Dropdown(value=label_dropdown_value[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)

    text_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "text")
    text_entities_drop = gr.Dropdown(value=text_dropdown_value[0], choices=text_entities_for_drop, allow_custom_value=True, interactive=True)

    return filtered_df, recogniser_entities_drop, text_entities_drop
556
+
557
def update_entities_df_text(choice:str, df:pd.DataFrame, label_dropdown_value:str, page_dropdown_value:str):
    '''
    Update the rows in a dataframe depending on the user choice from a dropdown

    Filters df on the "page", "label" and "text" columns using the page, label and text (choice)
    selections. A selection containing "ALL" leaves its column unfiltered. The label and page
    dropdowns are then rebuilt from the rows that remain.

    Returns the filtered dataframe, the label dropdown and the page dropdown.
    '''
    # A selection may arrive as a single string or as a list. None or an empty list is
    # treated as "ALL" so that taking the first element further down cannot fail.
    normalised = []
    for selection in (choice, label_dropdown_value, page_dropdown_value):
        if isinstance(selection, str):
            selection = [selection]
        elif selection is None or len(selection) == 0:
            selection = ["ALL"]
        normalised.append(selection)
    choice, label_dropdown_value, page_dropdown_value = normalised

    filtered_df = df.copy()

    # Apply filtering based on dropdown selections (values are compared as strings)
    for column, selected_values in (("page", page_dropdown_value), ("label", label_dropdown_value), ("text", choice)):
        if "ALL" not in selected_values:
            filtered_df = filtered_df[filtered_df[column].astype(str).isin(selected_values)]

    # Only the label and page dropdowns are returned, so the text dropdown is not rebuilt here
    recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
    recogniser_entities_drop = gr.Dropdown(value=label_dropdown_value[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)

    page_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "page")
    page_entities_drop = gr.Dropdown(value=page_dropdown_value[0], choices=page_entities_for_drop, allow_custom_value=True, interactive=True)

    return filtered_df, recogniser_entities_drop, page_entities_drop
590
+
591
def reset_dropdowns():
    '''
    Build three separate dropdowns, each with its value set to "ALL" and custom values allowed.
    '''
    return tuple(gr.Dropdown(value="ALL", allow_custom_value=True) for _ in range(3))
593
 
594
def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
    '''
    Return the first cell of the selected row, unwrapping it if it arrives as a list.

    The dataframe argument is accepted but not used.
    '''
    first_cell = evt.row_value[0] # This is the page number value
    print("evt.row_value[0]:", first_cell)

    # The cell can itself be a list, in which case its first element is taken
    selected_page = first_cell[0] if isinstance(first_cell, list) else first_cell

    print("row_value_page:", selected_page)
    return selected_page
604
 
605
  def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
 
659
 
660
  # Load cropbox sizes
661
  if document_cropboxes:
662
+ #print("Document cropboxes:", document_cropboxes)
663
 
664
  # Extract numbers safely using regex
665
  match = re.findall(r"[-+]?\d*\.\d+|\d+", document_cropboxes[page_python_format])