Merge pull request #3 from seanpedrick-case/dev
Files changed:

- README.md (+3 -5)
- app.py (+13 -15)
- doc_redaction_amplify_app (+0 -1)
- tools/custom_image_analyser_engine.py (+3 -3)
- tools/file_conversion.py (+87 -39)
- tools/file_redaction.py (+51 -21)
- tools/helper_functions.py (+1 -1)
- tools/redaction_review.py (+69 -53)
README.md
CHANGED

```diff
@@ -1,8 +1,8 @@
 ---
 title: Document redaction
-emoji: …
+emoji: π
 colorFrom: blue
-colorTo: …
+colorTo: yellow
 sdk: docker
 app_file: app.py
 pinned: false
@@ -12,9 +12,7 @@ license: agpl-3.0
 
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
 
-To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting.
-
-Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
+To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
 
 After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
 
```
app.py
CHANGED

```diff
@@ -41,8 +41,6 @@ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREET
 
 language = 'en'
 
-
-
 host_name = socket.gethostname()
 feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
 access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
@@ -121,7 +119,7 @@ with app:
 
 
     ## Annotator zoom value
-    annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=…
+    annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
     zoom_true_bool = gr.State(True)
     zoom_false_bool = gr.State(False)
 
@@ -160,9 +158,7 @@ with app:
 
     Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use the app. Below is a very brief overview.
 
-    To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting.
-
-    Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
+    To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
 
     After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
 
@@ -203,7 +199,7 @@ with app:
 
     with gr.Accordion(label = "Review redaction file", open=True):
         output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
-        upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
+        upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)", variant="primary")
 
     with gr.Row():
         annotation_last_page_button = gr.Button("Previous page", scale = 3)
@@ -215,12 +211,10 @@ with app:
         annotate_zoom_out = gr.Button("Zoom out")
     with gr.Row():
         clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
-        annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
-
 
     with gr.Row():
 
-        with gr.Column(scale=…
+        with gr.Column(scale=1):
 
            zoom_str = str(annotator_zoom_number) + '%'
 
@@ -242,9 +236,13 @@ with app:
                interactive=False
            )
 
-
-
-
+           with gr.Row():
+               annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
+
+           #with gr.Column(scale=1):
+           with gr.Row():
+               recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
+               recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
 
     with gr.Row():
         annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
@@ -321,7 +319,7 @@ with app:
     ###
     in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
 
-    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
+    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
         then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
         then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                         outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
@@ -473,7 +471,7 @@ print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')
 MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
 print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
 
-MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '…
+MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
 print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
 
 GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
```
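The `document_redact_btn.click(...).then(...).then(...)` chain above is the core wiring pattern of the app: reset state first, then prepare the document, then run the redactor, with each step's outputs feeding the next step's inputs. Below is a minimal, self-contained sketch of that pattern; the component and function names here are illustrative stand-ins, and it assumes Gradio 4.x, where event listeners return a chainable object exposing `.then()`.

```python
import gradio as gr

def reset_state():
    # Clear previous outputs so a re-run starts from a clean slate,
    # mirroring the extra [], [] now returned by reset_state_vars
    return [], []

def prepare(files):
    return f"Prepared {len(files)} file(s)"

def redact(summary):
    return summary + " -> redacted"

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple")
    file_list_state = gr.State([])
    log_list_state = gr.State([])
    summary = gr.Textbox()
    run_btn = gr.Button("Redact")

    # Each .then() step only starts after the previous one finishes
    run_btn.click(fn=reset_state, outputs=[file_list_state, log_list_state]).\
        then(fn=prepare, inputs=[in_files], outputs=[summary]).\
        then(fn=redact, inputs=[summary], outputs=[summary])

demo.launch()
```

Because `reset_state_vars` now also clears `output_file_list_state` and `log_files_output_list_state`, a second run no longer appends to the previous run's file lists.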
doc_redaction_amplify_app
DELETED

```diff
@@ -1 +0,0 @@
-Subproject commit 9585642e4d1f72fc49971789693d5584661084c8
```
tools/custom_image_analyser_engine.py
CHANGED

```diff
@@ -637,9 +637,9 @@ class CustomImageAnalyzerEngine:
             result_reset_pos.start = 0
             result_reset_pos.end = len(relevant_text)
 
-            print("result_reset_pos:", result_reset_pos)
-            print("relevant_line_ocr_result:", relevant_line_ocr_result)
-            print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
+            #print("result_reset_pos:", result_reset_pos)
+            #print("relevant_line_ocr_result:", relevant_line_ocr_result)
+            #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
 
             # Map the analyzer results to bounding boxes for this line
             line_results = self.map_analyzer_results_to_bounding_boxes(
```
tools/file_conversion.py
CHANGED

```diff
@@ -16,6 +16,7 @@ from typing import List, Optional
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 image_dpi = 300.0
+Image.MAX_IMAGE_PIXELS = None
 
 def is_pdf_or_image(filename):
     """
@@ -51,26 +52,57 @@ def is_pdf(filename):
 CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
 print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
 
+import os
+from pdf2image import convert_from_path
+from PIL import Image
+
 def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
     try:
         # Construct the full output directory path
         output_dir = os.path.join(os.getcwd(), output_dir)
-
-        # Use the output_dir to construct the out_path
         out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
         os.makedirs(os.path.dirname(out_path), exist_ok=True)
 
         if os.path.exists(out_path):
-            # …
+            # Load existing image
            image = Image.open(out_path)
         else:
-            # …
+            # Convert PDF page to image
            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
                                        dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
            image = image_l[0]
            image = image.convert("L")
            image.save(out_path, format="PNG")
+
+        # Check file size and resize if necessary
+        max_size = 5 * 1024 * 1024 # 5 MB in bytes
+        file_size = os.path.getsize(out_path)
+
+        # Resize images if they are too big
+        if file_size > max_size:
+            # Start with the original image size
+            width, height = image.size
+
+            print(f"Image size before {width}x{height}, original file_size: {file_size}")
+
+            while file_size > max_size:
+                # Reduce the size by a factor (e.g., 50% of the current size)
+                new_width = int(width * 0.5)
+                new_height = int(height * 0.5)
+                image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+                # Save the resized image
+                image.save(out_path, format="PNG", optimize=True)
+
+                # Update the file size
+                file_size = os.path.getsize(out_path)
+                print(f"Resized to {new_width}x{new_height}, new file_size: {file_size}")
+
+                # Update the dimensions for the next iteration
+                width, height = new_width, new_height
+
         return page_num, out_path
+
     except Exception as e:
         print(f"Error processing page {page_num + 1}: {e}")
         return page_num, None
@@ -683,14 +715,20 @@ def join_values_within_threshold(df1, df2):
     print(final_df)
 
 
-def convert_review_json_to_pandas_df(…
+def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame()) -> pd.DataFrame:
+    '''
+    Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
+    '''
     # Flatten the data
-    …
+    flattened_annotation_data = []
 
-    …
-    …
+    if not isinstance(redaction_decision_output, pd.DataFrame):
+        redaction_decision_output = pd.DataFrame()
+
+    for annotation in all_annotations:
+        #print("annotation:", annotation)
         #print("flattened_data:", flattened_data)
-    image_path = …
+        image_path = annotation["image"]
 
         # Use regex to find the number before .png
         match = re.search(r'_(\d+)\.png$', image_path)
@@ -701,56 +739,66 @@ def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFram…
         else:
             print("No number found before .png")
 
-        # Check if 'boxes' is in the …
-        if 'boxes' not in …
-            …
+        # Check if 'boxes' is in the annotation, if not, add an empty list
+        if 'boxes' not in annotation:
+            annotation['boxes'] = []
 
-        for box in …
+        for box in annotation["boxes"]:
             if 'text' not in box:
-                data_to_add = {"image": image_path, "page": reported_number, **box} # "text": …
+                data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
             else:
-                data_to_add = {"image": image_path, "page": reported_number, "text": …
+                data_to_add = {"image": image_path, "page": reported_number, "text": annotation['text'], **box}
             #print("data_to_add:", data_to_add)
-            …
+            flattened_annotation_data.append(data_to_add)
 
     # Convert to a DataFrame
-    …
+    annotation_data_as_df = pd.DataFrame(flattened_annotation_data)
+
+    #print("redaction_decision_output:", redaction_decision_output)
+    #print("annotation_data_as_df:", annotation_data_as_df)
 
     # Join on additional text data from decision output results if included
-    if not …
-        #print("…
-        #print("…
-        …
-        …
-        …
+    if not redaction_decision_output.empty:
+        #print("redaction_decision_output is not empty")
+        #print("redaction_decision_output:", redaction_decision_output)
+        #print("annotation_data_as_df:", annotation_data_as_df)
+        redaction_decision_output['page'] = redaction_decision_output['page'].astype(str)
+        annotation_data_as_df['page'] = annotation_data_as_df['page'].astype(str)
+        redaction_decision_output = redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
+
         # Round to the closest number divisible by 5
-        …
-        …
+        redaction_decision_output.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
 
-        …
+        redaction_decision_output = redaction_decision_output.drop_duplicates(['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'])
 
-    …
+        #annotation_data_as_df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
 
-    …
+        annotation_data_as_df.loc[:, ['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
 
-    …
+        annotation_data_as_df = annotation_data_as_df.merge(redaction_decision_output, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
 
-    …
-    …
+        annotation_data_as_df = annotation_data_as_df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
 
-    …
+    annotation_data_as_df = annotation_data_as_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
 
-    return …
+    # Ensure required columns exist, filling with blank if they don't
+    for col in ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]:
+        if col not in annotation_data_as_df.columns:
+            annotation_data_as_df[col] = ''
 
-def convert_pandas_df_to_review_json(…
+    annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
+
+    return annotation_data_as_df
+
+def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
     '''
     Convert a review csv to a json file for use by the Gradio Annotation object
     '''
     # Keep only necessary columns
-    …
+    review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
 
     # Group the DataFrame by the 'image' column
-    grouped_csv_pages = …
+    grouped_csv_pages = review_file_df.groupby('page')
 
     # Create a list to hold the JSON data
     json_data = []
@@ -758,7 +806,7 @@ def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.I…
     for n, pdf_image_path in enumerate(image_paths):
         reported_page_number = int(n + 1)
 
-        if reported_page_number in …
+        if reported_page_number in review_file_df["page"].values:
 
             # Convert each relevant group to a list of box dictionaries
             selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
```
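Two techniques in this file are worth calling out. The resize loop in `process_single_page` halves the image dimensions until the saved PNG drops under 5 MB, and `convert_review_json_to_pandas_df` joins annotator boxes onto decision-log text by snapping both sets of coordinates to the nearest multiple of 5, so near-identical boxes match despite small drift. A small self-contained sketch of the snapping join follows; the frame and column contents are illustrative, not taken from the repo.

```python
import pandas as pd

# Boxes from the annotator and from the redaction decision log: same box,
# but the coordinates differ by a few pixels, so an exact join would miss.
annots = pd.DataFrame({"page": ["1"], "label": ["PERSON"],
                       "xmin": [101.8], "ymin": [49.6], "xmax": [203.1], "ymax": [61.2]})
decisions = pd.DataFrame({"page": ["1"], "label": ["PERSON"], "text": ["John Smith"],
                          "xmin": [100.2], "ymin": [50.3], "xmax": [204.9], "ymax": [59.8]})

def snap(df, cols=("xmin", "ymin", "xmax", "ymax")):
    # Round each coordinate to the closest number divisible by 5
    out = df.copy()
    out[list(cols)] = (out[list(cols)].astype(float) / 5).round() * 5
    return out

merged = snap(annots).merge(snap(decisions),
                            on=["page", "label", "xmin", "ymin", "xmax", "ymax"],
                            how="left")
print(merged["text"])  # recovers 'John Smith' despite the coordinate drift
```

The docstring itself hedges that this join "doesn't work very well currently": boxes that straddle a rounding boundary can still snap to different grid points and fail to match.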
tools/file_redaction.py
CHANGED

```diff
@@ -288,7 +288,7 @@ def choose_and_run_redactor(file_paths:List[str],
 
             print("Redacting file " + file_path_without_ext + " as an image-based file")
 
-            pymupdf_doc,all_decision_process_table,log_files_output_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
+            pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
                 prepared_pdf_image_paths,
                 language,
                 chosen_redact_entities,
@@ -314,9 +314,12 @@ def choose_and_run_redactor(file_paths:List[str],
                 custom_recogniser_word_list,
                 redact_whole_page_list)
 
+
+            #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
+
             # Save Textract request metadata (if exists)
             if new_request_metadata:
-                print("Request metadata:", new_request_metadata)
+                #print("Request metadata:", new_request_metadata)
                 all_request_metadata.append(new_request_metadata)
 
         elif in_redact_method == text_ocr_option:
@@ -396,10 +399,11 @@ def choose_and_run_redactor(file_paths:List[str],
             json.dump(annotations_all_pages, f)
         log_files_output_paths.append(out_annotation_file_path)
 
-
+        print("Saving annotations to CSV")
 
         # Convert json to csv and also save this
         #print("annotations_all_pages:", annotations_all_pages)
+        #print("all_decision_process_table:", all_decision_process_table)
 
         review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
 
@@ -975,11 +979,11 @@ def redact_image_pdf(file_path:str,
     if analysis_type == textract_option:
 
         json_file_path = output_folder + file_name + "_textract.json"
 
 
         if not os.path.exists(json_file_path):
             print("No existing Textract results file found.")
-
+            textract_data = {}
             #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
             #log_files_output_paths.append(json_file_path)
             #request_metadata = request_metadata + "\n" + new_request_metadata
@@ -988,8 +992,12 @@ def redact_image_pdf(file_path:str,
             # Open the file and load the JSON data
             no_textract_file = False
             print("Found existing Textract json results file.")
+
+            if json_file_path not in log_files_output_paths:
+                log_files_output_paths.append(json_file_path)
+
             with open(json_file_path, 'r') as json_file:
-                …
+                textract_data = json.load(json_file)
 
     ###
 
@@ -1046,32 +1054,46 @@ def redact_image_pdf(file_path:str,
             image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
             pdf_page_as_bytes = image_buffer.getvalue()
 
-            if not …
-            …
-            …
-            …
-
-            …
+            if not textract_data:
+                try:
+                    text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
+
+                    if json_file_path not in log_files_output_paths:
+                        log_files_output_paths.append(json_file_path)
+
+                    textract_data = {"pages":[text_blocks]}
+                except Exception as e:
+                    print("Textract extraction for page", reported_page_number, "failed due to:", e)
+                    textract_data = {"pages":[]}
+                    new_request_metadata = "Failed Textract API call"
+
+                request_metadata = request_metadata + "\n" + new_request_metadata
 
             else:
                 # Check if the current reported_page_number exists in the loaded JSON
-                page_exists = any(page['page_no'] == reported_page_number for page in …
+                page_exists = any(page['page_no'] == reported_page_number for page in textract_data.get("pages", []))
 
                 if not page_exists: # If the page does not exist, analyze again
                     print(f"Page number {reported_page_number} not found in existing Textract data. Analysing.")
-                    …
+
+                    try:
+                        text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
+                    except Exception as e:
+                        print("Textract extraction for page", reported_page_number, "failed due to:", e)
+                        text_blocks = []
+                        new_request_metadata = "Failed Textract API call"
 
                     # Check if "pages" key exists, if not, initialize it as an empty list
-                    if "pages" not in …
-                    …
+                    if "pages" not in textract_data:
+                        textract_data["pages"] = []
 
                     # Append the new page data
-                    …
+                    textract_data["pages"].append(text_blocks)
 
                     request_metadata = request_metadata + "\n" + new_request_metadata
                 else:
                     # If the page exists, retrieve the data
-                    text_blocks = next(page['data'] for page in …
+                    text_blocks = next(page['data'] for page in textract_data["pages"] if page['page_no'] == reported_page_number)
 
 
             line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
@@ -1214,7 +1236,10 @@ def redact_image_pdf(file_path:str,
         if analysis_type == textract_option:
             # Write the updated existing textract data back to the JSON file
             with open(json_file_path, 'w') as json_file:
-                json.dump(…
+                json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+
+            if json_file_path not in log_files_output_paths:
+                log_files_output_paths.append(json_file_path)
 
         current_loop_page += 1
 
@@ -1245,7 +1270,10 @@ def redact_image_pdf(file_path:str,
             if analysis_type == textract_option:
                 # Write the updated existing textract data back to the JSON file
                 with open(json_file_path, 'w') as json_file:
-                    json.dump(…
+                    json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+
+                if json_file_path not in log_files_output_paths:
+                    log_files_output_paths.append(json_file_path)
 
                 return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
 
@@ -1253,7 +1281,9 @@ def redact_image_pdf(file_path:str,
     # Write the updated existing textract data back to the JSON file
 
     with open(json_file_path, 'w') as json_file:
-        json.dump(…
+        json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+    if json_file_path not in log_files_output_paths:
+        log_files_output_paths.append(json_file_path)
 
     return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
 
@@ -1495,7 +1525,7 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo…
     analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
 
     # Convert the new columns to integers (if needed)
-    analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float)
+    analysed_bounding_boxes_df_new.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
 
     analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
     analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
```
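The Textract handling above amounts to a per-page cache: results live in a `..._textract.json` file keyed by `page_no`, pages already present are read back without an API call, and missing pages trigger a fresh call whose result is appended and written back. A stripped-down sketch of that pattern, with a `fake_analyse` stand-in replacing the real `analyse_page_with_textract` call:

```python
import json
import os

def fake_analyse(page_no):
    # Stand-in for the Textract API call; returns one page's blocks
    return {"page_no": page_no, "data": [f"blocks for page {page_no}"]}

def get_page_blocks(json_file_path, page_no):
    # Load the cache if it exists, otherwise start empty
    textract_data = {}
    if os.path.exists(json_file_path):
        with open(json_file_path, "r") as f:
            textract_data = json.load(f)

    pages = textract_data.setdefault("pages", [])
    for page in pages:
        if page["page_no"] == page_no:
            return page["data"]          # cache hit: no API call

    page = fake_analyse(page_no)         # cache miss: analyse and cache
    pages.append(page)
    with open(json_file_path, "w") as f:
        json.dump(textract_data, f, indent=4)
    return page["data"]

print(get_page_blocks("example_textract.json", 1))  # analyses and caches
print(get_page_blocks("example_textract.json", 1))  # served from the cache
```

This is also why the diff wraps each `analyse_page_with_textract` call in try/except and records "Failed Textract API call" in the request metadata: one failed page should not lose the cached results for every other page.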
tools/helper_functions.py
CHANGED

```diff
@@ -17,7 +17,7 @@ def reset_state_vars():
         show_share_button=False,
         show_remove_button=False,
         interactive=False
-    )
+    ), [], []
 
 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists
```
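`reset_state_vars` now returns two extra empty lists, which app.py routes to `output_file_list_state` and `log_files_output_list_state` so that stale file lists are cleared on every new run. For reference, a plausible implementation of the `get_or_create_env_var` helper that the `MAX_QUEUE_SIZE` and `MAX_FILE_SIZE` lines in app.py rely on; this is a sketch and the repo's actual body may differ.

```python
import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Use the existing environment value if it is set
    value = os.environ.get(var_name)
    if value is None:
        # Otherwise create the variable with the supplied default
        os.environ[var_name] = default_value
        value = default_value
    return value

MAX_FILE_SIZE = get_or_create_env_var("MAX_FILE_SIZE", "250mb")
```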
tools/redaction_review.py
CHANGED

```diff
@@ -13,6 +13,7 @@ import os
 import pymupdf
 from fitz import Document
 from PIL import ImageDraw, Image
+from collections import defaultdict
 
 def decrease_page(number:int):
     '''
@@ -44,41 +45,86 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
     if current_zoom_level >= 70:
         current_zoom_level -= 10
     else:
-        if current_zoom_level < …
+        if current_zoom_level < 110:
             current_zoom_level += 10
 
     return current_zoom_level, annotate_current_page
 
-def …
+def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
     '''
-    …
+    Remove items from the annotator object where the same page exists twice.
     '''
-    …
-    …
+    # Group items by 'image'
+    image_groups = defaultdict(list)
+    for item in data:
+        image_groups[item['image']].append(item)
 
-    …
-    recogniser_entities.append("ALL")
-    recogniser_entities = sorted(recogniser_entities)
-
-    #print("recogniser_entities:", recogniser_entities)
-
-    …
+    # Process each group to prioritize items with non-empty boxes
+    result = []
+    for image, items in image_groups.items():
+        # Filter items with non-empty boxes
+        non_empty_boxes = [item for item in items if item.get('boxes')]
+        if non_empty_boxes:
+            # Keep the first entry with non-empty boxes
+            result.append(non_empty_boxes[0])
+        else:
+            # If all items have empty or missing boxes, keep the first item
+            result.append(items[0])
+
+    return result
+
+def get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr):
+    recogniser_entities_list = ["Redaction"]
+    recogniser_entities_drop = gr.Dropdown(value="", choices=[""], allow_custom_value=True, interactive=True)
+    recogniser_dataframe_out = recogniser_dataframe_gr
+
+    try:
+        review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
+        recogniser_entities = review_dataframe["label"].unique().tolist()
+        recogniser_entities.append("ALL")
+        recogniser_entities_for_drop = sorted(recogniser_entities)
+
+        recogniser_dataframe_out = gr.Dataframe(review_dataframe)
+        recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_for_drop[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
+
+        recogniser_entities_list = [entity for entity in recogniser_entities_for_drop if entity != 'Redaction' and entity != 'ALL'] # Remove any existing 'Redaction'
+        recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
+
+    except Exception as e:
+        print("Could not extract recogniser information:", e)
+        recogniser_dataframe_out = recogniser_dataframe_gr
+        recogniser_entities_drop = gr.Dropdown(value="", choices=[""], allow_custom_value=True, interactive=True)
+        recogniser_entities_list = ["Redaction"]
+
+    return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list
+
+def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
+    '''
+    Update a gradio_image_annotation object with new annotation data
+    '''
+    recogniser_entities_list = ["Redaction"]
+    recogniser_dataframe_out = pd.DataFrame()
+
+    if recogniser_dataframe_gr.empty:
+        recogniser_dataframe_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list = get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr)
+    elif recogniser_dataframe_gr.iloc[0,0] == "":
+        recogniser_dataframe_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list = get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr)
     else:
         review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
         recogniser_dataframe_out = gr.Dataframe(review_dataframe)
+        recogniser_entities_list = recogniser_dataframe_gr["label"].unique().tolist()
+
+    print("recogniser_entities_list all options:", recogniser_entities_list)
+
+    recogniser_entities_list = sorted(recogniser_entities_list)
+    recogniser_entities_list = [entity for entity in recogniser_entities_list if entity != 'Redaction'] # Remove any existing 'Redaction'
+    recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
 
+    print("recogniser_entities_list:", recogniser_entities_list)
 
     zoom_str = str(zoom) + '%'
+    recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
 
     if not image_annotator_object:
         page_num_reported = 1
@@ -87,8 +133,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re…
         image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,
         box_thickness=1,
-        …
-        …
+        label_list=recogniser_entities_list,
+        label_colors=recogniser_colour_list,
         show_label=False,
         height=zoom_str,
         width=zoom_str,
@@ -126,44 +172,14 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re…
     if page_num_reported > page_max_reported:
         page_num_reported = page_max_reported
 
-    from collections import defaultdict
-
-    # Remove duplicate elements that are blank
-    def remove_duplicate_images_with_blank_boxes(data: List[AnnotatedImageData]) -> List[AnnotatedImageData]:
-        # Group items by 'image'
-        image_groups = defaultdict(list)
-        for item in data:
-            image_groups[item['image']].append(item)
-
-        # Process each group to retain only the entry with non-empty boxes, if available
-        result = []
-        for image, items in image_groups.items():
-            # Filter items with non-empty boxes
-            non_empty_boxes = [item for item in items if item['boxes']]
-            if non_empty_boxes:
-                # Keep the first entry with non-empty boxes
-                result.append(non_empty_boxes[0])
-            else:
-                # If no non-empty boxes, keep the first item with empty boxes
-                result.append(items[0])
-
-        #print("result:", result)
-
-        return result
-
-    #print("image_annotator_object in update_annotator before function:", image_annotator_object)
-
     image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
 
-    #print("image_annotator_object in update_annotator after function:", image_annotator_object)
-    #print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])
-
     out_image_annotator = image_annotator(
         value = image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,
         box_thickness=1,
-        …
-        …
+        label_list=recogniser_entities_list,
+        label_colors=recogniser_colour_list,
         show_label=False,
         height=zoom_str,
         width=zoom_str,
```
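The page deduplication hoisted out of `update_annotator` to module level is simple enough to exercise in isolation; note the switch from `item['boxes']` to `item.get('boxes')`, which now also tolerates entries missing the key entirely. A compact, runnable illustration (the data shapes are illustrative):

```python
from collections import defaultdict

def remove_duplicate_images_with_blank_boxes(data):
    # Group annotator entries by page image
    image_groups = defaultdict(list)
    for item in data:
        image_groups[item['image']].append(item)

    # For each page, prefer the first entry that actually has boxes
    result = []
    for image, items in image_groups.items():
        non_empty = [item for item in items if item.get('boxes')]
        result.append(non_empty[0] if non_empty else items[0])
    return result

pages = [
    {"image": "doc_1.png", "boxes": []},
    {"image": "doc_1.png", "boxes": [{"label": "PERSON", "xmin": 10, "ymin": 20, "xmax": 50, "ymax": 30}]},
    {"image": "doc_2.png"},  # missing 'boxes' key is now tolerated
]
deduped = remove_duplicate_images_with_blank_boxes(pages)
print([(p["image"], len(p.get("boxes", []))) for p in deduped])
# -> one entry per page; doc_1.png keeps its non-empty boxes
```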