Merge pull request #3 from seanpedrick-case/dev
- README.md +3 -5
- app.py +13 -15
- doc_redaction_amplify_app +0 -1
- tools/custom_image_analyser_engine.py +3 -3
- tools/file_conversion.py +87 -39
- tools/file_redaction.py +51 -21
- tools/helper_functions.py +1 -1
- tools/redaction_review.py +69 -53
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
 title: Document redaction
-emoji:
+emoji: π
 colorFrom: blue
-colorTo:
+colorTo: yellow
 sdk: docker
 app_file: app.py
 pinned: false
@@ -12,9 +12,7 @@ license: agpl-3.0
 
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
 
-To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting.
-
-Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
+To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
 
 After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
 
app.py
CHANGED
@@ -41,8 +41,6 @@ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREET
 
 language = 'en'
 
-
-
 host_name = socket.gethostname()
 feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
 access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
@@ -121,7 +119,7 @@ with app:
 
 
     ## Annotator zoom value
-    annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=
+    annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
     zoom_true_bool = gr.State(True)
     zoom_false_bool = gr.State(False)
 
@@ -160,9 +158,7 @@ with app:
 
    Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use the app. Below is a very brief overview.
 
-   To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting.
-
-   Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
+   To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
 
    After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
 
@@ -203,7 +199,7 @@ with app:
 
     with gr.Accordion(label = "Review redaction file", open=True):
         output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
-        upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
+        upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)", variant="primary")
 
     with gr.Row():
         annotation_last_page_button = gr.Button("Previous page", scale = 3)
@@ -215,12 +211,10 @@ with app:
         annotate_zoom_out = gr.Button("Zoom out")
     with gr.Row():
         clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
-        annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
-
 
     with gr.Row():
 
-        with gr.Column(scale=
+        with gr.Column(scale=1):
 
             zoom_str = str(annotator_zoom_number) + '%'
 
@@ -242,9 +236,13 @@ with app:
                 interactive=False
             )
 
-
-
-
+            with gr.Row():
+                annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
+
+            #with gr.Column(scale=1):
+            with gr.Row():
+                recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
+                recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
 
     with gr.Row():
         annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
@@ -321,7 +319,7 @@ with app:
     ###
    in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
 
-    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
+    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
        then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
@@ -473,7 +471,7 @@ print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')
 MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
 print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
 
-MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '
+MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
 print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
 
 GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
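Note on the MAX_FILE_SIZE change: app.py reads it through the get_or_create_env_var helper in tools/helper_functions.py. A minimal sketch of that pattern, assuming the helper simply falls back to the default and stores it in the environment (the real helper may differ in detail):

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Use the existing environment value if set; otherwise store the
    # default so later reads see a consistent value.
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
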
doc_redaction_amplify_app
DELETED
@@ -1 +0,0 @@
-Subproject commit 9585642e4d1f72fc49971789693d5584661084c8
tools/custom_image_analyser_engine.py
CHANGED
@@ -637,9 +637,9 @@ class CustomImageAnalyzerEngine:
             result_reset_pos.start = 0
             result_reset_pos.end = len(relevant_text)
 
-            print("result_reset_pos:", result_reset_pos)
-            print("relevant_line_ocr_result:", relevant_line_ocr_result)
-            print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
+            #print("result_reset_pos:", result_reset_pos)
+            #print("relevant_line_ocr_result:", relevant_line_ocr_result)
+            #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
 
             # Map the analyzer results to bounding boxes for this line
             line_results = self.map_analyzer_results_to_bounding_boxes(
tools/file_conversion.py
CHANGED
@@ -16,6 +16,7 @@ from typing import List, Optional
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 image_dpi = 300.0
+Image.MAX_IMAGE_PIXELS = None
 
 def is_pdf_or_image(filename):
     """
@@ -51,26 +52,57 @@ def is_pdf(filename):
 CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
 print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
 
+import os
+from pdf2image import convert_from_path
+from PIL import Image
+
 def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
     try:
         # Construct the full output directory path
         output_dir = os.path.join(os.getcwd(), output_dir)
-
-        # Use the output_dir to construct the out_path
         out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
         os.makedirs(os.path.dirname(out_path), exist_ok=True)
 
         if os.path.exists(out_path):
-            #
+            # Load existing image
             image = Image.open(out_path)
         else:
-            #
+            # Convert PDF page to image
             image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
                                         dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
             image = image_l[0]
             image = image.convert("L")
             image.save(out_path, format="PNG")
+
+        # Check file size and resize if necessary
+        max_size = 5 * 1024 * 1024 # 5 MB in bytes
+        file_size = os.path.getsize(out_path)
+
+        # Resize images if they are too big
+        if file_size > max_size:
+            # Start with the original image size
+            width, height = image.size
+
+            print(f"Image size before {width}x{height}, original file_size: {file_size}")
+
+            while file_size > max_size:
+                # Reduce the size by a factor (e.g., 50% of the current size)
+                new_width = int(width * 0.5)
+                new_height = int(height * 0.5)
+                image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+                # Save the resized image
+                image.save(out_path, format="PNG", optimize=True)
+
+                # Update the file size
+                file_size = os.path.getsize(out_path)
+                print(f"Resized to {new_width}x{new_height}, new file_size: {file_size}")
+
+                # Update the dimensions for the next iteration
+                width, height = new_width, new_height
+
         return page_num, out_path
+
     except Exception as e:
         print(f"Error processing page {page_num + 1}: {e}")
         return page_num, None
@@ -683,14 +715,20 @@ def join_values_within_threshold(df1, df2):
     print(final_df)
 
 
-def convert_review_json_to_pandas_df(
+def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame()) -> pd.DataFrame:
+    '''
+    Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
+    '''
     # Flatten the data
-
+    flattened_annotation_data = []
 
-
-
+    if not isinstance(redaction_decision_output, pd.DataFrame):
+        redaction_decision_output = pd.DataFrame()
+
+    for annotation in all_annotations:
+        #print("annotation:", annotation)
         #print("flattened_data:", flattened_data)
-        image_path =
+        image_path = annotation["image"]
 
         # Use regex to find the number before .png
         match = re.search(r'_(\d+)\.png$', image_path)
@@ -701,56 +739,66 @@ def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFrame
         else:
             print("No number found before .png")
 
-        # Check if 'boxes' is in the
-        if 'boxes' not in
-
+        # Check if 'boxes' is in the annotation, if not, add an empty list
+        if 'boxes' not in annotation:
+            annotation['boxes'] = []
 
-        for box in
+        for box in annotation["boxes"]:
             if 'text' not in box:
-                data_to_add = {"image": image_path, "page": reported_number, **box} # "text":
+                data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
             else:
-                data_to_add = {"image": image_path, "page": reported_number, "text":
+                data_to_add = {"image": image_path, "page": reported_number, "text": annotation['text'], **box}
             #print("data_to_add:", data_to_add)
-
+            flattened_annotation_data.append(data_to_add)
 
     # Convert to a DataFrame
-
+    annotation_data_as_df = pd.DataFrame(flattened_annotation_data)
+
+    #print("redaction_decision_output:", redaction_decision_output)
+    #print("annotation_data_as_df:", annotation_data_as_df)
 
     # Join on additional text data from decision output results if included
-    if not
-        #print("
-        #print("
-
-
-
+    if not redaction_decision_output.empty:
+        #print("redaction_decision_output is not empty")
+        #print("redaction_decision_output:", redaction_decision_output)
+        #print("annotation_data_as_df:", annotation_data_as_df)
+        redaction_decision_output['page'] = redaction_decision_output['page'].astype(str)
+        annotation_data_as_df['page'] = annotation_data_as_df['page'].astype(str)
+        redaction_decision_output = redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
+
         # Round to the closest number divisible by 5
-
-
+        redaction_decision_output.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
 
-
+        redaction_decision_output = redaction_decision_output.drop_duplicates(['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'])
 
-
+        #annotation_data_as_df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
 
-
+        annotation_data_as_df.loc[:, ['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
 
-
-
+        annotation_data_as_df = annotation_data_as_df.merge(redaction_decision_output, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
 
-
+        annotation_data_as_df = annotation_data_as_df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
 
-
+    annotation_data_as_df = annotation_data_as_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
 
-    return
+    # Ensure required columns exist, filling with blank if they don't
+    for col in ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]:
+        if col not in annotation_data_as_df.columns:
+            annotation_data_as_df[col] = ''
 
-def convert_pandas_df_to_review_json(
+    annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
+
+    return annotation_data_as_df
+
+def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
     '''
     Convert a review csv to a json file for use by the Gradio Annotation object
     '''
     # Keep only necessary columns
-
+    review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
 
     # Group the DataFrame by the 'image' column
-    grouped_csv_pages =
+    grouped_csv_pages = review_file_df.groupby('page')
 
     # Create a list to hold the JSON data
     json_data = []
@@ -758,7 +806,7 @@ def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.I
     for n, pdf_image_path in enumerate(image_paths):
         reported_page_number = int(n + 1)
 
-        if reported_page_number in
+        if reported_page_number in review_file_df["page"].values:
 
             # Convert each relevant group to a list of box dictionaries
             selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
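Note on the resize loop added to process_single_page: it repeatedly halves the image dimensions until the saved PNG drops under the 5 MB cap. The same logic isolated as a standalone sketch (shrink_png_under_limit is a hypothetical name; the PR keeps this inline):

import os
from PIL import Image

def shrink_png_under_limit(image: Image.Image, out_path: str, max_bytes: int = 5 * 1024 * 1024) -> Image.Image:
    # Save once, then halve the dimensions until the PNG on disk fits.
    image.save(out_path, format="PNG")
    file_size = os.path.getsize(out_path)
    width, height = image.size
    while file_size > max_bytes:
        width, height = int(width * 0.5), int(height * 0.5)
        image = image.resize((width, height), Image.Resampling.LANCZOS)
        image.save(out_path, format="PNG", optimize=True)
        file_size = os.path.getsize(out_path)
    return image

Because each pass halves both dimensions, the pixel count drops by roughly 4x per iteration, so the loop terminates quickly even for very large pages.
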
tools/file_redaction.py
CHANGED
@@ -288,7 +288,7 @@ def choose_and_run_redactor(file_paths:List[str],
 
         print("Redacting file " + file_path_without_ext + " as an image-based file")
 
-        pymupdf_doc,all_decision_process_table,log_files_output_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
+        pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
             prepared_pdf_image_paths,
             language,
             chosen_redact_entities,
@@ -314,9 +314,12 @@ def choose_and_run_redactor(file_paths:List[str],
             custom_recogniser_word_list,
             redact_whole_page_list)
 
+
+        #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
+
         # Save Textract request metadata (if exists)
         if new_request_metadata:
-            print("Request metadata:", new_request_metadata)
+            #print("Request metadata:", new_request_metadata)
             all_request_metadata.append(new_request_metadata)
 
     elif in_redact_method == text_ocr_option:
@@ -396,10 +399,11 @@ def choose_and_run_redactor(file_paths:List[str],
                 json.dump(annotations_all_pages, f)
             log_files_output_paths.append(out_annotation_file_path)
 
-
+            print("Saving annotations to CSV")
 
             # Convert json to csv and also save this
             #print("annotations_all_pages:", annotations_all_pages)
+            #print("all_decision_process_table:", all_decision_process_table)
 
             review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
 
@@ -975,11 +979,11 @@ def redact_image_pdf(file_path:str,
     if analysis_type == textract_option:
 
         json_file_path = output_folder + file_name + "_textract.json"
 
         if not os.path.exists(json_file_path):
             print("No existing Textract results file found.")
-
+            textract_data = {}
             #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
             #log_files_output_paths.append(json_file_path)
             #request_metadata = request_metadata + "\n" + new_request_metadata
@@ -988,8 +992,12 @@ def redact_image_pdf(file_path:str,
             # Open the file and load the JSON data
             no_textract_file = False
             print("Found existing Textract json results file.")
+
+            if json_file_path not in log_files_output_paths:
+                log_files_output_paths.append(json_file_path)
+
             with open(json_file_path, 'r') as json_file:
-
+                textract_data = json.load(json_file)
 
     ###
 
@@ -1046,32 +1054,46 @@ def redact_image_pdf(file_path:str,
             image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
             pdf_page_as_bytes = image_buffer.getvalue()
 
-            if not
-
-
-
-
-
+            if not textract_data:
+                try:
+                    text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
+
+                    if json_file_path not in log_files_output_paths:
+                        log_files_output_paths.append(json_file_path)
+
+                    textract_data = {"pages":[text_blocks]}
+                except Exception as e:
+                    print("Textract extraction for page", reported_page_number, "failed due to:", e)
+                    textract_data = {"pages":[]}
+                    new_request_metadata = "Failed Textract API call"
+
+                request_metadata = request_metadata + "\n" + new_request_metadata
 
             else:
                 # Check if the current reported_page_number exists in the loaded JSON
-                page_exists = any(page['page_no'] == reported_page_number for page in
+                page_exists = any(page['page_no'] == reported_page_number for page in textract_data.get("pages", []))
 
                 if not page_exists: # If the page does not exist, analyze again
                     print(f"Page number {reported_page_number} not found in existing Textract data. Analysing.")
-
+
+                    try:
+                        text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
+                    except Exception as e:
+                        print("Textract extraction for page", reported_page_number, "failed due to:", e)
+                        text_blocks = []
+                        new_request_metadata = "Failed Textract API call"
 
                     # Check if "pages" key exists, if not, initialize it as an empty list
-                    if "pages" not in
-
+                    if "pages" not in textract_data:
+                        textract_data["pages"] = []
 
                     # Append the new page data
-
+                    textract_data["pages"].append(text_blocks)
 
                     request_metadata = request_metadata + "\n" + new_request_metadata
                 else:
                     # If the page exists, retrieve the data
-                    text_blocks = next(page['data'] for page in
+                    text_blocks = next(page['data'] for page in textract_data["pages"] if page['page_no'] == reported_page_number)
 
 
             line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
@@ -1214,7 +1236,10 @@ def redact_image_pdf(file_path:str,
             if analysis_type == textract_option:
                 # Write the updated existing textract data back to the JSON file
                 with open(json_file_path, 'w') as json_file:
-                    json.dump(
+                    json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+
+                if json_file_path not in log_files_output_paths:
+                    log_files_output_paths.append(json_file_path)
 
             current_loop_page += 1
 
@@ -1245,7 +1270,10 @@ def redact_image_pdf(file_path:str,
             if analysis_type == textract_option:
                 # Write the updated existing textract data back to the JSON file
                 with open(json_file_path, 'w') as json_file:
-                    json.dump(
+                    json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+
+                if json_file_path not in log_files_output_paths:
+                    log_files_output_paths.append(json_file_path)
 
             return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
 
@@ -1253,7 +1281,9 @@ def redact_image_pdf(file_path:str,
     # Write the updated existing textract data back to the JSON file
 
     with open(json_file_path, 'w') as json_file:
-        json.dump(
+        json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+    if json_file_path not in log_files_output_paths:
+        log_files_output_paths.append(json_file_path)
 
     return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
 
@@ -1495,7 +1525,7 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
     analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
 
     # Convert the new columns to integers (if needed)
-    analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float)
+    analysed_bounding_boxes_df_new.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
 
     analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
     analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
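Note on the Textract changes: together they form a small page-level cache around the API. Results load from the file's _textract.json when present, pages missing from the cache are analysed and appended (with a fallback on API failure), and the JSON is written back at each exit point. A minimal sketch of the pattern with hypothetical helper names; the diff implies each cached page is a dict carrying 'page_no' and 'data' keys:

import json
import os

def load_textract_cache(json_file_path: str) -> dict:
    # Reuse existing results if the JSON file is already on disk.
    if os.path.exists(json_file_path):
        with open(json_file_path, 'r') as json_file:
            return json.load(json_file)
    return {}

def get_page_blocks(textract_data: dict, page_no: int, analyse_page):
    # Return cached blocks for this page, or analyse and cache the result.
    for page in textract_data.get("pages", []):
        if page.get('page_no') == page_no:
            return page['data']
    try:
        page_result = analyse_page(page_no)  # stand-in for analyse_page_with_textract
    except Exception as e:
        print("Textract extraction for page", page_no, "failed due to:", e)
        page_result = {'page_no': page_no, 'data': []}
    textract_data.setdefault("pages", []).append(page_result)
    return page_result['data']

def save_textract_cache(textract_data: dict, json_file_path: str) -> None:
    # Persist the updated cache so re-runs skip already-analysed pages.
    with open(json_file_path, 'w') as json_file:
        json.dump(textract_data, json_file, indent=4)
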
tools/helper_functions.py
CHANGED
@@ -17,7 +17,7 @@ def reset_state_vars():
         show_share_button=False,
         show_remove_button=False,
         interactive=False
-    )
+    ), [], []
 
 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists
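This return-value change pairs with the app.py edit above: a Gradio event handler must return one value per component in its outputs list, and document_redact_btn.click now also lists output_file_list_state and log_files_output_list_state, so reset_state_vars returns two extra empty lists to clear those states. A rough sketch of the positional contract (the first seven values are placeholders; only the output order is taken from app.py):

def reset_state_vars_sketch():
    # One return value per output component, in order:
    # pdf_doc_state, all_image_annotations_state,
    # all_line_level_ocr_results_df_state, all_decision_process_table_state,
    # comprehend_query_number, textract_metadata_textbox, annotator,
    # output_file_list_state, log_files_output_list_state
    return [], [], None, None, 0, "", None, [], []
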
tools/redaction_review.py
CHANGED
@@ -13,6 +13,7 @@ import os
 import pymupdf
 from fitz import Document
 from PIL import ImageDraw, Image
+from collections import defaultdict
 
 def decrease_page(number:int):
     '''
@@ -44,41 +45,86 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
         if current_zoom_level >= 70:
             current_zoom_level -= 10
     else:
-        if current_zoom_level <
+        if current_zoom_level < 110:
             current_zoom_level += 10
 
     return current_zoom_level, annotate_current_page
 
-def
+def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
     '''
-
+    Remove items from the annotator object where the same page exists twice.
     '''
-
-
-
-
-
-
-
-        recogniser_entities.append("ALL")
-        recogniser_entities = sorted(recogniser_entities)
-
-    #print("recogniser_entities:", recogniser_entities)
-
-
-
-
-
-
-
+    # Group items by 'image'
+    image_groups = defaultdict(list)
+    for item in data:
+        image_groups[item['image']].append(item)
+
+    # Process each group to prioritize items with non-empty boxes
+    result = []
+    for image, items in image_groups.items():
+        # Filter items with non-empty boxes
+        non_empty_boxes = [item for item in items if item.get('boxes')]
+        if non_empty_boxes:
+            # Keep the first entry with non-empty boxes
+            result.append(non_empty_boxes[0])
+        else:
+            # If all items have empty or missing boxes, keep the first item
+            result.append(items[0])
+
+    return result
+
+def get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr):
+    recogniser_entities_list = ["Redaction"]
+    recogniser_entities_drop = gr.Dropdown(value="", choices=[""], allow_custom_value=True, interactive=True)
+    recogniser_dataframe_out = recogniser_dataframe_gr
+
+    try:
+        review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
+        recogniser_entities = review_dataframe["label"].unique().tolist()
+        recogniser_entities.append("ALL")
+        recogniser_entities_for_drop = sorted(recogniser_entities)
+
+        recogniser_dataframe_out = gr.Dataframe(review_dataframe)
+        recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_for_drop[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
+
+        recogniser_entities_list = [entity for entity in recogniser_entities_for_drop if entity != 'Redaction' and entity != 'ALL'] # Remove any existing 'Redaction'
+        recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
+
+    except Exception as e:
+        print("Could not extract recogniser information:", e)
+        recogniser_dataframe_out = recogniser_dataframe_gr
+        recogniser_entities_drop = gr.Dropdown(value="", choices=[""], allow_custom_value=True, interactive=True)
+        recogniser_entities_list = ["Redaction"]
+
+    return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list
+
+def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
+    '''
+    Update a gradio_image_annotation object with new annotation data
+    '''
+    recogniser_entities_list = ["Redaction"]
+    recogniser_dataframe_out = pd.DataFrame()
+
+    if recogniser_dataframe_gr.empty:
+        recogniser_dataframe_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list = get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr)
+    elif recogniser_dataframe_gr.iloc[0,0] == "":
+        recogniser_dataframe_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list = get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr)
     else:
         review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
         recogniser_dataframe_out = gr.Dataframe(review_dataframe)
+        recogniser_entities_list = recogniser_dataframe_gr["label"].unique().tolist()
+
+    print("recogniser_entities_list all options:", recogniser_entities_list)
 
+    recogniser_entities_list = sorted(recogniser_entities_list)
+    recogniser_entities_list = [entity for entity in recogniser_entities_list if entity != 'Redaction'] # Remove any existing 'Redaction'
+    recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
 
+    print("recogniser_entities_list:", recogniser_entities_list)
 
     zoom_str = str(zoom) + '%'
+    recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
 
     if not image_annotator_object:
         page_num_reported = 1
@@ -87,8 +133,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
         image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,
         box_thickness=1,
-
-
+        label_list=recogniser_entities_list,
+        label_colors=recogniser_colour_list,
         show_label=False,
         height=zoom_str,
         width=zoom_str,
@@ -126,44 +172,14 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
     if page_num_reported > page_max_reported:
         page_num_reported = page_max_reported
 
-    from collections import defaultdict
-
-    # Remove duplicate elements that are blank
-    def remove_duplicate_images_with_blank_boxes(data: List[AnnotatedImageData]) -> List[AnnotatedImageData]:
-        # Group items by 'image'
-        image_groups = defaultdict(list)
-        for item in data:
-            image_groups[item['image']].append(item)
-
-        # Process each group to retain only the entry with non-empty boxes, if available
-        result = []
-        for image, items in image_groups.items():
-            # Filter items with non-empty boxes
-            non_empty_boxes = [item for item in items if item['boxes']]
-            if non_empty_boxes:
-                # Keep the first entry with non-empty boxes
-                result.append(non_empty_boxes[0])
-            else:
-                # If no non-empty boxes, keep the first item with empty boxes
-                result.append(items[0])
-
-        #print("result:", result)
-
-        return result
-
-    #print("image_annotator_object in update_annotator before function:", image_annotator_object)
-
     image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
 
-    #print("image_annotator_object in update_annotator after function:", image_annotator_object)
-    #print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])
-
     out_image_annotator = image_annotator(
         value = image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,
         box_thickness=1,
-
-
+        label_list=recogniser_entities_list,
+        label_colors=recogniser_colour_list,
        show_label=False,
        height=zoom_str,
        width=zoom_str,
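Note on remove_duplicate_images_with_blank_boxes, now promoted from a nested helper to a module-level function: a quick check of its behaviour with minimal stand-in annotation dicts (only the keys the function reads):

from collections import defaultdict
from typing import List

def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
    # Same logic as above: group annotation pages by image and keep the
    # first entry with non-empty boxes, else the first entry seen.
    image_groups = defaultdict(list)
    for item in data:
        image_groups[item['image']].append(item)
    result = []
    for image, items in image_groups.items():
        non_empty_boxes = [item for item in items if item.get('boxes')]
        result.append(non_empty_boxes[0] if non_empty_boxes else items[0])
    return result

pages = [
    {"image": "doc_0.png", "boxes": []},
    {"image": "doc_0.png", "boxes": [{"label": "PERSON"}]},
    {"image": "doc_1.png", "boxes": []},
]
deduped = remove_duplicate_images_with_blank_boxes(pages)
assert [p["image"] for p in deduped] == ["doc_0.png", "doc_1.png"]
assert deduped[0]["boxes"]  # the entry with boxes wins for doc_0.png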