Commit 3bff849
Parent(s): 601fcda
Updated command line redaction script with more options

Files changed:
- Dockerfile +1 -1
- tools/cli_redact.py +149 -69
- tools/custom_image_analyser_engine.py +19 -19
- tools/data_anonymise.py +1 -1
- tools/example_cli_calls.txt +11 -0
- tools/file_conversion.py +29 -55
- tools/file_redaction.py +56 -41
- tools/redaction_review.py +29 -29
Dockerfile CHANGED
@@ -101,7 +101,7 @@ RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_
     && chmod 755 \
     ${APP_HOME}/.local/share/spacy/data \
     mkdir -p /usr/share/tessdata && \
-    chmod 755 /usr/share/tessdata
+    chmod 755 /usr/share/tessdata
 
 # Copy installed packages from builder stage
 COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
tools/cli_redact.py CHANGED
@@ -1,84 +1,164 @@
 import argparse
 import os
-
-from tools.
+import pandas as pd
+from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
+from tools.helper_functions import ensure_output_folder_exists
 from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
 from tools.file_redaction import choose_and_run_redactor
-
-from datetime import datetime
-
-chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
-                              'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
-                              'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
-                              'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
-                              'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE',
-                              'UK_NATIONAL_HEALTH_SERVICE_NUMBER']
-chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS",
-                          "STREETNAME", "UKPOSTCODE"]
-
-def main(first_loop_state=True, latest_file_completed=0, output_summary="", output_file_list=None,
-         log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
-         current_loop_page=0, page_break=False, pdf_doc_state=[], all_image_annotations=[], all_line_level_ocr_results=pd.DataFrame(), all_decision_process_table=pd.DataFrame(), chosen_comprehend_entities=chosen_comprehend_entities, chosen_redact_entities=chosen_redact_entities, handwrite_signature_checkbox=["Redact all identified handwriting", "Redact all identified signatures"]):
-
-    if output_file_list is None:
-        output_file_list = []
-    if log_files_list is None:
-        log_files_list = []
-
-    # Optional arguments with defaults matching the GUI app
-    parser.add_argument('--ocr_method', choices=[text_ocr_option, tesseract_ocr_option, textract_option],
-                        default='Quick image analysis', help='OCR method to use')
-    parser.add_argument('--pii_detector', choices=[local_pii_detector, aws_pii_detector],
-                        default='Local', help='PII detection method')
-    parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
-    parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
-    parser.add_argument('--allow_list', help='Path to allow list CSV file')
-    parser.add_argument('--output_dir', default='output/', help='Output directory')
-
-    file_name_no_ext, file_name_with_ext, full_file_name = get_input_file_names(file_obj)
-
+from tools.anonymisation import anonymise_files_with_open_text
+
+# --- Constants and Configuration ---
+INPUT_FOLDER = 'input/'
+OUTPUT_FOLDER = 'output/'
+DEFAULT_LANGUAGE = 'en'
+
+# Define entities for redaction
+chosen_comprehend_entities = [
+    'BANK_ACCOUNT_NUMBER', 'BANK_ROUTING', 'CREDIT_DEBIT_NUMBER',
+    'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY', 'PIN', 'EMAIL', 'ADDRESS',
+    'NAME', 'PHONE', 'PASSPORT_NUMBER', 'DRIVER_ID', 'USERNAME', 'PASSWORD',
+    'IP_ADDRESS', 'MAC_ADDRESS', 'LICENSE_PLATE', 'VEHICLE_IDENTIFICATION_NUMBER',
+    'UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER',
+    'SWIFT_CODE', 'UK_NATIONAL_HEALTH_SERVICE_NUMBER'
+]
+chosen_redact_entities = [
+    "TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"
+]
+
+# --- Main CLI Function ---
+def main():
+    """
+    A unified command-line interface to prepare, redact, and anonymise various document types.
+    """
+    parser = argparse.ArgumentParser(
+        description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
+        formatter_class=argparse.RawTextHelpFormatter
+    )
+
+    # --- General Arguments (apply to all file types) ---
+    general_group = parser.add_argument_group('General Options')
+    general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
+    general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
+    general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
+    general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
+    general_group.add_argument('--pii_detector',
+                               choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
+                               default=LOCAL_PII_OPTION,
+                               help='Core PII detection method (Local or AWS).')
+    general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
+    general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')
+
+    # --- PDF/Image Redaction Arguments ---
+    pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
+    pdf_group.add_argument('--ocr_method',
+                           choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
+                           default=TESSERACT_TEXT_EXTRACT_OPTION,
+                           help='OCR method for text extraction from images.')
+    pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
+    pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
+    pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
+    pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')
+
+    # --- Word/Tabular Anonymisation Arguments ---
+    tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
+    tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash'], default='redact', help='The anonymisation strategy to apply.')
+    tabular_group.add_argument('--columns', nargs='+', default=[], help='A list of column names to anonymise in tabular data.')
+    tabular_group.add_argument('--excel_sheets', nargs='+', default=[], help='Specific Excel sheet names to process.')
+    tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
+    tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')
+
+    args = parser.parse_args()
+
+    # --- Initial Setup ---
+    ensure_output_folder_exists(args.output_dir)
+    _, file_extension = os.path.splitext(args.input_file)
+    file_extension = file_extension.lower()
+
+    # Load allow/deny lists
+    allow_list = pd.read_csv(args.allow_list) if args.allow_list else pd.DataFrame()
+    deny_list = pd.read_csv(args.deny_list).iloc[:, 0].tolist() if args.deny_list else []
+
+    # --- Route to the Correct Workflow Based on File Type ---
+
+    # Workflow 1: PDF/Image Redaction
+    if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
+        print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
+        try:
+            # Step 1: Prepare the document
+            print("\nStep 1: Preparing document...")
+            (
+                prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
+                image_annotations, _, original_cropboxes, page_sizes, textract_output_found, _, _, _, _
+            ) = prepare_image_or_pdf(
+                file_paths=[args.input_file], text_extract_method=args.ocr_method,
+                all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
+                first_loop_state=True, prepare_for_review=args.prepare_for_review,
+                output_folder=args.output_dir, prepare_images=args.prepare_images
+            )
+            print(f"Preparation complete. {prep_summary}")
+
+            # Step 2: Redact the prepared document
+            print("\nStep 2: Running redaction...")
+            (
+                output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _
+            ) = choose_and_run_redactor(
+                file_paths=[args.input_file], prepared_pdf_file_paths=prepared_pdf_paths,
+                pdf_image_file_paths=image_file_paths, chosen_redact_entities=chosen_redact_entities,
+                chosen_redact_comprehend_entities=chosen_comprehend_entities, text_extraction_method=args.ocr_method,
+                in_allow_list=allow_list, first_loop_state=True, page_min=args.page_min, page_max=args.page_max,
+                pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
+                document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
+                aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
+                language=args.language, output_folder=args.output_dir
+            )
+
+            print("\n--- Redaction Process Complete ---")
+            print(f"Summary: {output_summary}")
+            print(f"\nOutput files saved to: {args.output_dir}")
+            print("Generated Files:", sorted(output_files))
+            if log_files: print("Log Files:", sorted(log_files))
+
+        except Exception as e:
+            print(f"\nAn error occurred during the PDF/Image redaction workflow: {e}")
+
+    # Workflow 2: Word/Tabular Data Anonymisation
+    elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
+        print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
+        try:
+            # Run the anonymisation function directly
+            output_summary, output_files, _, _, log_files, _, _ = anonymise_files_with_open_text(
+                file_paths=[args.input_file],
+                in_text="",  # Not used for file-based operations
+                anon_strat=args.anon_strat,
+                chosen_cols=args.columns,
+                chosen_redact_entities=chosen_redact_entities,
+                in_allow_list=allow_list,
+                in_excel_sheets=args.excel_sheets,
+                first_loop_state=True,
+                output_folder=args.output_dir,
+                in_deny_list=deny_list,
+                max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
+                pii_identification_method=args.pii_detector,
+                chosen_redact_comprehend_entities=chosen_comprehend_entities,
+                aws_access_key_textbox=args.aws_access_key,
+                aws_secret_key_textbox=args.aws_secret_key,
+                language=args.language
+            )
+
+            print("\n--- Anonymisation Process Complete ---")
+            print(f"Summary: {output_summary}")
+            print(f"\nOutput files saved to: {args.output_dir}")
+            print("Generated Files:", sorted(output_files))
+            if log_files: print("Log Files:", sorted(log_files))
+
+        except Exception as e:
+            print(f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}")
+
+    else:
+        print(f"Error: Unsupported file type '{file_extension}'.")
+        print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
+        print("Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet")
 
 if __name__ == "__main__":
-    main()
+    main()
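The heart of the new CLI is the extension-based dispatch at the bottom of main(). As a minimal standalone sketch of that routing pattern (the helper name route_workflow is illustrative, not part of the repo):

    import os

    def route_workflow(input_file: str) -> str:
        # Mirror the dispatch in cli_redact.py: pick a workflow by extension.
        _, ext = os.path.splitext(input_file)
        ext = ext.lower()
        if ext in ('.pdf', '.png', '.jpg', '.jpeg'):
            return 'redact'      # PDF/image redaction workflow
        if ext in ('.docx', '.xlsx', '.xls', '.csv', '.parquet'):
            return 'anonymise'   # Word/tabular anonymisation workflow
        raise ValueError(f"Unsupported file type '{ext}'")

    assert route_workflow('report.PDF') == 'redact'
    assert route_workflow('table.csv') == 'anonymise'
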
tools/custom_image_analyser_engine.py CHANGED
@@ -696,8 +696,8 @@ class CustomImageAnalyzerEngine:
     ) -> List[CustomImageRecognizerResult]:
 
         page_text = ""
-        page_text_mapping = []
-        all_text_line_results = []
+        page_text_mapping = list()
+        all_text_line_results = list()
         comprehend_query_number = 0
         print("custom_entities:", custom_entities)
 
@@ -774,13 +774,13 @@ class CustomImageAnalyzerEngine:
 
         # Process text in batches for AWS Comprehend
         current_batch = ""
-        current_batch_mapping = []
+        current_batch_mapping = list()
         batch_char_count = 0
         batch_word_count = 0
 
         for i, text_line in enumerate(line_level_ocr_results):
             words = text_line.text.split()
-            word_start_positions = []
+            word_start_positions = list()
             current_pos = 0
 
             for word in words:
@@ -839,7 +839,7 @@ class CustomImageAnalyzerEngine:
                 comprehend_query_number += 1
 
         # Process results and create bounding boxes
-        combined_results = []
+        combined_results = list()
         for i, text_line in enumerate(line_level_ocr_results):
             line_results = next((results for idx, results in all_text_line_results if idx == i), [])
             if line_results and i < len(ocr_results_with_words):
@@ -872,7 +872,7 @@ class CustomImageAnalyzerEngine:
         allow_list: List[str],
         ocr_results_with_words_child_info: Dict[str, Dict]
     ) -> List[CustomImageRecognizerResult]:
-        redaction_bboxes = []
+        redaction_bboxes = list()
 
         for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
             #print("ocr_results_with_words_child_info:", ocr_results_with_words_child_info)
@@ -895,7 +895,7 @@ class CustomImageAnalyzerEngine:
                 matched_words = matched_text.split()
 
                 # Find the corresponding words in the OCR results
-                matching_word_boxes = []
+                matching_word_boxes = list()
 
                 current_position = 0
 
@@ -1236,13 +1236,13 @@ def run_page_text_redaction(
         )
 
         current_batch = ""
-        current_batch_mapping = []
+        current_batch_mapping = list()
         batch_char_count = 0
         batch_word_count = 0
 
         for i, text_line in enumerate(line_level_text_results_list):
             words = text_line.text.split()
-            word_start_positions = []
+            word_start_positions = list()
 
             # Calculate word start positions within the line
             current_pos = 0
@@ -1320,12 +1320,12 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
     '''
     Merge identified bounding boxes containing PII that are very close to one another
     '''
-    analysed_bounding_boxes = []
-    original_bounding_boxes = []
+    analysed_bounding_boxes = list()
+    original_bounding_boxes = list()  # List to hold original bounding boxes
 
     if len(analyser_results) > 0 and len(characters) > 0:
         # Extract bounding box coordinates for sorting
-        bounding_boxes = []
+        bounding_boxes = list()
         for result in analyser_results:
             #print("Result:", result)
             char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
@@ -1346,11 +1346,11 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
         # Sort the results by y-coordinate and then by x-coordinate
         bounding_boxes.sort()
 
-        merged_bounding_boxes = []
+        merged_bounding_boxes = list()
         current_box = None
         current_y = None
         current_result = None
-        current_text = []
+        current_text = list()
 
         for y, x, result, next_box, text in bounding_boxes:
             if current_y is None or current_box is None:
@@ -1406,7 +1406,7 @@ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], c
     return analysed_bounding_boxes
 
 def recreate_page_line_level_ocr_results_with_page(page_line_level_ocr_results_with_words: dict):
-    reconstructed_results = []
+    reconstructed_results = list()
 
     # Assume all lines belong to the same page, so we can just read it from one item
     #page = next(iter(page_line_level_ocr_results_with_words.values()))["page"]
@@ -1445,7 +1445,7 @@ def split_words_and_punctuation_from_line(line_of_words: List[OCRResult]) -> Lis
     # Punctuation that will be split off. Hyphen is not included.
     PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
 
-    new_word_list = []
+    new_word_list = list()
 
     for word_result in line_of_words:
         word_text = word_result.text
@@ -1528,8 +1528,8 @@ def combine_ocr_results(ocr_results: List[OCRResult], x_threshold: float = 50.0,
     if not ocr_results:
         return {"page": page, "results": []}, {"page": page, "results": {}}
 
-    lines = []
-    current_line = []
+    lines = list()
+    current_line = list()
     for result in sorted(ocr_results, key=lambda x: (x.top, x.left)):
         if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
             current_line.append(result)
@@ -1539,7 +1539,7 @@ def combine_ocr_results(ocr_results: List[OCRResult], x_threshold: float = 50.0,
     if current_line:
         lines.append(sorted(current_line, key=lambda x: x.left))
 
-    page_line_level_ocr_results = []
+    page_line_level_ocr_results = list()
     page_line_level_ocr_results_with_words = {}
     line_counter = 1
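Every change in this file replaces an empty-list initialisation (rendered blank in this diff view, most likely a `[]` literal) with a `list()` call. The two build the same empty list, so the refactor is behaviour-preserving; a quick check:

    assert list() == []          # same value
    assert type(list()) is list  # same type as a [] literal
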
tools/data_anonymise.py CHANGED
@@ -327,7 +327,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
     This function anonymises data files based on the provided parameters.
 
     Parameters:
-    - file_paths (List[str]): A list of file paths to anonymise.
+    - file_paths (List[str]): A list of file paths to anonymise: '.xlsx', '.xls', '.csv', '.parquet', or '.docx'.
     - in_text (str): The text to anonymise if file_paths is 'open_text'.
     - anon_strat (str): The anonymisation strategy to use.
     - chosen_cols (List[str]): A list of column names to anonymise.
tools/example_cli_calls.txt ADDED
@@ -0,0 +1,11 @@
+python cli_redact.py --help
+
+python cli_redact.py \
+  --input_file "documents/confidential-report.pdf" \
+  --output_dir "output/redacted_reports/" \
+  --ocr_method "Local OCR model - PDFs without selectable text" \
+  --pii_detector "Local" \
+  --page_min 2 \
+  --page_max 10 \
+  --allow_list "config/project_allowlist.csv"
+
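The added example file covers only the PDF workflow. A plausible companion call for the Word/tabular workflow, using only flags defined in cli_redact.py above (the file paths here are illustrative, not from the repo):

    python cli_redact.py \
      --input_file "data/customer_records.xlsx" \
      --output_dir "output/anonymised/" \
      --anon_strat hash \
      --columns "Name" "Email" \
      --deny_list "config/deny_terms.csv" \
      --fuzzy_mistakes 2
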
tools/file_conversion.py CHANGED
@@ -72,7 +72,7 @@ def check_image_size_and_reduce(out_path:str, image:Image):
     Check if a given image size is above around 4.5mb, and reduce size if necessary. 5mb is the maximum possible to submit to AWS Textract.
     '''
 
-    all_img_details = []
+    all_img_details = list()
     page_num = 0
 
     # Check file size and resize if necessary
@@ -168,9 +168,9 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
     # Set page max to length of pdf if not specified
     if page_max == 0: page_max = page_count
 
-    results = []
+    results = list()
     with ThreadPoolExecutor(max_workers=num_threads) as executor:
-        futures = []
+        futures = list()
         for page_num in range(page_min, page_max):
             futures.append(executor.submit(process_single_page_for_image_conversion, pdf_path, page_num, image_dpi, create_images=create_images, input_folder=input_folder))
 
@@ -222,10 +222,10 @@ def process_file_for_image_creation(file_path:str, prepare_for_review:bool=False
 
     else:
         print(f"{file_path} is not an image or PDF file.")
-        img_path = []
-        image_sizes_width = []
-        image_sizes_height = []
-        all_img_details = []
+        img_path = list()
+        image_sizes_width = list()
+        image_sizes_height = list()
+        all_img_details = list()
 
     return img_path, image_sizes_width, image_sizes_height, all_img_details
 
@@ -234,7 +234,7 @@ def get_input_file_names(file_input:List[str]):
     Get list of input files to report to logs.
     '''
 
-    all_relevant_files = []
+    all_relevant_files = list()
     file_name_with_extension = ""
     full_file_name = ""
     total_pdf_page_count = 0
@@ -419,8 +419,8 @@ def redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, cu
     return whole_page_img_annotation_box
 
 def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float], image_sizes_height:List[float], image_file_paths:List[str]):
-    page_sizes = []
-    original_cropboxes = []
+    page_sizes = list()
+    original_cropboxes = list()
 
     for page_no, page in enumerate(pymupdf_doc):
         reported_page_no = page_no + 1
@@ -443,9 +443,6 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
         out_page_image_sizes['cropbox_x_offset'] = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
 
         # cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
-        # MediaBox top y = mediabox.y1
-        # CropBox top y = cropbox.y1
-        # The difference is mediabox.y1 - cropbox.y1
         out_page_image_sizes['cropbox_y_offset_from_top'] = pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1
 
         if image_sizes_width and image_sizes_height:
@@ -460,7 +457,7 @@ def word_level_ocr_output_to_dataframe(ocr_results: dict) -> pd.DataFrame:
     '''
     Convert a json of ocr results to a dataframe
     '''
-    rows = []
+    rows = list()
     ocr_result_page = ocr_results[0]
 
     for ocr_result in ocr_results:
@@ -540,11 +537,11 @@ def prepare_image_or_pdf(
 
     tic = time.perf_counter()
     json_from_csv = False
-    original_cropboxes = []
-    converted_file_paths = []
-    image_file_paths = []
-    # pymupdf_doc = []
-    all_img_details = []
+    original_cropboxes = list()  # Store original CropBox values
+    converted_file_paths = list()
+    image_file_paths = list()
+    # pymupdf_doc = list()
+    all_img_details = list()
     review_file_csv = pd.DataFrame()
    out_textract_path = ""
    combined_out_message = ""
@@ -557,15 +554,15 @@ def prepare_image_or_pdf(
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
         latest_file_completed = 0
-        out_message = []
-        all_annotations_object = []
+        out_message = list()
+        all_annotations_object = list()
     else:
         print("Now redacting file", str(latest_file_completed))
 
     # If combined out message or converted_file_paths are blank, change to a list so it can be appended to
     if isinstance(out_message, str): out_message = [out_message]
 
-    if not file_paths: file_paths = []
+    if not file_paths: file_paths = list()
 
     if isinstance(file_paths, dict): file_paths = os.path.abspath(file_paths["name"])
 
@@ -595,8 +592,8 @@ def prepare_image_or_pdf(
 
     # Loop through files to load in
     for file in file_paths_loop:
-        converted_file_path = []
-        image_file_path = []
+        converted_file_path = list()
+        image_file_path = list()
 
         if isinstance(file, str):
             file_path = file
@@ -631,12 +628,12 @@ def prepare_image_or_pdf(
 
             #Create base version of the annotation object that doesn't have any annotations in it
             if (not all_annotations_object) & (prepare_for_review == True):
-                all_annotations_object = []
+                all_annotations_object = list()
 
                 for image_path in image_file_paths:
                     annotation = {}
                     annotation["image"] = image_path
-                    annotation["boxes"] = []
+                    annotation["boxes"] = list()
 
                     all_annotations_object.append(annotation)
 
@@ -826,29 +823,6 @@ def prepare_image_or_pdf(
             else:
                 print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
 
-        # elif file_extension in ['.csv'] and "ocr_output" in file_path:
-        #     continue
-
-        # Must be something else, return with error message
-        # else:
-        #     if prepare_for_review == False:
-        #         if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION:
-        #             if is_pdf_or_image(file_path) == False:
-        #                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
-        #                 print(out_message)
-        #                 raise Exception(out_message)
-
-        #         else:# text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
-        #             if is_pdf(file_path) == False:
-        #                 out_message = "Please upload a PDF file for text analysis."
-        #                 print(out_message)
-        #                 raise Exception(out_message)
-        #     else:
-        #         message = f"File {file_name_with_ext} not a recognised type for review, skipping"
-        #         print(message)
-        #         gr.Info(message)
-        #         continue
-
         converted_file_paths.append(converted_file_path)
         image_file_paths.extend(image_file_path)
 
@@ -966,7 +940,7 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
         image_groups[item['image']].append(item)
 
     # Process each group to prioritize items with non-empty boxes
-    result = []
+    result = list()
     for image, items in image_groups.items():
         # Filter items with non-empty boxes
         non_empty_boxes = [item for item in items if item.get('boxes')]
@@ -1496,7 +1470,7 @@ def create_annotation_dicts_from_annotation_df(
 def convert_annotation_json_to_review_df(
     all_annotations: List[dict],
     redaction_decision_output: pd.DataFrame = pd.DataFrame(),
-    page_sizes: List[dict] = [],
+    page_sizes: List[dict] = list(),
     do_proximity_match: bool = True
 ) -> pd.DataFrame:
     '''
@@ -2021,7 +1995,7 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
     # --- Generate Unique IDs ---
     character_set = string.ascii_letters + string.digits  # a-z, A-Z, 0-9
     generated_ids_set = set()  # Keep track of IDs generated *in this run*
-    new_ids_list = []
+    new_ids_list = list()  # Store the generated IDs in order
 
     max_possible_ids = len(character_set) ** length
     if num_needed > max_possible_ids:
@@ -2228,14 +2202,14 @@ def convert_review_df_to_annotation_json(
 
 
     # --- Build JSON Structure ---
-    json_data = []
+    json_data = list()
     output_cols_for_boxes = [col for col in ["label", "color", xmin, ymin, xmax, ymax, "id", "text"] if col in review_file_df.columns]
 
     # Iterate through page_sizes_df to define the structure (one entry per image path)
     for _, row in page_sizes_df.iterrows():
         page_num = row['page']  # Already Int64
         pdf_image_path = row['image_path']
-        annotation_boxes = []
+        annotation_boxes = list()  # Default to empty list
 
         # Check if the page exists in the grouped annotations (using the faster set lookup)
         # Check pd.notna because page_num could be <NA> if conversion failed
@@ -2254,7 +2228,7 @@ def convert_review_df_to_annotation_json(
 
         except KeyError:
             print(f"Warning: Group key {page_num} not found despite being in group_keys (should not happen).")
-            annotation_boxes = []
+            annotation_boxes = list()  # Keep empty
 
         # Append the structured data for this image/page
         json_data.append({
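convert_pdf_to_images (second hunk above) fans page conversions out to a thread pool. The general submit-and-collect pattern it relies on, as a minimal self-contained sketch (render_page is a stand-in for process_single_page_for_image_conversion, not the repo's code):

    from concurrent.futures import ThreadPoolExecutor

    def render_page(page_num: int) -> str:
        # Stand-in for the real per-page conversion function.
        return f"page_{page_num}.png"

    results = list()
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(render_page, n) for n in range(0, 10)]
        for future in futures:  # collect in submission order
            results.append(future.result())
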
tools/file_redaction.py
CHANGED
@@ -201,7 +201,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
201 |
pdf_file_name_with_ext = ""
|
202 |
pdf_file_name_without_ext = ""
|
203 |
page_break_return = False
|
204 |
-
blank_request_metadata =
|
205 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
206 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
207 |
|
@@ -387,7 +387,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
387 |
if not in_allow_list.empty:
|
388 |
in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
|
389 |
else:
|
390 |
-
in_allow_list_flat =
|
391 |
|
392 |
# If string, assume file path
|
393 |
if isinstance(custom_recogniser_word_list, str):
|
@@ -396,7 +396,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
396 |
if not custom_recogniser_word_list.empty:
|
397 |
custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
|
398 |
else:
|
399 |
-
custom_recogniser_word_list_flat =
|
400 |
|
401 |
# Sort the strings in order from the longest string to the shortest
|
402 |
custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
|
@@ -412,7 +412,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
412 |
print("Could not convert whole page redaction data to number list due to:", e)
|
413 |
redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
|
414 |
else:
|
415 |
-
redact_whole_page_list_flat =
|
416 |
|
417 |
|
418 |
|
@@ -1100,7 +1100,7 @@ def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
|
|
1100 |
else:
|
1101 |
page.set_cropbox(original_cropbox)
|
1102 |
|
1103 |
-
def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=
|
1104 |
|
1105 |
rect_height = page.rect.height
|
1106 |
rect_width = page.rect.width
|
@@ -1127,7 +1127,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
1127 |
image_dimensions = {}
|
1128 |
|
1129 |
out_annotation_boxes = {}
|
1130 |
-
all_image_annotation_boxes =
|
1131 |
|
1132 |
if isinstance(image, Image.Image):
|
1133 |
image_path = move_page_info(str(page))
|
@@ -1238,10 +1238,25 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
1238 |
# IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
|
1239 |
###
|
1240 |
|
1241 |
-
def merge_img_bboxes(bboxes:list, combined_results: Dict, page_signature_recogniser_results=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1242 |
|
1243 |
-
all_bboxes =
|
1244 |
-
merged_bboxes =
|
1245 |
grouped_bboxes = defaultdict(list)
|
1246 |
|
1247 |
# Deep copy original bounding boxes to retain them
|
@@ -1256,7 +1271,7 @@ def merge_img_bboxes(bboxes:list, combined_results: Dict, page_signature_recogni
|
|
1256 |
merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
|
1257 |
|
1258 |
# Reconstruct bounding boxes for substrings of interest
|
1259 |
-
reconstructed_bboxes =
|
1260 |
for bbox in bboxes:
|
1261 |
bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
1262 |
for line_text, line_info in combined_results.items():
|
@@ -1266,7 +1281,7 @@ def merge_img_bboxes(bboxes:list, combined_results: Dict, page_signature_recogni
|
|
1266 |
start_char = line_text.index(bbox.text)
|
1267 |
end_char = start_char + len(bbox.text)
|
1268 |
|
1269 |
-
relevant_words =
|
1270 |
current_char = 0
|
1271 |
for word in line_info['words']:
|
1272 |
word_end = current_char + len(word['text'])
|
@@ -1501,8 +1516,8 @@ def redact_image_pdf(file_path:str,
|
|
1501 |
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1502 |
|
1503 |
# If there's data from a previous run (passed in via the DataFrame parameters), add it
|
1504 |
-
all_line_level_ocr_results_list =
|
1505 |
-
all_pages_decision_process_list =
|
1506 |
|
1507 |
if not all_page_line_level_ocr_results_df.empty:
|
1508 |
all_line_level_ocr_results_list.extend(all_page_line_level_ocr_results_df.to_dict('records'))
|
@@ -1513,10 +1528,10 @@ def redact_image_pdf(file_path:str,
|
|
1513 |
# Go through each page
|
1514 |
for page_no in progress_bar:
|
1515 |
|
1516 |
-
handwriting_or_signature_boxes =
|
1517 |
-
page_signature_recogniser_results =
|
1518 |
-
page_handwriting_recogniser_results =
|
1519 |
-
page_line_level_ocr_results_with_words =
|
1520 |
page_break_return = False
|
1521 |
reported_page_number = str(page_no + 1)
|
1522 |
|
@@ -1567,7 +1582,7 @@ def redact_image_pdf(file_path:str,
|
|
1567 |
)
|
1568 |
|
1569 |
page_line_level_ocr_results_with_words = matching_page if matching_page else []
|
1570 |
-
else: page_line_level_ocr_results_with_words =
|
1571 |
|
1572 |
if page_line_level_ocr_results_with_words:
|
1573 |
print("Found OCR results for page in existing OCR with words object")
|
@@ -1581,7 +1596,7 @@ def redact_image_pdf(file_path:str,
|
|
1581 |
|
1582 |
# Check if page exists in existing textract data. If not, send to service to analyse
|
1583 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
1584 |
-
text_blocks =
|
1585 |
|
1586 |
if not textract_data:
|
1587 |
try:
|
@@ -1619,7 +1634,7 @@ def redact_image_pdf(file_path:str,
|
|
1619 |
text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1620 |
|
1621 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
1622 |
-
if "pages" not in textract_data: textract_data["pages"] =
|
1623 |
|
1624 |
# Append the new page data
|
1625 |
textract_data["pages"].append(text_blocks)
|
@@ -1627,11 +1642,11 @@ def redact_image_pdf(file_path:str,
|
|
1627 |
except Exception as e:
|
1628 |
out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
|
1629 |
print(out_message)
|
1630 |
-
text_blocks =
|
1631 |
new_textract_request_metadata = "Failed Textract API call"
|
1632 |
|
1633 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
1634 |
-
if "pages" not in textract_data: textract_data["pages"] =
|
1635 |
|
1636 |
raise Exception(out_message)
|
1637 |
|
@@ -1678,12 +1693,12 @@ def redact_image_pdf(file_path:str,
|
|
1678 |
|
1679 |
comprehend_query_number = comprehend_query_number + comprehend_query_number_new
|
1680 |
|
1681 |
-
else: page_redaction_bounding_boxes =
|
1682 |
|
1683 |
# Merge redaction bounding boxes that are close together
|
1684 |
page_merged_redaction_bboxes = merge_img_bboxes(page_redaction_bounding_boxes, page_line_level_ocr_results_with_words['results'], page_signature_recogniser_results, page_handwriting_recogniser_results, handwrite_signature_checkbox)
|
1685 |
|
1686 |
-
else: page_merged_redaction_bboxes =
|
1687 |
|
1688 |
# 3. Draw the merged boxes
|
1689 |
## Apply annotations to pdf with pymupdf
|
@@ -1710,7 +1725,7 @@ def redact_image_pdf(file_path:str,
|
|
1710 |
fill = (0, 0, 0) # Fill colour for redactions
|
1711 |
draw = ImageDraw.Draw(image)
|
1712 |
|
1713 |
-
all_image_annotations_boxes =
|
1714 |
|
1715 |
for box in page_merged_redaction_bboxes:
|
1716 |
|
@@ -1914,9 +1929,9 @@ def create_line_level_ocr_results_from_characters(char_objects:List, line_number
|
|
1914 |
Create OCRResult objects based on a list of pdfminer LTChar objects.
|
1915 |
This version is corrected to use the specified OCRResult class definition.
|
1916 |
"""
|
1917 |
-
line_level_results_out =
|
1918 |
-
line_level_characters_out =
|
1919 |
-
character_objects_out =
|
1920 |
|
1921 |
full_text = ""
|
1922 |
# [x0, y0, x1, y1]
|
@@ -1943,7 +1958,7 @@ def create_line_level_ocr_results_from_characters(char_objects:List, line_number
|
|
1943 |
line_level_characters_out.append(character_objects_out)
|
1944 |
|
1945 |
# Reset for the next line
|
1946 |
-
character_objects_out =
|
1947 |
full_text = ""
|
1948 |
overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
|
1949 |
line_number += 1
|
@@ -2003,7 +2018,7 @@ def generate_words_for_line(line_chars: List) -> List[Dict[str, Any]]:
|
|
2003 |
# The hyphen '-' is intentionally excluded to keep words like 'high-tech' together.
|
2004 |
PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
|
2005 |
|
2006 |
-
line_words =
|
2007 |
current_word_text = ""
|
2008 |
current_word_bbox = [float('inf'), float('inf'), -1, -1] # [x0, y0, x1, y1]
|
2009 |
prev_char = None
|
@@ -2152,7 +2167,7 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
|
|
2152 |
return decision_process_table
|
2153 |
|
2154 |
def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
|
2155 |
-
pikepdf_redaction_annotations_on_page =
|
2156 |
for analysed_bounding_box in analysed_bounding_boxes:
|
2157 |
|
2158 |
bounding_box = analysed_bounding_box["boundingBox"]
|
@@ -2282,7 +2297,7 @@ def redact_text_pdf(
|
|
2282 |
|
2283 |
#file_name = get_file_name_without_type(file_path)
|
2284 |
|
2285 |
-
if not all_page_line_level_ocr_results_with_words: all_page_line_level_ocr_results_with_words =
|
2286 |
|
2287 |
# Check that page_min and page_max are within expected ranges
|
2288 |
if page_max > number_of_pages or page_max == 0: page_max = number_of_pages
|
@@ -2315,20 +2330,20 @@ def redact_text_pdf(
|
|
2315 |
# Go page by page
|
2316 |
for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
|
2317 |
|
2318 |
-
all_page_line_text_extraction_characters =
|
2319 |
-
all_page_line_level_text_extraction_results_list =
|
2320 |
-
page_analyser_results =
|
2321 |
-
page_redaction_bounding_boxes =
|
2322 |
|
2323 |
-
characters =
|
2324 |
-
pikepdf_redaction_annotations_on_page =
|
2325 |
page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
2326 |
page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])
|
2327 |
-
page_text_ocr_outputs_list =
|
2328 |
|
2329 |
text_line_no = 1
|
2330 |
for n, text_container in enumerate(page_layout):
|
2331 |
-
characters =
|
2332 |
|
2333 |
if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
|
2334 |
characters = get_text_container_characters(text_container)
|
@@ -2390,7 +2405,7 @@ def redact_text_pdf(
|
|
2390 |
# Annotate redactions on page
|
2391 |
pikepdf_redaction_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_redaction_bounding_boxes)
|
2392 |
|
2393 |
-
else: pikepdf_redaction_annotations_on_page =
|
2394 |
|
2395 |
# Make pymupdf page redactions
|
2396 |
if redact_whole_page_list:
|
|
|
201 |
pdf_file_name_with_ext = ""
|
202 |
pdf_file_name_without_ext = ""
|
203 |
page_break_return = False
|
204 |
+
blank_request_metadata = list()
|
205 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
206 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
207 |
|
|
|
387 |
if not in_allow_list.empty:
|
388 |
in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
|
389 |
else:
|
390 |
+
in_allow_list_flat = list()
|
391 |
|
392 |
# If string, assume file path
|
393 |
if isinstance(custom_recogniser_word_list, str):
|
|
|
396 |
if not custom_recogniser_word_list.empty:
|
397 |
custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
|
398 |
else:
|
399 |
+
custom_recogniser_word_list_flat = list()
|
400 |
|
401 |
# Sort the strings in order from the longest string to the shortest
|
402 |
custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
|
|
|
412 |
print("Could not convert whole page redaction data to number list due to:", e)
|
413 |
redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
|
414 |
else:
|
415 |
+
redact_whole_page_list_flat = list()
|
416 |
|
417 |
|
418 |
|
|
|
1100 |
else:
|
1101 |
page.set_cropbox(original_cropbox)
|
1102 |
|
1103 |
+
def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]= list(), page_sizes_df:pd.DataFrame=pd.DataFrame()):
|
1104 |
|
1105 |
rect_height = page.rect.height
|
1106 |
rect_width = page.rect.width
|
|
|
1127 |
image_dimensions = {}
|
1128 |
|
1129 |
out_annotation_boxes = {}
|
1130 |
+
all_image_annotation_boxes = list()
|
1131 |
|
1132 |
if isinstance(image, Image.Image):
|
1133 |
image_path = move_page_info(str(page))
|
|
|
1238 |
# IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
|
1239 |
###
|
1240 |
|
1241 |
+
def merge_img_bboxes(bboxes: list, combined_results: Dict, page_signature_recogniser_results: list = list(), page_handwriting_recogniser_results: list = list(), handwrite_signature_checkbox: List[str] = ["Extract handwriting", "Extract signatures"], horizontal_threshold: int = 50, vertical_threshold: int = 12):
|
1242 |
+
"""
|
1243 |
+
Merges bounding boxes for image annotations based on the provided results from signature and handwriting recognizers.
|
1244 |
+
|
1245 |
+
Args:
|
1246 |
+
bboxes (list): A list of bounding boxes to be merged.
|
1247 |
+
combined_results (Dict): A dictionary containing combined results with line text and their corresponding bounding boxes.
|
1248 |
+
page_signature_recogniser_results (list, optional): A list of results from the signature recognizer. Defaults to an empty list.
|
1249 |
+
page_handwriting_recogniser_results (list, optional): A list of results from the handwriting recognizer. Defaults to an empty list.
|
1250 |
+
handwrite_signature_checkbox (List[str], optional): A list of options indicating whether to extract handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
1251 |
+
horizontal_threshold (int, optional): The threshold for merging bounding boxes horizontally. Defaults to 50.
|
1252 |
+
vertical_threshold (int, optional): The threshold for merging bounding boxes vertically. Defaults to 12.
|
1253 |
+
|
1254 |
+
Returns:
|
1255 |
+
None: This function modifies the bounding boxes in place and does not return a value.
|
1256 |
+
"""
|
1257 |
|
1258 |
+
all_bboxes = list()
|
1259 |
+
merged_bboxes = list()
|
1260 |
grouped_bboxes = defaultdict(list)
|
1261 |
|
1262 |
# Deep copy original bounding boxes to retain them
|
|
|
1271 |
merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
|
1272 |
|
1273 |
# Reconstruct bounding boxes for substrings of interest
|
1274 |
+
reconstructed_bboxes = list()
|
1275 |
for bbox in bboxes:
|
1276 |
bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
1277 |
for line_text, line_info in combined_results.items():
|
|
|
1281 |
start_char = line_text.index(bbox.text)
|
1282 |
end_char = start_char + len(bbox.text)
|
1283 |
|
1284 |
+
relevant_words = list()
|
1285 |
current_char = 0
|
1286 |
for word in line_info['words']:
|
1287 |
word_end = current_char + len(word['text'])
|
|
|
1516 |
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1517 |
|
1518 |
# If there's data from a previous run (passed in via the DataFrame parameters), add it
|
1519 |
+
all_line_level_ocr_results_list = list()
|
1520 |
+
all_pages_decision_process_list = list()
|
1521 |
|
1522 |
if not all_page_line_level_ocr_results_df.empty:
|
1523 |
all_line_level_ocr_results_list.extend(all_page_line_level_ocr_results_df.to_dict('records'))
|
|
|
1528 |
# Go through each page
|
1529 |
for page_no in progress_bar:
|
1530 |
|
1531 |
+
handwriting_or_signature_boxes = list()
|
1532 |
+
page_signature_recogniser_results = list()
|
1533 |
+
page_handwriting_recogniser_results = list()
|
1534 |
+
page_line_level_ocr_results_with_words = list()
|
1535 |
page_break_return = False
|
1536 |
reported_page_number = str(page_no + 1)
|
1537 |
|
|
|
1582 |
)
|
1583 |
|
1584 |
page_line_level_ocr_results_with_words = matching_page if matching_page else []
|
1585 |
+
else: page_line_level_ocr_results_with_words = list()
|
1586 |
|
1587 |
if page_line_level_ocr_results_with_words:
|
1588 |
print("Found OCR results for page in existing OCR with words object")
|
|
|
1596 |
|
1597 |
# Check if page exists in existing textract data. If not, send to service to analyse
|
1598 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
1599 |
+
text_blocks = list()
|
1600 |
|
1601 |
if not textract_data:
|
1602 |
try:
|
|
|
1634   text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1635
1636   # Check if "pages" key exists, if not, initialise it as an empty list
1637 + if "pages" not in textract_data: textract_data["pages"] = list()
1638
1639   # Append the new page data
1640   textract_data["pages"].append(text_blocks)

1642   except Exception as e:
1643   out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
1644   print(out_message)
1645 + text_blocks = list()
1646   new_textract_request_metadata = "Failed Textract API call"
1647
1648   # Check if "pages" key exists, if not, initialise it as an empty list
1649 + if "pages" not in textract_data: textract_data["pages"] = list()
1650
1651   raise Exception(out_message)
1652
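The pattern above caches Textract output per page so each page is sent to the service at most once across runs. A minimal sketch of that idea, assuming a hypothetical cache shaped like `{"pages": [{"page_no": ..., "data": ...}, ...]}` (the real structure in file_redaction.py may differ):

```python
# Minimal sketch of per-page caching around a Textract call; the cache shape
# and helper names are assumptions, not the repo's confirmed API.
def get_page_blocks(textract_data, page_no, analyse_page_with_textract):
    if "pages" not in textract_data:
        textract_data["pages"] = list()
    for entry in textract_data["pages"]:
        if entry["page_no"] == page_no:
            return entry["data"]                 # cache hit: skip the API call
    data = analyse_page_with_textract(page_no)   # cache miss: one API call
    textract_data["pages"].append({"page_no": page_no, "data": data})
    return data
```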
1693
1694   comprehend_query_number = comprehend_query_number + comprehend_query_number_new
1695
1696 + else: page_redaction_bounding_boxes = list()
1697
1698   # Merge redaction bounding boxes that are close together
1699   page_merged_redaction_bboxes = merge_img_bboxes(page_redaction_bounding_boxes, page_line_level_ocr_results_with_words['results'], page_signature_recogniser_results, page_handwriting_recogniser_results, handwrite_signature_checkbox)
1700
1701 + else: page_merged_redaction_bboxes = list()
1702
1703   # 3. Draw the merged boxes
1704   ## Apply annotations to pdf with pymupdf

1725   fill = (0, 0, 0) # Fill colour for redactions
1726   draw = ImageDraw.Draw(image)
1727
1728 + all_image_annotations_boxes = list()
1729
1730   for box in page_merged_redaction_bboxes:
1731
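For image-based pages, the merged boxes are painted straight onto the rendered page image with Pillow. A runnable sketch of that drawing step, using stand-in boxes in pixel coordinates (left, top, right, bottom):

```python
# Minimal Pillow sketch of the drawing step; the image and boxes are stand-ins.
from PIL import Image, ImageDraw

image = Image.new("RGB", (600, 800), "white")   # stand-in for the rendered page
draw = ImageDraw.Draw(image)
fill = (0, 0, 0)                                # fill colour for redactions

for box in [(50, 100, 250, 120), (50, 160, 180, 180)]:
    draw.rectangle(box, fill=fill)              # paint a solid black box over the text
```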
1929   Create OCRResult objects based on a list of pdfminer LTChar objects.
1930   This version is corrected to use the specified OCRResult class definition.
1931   """
1932 + line_level_results_out = list()
1933 + line_level_characters_out = list()
1934 + character_objects_out = list()
1935
1936   full_text = ""
1937   # [x0, y0, x1, y1]

1958   line_level_characters_out.append(character_objects_out)
1959
1960   # Reset for the next line
1961 + character_objects_out = list()
1962   full_text = ""
1963   overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
1964   line_number += 1
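The routine above walks pdfminer's character objects, concatenating text and expanding a running bounding box until a line break, then resets for the next line. A small sketch of the bbox expansion, assuming the standard pdfminer convention that `LTChar.bbox` is `(x0, y0, x1, y1)` with the origin at the bottom-left:

```python
# Sketch of expanding a line-level bbox one pdfminer character at a time.
from pdfminer.layout import LTChar

def add_char_to_line_bbox(char: LTChar, overall_bbox):
    x0, y0, x1, y1 = char.bbox                    # pdfminer: origin at bottom-left
    overall_bbox[0] = min(overall_bbox[0], x0)
    overall_bbox[1] = min(overall_bbox[1], y0)
    overall_bbox[2] = max(overall_bbox[2], x1)
    overall_bbox[3] = max(overall_bbox[3], y1)
    return overall_bbox

# Start value mirrors the reset line in the diff above:
overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
```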
2018   # The hyphen '-' is intentionally excluded to keep words like 'high-tech' together.
2019   PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
2020
2021 + line_words = list()
2022   current_word_text = ""
2023   current_word_bbox = [float('inf'), float('inf'), -1, -1] # [x0, y0, x1, y1]
2024   prev_char = None
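A rough, self-contained sketch of the splitting rule those lines set up: break on whitespace and the punctuation set, but let hyphens ride along inside a word (how punctuation tokens themselves are kept in the real code may differ):

```python
# Illustrative splitter: whitespace and listed punctuation end a word;
# hyphens do not, so 'high-tech' stays one token.
PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}

def split_line(text):
    words, current = [], ""
    for ch in text:
        if ch.isspace() or ch in PUNCTUATION_TO_SPLIT:
            if current:
                words.append(current)
                current = ""
        else:
            current += ch
    if current:
        words.append(current)
    return words

assert split_line("high-tech, low cost.") == ["high-tech", "low", "cost"]
```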
2167   return decision_process_table
2168
2169   def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
2170 + pikepdf_redaction_annotations_on_page = list()
2171   for analysed_bounding_box in analysed_bounding_boxes:
2172
2173   bounding_box = analysed_bounding_box["boundingBox"]
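Each bounding box becomes a PDF annotation dictionary that pikepdf can attach to a page. A hypothetical sketch of building one such dictionary (the field choices here are illustrative; the function's actual fields may differ):

```python
# Hypothetical sketch of one pikepdf annotation per box.
# Rect is in PDF points, origin at the bottom-left of the page.
from pikepdf import Array, Dictionary, Name

def make_box_annotation(bounding_box):
    x0, y0, x1, y1 = bounding_box
    return Dictionary(
        Type=Name("/Annot"),
        Subtype=Name("/Square"),      # rectangle annotation
        Rect=Array([x0, y0, x1, y1]),
        IC=Array([0, 0, 0]),          # interior colour: black
    )
```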
2297
2298   #file_name = get_file_name_without_type(file_path)
2299
2300 + if not all_page_line_level_ocr_results_with_words: all_page_line_level_ocr_results_with_words = list()
2301
2302   # Check that page_min and page_max are within expected ranges
2303   if page_max > number_of_pages or page_max == 0: page_max = number_of_pages

2330   # Go page by page
2331   for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
2332
2333 + all_page_line_text_extraction_characters = list()
2334 + all_page_line_level_text_extraction_results_list = list()
2335 + page_analyser_results = list()
2336 + page_redaction_bounding_boxes = list()
2337
2338 + characters = list()
2339 + pikepdf_redaction_annotations_on_page = list()
2340   page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
2341   page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])
2342 + page_text_ocr_outputs_list = list()
2343
2344   text_line_no = 1
2345   for n, text_container in enumerate(page_layout):
2346 + characters = list()
2347
2348   if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
2349   characters = get_text_container_characters(text_container)
2405   # Annotate redactions on page
2406   pikepdf_redaction_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_redaction_bounding_boxes)
2407
2408 + else: pikepdf_redaction_annotations_on_page = list()
2409
2410   # Make pymupdf page redactions
2411   if redact_whole_page_list:
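The whole-page branch ends the file_redaction.py changes. For reference, a minimal PyMuPDF sketch of redacting an entire page (the 1-based page numbering and the file paths here are assumptions, not the repo's confirmed conventions):

```python
# Minimal PyMuPDF sketch of the whole-page redaction step.
import fitz  # PyMuPDF

doc = fitz.open("example.pdf")        # hypothetical input path
redact_whole_page_list = [1]          # assumed 1-based page numbers

for page_no in redact_whole_page_list:
    page = doc[page_no - 1]
    page.add_redact_annot(page.rect, fill=(0, 0, 0))  # cover the full page
    page.apply_redactions()           # burn the redactions into the page

doc.save("example_redacted.pdf")
```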
tools/redaction_review.py
CHANGED
@@ -99,8 +99,8 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
 99   recogniser_dropdown_value:str,
100   text_dropdown_value:str,
101   page_dropdown_value:str,
102 - review_df:pd.DataFrame=
103 - page_sizes:List[str]=
102 + review_df:pd.DataFrame=list(),
103 + page_sizes:List[str]=list()):
104   '''
105   Create a filtered recogniser dataframe and associated dropdowns based on current information in the image annotator and review data frame.
106   '''

@@ -147,7 +147,7 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
147
148   return recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
149
150 - def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData, recogniser_dataframe_base:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=
150 + def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData, recogniser_dataframe_base:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=list(), page_sizes:list[str]=list()):
151   '''
152   Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
153   '''
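A side note on the recurring `[]` to `list()` substitution in these signatures: Python evaluates default values once, at definition time, so a `list()` default is shared across calls exactly as `[]` would be. A standalone demonstration of the behaviour and the usual `None` idiom (not code from this repo):

```python
def append_one(items=list()):     # the same list object is reused on every call
    items.append(1)
    return items

print(append_one())  # [1]
print(append_one())  # [1, 1]  <- state leaks between calls

def append_one_safe(items=None):  # the common idiom: create the list per call
    if items is None:
        items = []
    items.append(1)
    return items
```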
@@ -265,7 +265,7 @@ def update_annotator_page_from_review_df(
265   if not current_page_review_df.empty:
266   # Convert the current page's review data to annotation list format for *this page*
267
268 - current_page_annotations_list =
268 + current_page_annotations_list = list()
269   # Define expected annotation dict keys, including 'image', 'page', coords, 'label', 'text', 'color' etc.
270   # Assuming review_df has compatible columns
271   expected_annotation_keys = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id'] # Add/remove as needed

@@ -340,7 +340,7 @@ def update_annotator_page_from_review_df(
340   if not page_sizes_df.empty:
341   page_sizes = page_sizes_df.to_dict(orient='records')
342   else:
343 - page_sizes =
343 + page_sizes = list() # Ensure page_sizes is a list if df is empty
344
345   # --- Re-evaluate Coordinate Multiplication and Duplicate Removal ---
346   # Let's assume remove_duplicate_images_with_blank_boxes expects the raw list of dicts state format:

@@ -609,7 +609,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
609   merged_df = merged_df.sort_values('image')
610
611
612 - final_annotations_list =
612 + final_annotations_list = list()
613   box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
614
615   # Now, when we group, we use `sort=False`. This tells groupby to respect the

@@ -622,7 +622,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
622   # Check if the group has actual annotations. iloc[0] is safe because even pages
623   # without annotations will have one row with NaN values from the merge.
624   if pd.isna(group.iloc[0].get('id')):
625 - boxes =
625 + boxes = list()
626   else:
627   valid_box_cols = [col for col in box_cols if col in group.columns]
628   # We should also sort the boxes within a page for consistency (e.g., left-to-right)
@@ -751,7 +751,7 @@ def update_annotator_object_and_filter_df(
751   recogniser_dataframe_base:pd.DataFrame=None, # Simplified default
752   zoom:int=100,
753   review_df:pd.DataFrame=None, # Use None for default empty DataFrame
754 - page_sizes:List[dict]=
754 + page_sizes:List[dict]=list(),
755   doc_full_file_name_textbox:str='',
756   input_folder:str=INPUT_FOLDER
757   ) -> Tuple[image_annotator, gr.Number, gr.Number, int, str, gr.Dataframe, pd.DataFrame, List[str], List[str], List[dict], List[AnnotatedImageData]]:

@@ -775,7 +775,7 @@ def update_annotator_object_and_filter_df(
775   # Return blank/default outputs
776
777   blank_annotator = image_annotator(
778 - value = None, boxes_alpha=0.1, box_thickness=1, label_list=
778 + value = None, boxes_alpha=0.1, box_thickness=1, label_list=list(), label_colors=list(),
779   show_label=False, height=zoom_str, width=zoom_str, box_min_size=1,
780   box_selected_thickness=2, handle_size=4, sources=None,
781   show_clear_button=False, show_share_button=False, show_remove_button=False,

@@ -851,7 +851,7 @@ def update_annotator_object_and_filter_df(
851   if not page_sizes_df.empty:
852   page_sizes = page_sizes_df.to_dict(orient='records')
853   else:
854 - page_sizes =
854 + page_sizes = list() # Ensure page_sizes is a list if df is empty
855
856   # --- OPTIMIZATION: Prepare data *only* for the current page for display ---
857   current_page_image_annotator_object = None

@@ -907,12 +907,12 @@ def update_annotator_object_and_filter_df(
907
908   except Exception as e:
909   print(f"Error calling update_recogniser_dataframes: {e}. Returning empty/default filter data.")
910 - recogniser_entities_list =
911 - recogniser_colour_list =
910 + recogniser_entities_list = list()
911 + recogniser_colour_list = list()
912   recogniser_dataframe_out_gr = gr.Dataframe(pd.DataFrame(columns=["page", "label", "text", "id"]))
913   recogniser_dataframe_modified = pd.DataFrame(columns=["page", "label", "text", "id"])
914 - text_entities_drop =
915 - page_entities_drop =
914 + text_entities_drop = list()
915 + page_entities_drop = list()
916
917
918   # --- Final Output Components ---

@@ -946,7 +946,7 @@ def update_annotator_object_and_filter_df(
946   interactive=True # Keep interactive if data is present
947   )
948
949 - page_entities_drop_redaction_list =
949 + page_entities_drop_redaction_list = list()
950   all_pages_in_doc_list = [str(i) for i in range(1, len(page_sizes) + 1)]
951   page_entities_drop_redaction_list.extend(all_pages_in_doc_list)
952

@@ -970,7 +970,7 @@ def update_all_page_annotation_object_based_on_previous_page(
970   current_page:int,
971   previous_page:int,
972   all_image_annotations:List[AnnotatedImageData],
973 - page_sizes:List[dict]=
973 + page_sizes:List[dict]=list(),
974   clear_all:bool=False
975   ):
976   '''

@@ -991,7 +991,7 @@ def update_all_page_annotation_object_based_on_previous_page(
991   page_image_annotator_object, all_image_annotations = replace_annotator_object_img_np_array_with_page_sizes_image_path(all_image_annotations, page_image_annotator_object, page_sizes, previous_page)
992
993   if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
994 - else: all_image_annotations[previous_page_zero_index]["boxes"] =
994 + else: all_image_annotations[previous_page_zero_index]["boxes"] = list()
995
996   return all_image_annotations, current_page, current_page
997
@@ -1003,16 +1003,16 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
1003   review_file_state:pd.DataFrame,
1004   output_folder:str = OUTPUT_FOLDER,
1005   save_pdf:bool=True,
1006 - page_sizes:List[dict]=
1006 + page_sizes:List[dict]=list(),
1007   COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF,
1008   progress=gr.Progress(track_tqdm=True)):
1009   '''
1010   Apply modified redactions to a pymupdf and export review files.
1011   '''
1012
1013 - output_files =
1014 - output_log_files =
1015 - pdf_doc =
1013 + output_files = list()
1014 + output_log_files = list()
1015 + pdf_doc = list()
1016   review_df = review_file_state
1017
1018   page_image_annotator_object = all_image_annotations[current_page - 1]

@@ -1078,7 +1078,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
1078   doc = [image]
1079
1080   elif file_extension in '.csv':
1081 - pdf_doc =
1081 + pdf_doc = list()
1082
1083   # If working with pdfs
1084   elif is_pdf(file_path) == True:

@@ -1088,7 +1088,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
1088   output_files.append(orig_pdf_file_path)
1089
1090   number_of_pages = pdf_doc.page_count
1091 - original_cropboxes =
1091 + original_cropboxes = list()
1092
1093   page_sizes_df = pd.DataFrame(page_sizes)
1094   page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
@@ -1619,7 +1619,7 @@ def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float,
1619
1620   return x1, adobe_y1, x2, adobe_y2
1621
1622 - def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str]=
1622 + def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str]=list(), document_cropboxes:List=list(), page_sizes:List[dict]=list()):
1623   '''
1624   Create an xfdf file from a review csv file and a pdf
1625   '''
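The context above shows `convert_pymupdf_coords_to_adobe` returning `x1, adobe_y1, x2, adobe_y2`. A minimal sketch of the y-axis flip such a helper performs, assuming the standard conventions (PyMuPDF measures y down from the top of the page, Adobe/XFDF up from the bottom); the actual body may differ:

```python
# Sketch: flip a top-origin (PyMuPDF) box into a bottom-origin (Adobe) box.
def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float,
                                    page_height: float):
    adobe_y1 = page_height - y2   # bottom edge in Adobe coordinates
    adobe_y2 = page_height - y1   # top edge in Adobe coordinates
    return x1, adobe_y1, x2, adobe_y2
```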
@@ -1711,11 +1711,11 @@ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, i
1711   reparsed = minidom.parseString(rough_string)
1712   return reparsed.toxml() #.toprettyxml(indent=" ")
1713
1714 - def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = OUTPUT_FOLDER, document_cropboxes:List=
1714 + def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = OUTPUT_FOLDER, document_cropboxes:List=list(), page_sizes:List[dict]=list()):
1715   '''
1716   Load in files to convert a review file into an Adobe comment file format
1717   '''
1718 - output_paths =
1718 + output_paths = list()
1719   pdf_name = ""
1720   file_path_name = ""
1721
@@ -1814,7 +1814,7 @@ def parse_xfdf(xfdf_path:str):
1814   # Define the namespace
1815   namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}
1816
1817 - redactions =
1817 + redactions = list()
1818
1819   # Find all redact elements using the namespace
1820   for redact in root.findall('.//xfdf:redact', namespaces=namespace):

@@ -1846,8 +1846,8 @@ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_path
1846   Returns:
1847   - DataFrame containing redaction information
1848   '''
1849 - output_paths =
1850 - xfdf_paths =
1849 + output_paths = list()
1850 + xfdf_paths = list()
1851   df = pd.DataFrame()
1852
1853   # Sort the file paths so that the pdfs come first
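For context on the `findall` call shown above, a minimal sketch of reading redact elements from an XFDF file with the Adobe namespace (the `rect` and `page` attribute handling is an assumption about the XFDF produced here, not confirmed from the repo):

```python
# Sketch: extract redact annotations from an XFDF file with ElementTree.
import xml.etree.ElementTree as ET

def parse_xfdf_sketch(xfdf_path: str):
    tree = ET.parse(xfdf_path)
    root = tree.getroot()
    namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}

    redactions = list()
    for redact in root.findall('.//xfdf:redact', namespaces=namespace):
        rect = redact.get('rect', '')     # "x1,y1,x2,y2" in PDF points
        page = redact.get('page')         # XFDF page indices are zero-based
        coords = [float(c) for c in rect.split(',')] if rect else []
        redactions.append({'page': page, 'rect': coords})
    return redactions
```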